From a7765a14df4ac0c62cfae568418950c3f23adb02 Mon Sep 17 00:00:00 2001 From: majin0824 Date: Wed, 15 Apr 2026 17:02:18 +0800 Subject: [PATCH 1/5] Refactor: migrate A5 examples and tests to SceneTestCase format - Replace golden.py + kernel_config.py with unified test_*.py files using @scene_test decorator and SceneTestCase base class - Covers examples/a5/{host_build_graph,tensormap_and_ringbuffer} (14 examples) and tests/st/a5/{host_build_graph,tensormap_and_ringbuffer} (3 tests) - Add a5sim to platforms for all cases that support simulation - Cross-directory kernel references use relative paths (../) --- .../paged_attention/golden.py | 58 ------ .../paged_attention/kernels/kernel_config.py | 76 -------- .../paged_attention/test_paged_attention.py | 118 +++++++++++++ .../tensormap_and_ringbuffer/bgemm/golden.py | 69 -------- .../bgemm/kernels/kernel_config.py | 49 ------ .../bgemm/test_bgemm.py | 81 +++++++++ .../mixed_example/golden.py | 122 ------------- .../mixed_example/kernels/kernel_config.py | 74 -------- .../mixed_example/test_mixed_example.py | 166 ++++++++++++++++++ .../paged_attention/golden.py | 75 -------- .../paged_attention/kernels/kernel_config.py | 78 -------- .../paged_attention/test_paged_attention.py | 152 ++++++++++++++++ .../spmd_basic/golden.py | 65 ------- .../spmd_basic/kernels/kernel_config.py | 50 ------ .../spmd_basic/test_spmd_basic.py | 75 ++++++++ .../spmd_multiblock_aiv/golden.py | 63 ------- .../kernels/kernel_config.py | 38 ---- .../test_spmd_multiblock_aiv.py | 76 ++++++++ .../spmd_multiblock_mix/golden.py | 68 ------- .../kernels/kernel_config.py | 50 ------ .../test_spmd_multiblock_mix.py | 82 +++++++++ .../spmd_starvation/golden.py | 84 --------- .../spmd_starvation/kernels/kernel_config.py | 52 ------ .../spmd_starvation/test_spmd_starvation.py | 101 +++++++++++ .../spmd_sync_start/golden.py | 66 ------- .../spmd_sync_start/kernels/kernel_config.py | 51 ------ .../spmd_sync_start/test_spmd_sync_start.py | 80 +++++++++ 
.../spmd_sync_start_aiv/golden.py | 62 ------- .../kernels/kernel_config.py | 40 ----- .../test_spmd_sync_start_aiv.py | 78 ++++++++ .../spmd_sync_start_edge/golden.py | 66 ------- .../kernels/kernel_config.py | 51 ------ .../test_spmd_sync_start_edge.py | 85 +++++++++ .../spmd_sync_start_stress/golden.py | 104 ----------- .../kernels/kernel_config.py | 61 ------- .../test_spmd_sync_start_stress.py | 112 ++++++++++++ .../paged_attention/golden.py | 58 ------ .../paged_attention/kernels/kernel_config.py | 78 -------- .../paged_attention/test_paged_attention.py | 118 +++++++++++++ .../paged_attention/golden.py | 63 ------- .../paged_attention/kernels/kernel_config.py | 78 -------- .../paged_attention/test_paged_attention.py | 134 ++++++++++++++ .../paged_attention_unroll/golden.py | 63 ------- .../kernels/kernel_config.py | 78 -------- .../test_paged_attention_unroll.py | 133 ++++++++++++++ 45 files changed, 1591 insertions(+), 1990 deletions(-) delete mode 100644 examples/a5/host_build_graph/paged_attention/golden.py delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py create mode 100644 examples/a5/host_build_graph/paged_attention/test_paged_attention.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/bgemm/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/mixed_example/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py delete mode 
100644 examples/a5/tensormap_and_ringbuffer/spmd_basic/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py delete mode 100644 
examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py delete mode 100644 tests/st/a5/host_build_graph/paged_attention/golden.py delete mode 100644 tests/st/a5/host_build_graph/paged_attention/kernels/kernel_config.py create mode 100644 tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/golden.py delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py create mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/golden.py delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py create mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py diff --git a/examples/a5/host_build_graph/paged_attention/golden.py b/examples/a5/host_build_graph/paged_attention/golden.py deleted file mode 100644 index e9672d5dc..000000000 --- a/examples/a5/host_build_graph/paged_attention/golden.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged Attention Golden - host_build_graph example (small scale, float16). - -Args layout: [query, key_cache, value_cache, block_table, context_lens, out, scale] - - Tensors retain original multi-dimensional shapes (ContinuousTensor metadata carries shape/dtype) - - scale is a scalar float parameter -""" - -from simpler_setup.goldens.paged_attention import ( - compute_golden, # noqa: F401 - run_golden_test, -) -from simpler_setup.goldens.paged_attention import generate_inputs as _generate_inputs - -__outputs__ = ["out"] - -RTOL = 1e-2 -ATOL = 1e-2 - -ALL_CASES = { - "Case1": { - "batch": 1, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 16, - "block_size": 16, - "context_len": 16, - "max_model_len": 256, - "dtype": "float16", - }, - "Case2": { - "batch": 1, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 16, - "block_size": 16, - "context_len": 64, - "max_model_len": 256, - "dtype": "float16", - }, -} - -DEFAULT_CASE = "Case1" - - -def generate_inputs(params: dict) -> list: - return _generate_inputs(params) - - -if __name__ == "__main__": - run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py b/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py deleted file mode 100644 index 0245cc8a5..000000000 --- a/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Paged Attention Kernel and Orchestration Configuration - -Defines the kernels and orchestration function for paged attention -with AIC/AIV subgraph splitting: - -AIC Kernels (Matrix Multiplication): - - aic_qk_matmul: Q @ K^T computation - - aic_pv_matmul: P @ V computation - -AIV Kernels (Vector Operations): - - aiv_softmax_prepare: scale, rowmax, exp, rowsum - - aiv_online_update: online softmax accumulation + fused normalization -""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -_KERNELS_ROOT = Path(__file__).parent - -# Orchestration config -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), - "function_name": "build_paged_attention_graph", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], -} - -# Kernel configs -KERNELS = [ - # AIC kernels (matrix multiplication using Cube unit) - { - "func_id": 0, - "name": "QK", - "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "PV", - "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - # AIV kernels (vector operations) - { - "func_id": 1, - "name": "SF", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "name": "UP", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), - "core_type": 
"aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, -] - -# Runtime configuration -RUNTIME_CONFIG = { - "runtime": "host_build_graph", - "aicpu_thread_num": 3, - "block_dim": 3, -} diff --git a/examples/a5/host_build_graph/paged_attention/test_paged_attention.py b/examples/a5/host_build_graph/paged_attention/test_paged_attention.py new file mode 100644 index 000000000..7d72b6be1 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/test_paged_attention.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Paged attention — host_build_graph example (small scale, float16). + +AIC+AIV mixed execution with online softmax paged attention. +Small-scale cases for quick validation on A5. 
+""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden +from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs + + +@scene_test(level=2, runtime="host_build_graph") +class TestPagedAttention(SceneTestCase): + """Paged attention with host_build_graph runtime on A5.""" + + RTOL = 1e-2 + ATOL = 1e-2 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/paged_attention_orch.cpp", + "function_name": "build_paged_attention_graph", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "source": "kernels/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/aiv_softmax_prepare.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "source": "kernels/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 3, "block_dim": 3}, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 16, + "max_model_len": 256, + "dtype": "float16", + }, + }, + { + "name": "Case2", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 3, "block_dim": 3}, + "manual": True, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 64, + "max_model_len": 256, + "dtype": "float16", + }, + }, + ] + + def generate_args(self, params): + inputs = 
_pa_generate_inputs(params) + specs = [] + for name, val in inputs: + if isinstance(val, torch.Tensor): + specs.append(Tensor(name, val)) + else: + specs.append(Scalar(name, val)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a5/tensormap_and_ringbuffer/bgemm/golden.py b/examples/a5/tensormap_and_ringbuffer/bgemm/golden.py deleted file mode 100644 index 5ab0590c4..000000000 --- a/examples/a5/tensormap_and_ringbuffer/bgemm/golden.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Golden test specification for BGEMM (tensormap_and_ringbuffer Runtime). 
- -Computation: C = A @ B (tiled matrix multiplication) -Configuration: 4x4x4 grid, 64x64 tiles - -Args layout: [A, B, C] — shape/dtype/size in ContinuousTensor metadata -""" - -import torch - -__outputs__ = ["C"] -RTOL = 1e-3 -ATOL = 1e-3 - -TILE_M = 64 -TILE_K = 64 -TILE_N = 64 - -GRID_M = 4 -GRID_K = 4 -GRID_N = 4 -BATCH = 2 - -M = TILE_M * GRID_M -K = TILE_K * GRID_K -N = TILE_N * GRID_N - - -def generate_inputs(params: dict) -> list: - """Generate input tensors with tile-first memory layout.""" - A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32) * 0.01 - B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32) * 0.01 - C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32) - - A_flat = A.flatten() - B_flat = B.flatten() - C_flat = C.flatten() - - return [ - ("A", A_flat), - ("B", B_flat), - ("C", C_flat), - ] - - -def compute_golden(tensors: dict, params: dict) -> None: - """Compute golden result: C[m,n] = sum(k) A[m,k] @ B[k,n].""" - A = torch.as_tensor(tensors["A"]).reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) - B = torch.as_tensor(tensors["B"]).reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) - C = torch.as_tensor(tensors["C"]).reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) - - C[:] = 0.0 - - for batch in range(BATCH): - for m_idx in range(GRID_M): - for n_idx in range(GRID_N): - for k_idx in range(GRID_K): - C[batch, m_idx, n_idx] += torch.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx]) - - tensors["C"][:] = C.flatten() diff --git a/examples/a5/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py deleted file mode 100644 index 91f2830ec..000000000 --- a/examples/a5/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) PyPTO Contributors. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for BGEMM (tensormap_and_ringbuffer Runtime). - -Cube core (AIC) for matrix multiplication, Vector core (AIV) for accumulation. -Uses TPUSH/TPOP for cube-to-vector data transfer (bypasses GM). -""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -_KERNELS_ROOT = Path(__file__).parent - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "bgemm_orch.cpp"), - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.OUT], -} - -KERNELS = [ - { - "func_id": 0, - "name": "GEMM", - "source": str(_KERNELS_ROOT / "mix" / "kernel_bgemm.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "name": "ADD", - "source": str(_KERNELS_ROOT / "mix" / "kernel_bgemm.cpp"), - "core_type": "aiv", - "signature": [D.INOUT, D.IN], - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 3, -} diff --git a/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py b/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py new file mode 100644 index 000000000..d7bc46a59 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py @@ -0,0 +1,81 @@ 
+#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""BGEMM: batched tiled matrix multiplication C = A @ B. + +Fixed 4x4x4 grid with 64x64 tiles, 2 batches. +Cube core (AIC) for matmul, Vector core (AIV) for accumulation. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + +TILE_M, TILE_K, TILE_N = 64, 64, 64 +GRID_M, GRID_K, GRID_N = 4, 4, 4 +BATCH = 2 + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestBgemm(SceneTestCase): + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/bgemm_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/mix/kernel_bgemm.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/mix/kernel_bgemm.cpp", + "core_type": "aiv", + "signature": [D.INOUT, D.IN], + }, + ], + } + + CASES = [ + { + "name": "default", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 3}, + "params": {}, + } + ] + + def generate_args(self, params): + A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, 
dtype=torch.float32) * 0.01 + B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32) * 0.01 + C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32) + return TaskArgsBuilder(Tensor("A", A.flatten()), Tensor("B", B.flatten()), Tensor("C", C.flatten())) + + def compute_golden(self, args, params): + A = args.A.reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) + B = args.B.reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) + C = args.C.reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) + C[:] = 0.0 + for batch in range(BATCH): + for m in range(GRID_M): + for n in range(GRID_N): + for k in range(GRID_K): + C[batch, m, n] += torch.matmul(A[batch, m, k], B[batch, k, n]) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/golden.py b/examples/a5/tensormap_and_ringbuffer/mixed_example/golden.py deleted file mode 100644 index acf60ee26..000000000 --- a/examples/a5/tensormap_and_ringbuffer/mixed_example/golden.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Golden test specification for mixed AIC+AIV example. - -Covers all 5 resource shapes per iteration: - 1. AIC_AIV_X2: C = A@B, F = D+E, I = G*H - 2. AIC_ONLY: J = A@B - 3. AIV_X1: K = D+E - 4. 
AIV_X2: L = D+E, M = G*H - 5. AIC_AIV_X1: N = A@B, O = D+E - -All use 128x128 float32 tiles, repeated over num_iters slices. - -Args layout (15 args): [A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] - Shape/dtype/size in ContinuousTensor metadata. -""" - -import torch - -__outputs__ = ["C", "F", "I", "J", "K", "L", "M", "N", "O"] -RTOL = 1e-3 -ATOL = 1e-3 - -ALL_CASES = { - "case1": {"num_iters": 4}, - "case2": {"num_iters": 1}, -} - -DEFAULT_CASE = "case1" - -MATMUL_SIZE = 128 -TILE_ELEMS = 128 * 128 - - -def generate_inputs(params: dict) -> list: - num_iters = params["num_iters"] - - torch.manual_seed(42) - - # Matmul inputs (shared by AIC tasks) - A = torch.randn(MATMUL_SIZE, MATMUL_SIZE, dtype=torch.float32) * 0.01 - B = torch.randn(MATMUL_SIZE, MATMUL_SIZE, dtype=torch.float32) * 0.01 - - # Add inputs (shared by AIV add tasks) - D = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 - E = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 - - # Mul inputs (shared by AIV mul tasks) - G = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 - H = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 - - # Output buffers (num_iters slices each) - C = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32) - F = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32) - I_out = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32) # noqa: E741 - J = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32) - K = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32) - L = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32) - M = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32) - N = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32) - O_out = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32) # noqa: E741 - - A_flat = A.flatten() - B_flat = B.flatten() - - return [ - ("A", A_flat), - ("B", B_flat), - ("C", C), - ("D", D), - ("E", E), - ("F", F), - ("G", G), - ("H", H), - ("I", I_out), - ("J", J), - ("K", K), - ("L", L), - ("M", M), - 
("N", N), - ("O", O_out), - ] - - -def compute_golden(tensors: dict, params: dict) -> None: - num_iters = params["num_iters"] - - A = torch.as_tensor(tensors["A"]).reshape(MATMUL_SIZE, MATMUL_SIZE) - B = torch.as_tensor(tensors["B"]).reshape(MATMUL_SIZE, MATMUL_SIZE) - D = torch.as_tensor(tensors["D"]) - E = torch.as_tensor(tensors["E"]) - G = torch.as_tensor(tensors["G"]) - H = torch.as_tensor(tensors["H"]) - - golden_matmul = torch.matmul(A, B).flatten() - golden_add = D + E - golden_mul = G * H - - for name in ["C", "J", "N"]: - out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS) - for i in range(num_iters): - out[i] = golden_matmul - tensors[name][:] = out.flatten() - - for name in ["F", "K", "L", "O"]: - out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS) - for i in range(num_iters): - out[i] = golden_add - tensors[name][:] = out.flatten() - - for name in ["I", "M"]: - out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS) - for i in range(num_iters): - out[i] = golden_mul - tensors[name][:] = out.flatten() diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py deleted file mode 100644 index 796d2b782..000000000 --- a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for mixed AIC+AIV example (tensormap_and_ringbuffer Runtime). - -Covers all 5 resource shapes: - - AIC_ONLY: standalone matmul - - AIV_X1: standalone add - - AIV_X2: add (AIV0) + mul (AIV1) - - AIC_AIV_X1: matmul (AIC) + add (AIV0) - - AIC_AIV_X2: matmul (AIC) + add (AIV0) + mul (AIV1) -""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -_KERNELS_ROOT = Path(__file__).parent - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "mixed_orch.cpp"), - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.OUT, D.IN, D.IN, D.OUT, D.IN, D.IN, D.OUT, D.OUT, D.OUT, D.OUT, D.OUT, D.OUT, D.OUT], -} - -KERNELS = [ - { - "func_id": 0, - "name": "MATMUL", - "source": str(_KERNELS_ROOT / "aic" / "kernel_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "name": "ADD", - "source": str(_KERNELS_ROOT / "aiv" / "kernel_add.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "MUL", - "source": str(_KERNELS_ROOT / "aiv" / "kernel_mul.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 3, - "name": "ADD_STANDALONE", - "source": str(_KERNELS_ROOT / "aiv" / "kernel_add_standalone.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 4, - "name": "MUL_STANDALONE", - "source": str(_KERNELS_ROOT / "aiv" / "kernel_mul_standalone.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 3, -} diff --git 
a/examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py b/examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py new file mode 100644 index 000000000..37a8a92ed --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Mixed AIC+AIV example covering all 5 resource shapes. + + 1. AIC_AIV_X2: C = A@B, F = D+E, I = G*H + 2. AIC_ONLY: J = A@B + 3. AIV_X1: K = D+E + 4. AIV_X2: L = D+E, M = G*H + 5. AIC_AIV_X1: N = A@B, O = D+E + +All use 128x128 float32 tiles, repeated over num_iters slices. 
import torch
from simpler.task_interface import ArgDirection as D

from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test

MATMUL_SIZE = 128
TILE_ELEMS = 128 * 128

# Incore kernels as (func_id, source, core_type). Every kernel reads two
# tensors and writes one, so they all share the same signature.
_INCORE_KERNELS = (
    (0, "kernels/aic/kernel_matmul.cpp", "aic"),
    (1, "kernels/aiv/kernel_add.cpp", "aiv"),
    (2, "kernels/aiv/kernel_mul.cpp", "aiv"),
    (3, "kernels/aiv/kernel_add_standalone.cpp", "aiv"),
    (4, "kernels/aiv/kernel_mul_standalone.cpp", "aiv"),
)

# Tensor names in the positional order of the orchestration signature.
_ARG_NAMES = ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O")


@scene_test(level=2, runtime="tensormap_and_ringbuffer")
class TestMixedExample(SceneTestCase):
    """Mixed AIC+AIV example covering all 5 resource shapes.

      1. AIC_AIV_X2: C = A@B, F = D+E, I = G*H
      2. AIC_ONLY:   J = A@B
      3. AIV_X1:     K = D+E
      4. AIV_X2:     L = D+E, M = G*H
      5. AIC_AIV_X1: N = A@B, O = D+E

    Inputs are single 128x128 float32 tiles; every output tensor holds
    ``num_iters`` tiles, each expected to equal the same reference tile.
    """

    RTOL = 1e-3
    ATOL = 1e-3

    CALLABLE = {
        "orchestration": {
            "source": "kernels/orchestration/mixed_orch.cpp",
            "function_name": "aicpu_orchestration_entry",
            # (A, B -> C), (D, E -> F), (G, H -> I), then output-only J..O.
            "signature": [D.IN, D.IN, D.OUT] * 3 + [D.OUT] * 6,
        },
        "incores": [
            {
                "func_id": func_id,
                "source": source,
                "core_type": core_type,
                "signature": [D.IN, D.IN, D.OUT],
            }
            for func_id, source, core_type in _INCORE_KERNELS
        ],
    }

    CASES = [
        {
            "name": "case1",
            "platforms": ["a5sim", "a5"],
            "config": {"aicpu_thread_num": 4, "block_dim": 3},
            "params": {"num_iters": 4},
        },
        {
            "name": "case2",
            "platforms": ["a5sim", "a5"],
            "config": {"aicpu_thread_num": 4, "block_dim": 3},
            "manual": True,
            "params": {"num_iters": 1},
        },
    ]

    def generate_args(self, params):
        """Build the 15 task tensors: six seeded random inputs and nine zeroed outputs."""
        num_iters = params["num_iters"]
        torch.manual_seed(42)

        def small_randn(*shape):
            return torch.randn(*shape, dtype=torch.float32) * 0.01

        # The draw order (A, B, D, E, G, H) fixes the values under the seed
        # above, so the dict literal must keep exactly this order.
        inputs = {
            "A": small_randn(MATMUL_SIZE, MATMUL_SIZE).flatten(),
            "B": small_randn(MATMUL_SIZE, MATMUL_SIZE).flatten(),
            "D": small_randn(TILE_ELEMS),
            "E": small_randn(TILE_ELEMS),
            "G": small_randn(TILE_ELEMS),
            "H": small_randn(TILE_ELEMS),
        }

        out_len = num_iters * TILE_ELEMS
        specs = [
            Tensor(name, inputs[name] if name in inputs else torch.zeros(out_len, dtype=torch.float32))
            for name in _ARG_NAMES
        ]
        return TaskArgsBuilder(*specs)

    def compute_golden(self, args, params):
        """Fill every output tensor in ``args`` in place with its reference tile."""
        num_iters = params["num_iters"]

        lhs = args.A.reshape(MATMUL_SIZE, MATMUL_SIZE)
        rhs = args.B.reshape(MATMUL_SIZE, MATMUL_SIZE)

        # All references are computed before any output is written.
        references = (
            (("C", "J", "N"), torch.matmul(lhs, rhs).flatten()),
            (("F", "K", "L", "O"), args.D + args.E),
            (("I", "M"), args.G * args.H),
        )

        for names, expected in references:
            for name in names:
                tiles = getattr(args, name).reshape(num_iters, TILE_ELEMS)
                # Broadcast the single reference tile across all num_iters rows.
                tiles[:] = expected


if __name__ == "__main__":
    SceneTestCase.run_module(__name__)
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged Attention Golden - tensormap_and_ringbuffer example (small scale, float16).""" - -from simpler_setup.goldens.paged_attention import ( - compute_golden, # noqa: F401 - run_golden_test, -) -from simpler_setup.goldens.paged_attention import generate_inputs as _generate_inputs - -__outputs__ = ["out"] - -RTOL = 1e-2 -ATOL = 1e-2 - -ALL_CASES = { - "Case1": { - "batch": 1, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 16, - "block_size": 16, - "context_len": 33, - "max_model_len": 256, - "dtype": "float16", - }, - "Case2": { - "batch": 1, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 16, - "block_size": 16, - "context_len": 128, - "max_model_len": 256, - "dtype": "float16", - }, - "CaseVarSeq2": { - "batch": 2, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 16, - "block_size": 16, - "context_len": 33, - "context_lens_list": [33, 17], - "max_model_len": 256, - "dtype": "float16", - }, - "CaseVarSeq4": { - "batch": 4, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 16, - "block_size": 16, - "context_len": 128, - "context_lens_list": [33, 64, 128, 15], - "max_model_len": 256, - "dtype": "float16", - }, -} - -DEFAULT_CASE = "Case1" - - -def generate_inputs(params: dict) -> list: - return _generate_inputs(params) - - -if __name__ == "__main__": - run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py deleted file mode 100644 index eb373f968..000000000 --- 
a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Paged Attention Kernel and Orchestration Configuration - -Defines the kernels and orchestration function for paged attention -with AIC/AIV subgraph splitting: - -AIC Kernels (Matrix Multiplication): - - aic_qk_matmul: Q @ K^T computation - - aic_pv_matmul: P @ V computation - -AIV Kernels (Vector Operations): - - aiv_softmax_prepare: scale, rowmax, exp, rowsum - - aiv_online_update: online softmax accumulation + fused normalization - -Note: aiv_normalize has been merged into aiv_online_update for efficiency. 
-""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -_KERNELS_ROOT = Path(__file__).parent - -# Orchestration config -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], -} - -# Kernel configs (aiv_normalize removed - merged into aiv_online_update) -KERNELS = [ - # AIC kernels (matrix multiplication using Cube unit) - { - "func_id": 0, - "name": "QK", - "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "PV", - "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - # AIV kernels (vector operations) - { - "func_id": 1, - "name": "SF", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "name": "UP", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, -] - -# Runtime configuration -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py new file mode 100644 index 000000000..2e6eb99fb --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
import torch
from simpler.task_interface import ArgDirection as D

from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs


def _small_case(name, batch, context_len, context_lens_list=None, manual=True):
    """Return one CASES entry for the small-scale float16 configuration.

    Keys are inserted in the same order as the hand-written dictionaries this
    helper replaces; ``manual`` and ``context_lens_list`` are only present
    when they apply.
    """
    params = {
        "batch": batch,
        "num_heads": 16,
        "kv_head_num": 1,
        "head_dim": 16,
        "block_size": 16,
        "context_len": context_len,
    }
    if context_lens_list is not None:
        params["context_lens_list"] = list(context_lens_list)
    params["max_model_len"] = 256
    params["dtype"] = "float16"

    case = {
        "name": name,
        "platforms": ["a5sim", "a5"],
        "config": {"aicpu_thread_num": 4, "block_dim": 24},
    }
    if manual:
        case["manual"] = True
    case["params"] = params
    return case


@scene_test(level=2, runtime="tensormap_and_ringbuffer")
class TestPagedAttention(SceneTestCase):
    """Paged attention on the tensormap_and_ringbuffer runtime (A5, small scale, float16).

    Two AIC matmul kernels (QK, PV) and two AIV kernels (softmax prepare,
    online update) are registered; the cases include variable per-batch
    sequence lengths.
    """

    RTOL = 1e-2
    ATOL = 1e-2

    CALLABLE = {
        "orchestration": {
            "source": "kernels/orchestration/paged_attention_orch.cpp",
            "function_name": "aicpu_orchestration_entry",
            "signature": [D.IN] * 5 + [D.OUT],
        },
        # Listed AIC kernels first, then AIV kernels; func_id is not positional.
        "incores": [
            {
                "func_id": 0,
                "source": "kernels/aic/aic_qk_matmul.cpp",
                "core_type": "aic",
                "signature": [D.IN, D.IN, D.OUT],
            },
            {
                "func_id": 2,
                "source": "kernels/aic/aic_pv_matmul.cpp",
                "core_type": "aic",
                "signature": [D.IN, D.IN, D.OUT],
            },
            {
                "func_id": 1,
                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
                "core_type": "aiv",
                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
            },
            {
                "func_id": 3,
                "source": "kernels/aiv/aiv_online_update.cpp",
                "core_type": "aiv",
                "signature": [D.IN] * 3 + [D.INOUT] * 4,
            },
        ],
    }

    CASES = [
        _small_case("Case1", batch=1, context_len=33, manual=False),
        _small_case("Case2", batch=1, context_len=128),
        _small_case("CaseVarSeq2", batch=2, context_len=33, context_lens_list=[33, 17]),
        _small_case("CaseVarSeq4", batch=4, context_len=128, context_lens_list=[33, 64, 128, 15]),
    ]

    def generate_args(self, params):
        """Wrap the shared golden inputs: torch tensors become Tensor specs, the rest Scalar specs."""
        return TaskArgsBuilder(
            *(
                Tensor(name, value) if isinstance(value, torch.Tensor) else Scalar(name, value)
                for name, value in _pa_generate_inputs(params)
            )
        )

    def compute_golden(self, args, params):
        """Run the shared paged-attention golden and copy its tensors back into ``args``."""
        tensor_specs = [spec for spec in args.specs if isinstance(spec, Tensor)]
        tensors = {spec.name: spec.value for spec in tensor_specs}

        # The shared golden works on a name -> tensor dict.
        _pa_compute_golden(tensors, params)

        for spec in tensor_specs:
            if spec.name in tensors:
                getattr(args, spec.name)[:] = tensors[spec.name]


if __name__ == "__main__":
    SceneTestCase.run_module(__name__)
--git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_basic/golden.py deleted file mode 100644 index 0be689b66..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_basic/golden.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Golden test for SPMD context accessors (Phase 2: block_dim=1). - -Verifies that get_block_idx and get_block_num return correct values for all -three subtask slots (AIC, AIV0, AIV1) in a MIX task, and that AIV -kernels read the correct sub_block_id from GlobalContext. - -Phase 2 invariants: block_idx=0, block_num=1. -GlobalContext: sub_block_id 0 (AIV0/left), 1 (AIV1/right). 
- -Output layout (float32[48], 3 cache lines): - [0..15] = AIC slot: [block_idx, block_num, pad x14] - [16..31] = AIV0 slot: [block_idx, block_num, sub_block_id=0, pad x13] - [32..47] = AIV1 slot: [block_idx, block_num, sub_block_id=1, pad x13] - -Args layout: [output] -""" - -import torch - -__outputs__ = ["output"] -RTOL = 0 -ATOL = 0 - -ALL_CASES = { - "Case1": {}, -} - -DEFAULT_CASE = "Case1" - -# 16 floats per slot = 64 bytes = 1 cache line -FLOATS_PER_CACHE_LINE = 16 - - -def generate_inputs(params: dict) -> list: - output = torch.zeros(3 * FLOATS_PER_CACHE_LINE, dtype=torch.float32) - return [ - ("output", output), - ] - - -def compute_golden(tensors: dict, params: dict) -> None: - out = torch.as_tensor(tensors["output"]) - # Cache line 0: AIC (no sub_block_id) - out[0] = 0.0 # block_idx - out[1] = 1.0 # block_num - # Cache line 1: AIV0 (sub_block_id=0) - base = 1 * FLOATS_PER_CACHE_LINE - out[base + 0] = 0.0 # block_idx - out[base + 1] = 1.0 # block_num - out[base + 2] = 0.0 # sub_block_id - # Cache line 2: AIV1 (sub_block_id=1) - base = 2 * FLOATS_PER_CACHE_LINE - out[base + 0] = 0.0 # block_idx - out[base + 1] = 1.0 # block_num - out[base + 2] = 1.0 # sub_block_id - tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/kernel_config.py deleted file mode 100644 index 8be342352..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/kernel_config.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for SPMD basic test (tensormap_and_ringbuffer Runtime). - -Submits a single MIX task (AIC + AIV0 + AIV1) so all three sub_block_id -values are exercised in one dispatch. -""" - -from pathlib import Path - -_KERNELS_ROOT = Path(__file__).parent - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "spmd_basic_orch.cpp"), - "function_name": "aicpu_orchestration_entry", -} - -KERNELS = [ - { - "func_id": 0, - "name": "SPMD_READ_AIC", - "source": str(_KERNELS_ROOT / "aic" / "kernel_spmd_read.cpp"), - "core_type": "aic", - }, - { - "func_id": 1, - "name": "SPMD_READ_AIV0", - "source": str(_KERNELS_ROOT / "aiv" / "kernel_spmd_read.cpp"), - "core_type": "aiv", - }, - { - "func_id": 2, - "name": "SPMD_READ_AIV1", - "source": str(_KERNELS_ROOT / "aiv" / "kernel_spmd_read.cpp"), - "core_type": "aiv", - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py b/examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py new file mode 100644 index 000000000..55d4cbfb7 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
import torch
from simpler.task_interface import ArgDirection as D

from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test

# 16 float32 values = 64 bytes; each subtask slot owns one such line.
FLOATS_PER_CACHE_LINE = 16

# Expected leading values of each slot, in output order (AIC, AIV0, AIV1).
# AIC writes (block_idx, block_num); the AIV slots also write sub_block_id.
_EXPECTED_SLOTS = (
    (0.0, 1.0),
    (0.0, 1.0, 0.0),
    (0.0, 1.0, 1.0),
)


@scene_test(level=2, runtime="tensormap_and_ringbuffer")
class TestSpmdBasic(SceneTestCase):
    """SPMD context accessors checked through a single MIX task (AIC + AIV0 + AIV1).

    Output layout (float32[48], 3 cache lines):
      [0..15]  = AIC slot:  [block_idx, block_num, pad x14]
      [16..31] = AIV0 slot: [block_idx, block_num, sub_block_id=0, pad x13]
      [32..47] = AIV1 slot: [block_idx, block_num, sub_block_id=1, pad x13]

    The golden expects block_idx=0 and block_num=1 in every slot; comparison
    is exact (zero tolerance).
    """

    RTOL = 0
    ATOL = 0

    CALLABLE = {
        "orchestration": {
            "source": "kernels/orchestration/spmd_basic_orch.cpp",
            "function_name": "aicpu_orchestration_entry",
            "signature": [D.INOUT],
        },
        "incores": [
            {"func_id": 0, "source": "kernels/aic/kernel_spmd_read.cpp", "core_type": "aic"},
            {"func_id": 1, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
            {"func_id": 2, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
        ],
    }

    CASES = [
        {
            "name": "Case1",
            "platforms": ["a5sim", "a5"],
            "config": {"aicpu_thread_num": 4, "block_dim": 24},
            "params": {},
        },
    ]

    def generate_args(self, params):
        """Allocate the zero-initialised three-cache-line output tensor."""
        n_floats = len(_EXPECTED_SLOTS) * FLOATS_PER_CACHE_LINE
        return TaskArgsBuilder(Tensor("output", torch.zeros(n_floats, dtype=torch.float32)))

    def compute_golden(self, args, params):
        """Write the expected context values at the start of each slot's cache line."""
        out = args.output
        for slot, values in enumerate(_EXPECTED_SLOTS):
            base = slot * FLOATS_PER_CACHE_LINE
            for offset, value in enumerate(values):
                out[base + offset] = value


if __name__ == "__main__":
    SceneTestCase.run_module(__name__)
-Output tensor: 188 cache lines = 3008 float32. - -Args layout: [output] -""" - -import torch - -__outputs__ = ["output"] -RTOL = 0 -ATOL = 0 - -ALL_CASES = { - "Case1": {}, -} - -DEFAULT_CASE = "Case1" - -FLOATS_PER_CACHE_LINE = 16 - -# (block_num, base_cl) for each submitted task -TASKS = [ - (4, 0), # T0: basic - (16, 4), # T1: saturate single thread - (24, 20), # T2: cross-thread - (48, 44), # T3: all AIV cores - (96, 92), # T4: two full rounds -] - -TOTAL_CL = sum(block_num for block_num, _ in TASKS) # 44 - - -def generate_inputs(params: dict) -> list: - output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) - return [ - ("output", output), - ] - - -def compute_golden(tensors: dict, params: dict) -> None: - out = torch.as_tensor(tensors["output"]) - for block_num, base_cl in TASKS: - for block_idx in range(block_num): - out[(base_cl + block_idx) * FLOATS_PER_CACHE_LINE] = float(block_idx) - tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/kernel_config.py deleted file mode 100644 index 68eccd9f7..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/kernel_config.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for SPMD multi-block AIV test (tensormap_and_ringbuffer Runtime). - -Submits a single AIV task with block_num=4 so each block writes its -block_idx at a distinct cacheline-aligned offset. -""" - -from pathlib import Path - -_KERNELS_ROOT = Path(__file__).parent - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "spmd_multiblock_aiv_orch.cpp"), - "function_name": "aicpu_orchestration_entry", -} - -KERNELS = [ - { - "func_id": 0, - "name": "SPMD_WRITE_AIV", - "source": str(_KERNELS_ROOT / "aiv" / "kernel_spmd_write.cpp"), - "core_type": "aiv", - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py new file mode 100644 index 000000000..58becb0b8 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
import torch
from simpler.task_interface import ArgDirection as D

from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test

FLOATS_PER_CACHE_LINE = 16

# (block_num, base_cl) for each submitted task; every base_cl equals the sum
# of the preceding block_num values, so the tasks tile the output back to back.
TASKS = [
    (4, 0),
    (16, 4),
    (24, 20),
    (48, 44),
    (96, 92),
]

# One cache line per block across all tasks.
TOTAL_CL = sum(task[0] for task in TASKS)


@scene_test(level=2, runtime="tensormap_and_ringbuffer")
class TestSpmdMultiblockAiv(SceneTestCase):
    """SPMD multi-block AIV: five AIV tasks with varying block_num.

      T0 (block_num=4):  basic multi-block
      T1 (block_num=16): saturates one sched thread
      T2 (block_num=24): forces cross-thread dispatch
      T3 (block_num=48): occupies all AIV cores across all 3 sched threads
      T4 (block_num=96): two full rounds of all AIV cores

    The golden expects float(block_idx) in the first float of cache line
    (base_cl + block_idx) and zeros elsewhere; comparison is exact.
    """

    RTOL = 0
    ATOL = 0

    CALLABLE = {
        "orchestration": {
            "source": "kernels/orchestration/spmd_multiblock_aiv_orch.cpp",
            "function_name": "aicpu_orchestration_entry",
            "signature": [D.INOUT],
        },
        "incores": [
            {"func_id": 0, "source": "kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
        ],
    }

    CASES = [
        {
            "name": "Case1",
            "platforms": ["a5sim", "a5"],
            "config": {"aicpu_thread_num": 4, "block_dim": 24},
            "params": {},
        },
    ]

    def generate_args(self, params):
        """Allocate the zero-initialised output covering every block's cache line."""
        n_floats = TOTAL_CL * FLOATS_PER_CACHE_LINE
        return TaskArgsBuilder(Tensor("output", torch.zeros(n_floats, dtype=torch.float32)))

    def compute_golden(self, args, params):
        """Store each block's index in the first float of that block's cache line."""
        out = args.output
        for block_num, base_cl in TASKS:
            cache_lines = range(base_cl, base_cl + block_num)
            for block_idx, cache_line in enumerate(cache_lines):
                out[cache_line * FLOATS_PER_CACHE_LINE] = float(block_idx)


if __name__ == "__main__":
    SceneTestCase.run_module(__name__)
file mode 100644 index 9751813d7..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/golden.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Golden test for SPMD multi-block MIX. - -Submits five MIX tasks (AIC + AIV0 + AIV1) with block_num = 2, 8, 12, 24, 48 to verify: - T0 (block_num=2): basic multi-block MIX - T1 (block_num=8): saturates one sched thread (8 clusters) - T2 (block_num=12): forces cross-thread dispatch via ready_queue re-push - T3 (block_num=24): occupies all clusters across all 3 sched threads - T4 (block_num=48): two full rounds of all clusters - -Each block occupies 3 cache lines (AIC, AIV0, AIV1). All three cores -in the same block write the same float(block_idx) to their respective CL. - -Output tensor: 282 cache lines = 4512 float32. 
- -Args layout: [output] -""" - -import torch - -__outputs__ = ["output"] -RTOL = 0 -ATOL = 0 - -ALL_CASES = { - "Case1": {}, -} - -DEFAULT_CASE = "Case1" - -FLOATS_PER_CACHE_LINE = 16 -SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 - -# (block_num, base_cl) for each submitted task -TASKS = [ - (2, 0), # T0: basic MIX (6 CL) - (8, 6), # T1: saturate single thread (24 CL) - (12, 30), # T2: cross-thread (36 CL) - (24, 66), # T3: all clusters (72 CL) - (48, 138), # T4: two full rounds (144 CL) -] - -TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 66 - - -def generate_inputs(params: dict) -> list: - output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) - return [ - ("output", output), - ] - - -def compute_golden(tensors: dict, params: dict) -> None: - out = torch.as_tensor(tensors["output"]) - for block_num, base_cl in TASKS: - for block_idx in range(block_num): - for slot in range(SLOTS_PER_BLOCK): - cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot - out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) - tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/kernel_config.py deleted file mode 100644 index 9f7a517ef..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/kernel_config.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for SPMD multi-block MIX test (tensormap_and_ringbuffer Runtime). - -Submits a single MIX task (AIC + AIV0 + AIV1) with block_num=2 so all -three subtask slots in both blocks see the correct block_idx. -""" - -from pathlib import Path - -_KERNELS_ROOT = Path(__file__).parent - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "spmd_multiblock_mix_orch.cpp"), - "function_name": "aicpu_orchestration_entry", -} - -KERNELS = [ - { - "func_id": 0, - "name": "SPMD_MIX_AIC", - "source": str(_KERNELS_ROOT / "aic" / "kernel_spmd_mix.cpp"), - "core_type": "aic", - }, - { - "func_id": 1, - "name": "SPMD_MIX_AIV0", - "source": str(_KERNELS_ROOT / "aiv" / "kernel_spmd_mix.cpp"), - "core_type": "aiv", - }, - { - "func_id": 2, - "name": "SPMD_MIX_AIV1", - "source": str(_KERNELS_ROOT / "aiv" / "kernel_spmd_mix.cpp"), - "core_type": "aiv", - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py new file mode 100644 index 000000000..1bac22c74 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""SPMD multi-block MIX: five MIX tasks with varying block_num. + + T0 (block_num=2): basic multi-block MIX + T1 (block_num=8): saturates one sched thread + T2 (block_num=12): forces cross-thread dispatch + T3 (block_num=24): occupies all clusters across all 3 sched threads + T4 (block_num=48): two full rounds of all clusters + +Each block occupies 3 cache lines (AIC, AIV0, AIV1). All three cores +in the same block write float(block_idx) to their respective CL. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 + +TASKS = [ + (2, 0), + (8, 6), + (12, 30), + (24, 66), + (48, 138), +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestSpmdMultiblockMix(SceneTestCase): + RTOL = 0 + ATOL = 0 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/spmd_multiblock_mix_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.INOUT], + }, + "incores": [ + {"func_id": 0, "source": "kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, + {"func_id": 1, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + {"func_id": 2, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {}, + }, + ] + + def generate_args(self, 
params): + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return TaskArgsBuilder(Tensor("output", output)) + + def compute_golden(self, args, params): + out = args.output + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py deleted file mode 100644 index 2e85b0fb6..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Golden test for SPMD starvation prevention. - -Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks and -verifies all 20 tasks complete with correct output. The test validates that -the drain mechanism prevents sync_start tasks from being starved. 
- -Layout: - Wave 1: 6 x normal(block_num=4) -> CL 0..71 - Sync 0: 1 x sync_start(block_num=6) -> CL 72..89 - Wave 2: 6 x normal(block_num=4) -> CL 90..161 - Sync 1: 1 x sync_start(block_num=6) -> CL 162..179 - Wave 3: 6 x normal(block_num=4) -> CL 180..251 - -Total: 252 CL = 4032 float32. - -Args layout: [output] -""" - -import torch - -__outputs__ = ["output"] -RTOL = 0 -ATOL = 0 - -ALL_CASES = { - "Case1": {}, -} - -DEFAULT_CASE = "Case1" - -FLOATS_PER_CACHE_LINE = 16 -SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 -NORMAL_BLOCK_NUM = 4 -SYNC_BLOCK_NUM = 6 -NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK # 12 -SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK # 18 - - -# Build flat task list as (block_num, base_cl) -def _build_tasks(): - tasks = [] - cl = 0 - for _ in range(6): - tasks.append((NORMAL_BLOCK_NUM, cl)) - cl += NORMAL_CL - tasks.append((SYNC_BLOCK_NUM, cl)) - cl += SYNC_CL - for _ in range(6): - tasks.append((NORMAL_BLOCK_NUM, cl)) - cl += NORMAL_CL - tasks.append((SYNC_BLOCK_NUM, cl)) - cl += SYNC_CL - for _ in range(6): - tasks.append((NORMAL_BLOCK_NUM, cl)) - cl += NORMAL_CL - return tasks - - -TASKS = _build_tasks() -TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS) # 252 - - -def generate_inputs(params: dict) -> list: - output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) - return [("output", output)] - - -def compute_golden(tensors: dict, params: dict) -> None: - out = torch.as_tensor(tensors["output"]) - for block_num, base_cl in TASKS: - for block_idx in range(block_num): - for slot in range(SLOTS_PER_BLOCK): - cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot - out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) - tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py deleted file mode 100644 index 602265c7e..000000000 --- 
a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for SPMD starvation prevention test (tensormap_and_ringbuffer Runtime). - -Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks to verify -the drain mechanism prevents sync_start tasks from being starved. -Reuses the same AIC/AIV kernels from spmd_multiblock_mix. 
-""" - -from pathlib import Path - -_KERNELS_ROOT = Path(__file__).parent -_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "spmd_starvation_orch.cpp"), - "function_name": "aicpu_orchestration_entry", -} - -KERNELS = [ - { - "func_id": 0, - "name": "SPMD_MIX_AIC", - "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), - "core_type": "aic", - }, - { - "func_id": 1, - "name": "SPMD_MIX_AIV0", - "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), - "core_type": "aiv", - }, - { - "func_id": 2, - "name": "SPMD_MIX_AIV1", - "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), - "core_type": "aiv", - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py new file mode 100644 index 000000000..425ccdab0 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""SPMD starvation prevention: 18 normal MIX + 2 sync_start MIX tasks. + +Validates that the drain mechanism prevents sync_start tasks from being starved. + +Layout: + Wave 1: 6 x normal(block_num=4) -> CL 0..71 + Sync 0: 1 x sync_start(block_num=6) -> CL 72..89 + Wave 2: 6 x normal(block_num=4) -> CL 90..161 + Sync 1: 1 x sync_start(block_num=6) -> CL 162..179 + Wave 3: 6 x normal(block_num=4) -> CL 180..251 + +Total: 252 CL = 4032 float32. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 +NORMAL_BLOCK_NUM = 4 +SYNC_BLOCK_NUM = 6 +NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK +SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK + + +def _build_tasks(): + tasks = [] + cl = 0 + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + tasks.append((SYNC_BLOCK_NUM, cl)) + cl += SYNC_CL + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + tasks.append((SYNC_BLOCK_NUM, cl)) + cl += SYNC_CL + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + return tasks + + +TASKS = _build_tasks() +TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS) + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestSpmdStarvation(SceneTestCase): + RTOL = 0 + ATOL = 0 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/spmd_starvation_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.INOUT], + }, + "incores": [ + {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, + {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + {"func_id": 2, "source": 
"../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {}, + }, + ] + + def generate_args(self, params): + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return TaskArgsBuilder(Tensor("output", output)) + + def compute_golden(self, args, params): + out = args.output + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py deleted file mode 100644 index 33acd1c1a..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Golden test for SPMD sync_start. - -Submits 4 MIX tasks (3 with require_sync_start=true, 1 baseline) and verifies -all blocks of every task write the correct float(block_idx) to their cache line. 
- -Tasks (AIC=slot0, AIV0=slot1, AIV1=slot2): - T0: block_num=2, sync_start=True -> CL 0..5 - T1: block_num=8, sync_start=True -> CL 6..29 - T2: block_num=2, sync_start=False -> CL 30..35 (baseline) - T3: block_num=12, sync_start=True -> CL 36..71 - -Output tensor: 72 cache lines = 1152 float32. - -Args layout: [output] -""" - -import torch - -__outputs__ = ["output"] -RTOL = 0 -ATOL = 0 - -ALL_CASES = { - "Case1": {}, -} - -DEFAULT_CASE = "Case1" - -FLOATS_PER_CACHE_LINE = 16 -SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 - -# (block_num, base_cl) for each submitted task -TASKS = [ - (2, 0), # T0: sync_start=True - (8, 6), # T1: sync_start=True - (2, 30), # T2: sync_start=False (baseline) - (12, 36), # T3: sync_start=True -] - -TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 72 - - -def generate_inputs(params: dict) -> list: - output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) - return [ - ("output", output), - ] - - -def compute_golden(tensors: dict, params: dict) -> None: - out = torch.as_tensor(tensors["output"]) - for block_num, base_cl in TASKS: - for block_idx in range(block_num): - for slot in range(SLOTS_PER_BLOCK): - cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot - out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) - tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py deleted file mode 100644 index c689263d5..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for SPMD sync_start test (tensormap_and_ringbuffer Runtime). - -Submits MIX tasks with require_sync_start=true to verify atomic batch launch. -Reuses the same AIC/AIV kernels from spmd_multiblock_mix. -""" - -from pathlib import Path - -_KERNELS_ROOT = Path(__file__).parent -_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_orch.cpp"), - "function_name": "aicpu_orchestration_entry", -} - -KERNELS = [ - { - "func_id": 0, - "name": "SPMD_MIX_AIC", - "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), - "core_type": "aic", - }, - { - "func_id": 1, - "name": "SPMD_MIX_AIV0", - "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), - "core_type": "aiv", - }, - { - "func_id": 2, - "name": "SPMD_MIX_AIV1", - "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), - "core_type": "aiv", - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py new file mode 100644 index 000000000..18320397e --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""SPMD sync_start: 4 MIX tasks (3 sync_start + 1 baseline). + +Tasks (AIC=slot0, AIV0=slot1, AIV1=slot2): + T0: block_num=2, sync_start=True -> CL 0..5 + T1: block_num=8, sync_start=True -> CL 6..29 + T2: block_num=2, sync_start=False -> CL 30..35 (baseline) + T3: block_num=12, sync_start=True -> CL 36..71 + +Output tensor: 72 cache lines = 1152 float32. 
+""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 + +TASKS = [ + (2, 0), + (8, 6), + (2, 30), + (12, 36), +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestSpmdSyncStart(SceneTestCase): + RTOL = 0 + ATOL = 0 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/spmd_sync_start_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.INOUT], + }, + "incores": [ + {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, + {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {}, + }, + ] + + def generate_args(self, params): + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return TaskArgsBuilder(Tensor("output", output)) + + def compute_golden(self, args, params): + out = args.output + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py deleted file mode 100644 index 3c60f1ac8..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) PyPTO Contributors. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Golden test for SPMD sync_start with AIV-only tasks. - -Submits 4 AIV tasks (3 with require_sync_start=true, 1 baseline) to exercise -the AIV-specific fast path (count_idle_aiv_cores) and drain slow path. - -Tasks: - T0: block_num=4, sync_start=True -> CL 0..3 (fast path) - T1: block_num=16, sync_start=True -> CL 4..19 (saturate one thread) - T2: block_num=4, sync_start=False -> CL 20..23 (baseline) - T3: block_num=24, sync_start=True -> CL 24..47 (cross-thread drain) - -Output tensor: 48 cache lines = 768 float32. 
- -Args layout: [output] -""" - -import torch - -__outputs__ = ["output"] -RTOL = 0 -ATOL = 0 - -ALL_CASES = { - "Case1": {}, -} - -DEFAULT_CASE = "Case1" - -FLOATS_PER_CACHE_LINE = 16 - -# (block_num, base_cl) for each submitted task -TASKS = [ - (4, 0), # T0: sync_start=True, fast path - (16, 4), # T1: sync_start=True, saturate single thread - (4, 20), # T2: sync_start=False, baseline - (24, 24), # T3: sync_start=True, cross-thread drain -] - -TOTAL_CL = sum(block_num for block_num, _ in TASKS) # 48 - - -def generate_inputs(params: dict) -> list: - output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) - return [("output", output)] - - -def compute_golden(tensors: dict, params: dict) -> None: - out = torch.as_tensor(tensors["output"]) - for block_num, base_cl in TASKS: - for block_idx in range(block_num): - cl = base_cl + block_idx - out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) - tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py deleted file mode 100644 index bb97aaee2..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for SPMD sync_start AIV test (tensormap_and_ringbuffer Runtime). - -Submits AIV tasks with require_sync_start=true to verify atomic batch launch -and the AIV-specific fast path (count_idle_aiv_cores). -Reuses the same AIV kernel from spmd_multiblock_aiv. -""" - -from pathlib import Path - -_KERNELS_ROOT = Path(__file__).parent -_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels" - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_aiv_orch.cpp"), - "function_name": "aicpu_orchestration_entry", -} - -KERNELS = [ - { - "func_id": 0, - "name": "SPMD_WRITE_AIV", - "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"), - "core_type": "aiv", - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py new file mode 100644 index 000000000..8a434caa5 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""SPMD sync_start AIV: 4 AIV tasks (3 sync_start + 1 baseline). + +Exercises AIV-specific fast path (count_idle_aiv_cores) and drain slow path. + +Tasks: + T0: block_num=4, sync_start=True -> CL 0..3 (fast path) + T1: block_num=16, sync_start=True -> CL 4..19 (saturate one thread) + T2: block_num=4, sync_start=False -> CL 20..23 (baseline) + T3: block_num=24, sync_start=True -> CL 24..47 (cross-thread drain) + +Output tensor: 48 cache lines = 768 float32. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + +FLOATS_PER_CACHE_LINE = 16 + +TASKS = [ + (4, 0), + (16, 4), + (4, 20), + (24, 24), +] + +TOTAL_CL = sum(block_num for block_num, _ in TASKS) + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestSpmdSyncStartAiv(SceneTestCase): + RTOL = 0 + ATOL = 0 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/spmd_sync_start_aiv_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.INOUT], + }, + "incores": [ + {"func_id": 0, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"}, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {}, + }, + ] + + def generate_args(self, params): + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return TaskArgsBuilder(Tensor("output", output)) + + def compute_golden(self, args, params): + out = args.output + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + cl = base_cl + block_idx + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py 
b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py deleted file mode 100644 index 2bfcaea4a..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Golden test for SPMD sync_start boundary conditions. - -Tests edge-case block_num values relative to per-thread cluster capacity (8 clusters -with 3 sched threads = 24 total clusters, 48 total AIV cores). - -MIX tasks (SLOTS_PER_BLOCK=3): - T0: block_num=1, sync_start=True -> CL 0..2 (degenerate: always fast path) - T1: block_num=8, sync_start=True -> CL 3..26 (exactly one thread's capacity) - T2: block_num=9, sync_start=True -> CL 27..53 (one over: must enter drain) - T3: block_num=23, sync_start=True -> CL 54..122 (max valid: total_clusters - 1) - T4: block_num=1, sync_start=False -> CL 123..125 (baseline) - -Output tensor: 126 cache lines = 2016 float32. 
- -Args layout: [output] -""" - -import torch - -__outputs__ = ["output"] -RTOL = 0 -ATOL = 0 - -ALL_CASES = { - "Case1": {}, -} - -DEFAULT_CASE = "Case1" - -FLOATS_PER_CACHE_LINE = 16 -SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 - -# (block_num, base_cl) for each submitted task -TASKS = [ - (1, 0), # T0: sync=True, degenerate - (8, 3), # T1: sync=True, exactly one thread's clusters - (9, 27), # T2: sync=True, one over -> drain - (23, 54), # T3: sync=True, max valid (total_clusters - 1) - (1, 123), # T4: sync=False, baseline -] - -TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 126 - - -def generate_inputs(params: dict) -> list: - output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) - return [("output", output)] - - -def compute_golden(tensors: dict, params: dict) -> None: - out = torch.as_tensor(tensors["output"]) - for block_num, base_cl in TASKS: - for block_idx in range(block_num): - for slot in range(SLOTS_PER_BLOCK): - cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot - out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) - tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py deleted file mode 100644 index 30a9ebd1f..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for SPMD sync_start boundary test (tensormap_and_ringbuffer Runtime). - -Tests edge-case block_num values relative to per-thread cluster capacity. -Reuses the same AIC/AIV kernels from spmd_multiblock_mix. -""" - -from pathlib import Path - -_KERNELS_ROOT = Path(__file__).parent -_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_edge_orch.cpp"), - "function_name": "aicpu_orchestration_entry", -} - -KERNELS = [ - { - "func_id": 0, - "name": "SPMD_MIX_AIC", - "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), - "core_type": "aic", - }, - { - "func_id": 1, - "name": "SPMD_MIX_AIV0", - "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), - "core_type": "aiv", - }, - { - "func_id": 2, - "name": "SPMD_MIX_AIV1", - "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), - "core_type": "aiv", - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py new file mode 100644 index 000000000..11a728a02 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""SPMD sync_start boundary conditions. + +Tests edge-case block_num values relative to per-thread cluster capacity +(8 clusters x 3 sched threads = 24 total clusters, 48 total AIV cores). + +MIX tasks (SLOTS_PER_BLOCK=3): + T0: block_num=1, sync_start=True -> CL 0..2 (degenerate: always fast path) + T1: block_num=8, sync_start=True -> CL 3..26 (exactly one thread's capacity) + T2: block_num=9, sync_start=True -> CL 27..53 (one over: must enter drain) + T3: block_num=23, sync_start=True -> CL 54..122 (max valid: total_clusters - 1) + T4: block_num=1, sync_start=False -> CL 123..125 (baseline) + +Output tensor: 126 cache lines = 2016 float32. 
+""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 + +TASKS = [ + (1, 0), + (8, 3), + (9, 27), + (23, 54), + (1, 123), +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestSpmdSyncStartEdge(SceneTestCase): + RTOL = 0 + ATOL = 0 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/spmd_sync_start_edge_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.INOUT], + }, + "incores": [ + {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, + {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {}, + }, + ] + + def generate_args(self, params): + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return TaskArgsBuilder(Tensor("output", output)) + + def compute_golden(self, args, params): + out = args.output + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py deleted file mode 100644 index 3315360df..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 
PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Golden test for SPMD sync_start stress / CAS contention with mixed shapes. - -Submits 6 rounds of mixed-shape tasks to stress drain CAS contention, ack -barrier, and state cleanup across drain cycles. All three resource shapes -(MIX, AIV, AIC) are exercised with both sync and non-sync modes. 
- -Each round (9 tasks): - 4 x normal MIX (block_num=4, sync=false) -> 4 x 4 x 3 = 48 CL - 2 x sync MIX (block_num=12, sync=true) -> 2 x 12 x 3 = 72 CL - 2 x sync AIV (block_num=8, sync=true) -> 2 x 8 x 1 = 16 CL - 1 x normal AIV (block_num=4, sync=false) -> 1 x 4 x 1 = 4 CL - Round total: 140 CL - -6 rounds -> 54 tasks (24 normal MIX + 12 sync MIX + 12 sync AIV + 6 normal AIV) -Grand total: 840 CL = 13440 float32 - -Args layout: [output] -""" - -import torch - -__outputs__ = ["output"] -RTOL = 0 -ATOL = 0 - -ALL_CASES = { - "Case1": {}, -} - -DEFAULT_CASE = "Case1" - -FLOATS_PER_CACHE_LINE = 16 -ROUNDS = 6 - -# shape constants: (slots_per_block, written_slots) -# MIX: kernel writes at base_cl + block_idx * 3 + {0,1,2}, 3 CL per block, all written -# AIV: kernel writes at base_cl + block_idx, 1 CL per block -SHAPE_MIX = "MIX" -SHAPE_AIV = "AIV" - -MIX_SLOTS = 3 -AIV_SLOTS = 1 - -NORMAL_MIX_BN = 4 -SYNC_MIX_BN = 12 -SYNC_AIV_BN = 8 -NORMAL_AIV_BN = 4 - - -def _build_tasks(): - """Returns list of (block_num, base_cl, shape_str).""" - tasks = [] - cl = 0 - for _ in range(ROUNDS): - # 4 x normal MIX - for _ in range(4): - tasks.append((NORMAL_MIX_BN, cl, SHAPE_MIX)) - cl += NORMAL_MIX_BN * MIX_SLOTS - # 2 x sync MIX - for _ in range(2): - tasks.append((SYNC_MIX_BN, cl, SHAPE_MIX)) - cl += SYNC_MIX_BN * MIX_SLOTS - # 2 x sync AIV - for _ in range(2): - tasks.append((SYNC_AIV_BN, cl, SHAPE_AIV)) - cl += SYNC_AIV_BN * AIV_SLOTS - # 1 x normal AIV - tasks.append((NORMAL_AIV_BN, cl, SHAPE_AIV)) - cl += NORMAL_AIV_BN * AIV_SLOTS - return tasks - - -TASKS = _build_tasks() -TOTAL_CL = sum(bn * (MIX_SLOTS if shape == SHAPE_MIX else AIV_SLOTS) for bn, _, shape in TASKS) # 840 - - -def generate_inputs(params: dict) -> list: - output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) - return [("output", output)] - - -def compute_golden(tensors: dict, params: dict) -> None: - out = torch.as_tensor(tensors["output"]) - for block_num, base_cl, shape in TASKS: - for 
block_idx in range(block_num): - if shape == SHAPE_MIX: - # MIX kernel writes float(block_idx) at all 3 slots - for slot in range(MIX_SLOTS): - cl = base_cl + block_idx * MIX_SLOTS + slot - out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) - else: - # AIV kernel writes float(block_idx) at 1 slot - cl = base_cl + block_idx - out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) - tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py deleted file mode 100644 index d04b6b27f..000000000 --- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for SPMD sync_start stress test with mixed shapes. - -Submits 54 tasks (MIX + AIV) over 6 rounds to stress-test drain CAS contention, -ack barrier, and state cleanup between drain cycles. -Reuses AIC/AIV kernels from spmd_multiblock_mix and spmd_multiblock_aiv. 
-""" - -from pathlib import Path - -_KERNELS_ROOT = Path(__file__).parent -_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" -_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels" - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_stress_orch.cpp"), - "function_name": "aicpu_orchestration_entry", -} - -KERNELS = [ - # func_id 0-2: MIX kernels (AIC + AIV0 + AIV1) - { - "func_id": 0, - "name": "SPMD_MIX_AIC", - "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), - "core_type": "aic", - }, - { - "func_id": 1, - "name": "SPMD_MIX_AIV0", - "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), - "core_type": "aiv", - }, - { - "func_id": 2, - "name": "SPMD_MIX_AIV1", - "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), - "core_type": "aiv", - }, - # func_id 3: standalone AIV kernel - { - "func_id": 3, - "name": "SPMD_WRITE_AIV", - "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"), - "core_type": "aiv", - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py new file mode 100644 index 000000000..a87eb7209 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""SPMD sync_start stress with mixed shapes (MIX + AIV). + +Submits 6 rounds of mixed-shape tasks to stress drain CAS contention, +ack barrier, and state cleanup across drain cycles. + +Each round (9 tasks): + 4 x normal MIX (block_num=4, sync=false) -> 48 CL + 2 x sync MIX (block_num=12, sync=true) -> 72 CL + 2 x sync AIV (block_num=8, sync=true) -> 16 CL + 1 x normal AIV (block_num=4, sync=false) -> 4 CL + Round total: 140 CL + +6 rounds -> 54 tasks, grand total: 840 CL = 13440 float32. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + +FLOATS_PER_CACHE_LINE = 16 +ROUNDS = 6 + +SHAPE_MIX = "MIX" +SHAPE_AIV = "AIV" +MIX_SLOTS = 3 +AIV_SLOTS = 1 + +NORMAL_MIX_BN = 4 +SYNC_MIX_BN = 12 +SYNC_AIV_BN = 8 +NORMAL_AIV_BN = 4 + + +def _build_tasks(): + tasks = [] + cl = 0 + for _ in range(ROUNDS): + for _ in range(4): + tasks.append((NORMAL_MIX_BN, cl, SHAPE_MIX)) + cl += NORMAL_MIX_BN * MIX_SLOTS + for _ in range(2): + tasks.append((SYNC_MIX_BN, cl, SHAPE_MIX)) + cl += SYNC_MIX_BN * MIX_SLOTS + for _ in range(2): + tasks.append((SYNC_AIV_BN, cl, SHAPE_AIV)) + cl += SYNC_AIV_BN * AIV_SLOTS + tasks.append((NORMAL_AIV_BN, cl, SHAPE_AIV)) + cl += NORMAL_AIV_BN * AIV_SLOTS + return tasks + + +TASKS = _build_tasks() +TOTAL_CL = sum(bn * (MIX_SLOTS if shape == SHAPE_MIX else AIV_SLOTS) for bn, _, shape in TASKS) + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestSpmdSyncStartStress(SceneTestCase): + RTOL = 0 + ATOL = 0 + + CALLABLE = { + 
"orchestration": { + "source": "kernels/orchestration/spmd_sync_start_stress_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.INOUT], + }, + "incores": [ + {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, + {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + {"func_id": 3, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"}, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {}, + }, + ] + + def generate_args(self, params): + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return TaskArgsBuilder(Tensor("output", output)) + + def compute_golden(self, args, params): + out = args.output + for block_num, base_cl, shape in TASKS: + for block_idx in range(block_num): + if shape == SHAPE_MIX: + for slot in range(MIX_SLOTS): + cl = base_cl + block_idx * MIX_SLOTS + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + else: + cl = base_cl + block_idx + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a5/host_build_graph/paged_attention/golden.py b/tests/st/a5/host_build_graph/paged_attention/golden.py deleted file mode 100644 index 623712602..000000000 --- a/tests/st/a5/host_build_graph/paged_attention/golden.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged Attention Golden - host_build_graph test (production scale, bfloat16). - -Args layout: [query, key_cache, value_cache, block_table, context_lens, out, scale] - - Tensors retain original multi-dimensional shapes (ContinuousTensor metadata carries shape/dtype) - - scale is a scalar float parameter -""" - -from simpler_setup.goldens.paged_attention import ( - compute_golden, # noqa: F401 - run_golden_test, -) -from simpler_setup.goldens.paged_attention import generate_inputs as _generate_inputs - -__outputs__ = ["out"] - -RTOL = 1e-3 -ATOL = 1e-3 - -ALL_CASES = { - "Case1": { - "batch": 256, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8100, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - "Case2": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 64, - "context_len": 8150, - "max_model_len": 32768, - "dtype": "bfloat16", - }, -} - -DEFAULT_CASE = "Case1" - - -def generate_inputs(params: dict) -> list: - return _generate_inputs(params) - - -if __name__ == "__main__": - run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/kernel_config.py b/tests/st/a5/host_build_graph/paged_attention/kernels/kernel_config.py deleted file mode 100644 index 188d983a9..000000000 --- a/tests/st/a5/host_build_graph/paged_attention/kernels/kernel_config.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) PyPTO Contributors. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Paged Attention Kernel and Orchestration Configuration - -Defines the kernels and orchestration function for paged attention -with AIC/AIV subgraph splitting: - -AIC Kernels (Matrix Multiplication): - - aic_qk_matmul: Q @ K^T computation - - aic_pv_matmul: P @ V computation - -AIV Kernels (Vector Operations): - - aiv_softmax_prepare: scale, rowmax, exp, rowsum - - aiv_online_update: online softmax accumulation + fused normalization - -Note: aiv_normalize has been merged into aiv_online_update for efficiency. 
-""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -_KERNELS_ROOT = Path(__file__).parent - -# Orchestration config -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), - "function_name": "build_paged_attention_graph", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], -} - -# Kernel configs (aiv_normalize removed - merged into aiv_online_update) -KERNELS = [ - # AIC kernels (matrix multiplication using Cube unit) - { - "func_id": 0, - "name": "QK", - "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "PV", - "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - # AIV kernels (vector operations) - { - "func_id": 1, - "name": "SF", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "name": "UP", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, -] - -# Runtime configuration -RUNTIME_CONFIG = { - "runtime": "host_build_graph", - "aicpu_thread_num": 3, - "block_dim": 24, -} diff --git a/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py new file mode 100644 index 000000000..2d3b12d3b --- /dev/null +++ b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Paged attention — host_build_graph test (production scale, bfloat16). + +AIC+AIV mixed execution with online softmax paged attention. +Production-scale cases for A5 hardware validation. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden +from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs + + +@scene_test(level=2, runtime="host_build_graph") +class TestPagedAttentionHostBuildGraph(SceneTestCase): + """Paged attention with host_build_graph runtime on A5.""" + + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/paged_attention_orch.cpp", + "function_name": "build_paged_attention_graph", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "source": "kernels/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/aiv_softmax_prepare.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "source": "kernels/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, 
D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 3, "block_dim": 24}, + "params": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8100, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case2", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 3, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8150, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + ] + + def generate_args(self, params): + inputs = _pa_generate_inputs(params) + specs = [] + for name, val in inputs: + if isinstance(val, torch.Tensor): + specs.append(Tensor(name, val)) + else: + specs.append(Scalar(name, val)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/golden.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/golden.py deleted file mode 100644 index 86d5ccb9f..000000000 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/golden.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged Attention Golden - tensormap_and_ringbuffer test (production scale, bfloat16).""" - -from simpler_setup.goldens.paged_attention import ( - compute_golden, # noqa: F401 - run_golden_test, -) -from simpler_setup.goldens.paged_attention import generate_inputs as _generate_inputs - -__outputs__ = ["out"] - -RTOL = 1e-3 -ATOL = 1e-3 - -ALL_CASES = { - "Case1": { - "batch": 256, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - "Case2": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - "Case3": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 256, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, -} - -DEFAULT_CASE = "Case1" - - -def generate_inputs(params: dict) -> list: - return _generate_inputs(params) - - -if __name__ == "__main__": - run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py deleted file mode 100644 index 415af4dee..000000000 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) PyPTO Contributors. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Paged Attention Kernel and Orchestration Configuration - -Defines the kernels and orchestration function for paged attention -with AIC/AIV subgraph splitting: - -AIC Kernels (Matrix Multiplication): - - aic_qk_matmul: Q @ K^T computation - - aic_pv_matmul: P @ V computation - -AIV Kernels (Vector Operations): - - aiv_softmax_prepare: scale, rowmax, exp, rowsum - - aiv_online_update: online softmax accumulation + fused normalization - -Note: aiv_normalize has been merged into aiv_online_update for efficiency. 
-""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -_KERNELS_ROOT = Path(__file__).parent - -# Orchestration config -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), - "function_name": "build_paged_attention_graph", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], -} - -# Kernel configs (aiv_normalize removed - merged into aiv_online_update) -KERNELS = [ - # AIC kernels (matrix multiplication using Cube unit) - { - "func_id": 0, - "name": "QK", - "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "PV", - "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - # AIV kernels (vector operations) - { - "func_id": 1, - "name": "SF", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "name": "UP", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, -] - -# Runtime configuration -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py new file mode 100644 index 000000000..4e3a52890 --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Paged attention — tensormap_and_ringbuffer test (production scale, bfloat16). + +AIC+AIV mixed execution with online softmax paged attention. +Production-scale cases for A5 hardware validation. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden +from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestPagedAttention(SceneTestCase): + """Paged attention with tensormap_and_ringbuffer runtime on A5.""" + + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/paged_attention_orch.cpp", + "function_name": "build_paged_attention_graph", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "source": "kernels/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/aiv_softmax_prepare.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "source": "kernels/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, 
D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case2", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case3", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + ] + + def generate_args(self, params): + inputs = _pa_generate_inputs(params) + specs = [] + for name, val in inputs: + if isinstance(val, torch.Tensor): + specs.append(Tensor(name, val)) + else: + specs.append(Scalar(name, val)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/golden.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/golden.py deleted file mode 100644 index 4bbbe98ad..000000000 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/golden.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) PyPTO Contributors. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged Attention Unroll Golden - tensormap_and_ringbuffer test (production scale, bfloat16).""" - -from simpler_setup.goldens.paged_attention import ( - compute_golden, # noqa: F401 # re-exported for ci.py's dynamic golden-module loader - run_golden_test, -) -from simpler_setup.goldens.paged_attention import generate_inputs as _generate_inputs - -__outputs__ = ["out"] - -RTOL = 1e-3 -ATOL = 1e-3 - -ALL_CASES = { - "Case1": { - "batch": 256, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - "Case2": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - "Case3": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 256, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, -} - -DEFAULT_CASE = "Case1" - - -def generate_inputs(params: dict) -> list: - return _generate_inputs(params) - - -if __name__ == "__main__": - run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs, label="Paged Attention Unroll") diff --git 
a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py deleted file mode 100644 index 5d51b4917..000000000 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Paged Attention Kernel and Orchestration Configuration - -Defines the kernels and orchestration function for paged attention -with AIC/AIV subgraph splitting: - -AIC Kernels (Matrix Multiplication): - - aic_qk_matmul: Q @ K^T computation - - aic_pv_matmul: P @ V computation - -AIV Kernels (Vector Operations): - - aiv_softmax_prepare: scale, rowmax, exp, rowsum - - aiv_online_update: online softmax accumulation + fused normalization - -Note: aiv_normalize has been merged into aiv_online_update for efficiency. 
-""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -_KERNELS_ROOT = Path(__file__).parent - -# Orchestration config -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), - "function_name": "build_paged_attention_graph", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], -} - -# Kernel configs (aiv_normalize removed - merged into aiv_online_update) -KERNELS = [ - # AIC kernels (matrix multiplication using Cube unit) - { - "func_id": 0, - "name": "QK", - "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "PV", - "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - # AIV kernels (vector operations) - { - "func_id": 1, - "name": "SF", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "name": "UP", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, -] - -# Runtime configuration -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 36, -} diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py new file mode 100644 index 000000000..f79a98c0d --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Paged attention unroll — tensormap_and_ringbuffer test (production scale, bfloat16). + +Same algorithm as paged_attention but with higher block_dim for unrolled dispatch. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden +from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestPagedAttentionUnroll(SceneTestCase): + """Paged attention unroll with tensormap_and_ringbuffer runtime on A5.""" + + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/paged_attention_orch.cpp", + "function_name": "build_paged_attention_graph", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "source": "kernels/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/aiv_softmax_prepare.cpp", + "core_type": 
"aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "source": "kernels/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 36}, + "params": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case2", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 36}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case3", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 36}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + ] + + def generate_args(self, params): + inputs = _pa_generate_inputs(params) + specs = [] + for name, val in inputs: + if isinstance(val, torch.Tensor): + specs.append(Tensor(name, val)) + else: + specs.append(Scalar(name, val)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) From 3f8119c5a90a760fd2be31bcbb4df3593941894b Mon Sep 17 00:00:00 2001 From: majin0824 Date: Thu, 16 Apr 2026 10:13:43 +0800 Subject: [PATCH 2/5] Refactor: migrate remaining TMR examples to tests/st and upgrade paged attention - Move 
spmd_*, mixed_example from examples/tmr/ to tests/st/tmr/ - Remove duplicate HBG paged_attention from examples/ (already in tests/st/) - Remove old TMR paged_attention from tests/st/ (kept in examples/ as evolving reference) - Upgrade TMR paged_attention: fp16 -> bfloat16, multi-tile dispatch (16x128, 64x64), production-scale cases (batch=256, head_dim=128/256), tighter tolerances (1e-3) - Add small-tile (16,16,16) dispatch path to HBG paged_attention kernels with SmallCase1/SmallCase2 sim-compatible test cases --- .../kernels/aic/aic_pv_matmul.cpp | 101 ------- .../kernels/aic/aic_qk_matmul.cpp | 102 ------- .../kernels/aiv/aiv_online_update.cpp | 230 -------------- .../kernels/aiv/aiv_softmax_prepare.cpp | 110 ------- .../orchestration/paged_attention_orch.cpp | 252 ---------------- .../paged_attention/test_paged_attention.py | 118 -------- .../kernels/aic/aic_pv_matmul.cpp | 46 +-- .../kernels/aic/aic_qk_matmul.cpp | 44 ++- .../kernels/aiv/aiv_online_update.cpp | 209 +++++++------ .../kernels/aiv/aiv_softmax_prepare.cpp | 66 ++-- .../orchestration/paged_attention_orch.cpp | 157 ++++++++-- .../paged_attention/test_paged_attention.py | 71 ++++- .../paged_attention/README.md | 73 +++-- .../kernels/aic/aic_pv_matmul.cpp | 7 +- .../kernels/aic/aic_qk_matmul.cpp | 7 +- .../kernels/aiv/aiv_online_update.cpp | 6 +- .../kernels/aiv/aiv_softmax_prepare.cpp | 6 +- .../paged_attention/test_paged_attention.py | 31 ++ .../kernels/aic/kernel_matmul.cpp | 0 .../mixed_example/kernels/aiv/kernel_add.cpp | 0 .../kernels/aiv/kernel_add_standalone.cpp | 0 .../mixed_example/kernels/aiv/kernel_mul.cpp | 0 .../kernels/aiv/kernel_mul_standalone.cpp | 0 .../kernels/orchestration/mixed_orch.cpp | 0 .../mixed_example/test_mixed_example.py | 0 .../kernels/aic/aic_pv_matmul.cpp | 112 ------- .../kernels/aic/aic_qk_matmul.cpp | 113 ------- .../kernels/aiv/aiv_online_update.cpp | 246 --------------- .../kernels/aiv/aiv_softmax_prepare.cpp | 146 --------- 
.../orchestration/paged_attention_orch.cpp | 281 ------------------ .../paged_attention/test_paged_attention.py | 134 --------- .../kernels/aic/kernel_spmd_read.cpp | 0 .../kernels/aiv/kernel_spmd_read.cpp | 0 .../kernels/orchestration/spmd_basic_orch.cpp | 0 .../spmd_basic/test_spmd_basic.py | 0 .../kernels/aiv/kernel_spmd_write.cpp | 0 .../spmd_multiblock_aiv_orch.cpp | 0 .../test_spmd_multiblock_aiv.py | 0 .../kernels/aic/kernel_spmd_mix.cpp | 0 .../kernels/aiv/kernel_spmd_mix.cpp | 0 .../spmd_multiblock_mix_orch.cpp | 0 .../test_spmd_multiblock_mix.py | 0 .../orchestration/spmd_starvation_orch.cpp | 0 .../spmd_starvation/test_spmd_starvation.py | 0 .../orchestration/spmd_sync_start_orch.cpp | 0 .../spmd_sync_start/test_spmd_sync_start.py | 0 .../spmd_sync_start_aiv_orch.cpp | 0 .../test_spmd_sync_start_aiv.py | 0 .../spmd_sync_start_edge_orch.cpp | 0 .../test_spmd_sync_start_edge.py | 0 .../spmd_sync_start_stress_orch.cpp | 0 .../test_spmd_sync_start_stress.py | 0 52 files changed, 483 insertions(+), 2185 deletions(-) delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp delete mode 100644 examples/a5/host_build_graph/paged_attention/test_paged_attention.py rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp (100%) rename {examples => 
tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py (100%) delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aic/kernel_spmd_read.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aiv/kernel_spmd_read.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp (100%) rename {examples => 
tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp (100%) rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py (100%) diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp deleted file mode 100644 index 75aa44e5b..000000000 --- a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. 
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) -// -// Fixed tile size: (16, 16) @ (16, 16) -> (16, 16) -// -// pij is float16 (converted from fp32 in softmax_prepare via TCVT). -// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. -// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. 
- -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -static __aicore__ void pv_matmul_impl(__gm__ uint8_t *pij_raw, __gm__ uint8_t *vj_raw, __gm__ uint8_t *oi_raw) { - constexpr int M = 16, K = 16, N = 16; - - __gm__ half *pij = reinterpret_cast<__gm__ half *>(pij_raw); - __gm__ half *vj = reinterpret_cast<__gm__ half *>(vj_raw); - __gm__ float *oi = reinterpret_cast<__gm__ float *>(oi_raw); - - // pij (M, K) fp16, vj (K, N) fp16 in ND (row-major), oi_new (M, N) fp32 - using GlobalA = GlobalTensor, pto::Stride>; - using GlobalB = GlobalTensor, pto::Stride>; - using GlobalOut = GlobalTensor, pto::Stride>; - - GlobalA pijGlobal(pij); - GlobalB vjGlobal(vj); - GlobalOut oiGlobal(oi); - - // L1 Mat tiles: standard ND pattern for both A and B - using TileMatA = Tile; - using TileMatB = Tile; - - // L0 tiles - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - // Load pij and vj to L1 - TLOAD(aMatTile, pijGlobal); - TLOAD(bMatTile, vjGlobal); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // Move to L0A/L0B - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - // Single matmul: (M,K) x (K,N) -> (M,N) - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(oiGlobal, cTile); - - set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); - wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ uint8_t *pij = reinterpret_cast<__gm__ uint8_t *>(args[0]); - __gm__ uint8_t *vj 
= reinterpret_cast<__gm__ uint8_t *>(args[1]); - __gm__ uint8_t *oi_new = reinterpret_cast<__gm__ uint8_t *>(args[2]); - - pv_matmul_impl(pij, vj, oi_new); -} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp deleted file mode 100644 index 6322ee6ab..000000000 --- a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) -// -// Fixed tile size: (16, 16) @ (16, 16).T -> (16, 16) -// -// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. -// This is equivalent to (K, N) in column-major (DN) layout. -// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. 
- -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -static __aicore__ void qk_matmul_impl(__gm__ uint8_t *qi_raw, __gm__ uint8_t *kj_raw, __gm__ uint8_t *sij_raw) { - constexpr int M = 16, K = 16, N = 16; - - __gm__ half *qi = reinterpret_cast<__gm__ half *>(qi_raw); - __gm__ half *kj = reinterpret_cast<__gm__ half *>(kj_raw); - __gm__ float *sij = reinterpret_cast<__gm__ float *>(sij_raw); - - // qi (M, K) fp16 in ND (row-major) layout - using GlobalA = GlobalTensor, pto::Stride>; - // kj stored as (N, K) row-major = (K, N) column-major -> DN layout - using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; - using GlobalOut = GlobalTensor, pto::Stride>; - - GlobalA qiGlobal(qi); - GlobalB kjGlobal(kj); - GlobalOut sijGlobal(sij); - - // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) - using TileMatA = Tile; - using TileMatB = Tile; - - // L0 tiles - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - // Load qi and kj to L1 - TLOAD(aMatTile, qiGlobal); - TLOAD(bMatTile, kjGlobal); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // Move to L0A/L0B - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - // Single matmul: (M,K) x (K,N) -> (M,N) - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(sijGlobal, cTile); - - set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); - wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ 
uint8_t *qi = reinterpret_cast<__gm__ uint8_t *>(args[0]); - __gm__ uint8_t *kj = reinterpret_cast<__gm__ uint8_t *>(args[1]); - __gm__ uint8_t *sij = reinterpret_cast<__gm__ uint8_t *>(args[2]); - - qk_matmul_impl(qi, kj, sij); -} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp deleted file mode 100644 index 5563b36ff..000000000 --- a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Online Softmax Update + Normalize Kernel (AIV) -// -// Fixed tile size: oi/oi_new are (16, 16), mij/lij/mi/li are 16-element vectors -// -// Scalar layout strategy: -// M scalar floats stored contiguously in GM can be loaded as either: -// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) -// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV) -// Conversion between layouts uses GM round-trip: ND TSTORE -> DN TLOAD. 
- -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -static __aicore__ void online_update_impl( - __gm__ uint8_t *mij_raw, __gm__ uint8_t *lij_raw, __gm__ uint8_t *oi_new_raw, __gm__ uint8_t *mi_raw, - __gm__ uint8_t *li_raw, __gm__ uint8_t *oi_raw, int is_first, int is_last, __gm__ uint8_t *dst_raw -) { - constexpr int M = 16, N = 16; - - __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij_raw); - __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij_raw); - __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new_raw); - __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi_raw); - __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li_raw); - __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi_raw); - __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst_raw); - - // Scalar tile dimensions for RowMajor layout: - // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block) - // kScalarRows = M / 8 (M=16 -> 2 rows) - constexpr int kScalarCols = 32 / sizeof(float); - constexpr int kScalarRows = M / kScalarCols; - // Aligned rows for ColMajor DN tiles (32-byte alignment) - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - - // --- GlobalTensor types --- - - // Data (M, N) RowMajor - using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; - - // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor - using GlobalScalarND = - GlobalTensor, pto::Stride<1, 1, 1, kScalarCols, 1>>; - - // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor - using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; - - // --- GlobalTensor instances --- - - GlobalDataMxN oiNewGlobal(oi_new_ptr); - GlobalDataMxN oiGlobal(oi_ptr); - GlobalDataMxN dstGlobal(dst_ptr); - - // ND globals for scalar element-wise operations - 
GlobalScalarND mijGlobalND(mij_ptr); - GlobalScalarND lijGlobalND(lij_ptr); - GlobalScalarND miGlobalND(mi_ptr); - GlobalScalarND liGlobalND(li_ptr); - - // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE) - GlobalScalarDN mijGlobalDN(mij_ptr); - GlobalScalarDN lijGlobalDN(lij_ptr); - GlobalScalarDN liGlobalDN(li_ptr); - - // --- Tile types --- - - using TileDataMxN = Tile; - using TileScalarND = - Tile; - using TileScalarDN = Tile; - - // --- UB memory layout --- - - constexpr int kDataBytes = M * N * sizeof(float); - constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); - constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); - - // Data tiles - TileDataMxN oiNewTile; - TileDataMxN oiTile; - - // Scalar ND tiles for element-wise arithmetic - TileScalarND mijND, lijND, miND, liND; - TileScalarND miNewND, alphaND, betaND, tmpND; - - // Scalar DN tiles for TROWEXPAND operations - TileScalarDN alphaDN, betaDN, liDN; - - TASSIGN(oiNewTile, 0); - TASSIGN(oiTile, kDataBytes); - TASSIGN(mijND, 2 * kDataBytes); - TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); - TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); - TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); - TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); - TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); - TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); - TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); - TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); - TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes); - TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes); - - if (is_first) { - // --- First block: copy inputs to accumulators --- - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(mijND, mijGlobalND); - TLOAD(lijND, lijGlobalND); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Passthrough to MTE3 (no V compute needed) - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); 
- wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, mijND); // mi = mij - TSTORE(liGlobalND, lijND); // li = lij - TSTORE(oiGlobal, oiNewTile); // oi = oi_new - - if (is_last) { - // Single block: normalize dst = oi_new / lij - // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - TLOAD(liDN, liGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(dstGlobal, oiNewTile); - } - } else { - // --- Subsequent blocks: accumulate --- - - // Phase 1: Load all inputs - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(oiTile, oiGlobal); - TLOAD(mijND, mijGlobalND); - TLOAD(lijND, lijGlobalND); - TLOAD(miND, miGlobalND); - TLOAD(liND, liGlobalND); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) - TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) - TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new - TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) - TSUB(betaND, mijND, miNewND); // beta = mij - mi_new - TEXP(betaND, betaND); // beta = exp(mij - mi_new) - TMUL(liND, alphaND, liND); // li = alpha * li - TMUL(tmpND, betaND, lijND); // tmp = beta * lij - TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) - - // Phase 3: Store scalar results to GM (ND format) - // mi_new -> mi accumulator, li_new -> li accumulator - // alpha -> mij buffer (reuse), beta -> lij buffer (reuse) - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liND); // persist li_new - TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer - TSTORE(lijGlobalND, betaND); // temp: beta to 
lij buffer - - // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN - TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN - if (is_last) { - TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN - } - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - - // Phase 5: Scale data tiles using row-broadcast multiply - TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha - TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta - TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new - - if (is_last) { - // Phase 6: Normalize and output - TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(dstGlobal, oiTile); - } else { - // Phase 6: Store updated accumulators - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(oiGlobal, oiTile); - } - } - - set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); - wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ uint8_t *mij = reinterpret_cast<__gm__ uint8_t *>(args[0]); - __gm__ uint8_t *lij = reinterpret_cast<__gm__ uint8_t *>(args[1]); - __gm__ uint8_t *oi_new = reinterpret_cast<__gm__ uint8_t *>(args[2]); - __gm__ uint8_t *mi = reinterpret_cast<__gm__ uint8_t *>(args[3]); - __gm__ uint8_t *li = reinterpret_cast<__gm__ uint8_t *>(args[4]); - __gm__ uint8_t *oi = reinterpret_cast<__gm__ uint8_t *>(args[5]); - int is_first = static_cast(args[6]); - int is_last = static_cast(args[7]); - __gm__ uint8_t *dst = reinterpret_cast<__gm__ uint8_t *>(args[8]); - - online_update_impl(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); -} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp 
b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp deleted file mode 100644 index c07ca22a1..000000000 --- a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Softmax Preparation Kernel (AIV) -// -// Fixed tile size: sij is (16, 16) -// -// Computes: -// sij_scale = sij * scale -// mij = row_max(sij_scale) -> (M, 1) -// pij = exp(sij_scale - mij) -> (M, N) -// lij = row_sum(pij) -> (M, 1) - -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -static __aicore__ void softmax_prepare_impl( - __gm__ uint8_t *sij_raw, float scale_value, __gm__ uint8_t *pij_raw, __gm__ uint8_t *mij_raw, - __gm__ uint8_t *lij_raw -) { - constexpr int M = 16, N = 16; - - __gm__ float *sij = reinterpret_cast<__gm__ float *>(sij_raw); - __gm__ half *pij = reinterpret_cast<__gm__ half *>(pij_raw); - __gm__ float *mij = reinterpret_cast<__gm__ float *>(mij_raw); - __gm__ float *lij = reinterpret_cast<__gm__ float *>(lij_raw); - - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - - using GlobalDataMxN = GlobalTensor, 
pto::Stride<1, 1, 1, N, 1>>; - using GlobalDataMxN_f16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; - using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; - - GlobalDataMxN sijGlobal(sij); - GlobalDataMxN_f16 pijGlobal(pij); - GlobalScalarDN mijGlobal(mij); - GlobalScalarDN lijGlobal(lij); - - using TileVecMxN = Tile; - using TileVecMxN_f16 = Tile; - using TileScalarDN = Tile; - - TileVecMxN sijTile; - TileVecMxN pijTile; - TileVecMxN tmpTile; - TileScalarDN maxTile; - TileScalarDN sumTile; - TileVecMxN_f16 pijF16Tile; - - TASSIGN(sijTile, 0x0); - TASSIGN(pijTile, M * N * sizeof(float)); - TASSIGN(tmpTile, 2 * M * N * sizeof(float)); - TASSIGN(maxTile, 3 * M * N * sizeof(float)); - TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); - TASSIGN(pijF16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); - - TLOAD(sijTile, sijGlobal); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - TMULS(sijTile, sijTile, scale_value); - TROWMAX(maxTile, sijTile, tmpTile); - TROWEXPANDSUB(pijTile, sijTile, maxTile); - TEXP(pijTile, pijTile); - // Truncate pij to fp16 first, then compute lij from truncated values (matches golden) - TCVT(pijF16Tile, pijTile, RoundMode::CAST_ROUND); - TCVT(pijTile, pijF16Tile, RoundMode::CAST_ROUND); - TROWSUM(sumTile, pijTile, tmpTile); - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(mijGlobal, maxTile); - TSTORE(lijGlobal, sumTile); - TSTORE(pijGlobal, pijF16Tile); - - set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); - wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ uint8_t *sij = reinterpret_cast<__gm__ uint8_t *>(args[0]); - union { - uint64_t u; - float f; - } scale_conv; - scale_conv.u = static_cast(args[1]); - float scale_value = scale_conv.f; - __gm__ uint8_t *pij = reinterpret_cast<__gm__ uint8_t *>(args[2]); - __gm__ uint8_t *mij = 
reinterpret_cast<__gm__ uint8_t *>(args[3]); - __gm__ uint8_t *lij = reinterpret_cast<__gm__ uint8_t *>(args[4]); - - softmax_prepare_impl(sij, scale_value, pij, mij, lij); -} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp deleted file mode 100644 index 17dbd02ce..000000000 --- a/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Paged Attention Orchestration - Small Scale (16x16) - * - * Supports small-scale paged attention with: - * Query: (batch, q_head_num, head_dim) fp16 - * Key: (total_blocks, block_size, kv_head_num, head_dim) fp16 (NOT transposed) - * Value: (total_blocks, block_size, kv_head_num, head_dim) fp16 - * Output: (batch, q_head_num, head_dim) float32 - * - * Head tiling: q_tile_size = min(num_heads, 128) - * GQA: kv_head_num can differ from q_head_num - * - * ChipStorageTaskArgs layout: tensors=[query, key_cache, value_cache, block_table, context_lens, out], scalars=[scale] - */ - -#include -#include -#include - -#include "orchestration_api.h" // NOLINT(build/include_subdir) - -#define FUNC_QK_MATMUL 0 -#define FUNC_SOFTMAX_PREPARE 1 -#define FUNC_PV_MATMUL 2 -#define FUNC_ONLINE_UPDATE 3 - -extern "C" { - -int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { - if (orch_args.tensor_count() < 6) { - std::cerr << "Expected at least 6 tensors, got " << orch_args.tensor_count() << '\n'; - return -1; - } - - // Extract host pointers from tensor metadata - void *host_query = orch_args.tensor(0).data_as(); - void *host_key_cache = orch_args.tensor(1).data_as(); - void *host_value_cache = orch_args.tensor(2).data_as(); - int *host_block_table = orch_args.tensor(3).data_as(); - int *host_context_lens = orch_args.tensor(4).data_as(); - void *host_out = orch_args.tensor(5).data_as(); - - // Extract sizes from tensor metadata - size_t query_size = orch_args.tensor(0).nbytes(); - size_t key_cache_size = orch_args.tensor(1).nbytes(); - size_t value_cache_size = orch_args.tensor(2).nbytes(); - size_t out_size = orch_args.tensor(5).nbytes(); - - // Read dimensions from tensor shapes - // query: (batch, num_heads, head_dim) - uint32_t batch = orch_args.tensor(0).shapes[0]; - uint32_t num_heads = 
orch_args.tensor(0).shapes[1]; - uint32_t head_dim = orch_args.tensor(0).shapes[2]; - - // key_cache: (total_blocks, block_size, kv_head_num, head_dim) - uint32_t block_size = orch_args.tensor(1).shapes[1]; - uint32_t kv_head_num = orch_args.tensor(1).shapes[2]; - - // block_table: (batch, max_num_blocks_per_req) - uint32_t max_num_blocks = orch_args.tensor(3).shapes[1]; - - // scale: first scalar argument - uint64_t scale_value_bits = orch_args.scalar(0); - - uint32_t q_tile_size = std::min(num_heads, 128u); - uint32_t num_head_tiles = (num_heads + q_tile_size - 1) / q_tile_size; - - std::cout << "\n=== build_paged_attention_graph ===" << '\n'; - std::cout << "batch=" << batch << ", num_heads=" << num_heads << ", kv_head_num=" << kv_head_num - << ", head_dim=" << head_dim << '\n'; - std::cout << "block_size=" << block_size << ", max_num_blocks=" << max_num_blocks << '\n'; - std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; - - // Allocate device memory for inputs/outputs - void *dev_query = device_malloc(runtime, query_size); - void *dev_key_cache = device_malloc(runtime, key_cache_size); - void *dev_value_cache = device_malloc(runtime, value_cache_size); - void *dev_out = device_malloc(runtime, out_size); - - if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { - std::cerr << "Error: Failed to allocate device memory\n"; - return -1; - } - - copy_to_device(runtime, dev_query, host_query, query_size); - copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size); - copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size); - record_tensor_pair(runtime, host_out, dev_out, out_size); - - // Buffer sizes depend on q_tile_size and block_size - size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); - size_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); - size_t mij_size = static_cast(q_tile_size) * sizeof(float); - size_t lij_size = mij_size; 
- size_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); - - // Per-batch-per-block intermediate buffers - uint32_t total_buffers = batch * max_num_blocks; - void **dev_sij_arr = new void *[total_buffers]; - void **dev_pij_arr = new void *[total_buffers]; - void **dev_mij_arr = new void *[total_buffers]; - void **dev_lij_arr = new void *[total_buffers]; - void **dev_oi_new_arr = new void *[total_buffers]; - - for (uint32_t i = 0; i < total_buffers; i++) { - dev_sij_arr[i] = device_malloc(runtime, sij_size); - dev_pij_arr[i] = device_malloc(runtime, pij_size); - dev_mij_arr[i] = device_malloc(runtime, mij_size); - dev_lij_arr[i] = device_malloc(runtime, lij_size); - dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size); - } - - // Per-(batch, head_tile) accumulators - uint32_t total_accums = batch * num_head_tiles; - size_t mi_size = static_cast(q_tile_size) * sizeof(float); - size_t li_size = mi_size; - size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); - - void **dev_mi_arr = new void *[total_accums]; - void **dev_li_arr = new void *[total_accums]; - void **dev_oi_arr = new void *[total_accums]; - - for (uint32_t i = 0; i < total_accums; i++) { - dev_mi_arr[i] = device_malloc(runtime, mi_size); - dev_li_arr[i] = device_malloc(runtime, li_size); - dev_oi_arr[i] = device_malloc(runtime, oi_size); - } - - std::cout << "Allocated " << total_buffers << " per-block buffers\n"; - std::cout << "Allocated " << total_accums << " per-(batch,head_tile) accumulators\n"; - - int total_tasks = 0; - - for (uint32_t b_idx = 0; b_idx < batch; b_idx++) { - int cur_seq = host_context_lens[b_idx]; - uint32_t bn_this_batch = (static_cast(cur_seq) + block_size - 1) / block_size; - - for (uint32_t ht = 0; ht < num_head_tiles; ht++) { - uint32_t cur_offset = ht * q_tile_size; - - // Query: (batch, q_head_num, head_dim) fp16 - // qi points to heads [cur_offset .. 
cur_offset+q_tile_size) for batch b_idx - uint8_t *qi_ptr = reinterpret_cast(dev_query) + - static_cast(b_idx * num_heads + cur_offset) * head_dim * sizeof(uint16_t); - - // Output: (batch * q_head_num, head_dim) float32 - uint8_t *out_ptr = reinterpret_cast(dev_out) + - static_cast(b_idx * num_heads + cur_offset) * head_dim * sizeof(float); - - // GQA: which kv_head this head tile maps to - uint32_t kv_head_idx = cur_offset / (num_heads / kv_head_num); - - // Per-(batch, head_tile) accumulators - uint32_t accum_idx = b_idx * num_head_tiles + ht; - void *dev_mi = dev_mi_arr[accum_idx]; - void *dev_li = dev_li_arr[accum_idx]; - void *dev_oi = dev_oi_arr[accum_idx]; - - int t_up_prev = -1; - - for (uint32_t bn = 0; bn < bn_this_batch; bn++) { - int cur_block_idx = host_block_table[b_idx * max_num_blocks + bn]; - - // Key: (total_blocks, block_size, kv_head_num, head_dim) fp16 - uint8_t *kj_ptr = reinterpret_cast(dev_key_cache) + - (static_cast(cur_block_idx) * block_size * kv_head_num + kv_head_idx) * - head_dim * sizeof(uint16_t); - - // Value: (total_blocks, block_size, kv_head_num, head_dim) fp16 - uint8_t *vj_ptr = reinterpret_cast(dev_value_cache) + - (static_cast(cur_block_idx) * block_size * kv_head_num + kv_head_idx) * - head_dim * sizeof(uint16_t); - - uint32_t buf_idx = b_idx * max_num_blocks + bn; - void *dev_sij = dev_sij_arr[buf_idx]; - void *dev_pij = dev_pij_arr[buf_idx]; - void *dev_mij = dev_mij_arr[buf_idx]; - void *dev_lij = dev_lij_arr[buf_idx]; - void *dev_oi_new = dev_oi_new_arr[buf_idx]; - - // QK: qi(M, K) @ kj.T(K, N) -> sij(M, N) - uint64_t qk_args[6] = {reinterpret_cast(qi_ptr), reinterpret_cast(kj_ptr), - reinterpret_cast(dev_sij), static_cast(q_tile_size), - static_cast(head_dim), static_cast(block_size)}; - int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); - total_tasks++; - - // SF: scale, rowmax, exp, rowsum -> pij, mij, lij - uint64_t sf_args[7] = {reinterpret_cast(dev_sij), scale_value_bits, - 
reinterpret_cast(dev_pij), reinterpret_cast(dev_mij), - reinterpret_cast(dev_lij), static_cast(q_tile_size), - static_cast(block_size)}; - int t_sf = add_task(runtime, sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); - total_tasks++; - - // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') - uint64_t pv_args[6] = {reinterpret_cast(dev_pij), reinterpret_cast(vj_ptr), - reinterpret_cast(dev_oi_new), static_cast(q_tile_size), - static_cast(block_size), static_cast(head_dim)}; - int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); - total_tasks++; - - add_successor(runtime, t_qk, t_sf); - add_successor(runtime, t_sf, t_pv); - - // Online Update: serialized across blocks (each depends on previous) - int is_first = (bn == 0) ? 1 : 0; - int is_last = (bn == bn_this_batch - 1) ? 1 : 0; - - uint64_t up_args[11] = {reinterpret_cast(dev_mij), reinterpret_cast(dev_lij), - reinterpret_cast(dev_oi_new), reinterpret_cast(dev_mi), - reinterpret_cast(dev_li), reinterpret_cast(dev_oi), - static_cast(is_first), static_cast(is_last), - reinterpret_cast(out_ptr), static_cast(q_tile_size), - static_cast(head_dim)}; - int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); - total_tasks++; - - add_successor(runtime, t_pv, t_up); - if (t_up_prev >= 0) { - add_successor(runtime, t_up_prev, t_up); - } - t_up_prev = t_up; - } - } - } - - delete[] dev_sij_arr; - delete[] dev_pij_arr; - delete[] dev_mij_arr; - delete[] dev_lij_arr; - delete[] dev_oi_new_arr; - delete[] dev_mi_arr; - delete[] dev_li_arr; - delete[] dev_oi_arr; - - std::cout << "Created " << total_tasks << " tasks\n"; - print_runtime(runtime); - - return 0; -} -} diff --git a/examples/a5/host_build_graph/paged_attention/test_paged_attention.py b/examples/a5/host_build_graph/paged_attention/test_paged_attention.py deleted file mode 100644 index 7d72b6be1..000000000 --- a/examples/a5/host_build_graph/paged_attention/test_paged_attention.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env 
python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged attention — host_build_graph example (small scale, float16). - -AIC+AIV mixed execution with online softmax paged attention. -Small-scale cases for quick validation on A5. -""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test -from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden -from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs - - -@scene_test(level=2, runtime="host_build_graph") -class TestPagedAttention(SceneTestCase): - """Paged attention with host_build_graph runtime on A5.""" - - RTOL = 1e-2 - ATOL = 1e-2 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/paged_attention_orch.cpp", - "function_name": "build_paged_attention_graph", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "source": "kernels/aic/aic_qk_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "source": "kernels/aic/aic_pv_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "source": 
"kernels/aiv/aiv_softmax_prepare.cpp", - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "source": "kernels/aiv/aiv_online_update.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, - ], - } - - CASES = [ - { - "name": "Case1", - "platforms": ["a5sim", "a5"], - "config": {"aicpu_thread_num": 3, "block_dim": 3}, - "params": { - "batch": 1, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 16, - "block_size": 16, - "context_len": 16, - "max_model_len": 256, - "dtype": "float16", - }, - }, - { - "name": "Case2", - "platforms": ["a5sim", "a5"], - "config": {"aicpu_thread_num": 3, "block_dim": 3}, - "manual": True, - "params": { - "batch": 1, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 16, - "block_size": 16, - "context_len": 64, - "max_model_len": 256, - "dtype": "float16", - }, - }, - ] - - def generate_args(self, params): - inputs = _pa_generate_inputs(params) - specs = [] - for name, val in inputs: - if isinstance(val, torch.Tensor): - specs.append(Tensor(name, val)) - else: - specs.append(Scalar(name, val)) - return TaskArgsBuilder(*specs) - - def compute_golden(self, args, params): - tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} - _pa_compute_golden(tensors, params) - for s in args.specs: - if isinstance(s, Tensor) and s.name in tensors: - getattr(args, s.name)[:] = tensors[s.name] - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp index c2800abcb..c6e04d559 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -10,9 +10,11 @@ */ // PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) // -// Fixed tile 
size: (16, 16) @ (16, 16) -> (16, 16) +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128) -> (16, 128) +// Case2: (64, 64) @ ( 64, 128) -> (64, 128) // -// pij is float16 (converted from fp32 in softmax_prepare via TCVT). +// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT). // vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. // Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. @@ -33,13 +35,13 @@ using namespace pto; template static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) { - __gm__ half *pij_addr = reinterpret_cast<__gm__ half *>(pij->buffer.addr); - __gm__ half *vj_addr = reinterpret_cast<__gm__ half *>(vj->buffer.addr); + __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); + __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr); __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); - // pij (M, K) fp16, vj (K, N) fp16 in ND (row-major), oi_new (M, N) fp32 - using GlobalA = GlobalTensor, pto::Stride>; - using GlobalB = GlobalTensor, pto::Stride>; + // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, pto::Stride>; + using GlobalB = GlobalTensor, pto::Stride>; using GlobalOut = GlobalTensor, pto::Stride>; GlobalA pijGlobal(pij_addr + pij->start_offset); @@ -47,12 +49,12 @@ static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __g GlobalOut oiGlobal(oi_addr + oi->start_offset); // L1 Mat tiles: standard ND pattern for both A and B - using TileMatA = Tile; - using TileMatB = Tile; + using TileMatA = Tile; + using TileMatB = Tile; // L0 tiles - using LeftTile = TileLeft; - using RightTile = TileRight; + using LeftTile = TileLeft; + using RightTile = TileRight; using AccTile = TileAcc; TileMatA aMatTile; @@ -67,15 +69,17 @@ static __aicore__ void 
pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __g TASSIGN(bTile, 0x0); TASSIGN(cTile, 0x0); - // Load pij and vj to L1 + // Load pij and vj to L1 with separate events for pipeline overlap TLOAD(aMatTile, pijGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done TLOAD(bMatTile, vjGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + // Move A to L0A as soon as A load completes (B may still be loading) wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // Move to L0A/L0B TMOV(aTile, aMatTile); + // Move B to L0B after B load completes + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); TMOV(bTile, bMatTile); set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); @@ -97,6 +101,14 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]); __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]); __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); - - pv_matmul_impl<16, 16, 16>(pij, vj, oi_new); + uint64_t q_tile_size = static_cast(pij->shapes[0]); + // args[4] = block_size, args[5] = head_dim + + if (q_tile_size == 16 && pij->shapes[1] <= 16) { + pv_matmul_impl<16, 16, 16>(pij, vj, oi_new); + } else if (q_tile_size == 16) { + pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); + } else { + pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); + } } diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp index cb1de3e1e..c3e38f7d2 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -10,7 +10,9 @@ */ // QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) // -// Fixed tile size: (16, 16) @ (16, 16).T -> (16, 16) +// Supports two tile configurations via runtime dispatch: +// 
Case1: (16, 128) @ (128, 128).T -> (16, 128) +// Case2: (64, 128) @ (128, 64).T -> (64, 64) // // kj is stored as (N, K) = (block_size, head_dim) in row-major memory. // This is equivalent to (K, N) in column-major (DN) layout. @@ -33,14 +35,14 @@ using namespace pto; template static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) { - __gm__ half *qi_addr = reinterpret_cast<__gm__ half *>(qi->buffer.addr); - __gm__ half *kj_addr = reinterpret_cast<__gm__ half *>(kj->buffer.addr); + __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr); + __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr); __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); - // qi (M, K) fp16 in ND (row-major) layout - using GlobalA = GlobalTensor, pto::Stride>; + // qi (M, K) bf16 in ND (row-major) layout + using GlobalA = GlobalTensor, pto::Stride>; // kj stored as (N, K) row-major = (K, N) column-major -> DN layout - using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; + using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; using GlobalOut = GlobalTensor, pto::Stride>; GlobalA qiGlobal(qi_addr + qi->start_offset); @@ -48,12 +50,12 @@ static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm GlobalOut sijGlobal(sij_addr + sij->start_offset); // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) - using TileMatA = Tile; - using TileMatB = Tile; + using TileMatA = Tile; + using TileMatB = Tile; // L0 tiles - using LeftTile = TileLeft; - using RightTile = TileRight; + using LeftTile = TileLeft; + using RightTile = TileRight; using AccTile = TileAcc; TileMatA aMatTile; @@ -68,15 +70,17 @@ static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm TASSIGN(bTile, 0x0); TASSIGN(cTile, 0x0); - // Load A and B to L1 + // Load A and B to L1 with separate events for pipeline overlap TLOAD(aMatTile, 
qiGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done TLOAD(bMatTile, kjGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + // Move A to L0A as soon as A load completes (B may still be loading) wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // Move from L1 to L0A/L0B TMOV(aTile, aMatTile); + // Move B to L0B after B load completes + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); TMOV(bTile, bMatTile); set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); @@ -98,6 +102,14 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]); __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]); __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]); - - qk_matmul_impl<16, 16, 16>(qi, kj, sij); + uint64_t q_tile_size = static_cast(qi->shapes[0]); + // args[4] = head_dim (128), args[5] = block_size + + if (q_tile_size == 16 && qi->shapes[1] <= 16) { + qk_matmul_impl<16, 16, 16>(qi, kj, sij); + } else if (q_tile_size == 16) { + qk_matmul_impl<16, 128, 128>(qi, kj, sij); + } else { + qk_matmul_impl<64, 128, 64>(qi, kj, sij); + } } diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp index d0b09a69b..cb841572c 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -10,13 +10,15 @@ */ // Online Softmax Update + Normalize Kernel (AIV) // -// Fixed tile size: oi/oi_new are (16, 16), mij/lij/mi/li are 16-element vectors +// Operates on full tiles where M=q_tile_size, N=head_dim (128): +// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors +// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors // -// Scalar layout strategy: -// 
M scalar floats stored contiguously in GM can be loaded as either: -// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) -// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV) -// Conversion between layouts uses GM round-trip: ND TSTORE -> DN TLOAD. +// Scalar layout strategy using TRESHAPE (zero-copy UB reshape): +// Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV. +// For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M). +// After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops. +// This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original. #include #include @@ -46,11 +48,6 @@ static __aicore__ void online_update_impl( __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr); - // Scalar tile dimensions for RowMajor layout: - // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block) - // kScalarRows = M / 8 (M=16 -> 2 rows) - constexpr int kScalarCols = 32 / sizeof(float); - constexpr int kScalarRows = M / kScalarCols; // Aligned rows for ColMajor DN tiles (32-byte alignment) constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); @@ -59,77 +56,84 @@ static __aicore__ void online_update_impl( // Data (M, N) RowMajor using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; - // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor + // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // Scalar ND: for storing mi_new and li_new back to GM + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; using GlobalScalarND = GlobalTensor, pto::Stride<1, 1, 1, kScalarCols, 1>>; - // 
Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor - using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; - // --- GlobalTensor instances --- GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); - // ND globals for scalar element-wise operations - GlobalScalarND mijGlobalND(mij_ptr + mij->start_offset); - GlobalScalarND lijGlobalND(lij_ptr + lij->start_offset); - GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); - GlobalScalarND liGlobalND(li_ptr + li->start_offset); - - // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE) + // DN globals for loading scalars as ColMajor GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); + GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset); GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); + // ND globals for storing scalar results + GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); + GlobalScalarND liGlobalND(li_ptr + li->start_offset); + // --- Tile types --- using TileDataMxN = Tile; + using TileScalarDN = Tile; + + // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE + using TileScalarRow = Tile; + + // ND tile for storing back to GM using TileScalarND = Tile; - using TileScalarDN = Tile; // --- UB memory layout --- constexpr int kDataBytes = M * N * sizeof(float); - constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); // Data tiles TileDataMxN oiNewTile; TileDataMxN oiTile; - // Scalar ND tiles for element-wise arithmetic - TileScalarND mijND, lijND, miND, liND; - TileScalarND miNewND, alphaND, betaND, tmpND; + // Scalar DN tiles loaded from GM (ColMajor) + TileScalarDN mijDN, lijDN, miDN, liDN; - // Scalar DN tiles for TROWEXPAND operations - TileScalarDN alphaDN, betaDN, liDN; + // 
Temporary DN tiles for results + TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN; TASSIGN(oiNewTile, 0); TASSIGN(oiTile, kDataBytes); - TASSIGN(mijND, 2 * kDataBytes); - TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); - TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); - TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); - TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); - TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); - TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); - TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); - TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); - TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes); - TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes); + TASSIGN(mijDN, 2 * kDataBytes); + TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes); + TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes); + TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes); + TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes); + TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes); + TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes); if (is_first) { // --- First block: copy inputs to accumulators --- TLOAD(oiNewTile, oiNewGlobal); - TLOAD(mijND, mijGlobalND); - TLOAD(lijND, lijGlobalND); + TLOAD(mijDN, mijGlobalDN); + TLOAD(lijDN, lijGlobalDN); set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - // Passthrough to MTE3 (no V compute needed) + // Store mi = mij, li = lij, oi = oi_new + // Alias ND tiles to the same UB as DN tiles for storing as ND format + TileScalarND mijND, lijND; + TASSIGN(mijND, 2 * kDataBytes); // alias same UB as mijDN + TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes); // alias same UB as lijDN + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); TSTORE(miGlobalND, mijND); // mi = mij @@ -138,13 
+142,10 @@ static __aicore__ void online_update_impl( if (is_last) { // Single block: normalize dst = oi_new / lij - // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - TLOAD(liDN, liGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN); set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); TSTORE(dstGlobal, oiNewTile); @@ -152,64 +153,70 @@ static __aicore__ void online_update_impl( } else { // --- Subsequent blocks: accumulate --- - // Phase 1: Load all inputs + // Load all inputs TLOAD(oiNewTile, oiNewGlobal); TLOAD(oiTile, oiGlobal); - TLOAD(mijND, mijGlobalND); - TLOAD(lijND, lijGlobalND); - TLOAD(miND, miGlobalND); - TLOAD(liND, liGlobalND); + TLOAD(mijDN, mijGlobalDN); + TLOAD(lijDN, lijGlobalDN); + TLOAD(miDN, miGlobalDN); + TLOAD(liDN, liGlobalDN); set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) - // pipe_barrier(PIPE_V) required between each dependent vector operation - // to resolve RAW hazards on shared UB tiles. 
- TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) - TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new - TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) - TSUB(betaND, mijND, miNewND); // beta = mij - mi_new - TEXP(betaND, betaND); // beta = exp(mij - mi_new) - TMUL(liND, alphaND, liND); // li = alpha * li - TMUL(tmpND, betaND, lijND); // tmp = beta * lij - TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) - - // Phase 3: Store scalar results to GM (ND format) - // mi_new -> mi accumulator, li_new -> li accumulator - // alpha -> mij buffer (reuse), beta -> lij buffer (reuse) - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liND); // persist li_new - TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer - TSTORE(lijGlobalND, betaND); // temp: beta to lij buffer - - // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN - TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN - if (is_last) { - TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN - } - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - - // Phase 5: Scale data tiles using row-broadcast multiply + // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic + TileScalarRow miRow, mijRow, liRow, lijRow; + TRESHAPE(miRow, miDN); + TRESHAPE(mijRow, mijDN); + TRESHAPE(liRow, liDN); + TRESHAPE(lijRow, lijDN); + + // Scalar arithmetic in RowMajor (1, M) layout + TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow; + TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes); + TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes); + TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes); + 
TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes); + + TMAX(miNewRow, miRow, mijRow); // mi_new = max(mi, mij) + TSUB(alphaRow, miRow, miNewRow); // alpha_exp = mi - mi_new + TEXP(alphaRow, alphaRow); // alpha = exp(mi - mi_new) + TSUB(betaRow, mijRow, miNewRow); // beta_exp = mij - mi_new + TEXP(betaRow, betaRow); // beta = exp(mij - mi_new) + TMUL(tmpRow, alphaRow, liRow); // alpha * li + TMUL(liNewRow, betaRow, lijRow); // beta * lij + TADD(liNewRow, tmpRow, liNewRow); // li_new = alpha*li + beta*lij + + // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL + TRESHAPE(alphaDN, alphaRow); + TRESHAPE(betaDN, betaRow); + + // Scale data tiles using row-broadcast multiply TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + // Store mi_new and li_new to GM (ND format) + // Alias ND tiles to the same UB locations as miNewRow and liNewRow + TileScalarND miNewND, liNewND; + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes); + if (is_last) { - // Phase 6: Normalize and output - TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + // Normalize and output: dst = oi / li_new + TRESHAPE(liNewDN, liNewRow); + TROWEXPANDDIV(oiTile, oiTile, liNewDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liNewND); // persist li_new TSTORE(dstGlobal, oiTile); } else { - // Phase 6: Store updated accumulators - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + // Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liNewND); 
// persist li_new TSTORE(oiGlobal, oiTile); } } @@ -228,6 +235,14 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]); uint64_t is_first = static_cast(args[7]); uint64_t is_last = static_cast(args[8]); + uint64_t q_tile_size = static_cast(mij->shapes[0]); + // args[10] = head_dim (128) - online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + if (q_tile_size == 16 && oi_new->shapes[1] <= 16) { + online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else if (q_tile_size == 16) { + online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else { + online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } } diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index 7729bbbd8..4bb21f68b 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -10,14 +10,16 @@ */ // Softmax Preparation Kernel (AIV) with partial block masking // -// Fixed tile size: sij is (16, 16) +// Operates on (M, N) tile where M=q_tile_size, N=block_size: +// Case1: sij is (16, 128) +// Case2: sij is (64, 64) // // For partial blocks (valid_len < N), positions [valid_len, N) in sij are -// filled with -inf before softmax, ensuring exp(-inf)=0 so that invalid -// key positions contribute zero attention weight. +// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0 +// so that invalid key positions contribute zero attention weight. 
// // Computes: -// sij_masked = pad(sij, valid_len, -inf) +// sij_masked = TFILLPAD(sij, valid_len, pad=-inf) // sij_scale = sij_masked * scale // mij = row_max(sij_scale) -> (M, 1) // pij = exp(sij_scale - mij) -> (M, N) @@ -26,9 +28,8 @@ #include #include -#include "tensor.h" // NOLINT(build/include_subdir) +#include "tensor.h" -// NOLINTNEXTLINE(build/namespaces) using namespace pto; #ifndef __gm__ @@ -36,7 +37,7 @@ using namespace pto; #endif #ifndef __aicore__ -#define __aicore__ [aicore] // NOLINT(whitespace/braces) +#define __aicore__ [aicore] #endif template @@ -45,18 +46,18 @@ static __aicore__ void softmax_prepare_impl( ) { uint64_t valid_len = static_cast(sij->shapes[1]); __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); - __gm__ half *pij_addr = reinterpret_cast<__gm__ half *>(pij->buffer.addr); + __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; - using GlobalDataMxN_f16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_bf16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; GlobalDataMxN sijGlobal(sij_addr + sij->start_offset); - GlobalDataMxN_f16 pijGlobal(pij_addr + pij->start_offset); + GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset); GlobalScalarDN mijGlobal(mij_addr + mij->start_offset); GlobalScalarDN lijGlobal(lij_addr + lij->start_offset); @@ -66,7 +67,7 @@ static __aicore__ void softmax_prepare_impl( using TileSijPad = Tile; using TileVecMxN = Tile; - using TileVecMxN_f16 = Tile; + using TileVecMxN_bf16 = Tile; using TileScalarDN = Tile; TileVecMxN sijTile; @@ -76,8 +77,9 @@ static 
__aicore__ void softmax_prepare_impl( TileVecMxN tmpTile; TileScalarDN maxTile; TileScalarDN sumTile; - TileVecMxN_f16 pijF16Tile; + TileVecMxN_bf16 pijBf16Tile; + // All sij tiles share UB address 0x0 (in-place masking) TASSIGN(sijTile, 0x0); TASSIGN(sijDynTile, 0x0); TASSIGN(sijPadTile, 0x0); @@ -85,28 +87,38 @@ static __aicore__ void softmax_prepare_impl( TASSIGN(tmpTile, 2 * M * N * sizeof(float)); TASSIGN(maxTile, 3 * M * N * sizeof(float)); TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); - TASSIGN(pijF16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks + // printf("sij addr incore %x\n", sij->buffer.addr); TLOAD(sijTile, sijGlobal); set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - // manually fill invalid columns with -inf as a workaround. + // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary, + // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N. 
TFILLPAD_INPLACE(sijPadTile, sijDynTile); TMULS(sijTile, sijTile, scale_value); TROWMAX(maxTile, sijTile, tmpTile); TROWEXPANDSUB(pijTile, sijTile, maxTile); TEXP(pijTile, pijTile); - // Truncate pij to fp16 first, then compute lij from truncated values (matches golden) - TCVT(pijF16Tile, pijTile, RoundMode::CAST_ROUND); - TCVT(pijTile, pijF16Tile, RoundMode::CAST_ROUND); + // Truncate pij to bf16 first + TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); // pij bf16 ready, can store early + + // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel + TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); TROWSUM(sumTile, pijTile, tmpTile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); // sum ready - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + // Store pij (overlaps with TCVT + TROWSUM above) wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(pijGlobal, pijBf16Tile); + + // Store max and sum TSTORE(mijGlobal, maxTile); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); TSTORE(lijGlobal, sumTile); TSTORE(pijGlobal, pijF16Tile); @@ -119,7 +131,19 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[1]); __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]); __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]); - float scale_value = from_u64(static_cast(args[4])); - - softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij); + union { + uint64_t u; + float f; + } scale_conv; + scale_conv.u = static_cast(args[4]); + float scale_value = scale_conv.f; + uint64_t q_tile_size = static_cast(sij->shapes[0]); + + if (q_tile_size == 16 && pij->shapes[1] <= 16) { + softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij); + } else if (q_tile_size == 16) { + softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij); + } else { + softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij); + } } diff --git 
a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 5a528eb49..b3314019a 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -15,18 +15,15 @@ * Each block processes a single 16x16 matmul operation. * * Memory Layout: - * Query: (batch, 16, 16) - one 16x16 tile per batch fp16 - * Key: (total_blocks, 16, 16) - stored as K^T for direct matmul fp16 - * Value: (total_blocks, 16, 16) - direct format fp16 - * - * This file compiles as a standalone .so with zero runtime link dependencies. - * All runtime calls go through the PTO2RuntimeOps function-pointer table. + * Query: (batch, 16, 16) - one 16x16 tile per batch + * Key: (total_blocks, 16, 16) - stored as K^T for direct matmul + * Value: (total_blocks, 16, 16) - direct format */ -#include -#include - +#include #include +#include +#include #include "pto_orchestration_api.h" // NOLINT(build/include_subdir) @@ -34,6 +31,26 @@ #define FUNC_SOFTMAX_PREPARE 1 #define FUNC_PV_MATMUL 2 #define FUNC_ONLINE_UPDATE 3 +constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz + +inline double cycles_to_us(uint64_t cycles) { + return (static_cast(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0; +} + +inline uint64_t get_sys_cnt_aicpu() { + uint64_t ticks; + asm volatile("mrs %0, cntvct_el0" : "=r"(ticks)); + return ticks; +} + +#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 +#define CYCLE_COUNT_LAP(acc) \ + do { \ + _t1 = get_sys_cnt_aicpu(); \ + acc += (_t1 - _t0); \ + _t0 = _t1; \ + } while (0) + extern "C" { __attribute__((visibility("default"))) PTO2OrchestrationConfig @@ -44,27 +61,37 @@ aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { }; } 
-__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) { +__attribute__((visibility("default"))) void build_paged_attention_graph(const ChipStorageTaskArgs &orch_args) { + uint64_t prof_param_extract = 0; + uint64_t prof_ext_tensor = 0; + uint64_t prof_scope = 0; + uint64_t prof_make_tensor = 0; + uint64_t prof_tensor_view = 0; + uint64_t prof_param_setup = 0; + uint64_t prof_submit_task = 0; + int prof_submit_count = 0; + int prof_make_count = 0; + int prof_view_count = 0; + + CYCLE_COUNT_START(); + // Read dimensions from tensor metadata - // query: shape=[batch, num_heads, head_dim] uint64_t batch = orch_args.tensor(0).shapes[0]; uint64_t num_heads = orch_args.tensor(0).shapes[1]; uint64_t head_dim = orch_args.tensor(0).shapes[2]; DataType data_type = orch_args.tensor(0).dtype; - // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim] uint64_t block_size = orch_args.tensor(1).shapes[1]; - - // block_table: shape=[batch, max_num_blocks_per_req] uint64_t block_num = orch_args.tensor(3).shapes[1]; - // scale from scalar arg uint64_t scale_value = orch_args.scalar(0); uint64_t q_head_num = num_heads; - uint64_t q_tile = 16; + uint64_t q_tile = std::min(num_heads, 128UL); uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; - uint64_t elem_size = get_element_size(data_type); + CYCLE_COUNT_LAP(prof_param_extract); + + LOG_ALWAYS(">>>>>> batch = %" PRIu64, batch); // Reshape tensors for kernel consumption (2D flattened) void *query_ptr = orch_args.tensor(0).data_as(); @@ -72,22 +99,21 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip void *vc_ptr = orch_args.tensor(2).data_as(); void *out_ptr = orch_args.tensor(5).data_as(); - // Compute kv_total_rows from key_cache tensor metadata uint64_t total_blocks_count = orch_args.tensor(1).shapes[0]; - uint64_t kv_total_rows = total_blocks_count * block_size; uint32_t query_shapes[2] = {static_cast(batch * num_heads), 
static_cast(head_dim)}; - uint32_t key_cache_shapes[2] = {static_cast(kv_total_rows), static_cast(head_dim)}; - uint32_t value_cache_shapes[2] = {static_cast(kv_total_rows), static_cast(head_dim)}; + uint32_t key_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + uint32_t value_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type); Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type); Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type); Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); - LOG_DEBUG("query=%s", query.dump().c_str()); - LOG_DEBUG("key_cache=%s", key_cache.dump().c_str()); - LOG_DEBUG("value_cache=%s", value_cache.dump().c_str()); - LOG_DEBUG("out=%s", out.dump().c_str()); + CYCLE_COUNT_LAP(prof_ext_tensor); uint32_t bt_shapes[2] = {static_cast(batch), static_cast(block_num)}; Tensor block_table = @@ -105,64 +131,93 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32); TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type); + prof_make_count += 4; + CYCLE_COUNT_LAP(prof_make_tensor); + + int total_tasks = 0; + for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { uint32_t cl_idx[1] = {static_cast(b_idx)}; uint64_t cur_seq = static_cast(get_tensor_data(context_lens, 1, cl_idx)); uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { PTO2_SCOPE() { - uint32_t cur_offset = static_cast(b_idx * q_head_num + q_idx * q_tile); + CYCLE_COUNT_LAP(prof_scope); + uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; - uint32_t qi_offsets[2] = {cur_offset, 0}; + uint32_t qi_offsets[2] = 
{static_cast(cur_offset), 0}; Tensor qi = query.view(tile2d_shapes, qi_offsets); - uint32_t out_view_offsets[2] = {cur_offset, 0}; + uint32_t out_view_offsets[2] = {static_cast(cur_offset), 0}; Tensor out_view = out.view(tile2d_shapes, out_view_offsets); + prof_view_count += 2; + CYCLE_COUNT_LAP(prof_tensor_view); + CYCLE_COUNT_LAP(prof_param_setup); TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci); const Tensor &oi = alloc_outs.get_ref(0); const Tensor &li_update = alloc_outs.get_ref(1); const Tensor &mi_update = alloc_outs.get_ref(2); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); for (uint64_t bn = 0; bn < bn_this_batch; bn++) { + PTO2_SCOPE_GUARD(); + uint32_t bt_idx[2] = {static_cast(b_idx), static_cast(bn)}; uint64_t cur_block_idx = static_cast(get_tensor_data(block_table, 2, bt_idx)); - uint64_t valid_len = - block_size < (cur_seq - bn * block_size) ? block_size : (cur_seq - bn * block_size); + uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); + CYCLE_COUNT_LAP(prof_param_extract); + uint32_t kv_shapes[2] = {static_cast(block_size), static_cast(head_dim)}; uint32_t kv_offsets[2] = {static_cast(cur_block_idx * block_size), 0}; Tensor kj = key_cache.view(kv_shapes, kv_offsets); Tensor vj = value_cache.view(kv_shapes, kv_offsets); + prof_view_count += 2; + CYCLE_COUNT_LAP(prof_tensor_view); Arg params_qk; params_qk.add_input(qi); params_qk.add_input(kj); params_qk.add_output(sij_ci); + CYCLE_COUNT_LAP(prof_param_setup); TaskOutputTensors qk_outs = pto2_rt_submit_aic_task(FUNC_QK_MATMUL, params_qk); const Tensor &sij = qk_outs.get_ref(0); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); uint32_t sij_valid_shapes[2] = {static_cast(q_tile), static_cast(valid_len)}; uint32_t sij_valid_offsets[2] = {0, 0}; Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); + prof_view_count += 1; + CYCLE_COUNT_LAP(prof_tensor_view); + Arg params_sf; params_sf.add_input(sij_valid); 
params_sf.add_output(pij_f16_ci); params_sf.add_output(scalar_ci); params_sf.add_output(scalar_ci); params_sf.add_scalar(scale_value); + CYCLE_COUNT_LAP(prof_param_setup); TaskOutputTensors sf_outs = pto2_rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf); const Tensor &pij_f16 = sf_outs.get_ref(0); const Tensor &mi = sf_outs.get_ref(1); const Tensor &li = sf_outs.get_ref(2); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); Arg params_pv; params_pv.add_input(pij_f16); params_pv.add_input(vj); params_pv.add_output(tile2d_ci); + CYCLE_COUNT_LAP(prof_param_setup); TaskOutputTensors pv_outs = pto2_rt_submit_aic_task(FUNC_PV_MATMUL, params_pv); const Tensor &oi_tmp = pv_outs.get_ref(0); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); uint64_t is_first = (bn == 0) ? 1 : 0; uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0; + CYCLE_COUNT_LAP(prof_param_extract); Arg params_up; params_up.add_input(mi); @@ -174,13 +229,53 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip params_up.add_inout(out_view); params_up.add_scalar(is_first); params_up.add_scalar(is_last); + CYCLE_COUNT_LAP(prof_param_setup); pto2_rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); } } + CYCLE_COUNT_LAP(prof_scope); } } - LOG_INFO("tasks submitted for batch=%" PRIu64 ", num_heads=%" PRIu64, batch, num_heads); + uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup + + prof_submit_task + prof_scope; + LOG_ALWAYS( + "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count, + prof_make_count, prof_view_count, cycles_to_us(total) + ); + if (total > 0) { + LOG_ALWAYS( + " param_extract : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract), + prof_param_extract * 100.0 / total + ); + LOG_ALWAYS( + " ext_tensor(x4) : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 
/ total + ); + LOG_ALWAYS( + " create_info(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor), + prof_make_tensor * 100.0 / total, + prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0 + ); + LOG_ALWAYS( + " tensor_view(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view), + prof_tensor_view * 100.0 / total, + prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0 + ); + LOG_ALWAYS( + " param_setup : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total + ); + LOG_ALWAYS(" scope : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total); + LOG_ALWAYS( + " submit_task(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task), + prof_submit_task * 100.0 / total, + prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0 + ); + } + +#undef CYCLE_COUNT_START +#undef CYCLE_COUNT_LAP } } // extern "C" diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py index 2e6eb99fb..a877c3ab2 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py @@ -7,10 +7,10 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -"""Paged attention — tensormap_and_ringbuffer example (small scale, float16). +"""Paged attention — tensormap_and_ringbuffer test (production scale, bfloat16). AIC+AIV mixed execution with online softmax paged attention. -Small-scale cases including variable sequence lengths. 
+Production-scale cases for A5 hardware validation. """ import torch @@ -25,13 +25,13 @@ class TestPagedAttention(SceneTestCase): """Paged attention with tensormap_and_ringbuffer runtime on A5.""" - RTOL = 1e-2 - ATOL = 1e-2 + RTOL = 1e-3 + ATOL = 1e-3 CALLABLE = { "orchestration": { "source": "kernels/orchestration/paged_attention_orch.cpp", - "function_name": "aicpu_orchestration_entry", + "function_name": "build_paged_attention_graph", "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], }, "incores": [ @@ -65,6 +65,53 @@ class TestPagedAttention(SceneTestCase): CASES = [ { "name": "Case1", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case2", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case3", + "platforms": ["a5"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "SmallCase1", "platforms": ["a5sim", "a5"], "config": {"aicpu_thread_num": 4, "block_dim": 24}, "params": { @@ -75,11 +122,11 @@ class TestPagedAttention(SceneTestCase): "block_size": 16, "context_len": 33, "max_model_len": 256, - "dtype": "float16", + "dtype": "bfloat16", }, }, { - "name": "Case2", + "name": "SmallCase2", "platforms": ["a5sim", "a5"], "config": {"aicpu_thread_num": 4, "block_dim": 24}, "manual": True, @@ -91,11 +138,11 @@ class TestPagedAttention(SceneTestCase): "block_size": 16, 
"context_len": 128, "max_model_len": 256, - "dtype": "float16", + "dtype": "bfloat16", }, }, { - "name": "CaseVarSeq2", + "name": "SmallCaseVarSeq2", "platforms": ["a5sim", "a5"], "config": {"aicpu_thread_num": 4, "block_dim": 24}, "manual": True, @@ -108,11 +155,11 @@ class TestPagedAttention(SceneTestCase): "context_len": 33, "context_lens_list": [33, 17], "max_model_len": 256, - "dtype": "float16", + "dtype": "bfloat16", }, }, { - "name": "CaseVarSeq4", + "name": "SmallCaseVarSeq4", "platforms": ["a5sim", "a5"], "config": {"aicpu_thread_num": 4, "block_dim": 24}, "manual": True, @@ -125,7 +172,7 @@ class TestPagedAttention(SceneTestCase): "context_len": 128, "context_lens_list": [33, 64, 128, 15], "max_model_len": 256, - "dtype": "float16", + "dtype": "bfloat16", }, }, ] diff --git a/tests/st/a5/host_build_graph/paged_attention/README.md b/tests/st/a5/host_build_graph/paged_attention/README.md index bb280c331..c6c7a56a3 100644 --- a/tests/st/a5/host_build_graph/paged_attention/README.md +++ b/tests/st/a5/host_build_graph/paged_attention/README.md @@ -1,4 +1,4 @@ -# Paged Attention (Device Test) +# Paged Attention (A5 host_build_graph) This example demonstrates Paged Attention implementation using CCE (Cube Core Engine) code generation, with AIC matmul kernels and AIV vector kernels using PTO Tile API. @@ -13,16 +13,18 @@ Paged Attention is an efficient attention mechanism that processes KV cache in f ### Supported Platforms | Platform | Description | -|----------|-------------| -| a2a3 | Ascend hardware (requires device ID) | +| -------- | ----------- | +| a5sim | Simulator | +| a5 | Ascend hardware | -> This test uses bfloat16 data types and production-scale shapes that are not supported by the a2a3sim simulator. It only runs on real hardware. +This directory contains the `host_build_graph` variant of the A5 paged attention scene test. +The `tensormap_and_ringbuffer` variant lives separately under `examples/a5/tensormap_and_ringbuffer/paged_attention/`. 
### Algorithm For each query token, the attention is computed incrementally across KV cache blocks: -``` +```text For each block j: sij = Qi @ Kj^T # QK MatMul (AIC) mij, lij, pij = softmax_prepare(sij) # Softmax (AIV) @@ -33,7 +35,7 @@ For each block j: ### Kernel Design (AIC/AIV Split) | Kernel | Core Type | Operation | Key Instructions | -|--------|-----------|-----------|------------------| +| ------ | --------- | --------- | ---------------- | | aic_qk_matmul | AIC (Cube) | Q @ K^T | TLOAD/TMOV/TMATMUL/TSTORE | | aiv_softmax_prepare | AIV (Vector) | scale, rowmax, exp, rowsum | TMULS/TROWMAX/TROWEXPANDSUB/TEXP/TROWSUM | | aic_pv_matmul | AIC (Cube) | P @ V | TLOAD/TMOV/TMATMUL/TSTORE | @@ -41,7 +43,7 @@ For each block j: ### Memory Hierarchy (AIC Matmul) -``` +```text GM -> L1 (Mat tiles) -> L0A/L0B -> L0C (Accumulator) -> GM ``` @@ -49,7 +51,7 @@ GM -> L1 (Mat tiles) -> L0A/L0B -> L0C (Accumulator) -> GM For each batch, the task dependency pattern is: -``` +```text Block 0: QK -> SF -> PV --+ Block 1: QK -> SF -> PV --+--> UP[0] -> UP[1] -> ... 
-> UP[n] Block n: QK -> SF -> PV --+ @@ -61,45 +63,40 @@ Block n: QK -> SF -> PV --+ ## Quick Start ```bash -# Run on hardware (specify device ID) -python examples/scripts/run_example.py \ - -k tests/st/host_build_graph/paged_attention/kernels \ - -g tests/st/host_build_graph/paged_attention/golden.py \ - -p a2a3 -d 0 - -# Run multi-block test case -PA_CASE=Case2 python examples/scripts/run_example.py \ - -k tests/st/host_build_graph/paged_attention/kernels \ - -g tests/st/host_build_graph/paged_attention/golden.py \ - -p a2a3 -d 0 +# Run the default case on sim +python tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py -p a5sim + +# Run a specific hardware case +python tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py -p a5 -d 0 -k Case2 ``` ## Directory Structure -``` +```text paged_attention/ ├── README.md # This file -├── golden.py # Input generation and expected output +├── test_paged_attention.py # Scene test entry └── kernels/ - ├── kernel_config.py # Kernel registration config - ├── aic/ # AIC kernels (CCE codegen style) - │ ├── aic_qk_matmul.cpp # Q @ K^T matmul - │ └── aic_pv_matmul.cpp # P @ V matmul - ├── aiv/ # AIV kernels (PTO Tile API) - │ ├── aiv_softmax_prepare.cpp # Softmax preparation - │ └── aiv_online_update.cpp # Online Softmax update + normalize + ├── aic/ + │ ├── aic_qk_matmul.cpp + │ └── aic_pv_matmul.cpp + ├── aiv/ + │ ├── aiv_softmax_prepare.cpp + │ └── aiv_online_update.cpp └── orchestration/ - └── paged_attention_orch.cpp # Task graph builder + └── paged_attention_orch.cpp ``` ## Test Cases -| Case | batch | num_heads | kv_head_num | head_dim | block_size | context_len | Description | -|------|-------|-----------|-------------|----------|------------|-------------|-------------| -| Case1 | 1 | 16 | 1 | 128 | 128 | 256 | Small scale (default) | -| Case2 | 8 | 64 | 1 | 128 | 64 | 8192 | Production scale | +| Case | batch | num_heads | kv_head_num | head_dim | block_size | context_len | Platforms | +| 
---- | ----- | --------- | ----------- | -------- | ---------- | ----------- | --------- | +| Case1 | 256 | 16 | 1 | 128 | 128 | 8100 | a5 | +| Case2 | 64 | 64 | 1 | 128 | 64 | 8150 | a5 | +| SmallCase1 | 1 | 16 | 1 | 16 | 16 | 16 | a5sim, a5 | +| SmallCase2 | 1 | 16 | 1 | 16 | 16 | 64 | a5sim, a5 | -All test cases use **bfloat16** Q/K/V inputs with GQA (kv_head_num=1). +All cases use **bfloat16** Q/K/V inputs with GQA (`kv_head_num=1`). ## Key Technical Details @@ -161,16 +158,16 @@ TROWEXPANDMUL(oiTile, oiTile, alphaTileDN); ## Expected Output -``` +```text === Compiling and Registering Kernels === Compiling kernel: .../aic_qk_matmul.cpp (func_id=0) Compiling kernel: .../aiv_softmax_prepare.cpp (func_id=1) Compiling kernel: .../aic_pv_matmul.cpp (func_id=2) Compiling kernel: .../aiv_online_update.cpp (func_id=3) ... -=== build_paged_attention_graph (16x16 framework version) === +=== build_paged_attention_graph === batch=1, num_heads=16, kv_head_num=1, head_dim=16 -block_size=16, block_num=1 +block_size=16, max_num_blocks=16 ... Created 4 tasks ... @@ -185,7 +182,7 @@ TEST PASSED ## Reference -This implementation uses the Online Softmax algorithm for paged attention, with identical kernel structure to the PyPTO reference implementation. +This implementation uses the Online Softmax algorithm for paged attention, with an AIC/AIV split tailored for the `host_build_graph` runtime on A5. 
## See Also diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp index 8d708928c..74584d6a1 100644 --- a/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp +++ b/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -98,9 +98,12 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { __gm__ uint8_t *vj = reinterpret_cast<__gm__ uint8_t *>(args[1]); __gm__ uint8_t *oi_new = reinterpret_cast<__gm__ uint8_t *>(args[2]); int q_tile_size = static_cast(args[3]); - // args[4] = block_size, args[5] = head_dim + int block_size = static_cast(args[4]); + // args[5] = head_dim - if (q_tile_size == 16) { + if (q_tile_size == 16 && block_size <= 16) { + pv_matmul_impl<16, 16, 16>(pij, vj, oi_new); + } else if (q_tile_size == 16) { pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); } else { pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp index abbf6537c..27c524a32 100644 --- a/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp +++ b/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -99,9 +99,12 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { __gm__ uint8_t *kj = reinterpret_cast<__gm__ uint8_t *>(args[1]); __gm__ uint8_t *sij = reinterpret_cast<__gm__ uint8_t *>(args[2]); int q_tile_size = static_cast(args[3]); - // args[4] = head_dim (128), args[5] = block_size + int head_dim = static_cast(args[4]); + // args[5] = block_size - if (q_tile_size == 16) { + if (q_tile_size == 16 && head_dim <= 16) { + qk_matmul_impl<16, 16, 16>(qi, kj, sij); + } else if (q_tile_size == 16) { qk_matmul_impl<16, 128, 128>(qi, kj, sij); } else { qk_matmul_impl<64, 128, 64>(qi, kj, sij); diff --git 
a/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp index fbc55c324..965dffbbc 100644 --- a/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp +++ b/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -229,9 +229,11 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { int is_last = static_cast(args[7]); __gm__ uint8_t *dst = reinterpret_cast<__gm__ uint8_t *>(args[8]); int q_tile_size = static_cast(args[9]); - // args[10] = head_dim (128) + int head_dim = static_cast(args[10]); - if (q_tile_size == 16) { + if (q_tile_size == 16 && head_dim <= 16) { + online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else if (q_tile_size == 16) { online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); } else { online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index 0e87b525d..51a3315c9 100644 --- a/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -127,10 +127,12 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { __gm__ uint8_t *mij = reinterpret_cast<__gm__ uint8_t *>(args[3]); __gm__ uint8_t *lij = reinterpret_cast<__gm__ uint8_t *>(args[4]); int q_tile_size = static_cast(args[5]); - // args[6] = block_size + int block_size = static_cast(args[6]); int valid_len = static_cast(args[7]); - if (q_tile_size == 16) { + if (q_tile_size == 16 && block_size <= 16) { + softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij, valid_len); + } else if (q_tile_size == 16) { softmax_prepare_impl<16, 128>(sij, 
scale_value, pij, mij, lij, valid_len); } else { softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij, valid_len); diff --git a/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py index 2d3b12d3b..143092ce5 100644 --- a/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py +++ b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py @@ -94,6 +94,37 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase): "dtype": "bfloat16", }, }, + { + "name": "SmallCase1", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 3, "block_dim": 3}, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 16, + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "SmallCase2", + "platforms": ["a5sim", "a5"], + "config": {"aicpu_thread_num": 3, "block_dim": 3}, + "manual": True, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 64, + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, ] def generate_args(self, params): diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp diff --git 
a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py diff --git 
a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp deleted file mode 100644 index 5bca56442..000000000 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) -// -// Supports two tile configurations via runtime dispatch: -// Case1: (16, 128) @ (128, 128) -> (16, 128) -// Case2: (64, 64) @ ( 64, 128) -> (64, 128) -// -// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT). -// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. -// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. 
- -#include -#include - -#include "tensor.h" - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) { - __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); - __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr); - __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); - - // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32 - using GlobalA = GlobalTensor, pto::Stride>; - using GlobalB = GlobalTensor, pto::Stride>; - using GlobalOut = GlobalTensor, pto::Stride>; - - GlobalA pijGlobal(pij_addr + pij->start_offset); - GlobalB vjGlobal(vj_addr + vj->start_offset); - GlobalOut oiGlobal(oi_addr + oi->start_offset); - - // L1 Mat tiles: standard ND pattern for both A and B - using TileMatA = Tile; - using TileMatB = Tile; - - // L0 tiles - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - // Load pij and vj to L1 with separate events for pipeline overlap - TLOAD(aMatTile, pijGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done - TLOAD(bMatTile, vjGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done - - // Move A to L0A as soon as A load completes (B may still be loading) - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - TMOV(aTile, aMatTile); - // Move B to L0B after B load completes - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - // Single matmul: (M,K) x (K,N) -> (M,N) - TMATMUL(cTile, 
aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(oiGlobal, cTile); - - set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); - wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); - uint64_t q_tile_size = static_cast(pij->shapes[0]); - // args[4] = block_size, args[5] = head_dim - - if (q_tile_size == 16) { - pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); - } else { - pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); - } -} diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp deleted file mode 100644 index 0bfa9c460..000000000 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ -// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) -// -// Supports two tile configurations via runtime dispatch: -// Case1: (16, 128) @ (128, 128).T -> (16, 128) -// Case2: (64, 128) @ (128, 64).T -> (64, 64) -// -// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. -// This is equivalent to (K, N) in column-major (DN) layout. -// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. - -#include -#include - -#include "tensor.h" - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) { - __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr); - __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr); - __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); - - // qi (M, K) bf16 in ND (row-major) layout - using GlobalA = GlobalTensor, pto::Stride>; - // kj stored as (N, K) row-major = (K, N) column-major -> DN layout - using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; - using GlobalOut = GlobalTensor, pto::Stride>; - - GlobalA qiGlobal(qi_addr + qi->start_offset); - GlobalB kjGlobal(kj_addr + kj->start_offset); - GlobalOut sijGlobal(sij_addr + sij->start_offset); - - // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) - using TileMatA = Tile; - using TileMatB = Tile; - - // L0 tiles - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - // Load A and B to 
L1 with separate events for pipeline overlap - TLOAD(aMatTile, qiGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done - TLOAD(bMatTile, kjGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done - - // Move A to L0A as soon as A load completes (B may still be loading) - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - TMOV(aTile, aMatTile); - // Move B to L0B after B load completes - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - // Matmul - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(sijGlobal, cTile); - - set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); - wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]); - uint64_t q_tile_size = static_cast(qi->shapes[0]); - // args[4] = head_dim (128), args[5] = block_size - - if (q_tile_size == 16) { - qk_matmul_impl<16, 128, 128>(qi, kj, sij); - } else { - qk_matmul_impl<64, 128, 64>(qi, kj, sij); - } -} diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp deleted file mode 100644 index a7ffed408..000000000 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. 
You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Online Softmax Update + Normalize Kernel (AIV) -// -// Operates on full tiles where M=q_tile_size, N=head_dim (128): -// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors -// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors -// -// Scalar layout strategy using TRESHAPE (zero-copy UB reshape): -// Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV. -// For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M). -// After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops. -// This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original. 
- -#include -#include - -#include "tensor.h" - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void online_update_impl( - __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li, - __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst -) { - __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); - __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); - __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr); - __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr); - __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr); - __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); - __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr); - - // Aligned rows for ColMajor DN tiles (32-byte alignment) - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - - // --- GlobalTensor types --- - - // Data (M, N) RowMajor - using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; - - // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading - using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; - - // Scalar ND: for storing mi_new and li_new back to GM - constexpr int kScalarCols = 32 / sizeof(float); - constexpr int kScalarRows = M / kScalarCols; - using GlobalScalarND = - GlobalTensor, pto::Stride<1, 1, 1, kScalarCols, 1>>; - - // --- GlobalTensor instances --- - - GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); - GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); - GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); - - // DN globals for loading scalars as ColMajor - GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); - GlobalScalarDN 
lijGlobalDN(lij_ptr + lij->start_offset); - GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset); - GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); - - // ND globals for storing scalar results - GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); - GlobalScalarND liGlobalND(li_ptr + li->start_offset); - - // --- Tile types --- - - using TileDataMxN = Tile; - using TileScalarDN = Tile; - - // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE - using TileScalarRow = Tile; - - // ND tile for storing back to GM - using TileScalarND = - Tile; - - // --- UB memory layout --- - - constexpr int kDataBytes = M * N * sizeof(float); - constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); - - // Data tiles - TileDataMxN oiNewTile; - TileDataMxN oiTile; - - // Scalar DN tiles loaded from GM (ColMajor) - TileScalarDN mijDN, lijDN, miDN, liDN; - - // Temporary DN tiles for results - TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN; - - TASSIGN(oiNewTile, 0); - TASSIGN(oiTile, kDataBytes); - TASSIGN(mijDN, 2 * kDataBytes); - TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes); - TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes); - TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes); - TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes); - TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes); - TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes); - TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes); - - if (is_first) { - // --- First block: copy inputs to accumulators --- - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(mijDN, mijGlobalDN); - TLOAD(lijDN, lijGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Store mi = mij, li = lij, oi = oi_new - // Alias ND tiles to the same UB as DN tiles for storing as ND format - TileScalarND mijND, lijND; - TASSIGN(mijND, 2 * kDataBytes); // alias same UB as mijDN - TASSIGN(lijND, 2 * kDataBytes + 
kScalarDNBytes); // alias same UB as lijDN - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, mijND); // mi = mij - TSTORE(liGlobalND, lijND); // li = lij - TSTORE(oiGlobal, oiNewTile); // oi = oi_new - - if (is_last) { - // Single block: normalize dst = oi_new / lij - // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); - TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(dstGlobal, oiNewTile); - } - } else { - // --- Subsequent blocks: accumulate --- - - // Load all inputs - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(oiTile, oiGlobal); - TLOAD(mijDN, mijGlobalDN); - TLOAD(lijDN, lijGlobalDN); - TLOAD(miDN, miGlobalDN); - TLOAD(liDN, liGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic - TileScalarRow miRow, mijRow, liRow, lijRow; - TRESHAPE(miRow, miDN); - TRESHAPE(mijRow, mijDN); - TRESHAPE(liRow, liDN); - TRESHAPE(lijRow, lijDN); - - // Scalar arithmetic in RowMajor (1, M) layout - TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow; - TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes); - TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes); - TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes); - TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes); - - TMAX(miNewRow, miRow, mijRow); // mi_new = max(mi, mij) - TSUB(alphaRow, miRow, miNewRow); // alpha_exp = mi - mi_new - TEXP(alphaRow, alphaRow); // alpha = exp(mi - mi_new) - TSUB(betaRow, mijRow, miNewRow); // beta_exp = mij - mi_new - TEXP(betaRow, betaRow); // beta = exp(mij - mi_new) - TMUL(tmpRow, alphaRow, liRow); // alpha * li - TMUL(liNewRow, betaRow, lijRow); // beta * lij 
- TADD(liNewRow, tmpRow, liNewRow); // li_new = alpha*li + beta*lij - - // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL - TRESHAPE(alphaDN, alphaRow); - TRESHAPE(betaDN, betaRow); - - // Scale data tiles using row-broadcast multiply - TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha - TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta - TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new - - // Store mi_new and li_new to GM (ND format) - // Alias ND tiles to the same UB locations as miNewRow and liNewRow - TileScalarND miNewND, liNewND; - TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes); - - if (is_last) { - // Normalize and output: dst = oi / li_new - TRESHAPE(liNewDN, liNewRow); - TROWEXPANDDIV(oiTile, oiTile, liNewDN); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liNewND); // persist li_new - TSTORE(dstGlobal, oiTile); - } else { - // Store updated accumulators - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liNewND); // persist li_new - TSTORE(oiGlobal, oiTile); - } - } - - set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); - wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); - __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]); - __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]); - __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]); - __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]); - uint64_t is_first = static_cast(args[7]); - uint64_t 
is_last = static_cast(args[8]); - uint64_t q_tile_size = static_cast(mij->shapes[0]); - // args[10] = head_dim (128) - - if (q_tile_size == 16) { - online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); - } else { - online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); - } -} diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp deleted file mode 100644 index 0e6e6bd9c..000000000 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Softmax Preparation Kernel (AIV) with partial block masking -// -// Operates on (M, N) tile where M=q_tile_size, N=block_size: -// Case1: sij is (16, 128) -// Case2: sij is (64, 64) -// -// For partial blocks (valid_len < N), positions [valid_len, N) in sij are -// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0 -// so that invalid key positions contribute zero attention weight. 
-// -// Computes: -// sij_masked = TFILLPAD(sij, valid_len, pad=-inf) -// sij_scale = sij_masked * scale -// mij = row_max(sij_scale) -> (M, 1) -// pij = exp(sij_scale - mij) -> (M, N) -// lij = row_sum(pij) -> (M, 1) - -#include -#include - -#include "tensor.h" - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void softmax_prepare_impl( - __gm__ Tensor *sij, float scale_value, __gm__ Tensor *pij, __gm__ Tensor *mij, __gm__ Tensor *lij -) { - uint64_t valid_len = static_cast(sij->shapes[1]); - __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); - __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); - __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); - __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); - - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - - using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; - using GlobalDataMxN_bf16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; - using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; - - GlobalDataMxN sijGlobal(sij_addr + sij->start_offset); - GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset); - GlobalScalarDN mijGlobal(mij_addr + mij->start_offset); - GlobalScalarDN lijGlobal(lij_addr + lij->start_offset); - - // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary - using TileSijDyn = Tile; - // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf - using TileSijPad = Tile; - - using TileVecMxN = Tile; - using TileVecMxN_bf16 = Tile; - using TileScalarDN = Tile; - - TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); - TileSijPad sijPadTile; - TileVecMxN pijTile; - TileVecMxN tmpTile; - TileScalarDN maxTile; - TileScalarDN sumTile; - TileVecMxN_bf16 pijBf16Tile; - - 
// All sij tiles share UB address 0x0 (in-place masking) - TASSIGN(sijTile, 0x0); - TASSIGN(sijDynTile, 0x0); - TASSIGN(sijPadTile, 0x0); - TASSIGN(pijTile, M * N * sizeof(float)); - TASSIGN(tmpTile, 2 * M * N * sizeof(float)); - TASSIGN(maxTile, 3 * M * N * sizeof(float)); - TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); - TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); - - // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks - // printf("sij addr incore %x\n", sij->buffer.addr); - TLOAD(sijTile, sijGlobal); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary, - // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N. - TFILLPAD_INPLACE(sijPadTile, sijDynTile); - - TMULS(sijTile, sijTile, scale_value); - TROWMAX(maxTile, sijTile, tmpTile); - TROWEXPANDSUB(pijTile, sijTile, maxTile); - TEXP(pijTile, pijTile); - // Truncate pij to bf16 first - TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); // pij bf16 ready, can store early - - // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel - TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); - TROWSUM(sumTile, pijTile, tmpTile); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); // sum ready - - // Store pij (overlaps with TCVT + TROWSUM above) - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(pijGlobal, pijBf16Tile); - - // Store max and sum - TSTORE(mijGlobal, maxTile); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(lijGlobal, sumTile); - - set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); - wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor 
*>(args[1]); - __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]); - __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]); - union { - uint64_t u; - float f; - } scale_conv; - scale_conv.u = static_cast(args[4]); - float scale_value = scale_conv.f; - uint64_t q_tile_size = static_cast(sij->shapes[0]); - - if (q_tile_size == 16) { - softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij); - } else { - softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij); - } -} diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp deleted file mode 100644 index b3314019a..000000000 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Paged Attention Orchestration Function - 16x16 Version - * - * Simplified for 16x16 framework-generated matmul kernels. - * Each block processes a single 16x16 matmul operation. 
- * - * Memory Layout: - * Query: (batch, 16, 16) - one 16x16 tile per batch - * Key: (total_blocks, 16, 16) - stored as K^T for direct matmul - * Value: (total_blocks, 16, 16) - direct format - */ - -#include -#include -#include -#include - -#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) - -#define FUNC_QK_MATMUL 0 -#define FUNC_SOFTMAX_PREPARE 1 -#define FUNC_PV_MATMUL 2 -#define FUNC_ONLINE_UPDATE 3 -constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz - -inline double cycles_to_us(uint64_t cycles) { - return (static_cast(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0; -} - -inline uint64_t get_sys_cnt_aicpu() { - uint64_t ticks; - asm volatile("mrs %0, cntvct_el0" : "=r"(ticks)); - return ticks; -} - -#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 -#define CYCLE_COUNT_LAP(acc) \ - do { \ - _t1 = get_sys_cnt_aicpu(); \ - acc += (_t1 - _t0); \ - _t0 = _t1; \ - } while (0) - -extern "C" { - -__attribute__((visibility("default"))) PTO2OrchestrationConfig -aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { - (void)orch_args; // NOLINT(readability/casting) - return PTO2OrchestrationConfig{ - .expected_arg_count = 7, - }; -} - -__attribute__((visibility("default"))) void build_paged_attention_graph(const ChipStorageTaskArgs &orch_args) { - uint64_t prof_param_extract = 0; - uint64_t prof_ext_tensor = 0; - uint64_t prof_scope = 0; - uint64_t prof_make_tensor = 0; - uint64_t prof_tensor_view = 0; - uint64_t prof_param_setup = 0; - uint64_t prof_submit_task = 0; - int prof_submit_count = 0; - int prof_make_count = 0; - int prof_view_count = 0; - - CYCLE_COUNT_START(); - - // Read dimensions from tensor metadata - uint64_t batch = orch_args.tensor(0).shapes[0]; - uint64_t num_heads = orch_args.tensor(0).shapes[1]; - uint64_t head_dim = orch_args.tensor(0).shapes[2]; - DataType data_type = orch_args.tensor(0).dtype; - - uint64_t block_size = orch_args.tensor(1).shapes[1]; - uint64_t block_num = 
orch_args.tensor(3).shapes[1]; - - uint64_t scale_value = orch_args.scalar(0); - - uint64_t q_head_num = num_heads; - uint64_t q_tile = std::min(num_heads, 128UL); - uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; - CYCLE_COUNT_LAP(prof_param_extract); - - LOG_ALWAYS(">>>>>> batch = %" PRIu64, batch); - - // Reshape tensors for kernel consumption (2D flattened) - void *query_ptr = orch_args.tensor(0).data_as(); - void *kc_ptr = orch_args.tensor(1).data_as(); - void *vc_ptr = orch_args.tensor(2).data_as(); - void *out_ptr = orch_args.tensor(5).data_as(); - - uint64_t total_blocks_count = orch_args.tensor(1).shapes[0]; - - uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; - uint32_t key_cache_shapes[2] = { - static_cast(total_blocks_count * block_size), static_cast(head_dim) - }; - uint32_t value_cache_shapes[2] = { - static_cast(total_blocks_count * block_size), static_cast(head_dim) - }; - uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; - Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type); - Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type); - Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type); - Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); - CYCLE_COUNT_LAP(prof_ext_tensor); - - uint32_t bt_shapes[2] = {static_cast(batch), static_cast(block_num)}; - Tensor block_table = - make_tensor_external(orch_args.tensor(3).data_as(), bt_shapes, 2, DataType::INT32, false); - uint32_t cl_shapes[1] = {static_cast(batch)}; - Tensor context_lens = - make_tensor_external(orch_args.tensor(4).data_as(), cl_shapes, 1, DataType::INT32, false); - - // Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size - uint32_t tile2d_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t scalar_shapes[1] = {static_cast(q_tile)}; - uint32_t sij_shapes[2] = 
{static_cast(q_tile), static_cast(block_size)}; - TensorCreateInfo tile2d_ci(tile2d_shapes, 2, DataType::FLOAT32); - TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32); - TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32); - TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type); - - prof_make_count += 4; - CYCLE_COUNT_LAP(prof_make_tensor); - - int total_tasks = 0; - - for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { - uint32_t cl_idx[1] = {static_cast(b_idx)}; - uint64_t cur_seq = static_cast(get_tensor_data(context_lens, 1, cl_idx)); - uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; - for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { - PTO2_SCOPE() { - CYCLE_COUNT_LAP(prof_scope); - uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; - - uint32_t qi_offsets[2] = {static_cast(cur_offset), 0}; - Tensor qi = query.view(tile2d_shapes, qi_offsets); - uint32_t out_view_offsets[2] = {static_cast(cur_offset), 0}; - Tensor out_view = out.view(tile2d_shapes, out_view_offsets); - prof_view_count += 2; - CYCLE_COUNT_LAP(prof_tensor_view); - - CYCLE_COUNT_LAP(prof_param_setup); - TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci); - const Tensor &oi = alloc_outs.get_ref(0); - const Tensor &li_update = alloc_outs.get_ref(1); - const Tensor &mi_update = alloc_outs.get_ref(2); - prof_submit_count++; - CYCLE_COUNT_LAP(prof_submit_task); - - for (uint64_t bn = 0; bn < bn_this_batch; bn++) { - PTO2_SCOPE_GUARD(); - - uint32_t bt_idx[2] = {static_cast(b_idx), static_cast(bn)}; - uint64_t cur_block_idx = static_cast(get_tensor_data(block_table, 2, bt_idx)); - uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); - CYCLE_COUNT_LAP(prof_param_extract); - - uint32_t kv_shapes[2] = {static_cast(block_size), static_cast(head_dim)}; - uint32_t kv_offsets[2] = {static_cast(cur_block_idx * block_size), 0}; - Tensor kj = key_cache.view(kv_shapes, kv_offsets); - Tensor vj = value_cache.view(kv_shapes, 
kv_offsets); - prof_view_count += 2; - CYCLE_COUNT_LAP(prof_tensor_view); - - Arg params_qk; - params_qk.add_input(qi); - params_qk.add_input(kj); - params_qk.add_output(sij_ci); - CYCLE_COUNT_LAP(prof_param_setup); - TaskOutputTensors qk_outs = pto2_rt_submit_aic_task(FUNC_QK_MATMUL, params_qk); - const Tensor &sij = qk_outs.get_ref(0); - prof_submit_count++; - CYCLE_COUNT_LAP(prof_submit_task); - - uint32_t sij_valid_shapes[2] = {static_cast(q_tile), static_cast(valid_len)}; - uint32_t sij_valid_offsets[2] = {0, 0}; - Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); - prof_view_count += 1; - CYCLE_COUNT_LAP(prof_tensor_view); - - Arg params_sf; - params_sf.add_input(sij_valid); - params_sf.add_output(pij_f16_ci); - params_sf.add_output(scalar_ci); - params_sf.add_output(scalar_ci); - params_sf.add_scalar(scale_value); - CYCLE_COUNT_LAP(prof_param_setup); - TaskOutputTensors sf_outs = pto2_rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf); - const Tensor &pij_f16 = sf_outs.get_ref(0); - const Tensor &mi = sf_outs.get_ref(1); - const Tensor &li = sf_outs.get_ref(2); - prof_submit_count++; - CYCLE_COUNT_LAP(prof_submit_task); - - Arg params_pv; - params_pv.add_input(pij_f16); - params_pv.add_input(vj); - params_pv.add_output(tile2d_ci); - CYCLE_COUNT_LAP(prof_param_setup); - TaskOutputTensors pv_outs = pto2_rt_submit_aic_task(FUNC_PV_MATMUL, params_pv); - const Tensor &oi_tmp = pv_outs.get_ref(0); - prof_submit_count++; - CYCLE_COUNT_LAP(prof_submit_task); - - uint64_t is_first = (bn == 0) ? 1 : 0; - uint64_t is_last = (bn == bn_this_batch - 1) ? 
1 : 0; - CYCLE_COUNT_LAP(prof_param_extract); - - Arg params_up; - params_up.add_input(mi); - params_up.add_input(li); - params_up.add_input(oi_tmp); - params_up.add_inout(mi_update); - params_up.add_inout(li_update); - params_up.add_inout(oi); - params_up.add_inout(out_view); - params_up.add_scalar(is_first); - params_up.add_scalar(is_last); - CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up); - prof_submit_count++; - CYCLE_COUNT_LAP(prof_submit_task); - } - } - CYCLE_COUNT_LAP(prof_scope); - } - } - - uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup + - prof_submit_task + prof_scope; - LOG_ALWAYS( - "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count, - prof_make_count, prof_view_count, cycles_to_us(total) - ); - if (total > 0) { - LOG_ALWAYS( - " param_extract : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract), - prof_param_extract * 100.0 / total - ); - LOG_ALWAYS( - " ext_tensor(x4) : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total - ); - LOG_ALWAYS( - " create_info(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor), - prof_make_tensor * 100.0 / total, - prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0 - ); - LOG_ALWAYS( - " tensor_view(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view), - prof_tensor_view * 100.0 / total, - prof_view_count > 0 ? 
cycles_to_us(prof_tensor_view) / prof_view_count : 0.0 - ); - LOG_ALWAYS( - " param_setup : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total - ); - LOG_ALWAYS(" scope : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total); - LOG_ALWAYS( - " submit_task(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task), - prof_submit_task * 100.0 / total, - prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0 - ); - } - -#undef CYCLE_COUNT_START -#undef CYCLE_COUNT_LAP -} - -} // extern "C" diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py deleted file mode 100644 index 4e3a52890..000000000 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged attention — tensormap_and_ringbuffer test (production scale, bfloat16). - -AIC+AIV mixed execution with online softmax paged attention. -Production-scale cases for A5 hardware validation. 
-""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test -from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden -from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs - - -@scene_test(level=2, runtime="tensormap_and_ringbuffer") -class TestPagedAttention(SceneTestCase): - """Paged attention with tensormap_and_ringbuffer runtime on A5.""" - - RTOL = 1e-3 - ATOL = 1e-3 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/paged_attention_orch.cpp", - "function_name": "build_paged_attention_graph", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "source": "kernels/aic/aic_qk_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "source": "kernels/aic/aic_pv_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "source": "kernels/aiv/aiv_softmax_prepare.cpp", - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "source": "kernels/aiv/aiv_online_update.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, - ], - } - - CASES = [ - { - "name": "Case1", - "platforms": ["a5"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "params": { - "batch": 256, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - { - "name": "Case2", - "platforms": ["a5"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "manual": True, - "params": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - { - "name": "Case3", - "platforms": ["a5"], 
- "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "manual": True, - "params": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 256, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - ] - - def generate_args(self, params): - inputs = _pa_generate_inputs(params) - specs = [] - for name, val in inputs: - if isinstance(val, torch.Tensor): - specs.append(Tensor(name, val)) - else: - specs.append(Scalar(name, val)) - return TaskArgsBuilder(*specs) - - def compute_golden(self, args, params): - tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} - _pa_compute_golden(tensors, params) - for s in args.specs: - if isinstance(s, Tensor) and s.name in tensors: - getattr(args, s.name)[:] = tensors[s.name] - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aic/kernel_spmd_read.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aic/kernel_spmd_read.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aic/kernel_spmd_read.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aic/kernel_spmd_read.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aiv/kernel_spmd_read.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aiv/kernel_spmd_read.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aiv/kernel_spmd_read.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aiv/kernel_spmd_read.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp rename to 
tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py rename to tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp similarity index 100% rename from 
examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp diff --git 
a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py rename to tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py rename to 
tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py similarity index 100% rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py From fb39511f90dbe201d9445ded2ea06d0fa45e1921 Mon Sep 17 00:00:00 2001 
From: majin0824 Date: Thu, 16 Apr 2026 21:06:06 +0800 Subject: [PATCH 3/5] fix: Complete the missing function names that were omitted during the migration process - During the previous use case migration process, some kernels lacked the definition of function names. - This submission has completed the missing names in the aic and aiv modules of test_*.py to maintain the integrity and consistency of the code. --- .../bgemm/test_bgemm.py | 2 ++ .../paged_attention/test_paged_attention.py | 4 +++ .../bgemm/test_bgemm.py | 2 ++ .../paged_attention/test_paged_attention.py | 4 +++ .../test_paged_attention_unroll.py | 6 ++++ .../paged_attention/test_paged_attention.py | 4 +++ .../test_batch_paged_attention.py | 4 +++ .../benchmark_bgemm/test_benchmark_bgemm.py | 2 ++ .../mixed_example/test_mixed_example.py | 5 ++++ .../test_multi_round_paged_attention.py | 4 +++ .../test_paged_attention_unroll.py | 4 +++ .../spmd_basic/test_spmd_basic.py | 6 ++-- .../test_spmd_multiblock_aiv.py | 2 +- .../test_spmd_multiblock_mix.py | 6 ++-- .../spmd_starvation/test_spmd_starvation.py | 21 ++++++++++++-- .../spmd_sync_start/test_spmd_sync_start.py | 21 ++++++++++++-- .../test_spmd_sync_start_aiv.py | 7 ++++- .../test_spmd_sync_start_edge.py | 21 ++++++++++++-- .../test_spmd_sync_start_stress.py | 28 ++++++++++++++++--- .../paged_attention/test_paged_attention.py | 4 +++ .../mixed_example/test_mixed_example.py | 5 ++++ .../test_paged_attention_unroll.py | 4 +++ .../spmd_basic/test_spmd_basic.py | 6 ++-- .../test_spmd_multiblock_aiv.py | 2 +- .../test_spmd_multiblock_mix.py | 6 ++-- .../spmd_starvation/test_spmd_starvation.py | 21 ++++++++++++-- .../spmd_sync_start/test_spmd_sync_start.py | 21 ++++++++++++-- .../test_spmd_sync_start_aiv.py | 7 ++++- .../test_spmd_sync_start_edge.py | 21 ++++++++++++-- .../test_spmd_sync_start_stress.py | 28 ++++++++++++++++--- 30 files changed, 236 insertions(+), 42 deletions(-) diff --git a/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py 
b/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py index f3e2d1c31..276f71175 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py +++ b/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py @@ -36,12 +36,14 @@ class TestBgemm(SceneTestCase): "incores": [ { "func_id": 0, + "name": "GEMM", "source": "kernels/aic/kernel_gemm_tile.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "ADD", "source": "kernels/aiv/kernel_tile_add.cpp", "core_type": "aiv", "signature": [D.INOUT, D.IN], diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py index ee58ece6a..559de8522 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py +++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py @@ -31,24 +31,28 @@ class TestPagedAttention(SceneTestCase): "incores": [ { "func_id": 0, + "name": "QK", "source": "kernels/aic/aic_qk_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "SF", "source": "kernels/aiv/aiv_softmax_prepare.cpp", "core_type": "aiv", "signature": [D.IN, D.OUT, D.OUT, D.OUT], }, { "func_id": 2, + "name": "PV", "source": "kernels/aic/aic_pv_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 3, + "name": "UP", "source": "kernels/aiv/aiv_online_update.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], diff --git a/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py b/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py index d7bc46a59..9601fcdf5 100644 --- a/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py +++ b/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py @@ -37,12 +37,14 @@ class TestBgemm(SceneTestCase): "incores": [ { "func_id": 0, + "name": "GEMM", "source": 
"kernels/mix/kernel_bgemm.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "ADD", "source": "kernels/mix/kernel_bgemm.cpp", "core_type": "aiv", "signature": [D.INOUT, D.IN], diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py index a877c3ab2..3579a2d6a 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py @@ -37,24 +37,28 @@ class TestPagedAttention(SceneTestCase): "incores": [ { "func_id": 0, + "name": "QK", "source": "kernels/aic/aic_qk_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 2, + "name": "PV", "source": "kernels/aic/aic_pv_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "SF", "source": "kernels/aiv/aiv_softmax_prepare.cpp", "core_type": "aiv", "signature": [D.IN, D.OUT, D.OUT, D.OUT], }, { "func_id": 3, + "name": "UP", "source": "kernels/aiv/aiv_online_update.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py index c8d78acbe..d0b982df0 100644 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py +++ b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py @@ -37,36 +37,42 @@ class TestPagedAttentionUnrollAicpuBuildGraph(SceneTestCase): "incores": [ { "func_id": 0, + "name": "QK", "source": "kernels/aic/aic_qk_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 2, + "name": "PV", "source": "kernels/aic/aic_pv_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, 
{ "func_id": 4, + "name": "AIC_HUB", "source": "kernels/aic/aic_hub.cpp", "core_type": "aic", "signature": [], }, { "func_id": 1, + "name": "SF", "source": "kernels/aiv/aiv_softmax_prepare.cpp", "core_type": "aiv", "signature": [D.IN, D.OUT, D.OUT, D.OUT], }, { "func_id": 3, + "name": "UP", "source": "kernels/aiv/aiv_online_update.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], }, { "func_id": 5, + "name": "AIV_HUB", "source": "kernels/aiv/aiv_hub.cpp", "core_type": "aiv", "signature": [], diff --git a/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py b/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py index 13b5159b7..232b68b29 100644 --- a/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py +++ b/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py @@ -37,24 +37,28 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase): "incores": [ { "func_id": 0, + "name": "QK", "source": "kernels/aic/aic_qk_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 2, + "name": "PV", "source": "kernels/aic/aic_pv_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "SF", "source": "kernels/aiv/aiv_softmax_prepare.cpp", "core_type": "aiv", "signature": [D.IN, D.OUT, D.OUT, D.OUT], }, { "func_id": 3, + "name": "UP", "source": "kernels/aiv/aiv_online_update.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py b/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py index cc1ed20e9..ecee598fd 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py @@ -31,24 +31,28 @@ 
class TestBatchPagedAttention(SceneTestCase): "incores": [ { "func_id": 0, + "name": "QK", "source": "kernels/aic/aic_qk_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "PV", "source": "kernels/aiv/aiv_softmax_prepare.cpp", "core_type": "aiv", "signature": [D.IN, D.OUT, D.OUT, D.OUT], }, { "func_id": 2, + "name": "SF", "source": "kernels/aic/aic_pv_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 3, + "name": "UP", "source": "kernels/aiv/aiv_online_update.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py b/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py index 514e2189a..05ea2d7a2 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py @@ -29,12 +29,14 @@ class TestBenchmarkBgemm(SceneTestCase): "incores": [ { "func_id": 0, + "name": "GEMM", "source": "kernels/aic/kernel_gemm_tile.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "ADD", "source": "kernels/aiv/kernel_tile_add.cpp", "core_type": "aiv", "signature": [D.INOUT, D.IN], diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py b/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py index daf598969..da21e903c 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py @@ -51,30 +51,35 @@ class TestMixedExample(SceneTestCase): "incores": [ { "func_id": 0, + "name": "MATMUL", "source": "kernels/aic/kernel_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "ADD", "source": "kernels/aiv/kernel_add.cpp", 
"core_type": "aiv", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 2, + "name": "MUL", "source": "kernels/aiv/kernel_mul.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 3, + "name": "ADD_STANDALONE", "source": "kernels/aiv/kernel_add_standalone.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 4, + "name": "MUL_STANDALONE", "source": "kernels/aiv/kernel_mul_standalone.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.OUT], diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/multi_round_paged_attention/test_multi_round_paged_attention.py b/tests/st/a2a3/tensormap_and_ringbuffer/multi_round_paged_attention/test_multi_round_paged_attention.py index b9520e5af..a78b91de5 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/multi_round_paged_attention/test_multi_round_paged_attention.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/multi_round_paged_attention/test_multi_round_paged_attention.py @@ -36,24 +36,28 @@ class TestMultiRoundPagedAttention(SceneTestCase): "incores": [ { "func_id": 0, + "name": "QK", "source": f"{_PA_KERNELS}/aic/aic_qk_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "PV", "source": f"{_PA_KERNELS}/aiv/aiv_softmax_prepare.cpp", "core_type": "aiv", "signature": [D.IN, D.OUT, D.OUT, D.OUT], }, { "func_id": 2, + "name": "SF", "source": f"{_PA_KERNELS}/aic/aic_pv_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 3, + "name": "UP", "source": f"{_PA_KERNELS}/aiv/aiv_online_update.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py index 847882d0a..f9dc66c54 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py +++ 
b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py @@ -31,24 +31,28 @@ class TestPagedAttentionUnroll(SceneTestCase): "incores": [ { "func_id": 0, + "name": "QK", "source": "kernels/aic/aic_qk_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "PV", "source": "kernels/aiv/aiv_softmax_prepare.cpp", "core_type": "aiv", "signature": [D.IN, D.OUT, D.OUT, D.OUT], }, { "func_id": 2, + "name": "SF", "source": "kernels/aic/aic_pv_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 3, + "name": "UP", "source": "kernels/aiv/aiv_online_update.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py index a35358e22..39ecfb73c 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py @@ -40,9 +40,9 @@ class TestSpmdBasic(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "kernels/aic/kernel_spmd_read.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"}, + {"func_id": 0, "name": "SPMD_READ_AIC", "source": "kernels/aic/kernel_spmd_read.cpp", "core_type": "aic"}, + {"func_id": 1, "name": "SPMD_READ_AIV0", "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"}, + {"func_id": 2, "name": "SPMD_READ_AIV1", "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"}, ], } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py index f3d74a142..63b5f3ea8 100644 --- 
a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py @@ -35,7 +35,7 @@ class TestSpmdMultiblockAiv(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"}, + {"func_id": 0, "name": "SPMD_WRITE_AIV", "source": "kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"}, ], } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py index edb931451..c0cc20cd6 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py @@ -36,9 +36,9 @@ class TestSpmdMultiblockMix(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + {"func_id": 0, "name": "SPMD_MIX_AIC", "source": "kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, + {"func_id": 1, "name": "SPMD_MIX_AIV0", "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + {"func_id": 2, "name": "SPMD_MIX_AIV1", "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, ], } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py index d952c905f..1b3e5ff8a 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py @@ -57,9 +57,24 @@ class 
TestSpmdStarvation(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, ], } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py index 1aa0758d9..f8bf33830 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py @@ -32,9 +32,24 @@ class TestSpmdSyncStart(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": 
"../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, ], } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py index 3f9b0272b..e35b004a1 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py @@ -31,7 +31,12 @@ class TestSpmdSyncStartAiv(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"}, + { + "func_id": 0, + "name": "SPMD_WRITE_AIV", + "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", + "core_type": "aiv", + }, ], } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py index 550ac3211..5ebfb87b0 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py @@ -32,9 +32,24 @@ class TestSpmdSyncStartEdge(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, + { + 
"func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, ], } diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py index a230c8264..c8e46a624 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py @@ -57,10 +57,30 @@ class TestSpmdSyncStartStress(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 3, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"}, + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, + { + "func_id": 3, + "name": "SPMD_WRITE_AIV", + "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", + "core_type": "aiv", + }, ], } diff --git a/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py index 143092ce5..54d7afc39 100644 --- a/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py +++ 
b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py @@ -37,24 +37,28 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase): "incores": [ { "func_id": 0, + "name": "QK", "source": "kernels/aic/aic_qk_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 2, + "name": "PV", "source": "kernels/aic/aic_pv_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "SF", "source": "kernels/aiv/aiv_softmax_prepare.cpp", "core_type": "aiv", "signature": [D.IN, D.OUT, D.OUT, D.OUT], }, { "func_id": 3, + "name": "UP", "source": "kernels/aiv/aiv_online_update.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], diff --git a/tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py index 37a8a92ed..be7c792ee 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py +++ b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py @@ -57,30 +57,35 @@ class TestMixedExample(SceneTestCase): "incores": [ { "func_id": 0, + "name": "MATMUL", "source": "kernels/aic/kernel_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "ADD", "source": "kernels/aiv/kernel_add.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 2, + "name": "MUL", "source": "kernels/aiv/kernel_mul.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 3, + "name": "ADD_STANDALONE", "source": "kernels/aiv/kernel_add_standalone.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 4, + "name": "MUL_STANDALONE", "source": "kernels/aiv/kernel_mul_standalone.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.OUT], diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py 
b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py index f79a98c0d..5421f9245 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py +++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py @@ -36,24 +36,28 @@ class TestPagedAttentionUnroll(SceneTestCase): "incores": [ { "func_id": 0, + "name": "QK", "source": "kernels/aic/aic_qk_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 2, + "name": "PV", "source": "kernels/aic/aic_pv_matmul.cpp", "core_type": "aic", "signature": [D.IN, D.IN, D.OUT], }, { "func_id": 1, + "name": "SF", "source": "kernels/aiv/aiv_softmax_prepare.cpp", "core_type": "aiv", "signature": [D.IN, D.OUT, D.OUT, D.OUT], }, { "func_id": 3, + "name": "UP", "source": "kernels/aiv/aiv_online_update.cpp", "core_type": "aiv", "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py index 55d4cbfb7..e62ecee01 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py +++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py @@ -38,9 +38,9 @@ class TestSpmdBasic(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "kernels/aic/kernel_spmd_read.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"}, + {"func_id": 0, "name": "SPMD_READ_AIC", "source": "kernels/aic/kernel_spmd_read.cpp", "core_type": "aic"}, + {"func_id": 1, "name": "SPMD_READ_AIV0", "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"}, + {"func_id": 2, "name": "SPMD_READ_AIV1", "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"}, ], } diff --git 
a/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py index 58becb0b8..254a37d55 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py +++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py @@ -48,7 +48,7 @@ class TestSpmdMultiblockAiv(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"}, + {"func_id": 0, "name": "SPMD_WRITE_AIV", "source": "kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"}, ], } diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py index 1bac22c74..0ef57c2c6 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py +++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py @@ -50,9 +50,9 @@ class TestSpmdMultiblockMix(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + {"func_id": 0, "name": "SPMD_MIX_AIC", "source": "kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, + {"func_id": 1, "name": "SPMD_MIX_AIV0", "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + {"func_id": 2, "name": "SPMD_MIX_AIV1", "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, ], } diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py index 425ccdab0..06d8a541c 100644 --- 
a/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py +++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py @@ -69,9 +69,24 @@ class TestSpmdStarvation(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, ], } diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py index 18320397e..d4592cef7 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py +++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py @@ -48,9 +48,24 @@ class TestSpmdSyncStart(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + 
"source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, ], } diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py index 8a434caa5..7d0c2b314 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py +++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py @@ -49,7 +49,12 @@ class TestSpmdSyncStartAiv(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"}, + { + "func_id": 0, + "name": "SPMD_WRITE_AIV", + "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", + "core_type": "aiv", + }, ], } diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py index 11a728a02..35497a419 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py +++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py @@ -53,9 +53,24 @@ class TestSpmdSyncStartEdge(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", + "core_type": "aic", + }, + { + "func_id": 1, 
+ "name": "SPMD_MIX_AIV0", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, ], } diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py index a87eb7209..4c7c9c789 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py +++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py @@ -75,10 +75,30 @@ class TestSpmdSyncStartStress(SceneTestCase): "signature": [D.INOUT], }, "incores": [ - {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"}, - {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"}, - {"func_id": 3, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"}, + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", + "core_type": "aiv", + }, + { + "func_id": 3, + "name": "SPMD_WRITE_AIV", + "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", + "core_type": "aiv", + }, ], } From 949fcfe476192e1066bebb7e4d82542e8581b13f Mon Sep 17 00:00:00 2001 From: majin0824 Date: Fri, 17 Apr 2026 11:02:48 +0800 Subject: [PATCH 4/5] Refactor: merge bgemm into 
benchmark_bgemm and fix a5 paged_attention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Delete examples/a2a3/bgemm (fixed-config), move benchmark_bgemm from tests/st to examples/a2a3 with a Bgemm64 case covering the old example config (tile=64, grid_k=4, block_dim=3) - Add platform guards for aarch64 timer asm in a5 paged_attention orchestration files (mrs cntvct_el0 → rdtsc on x86_64) --- .github/workflows/ci.yml | 4 +- .../kernels/aic/kernel_gemm_tile.cpp | 0 .../kernels/aiv/kernel_tile_add.cpp | 0 .../kernels/orchestration/bgemm_orch.cpp | 0 .../benchmark_bgemm/test_benchmark_bgemm.py | 6 + .../bgemm/kernels/aic/kernel_gemm_tile.cpp | 121 ----------------- .../bgemm/kernels/aiv/kernel_tile_add.cpp | 74 ----------- .../kernels/orchestration/bgemm_orch.cpp | 124 ------------------ .../bgemm/test_bgemm.py | 82 ------------ .../kernels/aiv/aiv_softmax_prepare.cpp | 1 - .../orchestration/paged_attention_orch.cpp | 10 +- .../orchestration/paged_attention_orch.cpp | 10 +- 12 files changed, 26 insertions(+), 406 deletions(-) rename {tests/st => examples}/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp (100%) rename {tests/st => examples}/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp (100%) rename {tests/st => examples}/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp (100%) rename {tests/st => examples}/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py (94%) delete mode 100644 examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp delete mode 100644 examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp delete mode 100644 examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp delete mode 100644 examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 
1a2d03e36..2869c3f93 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -267,7 +267,7 @@ jobs: pip install '.[test]' - name: Run simulation examples (a5sim) - run: python ci.py -p a5sim -c d96c8784 -t 600 --clone-protocol https + run: python ci.py -p a5sim -c 3cf259e8 -t 600 --clone-protocol https - name: Run pytest scene tests (a5sim) run: | @@ -277,7 +277,7 @@ jobs: if [ $rc -eq 124 ]; then echo "pytest timed out; retrying with pinned PTO-ISA commit" pytest examples tests/st --platform a5sim --device 0-15 -v \ - --pto-session-timeout 600 --pto-isa-commit d96c8784 --clone-protocol https + --pto-session-timeout 600 --pto-isa-commit 3cf259e8 --clone-protocol https rc=$? fi exit $rc diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp similarity index 100% rename from tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp rename to examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp similarity index 100% rename from tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp rename to examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp similarity index 100% rename from tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp diff --git 
a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py similarity index 94% rename from tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py rename to examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py index 05ea2d7a2..a3b888f75 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py +++ b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py @@ -79,6 +79,12 @@ class TestBenchmarkBgemm(SceneTestCase): "config": {"aicpu_thread_num": 4, "block_dim": 24}, "params": {"matmul_add_task_num": 64, "incore_data_size": 128, "incore_loop": 4, "grid_k": 4}, }, + { + "name": "Bgemm64", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 3}, + "params": {"matmul_add_task_num": 32, "incore_data_size": 64, "incore_loop": 1, "grid_k": 4}, + }, ] def generate_args(self, params): diff --git a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp deleted file mode 100644 index 56077fc90..000000000 --- a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
- * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Tile-based Matrix Multiplication Kernel (Cube Core) - * - * Computes: output = input_a @ input_b (64x64 tile matmul) - * Uses TMATMUL instruction - * - * Args (Tensor*): - * args[0] = input_a (INPUT) - * args[1] = input_b (INPUT) - * args[2] = output (OUTPUT) - */ - -#include -#include -#include -#include - -#include "tensor.h" - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -AICORE constexpr inline T CeilAlign(T num_1, T num_2) { - if (num_2 == 0) { - return 0; - } - return (num_1 + num_2 - 1) / num_2 * num_2; -} - -static __aicore__ void -gemm_tile_impl(__gm__ Tensor *input_a_tensor, __gm__ Tensor *input_b_tensor, __gm__ Tensor *output_tensor) { - __gm__ float *input_a = - reinterpret_cast<__gm__ float *>(input_a_tensor->buffer.addr) + input_a_tensor->start_offset; - __gm__ float *input_b = - reinterpret_cast<__gm__ float *>(input_b_tensor->buffer.addr) + input_b_tensor->start_offset; - __gm__ float *output = reinterpret_cast<__gm__ float *>(output_tensor->buffer.addr) + output_tensor->start_offset; - - constexpr int TILE = 64; - constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float); - constexpr int M = CeilAlign(TILE, 16); - constexpr int K = CeilAlign(TILE, blockAlign); - constexpr int N = CeilAlign(TILE, blockAlign); - - using GlobalDataA = - GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; - using GlobalDataB = - GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; - using GlobalDataC = - GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; - - GlobalDataA src0Global(input_a); - GlobalDataB src1Global(input_b); - GlobalDataC dstGlobal(output); - - using TileMatA = Tile; - using 
TileMatB = Tile; - - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - TLOAD(aMatTile, src0Global); - TLOAD(bMatTile, src1Global); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(dstGlobal, cTile); - - set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); - wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *output = reinterpret_cast<__gm__ Tensor *>(args[2]); - - gemm_tile_impl(input_a, input_b, output); -} diff --git a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp deleted file mode 100644 index 2dce84dcd..000000000 --- a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. 
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Tile-based Element-wise Addition Kernel (Vector Core) - INOUT Pattern - * - * Computes: C_tile = C_tile + P (64x64 tile accumulation) - * Uses TADD instruction - * - * Args (Tensor*): - * args[0] = C_tile (INOUT: read + write accumulator) - * args[1] = P (INPUT: matmul result to accumulate) - */ - -#include -#include -#include - -#include "tensor.h" - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *c_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *p_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); - - __gm__ float *c_ptr = reinterpret_cast<__gm__ float *>(c_tensor->buffer.addr) + c_tensor->start_offset; - __gm__ float *p_ptr = reinterpret_cast<__gm__ float *>(p_tensor->buffer.addr) + p_tensor->start_offset; - - constexpr int TILE = 64; - - using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>; - using DynStridDim5 = Stride<1, 1, 1, TILE, 1>; - using GlobalData = GlobalTensor; - using TileData = Tile; - - TileData cTile(TILE, TILE); - TileData pTile(TILE, TILE); - TileData outTile(TILE, TILE); - TASSIGN(cTile, 0x0); - TASSIGN(pTile, 0x10000); - TASSIGN(outTile, 0x20000); - - GlobalData cGlobal(c_ptr); - GlobalData pGlobal(p_ptr); - GlobalData outGlobal(c_ptr); // write back to same C location - - TLOAD(cTile, cGlobal); - TLOAD(pTile, pGlobal); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - TADD(outTile, cTile, pTile); - 
set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(outGlobal, outTile); - - set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); - wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); -} diff --git a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp deleted file mode 100644 index 452e472fe..000000000 --- a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ -/** - * BGEMM Orchestration Function (tensormap_and_ringbuffer Runtime) - * - * Builds the task graph for tiled matrix multiplication: C = A @ B - * - * Configuration: - * - Tile size: 64 x 64 - * - Grid: 4 x 4 x 4 (GRID_M x GRID_K x GRID_N) - * - Batch: 2 - * - * Memory layout (tile-first, 5D flattened): - * A: [BATCH, GRID_M, GRID_K, TILE, TILE] - * B: [BATCH, GRID_K, GRID_N, TILE, TILE] - * C: [BATCH, GRID_M, GRID_N, TILE, TILE] - * - * Task graph per output tile C[batch, m, n]: - * for k in [0, GRID_K): - * P = A[m,k] @ B[k,n] (gemm_tile on Cube core, func_id=0) - * C[m,n] = C[m,n] + P (tile_add on Vector core, func_id=1) - * - * Dependencies are automatic via TensorMap overlap detection. - * - * Arg layout: [A, B, C] — shape/dtype/size in tensor metadata - */ - -#include -#include - -#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) - -#define FUNC_GEMM_TILE 0 -#define FUNC_TILE_ADD 1 - -// Grid and tile constants -static constexpr int TILE = 64; -static constexpr int GRID_M = 4; -static constexpr int GRID_K = 4; -static constexpr int GRID_N = 4; -static constexpr int BATCH = 2; - -static constexpr uint32_t TILE_ELEMS = TILE * TILE; // 4096 elements - -extern "C" { - -__attribute__((visibility("default"))) PTO2OrchestrationConfig -aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { - (void)orch_args; // NOLINT(readability/casting) - return PTO2OrchestrationConfig{ - .expected_arg_count = 3, - }; -} - -__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) { - // 1D external tensors for the full A, B, C arrays - Tensor ext_A = from_tensor_arg(orch_args.tensor(0)); - Tensor ext_B = from_tensor_arg(orch_args.tensor(1)); - Tensor ext_C = from_tensor_arg(orch_args.tensor(2)); - - LOG_INFO("[bgemm_orch] Grid: %dx%dx%d, Batch: %d, Tile: %d", GRID_M, GRID_K, GRID_N, 
BATCH, TILE); - - uint32_t tile_shapes[1] = {TILE_ELEMS}; - TensorCreateInfo tile_ci(tile_shapes, 1, DataType::FLOAT32); - - for (int batch = 0; batch < BATCH; batch++) { - for (int m_idx = 0; m_idx < GRID_M; m_idx++) { - for (int n_idx = 0; n_idx < GRID_N; n_idx++) { - PTO2_SCOPE() { - uint32_t c_elem_offset = (static_cast(batch) * GRID_M * GRID_N + - static_cast(m_idx) * GRID_N + static_cast(n_idx)) * - TILE_ELEMS; - uint32_t c_view_offsets[1] = {c_elem_offset}; - Tensor C_view = ext_C.view(tile_shapes, c_view_offsets); - - for (int k_idx = 0; k_idx < GRID_K; k_idx++) { - uint32_t a_elem_offset = - (static_cast(batch) * GRID_M * GRID_K + static_cast(m_idx) * GRID_K + - static_cast(k_idx)) * - TILE_ELEMS; - uint32_t b_elem_offset = - (static_cast(batch) * GRID_K * GRID_N + static_cast(k_idx) * GRID_N + - static_cast(n_idx)) * - TILE_ELEMS; - - uint32_t a_view_offsets[1] = {a_elem_offset}; - Tensor A_view = ext_A.view(tile_shapes, a_view_offsets); - uint32_t b_view_offsets[1] = {b_elem_offset}; - Tensor B_view = ext_B.view(tile_shapes, b_view_offsets); - // P = A[m,k] @ B[k,n] - Arg params_gemm; - params_gemm.add_input(A_view); - params_gemm.add_input(B_view); - params_gemm.add_output(tile_ci); - TaskOutputTensors gemm_outs = pto2_rt_submit_aic_task(FUNC_GEMM_TILE, - params_gemm); // gemm - - // C[m,n] += P - Arg params_add; - params_add.add_inout(C_view); - params_add.add_input(gemm_outs.get_ref(0)); - pto2_rt_submit_aiv_task(FUNC_TILE_ADD, - params_add); // add - } - } - } - } - } - - LOG_INFO( - "[bgemm_orch] Submitted tasks for %d batches, %dx%d output tiles, %d K steps each", BATCH, GRID_M, GRID_N, - GRID_K - ); -} - -} // extern "C" diff --git a/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py b/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py deleted file mode 100644 index 276f71175..000000000 --- a/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO 
Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""BGEMM: batched tiled matrix multiplication C = A @ B. - -Fixed 4x4x4 grid with 64x64 tiles, 2 batches. -""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test - -TILE_M, TILE_K, TILE_N = 64, 64, 64 -GRID_M, GRID_K, GRID_N = 4, 4, 4 -BATCH = 2 - - -@scene_test(level=2, runtime="tensormap_and_ringbuffer") -class TestBgemm(SceneTestCase): - RTOL = 1e-3 - ATOL = 1e-3 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/bgemm_orch.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "name": "GEMM", - "source": "kernels/aic/kernel_gemm_tile.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "name": "ADD", - "source": "kernels/aiv/kernel_tile_add.cpp", - "core_type": "aiv", - "signature": [D.INOUT, D.IN], - }, - ], - } - - CASES = [ - { - "name": "default", - "platforms": ["a2a3sim", "a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 3}, - "params": {}, - } - ] - - def generate_args(self, params): - A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32) * 0.01 - B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, 
TILE_N, dtype=torch.float32) * 0.01 - C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32) - return TaskArgsBuilder(Tensor("A", A.flatten()), Tensor("B", B.flatten()), Tensor("C", C.flatten())) - - def compute_golden(self, args, params): - A = args.A.reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) - B = args.B.reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) - C = args.C.reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) - C[:] = 0.0 - for batch in range(BATCH): - for m in range(GRID_M): - for n in range(GRID_N): - for k in range(GRID_K): - C[batch, m, n] += torch.matmul(A[batch, m, k], B[batch, k, n]) - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index 4bb21f68b..8fa605e68 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -120,7 +120,6 @@ static __aicore__ void softmax_prepare_impl( TSTORE(mijGlobal, maxTile); wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); TSTORE(lijGlobal, sumTile); - TSTORE(pijGlobal, pijF16Tile); set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index b3314019a..c59abecf2 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -39,7 +39,15 @@ inline double cycles_to_us(uint64_t cycles) { inline uint64_t get_sys_cnt_aicpu() { uint64_t ticks; +#if defined(__aarch64__) asm 
volatile("mrs %0, cntvct_el0" : "=r"(ticks)); +#elif defined(__x86_64__) + unsigned int lo, hi; + asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); + ticks = (static_cast(hi) << 32) | lo; +#else + ticks = 0; +#endif return ticks; } @@ -87,7 +95,7 @@ __attribute__((visibility("default"))) void build_paged_attention_graph(const Ch uint64_t scale_value = orch_args.scalar(0); uint64_t q_head_num = num_heads; - uint64_t q_tile = std::min(num_heads, 128UL); + uint64_t q_tile = std::min(num_heads, static_cast(128)); uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; CYCLE_COUNT_LAP(prof_param_extract); diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp index 1460a588d..fba81681a 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp @@ -43,7 +43,15 @@ inline double cycles_to_us(uint64_t cycles) { inline uint64_t get_sys_cnt_aicpu() { uint64_t ticks; +#if defined(__aarch64__) asm volatile("mrs %0, cntvct_el0" : "=r"(ticks)); +#elif defined(__x86_64__) + unsigned int lo, hi; + asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); + ticks = (static_cast(hi) << 32) | lo; +#else + ticks = 0; +#endif return ticks; } @@ -105,7 +113,7 @@ __attribute__((visibility("default"))) void build_paged_attention_graph(const Ch // scale from scalar arg uint64_t scale_value = orch_args.scalar(0); uint64_t q_head_num = num_heads; - uint64_t q_tile = std::min(num_heads, 128UL); + uint64_t q_tile = std::min(num_heads, static_cast(128)); uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; CYCLE_COUNT_LAP(prof_param_extract); From 76d9ed152fec132939d7a48a7caafb30dbac7117 Mon Sep 17 00:00:00 2001 From: majin0824 Date: Fri, 17 Apr 2026 18:15:59 +0800 Subject: 
[PATCH 5/5] Refactor: remove legacy run_example.py and ci.py runners All golden.py-based tests have been migrated to pytest @scene_test format, making both legacy runners dead code. - Delete examples/scripts/run_example.py and ci.py - Remove ci.py steps from CI workflow; add --clone-protocol https to all first-attempt pytest calls so PTO-ISA clones via HTTPS in CI - Update conftest.py to pre-clone PTO-ISA when --clone-protocol is non-default, replacing ci.py's pre-clone responsibility - Update ci.sh run_task() to use test_*.py instead of run_example.py - Remove run_example.py fallback from tools/benchmark_rounds.sh - Remove ci.py and run_example.py smoke tests from verify_packaging.sh --- .github/workflows/ci.yml | 25 +- ci.py | 1326 ------------------------------- ci.sh | 13 +- conftest.py | 8 +- examples/scripts/run_example.py | 316 -------- tools/benchmark_rounds.sh | 13 +- tools/verify_packaging.sh | 6 - 7 files changed, 23 insertions(+), 1684 deletions(-) delete mode 100644 ci.py delete mode 100644 examples/scripts/run_example.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2869c3f93..6eadda84c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -207,13 +207,10 @@ jobs: pip install torch --index-url https://download.pytorch.org/whl/cpu pip install '.[test]' - - name: Run simulation examples (a2a3sim) - run: python ci.py -p a2a3sim -c d96c8784 -t 600 --clone-protocol https - - name: Run pytest scene tests (a2a3sim) run: | set +e - pytest examples tests/st --platform a2a3sim --device 0-15 -v --pto-session-timeout 600 + pytest examples tests/st --platform a2a3sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https rc=$? 
if [ $rc -eq 124 ]; then echo "pytest timed out; retrying with pinned PTO-ISA commit" @@ -266,13 +263,10 @@ jobs: pip install torch --index-url https://download.pytorch.org/whl/cpu pip install '.[test]' - - name: Run simulation examples (a5sim) - run: python ci.py -p a5sim -c 3cf259e8 -t 600 --clone-protocol https - - name: Run pytest scene tests (a5sim) run: | set +e - pytest examples tests/st --platform a5sim --device 0-15 -v --pto-session-timeout 600 + pytest examples tests/st --platform a5sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https rc=$? if [ $rc -eq 124 ]; then echo "pytest timed out; retrying with pinned PTO-ISA commit" @@ -319,17 +313,12 @@ jobs: pip install --upgrade pip pip install '.[test]' - - name: Run on-device examples (a2a3) - run: | - source .venv/bin/activate - source ${ASCEND_HOME_PATH}/bin/setenv.bash && python ci.py -p a2a3 -d ${DEVICE_RANGE} -c d96c8784 -t 600 --clone-protocol https - - name: Run pytest scene tests (a2a3) run: | set +e source .venv/bin/activate source ${ASCEND_HOME_PATH}/bin/setenv.bash - python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v --pto-session-timeout 600 + python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v --pto-session-timeout 600 --clone-protocol https rc=$? 
if [ $rc -eq 124 ]; then echo "pytest timed out; retrying with pinned PTO-ISA commit" @@ -407,15 +396,9 @@ jobs: source ${ASCEND_HOME_PATH}/bin/setenv.bash pip install '.[test]' - - name: Run on-device examples (a5) - run: | - source ${ASCEND_HOME_PATH}/bin/setenv.bash - DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))") - task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "python ci.py -p a5 -d ${DEVICE_RANGE} -c d96c8784 -t 1200 --clone-protocol https" - - name: Run pytest scene tests (a5) run: | source ${ASCEND_HOME_PATH}/bin/setenv.bash DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))") - PYTEST="python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v" + PYTEST="python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v --clone-protocol https" task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "set +e; $PYTEST --pto-session-timeout 1200; rc=\$?; if [ \$rc -eq 124 ]; then echo 'pytest timed out; retrying with pinned PTO-ISA commit'; $PYTEST --pto-session-timeout 1200 --pto-isa-commit d96c8784 --clone-protocol https; rc=\$?; fi; exit \$rc" diff --git a/ci.py b/ci.py deleted file mode 100644 index 931920275..000000000 --- a/ci.py +++ /dev/null @@ -1,1326 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Batch CI test runner using ChipWorker for efficient device reuse. - -Replaces ci.sh by running all test tasks (sim + HW) in a single Python process -per device, reusing ChipWorker across tasks that share the same runtime. - -Usage: - python ci.py # all sim platforms - python ci.py -p a2a3sim -r tensormap_and_ringbuffer -c 6622890 # single platform - python ci.py -p a2a3 -d 5-8 -c 6622890 -t 600 # hardware with devices -""" - -from __future__ import annotations - -import os -import sys - -# --------------------------------------------------------------------------- -# macOS libomp collision workaround — MUST run before any import that may -# transitively load numpy or torch. See docs/macos-libomp-collision.md for -# the full analysis. -# -# On macOS with a --system-site-packages venv, homebrew's numpy pulls in -# /opt/homebrew/opt/libomp/lib/libomp.dylib (via openblas), while pip's -# torch ships its own .venv/.../torch/lib/libomp.dylib under a different -# install name (/opt/llvm-openmp/lib/libomp.dylib). Because the two -# dylibs have distinct install names, dyld loads them both, and Intel's -# libomp aborts the process with "OMP: Error #15 ... libomp already -# initialized" (SIGABRT). -# -# The officially-documented escape hatch is KMP_DUPLICATE_LIB_OK=TRUE. -# For our CI workload (numpy random + torch golden compute, no heavy -# parallel OMP regions) the two runtimes never actually race, so allowing -# the duplicate load is safe in practice. 
-# --------------------------------------------------------------------------- -if sys.platform == "darwin": - os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") - -import argparse -import importlib.util -import json -import logging -import signal -import subprocess -import tempfile -import time -from concurrent.futures import ThreadPoolExecutor -from dataclasses import asdict, dataclass, field -from pathlib import Path -from queue import Empty, Queue -from threading import Lock, Thread -from typing import Any, Callable, Protocol, cast - -from simpler.task_interface import ( # type: ignore[import-not-found] - ChipCallable, # pyright: ignore[reportAttributeAccessIssue] - ChipCallConfig, # pyright: ignore[reportAttributeAccessIssue] - ChipStorageTaskArgs, # pyright: ignore[reportAttributeAccessIssue] - ChipWorker, # pyright: ignore[reportAttributeAccessIssue] - CoreCallable, # pyright: ignore[reportAttributeAccessIssue] - make_tensor_arg, - scalar_to_uint64, -) - -from simpler_setup.log_config import DEFAULT_LOG_LEVEL, LOG_LEVEL_CHOICES, configure_logging - -PROJECT_ROOT = Path(__file__).resolve().parent - -logger = logging.getLogger("ci") - -# --------------------------------------------------------------------------- -# Data classes -# --------------------------------------------------------------------------- - -EXAMPLES_DIR = PROJECT_ROOT / "examples" -DEVICE_TESTS_DIR = PROJECT_ROOT / "tests" / "st" -MAX_RETRIES = 3 - - -@dataclass -class TaskSpec: - name: str - task_dir: Path - kernels_dir: Path - golden_path: Path - platform: str - runtime_name: str - - -class BinaryArtifactPathLike(Protocol): - def read_bytes(self) -> bytes: ... - - def __str__(self) -> str: ... - - -class RuntimeBinariesLike(Protocol): - host_path: BinaryArtifactPathLike - aicpu_path: BinaryArtifactPathLike - aicore_path: BinaryArtifactPathLike - sim_context_path: Any - - -class GoldenModuleLike(Protocol): - def generate_inputs(self, params: dict[str, Any]) -> object: ... 
- - def compute_golden(self, tensors: dict[str, Any], params: dict[str, Any]) -> None: ... - - -@dataclass -class CompiledTask: - spec: TaskSpec - chip_callable: Any # ChipCallable - cases: list[dict[str, Any]] - runtime_bins: Any - golden_module: Any - kernel_config: Any - rtol: float = 1e-5 - atol: float = 1e-5 - output_names: list[str] = field(default_factory=list) - - -@dataclass -class TaskResult: - name: str - platform: str - passed: bool - device: str - attempt: int - elapsed_s: float - error: str | None = None - - -# --------------------------------------------------------------------------- -# Module loading helpers (from code_runner.py) -# --------------------------------------------------------------------------- - - -def _load_module(path: Path, name: str): - spec = importlib.util.spec_from_file_location(name, path) - if spec is None or spec.loader is None: - raise ImportError(f"Cannot load module from {path}") - mod = importlib.util.module_from_spec(spec) - sys.modules[name] = mod - spec.loader.exec_module(mod) - return mod - - -def _write_results_json(results: list[TaskResult], output_path: str | None) -> None: - if output_path is None: - return - Path(output_path).write_text(json.dumps([asdict(result) for result in results], indent=2) + "\n") - - -def _read_results_json(result_path: Path) -> list[TaskResult]: - if not result_path.is_file(): - return [] - raw = result_path.read_text().strip() - if not raw: - return [] - try: - payload = json.loads(raw) - except json.JSONDecodeError: - logger.warning("Ignoring invalid result JSON from %s", result_path) - return [] - return [TaskResult(**item) for item in payload] - - -def _write_task_list_json(tasks: list[TaskSpec], output_path: str | None) -> None: - if output_path is None: - return - Path(output_path).write_text(json.dumps([task.name for task in tasks], indent=2) + "\n") - - -def _read_task_list_json(task_list_path: str | None) -> set[str] | None: - if task_list_path is None: - return None - path = 
Path(task_list_path) - if not path.is_file(): - return None - return set(json.loads(path.read_text())) - - -# --------------------------------------------------------------------------- -# Task discovery -# --------------------------------------------------------------------------- - - -def _discover_runtimes_for_platform(platform: str) -> list[str]: - from simpler_setup.platform_info import discover_runtimes, parse_platform # noqa: PLC0415 - - arch, _ = parse_platform(platform) - return discover_runtimes(arch) - - -def discover_tasks(platform: str, runtime_filter: str | None = None) -> list[TaskSpec]: - """Scan examples/ and tests/st/ for test directories matching the given platform.""" - from simpler_setup.platform_info import parse_platform # noqa: PLC0415 - - arch, variant = parse_platform(platform) - is_sim = variant == "sim" - supported_runtimes = set(_discover_runtimes_for_platform(platform)) - - if runtime_filter: - if runtime_filter not in supported_runtimes: - raise ValueError( - f"Runtime '{runtime_filter}' not available for '{platform}'. 
Available: {sorted(supported_runtimes)}" - ) - supported_runtimes = {runtime_filter} - - tasks: list[TaskSpec] = [] - - search_dirs = [EXAMPLES_DIR] - if not is_sim: - search_dirs.append(DEVICE_TESTS_DIR) - - for base_dir in search_dirs: - if not base_dir.is_dir(): - continue - arch_dir = base_dir / arch - if not arch_dir.is_dir(): - continue - for runtime_dir in sorted(arch_dir.iterdir()): - if not runtime_dir.is_dir(): - continue - rt_name = runtime_dir.name - if rt_name not in supported_runtimes: - continue - for example_dir in sorted(runtime_dir.iterdir()): - if not example_dir.is_dir(): - continue - kernels_dir = example_dir / "kernels" - golden_path = example_dir / "golden.py" - kernel_config_path = kernels_dir / "kernel_config.py" - if not (kernel_config_path.is_file() and golden_path.is_file()): - continue - - rel = example_dir.relative_to(base_dir) - prefix = "device_test" if base_dir == DEVICE_TESTS_DIR else "example" - name = f"{prefix}:{rel}" - - tasks.append( - TaskSpec( - name=name, - task_dir=example_dir, - kernels_dir=kernels_dir, - golden_path=golden_path, - platform=platform, - runtime_name=rt_name, - ) - ) - - return tasks - - -# --------------------------------------------------------------------------- -# PTO-ISA management (reuses code_runner logic) -# --------------------------------------------------------------------------- - - -def ensure_pto_isa(commit: str | None, clone_protocol: str) -> str: - from simpler_setup.pto_isa import ensure_pto_isa_root # noqa: PLC0415 - - # update_if_exists=True: when no commit is pinned, fetch latest origin/HEAD - # so CI runs reproducibly track main rather than whatever local checkout - # happens to be on disk. 
- return ensure_pto_isa_root( - commit=commit, - clone_protocol=clone_protocol, - update_if_exists=True, - verbose=True, - ) - - -# --------------------------------------------------------------------------- -# Compilation -# --------------------------------------------------------------------------- - - -def compile_task( - spec: TaskSpec, - pto_isa_root: str, - build_runtime: bool = False, - run_all_cases: bool = False, -) -> CompiledTask: - """Compile orchestration + kernels for a single task, return CompiledTask.""" - from simpler_setup.elf_parser import extract_text_section # noqa: PLC0415 - from simpler_setup.kernel_compiler import KernelCompiler # noqa: PLC0415 - from simpler_setup.runtime_builder import RuntimeBuilder # noqa: PLC0415 - - # Load kernel_config and golden - kc = _load_module(spec.kernels_dir / "kernel_config.py", f"kc_{id(spec)}") - golden = _load_module(spec.golden_path, f"golden_{id(spec)}") - - kernels = kc.KERNELS - orchestration = kc.ORCHESTRATION - - builder = RuntimeBuilder(platform=spec.platform) - compiler = KernelCompiler(platform=spec.platform) - - # Resolve runtime include dirs - from simpler_setup.platform_info import parse_platform # noqa: PLC0415 - - arch, _ = parse_platform(spec.platform) - runtime_base = PROJECT_ROOT / "src" / arch / "runtime" / spec.runtime_name - build_config_path = runtime_base / "build_config.py" - runtime_include_dirs = [] - if build_config_path.is_file(): - bc = _load_module(build_config_path, f"bc_{id(spec)}") - aicore_cfg = bc.BUILD_CONFIG.get("aicore", {}) - for p in aicore_cfg.get("include_dirs", []): - runtime_include_dirs.append(str((runtime_base / p).resolve())) - else: - runtime_include_dirs.append(str(runtime_base / "runtime")) - runtime_include_dirs.append(str(PROJECT_ROOT / "src" / "common" / "task_interface")) - - is_sim = spec.platform.endswith("sim") - - # Compile runtime + orch + kernels in parallel - def _build_runtime(): - return builder.get_binaries(spec.runtime_name, 
build=build_runtime) - - def _compile_orch(): - return compiler.compile_orchestration(spec.runtime_name, orchestration["source"]) - - def _compile_kernel(kernel): - incore_o = compiler.compile_incore( - kernel["source"], - core_type=kernel["core_type"], - pto_isa_root=pto_isa_root, - extra_include_dirs=runtime_include_dirs, - ) - kernel_bin = incore_o if is_sim else extract_text_section(incore_o) - sig = kernel.get("signature", []) - return (kernel["func_id"], CoreCallable.build(signature=sig, binary=kernel_bin)) - - max_w = 2 + len(kernels) - with ThreadPoolExecutor(max_workers=max_w) as pool: - fut_rt = pool.submit(_build_runtime) - fut_orch = pool.submit(_compile_orch) - fut_kernels = [pool.submit(_compile_kernel, k) for k in kernels] - - runtime_bins = fut_rt.result() - orch_binary = fut_orch.result() - kernel_binaries = [f.result() for f in fut_kernels] - - orch_sig = orchestration.get("signature", []) - callable_obj = ChipCallable.build( - signature=orch_sig, - func_name=orchestration["function_name"], - binary=orch_binary, - children=kernel_binaries, - config_name=orchestration.get("config_name", ""), - ) - - all_cases = getattr(golden, "ALL_CASES", {"Default": {}}) - if run_all_cases: - cases = [{"name": name, **params} for name, params in all_cases.items()] - else: - default_case = getattr(golden, "DEFAULT_CASE", "Default") - cases = [{"name": default_case, **all_cases[default_case]}] - - return CompiledTask( - spec=spec, - chip_callable=callable_obj, - cases=cases, - runtime_bins=runtime_bins, - golden_module=golden, - kernel_config=kc, - rtol=getattr(golden, "RTOL", 1e-5), - atol=getattr(golden, "ATOL", 1e-5), - output_names=getattr(golden, "__outputs__", []), - ) - - -def compile_all_tasks( - tasks: list[TaskSpec], - pto_isa_root: str, - build_runtime: bool = False, - run_all_cases: bool = False, - max_workers: int = 4, -) -> list[CompiledTask]: - """Compile all tasks in parallel. 
Returns list in same order as input.""" - compiled: list[CompiledTask | None] = [None] * len(tasks) - errors: list[tuple[int, Exception]] = [] - lock = Lock() - - def _do(idx: int): - try: - result = compile_task(tasks[idx], pto_isa_root, build_runtime, run_all_cases) - with lock: - compiled[idx] = result - except Exception as e: - with lock: - errors.append((idx, e)) - - with ThreadPoolExecutor(max_workers=max_workers) as pool: - list(pool.map(_do, range(len(tasks)))) - - if errors: - for idx, e in errors: - logger.error(f"Failed to compile {tasks[idx].name}: {e}") - raise RuntimeError(f"{len(errors)} task(s) failed to compile") - - return cast(list[CompiledTask], compiled) - - -# --------------------------------------------------------------------------- -# Single task execution -# --------------------------------------------------------------------------- - - -def run_single_task( - task: CompiledTask, - worker, - device_id: int, -) -> bool: - """Run all cases in a compiled task on a given worker. 
Returns True if all pass.""" - import ctypes # noqa: PLC0415 - - import torch # noqa: PLC0415 - - from simpler_setup.code_runner import _kernel_config_runtime_env, _temporary_env # noqa: PLC0415 - - golden_mod = cast(GoldenModuleLike, task.golden_module) - kc = task.kernel_config - runtime_config = getattr(kc, "RUNTIME_CONFIG", {}) - - run_env = _kernel_config_runtime_env(kc, task.spec.kernels_dir) - - for params in task.cases: - result = golden_mod.generate_inputs(params) - - if isinstance(result, list): - # New-style: flat argument list - orch_args = ChipStorageTaskArgs() - args = {} - inputs = {} - outputs = {} - output_set = set(task.output_names) - - for item in result: - name, value = item - if isinstance(value, torch.Tensor): - tensor = value.cpu().contiguous() - args[name] = tensor - orch_args.add_tensor(make_tensor_arg(tensor)) - if name in output_set: - outputs[name] = tensor - else: - inputs[name] = tensor - elif isinstance(value, ctypes._SimpleCData): - orch_args.add_scalar(scalar_to_uint64(value)) - args[name] = value.value - else: - raise TypeError(f"Unsupported arg type for '{name}': {type(value)}") - else: - raise TypeError("Legacy dict-style generate_inputs not supported in ci.py; use list-style") - - # Compute golden - golden_outputs = {k: v.clone() for k, v in outputs.items()} - golden_with_inputs = {**inputs, **golden_outputs} - golden_mod.compute_golden(golden_with_inputs, params) - - # Run on device - config = ChipCallConfig() - config.block_dim = runtime_config.get("block_dim", 24) - config.aicpu_thread_num = runtime_config.get("aicpu_thread_num", 3) - - with _temporary_env(run_env): - worker.run(task.chip_callable, orch_args, config) - - # Compare - for name, actual_tensor in outputs.items(): - actual = actual_tensor.cpu() - expected = golden_outputs[name].cpu() - if not torch.allclose(actual, expected, rtol=task.rtol, atol=task.atol): - close_mask = torch.isclose(actual, expected, rtol=task.rtol, atol=task.atol) - mismatches = 
(~close_mask).sum().item() - total = actual.numel() - raise AssertionError( - f"Output '{name}' mismatch in case '{params.get('name', '?')}': " - f"{mismatches}/{total} elements differ (rtol={task.rtol}, atol={task.atol})" - ) - - return True - - -# --------------------------------------------------------------------------- -# Group tasks by runtime for ChipWorker reuse -# --------------------------------------------------------------------------- - - -def group_by_runtime(tasks: list[CompiledTask]) -> dict[str, list[CompiledTask]]: - groups: dict[str, list[CompiledTask]] = {} - for t in tasks: - groups.setdefault(t.spec.runtime_name, []).append(t) - return groups - - -# --------------------------------------------------------------------------- -# Device worker -# --------------------------------------------------------------------------- - - -def device_worker( - device_id: int, - task_queue: Queue, - results: list, - results_lock: Lock, - quarantined: set, - quarantine_lock: Lock, -): - """Worker thread: pull tasks from queue, run them, handle retries.""" - while True: - try: - item = task_queue.get_nowait() - except Empty: - break - - runtime_name, compiled_tasks, attempt = item - rt_bins = cast(RuntimeBinariesLike, compiled_tasks[0].runtime_bins) - - # Init worker for this runtime group - worker = ChipWorker() - try: - worker.init( - str(rt_bins.host_path), - str(rt_bins.aicpu_path), - str(rt_bins.aicore_path), - sim_context_lib_path=str(rt_bins.sim_context_path) if rt_bins.sim_context_path else "", - ) - worker.set_device(device_id) - except Exception as e: - logger.error(f"[dev{device_id}] Failed to init ChipWorker for {runtime_name}: {e}") - for ct in compiled_tasks: - with results_lock: - results.append( - TaskResult( - name=ct.spec.name, - platform=ct.spec.platform, - passed=False, - device=str(device_id), - attempt=attempt, - elapsed_s=0, - error=str(e), - ) - ) - with quarantine_lock: - quarantined.add(device_id) - task_queue.task_done() - break - - 
failed_tasks = [] - for ct in compiled_tasks: - start = time.monotonic() - logger.info(f"[dev{device_id}] Running: {ct.spec.name} (attempt {attempt})") - try: - run_single_task(ct, worker, device_id) - elapsed = time.monotonic() - start - logger.info(f"[dev{device_id}] PASS: {ct.spec.name} ({elapsed:.1f}s)") - with results_lock: - results.append( - TaskResult( - name=ct.spec.name, - platform=ct.spec.platform, - passed=True, - device=str(device_id), - attempt=attempt, - elapsed_s=elapsed, - ) - ) - except Exception as e: - elapsed = time.monotonic() - start - logger.error(f"[dev{device_id}] FAIL: {ct.spec.name} ({elapsed:.1f}s): {e}") - with results_lock: - results.append( - TaskResult( - name=ct.spec.name, - platform=ct.spec.platform, - passed=False, - device=str(device_id), - attempt=attempt, - elapsed_s=elapsed, - error=str(e), - ) - ) - failed_tasks.append(ct) - - worker.reset_device() - worker.finalize() - - # Re-enqueue failed tasks for retry (individually, not as a group) - if failed_tasks and attempt + 1 < MAX_RETRIES: - for ct in failed_tasks: - task_queue.put((ct.spec.runtime_name, [ct], attempt + 1)) - elif failed_tasks and attempt + 1 >= MAX_RETRIES: - logger.warning(f"[dev{device_id}] Quarantined after exhausting retries") - with quarantine_lock: - quarantined.add(device_id) - task_queue.task_done() - break - - task_queue.task_done() - - -# --------------------------------------------------------------------------- -# Orchestrators: sim and HW -# --------------------------------------------------------------------------- - - -def run_hw_tasks( - compiled: list[CompiledTask], - devices: list[int], -) -> list[TaskResult]: - """Run hardware tasks in-process with ChipWorker reuse per runtime group.""" - groups = group_by_runtime(compiled) - - task_queue: Queue = Queue() - for rt_name, tasks in groups.items(): - task_queue.put((rt_name, tasks, 0)) - - results: list[TaskResult] = [] - results_lock = Lock() - quarantined: set[int] = set() - quarantine_lock = 
Lock() - - threads = [] - for dev_id in devices: - t = Thread( - target=device_worker, - args=(dev_id, task_queue, results, results_lock, quarantined, quarantine_lock), - ) - t.start() - threads.append(t) - - for t in threads: - t.join() - - if quarantined: - logger.warning("[hw] Quarantined devices: %s", sorted(quarantined)) - - return results - - -def _build_device_worker_base_args(args: argparse.Namespace) -> list[str]: - base_args = [ - sys.executable, - str(Path(__file__).resolve()), - "--device-worker", - "-p", - args.platform, - "--clone-protocol", - args.clone_protocol, - ] - if args.runtime: - base_args += ["-r", args.runtime] - if args.build_runtime: - base_args.append("--build-runtime") - if args.run_all_cases: - base_args.append("--all") - return base_args - - -def _run_device_worker_subprocess( - tasks: list[TaskSpec], - device_id: int, - args: argparse.Namespace, - tag: str, - pto_isa_commit: str | None = None, - print_log_on_fail: bool = False, - quiet: bool = True, - timeout: int | None = None, -) -> list[TaskResult]: - """Run a task batch in one device-worker subprocess and return its reported results. - - When *quiet* is False, stdout streams to the terminal in real time - (useful for serial sim runs). When True, output is captured and only - shown on failure if *print_log_on_fail* is set. 
- """ - base_args = _build_device_worker_base_args(args) - if pto_isa_commit: - base_args += ["-c", pto_isa_commit] - - with tempfile.NamedTemporaryFile( - prefix=f"ci_{tag}_tasks_dev{device_id}_", - suffix=".json", - delete=False, - ) as task_file: - task_list_path = Path(task_file.name) - - with tempfile.NamedTemporaryFile( - prefix=f"ci_{tag}_dev{device_id}_", - suffix=".json", - delete=False, - ) as result_file: - result_path = Path(result_file.name) - - _write_task_list_json(tasks, str(task_list_path)) - full_cmd = base_args + [ - "-d", - str(device_id), - "--task-list-json", - str(task_list_path), - "--result-json", - str(result_path), - ] - - logger.info(f"[{tag}:dev{device_id}] Launching: {' '.join(full_cmd)}") - try: - if quiet: - proc = subprocess.run(full_cmd, check=False, capture_output=True, text=True, timeout=timeout) - else: - proc = subprocess.run( - full_cmd, check=False, stdout=None, stderr=subprocess.PIPE, text=True, timeout=timeout - ) - device_results = _read_results_json(result_path) - if proc.returncode != 0: - if print_log_on_fail and quiet: - logger.error(f"[{tag}:dev{device_id}] Failed:\n{proc.stdout}\n{proc.stderr}") - elif print_log_on_fail and proc.stderr: - logger.error(f"[{tag}:dev{device_id}] stderr:\n{proc.stderr}") - # When the subprocess crashes without reporting per-task failures, - # generate FAIL results for every task that has no result yet so - # that pin-retry can match them by name. 
- if proc.returncode != 0 and not any(not r.passed for r in device_results): - reported_names = {r.name for r in device_results} - error_msg = (proc.stderr or proc.stdout or f"Device worker exited with code {proc.returncode}").strip() - for t in tasks: - if t.name not in reported_names: - device_results.append( - TaskResult( - name=t.name, - platform=t.platform, - passed=False, - device=str(device_id), - attempt=0, - elapsed_s=0, - error=error_msg, - ) - ) - return device_results - except subprocess.TimeoutExpired: - logger.error(f"[{tag}:dev{device_id}] Subprocess timed out after {timeout}s") - device_results = _read_results_json(result_path) - reported_names = {r.name for r in device_results} - for t in tasks: - if t.name not in reported_names: - device_results.append( - TaskResult( - name=t.name, - platform=t.platform, - passed=False, - device=str(device_id), - attempt=0, - elapsed_s=0, - error=f"Timed out after {timeout}s", - ) - ) - return device_results - finally: - task_list_path.unlink(missing_ok=True) - result_path.unlink(missing_ok=True) - - -def _normalize_task_result( - task: TaskSpec, - device_id: int, - attempt: int, - task_results: list[TaskResult], -) -> TaskResult: - matching = [result for result in task_results if result.name == task.name] - source = matching[-1] if matching else task_results[-1] - return TaskResult( - name=task.name, - platform=task.platform, - passed=source.passed, - device=str(device_id), - attempt=attempt, - elapsed_s=source.elapsed_s, - error=source.error, - ) - - -def run_hw_tasks_subprocess( - tasks: list[TaskSpec], - devices: list[int], - args: argparse.Namespace, - pto_isa_commit: str | None = None, -) -> list[TaskResult]: - """Run hardware tasks: one subprocess per task. - - On any failure the device is immediately quarantined (worker exits). Healthy - devices keep pulling from the shared queue. 
Tasks that were never run or failed - are collected so the caller can re-run them in a pin-commit pass with all devices - refreshed. - """ - task_queue: Queue[tuple[TaskSpec, int]] = Queue() - total = len(tasks) - for task in tasks: - task_queue.put((task, 0)) - - results: list[TaskResult] = [] - results_lock = Lock() - completed = [0] # mutable counter for thread-safe increment - quarantined: set[int] = set() - quarantine_lock = Lock() - tag = "hw" - - is_pin_retry = pto_isa_commit is not None - - def _run_device(dev_id: int): - while True: - try: - task, attempt = task_queue.get_nowait() - except Empty: - return - - is_last_attempt = attempt + 1 >= MAX_RETRIES - task_results = _run_device_worker_subprocess( - [task], - dev_id, - args, - tag=tag, - pto_isa_commit=pto_isa_commit, - print_log_on_fail=is_pin_retry and is_last_attempt, - ) - normalized = _normalize_task_result(task, dev_id, attempt, task_results) - with results_lock: - results.append(normalized) - if normalized.passed or is_last_attempt: - completed[0] += 1 - n = completed[0] - status = "PASS" if normalized.passed else "FAIL" - attempt_info = f" attempt {attempt + 1}" if attempt > 0 else "" - logger.info( - f"[{tag}:dev{dev_id}] [{n}/{total}] {status}: {task.name}{attempt_info} ({normalized.elapsed_s:.1f}s)" - ) - - if normalized.passed: - continue - - # Failure: re-enqueue with attempt+1 if under limit, quarantine this device - if not is_last_attempt: - task_queue.put((task, attempt + 1)) - logger.warning(f"[{tag}:dev{dev_id}] Quarantined after failure on {task.name}") - with quarantine_lock: - quarantined.add(dev_id) - return - - threads = [Thread(target=_run_device, args=(device_id,)) for device_id in devices] - for t in threads: - t.start() - for t in threads: - t.join() - - # Tasks stranded in queue — all devices quarantined before queue emptied - while True: - try: - task, attempt = task_queue.get_nowait() - except Empty: - break - results.append( - TaskResult( - name=task.name, - 
platform=task.platform, - passed=False, - device="N/A", - attempt=attempt, - elapsed_s=0, - error="All devices quarantined", - ) - ) - - if quarantined: - logger.warning(f"[{tag}] Quarantined devices: {sorted(quarantined)}") - - return results - - -# --------------------------------------------------------------------------- -# Summary -# --------------------------------------------------------------------------- - - -def print_summary(results: list[TaskResult]) -> int: - """Print results table. Returns exit code (0 = all pass, 1 = failures).""" - # Deduplicate: keep last result per task name (retries produce multiple entries) - final: dict[str, TaskResult] = {} - for r in results: - final[r.name] = r - - ordered = list(final.values()) - pass_count = sum(1 for r in ordered if r.passed) - fail_count = sum(1 for r in ordered if not r.passed) - total = len(ordered) - - is_tty = sys.stdout.isatty() - red = "\033[31m" if is_tty else "" - green = "\033[32m" if is_tty else "" - reset = "\033[0m" if is_tty else "" - - # Column widths - name_w = max((len(r.name) for r in ordered), default=40) - name_w = max(40, min(72, name_w)) - - border = "=" * (name_w + 40) - - # Print failure details first - for r in ordered: - if not r.passed and r.error: - print(f"\n--- FAIL: {r.name} (dev{r.device}, attempt {r.attempt + 1}) ---") - print(r.error) - print("--- END ---") - - print(f"\n{border}") - print(f"{'CI RESULTS SUMMARY':^{len(border)}}") - print(border) - print(f"{'TASK':<{name_w}} {'PLATFORM':<10} {'DEVICE':<8} {'ATTEMPT':<8} {'TIME':<8} RESULT") - print(f"{'-' * name_w} {'-' * 10} {'-' * 8} {'-' * 8} {'-' * 8} ------") - - for r in ordered: - name_display = r.name[: name_w - 3] + "..." 
if len(r.name) > name_w else r.name - status_str = f"{green}PASS{reset}" if r.passed else f"{red}FAIL{reset}" - print( - f"{name_display:<{name_w}} {r.platform:<10} {r.device:<8} " - f"{r.attempt + 1:<8} {r.elapsed_s:.0f}s{'':<5} {status_str}" - ) - - print(border) - print(f"Total: {total} Passed: {pass_count} Failed: {fail_count}") - print(border) - - if fail_count == 0: - print("All tests passed!") - return 0 - return 1 - - -# --------------------------------------------------------------------------- -# PTO-ISA pin on failure (two-pass) -# --------------------------------------------------------------------------- - - -def reset_pto_isa(commit: str, clone_protocol: str) -> str: - """Checkout PTO-ISA at the pinned commit (or re-clone if needed).""" - from simpler_setup.pto_isa import checkout_pto_isa_commit, get_pto_isa_clone_path # noqa: PLC0415 - - clone_path = get_pto_isa_clone_path() - if clone_path.exists(): - checkout_pto_isa_commit(clone_path, commit, verbose=True) - return str(clone_path.resolve()) - return ensure_pto_isa(commit, clone_protocol) - - -# --------------------------------------------------------------------------- -# Device-worker sub-command -# --------------------------------------------------------------------------- - - -def device_worker_main(args: argparse.Namespace) -> int: - """Entry point when invoked as --device-worker. 
Runs all tasks on one device.""" - device_id = args.devices[0] if args.devices else 0 - platform = args.platform - - pto_isa_root = ensure_pto_isa(args.pto_isa_commit, args.clone_protocol) - - tasks = discover_tasks(platform, runtime_filter=args.runtime) - selected_names = _read_task_list_json(args.task_list_json) - if selected_names is not None: - tasks = [task for task in tasks if task.name in selected_names] - if not tasks: - logger.info("No tasks found") - return 0 - - all_results = _run_tasks_on_device(tasks, device_id, platform, pto_isa_root, args) - _write_results_json(all_results, args.result_json) - return print_summary(all_results) - - -def _run_tasks_on_device( - tasks: list[TaskSpec], - device_id: int, - platform: str, - pto_isa_root: str, - args: argparse.Namespace, -) -> list[TaskResult]: - """Compile and run all tasks on a single device. Returns all TaskResults. - - For simulation platforms with sufficient CPUs, tasks are distributed - across multiple virtual device IDs and executed in parallel threads. - ChipWorker.run() internally uses std::thread + join, so GIL is released - during execution, enabling true parallelism. 
- """ - logger.info(f"Compiling {len(tasks)} tasks...") - try: - compiled = compile_all_tasks( - tasks, pto_isa_root, build_runtime=args.build_runtime, run_all_cases=args.run_all_cases - ) - except RuntimeError: - return [ - TaskResult( - name=t.name, - platform=platform, - passed=False, - device=str(device_id), - attempt=0, - elapsed_s=0, - error="compile failed", - ) - for t in tasks - ] - - is_sim = platform.endswith("sim") - if is_sim: - cpu_count = os.cpu_count() or 1 - max_workers = min(max(cpu_count // 20, 1), len(compiled)) - else: - max_workers = 1 - - if max_workers <= 1: - return _run_compiled_tasks(compiled, device_id, platform) - - # Parallel: distribute tasks round-robin across virtual device IDs - buckets: list[list[CompiledTask]] = [[] for _ in range(max_workers)] - for i, ct in enumerate(compiled): - buckets[i % max_workers].append(ct) - - logger.info(f"[sim] Parallel execution: {max_workers} workers, {len(compiled)} tasks") - - results: list[TaskResult] = [] - results_lock = Lock() - completed_count = [0] - total = len(compiled) - - def _worker(worker_id: int, worker_tasks: list[CompiledTask]): - dev_id = worker_id - worker_results = _run_compiled_tasks(worker_tasks, dev_id, platform) - with results_lock: - for r in worker_results: - completed_count[0] += 1 - n = completed_count[0] - results.append(r) - status = "PASS" if r.passed else "FAIL" - logger.info(f"[dev{dev_id}] [{n}/{total}] {status}: {r.name} ({r.elapsed_s:.1f}s)") - - threads = [] - for i in range(max_workers): - if not buckets[i]: - continue - t = Thread(target=_worker, args=(i, buckets[i])) - t.start() - threads.append(t) - - for t in threads: - t.join() - - return results - - -def _run_compiled_tasks( - compiled: list[CompiledTask], - device_id: int, - platform: str, -) -> list[TaskResult]: - """Run compiled tasks serially on a single device.""" - - groups = group_by_runtime(compiled) - all_results: list[TaskResult] = [] - - for rt_name, group_tasks in groups.items(): - rt_bins = 
cast(RuntimeBinariesLike, group_tasks[0].runtime_bins) - worker = ChipWorker() - try: - worker.init( - str(rt_bins.host_path), - str(rt_bins.aicpu_path), - str(rt_bins.aicore_path), - sim_context_lib_path=str(rt_bins.sim_context_path) if rt_bins.sim_context_path else "", - ) - worker.set_device(device_id) - except Exception as e: - logger.error(f"[dev{device_id}] Failed to init ChipWorker for {rt_name}: {e}") - all_results.extend( - TaskResult( - name=ct.spec.name, - platform=platform, - passed=False, - device=str(device_id), - attempt=0, - elapsed_s=0, - error=str(e), - ) - for ct in group_tasks - ) - continue - - for ct in group_tasks: - start = time.monotonic() - try: - run_single_task(ct, worker, device_id) - elapsed = time.monotonic() - start - logger.info(f"[dev{device_id}] PASS: {ct.spec.name} ({elapsed:.1f}s)") - all_results.append( - TaskResult( - name=ct.spec.name, - platform=platform, - passed=True, - device=str(device_id), - attempt=0, - elapsed_s=elapsed, - ) - ) - except Exception as e: - elapsed = time.monotonic() - start - logger.error(f"[dev{device_id}] FAIL: {ct.spec.name} ({elapsed:.1f}s): {e}") - all_results.append( - TaskResult( - name=ct.spec.name, - platform=platform, - passed=False, - device=str(device_id), - attempt=0, - elapsed_s=elapsed, - error=str(e), - ) - ) - - worker.reset_device() - worker.finalize() - - return all_results - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - - -def _discover_valid_platforms() -> list[str]: - """Discover valid platforms from src/ directory structure (mirrors ci.sh logic).""" - platforms = [] - src_dir = PROJECT_ROOT / "src" - if not src_dir.is_dir(): - return platforms - for arch_dir in sorted(src_dir.iterdir()): - if not arch_dir.is_dir(): - continue - arch = arch_dir.name - platform_dir = arch_dir / "platform" - if (platform_dir / "onboard").is_dir(): - platforms.append(arch) - if 
(platform_dir / "sim").is_dir(): - platforms.append(f"{arch}sim") - return platforms - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Batch CI test runner with ChipWorker reuse") - parser.add_argument("-p", "--platform", default=None) - parser.add_argument("-d", "--device", dest="device_range", default="0") - parser.add_argument("-r", "--runtime", default=None) - parser.add_argument( - "--build-runtime", - action="store_true", - help="Rebuild runtime binaries from src/ instead of using pre-built build/lib artifacts", - ) - parser.add_argument("-c", "--pto-isa-commit", default=None) - parser.add_argument("-t", "--timeout", type=int, default=600) - parser.add_argument("--clone-protocol", choices=["ssh", "https"], default="ssh") - parser.add_argument("--all", dest="run_all_cases", action="store_true", help="Run all cases, not just DEFAULT_CASE") - parser.add_argument( - "--log-level", choices=LOG_LEVEL_CHOICES, default=DEFAULT_LOG_LEVEL, help="Root logger level (default: info)" - ) - parser.add_argument("--device-worker", action="store_true", help=argparse.SUPPRESS) - parser.add_argument("--result-json", default=None, help=argparse.SUPPRESS) - parser.add_argument("--task-list-json", default=None, help=argparse.SUPPRESS) - return parser.parse_args() - - -def parse_device_range(device_range: str) -> list[int]: - if "-" in device_range: - start, end = device_range.split("-", 1) - return list(range(int(start), int(end) + 1)) - return [int(device_range)] - - -def _run_with_timeout( - phase_name: str, - timeout_s: int, - runner: Callable[[], list[TaskResult]], -) -> list[TaskResult]: - def _watchdog_handler(signum, frame): - print(f"\n{'=' * 40}", flush=True) - print( - f"[CI] TIMEOUT: {phase_name} exceeded {timeout_s}s ({timeout_s // 60}min) limit, aborting", - flush=True, - ) - print(f"{'=' * 40}", flush=True) - os._exit(1) - - previous_handler = signal.getsignal(signal.SIGALRM) - signal.signal(signal.SIGALRM, _watchdog_handler) 
- signal.alarm(timeout_s) - try: - return runner() - finally: - signal.alarm(0) - signal.signal(signal.SIGALRM, previous_handler) - - -def _run_single_platform(platform: str, args: argparse.Namespace) -> list[TaskResult]: - """Run all tasks for a single platform. Returns list of TaskResults.""" - is_sim = platform.endswith("sim") - - # Ensure PTO-ISA is available before task discovery so that downstream - # pytest scene tests (which share the same clone path) can find it even - # when ci.py itself has no tasks to run. - ensure_pto_isa(args.pto_isa_commit, args.clone_protocol) - - tasks = discover_tasks(platform, runtime_filter=args.runtime) - if not tasks: - logger.info(f"[{platform}] No tasks found") - return [] - logger.info(f"[{platform}] Discovered {len(tasks)} tasks") - - # Compile and run via subprocess isolation. - # Sim: single subprocess with all tasks (ChipWorker reuse + parallel within). - # HW: one subprocess per task with device-level quarantine. - sub_args = argparse.Namespace(**vars(args)) - sub_args.platform = platform - if is_sim: - all_results = _run_device_worker_subprocess(tasks, 0, sub_args, tag="sim", timeout=args.timeout, quiet=False) - else: - all_results = _run_with_timeout( - f"{platform} initial pass", - args.timeout, - lambda: run_hw_tasks_subprocess(tasks, args.devices, sub_args), - ) - - # Pin retry — re-run failed tasks with pinned PTO-ISA commit. 
- final: dict[str, TaskResult] = {} - for r in all_results: - final[r.name] = r - failures = [r for r in final.values() if not r.passed] - - if failures and args.pto_isa_commit: - failed_names = {r.name for r in failures} - failed_tasks = [t for t in tasks if t.name in failed_names] - logger.info(f"[{platform}] {len(failed_tasks)} failure(s), retrying with pinned PTO-ISA {args.pto_isa_commit}") - if is_sim: - pin_results = _run_device_worker_subprocess( - failed_tasks, - 0, - sub_args, - tag="sim", - pto_isa_commit=args.pto_isa_commit, - print_log_on_fail=True, - quiet=False, - timeout=args.timeout, - ) - else: - pin_results = _run_with_timeout( - f"{platform} pin retry", - args.timeout, - lambda: run_hw_tasks_subprocess( - failed_tasks, - args.devices, - sub_args, - pto_isa_commit=args.pto_isa_commit, - ), - ) - all_results.extend(pin_results) - - return all_results - - -def main() -> int: - args = parse_args() - configure_logging(args.log_level) - args.devices = parse_device_range(args.device_range) - - valid_platforms = _discover_valid_platforms() - - # Device-worker sub-command (always needs explicit -p) - if args.device_worker: - if not args.platform: - print("--device-worker requires -p/--platform") - return 1 - return device_worker_main(args) - - # Determine which platforms to run - if args.platform: - if args.platform not in valid_platforms: - print(f"Unknown platform: {args.platform}") - print(f"Valid platforms: {' '.join(valid_platforms)}") - return 1 - platforms = [args.platform] - else: - # No -p: run all sim platforms - platforms = [p for p in valid_platforms if p.endswith("sim")] - if not platforms: - print("No sim platforms found") - return 1 - logger.info(f"No platform specified, running all sim platforms: {', '.join(platforms)}") - - all_results: list[TaskResult] = [] - for platform in platforms: - all_results.extend(_run_single_platform(platform, args)) - - if not all_results: - logger.info("No tasks found") - return 0 - - return 
print_summary(all_results) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/ci.sh b/ci.sh index b46377c53..c15ef0c84 100755 --- a/ci.sh +++ b/ci.sh @@ -323,9 +323,16 @@ run_task() { local start_time=$SECONDS local -a cmd - cmd=(env PYTHONDONTWRITEBYTECODE=1 python examples/scripts/run_example.py - -k "${dir}/kernels" -g "${dir}/golden.py" - -p "$platform" --clone-protocol "$CLONE_PROTOCOL" "${commit_flag[@]}") + # Prefer test_*.py if available + local test_file + test_file=$(find "$dir" -maxdepth 1 -name 'test_*.py' -print -quit 2>/dev/null || true) + if [[ -n "$test_file" ]]; then + cmd=(env PYTHONDONTWRITEBYTECODE=1 python "$test_file" + -p "$platform" --clone-protocol "$CLONE_PROTOCOL" "${commit_flag[@]}") + else + echo "[${platform}] SKIP: no test_*.py found in $dir" + return 1 + fi [[ -n "$device_id" ]] && cmd+=(-d "$device_id") # Progress to stdout (not captured in log) diff --git a/conftest.py b/conftest.py index 97e43d534..4ff21bfc4 100644 --- a/conftest.py +++ b/conftest.py @@ -146,13 +146,17 @@ def pytest_configure(config): os.environ["PTO_LOG_LEVEL"] = log_level commit = config.getoption("--pto-isa-commit") - if commit: + clone_protocol = config.getoption("--clone-protocol") + # Always pre-clone PTO-ISA so the clone_protocol is respected (CI needs + # https, but scene_test.py defaults to ssh). Previously ci.py handled + # this; now conftest owns it. + if commit or clone_protocol != "ssh": from simpler_setup.pto_isa import ensure_pto_isa_root # noqa: PLC0415 root = ensure_pto_isa_root( verbose=True, commit=commit, - clone_protocol=config.getoption("--clone-protocol"), + clone_protocol=clone_protocol, ) if root: os.environ["PTO_ISA_ROOT"] = root diff --git a/examples/scripts/run_example.py b/examples/scripts/run_example.py deleted file mode 100644 index e9eb24bdb..000000000 --- a/examples/scripts/run_example.py +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Simplified test runner for PTO runtime tests. - -This script provides a command-line interface to run PTO runtime tests -with minimal configuration. Users only need to provide: -1. A kernels directory with kernel_config.py -2. A golden.py script - -Usage: - python examples/scripts/run_example.py --kernels ./my_test/kernels --golden ./my_test/golden.py - python examples/scripts/run_example.py -k ./kernels -g ./golden.py --device 0 --platform a2a3sim - -Examples: - # Run hardware example (requires Ascend device) - python examples/scripts/run_example.py -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py - - # Run simulation example (no hardware required) - python examples/scripts/run_example.py -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py \ - -p a2a3sim - - # Run with specific device - python examples/scripts/run_example.py -k ./kernels -g ./golden.py -d 0 -""" - -import argparse -import logging -import os -import sys -import time -from pathlib import Path - -from simpler_setup.code_runner import create_code_runner -from simpler_setup.log_config import DEFAULT_LOG_LEVEL, LOG_LEVEL_CHOICES, configure_logging - -project_root = 
Path(__file__).parent.parent.parent - -logger = logging.getLogger(__name__) - - -def _get_device_log_dir(device_id): - """Return the device log directory using the same logic as device_log_resolver.""" - ascend_work_path = os.environ.get("ASCEND_WORK_PATH") - if ascend_work_path: - root = Path(ascend_work_path).expanduser() / "log" / "debug" - if root.exists(): - return root / f"device-{device_id}" - return Path.home() / "ascend" / "log" / "debug" / f"device-{device_id}" - - -def _wait_for_new_device_log(log_dir, pre_run_logs, timeout=15, interval=0.5): - """Wait for a new device log file that wasn't present before the run. - - CANN dlog writes device logs asynchronously, so the file may appear - a few seconds after the run completes. - """ - deadline = time.monotonic() + timeout - while time.monotonic() < deadline: - if log_dir.exists(): - current_logs = set(log_dir.glob("*.log")) - new_logs = current_logs - pre_run_logs - if new_logs: - return max(new_logs, key=lambda p: p.stat().st_mtime) - time.sleep(interval) - return None - - -def main(): # noqa: PLR0912 - import warnings # noqa: PLC0415 - - warnings.warn( - "run_example.py is deprecated. Use 'python test_*.py' with the same CLI options instead. 
" - "See docs/testing.md for details.", - DeprecationWarning, - stacklevel=1, - ) - - parser = argparse.ArgumentParser( - description="Run PTO runtime test with kernel config and golden script", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - python examples/scripts/run_example.py --kernels ./my_test/kernels --golden ./my_test/golden.py - python examples/scripts/run_example.py -k ./kernels -g ./golden.py -d 0 - -Golden.py interface: - def generate_inputs(params: dict) -> dict: - '''Return dict of torch tensors (inputs + outputs)''' - return {"a": torch.tensor(...), "out_f": torch.zeros(...)} - - def compute_golden(tensors: dict, params: dict) -> None: - '''Compute expected outputs in-place''' - tensors["out_f"][:] = tensors["a"] + 1 - - # Optional — for parameterized test cases: - ALL_CASES = {"Case1": {"size": 1024}, "Case2": {"size": 2048}} - DEFAULT_CASE = "Case1" - RTOL = 1e-5 # Relative tolerance - ATOL = 1e-5 # Absolute tolerance - __outputs__ = ["out_f"] # Or use 'out_' prefix - """, - ) - - parser.add_argument( - "-k", - "--kernels", - required=True, - help="Path to kernels directory containing kernel_config.py", - ) - - parser.add_argument("-g", "--golden", required=True, help="Path to golden.py script") - - parser.add_argument("-d", "--device", type=int, default=0, help="Device ID (default: 0)") - - parser.add_argument( - "-p", - "--platform", - default="a2a3", - choices=["a2a3", "a2a3sim", "a5", "a5sim"], - help="Platform name: 'a2a3'/'a5' for hardware, 'a2a3sim'/'a5sim' for simulation (default: a2a3)", - ) - - parser.add_argument( - "--log-level", - choices=LOG_LEVEL_CHOICES, - default=DEFAULT_LOG_LEVEL, - help=f"Root logger level (default: {DEFAULT_LOG_LEVEL})", - ) - - parser.add_argument( - "--enable-profiling", - action="store_true", - help="Enable profiling and generate swimlane.json", - ) - - parser.add_argument( - "--dump-tensor", - action="store_true", - help="Dump per-task tensor I/O at runtime (controlled by 
enable_dump_tensor flag)", - ) - - parser.add_argument( - "--all", - action="store_true", - help="Run all test cases defined in ALL_CASES (default: run only DEFAULT_CASE)", - ) - - parser.add_argument( - "--case", - type=str, - default=None, - help="Run a specific test case by name (e.g., --case Case2)", - ) - - parser.add_argument( - "-c", - "--pto-isa-commit", - type=str, - default=None, - help="Checkout PTO-ISA at this commit (e.g., -c 1b22fea)", - ) - - parser.add_argument( - "--rounds", - type=int, - default=None, - metavar="ROUNDS", - help="Number of rounds to run per case (overrides kernel_config RUNTIME_CONFIG['rounds'])", - ) - - parser.add_argument( - "--clone-protocol", - choices=["ssh", "https"], - default="ssh", - help="Git protocol for cloning pto-isa (default: ssh)", - ) - - parser.add_argument( - "--skip-golden", - action="store_true", - help="Skip golden computation and comparison (for benchmarking)", - ) - - parser.add_argument( - "--build", - action="store_true", - help="Compile runtime from source instead of using pre-built binaries", - ) - - args = parser.parse_args() - - if args.all and args.case: - parser.error("--all and --case are mutually exclusive") - - configure_logging(args.log_level) - - if args.rounds is not None and args.rounds > 1 and args.enable_profiling: - logger.warning("Profiling disabled: --rounds > 1") - args.enable_profiling = False - - # Validate paths - kernels_path = Path(args.kernels) - golden_path = Path(args.golden) - - if not kernels_path.exists(): - logger.error(f"Kernels directory not found: {kernels_path}") - return 1 - - if not golden_path.exists(): - logger.error(f"Golden script not found: {golden_path}") - return 1 - - kernel_config_path = kernels_path / "kernel_config.py" - if not kernel_config_path.exists(): - logger.error(f"kernel_config.py not found in {kernels_path}") - return 1 - - try: - runner = create_code_runner( - kernels_dir=str(args.kernels), - golden_path=str(args.golden), - device_id=args.device, 
- platform=args.platform, - enable_profiling=args.enable_profiling, - enable_dump_tensor=args.dump_tensor, - run_all_cases=args.all, - case_name=args.case, - pto_isa_commit=args.pto_isa_commit, - build_runtime=args.build, - repeat_rounds=args.rounds, - clone_protocol=args.clone_protocol, - skip_golden=args.skip_golden, - ) - - # Snapshot existing device logs before the run so we can identify the - # new log created by this run (CANN writes device logs asynchronously). - pre_run_device_logs = set() - device_log_dir = None - if args.enable_profiling and args.platform == "a2a3": - device_log_dir = _get_device_log_dir(args.device) - if device_log_dir.exists(): - pre_run_device_logs = set(device_log_dir.glob("*.log")) - - runner.run() - logger.info("=" * 60) - logger.info("TEST PASSED") - logger.info("=" * 60) - - # If profiling was enabled, generate merged swimlane JSON - if args.enable_profiling: - logger.info("Generating swimlane visualization...") - kernel_config_path = kernels_path / "kernel_config.py" - swimlane_script = project_root / "tools" / "swimlane_converter.py" - - if swimlane_script.exists(): - import subprocess # noqa: PLC0415 - - try: - cmd = [ - sys.executable, - str(swimlane_script), - "-k", - str(kernel_config_path), - ] - - # Find the device log created by this run via snapshot diff - if device_log_dir is not None: - device_log_file = _wait_for_new_device_log(device_log_dir, pre_run_device_logs) - if device_log_file: - cmd += ["--device-log", str(device_log_file)] - else: - logger.warning("No new device log found, falling back to device-id") - cmd += ["-d", str(args.device)] - else: - cmd += ["-d", str(args.device)] - - if logger.isEnabledFor(logging.DEBUG): - cmd.append("-v") - - result = subprocess.run(cmd, check=True, capture_output=True, text=True) - logger.info(result.stdout) - logger.info("Swimlane JSON generation completed") - except subprocess.CalledProcessError as e: - logger.warning(f"Failed to generate swimlane JSON: {e}") - 
logger.debug(f"stderr: {e.stderr}") - else: - logger.warning(f"Swimlane converter script not found: {swimlane_script}") - - return 0 - - except ImportError as e: - logger.error(f"Import error: {e}") - logger.error("Make sure you're running from the project root directory.") - return 1 - - except Exception as e: - logger.error(f"TEST FAILED: {e}") - if logger.isEnabledFor(logging.DEBUG): - import traceback # noqa: PLC0415 - - traceback.print_exc() - return 1 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh index 5e11f4327..e11048d1a 100755 --- a/tools/benchmark_rounds.sh +++ b/tools/benchmark_rounds.sh @@ -19,7 +19,6 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" -RUN_EXAMPLE="$PROJECT_ROOT/examples/scripts/run_example.py" # --------------------------------------------------------------------------- # Examples to benchmark and their case lists, per runtime. @@ -388,7 +387,7 @@ run_bench() { trap 'rm -f -- "$pre_log_file"' RETURN ls -1 "$DEVICE_LOG_DIR"/*.log 2>/dev/null | sort > "$pre_log_file" || true - # Build run command: prefer test_*.py, fall back to run_example.py + # Build run command using test_*.py local test_file test_file=$(find "$example_dir" -maxdepth 1 -name 'test_*.py' -print -quit 2>/dev/null || true) @@ -400,14 +399,8 @@ run_bench() { -n "$ROUNDS" --skip-golden ) else - local kernels_dir="$example_dir/kernels" - local golden="$example_dir/golden.py" - run_cmd=( - python3 "$RUN_EXAMPLE" - -k "$kernels_dir" -g "$golden" - -p "$PLATFORM" -d "$DEVICE_ID" - -n "$ROUNDS" --skip-golden - ) + echo " SKIPPED: no test_*.py found in $example_dir" + return fi if [[ -n "$case_name" ]]; then run_cmd+=(--case "$case_name") diff --git a/tools/verify_packaging.sh b/tools/verify_packaging.sh index 2e897978a..c5da1f304 100755 --- a/tools/verify_packaging.sh +++ b/tools/verify_packaging.sh @@ -63,12 +63,6 @@ print('simpler_setup:', 
simpler_setup.__file__) echo "::group::[${mode}] standalone test_*.py --help" python tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py --help >/dev/null echo "::endgroup::" - echo "::group::[${mode}] ci.py --help" - python ci.py --help >/dev/null - echo "::endgroup::" - echo "::group::[${mode}] run_example.py --help" - python examples/scripts/run_example.py --help >/dev/null - echo "::endgroup::" echo "smoke[${mode}] OK" }