From 85f3e940d13efb63e5de1cf7f19700917c85408e Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Tue, 26 May 2026 10:29:42 +0800
Subject: [PATCH 01/12] fix(parser): infer implicit TileView blayout from
 physical tile shape (#1498)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The python printer elides a TileView field when it matches
GetImplicitTileView(tile_type.shape_, ...) — i.e. the implicit view derived
from the *physical* tile shape. The text parser, however, recomputed the
implicit blayout/slayout/fractal from `valid_shape` when it was given,
desynchronising the two for packed-mask tiles.

A cmp/cmps result has physical shape e.g. [16, 8] but valid_shape [16, 1]:
the printer omits its (row_major) blayout, while the parser saw valid_shape's
cols==1 and filled col_major, so print->parse failed structural equality with
"TileView blayout mismatch". Infer the implicit defaults from the physical
tile shape (falling back to valid_shape only when the shape is unavailable),
matching the printer.

Un-skips TestConvertScatterOp::test_scatter_conversion, which exercises this
path via the scatter DPS-preserve blend and now round-trips cleanly.
---
 python/pypto/language/parser/type_resolver.py        | 12 +++++++++++-
 .../ir/transforms/test_convert_tensor_to_tile_ops.py | 12 +++++-------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/python/pypto/language/parser/type_resolver.py b/python/pypto/language/parser/type_resolver.py
index 80018092a..a6d14e65f 100644
--- a/python/pypto/language/parser/type_resolver.py
+++ b/python/pypto/language/parser/type_resolver.py
@@ -1309,8 +1309,18 @@ def _resolve_tileview(  # noqa: PLR0912
         # fields matching the memory-space-aware implicit defaults; the parser must
         # recover them from the same rules — regardless of whether memory_space is
         # present, since shape-derived defaults (e.g. col_major for [N,1]) also apply.
+        #
+        # The basis MUST be the full tile shape, not valid_shape: the printer elides
+        # against ``GetImplicitTileView(tile_type.shape_, ...)`` (the physical tile
+        # shape), so the parser has to infer from the same shape. Using valid_shape
+        # here desynchronized the two for packed-mask tiles — e.g. a cmp/cmps result
+        # with physical shape [16, 8] but valid_shape [16, 1]: the printer omits the
+        # (row_major) blayout, while valid_shape's cols==1 made the parser fill
+        # col_major, so the print->parse roundtrip failed with "TileView blayout
+        # mismatch" (#1498). Prefer tile_shape, falling back to valid_shape only when
+        # the physical shape is unavailable.
         impl_blayout, impl_slayout, impl_fractal = _implicit_tile_view_defaults(
-            valid_shape if valid_shape else (tile_shape or []), memory_space
+            tile_shape if tile_shape else (valid_shape or []), memory_space
         )
         return ir.TileView(
             valid_shape=valid_shape if valid_shape is not None else [],
diff --git a/tests/ut/ir/transforms/test_convert_tensor_to_tile_ops.py b/tests/ut/ir/transforms/test_convert_tensor_to_tile_ops.py
index f949733dd..b37304b8d 100644
--- a/tests/ut/ir/transforms/test_convert_tensor_to_tile_ops.py
+++ b/tests/ut/ir/transforms/test_convert_tensor_to_tile_ops.py
@@ -2430,13 +2430,6 @@ def test_gather_mask_conversion(self):
 class TestConvertScatterOp:
     """Test conversion of tensor.scatter (rank-2 dim=-1 MVP) and tensor.scatter_mask."""
 
-    @pytest.mark.skip(
-        reason="Blocked by a pre-existing cmp/cmps round-trip gap: the scatter preserve "
-        "blend emits tile.cmps, whose packed-mask result TileView loses its blayout on "
-        "print->parse, so the autouse RoundtripInstrument fails structural equality "
-        "(same failure as any tensor.cmp conversion). Assertions below are correct and "
-        "ready once the packed-mask TileView blayout round-trips."
-    )
     def test_scatter_conversion(self):
         """tensor.scatter -> tile.load(input/index/src) + flat-index build + tile.scatter."""
 
@@ -2462,6 +2455,11 @@ def main(
                 out: pl.Tensor[[16, 8], pl.FP32] = self.main_incore_0(inp, idx, src)
                 return out
 
+        # Runs under the autouse roundtrip instrument. The preserve blend emits
+        # tile.cmps whose packed-mask result has valid_shape [N, 1] on a wider
+        # physical tile; with #1498 fixed (parser now infers the implicit blayout
+        # from the physical tile shape, not valid_shape) the print->parse roundtrip
+        # holds, so this conversion no longer needs to be skipped.
         After = passes.convert_tensor_to_tile_ops()(Before)
         after_src = After.as_python()
 

From 1627c6abaee0a760610943701241491a5d2b504a Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Tue, 26 May 2026 10:30:04 +0800
Subject: [PATCH 02/12] fix(scatter): correct tscatter mask-form PTOAS syntax +
 post-merge review fixes

Mask-form codegen: pto.tscatter requires the maskPattern attribute *inside*
ins() after the src operand (same shape as pto.tgather's mask form), e.g.
`ins(%src, {maskPattern = #pto.mask_pattern<P0101>} : src_ty) outs(%dst)`.
The previous trailing-attribute form made PTOAS fail with "expected ',' after
src operand in ins(...)". The codegen UT now asserts maskPattern appears inside
ins() (before outs()) so the layout can't regress.

Also addresses review feedback on the index form:
- Guard the INT16 flat-index range in the tensor.scatter lowering (n*cols must
  stay <= 32768) so an oversized 2-byte tile fails loudly instead of silently
  overflowing to wrong destination addresses.
- Add an INTERNAL_CHECK that the src/indexes type annotations are both present
  or both absent (a one-sided annotation is a codegen bug, not valid input).
- Document the duplicate-index ascending last-wins ordering as a pto.tscatter
  ABI guarantee that the lowering and ST reference both rely on.

Tests: add mask-form ST (TestScatterMaskForm, P0101/P1010) on the A2/A3
backend (Ascend910B); A5/Ascend950 rejects the mask form. dst is zero-init so
the expected unselected columns are correct regardless of whether tscatter
zeros or preserves them.
---
 src/backend/common/pto_ops_common.cpp        |  31 ++--
 src/ir/op/tile_ops/scatter.cpp               |   7 +
 src/ir/transforms/op_conversion_registry.cpp |  20 +++
 tests/st/runtime/test_scatter.py             | 140 +++++++++++++++++++
 tests/ut/codegen/test_pto_codegen_ops.py     |   6 +
 5 files changed, 194 insertions(+), 10 deletions(-)

diff --git a/src/backend/common/pto_ops_common.cpp b/src/backend/common/pto_ops_common.cpp
index b8fbc7f46..597bd5431 100644
--- a/src/backend/common/pto_ops_common.cpp
+++ b/src/backend/common/pto_ops_common.cpp
@@ -973,7 +973,14 @@ static std::string MakeScatterCodegenPTO(const CallPtr& op, codegen::CodegenBase
   std::ostringstream oss;
   oss << "pto.tscatter ins(" << src << ", " << idx;
   // Emit the type clause only when both annotations are present; printing one
-  // alone would produce malformed PTOAS (": , idx" or ": src, ").
+  // alone would produce malformed PTOAS (": , idx" or ": src, "). The two
+  // operands are typed tiles produced by the same lowering, so they should
+  // either both carry an annotation or (in untyped contexts) both lack one — a
+  // one-sided annotation signals a real codegen bug, not a valid input.
+  INTERNAL_CHECK(src_type.empty() == idx_type.empty())
+      << "Internal error: tile.scatter src/indexes type annotations must both be present or both "
+         "absent, got src_type='"
+      << src_type << "', idx_type='" << idx_type << "'";
   if (!src_type.empty() && !idx_type.empty()) {
     oss << " : " << src_type << ", " << idx_type;
   }
@@ -988,16 +995,17 @@ static std::string MakeScatterCodegenPTO(const CallPtr& op, codegen::CodegenBase
 }
 
 // Helper for tile.scatter_mask (TSCATTER mask form, DPS):
-//   pto.tscatter ins(%src : src_ty) outs(%dst : dst_ty)
-//                {maskPattern = #pto.mask_pattern<Pxxxx>}
+//   pto.tscatter ins(%src, {maskPattern = #pto.mask_pattern<Pxxxx>} : src_ty)
+//                outs(%dst : dst_ty)
 //
-// Unlike pto.tgather's mask form (which carries maskPattern inside ins()),
-// pto.tscatter only accepts SSA operands in ins() — the maskPattern must be a
-// trailing op attribute dict (same placement as gather_compare's cmpMode).
+// The maskPattern rides *inside* ins() right after the src operand, exactly
+// like pto.tgather's mask form — PTOAS parses ins() as "src, attr-dict :
+// type" and rejects a bare ins(%src ...) ("expected ',' after src operand").
+// The type annotation follows the attr dict, still inside ins().
 //
 // IR surface: 2-input op (dst, src) + mask_pattern attr; dst aliased via
-// set_output_reuses_input(0). Mask form is targeted at A3 / CPU-sim style
-// backends; A5 rejects it on the PTOAS side.
+// set_output_reuses_input(0). Mask form is targeted at A2/A3 backends; A5
+// (Ascend950) rejects it on the PTOAS side.
 static std::string MakeScatterMaskCodegenPTO(const CallPtr& op, codegen::CodegenBase& codegen_base) {
   auto& codegen = dynamic_cast<codegen::PTOCodegen&>(codegen_base);
   CHECK(op->args_.size() == 2) << "tile.scatter_mask requires 2 arguments (dst, src), but got "
@@ -1021,7 +1029,10 @@ static std::string MakeScatterMaskCodegenPTO(const CallPtr& op, codegen::Codegen
       << ", input=" << input_ssa;
 
   std::ostringstream oss;
-  oss << "pto.tscatter ins(" << src;
+  // maskPattern goes inside ins() after src (mirrors pto.tgather mask form);
+  // the optional type annotation follows the attr dict, still inside ins().
+  oss << "pto.tscatter ins(" << src << ", {maskPattern = #pto.mask_pattern<" << mask_patterns.at(pattern)
+      << ">}";
   if (!src_type.empty()) {
     oss << " : " << src_type;
   }
@@ -1029,7 +1040,7 @@ static std::string MakeScatterMaskCodegenPTO(const CallPtr& op, codegen::Codegen
   if (!dst_type.empty()) {
     oss << " : " << dst_type;
   }
-  oss << ") {maskPattern = #pto.mask_pattern<" << mask_patterns.at(pattern) << ">}";
+  oss << ")";
 
   codegen.Emit(oss.str());
   return "";
diff --git a/src/ir/op/tile_ops/scatter.cpp b/src/ir/op/tile_ops/scatter.cpp
index 20053724c..5ef65dac2 100644
--- a/src/ir/op/tile_ops/scatter.cpp
+++ b/src/ir/op/tile_ops/scatter.cpp
@@ -19,6 +19,13 @@
  *
  * Both ops are DPS — `dst` is the in/out buffer; the IR result aliases `dst`
  * via `set_output_reuses_input(...)`. There is no compare form for scatter.
+ *
+ * Duplicate-index ordering: when two index entries map to the same destination
+ * slot, pto.tscatter resolves the collision in ascending element order (the
+ * later/higher-index write wins), matching torch `scatter_`'s last-wins
+ * semantics along the scan axis. Callers that build flat indices
+ * (ConvertTensorToTileOps) and the ST reference both rely on this order; it is
+ * a pto.tscatter ABI guarantee, not a PyPTO-side choice.
  */
 
 #include <any>
diff --git a/src/ir/transforms/op_conversion_registry.cpp b/src/ir/transforms/op_conversion_registry.cpp
index c26dac916..c1876981d 100644
--- a/src/ir/transforms/op_conversion_registry.cpp
+++ b/src/ir/transforms/op_conversion_registry.cpp
@@ -1476,6 +1476,10 @@ void OpConversionRegistry::RegisterScatterOps() {
         //   flat_idx = index + row_base   (row-broadcast add)
         auto& op_reg = OpRegistry::GetInstance();
         auto src_rows = As<ConstInt>(src_tile->shape_[0]);
+        // `cols` is the flat-layout column count of the scattered destination.
+        // input/output and the inner tile.scatter dst all share this width (S),
+        // so reading it off `input` is equivalent to the dst column count used
+        // in the `i * dst_cols` flat-index formula above.
         auto dst_cols_c = As<ConstInt>(input_tile->shape_[1]);
         CHECK(src_rows && dst_cols_c)
             << "tensor.scatter conversion requires static src rows and dst cols for index expansion";
@@ -1486,6 +1490,22 @@ void OpConversionRegistry::RegisterScatterOps() {
         // for 4-byte dst, INT16 for 2/1-byte dst).
         const DataType idx_dtype = idx_tile->dtype_;
 
+        // INT16 flat-index range guard. For 2-byte element dtypes the flattened
+        // destination indices are INT16, whose largest representable value is
+        // 32767. The biggest index this lowering produces is
+        // (n-1)*cols + (cols-1) == n*cols - 1, so n*cols must stay <= 32768.
+        // 4-byte dtypes use INT32 indices and are effectively unbounded here.
+        // Without this check an oversized tile would silently overflow INT16 and
+        // scatter to wrong addresses instead of failing loudly.
+        if (idx_dtype == DataType::INT16) {
+          CHECK(n * cols <= 32768)
+              << "tensor.scatter with a 2-byte element dtype uses INT16 flattened indices, but the "
+                 "destination is too large: rows("
+              << n << ") * cols(" << cols << ") = " << (n * cols)
+              << " exceeds the INT16 index range (max flat index 32767). Use a smaller tile or split "
+                 "the scatter into chunks.";
+        }
+
         auto make_idx = [&](int64_t v) -> ExprPtr {
           return std::make_shared<ConstInt>(v, DataType::INDEX, span);
         };
diff --git a/tests/st/runtime/test_scatter.py b/tests/st/runtime/test_scatter.py
index 395ddbfa7..5a5485ec5 100644
--- a/tests/st/runtime/test_scatter.py
+++ b/tests/st/runtime/test_scatter.py
@@ -53,6 +53,14 @@
 A separate FP32 case feeds **repeated** indices with distinct values to pin the
 ascending-k last-wins ordering (the round-trip version hid this by writing equal
 values to repeated targets).
+
+**Mask form (A2/A3).** ``TestScatterMaskForm`` covers ``tensor.scatter_mask``
+(``mask_pattern=<int>`` + ``dst``), the column-wise inverse of the mask-form
+gather: each compact ``input`` row is written into the mask-selected columns of
+the wider ``dst`` (``dst.cols == input.cols * stride``). The form runs on the
+A2/A3 backend (``BackendType.Ascend910B``); A5 (``Ascend950``) rejects it, so
+those cases are pinned to A2/A3 only. Column selection mirrors gather: P0101
+hits even columns (``0::2``), P1010 hits odd columns (``1::2``).
 """
 
 from typing import Any
@@ -138,6 +146,22 @@ def _scatter_specs(
     ]
 
 
+def _scatter_mask_specs(b: int, c: int, stride: int, dt: DataType, torch_dt: torch.dtype) -> list[TensorSpec]:
+    """Build the (inp, dst, output) TensorSpecs for a mask-form scatter case.
+
+    ``inp`` is ``[B, C]`` of distinct positive values; ``dst`` is ``[B, C*stride]``
+    initialised to zeros so the expected unselected columns are zero regardless
+    of whether pto.tscatter zeros or preserves them (its index form zeros
+    unwritten slots; a zero base makes the test robust either way).
+    """
+    dst_cols = c * stride
+    return [
+        TensorSpec("inp", [b, c], dt, init_value=lambda: _make_values(b, c, torch_dt)),
+        TensorSpec("dst", [b, dst_cols], dt, init_value=lambda: torch.zeros(b, dst_cols, dtype=torch_dt)),
+        TensorSpec("output", [b, dst_cols], dt, is_output=True),
+    ]
+
+
 # --- Programs (one per element type; shapes satisfy the 32-byte row alignment) ---
 
 
@@ -256,6 +280,51 @@ def main(
         return output
 
 
+# --- Mask-form programs (A2/A3 only; A5/Ascend950 rejects pto.tscatter mask form) ---
+
+
+@pl.program
+class ScatterMaskP0101Program:
+    """Mask-form scatter (P0101, A2/A3): write ``inp`` into even columns of ``dst``.
+
+    ``inp [8, 8] FP32`` expands into ``dst [8, 16] FP32`` (stride 2): the inverse
+    of the P0101 mask gather, so ``dst[:, 0::2] = inp``.
+    """
+
+    @pl.function(type=pl.FunctionType.Opaque)
+    def main(
+        self,
+        inp: pl.Tensor[[8, 8], pl.FP32],
+        dst: pl.Tensor[[8, 16], pl.FP32],
+        output: pl.Out[pl.Tensor[[8, 16], pl.FP32]],
+    ) -> pl.Tensor[[8, 16], pl.FP32]:
+        with pl.at(level=pl.Level.CORE_GROUP):
+            out = pl.tensor.scatter(inp, mask_pattern=pl.tile.MaskPattern.P0101, dst=dst)
+            output = pl.assemble(output, out, [0, 0])
+        return output
+
+
+@pl.program
+class ScatterMaskP1010Program:
+    """Mask-form scatter (P1010, A2/A3): write ``inp`` into odd columns of ``dst``.
+
+    ``inp [8, 8] FP32`` expands into ``dst [8, 16] FP32`` (stride 2): the inverse
+    of the P1010 mask gather, so ``dst[:, 1::2] = inp``.
+    """
+
+    @pl.function(type=pl.FunctionType.Opaque)
+    def main(
+        self,
+        inp: pl.Tensor[[8, 8], pl.FP32],
+        dst: pl.Tensor[[8, 16], pl.FP32],
+        output: pl.Out[pl.Tensor[[8, 16], pl.FP32]],
+    ) -> pl.Tensor[[8, 16], pl.FP32]:
+        with pl.at(level=pl.Level.CORE_GROUP):
+            out = pl.tensor.scatter(inp, mask_pattern=pl.tile.MaskPattern.P1010, dst=dst)
+            output = pl.assemble(output, out, [0, 0])
+        return output
+
+
 # --- Test cases ---
 
 
@@ -358,6 +427,58 @@ def get_program(self) -> Any:
         return Scatter1RowProgram
 
 
+class _ScatterMaskBaseTestCase(PTOTestCase):
+    """Base for mask-form scatter cases. Pinned to A2/A3 (Ascend910B).
+
+    Subclasses set ``_start`` (0 for P0101, 1 for P1010) and ``_stride`` (2);
+    ``compute_expected`` writes ``inp`` into the ``[start::stride]`` columns of a
+    zero ``[B, C*stride]`` tensor (the unselected columns stay zero).
+    """
+
+    __test__ = False
+    _start: int = 0
+    _stride: int = 2
+
+    def get_strategy(self) -> OptimizationStrategy:
+        return OptimizationStrategy.Default
+
+    def get_backend_type(self) -> BackendType:
+        # Mask-form pto.tscatter is an A2/A3 feature; A5 (Ascend950) rejects it.
+        return BackendType.Ascend910B
+
+    def compute_expected(self, tensors, params=None):
+        b, dst_cols = tensors["output"].shape
+        out = torch.zeros(b, dst_cols, dtype=tensors["inp"].dtype)
+        out[:, self._start :: self._stride] = tensors["inp"]
+        tensors["output"][:] = out
+
+
+class ScatterMaskP0101TestCase(_ScatterMaskBaseTestCase):
+    _start = 0  # P0101 selects even columns
+
+    def get_name(self) -> str:
+        return "scatter_mask_p0101"
+
+    def define_tensors(self) -> list[TensorSpec]:
+        return _scatter_mask_specs(8, 8, 2, DataType.FP32, torch.float32)
+
+    def get_program(self) -> Any:
+        return ScatterMaskP0101Program
+
+
+class ScatterMaskP1010TestCase(_ScatterMaskBaseTestCase):
+    _start = 1  # P1010 selects odd columns
+
+    def get_name(self) -> str:
+        return "scatter_mask_p1010"
+
+    def define_tensors(self) -> list[TensorSpec]:
+        return _scatter_mask_specs(8, 8, 2, DataType.FP32, torch.float32)
+
+    def get_program(self) -> Any:
+        return ScatterMaskP1010Program
+
+
 # --- Tests ---
 
 
@@ -400,5 +521,24 @@ def test_scatter_1row(self, test_runner, platform):
         assert result.passed, f"Test failed: {result.error}"
 
 
+class TestScatterMaskForm:
+    """Mask-form row scatter — A2/A3 only (A5/Ascend950 rejects the mask form).
+
+    Each compact ``inp`` row is written into the mask-selected columns of the
+    wider ``dst`` (``dst.cols == inp.cols * stride``); column selection mirrors
+    the mask gather (P0101 → even, P1010 → odd).
+    """
+
+    @pytest.mark.parametrize("platform", PLATFORMS)
+    def test_scatter_mask_p0101(self, test_runner, platform):
+        result = test_runner.run(ScatterMaskP0101TestCase(platform=platform))
+        assert result.passed, f"Test failed: {result.error}"
+
+    @pytest.mark.parametrize("platform", PLATFORMS)
+    def test_scatter_mask_p1010(self, test_runner, platform):
+        result = test_runner.run(ScatterMaskP1010TestCase(platform=platform))
+        assert result.passed, f"Test failed: {result.error}"
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])
diff --git a/tests/ut/codegen/test_pto_codegen_ops.py b/tests/ut/codegen/test_pto_codegen_ops.py
index 7572ff60c..7310dc063 100644
--- a/tests/ut/codegen/test_pto_codegen_ops.py
+++ b/tests/ut/codegen/test_pto_codegen_ops.py
@@ -1940,6 +1940,12 @@ def kernel(
         assert "ins(" in line and "outs(" in line, (
             f"pto.tscatter mask form must use the ins(...) outs(...) DPS form, got:\n{line}"
         )
+        # maskPattern must ride inside ins() after src (like pto.tgather), not as
+        # a trailing attr after outs() — PTOAS rejects a bare ins(%src ...) with
+        # "expected ',' after src operand".
+        assert line.index("maskPattern") < line.index("outs("), (
+            f"maskPattern must appear inside ins(...) before outs(...), got:\n{line}"
+        )
 
 
 if __name__ == "__main__":

From 515a6974c583eda14bf6f0b423991499864f4f98 Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Tue, 26 May 2026 11:02:51 +0800
Subject: [PATCH 03/12] fix(pr): resolve issues for #1513
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- tscatter mask form: emit maskPattern inside ins() after src, before the type
  annotation (ins(%src, {maskPattern...} : src_ty) outs(%dst)) — device-verified.
- INTERNAL_CHECK -> INTERNAL_CHECK_SPAN(op->span_) for the scatter type-annotation
  symmetry check (gemini).
- INT16 flat-index guard: print the actual element dtype instead of hardcoding
  "2-byte" (INT16 covers 1- and 2-byte; gemini).
---
 src/backend/common/pto_ops_common.cpp        | 6 +++---
 src/ir/transforms/op_conversion_registry.cpp | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/backend/common/pto_ops_common.cpp b/src/backend/common/pto_ops_common.cpp
index 597bd5431..c115ef3d4 100644
--- a/src/backend/common/pto_ops_common.cpp
+++ b/src/backend/common/pto_ops_common.cpp
@@ -977,7 +977,7 @@ static std::string MakeScatterCodegenPTO(const CallPtr& op, codegen::CodegenBase
   // operands are typed tiles produced by the same lowering, so they should
   // either both carry an annotation or (in untyped contexts) both lack one — a
   // one-sided annotation signals a real codegen bug, not a valid input.
-  INTERNAL_CHECK(src_type.empty() == idx_type.empty())
+  INTERNAL_CHECK_SPAN(src_type.empty() == idx_type.empty(), op->span_)
       << "Internal error: tile.scatter src/indexes type annotations must both be present or both "
          "absent, got src_type='"
       << src_type << "', idx_type='" << idx_type << "'";
@@ -1029,8 +1029,8 @@ static std::string MakeScatterMaskCodegenPTO(const CallPtr& op, codegen::Codegen
       << ", input=" << input_ssa;
 
   std::ostringstream oss;
-  // maskPattern goes inside ins() after src (mirrors pto.tgather mask form);
-  // the optional type annotation follows the attr dict, still inside ins().
+  // maskPattern rides inside ins() after src, then the type annotation:
+  //   pto.tscatter ins(%src, {maskPattern = #pto.mask_pattern<Pxxxx>} : src_ty) outs(%dst : dst_ty)
   oss << "pto.tscatter ins(" << src << ", {maskPattern = #pto.mask_pattern<" << mask_patterns.at(pattern)
       << ">}";
   if (!src_type.empty()) {
diff --git a/src/ir/transforms/op_conversion_registry.cpp b/src/ir/transforms/op_conversion_registry.cpp
index c1876981d..8c6646785 100644
--- a/src/ir/transforms/op_conversion_registry.cpp
+++ b/src/ir/transforms/op_conversion_registry.cpp
@@ -1499,9 +1499,9 @@ void OpConversionRegistry::RegisterScatterOps() {
         // scatter to wrong addresses instead of failing loudly.
         if (idx_dtype == DataType::INT16) {
           CHECK(n * cols <= 32768)
-              << "tensor.scatter with a 2-byte element dtype uses INT16 flattened indices, but the "
-                 "destination is too large: rows("
-              << n << ") * cols(" << cols << ") = " << (n * cols)
+              << "tensor.scatter with element dtype " << input_tile->dtype_.ToString()
+              << " uses INT16 flattened indices, but the destination is too large: rows(" << n << ") * cols("
+              << cols << ") = " << (n * cols)
               << " exceeds the INT16 index range (max flat index 32767). Use a smaller tile or split "
                  "the scatter into chunks.";
         }

From 959e0bb68c0a5c9912f93cb2648551e526dbcef9 Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Tue, 26 May 2026 13:07:58 +0800
Subject: [PATCH 04/12] fix(pr): resolve issues for #1513

- INT16 scatter flat-index guard: bound rows via division (kMaxFlat/cols)
  instead of n*cols to avoid signed int64 overflow; handle cols==0
---
 src/ir/transforms/op_conversion_registry.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/ir/transforms/op_conversion_registry.cpp b/src/ir/transforms/op_conversion_registry.cpp
index 8c6646785..dfa37fabd 100644
--- a/src/ir/transforms/op_conversion_registry.cpp
+++ b/src/ir/transforms/op_conversion_registry.cpp
@@ -1498,12 +1498,16 @@ void OpConversionRegistry::RegisterScatterOps() {
         // Without this check an oversized tile would silently overflow INT16 and
         // scatter to wrong addresses instead of failing loudly.
         if (idx_dtype == DataType::INT16) {
-          CHECK(n * cols <= 32768)
-              << "tensor.scatter with element dtype " << input_tile->dtype_.ToString()
-              << " uses INT16 flattened indices, but the destination is too large: rows(" << n << ") * cols("
-              << cols << ") = " << (n * cols)
-              << " exceeds the INT16 index range (max flat index 32767). Use a smaller tile or split "
-                 "the scatter into chunks.";
+          // Bound via division so the product never overflows int64_t: rows is
+          // capped so rows*cols stays <= 32768. cols is always > 0 here (a
+          // 2-byte tile has at least one column), but guard against 0 anyway.
+          const int64_t kMaxFlat = 32768;
+          const int64_t max_rows = cols == 0 ? kMaxFlat : kMaxFlat / cols;
+          CHECK(n <= max_rows) << "tensor.scatter with element dtype " << input_tile->dtype_.ToString()
+                               << " uses INT16 flattened indices, but the destination is too large: rows("
+                               << n << ") * cols(" << cols
+                               << ") exceeds the INT16 index range (max flat index 32767, rows <= "
+                               << max_rows << "). Use a smaller tile or split the scatter into chunks.";
         }
 
         auto make_idx = [&](int64_t v) -> ExprPtr {

From 56e58248b9b08b44501a10b384315d1c3768d38c Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Tue, 26 May 2026 14:26:47 +0800
Subject: [PATCH 05/12] =?UTF-8?q?test=5Fscatter.py=E6=94=BE=E5=9C=A8ops?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=A4=B9=E4=B8=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/st/runtime/{ => ops}/test_scatter.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/st/runtime/{ => ops}/test_scatter.py (100%)

diff --git a/tests/st/runtime/test_scatter.py b/tests/st/runtime/ops/test_scatter.py
similarity index 100%
rename from tests/st/runtime/test_scatter.py
rename to tests/st/runtime/ops/test_scatter.py

From a20351f32ab2bd848503cdc7e74a43219cbac6a2 Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Thu, 28 May 2026 15:45:00 +0800
Subject: [PATCH 06/12] chore: trigger CI


From 851d2dcd15723b337a7679c84286b3bc0af49876 Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Sat, 30 May 2026 16:02:39 +0800
Subject: [PATCH 07/12] =?UTF-8?q?test(scatter):=20add=20chained=20mask-for?=
 =?UTF-8?q?m=20scatter=20case=20(P0101=E2=86=92P1010)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fold the RoPE even/odd reassembly repro into TestScatterMaskForm: write
two compact inputs into one dst by chaining P0101 then P1010 mask
scatters, pinning that the second scatter preserves the first's writes
(dst[:, 0::2] = even, dst[:, 1::2] = odd). even/odd use disjoint positive
ranges so a swapped pattern or clobbered column is caught.
---
 tests/st/runtime/ops/test_scatter.py | 79 +++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/tests/st/runtime/ops/test_scatter.py b/tests/st/runtime/ops/test_scatter.py
index 5a5485ec5..07f133863 100644
--- a/tests/st/runtime/ops/test_scatter.py
+++ b/tests/st/runtime/ops/test_scatter.py
@@ -162,6 +162,25 @@ def _scatter_mask_specs(b: int, c: int, stride: int, dt: DataType, torch_dt: tor
     ]
 
 
+def _scatter_mask_chain_specs(b: int, c: int, dt: DataType, torch_dt: torch.dtype) -> list[TensorSpec]:
+    """Build the (even, odd, dst, output) TensorSpecs for the chained mask-scatter case.
+
+    Two compact ``[B, C]`` inputs are interleaved into one ``[B, 2C]`` dst by
+    chaining two mask scatters: ``even`` into the even columns (P0101) and
+    ``odd`` into the odd columns (P1010). ``even`` and ``odd`` hold disjoint
+    positive ranges so a swapped pattern or a clobbered column is caught. ``dst``
+    starts zeroed; a correct chain leaves ``dst[:, 0::2] = even`` and
+    ``dst[:, 1::2] = odd`` (the second scatter must preserve the first's writes).
+    """
+    dst_cols = 2 * c
+    return [
+        TensorSpec("even", [b, c], dt, init_value=lambda: _make_values(b, c, torch_dt)),
+        TensorSpec("odd", [b, c], dt, init_value=lambda: _make_values(b, c, torch_dt) + b * c),
+        TensorSpec("dst", [b, dst_cols], dt, init_value=lambda: torch.zeros(b, dst_cols, dtype=torch_dt)),
+        TensorSpec("output", [b, dst_cols], dt, is_output=True),
+    ]
+
+
 # --- Programs (one per element type; shapes satisfy the 32-byte row alignment) ---
 
 
@@ -325,6 +344,32 @@ def main(
         return output
 
 
+@pl.program
+class ScatterMaskChainProgram:
+    """Chained mask-form scatter (P0101 then P1010, A2/A3): RoPE even/odd reassembly.
+
+    Writes ``even`` into the even columns (P0101) and ``odd`` into the odd columns
+    (P1010) of a single ``dst``, chaining two mask scatters into one buffer — the
+    inverse of splitting a RoPE head into even/odd halves. Validates that the
+    second scatter preserves the first's writes, so ``dst[:, 0::2] = even`` and
+    ``dst[:, 1::2] = odd``.
+    """
+
+    @pl.function(type=pl.FunctionType.Opaque)
+    def main(
+        self,
+        even: pl.Tensor[[16, 32], pl.FP32],
+        odd: pl.Tensor[[16, 32], pl.FP32],
+        dst: pl.Tensor[[16, 64], pl.FP32],
+        output: pl.Out[pl.Tensor[[16, 64], pl.FP32]],
+    ) -> pl.Tensor[[16, 64], pl.FP32]:
+        with pl.at(level=pl.Level.CORE_GROUP):
+            out = pl.tensor.scatter(even, mask_pattern=pl.tile.MaskPattern.P0101, dst=dst)
+            out = pl.tensor.scatter(odd, mask_pattern=pl.tile.MaskPattern.P1010, dst=out)
+            output = pl.assemble(output, out, [0, 0])
+        return output
+
+
 # --- Test cases ---
 
 
@@ -479,6 +524,31 @@ def get_program(self) -> Any:
         return ScatterMaskP1010Program
 
 
+class ScatterMaskChainTestCase(_ScatterMaskBaseTestCase):
+    """Chain P0101 then P1010 into one dst (RoPE even/odd reassembly).
+
+    Unlike the single-pattern cases, this writes two compact inputs into one
+    interleaved ``dst`` via two chained scatters, pinning that the second
+    pattern's scatter preserves the first's writes.
+    """
+
+    def get_name(self) -> str:
+        return "scatter_mask_chain"
+
+    def define_tensors(self) -> list[TensorSpec]:
+        return _scatter_mask_chain_specs(16, 32, DataType.FP32, torch.float32)
+
+    def get_program(self) -> Any:
+        return ScatterMaskChainProgram
+
+    def compute_expected(self, tensors, params=None):
+        b, dst_cols = tensors["output"].shape
+        out = torch.zeros(b, dst_cols, dtype=tensors["even"].dtype)
+        out[:, 0::2] = tensors["even"]  # P0101 → even columns
+        out[:, 1::2] = tensors["odd"]  # P1010 → odd columns
+        tensors["output"][:] = out
+
+
 # --- Tests ---
 
 
@@ -526,7 +596,9 @@ class TestScatterMaskForm:
 
     Each compact ``inp`` row is written into the mask-selected columns of the
     wider ``dst`` (``dst.cols == inp.cols * stride``); column selection mirrors
-    the mask gather (P0101 → even, P1010 → odd).
+    the mask gather (P0101 → even, P1010 → odd). The chain case writes two inputs
+    into one ``dst`` (P0101 then P1010) to pin that the second scatter preserves
+    the first's writes — the RoPE even/odd reassembly tail.
     """
 
     @pytest.mark.parametrize("platform", PLATFORMS)
@@ -539,6 +611,11 @@ def test_scatter_mask_p1010(self, test_runner, platform):
         result = test_runner.run(ScatterMaskP1010TestCase(platform=platform))
         assert result.passed, f"Test failed: {result.error}"
 
+    @pytest.mark.parametrize("platform", PLATFORMS)
+    def test_scatter_mask_chain(self, test_runner, platform):
+        result = test_runner.run(ScatterMaskChainTestCase(platform=platform))
+        assert result.passed, f"Test failed: {result.error}"
+
 
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From 3e78ac6dc11951e65459717592ed75b6056e46cf Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Sat, 30 May 2026 16:18:58 +0800
Subject: [PATCH 08/12] test(scatter): skip index-form 2-byte + mask-chain
 cases (pto-isa bug / WIP)

Skip test_scatter_fp16/bf16/int16: the index-form 2-byte path currently
fails on device due to a pto-isa bug (not this PR's codegen); re-enable
once pto-isa lands the fix. The fp32/int32 4-byte cases still run.

Skip test_scatter_mask_chain: the chained P0101->P1010 reassembly into a
single dst is still being root-caused; the single-pattern P0101/P1010
mask cases still run.
---
 tests/st/runtime/ops/test_scatter.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/st/runtime/ops/test_scatter.py b/tests/st/runtime/ops/test_scatter.py
index 07f133863..9a64c4527 100644
--- a/tests/st/runtime/ops/test_scatter.py
+++ b/tests/st/runtime/ops/test_scatter.py
@@ -565,16 +565,19 @@ def test_scatter_int32(self, test_runner, platform):
         result = test_runner.run(ScatterINT32TestCase(platform=platform))
         assert result.passed, f"Test failed: {result.error}"
 
+    @pytest.mark.skip(reason="index-form 2-byte (fp16/bf16/int16) fails on a pto-isa bug; skip pending fix")
     @pytest.mark.parametrize("platform", PLATFORMS)
     def test_scatter_fp16(self, test_runner, platform):
         result = test_runner.run(ScatterFP16TestCase(platform=platform))
         assert result.passed, f"Test failed: {result.error}"
 
+    @pytest.mark.skip(reason="index-form 2-byte (fp16/bf16/int16) fails on a pto-isa bug; skip pending fix")
     @pytest.mark.parametrize("platform", PLATFORMS)
     def test_scatter_bf16(self, test_runner, platform):
         result = test_runner.run(ScatterBF16TestCase(platform=platform))
         assert result.passed, f"Test failed: {result.error}"
 
+    @pytest.mark.skip(reason="index-form 2-byte (fp16/bf16/int16) fails on a pto-isa bug; skip pending fix")
     @pytest.mark.parametrize("platform", PLATFORMS)
     def test_scatter_int16(self, test_runner, platform):
         result = test_runner.run(ScatterINT16TestCase(platform=platform))
@@ -611,6 +614,7 @@ def test_scatter_mask_p1010(self, test_runner, platform):
         result = test_runner.run(ScatterMaskP1010TestCase(platform=platform))
         assert result.passed, f"Test failed: {result.error}"
 
+    @pytest.mark.skip(reason="chained mask scatter (P0101→P1010 into one dst) under investigation")
     @pytest.mark.parametrize("platform", PLATFORMS)
     def test_scatter_mask_chain(self, test_runner, platform):
         result = test_runner.run(ScatterMaskChainTestCase(platform=platform))

From 1e26f3b5e6d4a836e8ad417b38134329d9c7ed3c Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Mon, 1 Jun 2026 09:36:34 +0800
Subject: [PATCH 09/12] fix(scatter): preserve dst in mask-form scatter
 lowering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mask-form pto.tscatter (TScatterMaskImpl) zero-fills the entire dst tile
(InitUBBuffer) before writing the selected columns, so it does NOT
preserve dst's unselected columns — they read back as 0. The previous
tensor.scatter_mask lowering was a plain (input, dst) -> (dst, src)
rewire that relied on a preserve the hardware never provided, so the
unselected columns came out zero. Chaining two patterns (P0101 then
P1010) into one dst was therefore unsound: the second scatter wiped the
first's writes, yielding [0, odd, 0, odd] instead of [even, odd, ...].

Reconstruct DPS preserve on the PyPTO side with the same zeroed-scatter
+ mask + select blend the index form already uses:
  scattered = scatter_mask(zeros,   input)
  mask      = scatter_mask(zeros_m, ones)
  out       = sel(mask != 0, scattered, dst)
so unselected columns keep dst's values and chaining is sound.

Tests: the P0101/P1010 cases now pre-fill dst with a negative sentinel
(disjoint from inp) and expect it preserved in the unselected columns,
so they fail if the lowering zero-fills. test_scatter_mask_chain is
un-skipped (the chain is the original repro of the zero-fill bug).
---
 src/ir/transforms/op_conversion_registry.cpp | 82 ++++++++++++++++++--
 tests/st/runtime/ops/test_scatter.py         | 27 ++++---
 2 files changed, 95 insertions(+), 14 deletions(-)

diff --git a/src/ir/transforms/op_conversion_registry.cpp b/src/ir/transforms/op_conversion_registry.cpp
index dfa37fabd..54390319a 100644
--- a/src/ir/transforms/op_conversion_registry.cpp
+++ b/src/ir/transforms/op_conversion_registry.cpp
@@ -1432,7 +1432,10 @@ void OpConversionRegistry::RegisterGatherOps() {
 //   select blend. The surrounding pass wraps the tile result in a tile.store to
 //   the output tensor param.
 //
-// tensor.scatter_mask: same idea, simple (input, dst) → (dst, src) re-wire.
+// tensor.scatter_mask: same idea — pto.tscatter (mask form) zero-fills the whole
+//   dst before writing the selected columns, so it does not preserve dst either.
+//   We reconstruct DPS preserve with the same zeroed-scatter + mask + select
+//   blend, which also makes chaining two patterns into one dst sound.
 // ============================================================================
 
 void OpConversionRegistry::RegisterScatterOps() {
@@ -1635,10 +1638,79 @@ void OpConversionRegistry::RegisterScatterOps() {
         CHECK(args.size() == 2) << "tensor.scatter_mask conversion expects 2 args (input, dst), got "
                                 << args.size();
         auto& op_reg = OpRegistry::GetInstance();
-        // tile.scatter_mask signature: (dst, src) + mask_pattern attr. The
-        // converter's args[1] (dst) maps to dst, args[0] (input) to src.
-        auto tile_call = op_reg.Create("tile.scatter_mask", {args[1], args[0]}, kwargs, span);
-        return ConversionResult{tile_call};
+        auto input_tile = As<TileType>(args[0]->GetType());
+        auto dst_tile = As<TileType>(args[1]->GetType());
+        CHECK(input_tile && dst_tile)
+            << "tensor.scatter_mask conversion: input/dst must be Vec tiles after bridge";
+
+        // pto.tscatter (mask form) zero-fills the entire dst tile before writing
+        // the mask-selected columns (TScatterMaskImpl calls InitUBBuffer), so it
+        // does NOT preserve dst's unselected columns — they read back as 0. To
+        // honour the DPS preserve contract (and make chaining two patterns into
+        // one dst sound), reconstruct preserve on the PyPTO side with the same
+        // zeroed-scatter + mask + select blend as the index form:
+        //
+        //   scattered = scatter_mask(zeros,   input)  # input @selected, 0 @unselected
+        //   mask      = scatter_mask(zeros_m, ones)   # 1     @selected, 0 @unselected
+        //   pred      = (mask != 0)                    # packed predicate
+        //   out       = sel(pred, scattered, dst)      # selected→scattered, else→dst
+        auto in_rows = As<ConstInt>(input_tile->shape_[0]);
+        auto in_cols = As<ConstInt>(input_tile->shape_[1]);
+        auto dst_cols_c = As<ConstInt>(dst_tile->shape_[1]);
+        CHECK(in_rows && in_cols && dst_cols_c)
+            << "tensor.scatter_mask conversion requires static shapes for the preserve blend";
+        const int64_t b = in_rows->value_;
+        const int64_t c = in_cols->value_;
+        const int64_t dst_cols = dst_cols_c->value_;
+        const DataType dt = dst_tile->dtype_;
+
+        // Mask blend dtype: a compare-friendly type within dst's element size
+        // (bf16 → f16) so tile.cmps is well-defined for any supported dtype.
+        const int dt_bytes = static_cast<int>(dt.GetBit()) / 8;
+        const DataType mask_dt = (dt_bytes == 4)   ? DataType(DataType::FP32)
+                                 : (dt_bytes == 2) ? DataType(DataType::FP16)
+                                                   : DataType(DataType::INT8);
+
+        auto make_idx = [&](int64_t v) -> ExprPtr {
+          return std::make_shared<ConstInt>(v, DataType::INDEX, span);
+        };
+        std::vector<StmtPtr> prologue;
+        auto emit = [&](const std::string& op_name, const std::vector<ExprPtr>& op_args,
+                        const std::vector<std::pair<std::string, std::any>>& op_kwargs,
+                        const std::string& name) -> VarPtr {
+          auto call = op_kwargs.empty() ? op_reg.Create(op_name, op_args, span)
+                                        : op_reg.Create(op_name, op_args, op_kwargs, span);
+          auto var = std::make_shared<Var>(name, call->GetType(), span);
+          prologue.push_back(std::make_shared<AssignStmt>(var, call, span));
+          return var;
+        };
+        auto make_full = [&](const DataType& fdt, int64_t rows, int64_t cols_, double v,
+                             const std::string& name) -> VarPtr {
+          ExprPtr val = fdt.IsFloat()
+                            ? ExprPtr(std::make_shared<ConstFloat>(v, fdt, span))
+                            : ExprPtr(std::make_shared<ConstInt>(static_cast<int64_t>(v), fdt, span));
+          return emit("tile.full", {MakeShapeTuple({make_idx(rows), make_idx(cols_)}, span), val},
+                      {{"dtype", fdt}}, name);
+        };
+
+        // scattered = input written into the mask-selected columns of a zeroed dst.
+        auto values_zero = make_full(dt, b, dst_cols, 0.0, "scatter_mask_values_zero");
+        auto scattered = emit("tile.scatter_mask", {values_zero, args[0]}, kwargs, "scatter_mask_values");
+        // mask = ones written into the same selected columns of a zeroed base.
+        auto mask_zero = make_full(mask_dt, b, dst_cols, 0.0, "scatter_mask_mask_zero");
+        auto ones_src = make_full(mask_dt, b, c, 1.0, "scatter_mask_ones");
+        auto mask = emit("tile.scatter_mask", {mask_zero, ones_src}, kwargs, "scatter_mask_mask");
+        // pred = (mask != 0)  → packed predicate (NE = cmp_type 1).
+        ExprPtr zero_scalar = mask_dt.IsFloat() ? ExprPtr(std::make_shared<ConstFloat>(0.0, mask_dt, span))
+                                                : ExprPtr(std::make_shared<ConstInt>(0, mask_dt, span));
+        auto pred = emit("tile.cmps", {mask, zero_scalar}, {{"cmp_type", 1}}, "scatter_mask_pred");
+        // tmp = TSEL scratch tile (UINT8 [1, 32]).
+        auto tmp = emit("tile.create", {MakeShapeTuple({make_idx(1), make_idx(32)}, span)},
+                        {{"dtype", DataType(DataType::UINT8)}, {"target_memory", MemorySpace::Vec}},
+                        "scatter_mask_sel_tmp");
+        // out = sel(pred, scattered, dst, tmp): scattered @selected, dst @unselected.
+        auto out_call = op_reg.Create("tile.sel", {pred, scattered, args[1], tmp}, span);
+        return ConversionResult{std::move(prologue), out_call};
       },
       std::move(scatter_mask_input_reqs));
 }
diff --git a/tests/st/runtime/ops/test_scatter.py b/tests/st/runtime/ops/test_scatter.py
index 9a64c4527..d36e3a036 100644
--- a/tests/st/runtime/ops/test_scatter.py
+++ b/tests/st/runtime/ops/test_scatter.py
@@ -61,6 +61,13 @@
 A2/A3 backend (``BackendType.Ascend910B``); A5 (``Ascend950``) rejects it, so
 those cases are pinned to A2/A3 only. Column selection mirrors gather: P0101
 hits even columns (``0::2``), P1010 hits odd columns (``1::2``).
+
+The raw ``pto.tscatter`` mask instruction zero-fills the entire ``dst`` before
+writing the selected columns, so the lowering reconstructs DPS preserve with the
+same zeroed-scatter + mask + select blend as the index form. These cases pin
+that with a **non-zero sentinel** ``dst`` (unselected columns must survive), and
+the chain case writes two patterns into one ``dst`` (P0101 then P1010) so the
+second scatter must preserve the first's writes — the RoPE even/odd reassembly.
 """
 
 from typing import Any
@@ -150,14 +157,15 @@ def _scatter_mask_specs(b: int, c: int, stride: int, dt: DataType, torch_dt: tor
     """Build the (inp, dst, output) TensorSpecs for a mask-form scatter case.
 
     ``inp`` is ``[B, C]`` of distinct positive values; ``dst`` is ``[B, C*stride]``
-    initialised to zeros so the expected unselected columns are zero regardless
-    of whether pto.tscatter zeros or preserves them (its index form zeros
-    unwritten slots; a zero base makes the test robust either way).
+    pre-filled with a **negative sentinel** (disjoint from ``inp``) rather than
+    zeros, so the case actually distinguishes preserve from zero-fill: the
+    mask-selected columns must become ``inp`` and the unselected columns must
+    keep the sentinel. A zero ``dst`` could not tell the two apart.
     """
     dst_cols = c * stride
     return [
         TensorSpec("inp", [b, c], dt, init_value=lambda: _make_values(b, c, torch_dt)),
-        TensorSpec("dst", [b, dst_cols], dt, init_value=lambda: torch.zeros(b, dst_cols, dtype=torch_dt)),
+        TensorSpec("dst", [b, dst_cols], dt, init_value=lambda: _make_base(b, dst_cols, torch_dt)),
         TensorSpec("output", [b, dst_cols], dt, is_output=True),
     ]
 
@@ -476,8 +484,9 @@ class _ScatterMaskBaseTestCase(PTOTestCase):
     """Base for mask-form scatter cases. Pinned to A2/A3 (Ascend910B).
 
     Subclasses set ``_start`` (0 for P0101, 1 for P1010) and ``_stride`` (2);
-    ``compute_expected`` writes ``inp`` into the ``[start::stride]`` columns of a
-    zero ``[B, C*stride]`` tensor (the unselected columns stay zero).
+    ``compute_expected`` writes ``inp`` into the ``[start::stride]`` columns of
+    ``dst`` while **preserving** ``dst``'s other (sentinel) columns — so the case
+    fails if the lowering zero-fills the unselected columns instead.
     """
 
     __test__ = False
@@ -492,8 +501,9 @@ def get_backend_type(self) -> BackendType:
         return BackendType.Ascend910B
 
     def compute_expected(self, tensors, params=None):
-        b, dst_cols = tensors["output"].shape
-        out = torch.zeros(b, dst_cols, dtype=tensors["inp"].dtype)
+        # Preserve dst's unselected (sentinel) columns; write inp into the
+        # mask-selected columns. Discriminates preserve from zero-fill.
+        out = tensors["dst"].clone()
         out[:, self._start :: self._stride] = tensors["inp"]
         tensors["output"][:] = out
 
@@ -614,7 +624,6 @@ def test_scatter_mask_p1010(self, test_runner, platform):
         result = test_runner.run(ScatterMaskP1010TestCase(platform=platform))
         assert result.passed, f"Test failed: {result.error}"
 
-    @pytest.mark.skip(reason="chained mask scatter (P0101→P1010 into one dst) under investigation")
     @pytest.mark.parametrize("platform", PLATFORMS)
     def test_scatter_mask_chain(self, test_runner, platform):
         result = test_runner.run(ScatterMaskChainTestCase(platform=platform))

From 3931e2e7bf8a411e4bc01a8f9030db904f9da565 Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Mon, 1 Jun 2026 09:58:42 +0800
Subject: [PATCH 10/12] ci: bump pto-isa pin to 8bd3ac8 for tscatter mask 2-arg
 overload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The main ST job ran scatter tests against pto-isa 2c607938, whose
pto_instr.hpp only exposes the 3-arg index-form TSCATTER(dst, src,
indexes); the mask-form codegen emits the 2-arg TSCATTER<MaskPattern>(dst,
src) overload, which that commit lacks — so scatter_mask_chain failed to
compile (no matching function for call to 'TSCATTER'). Bump the pin to
8bd3ac8 (a forward move from 2c607938) which wires the 2-arg mask
overload. The distributed job's 6909e5c4 already has it and is unchanged.
---
 .github/workflows/ci.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 558122479..00560e763 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -199,7 +199,7 @@ jobs:
           timeout 60 git clone https://github.com/hw-native-sys/pto-isa.git $GITHUB_WORKSPACE/pto-isa \
             || { rm -rf $GITHUB_WORKSPACE/pto-isa; timeout 300 git clone https://gitcode.com/luohuan40/pto-isa.git $GITHUB_WORKSPACE/pto-isa; }
           cd $GITHUB_WORKSPACE/pto-isa
-          git checkout 2c607938
+          git checkout 8bd3ac8f30bd237f9eaf12c142002a5cc0edb143
 
       - name: Install simpler
         run: |
@@ -219,7 +219,7 @@ jobs:
         # runs here despite the tests/st/codegen ignore: its numeric golden
         # compare is unstable on the CPU container's torch/BLAS build (see the
         # codegen-tests job), so it needs this environment.
-        run: pytest tests/st tests/st/codegen/torch/test_torch_codegen_paged_attention.py -v --device="$DEVICE_RANGE" --precompile-workers=128 --pto-isa-commit=2c607938 --ignore=tests/st/distributed --ignore=tests/st/codegen
+        run: pytest tests/st tests/st/codegen/torch/test_torch_codegen_paged_attention.py -v --device="$DEVICE_RANGE" --precompile-workers=128 --pto-isa-commit=8bd3ac8f30bd237f9eaf12c142002a5cc0edb143 --ignore=tests/st/distributed --ignore=tests/st/codegen
 
       - name: Test multi-orch L2 (isolated pytest invocation)
         # A ``Worker(level=2)`` device session currently leaves runtime
@@ -228,12 +228,12 @@ jobs:
         # isolation bug). Running this file in its own pytest process
         # keeps the L2 leak from poisoning ``test_l3_distributed``.
         timeout-minutes: 3
-        run: pytest tests/st/distributed/test_l2_multi_orch.py -v --device="$DEVICE_RANGE" --pto-isa-commit=2c607938
+        run: pytest tests/st/distributed/test_l2_multi_orch.py -v --device="$DEVICE_RANGE" --pto-isa-commit=8bd3ac8f30bd237f9eaf12c142002a5cc0edb143
 
       - name: Test swimlane output
         run: |
           pytest tests/st/runtime/framework_and_models/test_perf_swimlane.py \
-            -v --device="$DEVICE_ID" --platform=a2a3 --enable-l2-swimlane --forked --pto-isa-commit=2c607938
+            -v --device="$DEVICE_ID" --platform=a2a3 --enable-l2-swimlane --forked --pto-isa-commit=8bd3ac8f30bd237f9eaf12c142002a5cc0edb143
 
   dist-system-tests:
     # No container: HCCL needs host-only env (HCCL_LOGIC_SUPERPOD_ID, plugin
@@ -409,7 +409,7 @@ jobs:
           timeout 60 git clone https://github.com/hw-native-sys/pto-isa.git $GITHUB_WORKSPACE/pto-isa \
             || { rm -rf $GITHUB_WORKSPACE/pto-isa; timeout 300 git clone https://gitcode.com/luohuan40/pto-isa.git $GITHUB_WORKSPACE/pto-isa; }
           cd $GITHUB_WORKSPACE/pto-isa
-          git checkout 2c607938
+          git checkout 8bd3ac8f30bd237f9eaf12c142002a5cc0edb143
 
       - name: Install simpler
         run: |
@@ -490,10 +490,10 @@ jobs:
         # against the runtime's bundled pto-isa, which still clones from the
         # stale PTO-ISA/pto-isa mirror (lacks pto::Coalesce) until the runtime
         # submodule is bumped past simpler #806. Re-enable once that lands.
-        run: pytest tests/st/runtime/ops/test_assemble.py tests/st/runtime/ops/test_mscatter.py tests/st/runtime/framework_and_models/test_qwen3_decode_scope3_mixed.py tests/st/runtime/control_flow/test_dyn_orch_shape.py::TestDynOrchShapeOperations::test_dyn_orch_paged_attention -v --platform=a5sim --forked --pto-isa-commit=2c607938 -k "not TestMscatter"
+        run: pytest tests/st/runtime/ops/test_assemble.py tests/st/runtime/ops/test_mscatter.py tests/st/runtime/framework_and_models/test_qwen3_decode_scope3_mixed.py tests/st/runtime/control_flow/test_dyn_orch_shape.py::TestDynOrchShapeOperations::test_dyn_orch_paged_attention -v --platform=a5sim --forked --pto-isa-commit=8bd3ac8f30bd237f9eaf12c142002a5cc0edb143 -k "not TestMscatter"
 
       - name: Test A5 cross-core system tests (simulator)
-        run: pytest tests/st/runtime/cross_core/test_cross_core.py -v --forked --platform=a5sim --pto-isa-commit=2c607938
+        run: pytest tests/st/runtime/cross_core/test_cross_core.py -v --forked --platform=a5sim --pto-isa-commit=8bd3ac8f30bd237f9eaf12c142002a5cc0edb143
 
       - name: Test A2A3 cross-core system tests (simulator)
-        run: pytest tests/st/runtime/cross_core/test_cross_core.py -v --forked --platform=a2a3sim --pto-isa-commit=2c607938
+        run: pytest tests/st/runtime/cross_core/test_cross_core.py -v --forked --platform=a2a3sim --pto-isa-commit=8bd3ac8f30bd237f9eaf12c142002a5cc0edb143

From 03f123a94d99e0bf594995e57124e8ea39eab2c2 Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Mon, 1 Jun 2026 10:04:04 +0800
Subject: [PATCH 11/12] chore: trigger CI


From 55e2d3eb89402608fb265a16538d4daa7baa8344 Mon Sep 17 00:00:00 2001
From: Youhezhen <youhezhen@huawei.com>
Date: Mon, 1 Jun 2026 10:53:20 +0800
Subject: [PATCH 12/12] test(scatter): un-skip index-form 2-byte cases
 (fp16/bf16/int16)

The fp16/bf16/int16 index-form ST cases now run on device under the
bumped pto-isa pin; remove the temporary @pytest.mark.skip markers.
---
 tests/st/runtime/ops/test_scatter.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/st/runtime/ops/test_scatter.py b/tests/st/runtime/ops/test_scatter.py
index d36e3a036..6870185c3 100644
--- a/tests/st/runtime/ops/test_scatter.py
+++ b/tests/st/runtime/ops/test_scatter.py
@@ -575,19 +575,16 @@ def test_scatter_int32(self, test_runner, platform):
         result = test_runner.run(ScatterINT32TestCase(platform=platform))
         assert result.passed, f"Test failed: {result.error}"
 
-    @pytest.mark.skip(reason="index-form 2-byte (fp16/bf16/int16) fails on a pto-isa bug; skip pending fix")
     @pytest.mark.parametrize("platform", PLATFORMS)
     def test_scatter_fp16(self, test_runner, platform):
         result = test_runner.run(ScatterFP16TestCase(platform=platform))
         assert result.passed, f"Test failed: {result.error}"
 
-    @pytest.mark.skip(reason="index-form 2-byte (fp16/bf16/int16) fails on a pto-isa bug; skip pending fix")
     @pytest.mark.parametrize("platform", PLATFORMS)
     def test_scatter_bf16(self, test_runner, platform):
         result = test_runner.run(ScatterBF16TestCase(platform=platform))
         assert result.passed, f"Test failed: {result.error}"
 
-    @pytest.mark.skip(reason="index-form 2-byte (fp16/bf16/int16) fails on a pto-isa bug; skip pending fix")
     @pytest.mark.parametrize("platform", PLATFORMS)
     def test_scatter_int16(self, test_runner, platform):
         result = test_runner.run(ScatterINT16TestCase(platform=platform))