From 422a2c90b544ff41dfc2719bf8b00366969c1447 Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Mon, 22 Jun 2026 23:48:22 +0800
Subject: [PATCH 01/11] Add LogProb reference operator interface(reuse)

---
 docs/operators/fused-logp.md               |  35 +++--
 rl_engine/kernels/ops/pytorch/loss/logp.py |  20 ++-
 tests/test_logp.py                         | 144 +++++++++++++++++++++
 3 files changed, 187 insertions(+), 12 deletions(-)
 create mode 100644 tests/test_logp.py

diff --git a/docs/operators/fused-logp.md b/docs/operators/fused-logp.md
index d5008e5..ccc0f61 100644
--- a/docs/operators/fused-logp.md
+++ b/docs/operators/fused-logp.md
@@ -13,21 +13,36 @@ logp_op = kernel_registry.get_op("logp")
 output = logp_op(logits, token_ids)
 ```
 
+The PyTorch native reference also exposes the Issue #108 interface:
+
+```python
+from rl_engine.kernels.ops.pytorch.loss.logp import NativeLogpOp
+
+logp_ref = NativeLogpOp()
+output = logp_ref.forward(logits, token_ids)
+reference = logp_ref.forward_fp32(logits, token_ids)
+```
+
+`apply(...)` and `apply_fp32(...)` remain available as backward-compatible aliases.
+
 ## Backends
 
 | Backend | Wrapper | Native symbol | Notes |
 | --- | --- | --- | --- |
 | CUDA SM90 | `FusedLogpSM90Op` | `_C.fused_logp_sm90` | TMA-oriented path for Hopper-class GPUs. |
 | CUDA generic | `FusedLogpGenericOp` | `_C.fused_logp` | Generic compiled extension fallback. |
-| PyTorch native | `NativeOp` | None | Baseline fallback path. |
+| PyTorch native | `NativeLogpOp` | None | PyTorch baseline/reference path. |
 
 ## Tensor Contract
 
 | Argument | Shape | Dtype | Requirements |
 | --- | --- | --- | --- |
-| `logits` | `[N, V]` | `bfloat16` for SM90 path | Contiguous, on the target device. |
-| `token_ids` / `labels` | `[N]` | Converted to `int32` | Same logical device as `logits`. |
-| Output | `[N]` | Backend-defined tensor dtype | One selected log probability per row. |
+| `logits` | `[..., V]` | Floating point | Contiguous for fused CUDA paths; arbitrary leading dimensions. |
+| `token_ids` / `labels` | `[...]` | Integer | Must match `logits.shape[:-1]`. |
+| Output | `[...]` | See below | One selected log probability per row. |
+
+For `NativeLogpOp`, `forward(...)` returns the input dtype and `forward_fp32(...)`
+returns `torch.float32`.
 
 ## Reference Semantics
 
@@ -39,16 +54,20 @@ ref = torch.gather(ref, dim=-1, index=token_ids.unsqueeze(-1).long()).squeeze(-1
 ## Tests
 
 ```bash
-python tests/test_op_accuracy.py
+python -m pytest tests/test_logp.py -q
+python -m pytest tests/test_op_accuracy.py -q
 ```
 
-The current accuracy test compares the dispatched operator with a PyTorch reference and
-uses a dtype-dependent threshold.
+`tests/test_logp.py` covers the PyTorch reference contract, dtype behavior,
+backward-compatible aliases, batch invariance, and registry dispatch. The existing
+operator accuracy tests continue to validate native/CUDA fused API compatibility.
 
 ## Implementation Files
 
 - `rl_engine/kernels/registry.py`
-- `rl_engine/kernels/ops/cuda.py`
+- `rl_engine/kernels/ops/pytorch/loss/logp.py`
+- `rl_engine/kernels/ops/cuda/loss/logp.py`
 - `csrc/ops.cpp`
 - `csrc/fused_logp_kernel.cu`
 - `csrc/cuda/fused_logp_sm90.cu`
+- `tests/test_logp.py`
diff --git a/rl_engine/kernels/ops/pytorch/loss/logp.py b/rl_engine/kernels/ops/pytorch/loss/logp.py
index c791927..4ef1f24 100644
--- a/rl_engine/kernels/ops/pytorch/loss/logp.py
+++ b/rl_engine/kernels/ops/pytorch/loss/logp.py
@@ -1,17 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright (c) 2026 RL-Kernel Contributors
 
+from __future__ import annotations
+
 import torch
 
 
 class NativeLogpOp:
     """Pure PyTorch native fallback for Fused LogP."""
 
-    def __init__(self):
+    op_class = "logprob"
+
+    def __init__(self) -> None:
         pass
 
     def __call__(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
-        return self.apply(logits, token_ids)
+        return self.forward(logits, token_ids)
 
     def _selected_logps(
         self,
@@ -45,14 +49,22 @@ def _validate_output_shape(self, output: torch.Tensor, logits: torch.Tensor) ->
                 f"{tuple(logits.shape[:-1])}"
             )
 
-    def apply(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
+    def forward(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
         """Baseline selected-token log probability extraction using torch.gather."""
         return self._selected_logps(logits, token_ids, output_dtype=logits.dtype)
 
-    def apply_fp32(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
+    def forward_fp32(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
         """Same as apply but forces float32 output for numerical stability."""
         return self._selected_logps(logits, token_ids, output_dtype=torch.float32)
 
+    def apply(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
+        """Backward-compatible alias for forward."""
+        return self.forward(logits, token_ids)
+
+    def apply_fp32(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
+        """Backward-compatible alias for forward_fp32."""
+        return self.forward_fp32(logits, token_ids)
+
     def indexed_out(
         self,
         logits: torch.Tensor,
diff --git a/tests/test_logp.py b/tests/test_logp.py
new file mode 100644
index 0000000..48bb950
--- /dev/null
+++ b/tests/test_logp.py
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 RL-Kernel Contributors
+
+"""Tests for NativeLogpOp, the PyTorch selected-logprob reference."""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from rl_engine.kernels.ops.pytorch.loss.logp import NativeLogpOp
+
+
+def _make_inputs(
+    batch: int,
+    seq: int,
+    vocab: int,
+    *,
+    dtype: torch.dtype = torch.float32,
+    seed: int = 123,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    gen = torch.Generator().manual_seed(seed)
+    logits = torch.randn(batch, seq, vocab, generator=gen, dtype=dtype)
+    token_ids = torch.randint(0, vocab, (batch, seq), generator=gen, dtype=torch.long)
+    return logits, token_ids
+
+
+def _reference_selected_logp(logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
+    log_probs = torch.log_softmax(logits.float(), dim=-1)
+    return torch.gather(log_probs, dim=-1, index=token_ids.long().unsqueeze(-1)).squeeze(-1)
+
+
+class TestNativeLogpOpCorrectness:
+    def test_output_shape_matches_token_ids(self):
+        op = NativeLogpOp()
+        logits, token_ids = _make_inputs(2, 16, 257)
+        out = op.forward_fp32(logits, token_ids)
+        assert out.shape == token_ids.shape
+
+    def test_forward_fp32_returns_fp32(self):
+        op = NativeLogpOp()
+        logits, token_ids = _make_inputs(2, 16, 257, dtype=torch.bfloat16)
+        out = op.forward_fp32(logits, token_ids)
+        assert out.dtype == torch.float32
+
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16])
+    def test_forward_returns_input_dtype(self, dtype):
+        op = NativeLogpOp()
+        logits, token_ids = _make_inputs(2, 16, 257, dtype=dtype)
+        out = op.forward(logits, token_ids)
+        assert out.dtype == dtype
+
+    def test_call_and_apply_alias_forward(self):
+        op = NativeLogpOp()
+        logits, token_ids = _make_inputs(2, 16, 257)
+        forward = op.forward(logits, token_ids)
+        assert torch.equal(op(logits, token_ids), forward)
+        assert torch.equal(op.apply(logits, token_ids), forward)
+
+    def test_apply_fp32_alias_forward_fp32(self):
+        op = NativeLogpOp()
+        logits, token_ids = _make_inputs(2, 16, 257)
+        assert torch.equal(op.apply_fp32(logits, token_ids), op.forward_fp32(logits, token_ids))
+
+    def test_matches_fp32_reference_bitwise(self):
+        op = NativeLogpOp()
+        logits, token_ids = _make_inputs(2, 16, 257)
+        out = op.forward_fp32(logits, token_ids)
+        ref = _reference_selected_logp(logits, token_ids)
+        assert torch.equal(out, ref)
+
+    def test_pure_function_no_inplace(self):
+        op = NativeLogpOp()
+        logits, token_ids = _make_inputs(2, 16, 257)
+        logits_orig = logits.clone()
+        token_ids_orig = token_ids.clone()
+        _ = op.forward_fp32(logits, token_ids)
+        assert torch.equal(logits, logits_orig)
+        assert torch.equal(token_ids, token_ids_orig)
+
+    def test_op_class_is_logprob(self):
+        assert NativeLogpOp.op_class == "logprob"
+
+    def test_rejects_mismatched_shapes(self):
+        op = NativeLogpOp()
+        logits = torch.randn(2, 3, 5)
+        token_ids = torch.randint(0, 5, (2, 4))
+        with pytest.raises(ValueError, match="must match"):
+            op.forward_fp32(logits, token_ids)
+
+
+class TestNativeLogpOpBatchInvariance:
+    def test_batch1_vs_batchN_bitwise(self):
+        op = NativeLogpOp()
+        logits, token_ids = _make_inputs(4, 16, 257, seed=321)
+        full_out = op.forward_fp32(logits, token_ids)
+        for row in range(logits.shape[0]):
+            single_out = op.forward_fp32(logits[row : row + 1], token_ids[row : row + 1])
+            assert torch.equal(full_out[row], single_out[0]), (
+                f"Batch invariance broken at row {row}"
+            )
+
+    def test_batch_invariance_with_padding(self):
+        op = NativeLogpOp()
+        logits_valid, token_ids_valid = _make_inputs(2, 16, 257, seed=456)
+        gen = torch.Generator().manual_seed(789)
+        logits_padding = torch.randn(3, 16, 257, generator=gen)
+        token_padding = torch.randint(0, 257, (3, 16), generator=gen)
+        logits_padded = torch.cat([logits_valid, logits_padding], dim=0)
+        token_ids_padded = torch.cat([token_ids_valid, token_padding], dim=0)
+
+        out_valid = op.forward_fp32(logits_valid, token_ids_valid)
+        out_padded = op.forward_fp32(logits_padded, token_ids_padded)
+        assert torch.equal(out_valid[0], out_padded[0])
+        assert torch.equal(out_valid[1], out_padded[1])
+
+
+class TestNativeLogpOpAccuracy:
+    @pytest.mark.parametrize(
+        "dtype, atol",
+        [
+            (torch.float32, 1e-5),
+            (torch.bfloat16, 2e-2),
+            (torch.float16, 5e-3),
+        ],
+    )
+    def test_forward_vs_fp32_within_tolerance(self, dtype, atol):
+        op = NativeLogpOp()
+        logits, token_ids = _make_inputs(2, 16, 17, dtype=dtype)
+        out_typed = op.forward(logits, token_ids).float()
+        out_fp32 = op.forward_fp32(logits, token_ids)
+        diff = (out_typed - out_fp32).abs().max().item()
+        assert torch.allclose(out_typed, out_fp32, atol=atol, rtol=0.0), (
+            f"dtype={dtype}, max_abs_error={diff:.3e} exceeds atol={atol}"
+        )
+
+
+class TestNativeLogpOpRegistry:
+    @pytest.mark.skipif(torch.cuda.is_available(), reason="CUDA dispatch may select fused logp")
+    def test_registry_returns_logp_op(self):
+        from rl_engine.kernels.registry import kernel_registry
+
+        op = kernel_registry.get_op("logp")
+        assert isinstance(op, NativeLogpOp)

From 3aa797a7bd0fb17f9027c0c04c62642b296d42c9 Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Tue, 23 Jun 2026 00:00:57 +0800
Subject: [PATCH 02/11] format with pre-commit

---
 tests/test_logp.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_logp.py b/tests/test_logp.py
index 48bb950..42492fb 100644
--- a/tests/test_logp.py
+++ b/tests/test_logp.py
@@ -96,9 +96,9 @@ def test_batch1_vs_batchN_bitwise(self):
         full_out = op.forward_fp32(logits, token_ids)
         for row in range(logits.shape[0]):
             single_out = op.forward_fp32(logits[row : row + 1], token_ids[row : row + 1])
-            assert torch.equal(full_out[row], single_out[0]), (
-                f"Batch invariance broken at row {row}"
-            )
+            assert torch.equal(
+                full_out[row], single_out[0]
+            ), f"Batch invariance broken at row {row}"
 
     def test_batch_invariance_with_padding(self):
         op = NativeLogpOp()
@@ -130,9 +130,9 @@ def test_forward_vs_fp32_within_tolerance(self, dtype, atol):
         out_typed = op.forward(logits, token_ids).float()
         out_fp32 = op.forward_fp32(logits, token_ids)
         diff = (out_typed - out_fp32).abs().max().item()
-        assert torch.allclose(out_typed, out_fp32, atol=atol, rtol=0.0), (
-            f"dtype={dtype}, max_abs_error={diff:.3e} exceeds atol={atol}"
-        )
+        assert torch.allclose(
+            out_typed, out_fp32, atol=atol, rtol=0.0
+        ), f"dtype={dtype}, max_abs_error={diff:.3e} exceeds atol={atol}"
 
 
 class TestNativeLogpOpRegistry:

From b86d8a1425398b4a2ef55ddab6b72e04c4c636f9 Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Wed, 24 Jun 2026 21:12:00 +0800
Subject: [PATCH 03/11] add tolerance table

---
 rl_engine/testing/tolerance.py            | 21 ++++++++++++++++
 rl_engine/testing/tolerance_contract.yaml | 25 +++++++++++++++++++
 tests/test_tolerance_contract.py          | 29 +++++++++++++++++++++++
 3 files changed, 75 insertions(+)
 create mode 100644 rl_engine/testing/tolerance.py
 create mode 100644 rl_engine/testing/tolerance_contract.yaml
 create mode 100644 tests/test_tolerance_contract.py

diff --git a/rl_engine/testing/tolerance.py b/rl_engine/testing/tolerance.py
new file mode 100644
index 0000000..9a1cc0b
--- /dev/null
+++ b/rl_engine/testing/tolerance.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 RL-Kernel Contributors
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+
+_CONTRACT_PATH = Path(__file__).with_name("tolerance_contract.yaml")
+
+
+def load_contract(path: str | Path = _CONTRACT_PATH) -> dict[str, Any]:
+    """Load the dtype/operator-class tolerance contract."""
+
+    with Path(path).open("r", encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+__all__ = ["load_contract"]
diff --git a/rl_engine/testing/tolerance_contract.yaml b/rl_engine/testing/tolerance_contract.yaml
new file mode 100644
index 0000000..7a2ce2b
--- /dev/null
+++ b/rl_engine/testing/tolerance_contract.yaml
@@ -0,0 +1,25 @@
+{
+  "batch_invariance": {"atol": 0.0, "rtol": 0.0},
+  "accuracy": {
+    "default": {
+      "elementwise": {
+        "float32": {"atol": 1.0e-5, "rtol": 1.0e-5},
+        "bfloat16": {"atol": 2.0e-2, "rtol": 1.6e-2},
+        "float16": {"atol": 1.0e-3, "rtol": 1.0e-3}
+      },
+      "reduction": {
+        "float32": {"atol": 1.0e-4, "rtol": 1.0e-4},
+        "bfloat16": {"atol": 5.0e-2, "rtol": 2.0e-2},
+        "float16": {"atol": 1.0e-3, "rtol": 1.0e-3}
+      },
+      "logprob": {
+        "float32": {"atol": 1.0e-5, "rtol": 0.0},
+        "bfloat16": {"atol": 5.0e-2, "rtol": 0.0},
+        "float16": {"atol": 5.0e-3, "rtol": 0.0}
+      }
+    },
+    "arch_overrides": {
+      "sm90": {}
+    }
+  }
+}
diff --git a/tests/test_tolerance_contract.py b/tests/test_tolerance_contract.py
new file mode 100644
index 0000000..0e62d7f
--- /dev/null
+++ b/tests/test_tolerance_contract.py
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 RL-Kernel Contributors
+
+from __future__ import annotations
+
+from rl_engine.testing.tolerance import load_contract
+
+
+def test_load_contract_contains_expected_operator_classes():
+    contract = load_contract()
+    accuracy = contract["accuracy"]["default"]
+    assert set(accuracy) == {"elementwise", "reduction", "logprob"}
+
+
+def test_load_contract_contains_expected_dtypes():
+    contract = load_contract()
+    for op_class in ("elementwise", "reduction", "logprob"):
+        assert set(contract["accuracy"]["default"][op_class]) == {
+            "float32",
+            "bfloat16",
+            "float16",
+        }
+
+
+def test_logprob_bfloat16_tolerance_covers_observed_reference_drift():
+    contract = load_contract()
+    tolerance = contract["accuracy"]["default"]["logprob"]["bfloat16"]
+    assert tolerance["atol"] >= 5.0e-2
+    assert tolerance["rtol"] == 0.0

From 7defbddec479cbbcf1b6fd510745fcfb4d53d190 Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Sun, 28 Jun 2026 05:30:59 +0800
Subject: [PATCH 04/11] Add kernel gtest operator checker

---
 docs/contributing/issue-108-session-log.md    | 1044 +++++++++++++++++
 rl_engine/kernels/gtest/__init__.py           |   10 +
 rl_engine/kernels/gtest/op_checks.py          |  292 +++++
 rl_engine/kernels/gtest/operator_inputs.py    |  244 ++++
 rl_engine/kernels/gtest/operator_specs.py     |  107 ++
 .../{testing => kernels/gtest}/tolerance.py   |    0
 .../gtest}/tolerance_contract.yaml            |    0
 scripts/check_operator.py                     |  116 ++
 tests/test_op_checks.py                       |  129 ++
 tests/test_operator_inputs.py                 |   70 ++
 tests/test_tolerance_contract.py              |    2 +-
 11 files changed, 2013 insertions(+), 1 deletion(-)
 create mode 100644 docs/contributing/issue-108-session-log.md
 create mode 100644 rl_engine/kernels/gtest/__init__.py
 create mode 100644 rl_engine/kernels/gtest/op_checks.py
 create mode 100644 rl_engine/kernels/gtest/operator_inputs.py
 create mode 100644 rl_engine/kernels/gtest/operator_specs.py
 rename rl_engine/{testing => kernels/gtest}/tolerance.py (100%)
 rename rl_engine/{testing => kernels/gtest}/tolerance_contract.yaml (100%)
 create mode 100644 scripts/check_operator.py
 create mode 100644 tests/test_op_checks.py
 create mode 100644 tests/test_operator_inputs.py

diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md
new file mode 100644
index 0000000..f49ddde
--- /dev/null
+++ b/docs/contributing/issue-108-session-log.md
@@ -0,0 +1,1044 @@
+# ISSUE-108 Session Log
+
+本文档记录本 session 中围绕 RL-Kernel 算子测试框架、CUDA 验证和 upstream 同步的所有关键修改。后续本 session 中每次代码修改都必须继续追加到本文档，记录目标、设计判断、修改文件、验证方式和结果。
+
+## 记录原则
+
+- 使用中文记录工程判断。
+- 每个改动都需要说明为什么做，而不只记录改了什么。
+- 保持最小增量：一次修改尽量只围绕一个明确问题。
+- 每个子任务需要能独立验证；无法验证时必须明确说明原因。
+- CUDA 验证必须如实记录环境、命令、误差和失败范围。
+- 不把失败路径写成已支持能力。
+
+## 总体目标
+
+本 session 的目标是把算子验证从零散脚本推进为可复用、可扩展、可审查的工程化框架：
+
+- 建立统一 tolerance contract，用于管理不同算子的误差阈值。
+- 建立公共 operator check runner，替代单算子专用验证脚本。
+- 建立统一 operator input 生成逻辑，覆盖后训练常见算子的基础输入。
+- 将测试入口改造成可指定 `op`、`candidate`、`dtype`、`device`、shape 参数的 CLI。
+- 同步 upstream/main，吸收 PR #122 中的 SM90 相关修复。
+- 在 H20 机器上验证普通 CUDA `fused_logp` 路径。
+- 明确 SM90 `fused_linear_logp` 在 CUDA 12.4 下仍未通过。
+
+## 时间线
+
+### 1. tolerance table 和 contract loader
+
+目标：
+
+- 将不同 dtype、op class 的误差容差从测试代码中抽离出来。
+- 让误差阈值可以被审查和维护，而不是散落在测试断言中。
+
+修改文件：
+
+- `rl_engine/testing/tolerance.py`
+- `rl_engine/testing/tolerance_contract.yaml`
+- `tests/test_tolerance_contract.py`
+
+设计判断：
+
+- 使用 YAML 保存 contract，便于人工 review。
+- 将容差按 `accuracy.default` 和可选硬件 override 组织。
+- `default` 是通用 fallback，不等同于 CPU；CPU、SM90、SM100、ROCm、Ascend 等未来可作为明确 override key。
+
+验证：
+
+- `tests/test_tolerance_contract.py` 验证 contract 可读、结构正确。
+
+结果：
+
+- tolerance contract 框架建立完成。
+
+### 2. operator check runner
+
+目标：
+
+- 建立类似 GoogleTest 思路的算子验证 runner。
+- 一个 case 表示一组确定输入和 gold path，一个 candidate 表示被测实现。
+
+修改文件：
+
+- `rl_engine/testing/op_checks.py`
+- `tests/test_op_checks.py`
+- `rl_engine/testing/__init__.py`
+
+设计判断：
+
+- `OperatorCase` 表示测试对象：`name`、`op_class`、`dtype`、`inputs`、`gold_fn`。
+- `CandidateSpec` 表示被测实现：`name`、`fn`、`backend`、`arch_key`。
+- runner 负责：
+  - 调用 candidate。
+  - 调用 gold。
+  - flatten 多输出。
+  - 按 `op_class + dtype + arch_key` 解析容差。
+  - 计算 `max_abs_error`、`mean_abs_error`、`max_rel_error`。
+  - 返回结构化 report。
+
+验证：
+
+- `tests/test_op_checks.py` 覆盖 native logp、registry logp、失败 candidate、arch override 等场景。
+
+结果：
+
+- 公共 operator check runner 建立完成。
+
+### 3. `check_operator.py` 从 logp 专用入口改为公共入口
+
+目标：
+
+- 让测试者通过 CLI 指定算子、candidate、dtype、device 和 shape。
+- 避免后续每个算子都写一个独立测试脚本。
+
+修改文件：
+
+- `scripts/check_operator.py`
+
+设计判断：
+
+- `check_operator.py` 只负责：
+  - 解析参数。
+  - 选择 device/dtype。
+  - 调用 `make_candidate`。
+  - 调用 `make_operator_case`。
+  - 调用 `run_operator_suite`。
+  - 输出 summary 或 JSON。
+- 不在入口中硬编码具体算子实现。
+
+验证：
+
+```bash
+python scripts/check_operator.py --op logp --candidate pytorch --dtype fp32 --batch 1 --seq 4 --vocab 17
+python scripts/check_operator.py --op logp --candidate registry --dtype bf16 --batch 2 --seq 16 --vocab 257 --json
+python -m pytest tests/test_op_checks.py -q
+```
+
+结果：
+
+- 公共 CLI 最小闭环通过。
+
+### 4. 抽离 operator specs
+
+目标：
+
+- 避免新增算子时修改测试入口。
+- 将算子元信息集中到专门文件。
+
+修改文件：
+
+- `rl_engine/testing/operator_specs.py`
+
+设计判断：
+
+- 每个算子通过 `OperatorSpec` 描述：
+  - `name`
+  - `op_class`
+  - `gold_path`
+  - `registry_name`
+  - `candidate_paths`
+- `check_operator.py` 不直接知道某个算子的 Python 类路径。
+- `--candidate cuda` 明确选择 CUDA candidate。
+- `--candidate registry` 仅用于测试 dispatcher 分发结果，不作为具体 CUDA correctness 的替代。
+
+当前 logp 映射：
+
+```text
+pytorch      -> NativeLogpOp
+cuda         -> FusedLogpGenericOp
+cuda-generic -> FusedLogpGenericOp
+cuda-sm90    -> FusedLogpSM90Op
+registry     -> kernel_registry.get_op("logp")
+```
+
+结果：
+
+- 后续新增算子主要扩展 `operator_specs.py`，不再修改公共入口。
+
+### 5. 统一 operator input 工厂
+
+目标：
+
+- 用户指出不希望每个新算子都手写 `_make_xxx_inputs` 和 `_xxx_shape_name`。
+- 统一准备 ISSUE #108 中所有算子的输入初始化。
+
+修改文件：
+
+- `rl_engine/testing/operator_inputs.py`
+- `tests/test_operator_inputs.py`
+- `rl_engine/testing/operator_specs.py`
+- `scripts/check_operator.py`
+
+设计判断：
+
+- 新增 `make_operator_inputs(op_name, args, dtype, device)`。
+- 新增 `operator_shape_name(op_name, args)`。
+- 支持 `random` 和 `constant` 两种输入模式：
+  - `random` 用 seed 控制可复现。
+  - `constant` 用固定值便于 debug。
+- 支持的算子输入：
+  - `rms_norm`
+  - `matmul`
+  - `attention`
+  - `logp`
+  - `rope`
+  - `silu`
+  - `swiglu`
+  - `embedding`
+  - `lm_head`
+  - `kv_cache_attention`
+
+CLI 增加参数：
+
+```text
+--input-mode random|constant
+--constant-value
+--token-value
+--normalized-dim
+--k-dim
+--n-dim
+--theta
+--eps
+```
+
+验证：
+
+```bash
+python -m pytest tests/test_operator_inputs.py -q
+python -m pytest tests/test_op_checks.py -q
+python scripts/check_operator.py --op logp --candidate pytorch --dtype fp32 --batch 1 --seq 4 --vocab 17 --input-mode constant --constant-value 0.5 --token-value 3
+```
+
+结果：
+
+- 多算子输入生成能力建立。
+- logp CLI 随机输入和固定输入均通过。
+
+### 6. 删除模型命名和档位参数
+
+目标：
+
+- 用户指出测试框架不应绑定某个模型，也不需要冗余 `--size` 档位。
+- 输入生成应表达为通用数据规模，而不是某个模型配置。
+
+修改文件：
+
+- `rl_engine/testing/operator_inputs.py`
+- `scripts/check_operator.py`
+- `tests/test_operator_inputs.py`
+
+设计判断：
+
+- 删除 `QWEN3_8B` 命名。
+- 删除 `ModelShape` / `DEFAULT_MODEL_SHAPE` 抽象。
+- 改为普通常量：
+
+```python
+DEFAULT_HIDDEN = 4096
+DEFAULT_N_HEADS = 32
+DEFAULT_N_KV_HEADS = 8
+DEFAULT_HEAD_DIM = 128
+DEFAULT_INTERMEDIATE = 12288
+DEFAULT_VOCAB = 151936
+DEFAULT_ROPE_THETA = 1.0e6
+DEFAULT_RMS_EPS = 1.0e-6
+```
+
+- 删除 `--size small|medium|large`。
+- 仅保留显式 `--batch` 和 `--seq`。
+
+验证：
+
+```bash
+rg -n "Qwen|QWEN|qwen" rl_engine/testing scripts tests
+rg -n "ModelShape|DEFAULT_MODEL|model" rl_engine/testing/operator_inputs.py
+rg -n "BATCH_SHAPES|BatchShape|--size|small|medium|large" scripts/check_operator.py rl_engine/testing/operator_inputs.py tests/test_operator_inputs.py
+python -m pytest tests/test_operator_inputs.py tests/test_op_checks.py -q
+```
+
+结果：
+
+- 模型耦合和档位参数均已删除。
+- 相关测试通过。
+
+### 7. 同步 upstream/main 并处理冲突
+
+目标：
+
+- 用户要求检查当前代码是否过旧，拉取最新代码，如有冲突则解决。
+
+操作：
+
+```bash
+git fetch --all --prune
+git stash push -u -m pre-upstream-main-sync
+git rebase upstream/main
+git stash pop
+```
+
+冲突文件：
+
+- `csrc/cuda/fused_logp_sm90.cu`
+
+设计判断：
+
+- `upstream/main` 已包含 PR #122。
+- PR #122 中已经包含 SM90 文件的两项修复：
+  - `#include <math_constants.h>`
+  - `reinterpret_cast<const nv_bfloat16*>`
+- 因此冲突解决时采用 upstream/main 版本。
+- 丢弃本地临时加入的 `#include <cuda_runtime.h>`。
+
+验证：
+
+```bash
+python -m pytest tests/test_operator_inputs.py tests/test_op_checks.py -q
+```
+
+结果：
+
+- rebase 到最新 `upstream/main` 成功。
+- 冲突解决完成。
+- 本地相对 `upstream/main` 为 `ahead 3, behind 0`。
+
+### 8. H20 CUDA 环境和普通 CUDA logp 验证
+
+目标：
+
+- 将测试框架迁移到 H 系列 GPU 环境验证。
+- 先确认普通 CUDA `fused_logp` 路径是否可用。
+
+环境记录：
+
+```text
+GPU: NVIDIA H20
+Driver: 565.57.01
+Driver CUDA capability: 12.7
+nvcc: 12.4
+Python: 3.11.15
+```
+
+普通 CUDA 扩展检查：
+
+```text
+_EXT_AVAILABLE: True
+has fused_logp: True
+has fused_logp_sm90: False
+```
+
+验证命令：
+
+```bash
+python scripts/check_operator.py \
+  --op logp \
+  --candidate cuda \
+  --device cuda \
+  --dtype bf16 \
+  --arch-key sm90 \
+  --batch 2 \
+  --seq 16 \
+  --vocab 257
+```
+
+输出：
+
+```text
+INFO [RL-Kernel]: Successfully linked to precompiled _C.fused_logp fallback kernel.
+suite=logp passed=True pass_rate=1.0000
+candidate=cuda-logp backend=cuda passed=True pass_rate=1.0000
+case=logp-torch.bfloat16-2x16x257 output=0 shape=(2, 16)
+max_abs=1.49779320e-02
+mean_abs=7.53845274e-03
+max_rel=2.70811981e-03
+tol=(atol=5.000e-02, rtol=0.000e+00)
+passed=True
+```
+
+结论：
+
+- 普通 CUDA `FusedLogpGenericOp -> _C.fused_logp` 路径通过。
+- 这证明测试框架最小 GPU 闭环已经打通：
+
+```text
+CLI
+-> operator_specs
+-> operator_inputs
+-> PyTorch gold
+-> CUDA candidate
+-> run_operator_suite
+-> tolerance contract
+-> compare_output
+-> structured report
+```
+
+### 9. SM90 fused_linear_logp 当前状态
+
+目标：
+
+- 尝试编译和验证 SM90 路径。
+
+结果：
+
+- `fused_logp_sm90` 的旧 include 和 type 问题已由 upstream PR #122 解决。
+- 但 `fused_linear_logp_sm90.cu` 在 CUDA 12.4 下仍未通过 ptxas。
+
+错误摘要：
+
+```text
+ptxas error: State space incorrect for instruction 'cp.async.bulk.tensor'
+ptxas fatal: Ptx assembly aborted due to errors
+```
+
+设计判断：
+
+- PR #122 描述中提到相关 SM90 路径在 CUDA 13.1 下 assembled。
+- 当前 H20 环境是 nvcc 12.4，不应将该路径写为已通过。
+- 当前应先以普通 CUDA `fused_logp` 作为验证通过范围。
+
+结论：
+
+- 已通过：`--candidate cuda`
+- 未通过：`--candidate cuda-sm90` / SM90 fused linear logp
+
+## 当前文件状态摘要
+
+本 session 产生或涉及的主要文件：
+
+```text
+rl_engine/testing/tolerance.py
+rl_engine/testing/tolerance_contract.yaml
+tests/test_tolerance_contract.py
+rl_engine/testing/op_checks.py
+tests/test_op_checks.py
+rl_engine/testing/__init__.py
+scripts/check_operator.py
+rl_engine/testing/operator_specs.py
+rl_engine/testing/operator_inputs.py
+tests/test_operator_inputs.py
+csrc/cuda/fused_logp_sm90.cu
+docs/contributing/issue-108-session-log.md
+```
+
+说明：
+
+- `AGENTS.md` 是未跟踪文件，未纳入本 session 的代码修改范围。
+- `csrc/cuda/fused_logp_sm90.cu` 最终与 upstream PR #122 版本一致。
+
+## 后续记录模板
+
+之后每次代码修改都在本文档追加如下条目：
+
+```markdown
+### YYYY-MM-DD HH:MM - 变更标题
+
+目标：
+
+- 本次最小子任务要解决什么问题。
+
+修改文件：
+
+- `path/to/file.py`
+
+设计决策：
+
+- 为什么这样改。
+- 为什么没有选择其他方案。
+
+验证方式：
+
+- 执行的测试命令。
+- CUDA 环境，如 GPU、CUDA 版本、driver、arch。
+- 关键输出指标。
+
+结果：
+
+- 通过 / 未通过 / 部分通过。
+- 未通过时必须记录完整错误摘要。
+
+后续：
+
+- 是否需要继续拆分子任务。
+- 是否影响 CI、benchmark 或其他算子。
+```
+
+CUDA 验证建议额外记录：
+
+```markdown
+GPU:
+CUDA:
+Driver:
+Arch:
+Candidate:
+Backend:
+Command:
+max_abs:
+mean_abs:
+max_rel:
+atol:
+rtol:
+Result:
+Known issue:
+```
+
+### 2026-06-28 - CUDA 13 CUB reduce functor 兼容修复
+
+目标：
+
+- 修复 H100 + CUDA 13.0 环境下 SM90 编译失败的问题。
+
+修改文件：
+
+- `csrc/cuda/fused_logp_sm90.cu`
+- `docs/contributing/issue-108-session-log.md`
+
+错误摘要：
+
+```text
+csrc/cuda/fused_logp_sm90.cu(76): error: namespace "cub" has no member "Max"
+csrc/cuda/fused_logp_sm90.cu(86): error: namespace "cub" has no member "Sum"
+```
+
+设计决策：
+
+- 不继续依赖 CUB 内置 `cub::Max()` 和 `cub::Sum()` functor 名称。
+- 在当前 SM90 文件内定义本地 `FloatMax` 和 `FloatSum`，传给 `cub::BlockReduce::Reduce`。
+- 这样保留原有 reduction 语义，同时规避 CUDA 13 / CCCL 中 CUB functor API 变化。
+
+验证方式：
+
+- 本地只做源码修改；需要在 H100 + CUDA 13.0 机器上重新执行：
+
+```bash
+rm -rf build
+find rl_engine -name "*.so" -delete
+
+export CUDA_HOME=/usr/local/cuda
+export PATH=$CUDA_HOME/bin:$PATH
+export LD_LIBRARY_PATH=$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}
+export OMP_NUM_THREADS=8
+export MAX_JOBS=1
+
+KERNEL_ALIGN_FORCE_SM90=1 pip install -v --no-build-isolation -e . 2>&1 | tee build_sm90.log
+```
+
+结果：
+
+- 待 H100 机器重新编译确认。
+
+后续：
+
+- 如果继续失败，优先查看 `grep -nE "FAILED:|error:|ptxas|fatal" build_sm90.log | head -n 80`。
+
+### 2026-06-28 - 对齐 SM90 LogP Python wrapper 输入接口
+
+目标：
+
+- 修复 `check_operator.py --candidate cuda-sm90` 调用失败的问题。
+
+修改文件：
+
+- `rl_engine/kernels/ops/cuda/loss/logp.py`
+- `docs/contributing/issue-108-session-log.md`
+
+错误摘要：
+
+```text
+TypeError: FusedLogpSM90Op.__call__() got an unexpected keyword argument 'token_ids'
+```
+
+设计决策：
+
+- 测试框架统一通过 `case.inputs` 传递 `token_ids`。
+- `FusedLogpGenericOp`、`NativeLogpOp` 都使用 `token_ids` 命名。
+- 因此将 `FusedLogpSM90Op.__call__(logits, labels)` 改为 `__call__(logits, token_ids)`，让 candidate 接口与 gold/case 输入一致。
+- 同时在 wrapper 内部把 `[B, S, V]` logits reshape 为 `[B*S, V]`，把 `[B, S]` token ids flatten 为 `[B*S]`，再把 `_C.fused_logp_sm90` 的 `[B*S]` 输出 reshape 回 `[B, S]`。
+
+验证方式：
+
+- 需要在 H100 + CUDA 13.0 机器上同步该 patch 后运行：
+
+```bash
+python scripts/check_operator.py \
+  --op logp \
+  --candidate cuda-sm90 \
+  --device cuda \
+  --dtype bf16 \
+  --arch-key sm90 \
+  --batch 2 \
+  --seq 16 \
+  --vocab 257
+```
+
+结果：
+
+- 已撤回该方向。`rl_engine/kernels/ops/cuda/loss/logp.py` 属于被测 CUDA 算子实现，不应为了测试框架改动其接口。
+- 后续适配应放在 testing 层，例如在 candidate adapter 中把测试框架统一的 `token_ids` 映射为 SM90 wrapper 需要的 `labels`，并处理 flatten/reshape。
+
+### 2026-06-28 - 在 testing 层适配 SM90 LogP candidate
+
+目标：
+
+- 保持 `rl_engine/kernels/ops/cuda` 下被测实现不变。
+- 让 `check_operator.py --candidate cuda-sm90` 可以使用测试框架统一的 `token_ids` 输入。
+
+修改文件：
+
+- `rl_engine/testing/operator_specs.py`
+- `docs/contributing/issue-108-session-log.md`
+
+设计决策：
+
+- 新增 `_LogpSM90CandidateAdapter`，只在 `args.op == "logp"` 且 `candidate == "cuda-sm90"` 时使用。
+- adapter 接收测试框架标准输入 `logits` 和 `token_ids`。
+- adapter 内部把 `logits` 从 `[B, S, V]` flatten 为 `[B*S, V]`，把 `token_ids` 从 `[B, S]` flatten 为 `[B*S]`。
+- adapter 调用原始 SM90 candidate：`self._candidate(logits_2d, labels_1d)`。
+- adapter 将输出 reshape 回 `[B, S]`，以便 `compare_output` 按原始 case shape 比较。
+
+验证方式：
+
+- 本地执行 Python 测试和编译检查：
+
+```bash
+python -m py_compile rl_engine/testing/operator_specs.py
+python -m pytest tests/test_op_checks.py tests/test_operator_inputs.py -q
+```
+
+- H100 机器需要重新运行：
+
+```bash
+python scripts/check_operator.py \
+  --op logp \
+  --candidate cuda-sm90 \
+  --device cuda \
+  --dtype bf16 \
+  --arch-key sm90 \
+  --batch 2 \
+  --seq 16 \
+  --vocab 257
+```
+
+结果：
+
+- 本地待验证；H100 CUDA 结果待重新运行确认。
+
+### 2026-06-28 - H100 CUDA generic 与 SM90 LogP 对照验证
+
+目标：
+
+- 记录 H100 + CUDA 13.0 环境下 `logp` 的 generic CUDA 和 SM90 candidate 行为差异。
+- 明确测试框架已能区分“通过的 CUDA generic candidate”和“编译/运行存在问题的 SM90 candidate”。
+
+环境：
+
+```text
+GPU: NVIDIA H100 80GB HBM3
+Driver: 580.95.05
+CUDA driver capability: 13.0
+nvcc: 13.0
+Python: 3.12.13
+torch: 2.12.0+cu130
+torch cuda: 13.0
+compute capability: (9, 0)
+```
+
+修改文件：
+
+- `docs/contributing/issue-108-session-log.md`
+
+验证命令：
+
+```bash
+for v in 256 512 1024 2048 4096; do
+  echo "=== vocab=$v ==="
+  python scripts/check_operator.py \
+    --op logp \
+    --candidate cuda \
+    --device cuda \
+    --dtype bf16 \
+    --arch-key sm90 \
+    --batch 1 \
+    --seq 1 \
+    --vocab $v
+done
+```
+
+generic CUDA 结果：
+
+```text
+vocab=256:  passed=True, max_abs=5.77497482e-03
+vocab=512:  passed=True, max_abs=8.04328918e-03
+vocab=1024: passed=True, max_abs=1.80721283e-04
+vocab=2048: passed=True, max_abs=1.77164078e-02
+vocab=4096: passed=True, max_abs=2.69813538e-02
+atol=5.000e-02, rtol=0.000e+00
+```
+
+结论：
+
+- `FusedLogpGenericOp -> _C.fused_logp` 在 H100 + CUDA 13.0 上多 vocab correctness 全部通过。
+- 这进一步确认测试框架、input 生成、gold path、candidate 调用和 compare_output 链路是通的。
+
+SM90 对照现象：
+
+```text
+TILE_V=4096:
+  vocab=257/4096/151936 均在 cuTensorMapEncodeTiled 失败。
+  错误：CUDA_ERROR_INVALID_VALUE。
+
+TILE_V=256:
+  vocab=256 返回结果，但 passed=False，max_abs≈1.04094028e+00。
+  vocab=512/1024/2048/4096 在 20s timeout 下没有输出 report，表现为 hang/timeout。
+```
+
+结论：
+
+- `cuda-sm90` 已能编译和加载，但当前 SM90 TMA kernel 仍不能标记为通过。
+- 当前通过范围只包括 `--candidate cuda` generic CUDA logp。
+- SM90 问题应作为独立 CUDA kernel bugfix 处理，不归因于测试框架。
+
+### 2026-06-28 - 最终整理：不提交 CUDA 源码改动
+
+目标：
+
+- 用户明确要求本阶段不修改 `csrc` 下 CUDA/TMA 源码。
+- 本阶段只提交算子测试框架和文档，不把 SM90 kernel 实验 patch 混入测试框架 PR。
+
+本地处理：
+
+- 已还原 `csrc/cuda/fused_logp_sm90.cu`。
+- 本地 `csrc/utils/tma_utils.cuh` 没有 diff。
+- 因此本地最终不会提交任何 `csrc` 改动。
+
+服务器状态对照：
+
+用户在 H100 服务器上看到：
+
+```text
+Changes not staged for commit:
+  modified:   csrc/cuda/fused_logp_sm90.cu
+  modified:   csrc/utils/tma_utils.cuh
+```
+
+本地当前状态不同：
+
+```text
+csrc/cuda/fused_logp_sm90.cu: no diff after restore
+csrc/utils/tma_utils.cuh: no diff locally
+```
+
+结论：
+
+- H100 服务器上的 `csrc/utils/tma_utils.cuh` 改动不是本地当前工作区的一部分。
+- 如果服务器要回到与本地一致的测试框架提交状态，需要在服务器上还原两个 CUDA/TMA 文件：
+
+```bash
+git restore csrc/cuda/fused_logp_sm90.cu csrc/utils/tma_utils.cuh
+```
+
+保留记录的 CUDA 现象：
+
+- H100 环境：
+
+```text
+GPU: NVIDIA H100 80GB HBM3
+Driver: 580.95.05
+CUDA driver capability: 13.0
+nvcc: 13.0.88
+Python: 3.12.13
+torch: 2.12.0+cu130
+torch cuda: 13.0
+compute capability: (9, 0)
+```
+
+- `--candidate cuda` generic logp 在 vocab 256/512/1024/2048/4096 上通过。
+- `--candidate cuda-sm90` 可以编译和加载，但：
+  - `TILE_V=4096` 触发 `cuTensorMapEncodeTiled failed`。
+  - `TILE_V=256` 时 vocab=256 返回但数值不通过，vocab>=512 出现 timeout/hang。
+- 所以 SM90 fused logp 目前记录为 CUDA kernel 问题，不作为测试框架失败。
+
+### 2026-06-28 - 测试框架目录归位
+
+目标：
+
+- 用户指出测试框架文件放在通用 `rl_engine/testing` 下过于分散。
+- 本阶段将 operator correctness checking 代码移动到 kernel 相关目录，避免和 RL batch/reference testing 混在一起。
+
+最终目录：
+
+```text
+rl_engine/kernels/gtest/
+  __init__.py
+  op_checks.py
+  operator_inputs.py
+  operator_specs.py
+  tolerance.py
+  tolerance_contract.yaml
+```
+
+职责划分：
+
+- `op_checks.py`
+  - 定义 `OperatorCase`、`CandidateSpec`、report dataclass。
+  - 调用 gold 和 candidate。
+  - flatten 输出。
+  - 解析 tolerance。
+  - 计算 `max_abs_error`、`mean_abs_error`、`max_rel_error`。
+  - 返回通过率和结构化 report。
+- `operator_inputs.py`
+  - 统一构造标准语义输入。
+  - 支持 `random` 和 `constant`。
+  - 支持 `batch`、`seq`、`vocab` 等 CLI 参数。
+  - 当前覆盖 ISSUE-108 相关算子的输入初始化骨架。
+- `operator_specs.py`
+  - 注册每个算子的 gold path 和 candidate path。
+  - gold path 必须来自 `rl_engine.kernels.ops.pytorch`。
+  - candidate path 来自 `cuda`、`triton`、`rocm` 或未来 backend。
+- `tolerance.py` 和 `tolerance_contract.yaml`
+  - 加载 dtype/operator-class 容差表。
+  - 供 `op_checks.py` 在 compare output 时解析 `atol` 和 `rtol`。
+- `scripts/check_operator.py`
+  - 命令行入口。
+  - 不直接硬编码具体算子实现。
+
+导入边界：
+
+- operator checking 框架只从 `rl_engine.kernels.gtest` 导入。
+- `rl_engine/testing/__init__.py` 不导出 `CandidateSpec`、`OperatorCase`、`run_operator_suite`。
+- 这样可以避免 kernel correctness checking 和通用 RL testing helper 混在一起。
+
+### 2026-06-28 - 添加新算子的傻瓜式流程
+
+目标：
+
+- 新增算子时不修改测试主逻辑。
+- 新增算子只改注册信息、输入工厂和必要测试。
+- gold 永远使用 `rl_engine/kernels/ops/pytorch` 下实现。
+
+步骤 1：确认算子标准接口
+
+先确定这个算子的标准语义输入。例如：
+
+```text
+logp:
+  inputs:
+    logits: [B, S, V]
+    token_ids: [B, S]
+  output:
+    selected_logp: [B, S]
+```
+
+要求：
+
+- PyTorch gold、CUDA、Triton、ROCm wrapper 都应尽量使用同一套 Python 接口。
+- 不同 backend 不应要求测试框架长期维护 shape/参数名 adapter。
+- 当前 `_LogpSM90CandidateAdapter` 只是为了验证框架最小闭环的临时例外，不作为长期模式。
+
+步骤 2：在 `operator_inputs.py` 添加输入构造
+
+文件：
+
+```text
+rl_engine/kernels/gtest/operator_inputs.py
+```
+
+需要做三件事：
+
+1. 在 `make_operator_inputs()` 的 `builders` 中加入算子名。
+2. 在 `operator_shape_name()` 的 `names` 中加入 shape 描述。
+3. 新增 `_make_xxx_inputs(args, dtype, device)`。
+
+要求：
+
+- 输入必须是标准语义输入，不是某个 CUDA kernel 的私有格式。
+- `random` 模式必须可由 `--seed` 复现。
+- `constant` 模式必须便于 debug。
+- 多 batch 情况默认保留 `[B, S, ...]` 语义形状，不提前为某个 backend flatten。
+
+步骤 3：在 `operator_specs.py` 注册 gold 和 candidate
+
+文件：
+
+```text
+rl_engine/kernels/gtest/operator_specs.py
+```
+
+添加：
+
+```python
+"new_op": OperatorSpec(
+    name="new_op",
+    op_class="...",
+    gold_path="rl_engine.kernels.ops.pytorch....NativeNewOp",
+    registry_name="new_op",
+    candidate_paths={
+        "pytorch": "rl_engine.kernels.ops.pytorch....NativeNewOp",
+        "cuda": "rl_engine.kernels.ops.cuda....CudaNewOp",
+        "triton": "rl_engine.kernels.ops.triton....TritonNewOp",
+    },
+)
+```
+
+硬性规则：
+
+- `gold_path` 必须来自 `rl_engine.kernels.ops.pytorch`。
+- `candidate_paths["pytorch"]` 只能用于框架自检，不代表高性能算子通过。
+- `candidate_paths["cuda"]`、`candidate_paths["triton"]` 等必须对应实际被测 backend。
+- 不允许用实现了不同数学功能的算子互相比较，例如不能用 `linear_logp` 测普通 `logp`。
+
+步骤 4：确认 gold 调用方法
+
+当前 `make_operator_case()` 对 logp 使用：
+
+```python
+gold_fn=gold_op.forward_fp32
+```
+
+这对 `NativeLogpOp` 是正确的。新增算子时必须确认 PyTorch gold 是否有对应方法。
+
+如果新算子没有 `forward_fp32`，不要在测试主逻辑中硬编码临时分支；应在 `operator_specs.py` 中显式补充 gold 调用策略，作为一个独立小改动提交。
+
+步骤 5：新增输入和 runner 单测
+
+至少补两类测试：
+
+```text
+tests/test_operator_inputs.py:
+  确认 make_operator_inputs("new_op", ...) 能生成输入。
+  确认 random seed 可复现。
+  确认 constant 模式值正确。
+
+tests/test_op_checks.py 或新测试文件:
+  用 pytorch candidate vs pytorch gold 验证框架能跑通。
+  用 bad candidate 验证失败报告符合预期。
+```
+
+步骤 6：本地验证
+
+CPU 框架验证：
+
+```bash
+python -m pytest tests/test_op_checks.py tests/test_operator_inputs.py -q
+```
+
+CUDA candidate 验证：
+
+```bash
+python scripts/check_operator.py \
+  --op new_op \
+  --candidate cuda \
+  --device cuda \
+  --dtype bf16 \
+  --arch-key sm90 \
+  --batch 2 \
+  --seq 16
+```
+
+如果 CUDA candidate 不通过，先判断：
+
+```text
+1. gold 和 candidate 是否真的是同一个数学函数。
+2. candidate Python wrapper 是否使用标准接口。
+3. 输入 dtype / shape 是否符合 candidate 声明。
+4. 误差是否超过 tolerance。
+5. 是否是 kernel 编译或运行错误。
+```
+
+不要为了让测试通过去修改 gold，也不要把不同功能的算子混在一起比较。
+
+### 2026-06-28 - `check_operator.py` 支持参数
+
+入口：
+
+```bash
+python scripts/check_operator.py [options]
+```
+
+核心参数：
+
+```text
+--op
+  算子名。当前最小版本支持 logp。
+
+--candidate
+  被测实现。当前 logp 支持 registry、pytorch、native、cuda、cuda-generic、cuda-sm90。
+
+--dtype
+  fp32、bf16、fp16。
+
+--device
+  auto、cpu、cuda 或 torch 可识别的 device 字符串。
+
+--arch-key
+  tolerance override key，例如 sm90。为空时使用 default tolerance。
+```
+
+shape 参数：
+
+```text
+--batch
+  batch size，默认 2。
+
+--seq
+  sequence length，默认 16。
+
+--vocab
+  vocabulary size，默认 257。
+
+--normalized-dim
+  norm 类算子的 hidden/normalized dimension。
+
+--k-dim
+  matmul K dimension。
+
+--n-dim
+  matmul N dimension。
+```
+
+输入初始化参数：
+
+```text
+--input-mode
+  random 或 constant。
+
+--constant-value
+  constant 模式下浮点 tensor 的基础值。
+
+--token-value
+  constant 模式下 token id 的基础值，会对 vocab 取模。
+
+--seed
+  random 模式下的随机种子。
+```
+
+其他参数：
+
+```text
+--theta
+  RoPE theta。
+
+--eps
+  norm epsilon。
+
+--json
+  输出完整 JSON report。
+```
+
+当前最小可运行示例：
+
+```bash
+python scripts/check_operator.py \
+  --op logp \
+  --candidate cuda \
+  --device cuda \
+  --dtype bf16 \
+  --arch-key sm90 \
+  --batch 1 \
+  --seq 1 \
+  --vocab 4096
+```
+
+预期：
+
+- 在 H100 + CUDA 13.0 环境中，generic CUDA logp 已观察到通过。
+- `cuda-sm90` 当前不应作为通过路径使用。
diff --git a/rl_engine/kernels/gtest/__init__.py b/rl_engine/kernels/gtest/__init__.py
new file mode 100644
index 0000000..c3fc366
--- /dev/null
+++ b/rl_engine/kernels/gtest/__init__.py
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 RL-Kernel Contributors
+
+from .op_checks import CandidateSpec, OperatorCase, run_operator_suite
+
+__all__ = [
+    "CandidateSpec",
+    "OperatorCase",
+    "run_operator_suite",
+]
diff --git a/rl_engine/kernels/gtest/op_checks.py b/rl_engine/kernels/gtest/op_checks.py
new file mode 100644
index 0000000..433bbfc
--- /dev/null
+++ b/rl_engine/kernels/gtest/op_checks.py
@@ -0,0 +1,292 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 RL-Kernel Contributors
+
+from __future__ import annotations
+
+from collections.abc import Callable, Mapping, Sequence
+from dataclasses import asdict, dataclass
+from typing import Any
+
+import torch
+
+from rl_engine.kernels.gtest.tolerance import load_contract
+
+
+@dataclass(frozen=True)
+class OperatorCase:
+    """One deterministic test object for an operator candidate."""
+
+    name: str
+    op_class: str
+    dtype: torch.dtype
+    inputs: Mapping[str, Any]
+    gold_fn: Callable[..., Any]
+
+
+@dataclass(frozen=True)
+class CandidateSpec:
+    """One implementation to validate against the gold path."""
+
+    name: str
+    fn: Callable[..., Any] | Any
+    backend: str = "unknown"
+    arch_key: str | None = None
+
+
+@dataclass(frozen=True)
+class OutputCheck:
+    """Per-output comparison result."""
+
+    output_index: int
+    shape: tuple[int, ...]
+    candidate_dtype: str
+    gold_dtype: str
+    atol: float
+    rtol: float
+    max_abs_error: float
+    mean_abs_error: float
+    max_rel_error: float
+    passed: bool
+    message: str = ""
+
+
+@dataclass(frozen=True)
+class CaseCheck:
+    """Per-case result for one candidate."""
+
+    case_name: str
+    dtype: str
+    op_class: str
+    passed: bool
+    outputs: list[OutputCheck]
+
+
+@dataclass(frozen=True)
+class CandidateReport:
+    """Aggregate report for one candidate implementation."""
+
+    candidate_name: str
+    backend: str
+    total_outputs: int
+    passed_outputs: int
+    pass_rate: float
+    passed: bool
+    cases: list[CaseCheck]
+
+
+@dataclass(frozen=True)
+class OperatorCheckReport:
+    """Suite-level report across candidates."""
+
+    suite_name: str
+    total_candidates: int
+    passed_candidates: int
+    pass_rate: float
+    passed: bool
+    candidates: list[CandidateReport]
+
+    def to_dict(self) -> dict[str, Any]:
+        return asdict(self)
+
+
+def run_operator_suite(
+    suite_name: str,
+    *,
+    candidates: Sequence[CandidateSpec],
+    cases: Sequence[OperatorCase],
+    contract: Mapping[str, Any] | None = None,
+) -> OperatorCheckReport:
+    """Run candidates against gold outputs and return a structured report."""
+
+    loaded_contract = dict(contract or load_contract())
+    # run all test ops
+    # cases : test object
+    # camdidate : test instance
+    # loaded_contract : tolerance table
+    candidate_reports = [
+        _run_candidate(candidate, cases, loaded_contract) for candidate in candidates
+    ]
+    passed_candidates = sum(1 for report in candidate_reports if report.passed)
+    total_candidates = len(candidate_reports)
+    pass_rate = float(passed_candidates / total_candidates) if total_candidates else 0.0
+    return OperatorCheckReport(
+        suite_name=suite_name,
+        total_candidates=total_candidates,
+        passed_candidates=passed_candidates,
+        pass_rate=pass_rate,
+        passed=passed_candidates == total_candidates,
+        candidates=candidate_reports,
+    )
+
+
+def _run_candidate(
+    candidate: CandidateSpec,
+    cases: Sequence[OperatorCase],
+    contract: Mapping[str, Any],
+) -> CandidateReport:
+    case_checks = [_run_case(candidate, case, contract) for case in cases]
+    total_outputs = sum(len(case.outputs) for case in case_checks)
+    passed_outputs = sum(
+        1 for case in case_checks for output in case.outputs if output.passed
+    )
+    pass_rate = float(passed_outputs / total_outputs) if total_outputs else 0.0
+    return CandidateReport(
+        candidate_name=candidate.name,
+        backend=candidate.backend,
+        total_outputs=total_outputs,
+        passed_outputs=passed_outputs,
+        pass_rate=pass_rate,
+        passed=passed_outputs == total_outputs,
+        cases=case_checks,
+    )
+
+
+def _run_case(
+    candidate: CandidateSpec,
+    case: OperatorCase,
+    contract: Mapping[str, Any],
+) -> CaseCheck:
+    candidate_outputs = _flatten_tensors(_call_candidate(candidate.fn, case.inputs))
+    gold_outputs = _flatten_tensors(case.gold_fn(**case.inputs))
+    if len(candidate_outputs) != len(gold_outputs):
+        raise ValueError(
+            f"candidate {candidate.name!r} returned {len(candidate_outputs)} outputs, "
+            f"gold returned {len(gold_outputs)}"
+        )
+    atol, rtol = _resolve_tolerance(
+        contract,
+        op_class=case.op_class,
+        dtype=case.dtype,
+        arch_key=candidate.arch_key,
+    )
+    output_checks = [
+        _compare_output(
+            candidate_output,
+            gold_output,
+            output_index=index,
+            atol=atol,
+            rtol=rtol,
+        )
+        for index, (candidate_output, gold_output) in enumerate(
+            zip(candidate_outputs, gold_outputs, strict=True)
+        )
+    ]
+    return CaseCheck(
+        case_name=case.name,
+        dtype=str(case.dtype),
+        op_class=case.op_class,
+        passed=all(output.passed for output in output_checks),
+        outputs=output_checks,
+    )
+
+
+# compatibility function or forward
+def _call_candidate(candidate: Callable[..., Any] | Any, inputs: Mapping[str, Any]) -> Any:
+    if hasattr(candidate, "forward") and callable(candidate.forward):
+        return candidate.forward(**inputs)
+    return candidate(**inputs)
+
+
+def _flatten_tensors(value: Any) -> list[torch.Tensor]:
+    if isinstance(value, torch.Tensor):
+        return [value]
+    if isinstance(value, (tuple, list)):
+        outputs: list[torch.Tensor] = []
+        for item in value:
+            outputs.extend(_flatten_tensors(item))
+        return outputs
+    raise TypeError(f"operator output must be Tensor or sequence, got {type(value)!r}")
+
+
+def _resolve_tolerance(
+    contract: Mapping[str, Any],
+    *,
+    op_class: str,
+    dtype: torch.dtype,
+    arch_key: str | None = None,
+) -> tuple[float, float]:
+    dtype_name = _dtype_name(dtype)
+    if arch_key is not None:
+        arch_values = (
+            contract["accuracy"]
+            .get("arch_overrides", {})
+            .get(arch_key, {})
+            .get(op_class, {})
+            .get(dtype_name)
+        )
+        if arch_values is not None:
+            return float(arch_values["atol"]), float(arch_values.get("rtol", 0.0))
+
+    values = contract["accuracy"]["default"][op_class][dtype_name]
+    return float(values["atol"]), float(values.get("rtol", 0.0))
+
+
+def _dtype_name(dtype: torch.dtype) -> str:
+    if dtype is torch.float32:
+        return "float32"
+    if dtype is torch.bfloat16:
+        return "bfloat16"
+    if dtype is torch.float16:
+        return "float16"
+    raise ValueError(f"unsupported dtype: {dtype}")
+
+
+def _compare_output(
+    candidate: torch.Tensor,
+    gold: torch.Tensor,
+    *,
+    output_index: int,
+    atol: float,
+    rtol: float,
+) -> OutputCheck:
+    if candidate.shape != gold.shape:
+        return OutputCheck(
+            output_index=output_index,
+            shape=tuple(candidate.shape),
+            candidate_dtype=str(candidate.dtype),
+            gold_dtype=str(gold.dtype),
+            atol=atol,
+            rtol=rtol,
+            max_abs_error=float("inf"),
+            mean_abs_error=float("inf"),
+            max_rel_error=float("inf"),
+            passed=False,
+            message=f"shape mismatch: candidate={tuple(candidate.shape)} gold={tuple(gold.shape)}",
+        )
+
+    candidate_fp32 = candidate.float()
+    gold_fp32 = gold.float()
+    abs_error = (candidate_fp32 - gold_fp32).abs()
+    if abs_error.numel() == 0:
+        max_abs_error = 0.0
+        mean_abs_error = 0.0
+        max_rel_error = 0.0
+    else:
+        max_abs_error = float(abs_error.max().item())
+        mean_abs_error = float(abs_error.mean().item())
+        rel_error = abs_error / gold_fp32.abs().clamp_min(1e-12)
+        max_rel_error = float(rel_error.max().item())
+
+    return OutputCheck(
+        output_index=output_index,
+        shape=tuple(candidate.shape),
+        candidate_dtype=str(candidate.dtype),
+        gold_dtype=str(gold.dtype),
+        atol=atol,
+        rtol=rtol,
+        max_abs_error=max_abs_error,
+        mean_abs_error=mean_abs_error,
+        max_rel_error=max_rel_error,
+        passed=bool(torch.allclose(candidate_fp32, gold_fp32, atol=atol, rtol=rtol)),
+    )
+
+
+__all__ = [
+    "CandidateReport",
+    "CandidateSpec",
+    "CaseCheck",
+    "OperatorCase",
+    "OperatorCheckReport",
+    "OutputCheck",
+    "run_operator_suite",
+]
diff --git a/rl_engine/kernels/gtest/operator_inputs.py b/rl_engine/kernels/gtest/operator_inputs.py
new file mode 100644
index 0000000..7e09719
--- /dev/null
+++ b/rl_engine/kernels/gtest/operator_inputs.py
@@ -0,0 +1,244 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 RL-Kernel Contributors
+
+from __future__ import annotations
+
+import argparse
+from typing import Any
+
+import torch
+
+
+DEFAULT_HIDDEN = 4096
+DEFAULT_N_HEADS = 32
+DEFAULT_N_KV_HEADS = 8
+DEFAULT_HEAD_DIM = 128
+DEFAULT_INTERMEDIATE = 12288
+DEFAULT_VOCAB = 151936
+DEFAULT_ROPE_THETA = 1.0e6
+DEFAULT_RMS_EPS = 1.0e-6
+
+
+def make_operator_inputs(
+    op_name: str,
+    args: argparse.Namespace,
+    dtype: torch.dtype,
+    device: torch.device,
+) -> dict[str, Any]:
+    builders = {
+        "rms_norm": _make_rms_norm_inputs,
+        "matmul": _make_matmul_inputs,
+        "attention": _make_attention_inputs,
+        "logp": _make_logp_inputs,
+        "rope": _make_rope_inputs,
+        "silu": _make_silu_inputs,
+        "swiglu": _make_swiglu_inputs,
+        "embedding": _make_embedding_inputs,
+        "lm_head": _make_lm_head_inputs,
+        "kv_cache_attention": _make_kv_cache_attention_inputs,
+    }
+    try:
+        return builders[op_name](args, dtype, device)
+    except KeyError as exc:
+        raise ValueError(f"unsupported operator inputs: {op_name}") from exc
+
+
+def operator_shape_name(op_name: str, args: argparse.Namespace) -> str:
+    batch, seq = _batch_seq(args)
+    vocab = _arg_int(args, "vocab", DEFAULT_VOCAB)
+    names = {
+        "rms_norm": f"{batch}x{seq}x{_normalized_dim(args)}",
+        "matmul": f"{batch}x{seq}x{_matmul_k(args)}x{_matmul_n(args)}",
+        "attention": f"{batch}x{DEFAULT_N_HEADS}x{seq}x{DEFAULT_HEAD_DIM}",
+        "logp": f"{batch}x{seq}x{vocab}",
+        "rope": f"{batch}x{DEFAULT_N_HEADS}x{seq}x{DEFAULT_HEAD_DIM}",
+        "silu": f"{batch}x{seq}x{DEFAULT_INTERMEDIATE}",
+        "swiglu": f"{batch}x{seq}x{DEFAULT_INTERMEDIATE}",
+        "embedding": f"{batch}x{seq}x{vocab}x{DEFAULT_HIDDEN}",
+        "lm_head": f"{batch}x{seq}x{vocab}",
+        "kv_cache_attention": f"{batch}x{DEFAULT_N_HEADS}x1x{seq + 1}x{DEFAULT_HEAD_DIM}",
+    }
+    try:
+        return names[op_name]
+    except KeyError as exc:
+        raise ValueError(f"unsupported operator shape: {op_name}") from exc
+
+
+def _make_rms_norm_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    normalized_dim = _normalized_dim(args)
+    return {
+        "x": _floating_tensor((batch, seq, normalized_dim), args, dtype, device, offset=0),
+        "weight": _floating_tensor((normalized_dim,), args, dtype, device, offset=1),
+        "eps": _arg_float(args, "eps", DEFAULT_RMS_EPS),
+    }
+
+
+def _make_matmul_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    k_dim = _matmul_k(args)
+    n_dim = _matmul_n(args)
+    return {
+        "a": _floating_tensor((batch, seq, k_dim), args, dtype, device, offset=0),
+        "b": _floating_tensor((k_dim, n_dim), args, dtype, device, offset=1),
+    }
+
+
+def _make_attention_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    return {
+        "q": _floating_tensor((batch, DEFAULT_N_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 0),
+        "k": _floating_tensor((batch, DEFAULT_N_KV_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 1),
+        "v": _floating_tensor((batch, DEFAULT_N_KV_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 2),
+        "causal": True,
+    }
+
+
+def _make_logp_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    vocab = _arg_int(args, "vocab", DEFAULT_VOCAB)
+    return {
+        "logits": _floating_tensor((batch, seq, vocab), args, dtype, device, offset=0),
+        "token_ids": _token_ids((batch, seq), vocab, args, device),
+    }
+
+
+def _make_rope_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    return {
+        "x": _floating_tensor((batch, DEFAULT_N_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 0),
+        "positions": torch.arange(seq, device=device, dtype=torch.long),
+        "theta": _arg_float(args, "theta", DEFAULT_ROPE_THETA),
+    }
+
+
+def _make_silu_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    return {
+        "x": _floating_tensor((batch, seq, DEFAULT_INTERMEDIATE), args, dtype, device, 0),
+    }
+
+
+def _make_swiglu_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    return {
+        "gate": _floating_tensor((batch, seq, DEFAULT_INTERMEDIATE), args, dtype, device, 0),
+        "up": _floating_tensor((batch, seq, DEFAULT_INTERMEDIATE), args, dtype, device, 1),
+    }
+
+
+def _make_embedding_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    vocab = _arg_int(args, "vocab", DEFAULT_VOCAB)
+    return {
+        "token_ids": _token_ids((batch, seq), vocab, args, device),
+        "weight": _floating_tensor((vocab, DEFAULT_HIDDEN), args, dtype, device, 0),
+    }
+
+
+def _make_lm_head_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    vocab = _arg_int(args, "vocab", DEFAULT_VOCAB)
+    return {
+        "hidden": _floating_tensor((batch, seq, DEFAULT_HIDDEN), args, dtype, device, 0),
+        "weight": _floating_tensor((vocab, DEFAULT_HIDDEN), args, dtype, device, 1),
+        "bias": None,
+    }
+
+
+def _make_kv_cache_attention_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    return {
+        "q": _floating_tensor((batch, DEFAULT_N_HEADS, 1, DEFAULT_HEAD_DIM), args, dtype, device, 0),
+        "k_cache": _floating_tensor((batch, DEFAULT_N_KV_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 1),
+        "v_cache": _floating_tensor((batch, DEFAULT_N_KV_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 2),
+        "k_new": _floating_tensor((batch, DEFAULT_N_KV_HEADS, 1, DEFAULT_HEAD_DIM), args, dtype, device, 3),
+        "v_new": _floating_tensor((batch, DEFAULT_N_KV_HEADS, 1, DEFAULT_HEAD_DIM), args, dtype, device, 4),
+        "causal": True,
+    }
+
+
+def _floating_tensor(
+    shape: tuple[int, ...],
+    args: argparse.Namespace,
+    dtype: torch.dtype,
+    device: torch.device,
+    offset: int,
+) -> torch.Tensor:
+    # Example: torch.randn((B, S, V), device="cuda", dtype=torch.bfloat16) 
+    mode = _arg_str(args, "input_mode", "random")
+    if mode == "constant":
+        value = _arg_float(args, "constant_value", 0.25) + float(offset) * 0.01
+        return torch.full(shape, value, device=device, dtype=dtype)
+    if mode != "random":
+        raise ValueError(f"unsupported input_mode: {mode}")
+    generator = _generator(args, device, offset)
+    return torch.randn(shape, generator=generator, device=device, dtype=dtype)
+
+
+def _token_ids(
+    shape: tuple[int, ...],
+    vocab: int,
+    args: argparse.Namespace,
+    device: torch.device,
+) -> torch.Tensor:
+    mode = _arg_str(args, "input_mode", "random")
+    if mode == "constant":
+        value = _arg_int(args, "token_value", 0) % vocab
+        return torch.full(shape, value, device=device, dtype=torch.long)
+    generator = _generator(args, device, offset=13)
+    return torch.randint(0, vocab, shape, generator=generator, device=device, dtype=torch.long)
+
+
+def _generator(args: argparse.Namespace, device: torch.device, offset: int) -> torch.Generator:
+    generator = torch.Generator(device=device)
+    generator.manual_seed(_arg_int(args, "seed", 123) + offset)
+    return generator
+
+
+def _batch_seq(args: argparse.Namespace) -> tuple[int, int]:
+    return _arg_int(args, "batch", 2), _arg_int(args, "seq", 16)
+
+
+def _normalized_dim(args: argparse.Namespace) -> int:
+    return _arg_int(args, "normalized_dim", DEFAULT_HIDDEN)
+
+
+def _matmul_k(args: argparse.Namespace) -> int:
+    return _arg_int(args, "k_dim", DEFAULT_HIDDEN)
+
+
+def _matmul_n(args: argparse.Namespace) -> int:
+    return _arg_int(args, "n_dim", DEFAULT_HIDDEN)
+
+
+def _arg_float(args: argparse.Namespace, name: str, default: float) -> float:
+    return float(getattr(args, name, default))
+
+
+def _arg_int(args: argparse.Namespace, name: str, default: int) -> int:
+    return int(getattr(args, name, default))
+
+
+def _arg_str(args: argparse.Namespace, name: str, default: str) -> str:
+    return str(getattr(args, name, default))
diff --git a/rl_engine/kernels/gtest/operator_specs.py b/rl_engine/kernels/gtest/operator_specs.py
new file mode 100644
index 0000000..dc4674c
--- /dev/null
+++ b/rl_engine/kernels/gtest/operator_specs.py
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 RL-Kernel Contributors
+
+from __future__ import annotations
+
+import argparse
+import importlib
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+
+from rl_engine.kernels.gtest.operator_inputs import make_operator_inputs, operator_shape_name
+from rl_engine.kernels.gtest.op_checks import CandidateSpec, OperatorCase
+
+
+@dataclass(frozen=True)
+class OperatorSpec:
+    name: str
+    op_class: str
+    gold_path: str
+    registry_name: str
+    candidate_paths: dict[str, str]
+
+
+def _load_object(path: str) -> Any:
+    module_path, object_name = path.rsplit(".", 1)
+    # dynamic loading ops
+    module = importlib.import_module(module_path)
+    return getattr(module, object_name)
+
+
+OP_SPECS = {
+    "logp": OperatorSpec(
+        name="logp",
+        op_class="logprob",
+        gold_path="rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp",
+        registry_name="logp",
+        candidate_paths={
+            "pytorch": "rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp",
+            "cuda": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpGenericOp",
+            "cuda-generic": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpGenericOp",
+            "cuda-sm90": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpSM90Op",
+        },
+    ),
+}
+
+
+class _LogpSM90CandidateAdapter:
+    def __init__(self, candidate: Any) -> None:
+        self._candidate = candidate
+
+    def __call__(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
+        orig_shape = logits.shape[:-1]
+        logits_2d = logits.contiguous().view(-1, logits.size(-1))
+        labels_1d = token_ids.contiguous().view(-1)
+        return self._candidate(logits_2d, labels_1d).view(orig_shape)
+
+
+def operator_names() -> tuple[str, ...]:
+    return tuple(OP_SPECS)
+
+
+def make_operator_case(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> OperatorCase:
+    spec = OP_SPECS[args.op]
+    gold_op = _load_object(spec.gold_path)()
+    return OperatorCase(
+        name=f"{args.op}-{dtype}-{operator_shape_name(args.op, args)}",
+        op_class=spec.op_class,
+        dtype=dtype,
+        inputs=make_operator_inputs(args.op, args, dtype, device),
+        gold_fn=gold_op.forward_fp32,
+    )
+
+
+def make_candidate(args: argparse.Namespace) -> CandidateSpec:
+    spec = OP_SPECS[args.op]
+    candidate_name = "pytorch" if args.candidate == "native" else args.candidate
+
+    if candidate_name in spec.candidate_paths:
+        candidate_op = _load_object(spec.candidate_paths[candidate_name])()
+        if args.op == "logp" and candidate_name == "cuda-sm90":
+            candidate_op = _LogpSM90CandidateAdapter(candidate_op)
+        return CandidateSpec(
+            name=f"{candidate_name}-{args.op}",
+            backend=candidate_name,
+            arch_key=args.arch_key,
+            fn=candidate_op,
+        )
+
+    if candidate_name == "registry":
+        from rl_engine.kernels.registry import kernel_registry
+
+        return CandidateSpec(
+            name=f"registry-{args.op}",
+            backend="registry",
+            arch_key=args.arch_key,
+            fn=kernel_registry.get_op(spec.registry_name),
+        )
+
+    supported = sorted([*spec.candidate_paths, "native", "registry"])
+    raise ValueError(
+        f"unsupported candidate {args.candidate!r} for op {args.op!r}; "
+        f"supported candidates: {', '.join(supported)}"
+    )
diff --git a/rl_engine/testing/tolerance.py b/rl_engine/kernels/gtest/tolerance.py
similarity index 100%
rename from rl_engine/testing/tolerance.py
rename to rl_engine/kernels/gtest/tolerance.py
diff --git a/rl_engine/testing/tolerance_contract.yaml b/rl_engine/kernels/gtest/tolerance_contract.yaml
similarity index 100%
rename from rl_engine/testing/tolerance_contract.yaml
rename to rl_engine/kernels/gtest/tolerance_contract.yaml
diff --git a/scripts/check_operator.py b/scripts/check_operator.py
new file mode 100644
index 0000000..9b99612
--- /dev/null
+++ b/scripts/check_operator.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 RL-Kernel Contributors
+
+from __future__ import annotations
+
+import argparse
+import json
+import pathlib
+import sys
+from typing import Any
+
+import torch
+
+REPO_ROOT = pathlib.Path(__file__).resolve().parents[1]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+from rl_engine.kernels.gtest import run_operator_suite  # noqa: E402
+from rl_engine.kernels.gtest.operator_specs import (  # noqa: E402
+    make_candidate,
+    make_operator_case,
+    operator_names,
+)
+
+
+def _parse_dtype(value: str) -> torch.dtype:
+    normalized = value.lower()
+    if normalized in {"fp32", "float32"}:
+        return torch.float32
+    if normalized in {"bf16", "bfloat16"}:
+        return torch.bfloat16
+    if normalized in {"fp16", "float16", "half"}:
+        return torch.float16
+    raise ValueError(f"unsupported dtype: {value}")
+
+
+def _select_device(value: str) -> torch.device:
+    if value == "auto":
+        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device(value)
+    if device.type == "cuda" and not torch.cuda.is_available():
+        raise RuntimeError("--device cuda was requested, but CUDA is not available")
+    return device
+
+
+def _summarize(report: Any) -> None:
+    print(f"suite={report.suite_name} passed={report.passed} pass_rate={report.pass_rate:.4f}")
+    for candidate in report.candidates:
+        print(
+            f"candidate={candidate.candidate_name} backend={candidate.backend} "
+            f"passed={candidate.passed} pass_rate={candidate.pass_rate:.4f}"
+        )
+        for case in candidate.cases:
+            for output in case.outputs:
+                print(
+                    f"  case={case.case_name} output={output.output_index} "
+                    f"shape={output.shape} dtype={output.candidate_dtype} "
+                    f"max_abs={output.max_abs_error:.8e} "
+                    f"mean_abs={output.mean_abs_error:.8e} "
+                    f"max_rel={output.max_rel_error:.8e} "
+                    f"tol=(atol={output.atol:.3e}, rtol={output.rtol:.3e}) "
+                    f"passed={output.passed}"
+                )
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Validate an operator candidate against a PyTorch gold path.")
+    parser.add_argument("--op", choices=operator_names(), default="logp")
+    parser.add_argument(
+        "--candidate",
+        default="registry",
+        help="Candidate backend to validate, for example registry, pytorch, cuda, cuda-sm90.",
+    )
+    parser.add_argument("--dtype", choices=("fp32", "bf16", "fp16"), default="fp32")
+    parser.add_argument("--device", default="auto")
+    parser.add_argument("--batch", type=int, default=2)
+    parser.add_argument("--seq", type=int, default=16)
+    parser.add_argument("--vocab", type=int, default=257)
+    parser.add_argument("--input-mode", choices=("random", "constant"), default="random")
+    parser.add_argument("--constant-value", type=float, default=0.25)
+    parser.add_argument("--token-value", type=int, default=0)
+    parser.add_argument("--normalized-dim", type=int, default=4096)
+    parser.add_argument("--k-dim", type=int, default=4096)
+    parser.add_argument("--n-dim", type=int, default=4096)
+    parser.add_argument("--theta", type=float, default=1.0e6)
+    parser.add_argument("--eps", type=float, default=1.0e-6)
+    parser.add_argument("--seed", type=int, default=123)
+    parser.add_argument(
+        "--arch-key",
+        default=None,
+        help="Optional tolerance override key, for example sm90. Defaults to contract.default.",
+    )
+    parser.add_argument("--json", action="store_true", help="Print the full structured report as JSON.")
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+    dtype = _parse_dtype(args.dtype)
+    device = _select_device(args.device)
+    candidate = make_candidate(args)
+    case = make_operator_case(args, dtype, device)
+    report = run_operator_suite(args.op, candidates=[candidate], cases=[case])
+
+    if args.json:
+        print(json.dumps(report.to_dict(), indent=2, default=str))
+    else:
+        _summarize(report)
+
+    if not report.passed:
+        raise SystemExit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_op_checks.py b/tests/test_op_checks.py
new file mode 100644
index 0000000..9f8a901
--- /dev/null
+++ b/tests/test_op_checks.py
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 RL-Kernel Contributors
+
+from __future__ import annotations
+
+import torch
+
+from rl_engine.kernels.ops.pytorch.loss.logp import NativeLogpOp
+from rl_engine.kernels.gtest.op_checks import CandidateSpec, OperatorCase, run_operator_suite
+
+
+def _logp_case(name: str, dtype: torch.dtype, *, seed: int = 0) -> OperatorCase:
+    generator = torch.Generator().manual_seed(seed)
+    logits = torch.randn(2, 8, 257, dtype=dtype, generator=generator)
+    token_ids = torch.randint(0, logits.size(-1), (2, 8), generator=generator)
+    return OperatorCase(
+        name=name,
+        op_class="logprob",
+        dtype=dtype,
+        inputs={"logits": logits, "token_ids": token_ids},
+        gold_fn=NativeLogpOp().forward_fp32,
+    )
+
+
+def test_logp_native_candidate_suite_passes():
+    report = run_operator_suite(
+        "logp",
+        candidates=[CandidateSpec(name="native-logp", backend="pytorch", fn=NativeLogpOp())],
+        cases=[
+            _logp_case("fp32", torch.float32, seed=1),
+            _logp_case("bf16", torch.bfloat16, seed=2),
+            _logp_case("fp16", torch.float16, seed=3),
+        ],
+    )
+
+    assert report.passed
+    assert report.pass_rate == 1.0
+    assert report.candidates[0].passed_outputs == 3
+    assert all(case.passed for case in report.candidates[0].cases)
+
+
+def test_logp_registry_candidate_suite_passes_on_cpu():
+    from rl_engine.kernels.registry import kernel_registry
+
+    report = run_operator_suite(
+        "logp",
+        candidates=[
+            CandidateSpec(
+                name="registry-logp",
+                backend="registry",
+                fn=kernel_registry.get_op("logp"),
+            )
+        ],
+        cases=[_logp_case("fp32", torch.float32, seed=4)],
+    )
+
+    assert report.passed
+    assert report.candidates[0].candidate_name == "registry-logp"
+
+
+def test_suite_reports_failure_for_bad_candidate():
+    def bad_logp(logits, token_ids):
+        del token_ids
+        return torch.zeros(logits.shape[:-1], dtype=logits.dtype)
+
+    report = run_operator_suite(
+        "logp",
+        candidates=[CandidateSpec(name="bad-logp", backend="test", fn=bad_logp)],
+        cases=[_logp_case("fp32", torch.float32, seed=5)],
+    )
+
+    output = report.candidates[0].cases[0].outputs[0]
+    assert not report.passed
+    assert report.pass_rate == 0.0
+    assert output.max_abs_error > 0.0
+
+
+def test_suite_report_to_dict_contains_error_metrics():
+    report = run_operator_suite(
+        "logp",
+        candidates=[CandidateSpec(name="native-logp", backend="pytorch", fn=NativeLogpOp())],
+        cases=[_logp_case("fp32", torch.float32, seed=6)],
+    )
+
+    data = report.to_dict()
+    output = data["candidates"][0]["cases"][0]["outputs"][0]
+    assert data["suite_name"] == "logp"
+    assert "max_abs_error" in output
+    assert "atol" in output
+    assert "passed" in output
+
+
+def test_candidate_arch_key_uses_tolerance_override():
+    def slightly_shifted_logp(logits, token_ids):
+        return NativeLogpOp().forward_fp32(logits, token_ids) + 0.02
+
+    contract = {
+        "accuracy": {
+            "default": {
+                "logprob": {
+                    "float32": {"atol": 1.0e-5, "rtol": 0.0},
+                }
+            },
+            "arch_overrides": {
+                "testarch": {
+                    "logprob": {
+                        "float32": {"atol": 5.0e-2, "rtol": 0.0},
+                    }
+                }
+            },
+        }
+    }
+    report = run_operator_suite(
+        "logp",
+        candidates=[
+            CandidateSpec(
+                name="shifted-logp",
+                backend="test",
+                fn=slightly_shifted_logp,
+                arch_key="testarch",
+            )
+        ],
+        cases=[_logp_case("fp32", torch.float32, seed=7)],
+        contract=contract,
+    )
+
+    output = report.candidates[0].cases[0].outputs[0]
+    assert report.passed
+    assert output.atol == 5.0e-2
diff --git a/tests/test_operator_inputs.py b/tests/test_operator_inputs.py
new file mode 100644
index 0000000..22d39ae
--- /dev/null
+++ b/tests/test_operator_inputs.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 RL-Kernel Contributors
+
+from __future__ import annotations
+
+import argparse
+
+import pytest
+import torch
+
+from rl_engine.kernels.gtest.operator_inputs import make_operator_inputs, operator_shape_name
+
+
+def _args(**overrides):
+    values = {
+        "batch": 1,
+        "seq": 2,
+        "vocab": 17,
+        "seed": 123,
+        "input_mode": "constant",
+        "constant_value": 0.5,
+        "token_value": 3,
+        "normalized_dim": 128,
+        "k_dim": 16,
+        "n_dim": 32,
+        "theta": 1.0e6,
+        "eps": 1.0e-6,
+    }
+    values.update(overrides)
+    return argparse.Namespace(**values)
+
+
+@pytest.mark.parametrize(
+    "op_name",
+    [
+        "rms_norm",
+        "matmul",
+        "attention",
+        "logp",
+        "rope",
+        "silu",
+        "swiglu",
+        "embedding",
+        "lm_head",
+        "kv_cache_attention",
+    ],
+)
+def test_operator_inputs_support_all_issue_108_ops(op_name):
+    args = _args()
+    inputs = make_operator_inputs(op_name, args, torch.float32, torch.device("cpu"))
+
+    assert inputs
+    assert operator_shape_name(op_name, args)
+
+
+def test_constant_logp_inputs_are_deterministic():
+    args = _args(input_mode="constant", constant_value=0.5, token_value=3)
+    inputs = make_operator_inputs("logp", args, torch.float32, torch.device("cpu"))
+
+    assert torch.equal(inputs["logits"], torch.full((1, 2, 17), 0.5))
+    assert torch.equal(inputs["token_ids"], torch.full((1, 2), 3, dtype=torch.long))
+
+
+def test_random_logp_inputs_are_seeded():
+    args = _args(input_mode="random", seed=7)
+    first = make_operator_inputs("logp", args, torch.float32, torch.device("cpu"))
+    second = make_operator_inputs("logp", args, torch.float32, torch.device("cpu"))
+
+    assert torch.equal(first["logits"], second["logits"])
+    assert torch.equal(first["token_ids"], second["token_ids"])
diff --git a/tests/test_tolerance_contract.py b/tests/test_tolerance_contract.py
index 0e62d7f..fb429d8 100644
--- a/tests/test_tolerance_contract.py
+++ b/tests/test_tolerance_contract.py
@@ -3,7 +3,7 @@
 
 from __future__ import annotations
 
-from rl_engine.testing.tolerance import load_contract
+from rl_engine.kernels.gtest.tolerance import load_contract
 
 
 def test_load_contract_contains_expected_operator_classes():

From 6f361c4ad6178b1282750cc18d3a1ea8c44a420a Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Mon, 29 Jun 2026 19:44:34 +0800
Subject: [PATCH 05/11] add english doc and TODO for _run_case

---
 docs/contributing/issue-108-session-log.md | 1088 ++++----------------
 rl_engine/kernels/gtest/op_checks.py       |    1 +
 2 files changed, 208 insertions(+), 881 deletions(-)

diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md
index f49ddde..dd66a1d 100644
--- a/docs/contributing/issue-108-session-log.md
+++ b/docs/contributing/issue-108-session-log.md
@@ -1,874 +1,277 @@
 # ISSUE-108 Session Log
 
-本文档记录本 session 中围绕 RL-Kernel 算子测试框架、CUDA 验证和 upstream 同步的所有关键修改。后续本 session 中每次代码修改都必须继续追加到本文档，记录目标、设计判断、修改文件、验证方式和结果。
+This document records the engineering decisions made while building the ISSUE-108 kernel correctness checker. It is intentionally concise and review-oriented: it explains what was added, why it was added, how to use it, and what is still out of scope.
 
-## 记录原则
+## Logging Rules
 
-- 使用中文记录工程判断。
-- 每个改动都需要说明为什么做，而不只记录改了什么。
-- 保持最小增量：一次修改尽量只围绕一个明确问题。
-- 每个子任务需要能独立验证；无法验证时必须明确说明原因。
-- CUDA 验证必须如实记录环境、命令、误差和失败范围。
-- 不把失败路径写成已支持能力。
+- Record the reason for each meaningful change, not only the files touched.
+- Keep changes minimal and independently verifiable.
+- Be explicit when a path is only a smoke test or an experimental path.
+- Do not present failed CUDA paths as supported capabilities.
+- Gold implementations must come from `rl_engine.kernels.ops.pytorch`.
 
-## 总体目标
+## Goal
 
-本 session 的目标是把算子验证从零散脚本推进为可复用、可扩展、可审查的工程化框架：
+The goal of this work is to add a minimal, reusable operator correctness framework for post-training kernels.
 
-- 建立统一 tolerance contract，用于管理不同算子的误差阈值。
-- 建立公共 operator check runner，替代单算子专用验证脚本。
-- 建立统一 operator input 生成逻辑，覆盖后训练常见算子的基础输入。
-- 将测试入口改造成可指定 `op`、`candidate`、`dtype`、`device`、shape 参数的 CLI。
-- 同步 upstream/main，吸收 PR #122 中的 SM90 相关修复。
-- 在 H20 机器上验证普通 CUDA `fused_logp` 路径。
-- 明确 SM90 `fused_linear_logp` 在 CUDA 12.4 下仍未通过。
+The framework should:
 
-## 时间线
+- Generate deterministic operator inputs.
+- Run PyTorch gold implementations and backend candidates on the same inputs.
+- Compare every tensor output with dtype/operator-class tolerances.
+- Report absolute error, relative error, pass rate, and final pass/fail status.
+- Expose a CLI so a developer can validate a backend candidate without editing test files.
 
-### 1. tolerance table 和 contract loader
-
-目标：
-
-- 将不同 dtype、op class 的误差容差从测试代码中抽离出来。
-- 让误差阈值可以被审查和维护，而不是散落在测试断言中。
-
-修改文件：
-
-- `rl_engine/testing/tolerance.py`
-- `rl_engine/testing/tolerance_contract.yaml`
-- `tests/test_tolerance_contract.py`
-
-设计判断：
-
-- 使用 YAML 保存 contract，便于人工 review。
-- 将容差按 `accuracy.default` 和可选硬件 override 组织。
-- `default` 是通用 fallback，不等同于 CPU；CPU、SM90、SM100、ROCm、Ascend 等未来可作为明确 override key。
-
-验证：
-
-- `tests/test_tolerance_contract.py` 验证 contract 可读、结构正确。
-
-结果：
-
-- tolerance contract 框架建立完成。
-
-### 2. operator check runner
-
-目标：
-
-- 建立类似 GoogleTest 思路的算子验证 runner。
-- 一个 case 表示一组确定输入和 gold path，一个 candidate 表示被测实现。
-
-修改文件：
-
-- `rl_engine/testing/op_checks.py`
-- `tests/test_op_checks.py`
-- `rl_engine/testing/__init__.py`
-
-设计判断：
-
-- `OperatorCase` 表示测试对象：`name`、`op_class`、`dtype`、`inputs`、`gold_fn`。
-- `CandidateSpec` 表示被测实现：`name`、`fn`、`backend`、`arch_key`。
-- runner 负责：
-  - 调用 candidate。
-  - 调用 gold。
-  - flatten 多输出。
-  - 按 `op_class + dtype + arch_key` 解析容差。
-  - 计算 `max_abs_error`、`mean_abs_error`、`max_rel_error`。
-  - 返回结构化 report。
-
-验证：
-
-- `tests/test_op_checks.py` 覆盖 native logp、registry logp、失败 candidate、arch override 等场景。
-
-结果：
-
-- 公共 operator check runner 建立完成。
-
-### 3. `check_operator.py` 从 logp 专用入口改为公共入口
-
-目标：
-
-- 让测试者通过 CLI 指定算子、candidate、dtype、device 和 shape。
-- 避免后续每个算子都写一个独立测试脚本。
-
-修改文件：
-
-- `scripts/check_operator.py`
-
-设计判断：
-
-- `check_operator.py` 只负责：
-  - 解析参数。
-  - 选择 device/dtype。
-  - 调用 `make_candidate`。
-  - 调用 `make_operator_case`。
-  - 调用 `run_operator_suite`。
-  - 输出 summary 或 JSON。
-- 不在入口中硬编码具体算子实现。
-
-验证：
-
-```bash
-python scripts/check_operator.py --op logp --candidate pytorch --dtype fp32 --batch 1 --seq 4 --vocab 17
-python scripts/check_operator.py --op logp --candidate registry --dtype bf16 --batch 2 --seq 16 --vocab 257 --json
-python -m pytest tests/test_op_checks.py -q
-```
-
-结果：
-
-- 公共 CLI 最小闭环通过。
-
-### 4. 抽离 operator specs
-
-目标：
-
-- 避免新增算子时修改测试入口。
-- 将算子元信息集中到专门文件。
-
-修改文件：
-
-- `rl_engine/testing/operator_specs.py`
-
-设计判断：
-
-- 每个算子通过 `OperatorSpec` 描述：
-  - `name`
-  - `op_class`
-  - `gold_path`
-  - `registry_name`
-  - `candidate_paths`
-- `check_operator.py` 不直接知道某个算子的 Python 类路径。
-- `--candidate cuda` 明确选择 CUDA candidate。
-- `--candidate registry` 仅用于测试 dispatcher 分发结果，不作为具体 CUDA correctness 的替代。
-
-当前 logp 映射：
-
-```text
-pytorch      -> NativeLogpOp
-cuda         -> FusedLogpGenericOp
-cuda-generic -> FusedLogpGenericOp
-cuda-sm90    -> FusedLogpSM90Op
-registry     -> kernel_registry.get_op("logp")
-```
-
-结果：
-
-- 后续新增算子主要扩展 `operator_specs.py`，不再修改公共入口。
-
-### 5. 统一 operator input 工厂
-
-目标：
-
-- 用户指出不希望每个新算子都手写 `_make_xxx_inputs` 和 `_xxx_shape_name`。
-- 统一准备 ISSUE #108 中所有算子的输入初始化。
-
-修改文件：
-
-- `rl_engine/testing/operator_inputs.py`
-- `tests/test_operator_inputs.py`
-- `rl_engine/testing/operator_specs.py`
-- `scripts/check_operator.py`
-
-设计判断：
-
-- 新增 `make_operator_inputs(op_name, args, dtype, device)`。
-- 新增 `operator_shape_name(op_name, args)`。
-- 支持 `random` 和 `constant` 两种输入模式：
-  - `random` 用 seed 控制可复现。
-  - `constant` 用固定值便于 debug。
-- 支持的算子输入：
-  - `rms_norm`
-  - `matmul`
-  - `attention`
-  - `logp`
-  - `rope`
-  - `silu`
-  - `swiglu`
-  - `embedding`
-  - `lm_head`
-  - `kv_cache_attention`
-
-CLI 增加参数：
+## Final Layout
 
 ```text
---input-mode random|constant
---constant-value
---token-value
---normalized-dim
---k-dim
---n-dim
---theta
---eps
-```
-
-验证：
-
-```bash
-python -m pytest tests/test_operator_inputs.py -q
-python -m pytest tests/test_op_checks.py -q
-python scripts/check_operator.py --op logp --candidate pytorch --dtype fp32 --batch 1 --seq 4 --vocab 17 --input-mode constant --constant-value 0.5 --token-value 3
-```
-
-结果：
-
-- 多算子输入生成能力建立。
-- logp CLI 随机输入和固定输入均通过。
-
-### 6. 删除模型命名和档位参数
-
-目标：
-
-- 用户指出测试框架不应绑定某个模型，也不需要冗余 `--size` 档位。
-- 输入生成应表达为通用数据规模，而不是某个模型配置。
-
-修改文件：
-
-- `rl_engine/testing/operator_inputs.py`
-- `scripts/check_operator.py`
-- `tests/test_operator_inputs.py`
-
-设计判断：
-
-- 删除 `QWEN3_8B` 命名。
-- 删除 `ModelShape` / `DEFAULT_MODEL_SHAPE` 抽象。
-- 改为普通常量：
-
-```python
-DEFAULT_HIDDEN = 4096
-DEFAULT_N_HEADS = 32
-DEFAULT_N_KV_HEADS = 8
-DEFAULT_HEAD_DIM = 128
-DEFAULT_INTERMEDIATE = 12288
-DEFAULT_VOCAB = 151936
-DEFAULT_ROPE_THETA = 1.0e6
-DEFAULT_RMS_EPS = 1.0e-6
-```
-
-- 删除 `--size small|medium|large`。
-- 仅保留显式 `--batch` 和 `--seq`。
-
-验证：
-
-```bash
-rg -n "Qwen|QWEN|qwen" rl_engine/testing scripts tests
-rg -n "ModelShape|DEFAULT_MODEL|model" rl_engine/testing/operator_inputs.py
-rg -n "BATCH_SHAPES|BatchShape|--size|small|medium|large" scripts/check_operator.py rl_engine/testing/operator_inputs.py tests/test_operator_inputs.py
-python -m pytest tests/test_operator_inputs.py tests/test_op_checks.py -q
-```
-
-结果：
-
-- 模型耦合和档位参数均已删除。
-- 相关测试通过。
-
-### 7. 同步 upstream/main 并处理冲突
-
-目标：
-
-- 用户要求检查当前代码是否过旧，拉取最新代码，如有冲突则解决。
-
-操作：
-
-```bash
-git fetch --all --prune
-git stash push -u -m pre-upstream-main-sync
-git rebase upstream/main
-git stash pop
-```
-
-冲突文件：
-
-- `csrc/cuda/fused_logp_sm90.cu`
-
-设计判断：
-
-- `upstream/main` 已包含 PR #122。
-- PR #122 中已经包含 SM90 文件的两项修复：
-  - `#include <math_constants.h>`
-  - `reinterpret_cast<const nv_bfloat16*>`
-- 因此冲突解决时采用 upstream/main 版本。
-- 丢弃本地临时加入的 `#include <cuda_runtime.h>`。
+rl_engine/kernels/gtest/
+  __init__.py
+  op_checks.py
+  operator_inputs.py
+  operator_specs.py
+  tolerance.py
+  tolerance_contract.yaml
 
-验证：
+scripts/check_operator.py
 
-```bash
-python -m pytest tests/test_operator_inputs.py tests/test_op_checks.py -q
+tests/test_op_checks.py
+tests/test_operator_inputs.py
+tests/test_tolerance_contract.py
 ```
 
-结果：
-
-- rebase 到最新 `upstream/main` 成功。
-- 冲突解决完成。
-- 本地相对 `upstream/main` 为 `ahead 3, behind 0`。
-
-### 8. H20 CUDA 环境和普通 CUDA logp 验证
+## Key Design Decisions
 
-目标：
+### Tolerance Contract
 
-- 将测试框架迁移到 H 系列 GPU 环境验证。
-- 先确认普通 CUDA `fused_logp` 路径是否可用。
-
-环境记录：
+Files:
 
 ```text
-GPU: NVIDIA H20
-Driver: 565.57.01
-Driver CUDA capability: 12.7
-nvcc: 12.4
-Python: 3.11.15
-```
-
-普通 CUDA 扩展检查：
-
-```text
-_EXT_AVAILABLE: True
-has fused_logp: True
-has fused_logp_sm90: False
-```
-
-验证命令：
-
-```bash
-python scripts/check_operator.py \
-  --op logp \
-  --candidate cuda \
-  --device cuda \
-  --dtype bf16 \
-  --arch-key sm90 \
-  --batch 2 \
-  --seq 16 \
-  --vocab 257
+rl_engine/kernels/gtest/tolerance.py
+rl_engine/kernels/gtest/tolerance_contract.yaml
+tests/test_tolerance_contract.py
 ```
 
-输出：
-
-```text
-INFO [RL-Kernel]: Successfully linked to precompiled _C.fused_logp fallback kernel.
-suite=logp passed=True pass_rate=1.0000
-candidate=cuda-logp backend=cuda passed=True pass_rate=1.0000
-case=logp-torch.bfloat16-2x16x257 output=0 shape=(2, 16)
-max_abs=1.49779320e-02
-mean_abs=7.53845274e-03
-max_rel=2.70811981e-03
-tol=(atol=5.000e-02, rtol=0.000e+00)
-passed=True
-```
+Decision:
 
-结论：
+- Store tolerance values in a small contract file rather than hard-coding them inside tests.
+- Resolve tolerance by `op_class + dtype`, with optional `arch_key` overrides.
+- Treat `default` as the generic fallback, not as CPU-specific tolerance.
 
-- 普通 CUDA `FusedLogpGenericOp -> _C.fused_logp` 路径通过。
-- 这证明测试框架最小 GPU 闭环已经打通：
+Current accuracy classes:
 
 ```text
-CLI
--> operator_specs
--> operator_inputs
--> PyTorch gold
--> CUDA candidate
--> run_operator_suite
--> tolerance contract
--> compare_output
--> structured report
+elementwise
+reduction
+logprob
 ```
 
-### 9. SM90 fused_linear_logp 当前状态
-
-目标：
-
-- 尝试编译和验证 SM90 路径。
-
-结果：
+### Operator Check Runner
 
-- `fused_logp_sm90` 的旧 include 和 type 问题已由 upstream PR #122 解决。
-- 但 `fused_linear_logp_sm90.cu` 在 CUDA 12.4 下仍未通过 ptxas。
-
-错误摘要：
+Files:
 
 ```text
-ptxas error: State space incorrect for instruction 'cp.async.bulk.tensor'
-ptxas fatal: Ptx assembly aborted due to errors
+rl_engine/kernels/gtest/op_checks.py
+tests/test_op_checks.py
 ```
 
-设计判断：
+Decision:
 
-- PR #122 描述中提到相关 SM90 路径在 CUDA 13.1 下 assembled。
-- 当前 H20 环境是 nvcc 12.4，不应将该路径写为已通过。
-- 当前应先以普通 CUDA `fused_logp` 作为验证通过范围。
+- `OperatorCase` describes one deterministic test case: name, op class, dtype, inputs, and gold function.
+- `CandidateSpec` describes one implementation under test: name, function, backend, and optional arch key.
+- `run_operator_suite()` runs candidates against gold outputs and returns structured reports.
+- The runner compares forward outputs only in this minimal version.
 
-结论：
+Review follow-up:
 
-- 已通过：`--candidate cuda`
-- 未通过：`--candidate cuda-sm90` / SM90 fused linear logp
+- `op_checks.py` includes a TODO for optional gradient checks on differentiable operators.
+- Gradient checks require additional metadata and input cloning rules, so they are intentionally tracked as follow-up work instead of being silently implied by this PR.
 
-## 当前文件状态摘要
+### Operator Inputs
 
-本 session 产生或涉及的主要文件：
+Files:
 
 ```text
-rl_engine/testing/tolerance.py
-rl_engine/testing/tolerance_contract.yaml
-tests/test_tolerance_contract.py
-rl_engine/testing/op_checks.py
-tests/test_op_checks.py
-rl_engine/testing/__init__.py
-scripts/check_operator.py
-rl_engine/testing/operator_specs.py
-rl_engine/testing/operator_inputs.py
+rl_engine/kernels/gtest/operator_inputs.py
 tests/test_operator_inputs.py
-csrc/cuda/fused_logp_sm90.cu
-docs/contributing/issue-108-session-log.md
-```
-
-说明：
-
-- `AGENTS.md` 是未跟踪文件，未纳入本 session 的代码修改范围。
-- `csrc/cuda/fused_logp_sm90.cu` 最终与 upstream PR #122 版本一致。
-
-## 后续记录模板
-
-之后每次代码修改都在本文档追加如下条目：
-
-```markdown
-### YYYY-MM-DD HH:MM - 变更标题
-
-目标：
-
-- 本次最小子任务要解决什么问题。
-
-修改文件：
-
-- `path/to/file.py`
-
-设计决策：
-
-- 为什么这样改。
-- 为什么没有选择其他方案。
-
-验证方式：
-
-- 执行的测试命令。
-- CUDA 环境，如 GPU、CUDA 版本、driver、arch。
-- 关键输出指标。
-
-结果：
-
-- 通过 / 未通过 / 部分通过。
-- 未通过时必须记录完整错误摘要。
-
-后续：
-
-- 是否需要继续拆分子任务。
-- 是否影响 CI、benchmark 或其他算子。
-```
-
-CUDA 验证建议额外记录：
-
-```markdown
-GPU:
-CUDA:
-Driver:
-Arch:
-Candidate:
-Backend:
-Command:
-max_abs:
-mean_abs:
-max_rel:
-atol:
-rtol:
-Result:
-Known issue:
 ```
 
-### 2026-06-28 - CUDA 13 CUB reduce functor 兼容修复
+Decision:
 
-目标：
+- Build standard semantic inputs for each operator.
+- Support both `random` and `constant` initialization.
+- Make random inputs reproducible with `--seed`.
+- Preserve semantic shapes such as `[B, S, V]`; do not flatten inputs for backend-specific kernels inside input generation.
 
-- 修复 H100 + CUDA 13.0 环境下 SM90 编译失败的问题。
-
-修改文件：
-
-- `csrc/cuda/fused_logp_sm90.cu`
-- `docs/contributing/issue-108-session-log.md`
-
-错误摘要：
+Current input builders cover:
 
 ```text
-csrc/cuda/fused_logp_sm90.cu(76): error: namespace "cub" has no member "Max"
-csrc/cuda/fused_logp_sm90.cu(86): error: namespace "cub" has no member "Sum"
-```
-
-设计决策：
-
-- 不继续依赖 CUB 内置 `cub::Max()` 和 `cub::Sum()` functor 名称。
-- 在当前 SM90 文件内定义本地 `FloatMax` 和 `FloatSum`，传给 `cub::BlockReduce::Reduce`。
-- 这样保留原有 reduction 语义，同时规避 CUDA 13 / CCCL 中 CUB functor API 变化。
-
-验证方式：
-
-- 本地只做源码修改；需要在 H100 + CUDA 13.0 机器上重新执行：
-
-```bash
-rm -rf build
-find rl_engine -name "*.so" -delete
-
-export CUDA_HOME=/usr/local/cuda
-export PATH=$CUDA_HOME/bin:$PATH
-export LD_LIBRARY_PATH=$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}
-export OMP_NUM_THREADS=8
-export MAX_JOBS=1
-
-KERNEL_ALIGN_FORCE_SM90=1 pip install -v --no-build-isolation -e . 2>&1 | tee build_sm90.log
+rms_norm
+matmul
+attention
+logp
+rope
+silu
+swiglu
+embedding
+lm_head
+kv_cache_attention
 ```
 
-结果：
-
-- 待 H100 机器重新编译确认。
+### Operator Specs
 
-后续：
-
-- 如果继续失败，优先查看 `grep -nE "FAILED:|error:|ptxas|fatal" build_sm90.log | head -n 80`。
-
-### 2026-06-28 - 对齐 SM90 LogP Python wrapper 输入接口
-
-目标：
-
-- 修复 `check_operator.py --candidate cuda-sm90` 调用失败的问题。
-
-修改文件：
-
-- `rl_engine/kernels/ops/cuda/loss/logp.py`
-- `docs/contributing/issue-108-session-log.md`
-
-错误摘要：
+File:
 
 ```text
-TypeError: FusedLogpSM90Op.__call__() got an unexpected keyword argument 'token_ids'
+rl_engine/kernels/gtest/operator_specs.py
 ```
 
-设计决策：
-
-- 测试框架统一通过 `case.inputs` 传递 `token_ids`。
-- `FusedLogpGenericOp`、`NativeLogpOp` 都使用 `token_ids` 命名。
-- 因此将 `FusedLogpSM90Op.__call__(logits, labels)` 改为 `__call__(logits, token_ids)`，让 candidate 接口与 gold/case 输入一致。
-- 同时在 wrapper 内部把 `[B, S, V]` logits reshape 为 `[B*S, V]`，把 `[B, S]` token ids flatten 为 `[B*S]`，再把 `_C.fused_logp_sm90` 的 `[B*S]` 输出 reshape 回 `[B, S]`。
+Decision:
 
-验证方式：
+- Keep operator-specific registration outside `scripts/check_operator.py`.
+- Register PyTorch gold paths and backend candidate paths in one place.
+- Require `gold_path` to point into `rl_engine.kernels.ops.pytorch`.
 
-- 需要在 H100 + CUDA 13.0 机器上同步该 patch 后运行：
+Current minimal registered operator:
 
-```bash
-python scripts/check_operator.py \
-  --op logp \
-  --candidate cuda-sm90 \
-  --device cuda \
-  --dtype bf16 \
-  --arch-key sm90 \
-  --batch 2 \
-  --seq 16 \
-  --vocab 257
+```text
+op: logp
+op_class: logprob
+gold: rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp
+candidates:
+  pytorch      -> NativeLogpOp
+  cuda         -> FusedLogpGenericOp
+  cuda-generic -> FusedLogpGenericOp
+  cuda-sm90    -> FusedLogpSM90Op
+  registry     -> kernel_registry.get_op("logp")
 ```
 
-结果：
-
-- 已撤回该方向。`rl_engine/kernels/ops/cuda/loss/logp.py` 属于被测 CUDA 算子实现，不应为了测试框架改动其接口。
-- 后续适配应放在 testing 层，例如在 candidate adapter 中把测试框架统一的 `token_ids` 映射为 SM90 wrapper 需要的 `labels`，并处理 flatten/reshape。
+Important note:
 
-### 2026-06-28 - 在 testing 层适配 SM90 LogP candidate
+- `candidate=pytorch` is only a smoke test for the checker itself.
+- CUDA, Triton, ROCm, and future hardware-specific implementations are candidates.
+- Do not compare two operators that implement different math, such as ordinary `logp` and `linear_logp`.
 
-目标：
+### SM90 Adapter Exception
 
-- 保持 `rl_engine/kernels/ops/cuda` 下被测实现不变。
-- 让 `check_operator.py --candidate cuda-sm90` 可以使用测试框架统一的 `token_ids` 输入。
+Current code contains `_LogpSM90CandidateAdapter` in `operator_specs.py`.
 
-修改文件：
+Reason:
 
-- `rl_engine/testing/operator_specs.py`
-- `docs/contributing/issue-108-session-log.md`
+- The existing SM90 logp wrapper accepts flattened inputs, while the checker standard input for `logp` is `[B, S, V]` logits and `[B, S]` token ids.
+- The adapter exists only to validate the checker path against the current SM90 wrapper.
 
-设计决策：
+Long-term rule:
 
-- 新增 `_LogpSM90CandidateAdapter`，只在 `args.op == "logp"` 且 `candidate == "cuda-sm90"` 时使用。
-- adapter 接收测试框架标准输入 `logits` 和 `token_ids`。
-- adapter 内部把 `logits` 从 `[B, S, V]` flatten 为 `[B*S, V]`，把 `token_ids` 从 `[B, S]` flatten 为 `[B*S]`。
-- adapter 调用原始 SM90 candidate：`self._candidate(logits_2d, labels_1d)`。
-- adapter 将输出 reshape 回 `[B, S]`，以便 `compare_output` 按原始 case shape 比较。
+- Backend wrappers should align with the standard operator interface whenever possible.
+- New operators should not rely on permanent test-side adapters for ordinary shape or parameter-name differences.
 
-验证方式：
+## CLI Usage
 
-- 本地执行 Python 测试和编译检查：
+CPU smoke check against the PyTorch candidate:
 
 ```bash
-python -m py_compile rl_engine/testing/operator_specs.py
-python -m pytest tests/test_op_checks.py tests/test_operator_inputs.py -q
+python scripts/check_operator.py   --op logp   --candidate pytorch   --device cpu   --dtype fp32   --batch 1   --seq 2   --vocab 17
 ```
 
-- H100 机器需要重新运行：
+CUDA candidate check against the PyTorch gold path:
 
 ```bash
-python scripts/check_operator.py \
-  --op logp \
-  --candidate cuda-sm90 \
-  --device cuda \
-  --dtype bf16 \
-  --arch-key sm90 \
-  --batch 2 \
-  --seq 16 \
-  --vocab 257
-```
-
-结果：
-
-- 本地待验证；H100 CUDA 结果待重新运行确认。
-
-### 2026-06-28 - H100 CUDA generic 与 SM90 LogP 对照验证
-
-目标：
-
-- 记录 H100 + CUDA 13.0 环境下 `logp` 的 generic CUDA 和 SM90 candidate 行为差异。
-- 明确测试框架已能区分“通过的 CUDA generic candidate”和“编译/运行存在问题的 SM90 candidate”。
-
-环境：
-
-```text
-GPU: NVIDIA H100 80GB HBM3
-Driver: 580.95.05
-CUDA driver capability: 13.0
-nvcc: 13.0
-Python: 3.12.13
-torch: 2.12.0+cu130
-torch cuda: 13.0
-compute capability: (9, 0)
+python scripts/check_operator.py   --op logp   --candidate cuda   --device cuda   --dtype bf16   --arch-key sm90   --batch 1   --seq 1   --vocab 4096
 ```
 
-修改文件：
-
-- `docs/contributing/issue-108-session-log.md`
-
-验证命令：
+JSON report:
 
 ```bash
-for v in 256 512 1024 2048 4096; do
-  echo "=== vocab=$v ==="
-  python scripts/check_operator.py \
-    --op logp \
-    --candidate cuda \
-    --device cuda \
-    --dtype bf16 \
-    --arch-key sm90 \
-    --batch 1 \
-    --seq 1 \
-    --vocab $v
-done
+python scripts/check_operator.py   --op logp   --candidate pytorch   --device cpu   --dtype fp32   --batch 1   --seq 2   --vocab 17   --json
 ```
 
-generic CUDA 结果：
+Supported key options:
 
 ```text
-vocab=256:  passed=True, max_abs=5.77497482e-03
-vocab=512:  passed=True, max_abs=8.04328918e-03
-vocab=1024: passed=True, max_abs=1.80721283e-04
-vocab=2048: passed=True, max_abs=1.77164078e-02
-vocab=4096: passed=True, max_abs=2.69813538e-02
-atol=5.000e-02, rtol=0.000e+00
-```
-
-结论：
-
-- `FusedLogpGenericOp -> _C.fused_logp` 在 H100 + CUDA 13.0 上多 vocab correctness 全部通过。
-- 这进一步确认测试框架、input 生成、gold path、candidate 调用和 compare_output 链路是通的。
-
-SM90 对照现象：
+--op              Operator name. The minimal version supports logp.
+--candidate       Candidate backend, for example pytorch, cuda, cuda-generic, cuda-sm90, registry.
+--dtype           fp32, bf16, or fp16.
+--device          auto, cpu, cuda, or another torch device string.
+--arch-key        Optional tolerance override key such as sm90.
+--batch           Batch size.
+--seq             Sequence length.
+--vocab           Vocabulary size.
+--input-mode      random or constant.
+--constant-value  Floating-point value for constant mode.
+--token-value     Token id for constant mode, reduced modulo vocab.
+--seed            Random seed for reproducible random inputs.
+--json            Print the full structured report as JSON.
+```
+
+Example output:
 
 ```text
-TILE_V=4096:
-  vocab=257/4096/151936 均在 cuTensorMapEncodeTiled 失败。
-  错误：CUDA_ERROR_INVALID_VALUE。
-
-TILE_V=256:
-  vocab=256 返回结果，但 passed=False，max_abs≈1.04094028e+00。
-  vocab=512/1024/2048/4096 在 20s timeout 下没有输出 report，表现为 hang/timeout。
+suite=logp passed=True pass_rate=1.0000
+candidate=cuda-logp backend=cuda passed=True pass_rate=1.0000
+  case=logp-torch.bfloat16-1x1x4096 output=0 shape=(1, 1) dtype=torch.bfloat16 max_abs=2.69813538e-02 mean_abs=2.69813538e-02 max_rel=3.03093810e-03 tol=(atol=5.000e-02, rtol=0.000e+00) passed=True
 ```
 
-结论：
+## Adding a New Operator
 
-- `cuda-sm90` 已能编译和加载，但当前 SM90 TMA kernel 仍不能标记为通过。
-- 当前通过范围只包括 `--candidate cuda` generic CUDA logp。
-- SM90 问题应作为独立 CUDA kernel bugfix 处理，不归因于测试框架。
+To add a new operator, keep the shared checker flow unchanged. Add only operator-specific inputs, specs, and tests.
 
-### 2026-06-28 - 最终整理：不提交 CUDA 源码改动
+### 1. Add Input Generation
 
-目标：
-
-- 用户明确要求本阶段不修改 `csrc` 下 CUDA/TMA 源码。
-- 本阶段只提交算子测试框架和文档，不把 SM90 kernel 实验 patch 混入测试框架 PR。
-
-本地处理：
-
-- 已还原 `csrc/cuda/fused_logp_sm90.cu`。
-- 本地 `csrc/utils/tma_utils.cuh` 没有 diff。
-- 因此本地最终不会提交任何 `csrc` 改动。
-
-服务器状态对照：
-
-用户在 H100 服务器上看到：
-
-```text
-Changes not staged for commit:
-  modified:   csrc/cuda/fused_logp_sm90.cu
-  modified:   csrc/utils/tma_utils.cuh
-```
-
-本地当前状态不同：
+File:
 
 ```text
-csrc/cuda/fused_logp_sm90.cu: no diff after restore
-csrc/utils/tma_utils.cuh: no diff locally
-```
-
-结论：
-
-- H100 服务器上的 `csrc/utils/tma_utils.cuh` 改动不是本地当前工作区的一部分。
-- 如果服务器要回到与本地一致的测试框架提交状态，需要在服务器上还原两个 CUDA/TMA 文件：
-
-```bash
-git restore csrc/cuda/fused_logp_sm90.cu csrc/utils/tma_utils.cuh
-```
-
-保留记录的 CUDA 现象：
-
-- H100 环境：
-
-```text
-GPU: NVIDIA H100 80GB HBM3
-Driver: 580.95.05
-CUDA driver capability: 13.0
-nvcc: 13.0.88
-Python: 3.12.13
-torch: 2.12.0+cu130
-torch cuda: 13.0
-compute capability: (9, 0)
+rl_engine/kernels/gtest/operator_inputs.py
 ```
 
-- `--candidate cuda` generic logp 在 vocab 256/512/1024/2048/4096 上通过。
-- `--candidate cuda-sm90` 可以编译和加载，但：
-  - `TILE_V=4096` 触发 `cuTensorMapEncodeTiled failed`。
-  - `TILE_V=256` 时 vocab=256 返回但数值不通过，vocab>=512 出现 timeout/hang。
-- 所以 SM90 fused logp 目前记录为 CUDA kernel 问题，不作为测试框架失败。
-
-### 2026-06-28 - 测试框架目录归位
-
-目标：
+Update `make_operator_inputs()`:
 
-- 用户指出测试框架文件放在通用 `rl_engine/testing` 下过于分散。
-- 本阶段将 operator correctness checking 代码移动到 kernel 相关目录，避免和 RL batch/reference testing 混在一起。
-
-最终目录：
-
-```text
-rl_engine/kernels/gtest/
-  __init__.py
-  op_checks.py
-  operator_inputs.py
-  operator_specs.py
-  tolerance.py
-  tolerance_contract.yaml
+```python
+builders = {
+    ...
+    "new_op": _make_new_op_inputs,
+}
 ```
 
-职责划分：
-
-- `op_checks.py`
-  - 定义 `OperatorCase`、`CandidateSpec`、report dataclass。
-  - 调用 gold 和 candidate。
-  - flatten 输出。
-  - 解析 tolerance。
-  - 计算 `max_abs_error`、`mean_abs_error`、`max_rel_error`。
-  - 返回通过率和结构化 report。
-- `operator_inputs.py`
-  - 统一构造标准语义输入。
-  - 支持 `random` 和 `constant`。
-  - 支持 `batch`、`seq`、`vocab` 等 CLI 参数。
-  - 当前覆盖 ISSUE-108 相关算子的输入初始化骨架。
-- `operator_specs.py`
-  - 注册每个算子的 gold path 和 candidate path。
-  - gold path 必须来自 `rl_engine.kernels.ops.pytorch`。
-  - candidate path 来自 `cuda`、`triton`、`rocm` 或未来 backend。
-- `tolerance.py` 和 `tolerance_contract.yaml`
-  - 加载 dtype/operator-class 容差表。
-  - 供 `op_checks.py` 在 compare output 时解析 `atol` 和 `rtol`。
-- `scripts/check_operator.py`
-  - 命令行入口。
-  - 不直接硬编码具体算子实现。
-
-导入边界：
-
-- operator checking 框架只从 `rl_engine.kernels.gtest` 导入。
-- `rl_engine/testing/__init__.py` 不导出 `CandidateSpec`、`OperatorCase`、`run_operator_suite`。
-- 这样可以避免 kernel correctness checking 和通用 RL testing helper 混在一起。
-
-### 2026-06-28 - 添加新算子的傻瓜式流程
-
-目标：
-
-- 新增算子时不修改测试主逻辑。
-- 新增算子只改注册信息、输入工厂和必要测试。
-- gold 永远使用 `rl_engine/kernels/ops/pytorch` 下实现。
-
-步骤 1：确认算子标准接口
-
-先确定这个算子的标准语义输入。例如：
+Update `operator_shape_name()`:
 
-```text
-logp:
-  inputs:
-    logits: [B, S, V]
-    token_ids: [B, S]
-  output:
-    selected_logp: [B, S]
+```python
+names = {
+    ...
+    "new_op": f"{batch}x{seq}x...",
+}
 ```
 
-要求：
-
-- PyTorch gold、CUDA、Triton、ROCm wrapper 都应尽量使用同一套 Python 接口。
-- 不同 backend 不应要求测试框架长期维护 shape/参数名 adapter。
-- 当前 `_LogpSM90CandidateAdapter` 只是为了验证框架最小闭环的临时例外，不作为长期模式。
+Add the input builder:
 
-步骤 2：在 `operator_inputs.py` 添加输入构造
-
-文件：
-
-```text
-rl_engine/kernels/gtest/operator_inputs.py
+```python
+def _make_new_op_inputs(args, dtype, device):
+    batch, seq = _batch_seq(args)
+    return {
+        "x": _floating_tensor((batch, seq, ...), args, dtype, device, offset=0),
+    }
 ```
 
-需要做三件事：
-
-1. 在 `make_operator_inputs()` 的 `builders` 中加入算子名。
-2. 在 `operator_shape_name()` 的 `names` 中加入 shape 描述。
-3. 新增 `_make_xxx_inputs(args, dtype, device)`。
-
-要求：
+Rules:
 
-- 输入必须是标准语义输入，不是某个 CUDA kernel 的私有格式。
-- `random` 模式必须可由 `--seed` 复现。
-- `constant` 模式必须便于 debug。
-- 多 batch 情况默认保留 `[B, S, ...]` 语义形状，不提前为某个 backend flatten。
+- Inputs should represent the operator's standard semantic interface.
+- Do not generate backend-specific flattened inputs here.
+- Support deterministic random inputs and constant inputs where practical.
 
-步骤 3：在 `operator_specs.py` 注册 gold 和 candidate
+### 2. Register Gold and Candidates
 
-文件：
+File:
 
 ```text
 rl_engine/kernels/gtest/operator_specs.py
 ```
 
-添加：
+Add an `OperatorSpec` entry:
 
 ```python
 "new_op": OperatorSpec(
     name="new_op",
-    op_class="...",
+    op_class="elementwise",
     gold_path="rl_engine.kernels.ops.pytorch....NativeNewOp",
     registry_name="new_op",
     candidate_paths={
@@ -879,166 +282,89 @@ rl_engine/kernels/gtest/operator_specs.py
 )
 ```
 
-硬性规则：
-
-- `gold_path` 必须来自 `rl_engine.kernels.ops.pytorch`。
-- `candidate_paths["pytorch"]` 只能用于框架自检，不代表高性能算子通过。
-- `candidate_paths["cuda"]`、`candidate_paths["triton"]` 等必须对应实际被测 backend。
-- 不允许用实现了不同数学功能的算子互相比较，例如不能用 `linear_logp` 测普通 `logp`。
-
-步骤 4：确认 gold 调用方法
-
-当前 `make_operator_case()` 对 logp 使用：
-
-```python
-gold_fn=gold_op.forward_fp32
-```
+Rules:
 
-这对 `NativeLogpOp` 是正确的。新增算子时必须确认 PyTorch gold 是否有对应方法。
+- `gold_path` must come from `rl_engine.kernels.ops.pytorch`.
+- Backend implementations are candidates only.
+- `candidate=pytorch` is for checker smoke tests only.
+- Do not compare operators with different math.
 
-如果新算子没有 `forward_fp32`，不要在测试主逻辑中硬编码临时分支；应在 `operator_specs.py` 中显式补充 gold 调用策略，作为一个独立小改动提交。
+### 3. Update Tolerances If Needed
 
-步骤 5：新增输入和 runner 单测
-
-至少补两类测试：
+File:
 
 ```text
-tests/test_operator_inputs.py:
-  确认 make_operator_inputs("new_op", ...) 能生成输入。
-  确认 random seed 可复现。
-  确认 constant 模式值正确。
-
-tests/test_op_checks.py 或新测试文件:
-  用 pytorch candidate vs pytorch gold 验证框架能跑通。
-  用 bad candidate 验证失败报告符合预期。
+rl_engine/kernels/gtest/tolerance_contract.yaml
 ```
 
-步骤 6：本地验证
-
-CPU 框架验证：
+Reuse an existing class when possible:
 
-```bash
-python -m pytest tests/test_op_checks.py tests/test_operator_inputs.py -q
+```text
+elementwise
+reduction
+logprob
 ```
 
-CUDA candidate 验证：
+If a new class is needed, add dtype tolerances and set `op_class` accordingly in `operator_specs.py`.
 
-```bash
-python scripts/check_operator.py \
-  --op new_op \
-  --candidate cuda \
-  --device cuda \
-  --dtype bf16 \
-  --arch-key sm90 \
-  --batch 2 \
-  --seq 16
-```
+### 4. Add Tests
 
-如果 CUDA candidate 不通过，先判断：
+Files:
 
 ```text
-1. gold 和 candidate 是否真的是同一个数学函数。
-2. candidate Python wrapper 是否使用标准接口。
-3. 输入 dtype / shape 是否符合 candidate 声明。
-4. 误差是否超过 tolerance。
-5. 是否是 kernel 编译或运行错误。
+tests/test_operator_inputs.py
+tests/test_op_checks.py
 ```
 
-不要为了让测试通过去修改 gold，也不要把不同功能的算子混在一起比较。
+Minimum expected coverage:
 
-### 2026-06-28 - `check_operator.py` 支持参数
+- Add the operator to the `test_operator_inputs_support_all_issue_108_ops` parametrized list.
+- Add a PyTorch-vs-PyTorch smoke case if the operator adds new runner behavior.
+- Add a bad-candidate case if the operator introduces new comparison behavior.
 
-入口：
+### 5. Validate
 
 ```bash
-python scripts/check_operator.py [options]
+python -m pytest tests/test_tolerance_contract.py tests/test_op_checks.py tests/test_operator_inputs.py -q
 ```
 
-核心参数：
-
-```text
---op
-  算子名。当前最小版本支持 logp。
-
---candidate
-  被测实现。当前 logp 支持 registry、pytorch、native、cuda、cuda-generic、cuda-sm90。
-
---dtype
-  fp32、bf16、fp16。
+Then run the CLI:
 
---device
-  auto、cpu、cuda 或 torch 可识别的 device 字符串。
-
---arch-key
-  tolerance override key，例如 sm90。为空时使用 default tolerance。
+```bash
+python scripts/check_operator.py   --op new_op   --candidate pytorch   --device cpu   --dtype fp32
 ```
 
-shape 参数：
-
-```text
---batch
-  batch size，默认 2。
-
---seq
-  sequence length，默认 16。
-
---vocab
-  vocabulary size，默认 257。
+For CUDA:
 
---normalized-dim
-  norm 类算子的 hidden/normalized dimension。
-
---k-dim
-  matmul K dimension。
-
---n-dim
-  matmul N dimension。
+```bash
+python scripts/check_operator.py   --op new_op   --candidate cuda   --device cuda   --dtype bf16   --arch-key sm90
 ```
 
-输入初始化参数：
-
-```text
---input-mode
-  random 或 constant。
+## CUDA Validation Notes
 
---constant-value
-  constant 模式下浮点 tensor 的基础值。
+H100 environment observed during development:
 
---token-value
-  constant 模式下 token id 的基础值，会对 vocab 取模。
-
---seed
-  random 模式下的随机种子。
+```text
+GPU: NVIDIA H100 80GB HBM3
+Driver: 580.95.05
+CUDA driver capability: 13.0
+nvcc: 13.0
+torch: 2.12.0+cu130
+compute capability: (9, 0)
 ```
 
-其他参数：
+Generic CUDA `logp` passed on H100 for vocab sizes 256, 512, 1024, 2048, and 4096 with bf16 inputs under the current tolerance contract.
 
-```text
---theta
-  RoPE theta。
+SM90 fused logp is not marked as a passing path in this PR. It compiled and loaded in some experiments, but runtime failures and accuracy failures were observed separately. Treat SM90 fused logp as a separate CUDA kernel validation task unless `check_operator.py` reports `passed=True` for the target case.
 
---eps
-  norm epsilon。
+## Validation Performed
 
---json
-  输出完整 JSON report。
+```bash
+python -m pytest tests/test_tolerance_contract.py tests/test_op_checks.py tests/test_operator_inputs.py -q
 ```
 
-当前最小可运行示例：
+CPU CLI smoke test:
 
 ```bash
-python scripts/check_operator.py \
-  --op logp \
-  --candidate cuda \
-  --device cuda \
-  --dtype bf16 \
-  --arch-key sm90 \
-  --batch 1 \
-  --seq 1 \
-  --vocab 4096
+python scripts/check_operator.py   --op logp   --candidate pytorch   --device cpu   --dtype fp32   --batch 1   --seq 2   --vocab 17
 ```
-
-预期：
-
-- 在 H100 + CUDA 13.0 环境中，generic CUDA logp 已观察到通过。
-- `cuda-sm90` 当前不应作为通过路径使用。
diff --git a/rl_engine/kernels/gtest/op_checks.py b/rl_engine/kernels/gtest/op_checks.py
index 433bbfc..e0e778c 100644
--- a/rl_engine/kernels/gtest/op_checks.py
+++ b/rl_engine/kernels/gtest/op_checks.py
@@ -146,6 +146,7 @@ def _run_case(
     case: OperatorCase,
     contract: Mapping[str, Any],
 ) -> CaseCheck:
+    # TODO(issue-108): add optional gradient checks for differentiable operators.
     candidate_outputs = _flatten_tensors(_call_candidate(candidate.fn, case.inputs))
     gold_outputs = _flatten_tensors(case.gold_fn(**case.inputs))
     if len(candidate_outputs) != len(gold_outputs):

From 035f21b840d4dad1298c1b19f3b2463d14cf2fe6 Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Mon, 29 Jun 2026 21:26:23 +0800
Subject: [PATCH 06/11] Add logp gradient test

---
 docs/contributing/issue-108-session-log.md | 21 +++++++++++++++++++++
 tests/test_logp.py                         | 14 ++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md
index dd66a1d..6c0c892 100644
--- a/docs/contributing/issue-108-session-log.md
+++ b/docs/contributing/issue-108-session-log.md
@@ -368,3 +368,24 @@ CPU CLI smoke test:
 ```bash
 python scripts/check_operator.py   --op logp   --candidate pytorch   --device cpu   --dtype fp32   --batch 1   --seq 2   --vocab 17
 ```
+
+## PR Review Updates
+
+### LogP Gradient Coverage
+
+Files:
+
+```text
+tests/test_logp.py
+docs/contributing/issue-108-session-log.md
+```
+
+Change:
+
+- Added a forward-gradient test for `NativeLogpOp.forward_fp32`.
+
+Reasoning:
+
+- The checker PR already validates forward output values, but review feedback called out that logprob coverage should also prove gradient propagation and batch invariance.
+- The new gradient test compares the op gradient against a direct PyTorch `log_softmax + gather` reference under a non-unit upstream gradient.
+- Batch invariance was already covered by `TestNativeLogpOpBatchInvariance` in `tests/test_logp.py`, so no duplicate batch-invariance test was added.
diff --git a/tests/test_logp.py b/tests/test_logp.py
index 42492fb..bcb0e3f 100644
--- a/tests/test_logp.py
+++ b/tests/test_logp.py
@@ -78,6 +78,20 @@ def test_pure_function_no_inplace(self):
         assert torch.equal(logits, logits_orig)
         assert torch.equal(token_ids, token_ids_orig)
 
+    def test_forward_fp32_gradient_matches_reference(self):
+        gen = torch.Generator().manual_seed(654)
+        logits = torch.randn(2, 4, 17, generator=gen, requires_grad=True)
+        ref_logits = logits.detach().clone().requires_grad_(True)
+        token_ids = torch.randint(0, logits.size(-1), (2, 4), generator=gen)
+        upstream = torch.randn(2, 4, generator=gen)
+
+        (NativeLogpOp().forward_fp32(logits, token_ids) * upstream).sum().backward()
+        (_reference_selected_logp(ref_logits, token_ids) * upstream).sum().backward()
+
+        assert logits.grad is not None
+        assert ref_logits.grad is not None
+        assert torch.allclose(logits.grad, ref_logits.grad, atol=1e-6, rtol=1e-6)
+
     def test_op_class_is_logprob(self):
         assert NativeLogpOp.op_class == "logprob"
 

From 6dd09a42048162f4dc9c7b7c44a6642f5c822f54 Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Mon, 29 Jun 2026 22:31:31 +0800
Subject: [PATCH 07/11] Add gtest backward checks

---
 docs/contributing/issue-108-session-log.md |  33 ++++++
 rl_engine/kernels/gtest/op_checks.py       | 111 ++++++++++++++++++++-
 rl_engine/kernels/gtest/operator_specs.py  |   3 +
 scripts/check_operator.py                  |  11 +-
 tests/test_op_checks.py                    |  43 ++++++++
 5 files changed, 196 insertions(+), 5 deletions(-)

diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md
index 6c0c892..0644424 100644
--- a/docs/contributing/issue-108-session-log.md
+++ b/docs/contributing/issue-108-session-log.md
@@ -201,6 +201,7 @@ Supported key options:
 --constant-value  Floating-point value for constant mode.
 --token-value     Token id for constant mode, reduced modulo vocab.
 --seed            Random seed for reproducible random inputs.
+--check-grad      Also compare gradients for inputs declared by the operator spec.
 --json            Print the full structured report as JSON.
 ```
 
@@ -369,6 +370,12 @@ CPU CLI smoke test:
 python scripts/check_operator.py   --op logp   --candidate pytorch   --device cpu   --dtype fp32   --batch 1   --seq 2   --vocab 17
 ```
 
+Backward CLI smoke test:
+
+```bash
+python scripts/check_operator.py   --op logp   --candidate pytorch   --device cpu   --dtype fp32   --batch 1   --seq 2   --vocab 17   --check-grad
+```
+
 ## PR Review Updates
 
 ### LogP Gradient Coverage
@@ -389,3 +396,29 @@ Reasoning:
 - The checker PR already validates forward output values, but review feedback called out that logprob coverage should also prove gradient propagation and batch invariance.
 - The new gradient test compares the op gradient against a direct PyTorch `log_softmax + gather` reference under a non-unit upstream gradient.
 - Batch invariance was already covered by `TestNativeLogpOpBatchInvariance` in `tests/test_logp.py`, so no duplicate batch-invariance test was added.
+
+### GTest Backward Check Support
+
+Files:
+
+```text
+rl_engine/kernels/gtest/op_checks.py
+rl_engine/kernels/gtest/operator_specs.py
+scripts/check_operator.py
+tests/test_op_checks.py
+docs/contributing/issue-108-session-log.md
+```
+
+Change:
+
+- Added `OperatorCase.grad_input_names` to declare which inputs should be checked for gradients.
+- Added `run_operator_suite(..., check_grad=True)`.
+- Added `_run_case_backward()` to compare candidate forward outputs and selected input gradients against the PyTorch gold path.
+- Added `OperatorSpec.grad_input_names`; `logp` declares `("logits",)`.
+- Added `scripts/check_operator.py --check-grad`.
+
+Reasoning:
+
+- Forward-only checks can miss incorrect or disconnected backward paths.
+- Gradient inputs must be declared per operator because not every floating tensor should receive gradients.
+- Input generation remains independent of autograd; the runner clones inputs and enables `requires_grad` only inside the backward check path.
diff --git a/rl_engine/kernels/gtest/op_checks.py b/rl_engine/kernels/gtest/op_checks.py
index e0e778c..fd590d9 100644
--- a/rl_engine/kernels/gtest/op_checks.py
+++ b/rl_engine/kernels/gtest/op_checks.py
@@ -21,6 +21,7 @@ class OperatorCase:
     dtype: torch.dtype
     inputs: Mapping[str, Any]
     gold_fn: Callable[..., Any]
+    grad_input_names: tuple[str, ...] = ()
 
 
 @dataclass(frozen=True)
@@ -95,6 +96,7 @@ def run_operator_suite(
     candidates: Sequence[CandidateSpec],
     cases: Sequence[OperatorCase],
     contract: Mapping[str, Any] | None = None,
+    check_grad: bool = False,
 ) -> OperatorCheckReport:
     """Run candidates against gold outputs and return a structured report."""
 
@@ -104,7 +106,8 @@ def run_operator_suite(
     # camdidate : test instance
     # loaded_contract : tolerance table
     candidate_reports = [
-        _run_candidate(candidate, cases, loaded_contract) for candidate in candidates
+        _run_candidate(candidate, cases, loaded_contract, check_grad=check_grad)
+        for candidate in candidates
     ]
     passed_candidates = sum(1 for report in candidate_reports if report.passed)
     total_candidates = len(candidate_reports)
@@ -123,8 +126,11 @@ def _run_candidate(
     candidate: CandidateSpec,
     cases: Sequence[OperatorCase],
     contract: Mapping[str, Any],
+    *,
+    check_grad: bool,
 ) -> CandidateReport:
-    case_checks = [_run_case(candidate, case, contract) for case in cases]
+    case_runner = _run_case_backward if check_grad else _run_case
+    case_checks = [case_runner(candidate, case, contract) for case in cases]
     total_outputs = sum(len(case.outputs) for case in case_checks)
     passed_outputs = sum(
         1 for case in case_checks for output in case.outputs if output.passed
@@ -146,9 +152,68 @@ def _run_case(
     case: OperatorCase,
     contract: Mapping[str, Any],
 ) -> CaseCheck:
-    # TODO(issue-108): add optional gradient checks for differentiable operators.
     candidate_outputs = _flatten_tensors(_call_candidate(candidate.fn, case.inputs))
     gold_outputs = _flatten_tensors(case.gold_fn(**case.inputs))
+    return _compare_case_outputs(candidate, case, contract, candidate_outputs, gold_outputs)
+
+
+def _run_case_backward(
+    candidate: CandidateSpec,
+    case: OperatorCase,
+    contract: Mapping[str, Any],
+) -> CaseCheck:
+    if not case.grad_input_names:
+        raise ValueError(f"case {case.name!r} does not declare gradient inputs")
+
+    candidate_inputs = _clone_inputs_for_backward(case.inputs, case.grad_input_names)
+    gold_inputs = _clone_inputs_for_backward(case.inputs, case.grad_input_names)
+    candidate_outputs = _flatten_tensors(_call_candidate(candidate.fn, candidate_inputs))
+    gold_outputs = _flatten_tensors(case.gold_fn(**gold_inputs))
+    candidate_grads = _backward_grads(candidate_outputs, candidate_inputs, case.grad_input_names)
+    gold_grads = _backward_grads(gold_outputs, gold_inputs, case.grad_input_names)
+    output_checks = _compare_case_outputs(
+        candidate,
+        case,
+        contract,
+        candidate_outputs,
+        gold_outputs,
+    ).outputs
+    atol, rtol = _resolve_tolerance(
+        contract,
+        op_class=case.op_class,
+        dtype=case.dtype,
+        arch_key=candidate.arch_key,
+    )
+    grad_checks = [
+        _compare_output(
+            candidate_grad,
+            gold_grad,
+            output_index=len(output_checks) + index,
+            atol=atol,
+            rtol=rtol,
+            message=f"gradient:{name}",
+        )
+        for index, (name, candidate_grad, gold_grad) in enumerate(
+            zip(case.grad_input_names, candidate_grads, gold_grads, strict=True)
+        )
+    ]
+    checks = [*output_checks, *grad_checks]
+    return CaseCheck(
+        case_name=case.name,
+        dtype=str(case.dtype),
+        op_class=case.op_class,
+        passed=all(output.passed for output in checks),
+        outputs=checks,
+    )
+
+
+def _compare_case_outputs(
+    candidate: CandidateSpec,
+    case: OperatorCase,
+    contract: Mapping[str, Any],
+    candidate_outputs: list[torch.Tensor],
+    gold_outputs: list[torch.Tensor],
+) -> CaseCheck:
     if len(candidate_outputs) != len(gold_outputs):
         raise ValueError(
             f"candidate {candidate.name!r} returned {len(candidate_outputs)} outputs, "
@@ -188,6 +253,44 @@ def _call_candidate(candidate: Callable[..., Any] | Any, inputs: Mapping[str, An
     return candidate(**inputs)
 
 
+def _clone_inputs_for_backward(
+    inputs: Mapping[str, Any],
+    grad_input_names: tuple[str, ...],
+) -> dict[str, Any]:
+    grad_names = set(grad_input_names)
+    cloned: dict[str, Any] = {}
+    for name, value in inputs.items():
+        if isinstance(value, torch.Tensor):
+            tensor = value.detach().clone()
+            if name in grad_names:
+                if not tensor.is_floating_point():
+                    raise TypeError(f"gradient input {name!r} must be floating point")
+                tensor.requires_grad_(True)
+            cloned[name] = tensor
+        else:
+            cloned[name] = value
+    missing = grad_names.difference(cloned)
+    if missing:
+        raise ValueError(f"missing gradient inputs: {', '.join(sorted(missing))}")
+    return cloned
+
+
+def _backward_grads(
+    outputs: list[torch.Tensor],
+    inputs: Mapping[str, Any],
+    grad_input_names: tuple[str, ...],
+) -> list[torch.Tensor]:
+    loss = sum(output.float().sum() for output in outputs)
+    loss.backward()
+    grads: list[torch.Tensor] = []
+    for name in grad_input_names:
+        grad = inputs[name].grad
+        if grad is None:
+            raise ValueError(f"gradient for input {name!r} is None")
+        grads.append(grad)
+    return grads
+
+
 def _flatten_tensors(value: Any) -> list[torch.Tensor]:
     if isinstance(value, torch.Tensor):
         return [value]
@@ -239,6 +342,7 @@ def _compare_output(
     output_index: int,
     atol: float,
     rtol: float,
+    message: str = "",
 ) -> OutputCheck:
     if candidate.shape != gold.shape:
         return OutputCheck(
@@ -279,6 +383,7 @@ def _compare_output(
         mean_abs_error=mean_abs_error,
         max_rel_error=max_rel_error,
         passed=bool(torch.allclose(candidate_fp32, gold_fp32, atol=atol, rtol=rtol)),
+        message=message,
     )
 
 
diff --git a/rl_engine/kernels/gtest/operator_specs.py b/rl_engine/kernels/gtest/operator_specs.py
index dc4674c..1373151 100644
--- a/rl_engine/kernels/gtest/operator_specs.py
+++ b/rl_engine/kernels/gtest/operator_specs.py
@@ -21,6 +21,7 @@ class OperatorSpec:
     gold_path: str
     registry_name: str
     candidate_paths: dict[str, str]
+    grad_input_names: tuple[str, ...] = ()
 
 
 def _load_object(path: str) -> Any:
@@ -42,6 +43,7 @@ def _load_object(path: str) -> Any:
             "cuda-generic": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpGenericOp",
             "cuda-sm90": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpSM90Op",
         },
+        grad_input_names=("logits",),
     ),
 }
 
@@ -72,6 +74,7 @@ def make_operator_case(
         dtype=dtype,
         inputs=make_operator_inputs(args.op, args, dtype, device),
         gold_fn=gold_op.forward_fp32,
+        grad_input_names=spec.grad_input_names,
     )
 
 
diff --git a/scripts/check_operator.py b/scripts/check_operator.py
index 9b99612..0b31b70 100644
--- a/scripts/check_operator.py
+++ b/scripts/check_operator.py
@@ -53,8 +53,9 @@ def _summarize(report: Any) -> None:
         )
         for case in candidate.cases:
             for output in case.outputs:
+                label = f" {output.message}" if output.message else ""
                 print(
-                    f"  case={case.case_name} output={output.output_index} "
+                    f"  case={case.case_name} output={output.output_index}{label} "
                     f"shape={output.shape} dtype={output.candidate_dtype} "
                     f"max_abs={output.max_abs_error:.8e} "
                     f"mean_abs={output.mean_abs_error:.8e} "
@@ -91,6 +92,7 @@ def parse_args() -> argparse.Namespace:
         default=None,
         help="Optional tolerance override key, for example sm90. Defaults to contract.default.",
     )
+    parser.add_argument("--check-grad", action="store_true", help="Also compare gradients for supported inputs.")
     parser.add_argument("--json", action="store_true", help="Print the full structured report as JSON.")
     return parser.parse_args()
 
@@ -101,7 +103,12 @@ def main() -> None:
     device = _select_device(args.device)
     candidate = make_candidate(args)
     case = make_operator_case(args, dtype, device)
-    report = run_operator_suite(args.op, candidates=[candidate], cases=[case])
+    report = run_operator_suite(
+        args.op,
+        candidates=[candidate],
+        cases=[case],
+        check_grad=args.check_grad,
+    )
 
     if args.json:
         print(json.dumps(report.to_dict(), indent=2, default=str))
diff --git a/tests/test_op_checks.py b/tests/test_op_checks.py
index 9f8a901..b28148b 100644
--- a/tests/test_op_checks.py
+++ b/tests/test_op_checks.py
@@ -22,6 +22,18 @@ def _logp_case(name: str, dtype: torch.dtype, *, seed: int = 0) -> OperatorCase:
     )
 
 
+def _logp_backward_case(name: str, *, seed: int = 0) -> OperatorCase:
+    case = _logp_case(name, torch.float32, seed=seed)
+    return OperatorCase(
+        name=case.name,
+        op_class=case.op_class,
+        dtype=case.dtype,
+        inputs=case.inputs,
+        gold_fn=case.gold_fn,
+        grad_input_names=("logits",),
+    )
+
+
 def test_logp_native_candidate_suite_passes():
     report = run_operator_suite(
         "logp",
@@ -127,3 +139,34 @@ def slightly_shifted_logp(logits, token_ids):
     output = report.candidates[0].cases[0].outputs[0]
     assert report.passed
     assert output.atol == 5.0e-2
+
+
+def test_logp_native_candidate_backward_suite_passes():
+    report = run_operator_suite(
+        "logp",
+        candidates=[CandidateSpec(name="native-logp", backend="pytorch", fn=NativeLogpOp())],
+        cases=[_logp_backward_case("fp32", seed=8)],
+        check_grad=True,
+    )
+
+    assert report.passed
+    assert report.candidates[0].passed_outputs == 2
+    assert report.candidates[0].cases[0].outputs[1].message == "gradient:logits"
+
+
+def test_backward_suite_reports_failure_for_bad_gradient():
+    def bad_grad_logp(logits, token_ids):
+        values = NativeLogpOp().forward_fp32(logits, token_ids)
+        return values.detach() + logits.sum(dim=-1) * 0.0
+
+    report = run_operator_suite(
+        "logp",
+        candidates=[CandidateSpec(name="bad-grad-logp", backend="test", fn=bad_grad_logp)],
+        cases=[_logp_backward_case("fp32", seed=9)],
+        check_grad=True,
+    )
+
+    gradient_output = report.candidates[0].cases[0].outputs[1]
+    assert not report.passed
+    assert gradient_output.message == "gradient:logits"
+    assert gradient_output.max_abs_error > 0.0

From d53a5a670823a55bbd1a728c6afd058f97cadb86 Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Mon, 29 Jun 2026 23:40:55 +0800
Subject: [PATCH 08/11] Add linear logp gtest smoke coverage

---
 docs/contributing/issue-108-session-log.md | 57 +++++++++++++++++++++-
 rl_engine/kernels/gtest/operator_inputs.py | 16 ++++++
 rl_engine/kernels/gtest/operator_specs.py  | 18 ++++++-
 tests/test_operator_inputs.py              | 11 +++++
 4 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md
index 0644424..b55806f 100644
--- a/docs/contributing/issue-108-session-log.md
+++ b/docs/contributing/issue-108-session-log.md
@@ -189,7 +189,7 @@ python scripts/check_operator.py   --op logp   --candidate pytorch   --device cp
 Supported key options:
 
 ```text
---op              Operator name. The minimal version supports logp.
+--op              Operator name. The minimal version supports logp and linear_logp.
 --candidate       Candidate backend, for example pytorch, cuda, cuda-generic, cuda-sm90, registry.
 --dtype           fp32, bf16, or fp16.
 --device          auto, cpu, cuda, or another torch device string.
@@ -422,3 +422,58 @@ Reasoning:
 - Forward-only checks can miss incorrect or disconnected backward paths.
 - Gradient inputs must be declared per operator because not every floating tensor should receive gradients.
 - Input generation remains independent of autograd; the runner clones inputs and enables `requires_grad` only inside the backward check path.
+
+Known backend limitation:
+
+- `cuda` `logp` currently calls the compiled `_C.fused_logp` forward path directly and does not produce an autograd-connected output.
+- Running `--check-grad` against `candidate=cuda` fails with a missing `grad_fn`; this is a backend implementation gap, not a tolerance issue.
+- To support `cuda logp` backward, the backend must add or wrap a real backward path, usually via `torch.autograd.Function` or an explicit CUDA backward kernel.
+
+### Linear LogP Triton GTest Smoke Support
+
+Files:
+
+```text
+rl_engine/kernels/gtest/operator_inputs.py
+rl_engine/kernels/gtest/operator_specs.py
+tests/test_operator_inputs.py
+docs/contributing/issue-108-session-log.md
+```
+
+Change:
+
+- Added `linear_logp` input construction with `hidden`, `lm_head_weight`, `target_ids`, and `bias=None`.
+- Added a `linear_logp` operator spec using `NativeLinearLogpOp.apply` as the PyTorch gold path.
+- Added `triton` as a `linear_logp` candidate backend.
+- Declared `("hidden", "lm_head_weight")` as gradient inputs for backward checks.
+
+Reasoning:
+
+- `linear_logp` is the smallest real fused op in the repository with an implemented Triton backward path.
+- The first gtest integration keeps bias disabled to avoid optional-gradient handling in the initial smoke path.
+- The op reuses the `logprob` tolerance class because it produces selected-token log probabilities.
+- It gives the checker a real non-PyTorch differentiable candidate for end-to-end forward/backward reporting.
+
+Example Triton smoke command:
+
+```bash
+python scripts/check_operator.py   --op linear_logp   --candidate triton   --device cuda   --dtype bf16   --batch 1   --seq 2   --vocab 1024   --normalized-dim 4096   --check-grad
+```
+
+Observed bf16 result on H100:
+
+```text
+suite=linear_logp passed=False pass_rate=0.0000
+candidate=triton-linear_logp backend=triton passed=False pass_rate=0.6667
+  case=linear_logp-torch.bfloat16-1x2x4096x1024 output=0 shape=(1, 2) dtype=torch.float32 max_abs=4.76226807e-01 mean_abs=4.74334717e-01 max_rel=2.96434597e-03 tol=(atol=5.000e-02, rtol=0.000e+00) passed=False
+  case=linear_logp-torch.bfloat16-1x2x4096x1024 output=1 gradient:hidden shape=(1, 2, 4096) dtype=torch.bfloat16 max_abs=0.00000000e+00 mean_abs=0.00000000e+00 max_rel=0.00000000e+00 tol=(atol=5.000e-02, rtol=0.000e+00) passed=True
+  case=linear_logp-torch.bfloat16-1x2x4096x1024 output=2 gradient:lm_head_weight shape=(1024, 4096) dtype=torch.bfloat16 max_abs=0.00000000e+00 mean_abs=0.00000000e+00 max_rel=0.00000000e+00 tol=(atol=5.000e-02, rtol=0.000e+00) passed=True
+```
+
+Interpretation:
+
+- The checker flow is complete: CLI parsing, input construction, PyTorch gold loading, Triton candidate loading, forward execution, backward execution, gradient collection, comparison, and report formatting all ran successfully.
+- The suite is intentionally not marked as passing because the bf16 forward output exceeded the current `logprob` absolute tolerance.
+- The current tolerance uses `atol=5.0e-2` and `rtol=0.0`; the observed forward absolute error is about `4.76e-1`, while the relative error is about `2.96e-3`.
+- `linear_logp` includes a large bf16 matrix multiply before selected-token logprob, so it likely needs an operator-specific gold policy or tolerance rather than reusing plain `logprob` tolerances unchanged.
+- In this case, both checked gradients passed, so the failure is a forward accuracy/tolerance calibration issue, not a failure to execute the backward checker path.
diff --git a/rl_engine/kernels/gtest/operator_inputs.py b/rl_engine/kernels/gtest/operator_inputs.py
index 7e09719..4d131e2 100644
--- a/rl_engine/kernels/gtest/operator_inputs.py
+++ b/rl_engine/kernels/gtest/operator_inputs.py
@@ -30,6 +30,7 @@ def make_operator_inputs(
         "matmul": _make_matmul_inputs,
         "attention": _make_attention_inputs,
         "logp": _make_logp_inputs,
+        "linear_logp": _make_linear_logp_inputs,
         "rope": _make_rope_inputs,
         "silu": _make_silu_inputs,
         "swiglu": _make_swiglu_inputs,
@@ -51,6 +52,7 @@ def operator_shape_name(op_name: str, args: argparse.Namespace) -> str:
         "matmul": f"{batch}x{seq}x{_matmul_k(args)}x{_matmul_n(args)}",
         "attention": f"{batch}x{DEFAULT_N_HEADS}x{seq}x{DEFAULT_HEAD_DIM}",
         "logp": f"{batch}x{seq}x{vocab}",
+        "linear_logp": f"{batch}x{seq}x{_normalized_dim(args)}x{vocab}",
         "rope": f"{batch}x{DEFAULT_N_HEADS}x{seq}x{DEFAULT_HEAD_DIM}",
         "silu": f"{batch}x{seq}x{DEFAULT_INTERMEDIATE}",
         "swiglu": f"{batch}x{seq}x{DEFAULT_INTERMEDIATE}",
@@ -111,6 +113,20 @@ def _make_logp_inputs(
     }
 
 
+def _make_linear_logp_inputs(
+    args: argparse.Namespace, dtype: torch.dtype, device: torch.device
+) -> dict[str, Any]:
+    batch, seq = _batch_seq(args)
+    hidden_dim = _normalized_dim(args)
+    vocab = _arg_int(args, "vocab", DEFAULT_VOCAB)
+    return {
+        "hidden": _floating_tensor((batch, seq, hidden_dim), args, dtype, device, offset=0),
+        "lm_head_weight": _floating_tensor((vocab, hidden_dim), args, dtype, device, offset=1),
+        "target_ids": _token_ids((batch, seq), vocab, args, device),
+        "bias": None,
+    }
+
+
 def _make_rope_inputs(
     args: argparse.Namespace, dtype: torch.dtype, device: torch.device
 ) -> dict[str, Any]:
diff --git a/rl_engine/kernels/gtest/operator_specs.py b/rl_engine/kernels/gtest/operator_specs.py
index 1373151..334224b 100644
--- a/rl_engine/kernels/gtest/operator_specs.py
+++ b/rl_engine/kernels/gtest/operator_specs.py
@@ -19,6 +19,7 @@ class OperatorSpec:
     name: str
     op_class: str
     gold_path: str
+    gold_method: str
     registry_name: str
     candidate_paths: dict[str, str]
     grad_input_names: tuple[str, ...] = ()
@@ -36,6 +37,7 @@ def _load_object(path: str) -> Any:
         name="logp",
         op_class="logprob",
         gold_path="rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp",
+        gold_method="forward_fp32",
         registry_name="logp",
         candidate_paths={
             "pytorch": "rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp",
@@ -45,6 +47,19 @@ def _load_object(path: str) -> Any:
         },
         grad_input_names=("logits",),
     ),
+    "linear_logp": OperatorSpec(
+        name="linear_logp",
+        op_class="logprob",
+        gold_path="rl_engine.kernels.ops.pytorch.loss.linear_logp.NativeLinearLogpOp",
+        gold_method="apply",
+        registry_name="linear_logp",
+        candidate_paths={
+            "pytorch": "rl_engine.kernels.ops.pytorch.loss.linear_logp.NativeLinearLogpOp",
+            "triton": "rl_engine.kernels.ops.triton.loss.linear_logp.TritonLinearLogpOp",
+            "cuda-sm90": "rl_engine.kernels.ops.cuda.loss.linear_logp.FusedLinearLogpSM90Op",
+        },
+        grad_input_names=("hidden", "lm_head_weight"),
+    ),
 }
 
 
@@ -68,12 +83,13 @@ def make_operator_case(
 ) -> OperatorCase:
     spec = OP_SPECS[args.op]
     gold_op = _load_object(spec.gold_path)()
+    gold_fn = getattr(gold_op, spec.gold_method)
     return OperatorCase(
         name=f"{args.op}-{dtype}-{operator_shape_name(args.op, args)}",
         op_class=spec.op_class,
         dtype=dtype,
         inputs=make_operator_inputs(args.op, args, dtype, device),
-        gold_fn=gold_op.forward_fp32,
+        gold_fn=gold_fn,
         grad_input_names=spec.grad_input_names,
     )
 
diff --git a/tests/test_operator_inputs.py b/tests/test_operator_inputs.py
index 22d39ae..bb1a222 100644
--- a/tests/test_operator_inputs.py
+++ b/tests/test_operator_inputs.py
@@ -37,6 +37,7 @@ def _args(**overrides):
         "matmul",
         "attention",
         "logp",
+        "linear_logp",
         "rope",
         "silu",
         "swiglu",
@@ -68,3 +69,13 @@ def test_random_logp_inputs_are_seeded():
 
     assert torch.equal(first["logits"], second["logits"])
     assert torch.equal(first["token_ids"], second["token_ids"])
+
+
+def test_constant_linear_logp_inputs_match_operator_contract():
+    args = _args(input_mode="constant", constant_value=0.5, token_value=3)
+    inputs = make_operator_inputs("linear_logp", args, torch.float32, torch.device("cpu"))
+
+    assert torch.equal(inputs["hidden"], torch.full((1, 2, 128), 0.5))
+    assert torch.equal(inputs["lm_head_weight"], torch.full((17, 128), 0.51))
+    assert torch.equal(inputs["target_ids"], torch.full((1, 2), 3, dtype=torch.long))
+    assert inputs["bias"] is None

From 4bd2ed8420560fe880bf810da0aabbd8a490b6bc Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Thu, 2 Jul 2026 08:14:01 +0800
Subject: [PATCH 09/11] add random grad for gtest

---
 docs/contributing/issue-108-session-log.md |   7 +-
 rl_engine/kernels/gtest/op_checks.py       | 110 +++++++++++++++++++--
 scripts/check_operator.py                  |  10 ++
 tests/test_op_checks.py                    |  49 +++++++++
 4 files changed, 169 insertions(+), 7 deletions(-)

diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md
index b55806f..cd8c0f7 100644
--- a/docs/contributing/issue-108-session-log.md
+++ b/docs/contributing/issue-108-session-log.md
@@ -202,6 +202,8 @@ Supported key options:
 --token-value     Token id for constant mode, reduced modulo vocab.
 --seed            Random seed for reproducible random inputs.
 --check-grad      Also compare gradients for inputs declared by the operator spec.
+--grad-mode       Upstream gradient mode for --check-grad: random by default; ones for smoke tests.
+--grad-seed       Random upstream gradient seed for --grad-mode random.
 --json            Print the full structured report as JSON.
 ```
 
@@ -416,12 +418,15 @@ Change:
 - Added `_run_case_backward()` to compare candidate forward outputs and selected input gradients against the PyTorch gold path.
 - Added `OperatorSpec.grad_input_names`; `logp` declares `("logits",)`.
 - Added `scripts/check_operator.py --check-grad`.
+- Added `--grad-mode ones|random` and `--grad-seed` for backward checks.
 
 Reasoning:
 
 - Forward-only checks can miss incorrect or disconnected backward paths.
 - Gradient inputs must be declared per operator because not every floating tensor should receive gradients.
 - Input generation remains independent of autograd; the runner clones inputs and enables `requires_grad` only inside the backward check path.
+- `grad_mode=random` is the default and catches backward bugs hidden by all-one upstream gradients.
+- `grad_mode=ones` remains available for quick smoke tests and preserves the old `output.sum().backward()` behavior.
 
 Known backend limitation:
 
@@ -457,7 +462,7 @@ Reasoning:
 Example Triton smoke command:
 
 ```bash
-python scripts/check_operator.py   --op linear_logp   --candidate triton   --device cuda   --dtype bf16   --batch 1   --seq 2   --vocab 1024   --normalized-dim 4096   --check-grad
+python scripts/check_operator.py   --op linear_logp   --candidate triton   --device cuda   --dtype bf16   --batch 1   --seq 2   --vocab 1024   --normalized-dim 4096   --check-grad   --grad-mode random   --grad-seed 123
 ```
 
 Observed bf16 result on H100:
diff --git a/rl_engine/kernels/gtest/op_checks.py b/rl_engine/kernels/gtest/op_checks.py
index fd590d9..cbd0fd6 100644
--- a/rl_engine/kernels/gtest/op_checks.py
+++ b/rl_engine/kernels/gtest/op_checks.py
@@ -97,6 +97,8 @@ def run_operator_suite(
     cases: Sequence[OperatorCase],
     contract: Mapping[str, Any] | None = None,
     check_grad: bool = False,
+    grad_mode: str = "random",
+    grad_seed: int = 123,
 ) -> OperatorCheckReport:
     """Run candidates against gold outputs and return a structured report."""
 
@@ -106,7 +108,14 @@ def run_operator_suite(
     # camdidate : test instance
     # loaded_contract : tolerance table
     candidate_reports = [
-        _run_candidate(candidate, cases, loaded_contract, check_grad=check_grad)
+        _run_candidate(
+            candidate,
+            cases,
+            loaded_contract,
+            check_grad=check_grad,
+            grad_mode=grad_mode,
+            grad_seed=grad_seed,
+        )
         for candidate in candidates
     ]
     passed_candidates = sum(1 for report in candidate_reports if report.passed)
@@ -128,9 +137,22 @@ def _run_candidate(
     contract: Mapping[str, Any],
     *,
     check_grad: bool,
+    grad_mode: str,
+    grad_seed: int,
 ) -> CandidateReport:
-    case_runner = _run_case_backward if check_grad else _run_case
-    case_checks = [case_runner(candidate, case, contract) for case in cases]
+    if check_grad:
+        case_checks = [
+            _run_case_backward(
+                candidate,
+                case,
+                contract,
+                grad_mode=grad_mode,
+                grad_seed=grad_seed,
+            )
+            for case in cases
+        ]
+    else:
+        case_checks = [_run_case(candidate, case, contract) for case in cases]
     total_outputs = sum(len(case.outputs) for case in case_checks)
     passed_outputs = sum(
         1 for case in case_checks for output in case.outputs if output.passed
@@ -161,6 +183,9 @@ def _run_case_backward(
     candidate: CandidateSpec,
     case: OperatorCase,
     contract: Mapping[str, Any],
+    *,
+    grad_mode: str,
+    grad_seed: int,
 ) -> CaseCheck:
     if not case.grad_input_names:
         raise ValueError(f"case {case.name!r} does not declare gradient inputs")
@@ -169,8 +194,23 @@ def _run_case_backward(
     gold_inputs = _clone_inputs_for_backward(case.inputs, case.grad_input_names)
     candidate_outputs = _flatten_tensors(_call_candidate(candidate.fn, candidate_inputs))
     gold_outputs = _flatten_tensors(case.gold_fn(**gold_inputs))
-    candidate_grads = _backward_grads(candidate_outputs, candidate_inputs, case.grad_input_names)
-    gold_grads = _backward_grads(gold_outputs, gold_inputs, case.grad_input_names)
+    # Candidate and gold must use the same upstream gradients; otherwise we
+    # would compare different vector-Jacobian products.
+    # grad_mode="ones" is the old output.sum().backward() smoke path.
+    # grad_mode="random" is closer to training, where dL/doutput is non-uniform.
+    grad_outputs = _make_grad_outputs(candidate_outputs, grad_mode=grad_mode, seed=grad_seed)
+    candidate_grads = _backward_grads(
+        candidate_outputs,
+        candidate_inputs,
+        case.grad_input_names,
+        grad_outputs=grad_outputs,
+    )
+    gold_grads = _backward_grads(
+        gold_outputs,
+        gold_inputs,
+        case.grad_input_names,
+        grad_outputs=_match_grad_outputs(grad_outputs, gold_outputs),
+    )
     output_checks = _compare_case_outputs(
         candidate,
         case,
@@ -178,6 +218,9 @@ def _run_case_backward(
         candidate_outputs,
         gold_outputs,
     ).outputs
+    # Reuse the same tolerance class for gradients as for values. This is a
+    # first conservative default; operator-specific gradient tolerances can be
+    # split out later if a real backend shows different numerical behavior.
     atol, rtol = _resolve_tolerance(
         contract,
         op_class=case.op_class,
@@ -279,8 +322,19 @@ def _backward_grads(
     outputs: list[torch.Tensor],
     inputs: Mapping[str, Any],
     grad_input_names: tuple[str, ...],
+    *,
+    grad_outputs: list[torch.Tensor],
 ) -> list[torch.Tensor]:
-    loss = sum(output.float().sum() for output in outputs)
+    if len(outputs) != len(grad_outputs):
+        raise ValueError(
+            f"got {len(grad_outputs)} upstream gradients for {len(outputs)} outputs"
+        )
+    # `ones` makes this equivalent to output.sum().backward(); `random` tests a
+    # stricter vector-Jacobian product.
+    loss = sum(
+        (output.float() * grad_output.to(device=output.device).float()).sum()
+        for output, grad_output in zip(outputs, grad_outputs, strict=True)
+    )
     loss.backward()
     grads: list[torch.Tensor] = []
     for name in grad_input_names:
@@ -291,6 +345,50 @@ def _backward_grads(
     return grads
 
 
+def _make_grad_outputs(
+    outputs: list[torch.Tensor],
+    *,
+    grad_mode: str,
+    seed: int,
+) -> list[torch.Tensor]:
+    if grad_mode == "ones":
+        # All-one upstream gradients make the scalar loss equal output.sum().
+        return [torch.ones_like(output, dtype=torch.float32) for output in outputs]
+    if grad_mode != "random":
+        raise ValueError(f"unsupported grad_mode: {grad_mode}")
+
+    grad_outputs: list[torch.Tensor] = []
+    generators: dict[torch.device, torch.Generator] = {}
+    for output in outputs:
+        if output.device not in generators:
+            # Generators are device-local; a CUDA generator cannot draw CPU tensors.
+            generator = torch.Generator(device=output.device)
+            generator.manual_seed(seed)
+            generators[output.device] = generator
+        # Random upstream gradients test a non-uniform dL/doutput. The same
+        # tensors are later reused for gold so the comparison stays fair.
+        grad_outputs.append(
+            torch.randn(
+                output.shape,
+                generator=generators[output.device],
+                device=output.device,
+                dtype=torch.float32,
+            )
+        )
+    return grad_outputs
+
+
+def _match_grad_outputs(
+    grad_outputs: list[torch.Tensor],
+    outputs: list[torch.Tensor],
+) -> list[torch.Tensor]:
+    # Reuse upstream values for gold; only move device when needed.
+    return [
+        grad_output.to(device=output.device)
+        for grad_output, output in zip(grad_outputs, outputs, strict=True)
+    ]
+
+
 def _flatten_tensors(value: Any) -> list[torch.Tensor]:
     if isinstance(value, torch.Tensor):
         return [value]
diff --git a/scripts/check_operator.py b/scripts/check_operator.py
index 0b31b70..461e3f9 100644
--- a/scripts/check_operator.py
+++ b/scripts/check_operator.py
@@ -93,6 +93,14 @@ def parse_args() -> argparse.Namespace:
         help="Optional tolerance override key, for example sm90. Defaults to contract.default.",
     )
     parser.add_argument("--check-grad", action="store_true", help="Also compare gradients for supported inputs.")
+    # Defaults to random because it catches bugs hidden by output.sum().backward().
+    parser.add_argument(
+        "--grad-mode",
+        choices=("ones", "random"),
+        default="random",
+        help="Upstream gradient mode used with --check-grad.",
+    )
+    parser.add_argument("--grad-seed", type=int, default=123, help="Seed for --grad-mode random.")
     parser.add_argument("--json", action="store_true", help="Print the full structured report as JSON.")
     return parser.parse_args()
 
@@ -108,6 +116,8 @@ def main() -> None:
         candidates=[candidate],
         cases=[case],
         check_grad=args.check_grad,
+        grad_mode=args.grad_mode,
+        grad_seed=args.grad_seed,
     )
 
     if args.json:
diff --git a/tests/test_op_checks.py b/tests/test_op_checks.py
index b28148b..74de3b6 100644
--- a/tests/test_op_checks.py
+++ b/tests/test_op_checks.py
@@ -170,3 +170,52 @@ def bad_grad_logp(logits, token_ids):
     assert not report.passed
     assert gradient_output.message == "gradient:logits"
     assert gradient_output.max_abs_error > 0.0
+
+
+def test_random_grad_mode_catches_nonuniform_upstream_gradient_bug():
+    # Forward is identity, so only a non-uniform upstream gradient can expose
+    # the intentionally wrong backward below.
+    class MeanUpstreamIdentity(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, x):
+            return x.clone()
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            # Wrong for random upstream gradients, but correct when all values are 1.
+            return grad_output.mean().expand_as(grad_output)
+
+    def bad_identity(x):
+        return MeanUpstreamIdentity.apply(x)
+
+    case = OperatorCase(
+        name="identity",
+        op_class="elementwise",
+        dtype=torch.float32,
+        inputs={"x": torch.randn(8, dtype=torch.float32)},
+        gold_fn=lambda x: x,
+        grad_input_names=("x",),
+    )
+
+    ones_report = run_operator_suite(
+        "identity",
+        candidates=[CandidateSpec(name="bad-identity", backend="test", fn=bad_identity)],
+        cases=[case],
+        check_grad=True,
+        grad_mode="ones",
+    )
+    # ones passes by design; random must fail and prove the stricter path works.
+    random_report = run_operator_suite(
+        "identity",
+        candidates=[CandidateSpec(name="bad-identity", backend="test", fn=bad_identity)],
+        cases=[case],
+        check_grad=True,
+        grad_mode="random",
+        grad_seed=7,
+    )
+
+    assert ones_report.passed
+    gradient_output = random_report.candidates[0].cases[0].outputs[1]
+    assert not random_report.passed
+    assert gradient_output.message == "gradient:x"
+    assert gradient_output.max_abs_error > 0.0

From 4fbb7fdf40439adb5a708e5fd0257a6f9cd9daa6 Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Thu, 2 Jul 2026 09:19:33 +0800
Subject: [PATCH 10/11] Remove registry candidate from gtest checker

---
 docs/contributing/issue-108-session-log.md |  5 ++---
 rl_engine/kernels/gtest/operator_specs.py  | 15 +--------------
 scripts/check_operator.py                  |  4 ++--
 tests/test_op_checks.py                    | 19 -------------------
 4 files changed, 5 insertions(+), 38 deletions(-)

diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md
index cd8c0f7..6261d7c 100644
--- a/docs/contributing/issue-108-session-log.md
+++ b/docs/contributing/issue-108-session-log.md
@@ -143,13 +143,13 @@ candidates:
   cuda         -> FusedLogpGenericOp
   cuda-generic -> FusedLogpGenericOp
   cuda-sm90    -> FusedLogpSM90Op
-  registry     -> kernel_registry.get_op("logp")
 ```
 
 Important note:
 
 - `candidate=pytorch` is only a smoke test for the checker itself.
 - CUDA, Triton, ROCm, and future hardware-specific implementations are candidates.
+- Registry dispatch is tested separately from this accuracy harness because it is hardware-dependent.
 - Do not compare two operators that implement different math, such as ordinary `logp` and `linear_logp`.
 
 ### SM90 Adapter Exception
@@ -190,7 +190,7 @@ Supported key options:
 
 ```text
 --op              Operator name. The minimal version supports logp and linear_logp.
---candidate       Candidate backend, for example pytorch, cuda, cuda-generic, cuda-sm90, registry.
+--candidate       Candidate backend, for example pytorch, cuda, cuda-generic, cuda-sm90, triton.
 --dtype           fp32, bf16, or fp16.
 --device          auto, cpu, cuda, or another torch device string.
 --arch-key        Optional tolerance override key such as sm90.
@@ -276,7 +276,6 @@ Add an `OperatorSpec` entry:
     name="new_op",
     op_class="elementwise",
     gold_path="rl_engine.kernels.ops.pytorch....NativeNewOp",
-    registry_name="new_op",
     candidate_paths={
         "pytorch": "rl_engine.kernels.ops.pytorch....NativeNewOp",
         "cuda": "rl_engine.kernels.ops.cuda....CudaNewOp",
diff --git a/rl_engine/kernels/gtest/operator_specs.py b/rl_engine/kernels/gtest/operator_specs.py
index 334224b..99c45ec 100644
--- a/rl_engine/kernels/gtest/operator_specs.py
+++ b/rl_engine/kernels/gtest/operator_specs.py
@@ -20,7 +20,6 @@ class OperatorSpec:
     op_class: str
     gold_path: str
     gold_method: str
-    registry_name: str
     candidate_paths: dict[str, str]
     grad_input_names: tuple[str, ...] = ()
 
@@ -38,7 +37,6 @@ def _load_object(path: str) -> Any:
         op_class="logprob",
         gold_path="rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp",
         gold_method="forward_fp32",
-        registry_name="logp",
         candidate_paths={
             "pytorch": "rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp",
             "cuda": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpGenericOp",
@@ -52,7 +50,6 @@ def _load_object(path: str) -> Any:
         op_class="logprob",
         gold_path="rl_engine.kernels.ops.pytorch.loss.linear_logp.NativeLinearLogpOp",
         gold_method="apply",
-        registry_name="linear_logp",
         candidate_paths={
             "pytorch": "rl_engine.kernels.ops.pytorch.loss.linear_logp.NativeLinearLogpOp",
             "triton": "rl_engine.kernels.ops.triton.loss.linear_logp.TritonLinearLogpOp",
@@ -109,17 +106,7 @@ def make_candidate(args: argparse.Namespace) -> CandidateSpec:
             fn=candidate_op,
         )
 
-    if candidate_name == "registry":
-        from rl_engine.kernels.registry import kernel_registry
-
-        return CandidateSpec(
-            name=f"registry-{args.op}",
-            backend="registry",
-            arch_key=args.arch_key,
-            fn=kernel_registry.get_op(spec.registry_name),
-        )
-
-    supported = sorted([*spec.candidate_paths, "native", "registry"])
+    supported = sorted([*spec.candidate_paths, "native"])
     raise ValueError(
         f"unsupported candidate {args.candidate!r} for op {args.op!r}; "
         f"supported candidates: {', '.join(supported)}"
diff --git a/scripts/check_operator.py b/scripts/check_operator.py
index 461e3f9..677f01b 100644
--- a/scripts/check_operator.py
+++ b/scripts/check_operator.py
@@ -70,8 +70,8 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--op", choices=operator_names(), default="logp")
     parser.add_argument(
         "--candidate",
-        default="registry",
-        help="Candidate backend to validate, for example registry, pytorch, cuda, cuda-sm90.",
+        default="pytorch",
+        help="Candidate backend to validate, for example pytorch, cuda, cuda-sm90, triton.",
     )
     parser.add_argument("--dtype", choices=("fp32", "bf16", "fp16"), default="fp32")
     parser.add_argument("--device", default="auto")
diff --git a/tests/test_op_checks.py b/tests/test_op_checks.py
index 74de3b6..8f08d87 100644
--- a/tests/test_op_checks.py
+++ b/tests/test_op_checks.py
@@ -51,25 +51,6 @@ def test_logp_native_candidate_suite_passes():
     assert all(case.passed for case in report.candidates[0].cases)
 
 
-def test_logp_registry_candidate_suite_passes_on_cpu():
-    from rl_engine.kernels.registry import kernel_registry
-
-    report = run_operator_suite(
-        "logp",
-        candidates=[
-            CandidateSpec(
-                name="registry-logp",
-                backend="registry",
-                fn=kernel_registry.get_op("logp"),
-            )
-        ],
-        cases=[_logp_case("fp32", torch.float32, seed=4)],
-    )
-
-    assert report.passed
-    assert report.candidates[0].candidate_name == "registry-logp"
-
-
 def test_suite_reports_failure_for_bad_candidate():
     def bad_logp(logits, token_ids):
         del token_ids

From 347726b4747fa5ffe6c7cc9bf21773b57551117d Mon Sep 17 00:00:00 2001
From: a-kaa <siruhe666@gmail.com>
Date: Thu, 2 Jul 2026 09:21:37 +0800
Subject: [PATCH 11/11] Rename tolerance contract to json

---
 docs/contributing/issue-108-session-log.md                  | 6 +++---
 rl_engine/kernels/gtest/tolerance.py                        | 2 +-
 .../{tolerance_contract.yaml => tolerance_contract.json}    | 0
 3 files changed, 4 insertions(+), 4 deletions(-)
 rename rl_engine/kernels/gtest/{tolerance_contract.yaml => tolerance_contract.json} (100%)

diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md
index 6261d7c..a6aa9c1 100644
--- a/docs/contributing/issue-108-session-log.md
+++ b/docs/contributing/issue-108-session-log.md
@@ -31,7 +31,7 @@ rl_engine/kernels/gtest/
   operator_inputs.py
   operator_specs.py
   tolerance.py
-  tolerance_contract.yaml
+  tolerance_contract.json
 
 scripts/check_operator.py
 
@@ -48,7 +48,7 @@ Files:
 
 ```text
 rl_engine/kernels/gtest/tolerance.py
-rl_engine/kernels/gtest/tolerance_contract.yaml
+rl_engine/kernels/gtest/tolerance_contract.json
 tests/test_tolerance_contract.py
 ```
 
@@ -296,7 +296,7 @@ Rules:
 File:
 
 ```text
-rl_engine/kernels/gtest/tolerance_contract.yaml
+rl_engine/kernels/gtest/tolerance_contract.json
 ```
 
 Reuse an existing class when possible:
diff --git a/rl_engine/kernels/gtest/tolerance.py b/rl_engine/kernels/gtest/tolerance.py
index 9a1cc0b..3265a45 100644
--- a/rl_engine/kernels/gtest/tolerance.py
+++ b/rl_engine/kernels/gtest/tolerance.py
@@ -8,7 +8,7 @@
 from typing import Any
 
 
-_CONTRACT_PATH = Path(__file__).with_name("tolerance_contract.yaml")
+_CONTRACT_PATH = Path(__file__).with_name("tolerance_contract.json")
 
 
 def load_contract(path: str | Path = _CONTRACT_PATH) -> dict[str, Any]:
diff --git a/rl_engine/kernels/gtest/tolerance_contract.yaml b/rl_engine/kernels/gtest/tolerance_contract.json
similarity index 100%
rename from rl_engine/kernels/gtest/tolerance_contract.yaml
rename to rl_engine/kernels/gtest/tolerance_contract.json