From 422a2c90b544ff41dfc2719bf8b00366969c1447 Mon Sep 17 00:00:00 2001 From: a-kaa Date: Mon, 22 Jun 2026 23:48:22 +0800 Subject: [PATCH 01/11] Add LogProb reference operator interface(reuse) --- docs/operators/fused-logp.md | 35 +++-- rl_engine/kernels/ops/pytorch/loss/logp.py | 20 ++- tests/test_logp.py | 144 +++++++++++++++++++++ 3 files changed, 187 insertions(+), 12 deletions(-) create mode 100644 tests/test_logp.py diff --git a/docs/operators/fused-logp.md b/docs/operators/fused-logp.md index d5008e5..ccc0f61 100644 --- a/docs/operators/fused-logp.md +++ b/docs/operators/fused-logp.md @@ -13,21 +13,36 @@ logp_op = kernel_registry.get_op("logp") output = logp_op(logits, token_ids) ``` +The PyTorch native reference also exposes the Issue #108 interface: + +```python +from rl_engine.kernels.ops.pytorch.loss.logp import NativeLogpOp + +logp_ref = NativeLogpOp() +output = logp_ref.forward(logits, token_ids) +reference = logp_ref.forward_fp32(logits, token_ids) +``` + +`apply(...)` and `apply_fp32(...)` remain available as backward-compatible aliases. + ## Backends | Backend | Wrapper | Native symbol | Notes | | --- | --- | --- | --- | | CUDA SM90 | `FusedLogpSM90Op` | `_C.fused_logp_sm90` | TMA-oriented path for Hopper-class GPUs. | | CUDA generic | `FusedLogpGenericOp` | `_C.fused_logp` | Generic compiled extension fallback. | -| PyTorch native | `NativeOp` | None | Baseline fallback path. | +| PyTorch native | `NativeLogpOp` | None | PyTorch baseline/reference path. | ## Tensor Contract | Argument | Shape | Dtype | Requirements | | --- | --- | --- | --- | -| `logits` | `[N, V]` | `bfloat16` for SM90 path | Contiguous, on the target device. | -| `token_ids` / `labels` | `[N]` | Converted to `int32` | Same logical device as `logits`. | -| Output | `[N]` | Backend-defined tensor dtype | One selected log probability per row. | +| `logits` | `[..., V]` | Floating point | Contiguous for fused CUDA paths; arbitrary leading dimensions. | +| `token_ids` / `labels` | `[...]` | Integer | Must match `logits.shape[:-1]`. | +| Output | `[...]` | See below | One selected log probability per row. | + +For `NativeLogpOp`, `forward(...)` returns the input dtype and `forward_fp32(...)` +returns `torch.float32`. ## Reference Semantics @@ -39,16 +54,20 @@ ref = torch.gather(ref, dim=-1, index=token_ids.unsqueeze(-1).long()).squeeze(-1 ## Tests ```bash -python tests/test_op_accuracy.py +python -m pytest tests/test_logp.py -q +python -m pytest tests/test_op_accuracy.py -q ``` -The current accuracy test compares the dispatched operator with a PyTorch reference and -uses a dtype-dependent threshold. +`tests/test_logp.py` covers the PyTorch reference contract, dtype behavior, +backward-compatible aliases, batch invariance, and registry dispatch. The existing +operator accuracy tests continue to validate native/CUDA fused API compatibility. ## Implementation Files - `rl_engine/kernels/registry.py` -- `rl_engine/kernels/ops/cuda.py` +- `rl_engine/kernels/ops/pytorch/loss/logp.py` +- `rl_engine/kernels/ops/cuda/loss/logp.py` - `csrc/ops.cpp` - `csrc/fused_logp_kernel.cu` - `csrc/cuda/fused_logp_sm90.cu` +- `tests/test_logp.py` diff --git a/rl_engine/kernels/ops/pytorch/loss/logp.py b/rl_engine/kernels/ops/pytorch/loss/logp.py index c791927..4ef1f24 100644 --- a/rl_engine/kernels/ops/pytorch/loss/logp.py +++ b/rl_engine/kernels/ops/pytorch/loss/logp.py @@ -1,17 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) 2026 RL-Kernel Contributors +from __future__ import annotations + import torch class NativeLogpOp: """Pure PyTorch native fallback for Fused LogP.""" - def __init__(self): + op_class = "logprob" + + def __init__(self) -> None: pass def __call__(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor: - return self.apply(logits, token_ids) + return self.forward(logits, token_ids) def _selected_logps( self, @@ -45,14 +49,22 @@ def _validate_output_shape(self, output: torch.Tensor, logits: torch.Tensor) -> f"{tuple(logits.shape[:-1])}" ) - def apply(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor: + def forward(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor: """Baseline selected-token log probability extraction using torch.gather.""" return self._selected_logps(logits, token_ids, output_dtype=logits.dtype) - def apply_fp32(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor: + def forward_fp32(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor: """Same as apply but forces float32 output for numerical stability.""" return self._selected_logps(logits, token_ids, output_dtype=torch.float32) + def apply(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor: + """Backward-compatible alias for forward.""" + return self.forward(logits, token_ids) + + def apply_fp32(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor: + """Backward-compatible alias for forward_fp32.""" + return self.forward_fp32(logits, token_ids) + def indexed_out( self, logits: torch.Tensor, diff --git a/tests/test_logp.py b/tests/test_logp.py new file mode 100644 index 0000000..48bb950 --- /dev/null +++ b/tests/test_logp.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 RL-Kernel Contributors + +"""Tests for NativeLogpOp, the PyTorch selected-logprob reference.""" + +from __future__ import annotations + +import pytest +import torch + +from rl_engine.kernels.ops.pytorch.loss.logp import NativeLogpOp + + +def _make_inputs( + batch: int, + seq: int, + vocab: int, + *, + dtype: torch.dtype = torch.float32, + seed: int = 123, +) -> tuple[torch.Tensor, torch.Tensor]: + gen = torch.Generator().manual_seed(seed) + logits = torch.randn(batch, seq, vocab, generator=gen, dtype=dtype) + token_ids = torch.randint(0, vocab, (batch, seq), generator=gen, dtype=torch.long) + return logits, token_ids + + +def _reference_selected_logp(logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor: + log_probs = torch.log_softmax(logits.float(), dim=-1) + return torch.gather(log_probs, dim=-1, index=token_ids.long().unsqueeze(-1)).squeeze(-1) + + +class TestNativeLogpOpCorrectness: + def test_output_shape_matches_token_ids(self): + op = NativeLogpOp() + logits, token_ids = _make_inputs(2, 16, 257) + out = op.forward_fp32(logits, token_ids) + assert out.shape == token_ids.shape + + def test_forward_fp32_returns_fp32(self): + op = NativeLogpOp() + logits, token_ids = _make_inputs(2, 16, 257, dtype=torch.bfloat16) + out = op.forward_fp32(logits, token_ids) + assert out.dtype == torch.float32 + + @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16]) + def test_forward_returns_input_dtype(self, dtype): + op = NativeLogpOp() + logits, token_ids = _make_inputs(2, 16, 257, dtype=dtype) + out = op.forward(logits, token_ids) + assert out.dtype == dtype + + def test_call_and_apply_alias_forward(self): + op = NativeLogpOp() + logits, token_ids = _make_inputs(2, 16, 257) + forward = op.forward(logits, token_ids) + assert torch.equal(op(logits, token_ids), forward) + assert torch.equal(op.apply(logits, token_ids), forward) + + def test_apply_fp32_alias_forward_fp32(self): + op = NativeLogpOp() + logits, token_ids = _make_inputs(2, 16, 257) + assert torch.equal(op.apply_fp32(logits, token_ids), op.forward_fp32(logits, token_ids)) + + def test_matches_fp32_reference_bitwise(self): + op = NativeLogpOp() + logits, token_ids = _make_inputs(2, 16, 257) + out = op.forward_fp32(logits, token_ids) + ref = _reference_selected_logp(logits, token_ids) + assert torch.equal(out, ref) + + def test_pure_function_no_inplace(self): + op = NativeLogpOp() + logits, token_ids = _make_inputs(2, 16, 257) + logits_orig = logits.clone() + token_ids_orig = token_ids.clone() + _ = op.forward_fp32(logits, token_ids) + assert torch.equal(logits, logits_orig) + assert torch.equal(token_ids, token_ids_orig) + + def test_op_class_is_logprob(self): + assert NativeLogpOp.op_class == "logprob" + + def test_rejects_mismatched_shapes(self): + op = NativeLogpOp() + logits = torch.randn(2, 3, 5) + token_ids = torch.randint(0, 5, (2, 4)) + with pytest.raises(ValueError, match="must match"): + op.forward_fp32(logits, token_ids) + + +class TestNativeLogpOpBatchInvariance: + def test_batch1_vs_batchN_bitwise(self): + op = NativeLogpOp() + logits, token_ids = _make_inputs(4, 16, 257, seed=321) + full_out = op.forward_fp32(logits, token_ids) + for row in range(logits.shape[0]): + single_out = op.forward_fp32(logits[row : row + 1], token_ids[row : row + 1]) + assert torch.equal(full_out[row], single_out[0]), ( + f"Batch invariance broken at row {row}" + ) + + def test_batch_invariance_with_padding(self): + op = NativeLogpOp() + logits_valid, token_ids_valid = _make_inputs(2, 16, 257, seed=456) + gen = torch.Generator().manual_seed(789) + logits_padding = torch.randn(3, 16, 257, generator=gen) + token_padding = torch.randint(0, 257, (3, 16), generator=gen) + logits_padded = torch.cat([logits_valid, logits_padding], dim=0) + token_ids_padded = torch.cat([token_ids_valid, token_padding], dim=0) + + out_valid = op.forward_fp32(logits_valid, token_ids_valid) + out_padded = op.forward_fp32(logits_padded, token_ids_padded) + assert torch.equal(out_valid[0], out_padded[0]) + assert torch.equal(out_valid[1], out_padded[1]) + + +class TestNativeLogpOpAccuracy: + @pytest.mark.parametrize( + "dtype, atol", + [ + (torch.float32, 1e-5), + (torch.bfloat16, 2e-2), + (torch.float16, 5e-3), + ], + ) + def test_forward_vs_fp32_within_tolerance(self, dtype, atol): + op = NativeLogpOp() + logits, token_ids = _make_inputs(2, 16, 17, dtype=dtype) + out_typed = op.forward(logits, token_ids).float() + out_fp32 = op.forward_fp32(logits, token_ids) + diff = (out_typed - out_fp32).abs().max().item() + assert torch.allclose(out_typed, out_fp32, atol=atol, rtol=0.0), ( + f"dtype={dtype}, max_abs_error={diff:.3e} exceeds atol={atol}" + ) + + +class TestNativeLogpOpRegistry: + @pytest.mark.skipif(torch.cuda.is_available(), reason="CUDA dispatch may select fused logp") + def test_registry_returns_logp_op(self): + from rl_engine.kernels.registry import kernel_registry + + op = kernel_registry.get_op("logp") + assert isinstance(op, NativeLogpOp) From 3aa797a7bd0fb17f9027c0c04c62642b296d42c9 Mon Sep 17 00:00:00 2001 From: a-kaa Date: Tue, 23 Jun 2026 00:00:57 +0800 Subject: [PATCH 02/11] format with pre-commit --- tests/test_logp.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_logp.py b/tests/test_logp.py index 48bb950..42492fb 100644 --- a/tests/test_logp.py +++ b/tests/test_logp.py @@ -96,9 +96,9 @@ def test_batch1_vs_batchN_bitwise(self): full_out = op.forward_fp32(logits, token_ids) for row in range(logits.shape[0]): single_out = op.forward_fp32(logits[row : row + 1], token_ids[row : row + 1]) - assert torch.equal(full_out[row], single_out[0]), ( - f"Batch invariance broken at row {row}" - ) + assert torch.equal( + full_out[row], single_out[0] + ), f"Batch invariance broken at row {row}" def test_batch_invariance_with_padding(self): op = NativeLogpOp() @@ -130,9 +130,9 @@ def test_forward_vs_fp32_within_tolerance(self, dtype, atol): out_typed = op.forward(logits, token_ids).float() out_fp32 = op.forward_fp32(logits, token_ids) diff = (out_typed - out_fp32).abs().max().item() - assert torch.allclose(out_typed, out_fp32, atol=atol, rtol=0.0), ( - f"dtype={dtype}, max_abs_error={diff:.3e} exceeds atol={atol}" - ) + assert torch.allclose( + out_typed, out_fp32, atol=atol, rtol=0.0 + ), f"dtype={dtype}, max_abs_error={diff:.3e} exceeds atol={atol}" class TestNativeLogpOpRegistry: From b86d8a1425398b4a2ef55ddab6b72e04c4c636f9 Mon Sep 17 00:00:00 2001 From: a-kaa Date: Wed, 24 Jun 2026 21:12:00 +0800 Subject: [PATCH 03/11] add tolerance table --- rl_engine/testing/tolerance.py | 21 ++++++++++++++++ rl_engine/testing/tolerance_contract.yaml | 25 +++++++++++++++++++ tests/test_tolerance_contract.py | 29 +++++++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 rl_engine/testing/tolerance.py create mode 100644 rl_engine/testing/tolerance_contract.yaml create mode 100644 tests/test_tolerance_contract.py diff --git a/rl_engine/testing/tolerance.py b/rl_engine/testing/tolerance.py new file mode 100644 index 0000000..9a1cc0b --- /dev/null +++ b/rl_engine/testing/tolerance.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 RL-Kernel Contributors + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + + +_CONTRACT_PATH = Path(__file__).with_name("tolerance_contract.yaml") + + +def load_contract(path: str | Path = _CONTRACT_PATH) -> dict[str, Any]: + """Load the dtype/operator-class tolerance contract.""" + + with Path(path).open("r", encoding="utf-8") as handle: + return json.load(handle) + + +__all__ = ["load_contract"] diff --git a/rl_engine/testing/tolerance_contract.yaml b/rl_engine/testing/tolerance_contract.yaml new file mode 100644 index 0000000..7a2ce2b --- /dev/null +++ b/rl_engine/testing/tolerance_contract.yaml @@ -0,0 +1,25 @@ +{ + "batch_invariance": {"atol": 0.0, "rtol": 0.0}, + "accuracy": { + "default": { + "elementwise": { + "float32": {"atol": 1.0e-5, "rtol": 1.0e-5}, + "bfloat16": {"atol": 2.0e-2, "rtol": 1.6e-2}, + "float16": {"atol": 1.0e-3, "rtol": 1.0e-3} + }, + "reduction": { + "float32": {"atol": 1.0e-4, "rtol": 1.0e-4}, + "bfloat16": {"atol": 5.0e-2, "rtol": 2.0e-2}, + "float16": {"atol": 1.0e-3, "rtol": 1.0e-3} + }, + "logprob": { + "float32": {"atol": 1.0e-5, "rtol": 0.0}, + "bfloat16": {"atol": 5.0e-2, "rtol": 0.0}, + "float16": {"atol": 5.0e-3, "rtol": 0.0} + } + }, + "arch_overrides": { + "sm90": {} + } + } +} diff --git a/tests/test_tolerance_contract.py b/tests/test_tolerance_contract.py new file mode 100644 index 0000000..0e62d7f --- /dev/null +++ b/tests/test_tolerance_contract.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 RL-Kernel Contributors + +from __future__ import annotations + +from rl_engine.testing.tolerance import load_contract + + +def test_load_contract_contains_expected_operator_classes(): + contract = load_contract() + accuracy = contract["accuracy"]["default"] + assert set(accuracy) == {"elementwise", "reduction", "logprob"} + + +def test_load_contract_contains_expected_dtypes(): + contract = load_contract() + for op_class in ("elementwise", "reduction", "logprob"): + assert set(contract["accuracy"]["default"][op_class]) == { + "float32", + "bfloat16", + "float16", + } + + +def test_logprob_bfloat16_tolerance_covers_observed_reference_drift(): + contract = load_contract() + tolerance = contract["accuracy"]["default"]["logprob"]["bfloat16"] + assert tolerance["atol"] >= 5.0e-2 + assert tolerance["rtol"] == 0.0 From 7defbddec479cbbcf1b6fd510745fcfb4d53d190 Mon Sep 17 00:00:00 2001 From: a-kaa Date: Sun, 28 Jun 2026 05:30:59 +0800 Subject: [PATCH 04/11] Add kernel gtest operator checker --- docs/contributing/issue-108-session-log.md | 1044 +++++++++++++++++ rl_engine/kernels/gtest/__init__.py | 10 + rl_engine/kernels/gtest/op_checks.py | 292 +++++ rl_engine/kernels/gtest/operator_inputs.py | 244 ++++ rl_engine/kernels/gtest/operator_specs.py | 107 ++ .../{testing => kernels/gtest}/tolerance.py | 0 .../gtest}/tolerance_contract.yaml | 0 scripts/check_operator.py | 116 ++ tests/test_op_checks.py | 129 ++ tests/test_operator_inputs.py | 70 ++ tests/test_tolerance_contract.py | 2 +- 11 files changed, 2013 insertions(+), 1 deletion(-) create mode 100644 docs/contributing/issue-108-session-log.md create mode 100644 rl_engine/kernels/gtest/__init__.py create mode 100644 rl_engine/kernels/gtest/op_checks.py create mode 100644 rl_engine/kernels/gtest/operator_inputs.py create mode 100644 rl_engine/kernels/gtest/operator_specs.py rename rl_engine/{testing => kernels/gtest}/tolerance.py (100%) rename rl_engine/{testing => kernels/gtest}/tolerance_contract.yaml (100%) create mode 100644 scripts/check_operator.py create mode 100644 tests/test_op_checks.py create mode 100644 tests/test_operator_inputs.py diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md new file mode 100644 index 0000000..f49ddde --- /dev/null +++ b/docs/contributing/issue-108-session-log.md @@ -0,0 +1,1044 @@ +# ISSUE-108 Session Log + +本文档记录本 session 中围绕 RL-Kernel 算子测试框架、CUDA 验证和 upstream 同步的所有关键修改。后续本 session 中每次代码修改都必须继续追加到本文档,记录目标、设计判断、修改文件、验证方式和结果。 + +## 记录原则 + +- 使用中文记录工程判断。 +- 每个改动都需要说明为什么做,而不只记录改了什么。 +- 保持最小增量:一次修改尽量只围绕一个明确问题。 +- 每个子任务需要能独立验证;无法验证时必须明确说明原因。 +- CUDA 验证必须如实记录环境、命令、误差和失败范围。 +- 不把失败路径写成已支持能力。 + +## 总体目标 + +本 session 的目标是把算子验证从零散脚本推进为可复用、可扩展、可审查的工程化框架: + +- 建立统一 tolerance contract,用于管理不同算子的误差阈值。 +- 建立公共 operator check runner,替代单算子专用验证脚本。 +- 建立统一 operator input 生成逻辑,覆盖后训练常见算子的基础输入。 +- 将测试入口改造成可指定 `op`、`candidate`、`dtype`、`device`、shape 参数的 CLI。 +- 同步 upstream/main,吸收 PR #122 中的 SM90 相关修复。 +- 在 H20 机器上验证普通 CUDA `fused_logp` 路径。 +- 明确 SM90 `fused_linear_logp` 在 CUDA 12.4 下仍未通过。 + +## 时间线 + +### 1. tolerance table 和 contract loader + +目标: + +- 将不同 dtype、op class 的误差容差从测试代码中抽离出来。 +- 让误差阈值可以被审查和维护,而不是散落在测试断言中。 + +修改文件: + +- `rl_engine/testing/tolerance.py` +- `rl_engine/testing/tolerance_contract.yaml` +- `tests/test_tolerance_contract.py` + +设计判断: + +- 使用 YAML 保存 contract,便于人工 review。 +- 将容差按 `accuracy.default` 和可选硬件 override 组织。 +- `default` 是通用 fallback,不等同于 CPU;CPU、SM90、SM100、ROCm、Ascend 等未来可作为明确 override key。 + +验证: + +- `tests/test_tolerance_contract.py` 验证 contract 可读、结构正确。 + +结果: + +- tolerance contract 框架建立完成。 + +### 2. operator check runner + +目标: + +- 建立类似 GoogleTest 思路的算子验证 runner。 +- 一个 case 表示一组确定输入和 gold path,一个 candidate 表示被测实现。 + +修改文件: + +- `rl_engine/testing/op_checks.py` +- `tests/test_op_checks.py` +- `rl_engine/testing/__init__.py` + +设计判断: + +- `OperatorCase` 表示测试对象:`name`、`op_class`、`dtype`、`inputs`、`gold_fn`。 +- `CandidateSpec` 表示被测实现:`name`、`fn`、`backend`、`arch_key`。 +- runner 负责: + - 调用 candidate。 + - 调用 gold。 + - flatten 多输出。 + - 按 `op_class + dtype + arch_key` 解析容差。 + - 计算 `max_abs_error`、`mean_abs_error`、`max_rel_error`。 + - 返回结构化 report。 + +验证: + +- `tests/test_op_checks.py` 覆盖 native logp、registry logp、失败 candidate、arch override 等场景。 + +结果: + +- 公共 operator check runner 建立完成。 + +### 3. `check_operator.py` 从 logp 专用入口改为公共入口 + +目标: + +- 让测试者通过 CLI 指定算子、candidate、dtype、device 和 shape。 +- 避免后续每个算子都写一个独立测试脚本。 + +修改文件: + +- `scripts/check_operator.py` + +设计判断: + +- `check_operator.py` 只负责: + - 解析参数。 + - 选择 device/dtype。 + - 调用 `make_candidate`。 + - 调用 `make_operator_case`。 + - 调用 `run_operator_suite`。 + - 输出 summary 或 JSON。 +- 不在入口中硬编码具体算子实现。 + +验证: + +```bash +python scripts/check_operator.py --op logp --candidate pytorch --dtype fp32 --batch 1 --seq 4 --vocab 17 +python scripts/check_operator.py --op logp --candidate registry --dtype bf16 --batch 2 --seq 16 --vocab 257 --json +python -m pytest tests/test_op_checks.py -q +``` + +结果: + +- 公共 CLI 最小闭环通过。 + +### 4. 抽离 operator specs + +目标: + +- 避免新增算子时修改测试入口。 +- 将算子元信息集中到专门文件。 + +修改文件: + +- `rl_engine/testing/operator_specs.py` + +设计判断: + +- 每个算子通过 `OperatorSpec` 描述: + - `name` + - `op_class` + - `gold_path` + - `registry_name` + - `candidate_paths` +- `check_operator.py` 不直接知道某个算子的 Python 类路径。 +- `--candidate cuda` 明确选择 CUDA candidate。 +- `--candidate registry` 仅用于测试 dispatcher 分发结果,不作为具体 CUDA correctness 的替代。 + +当前 logp 映射: + +```text +pytorch -> NativeLogpOp +cuda -> FusedLogpGenericOp +cuda-generic -> FusedLogpGenericOp +cuda-sm90 -> FusedLogpSM90Op +registry -> kernel_registry.get_op("logp") +``` + +结果: + +- 后续新增算子主要扩展 `operator_specs.py`,不再修改公共入口。 + +### 5. 统一 operator input 工厂 + +目标: + +- 用户指出不希望每个新算子都手写 `_make_xxx_inputs` 和 `_xxx_shape_name`。 +- 统一准备 ISSUE #108 中所有算子的输入初始化。 + +修改文件: + +- `rl_engine/testing/operator_inputs.py` +- `tests/test_operator_inputs.py` +- `rl_engine/testing/operator_specs.py` +- `scripts/check_operator.py` + +设计判断: + +- 新增 `make_operator_inputs(op_name, args, dtype, device)`。 +- 新增 `operator_shape_name(op_name, args)`。 +- 支持 `random` 和 `constant` 两种输入模式: + - `random` 用 seed 控制可复现。 + - `constant` 用固定值便于 debug。 +- 支持的算子输入: + - `rms_norm` + - `matmul` + - `attention` + - `logp` + - `rope` + - `silu` + - `swiglu` + - `embedding` + - `lm_head` + - `kv_cache_attention` + +CLI 增加参数: + +```text +--input-mode random|constant +--constant-value +--token-value +--normalized-dim +--k-dim +--n-dim +--theta +--eps +``` + +验证: + +```bash +python -m pytest tests/test_operator_inputs.py -q +python -m pytest tests/test_op_checks.py -q +python scripts/check_operator.py --op logp --candidate pytorch --dtype fp32 --batch 1 --seq 4 --vocab 17 --input-mode constant --constant-value 0.5 --token-value 3 +``` + +结果: + +- 多算子输入生成能力建立。 +- logp CLI 随机输入和固定输入均通过。 + +### 6. 删除模型命名和档位参数 + +目标: + +- 用户指出测试框架不应绑定某个模型,也不需要冗余 `--size` 档位。 +- 输入生成应表达为通用数据规模,而不是某个模型配置。 + +修改文件: + +- `rl_engine/testing/operator_inputs.py` +- `scripts/check_operator.py` +- `tests/test_operator_inputs.py` + +设计判断: + +- 删除 `QWEN3_8B` 命名。 +- 删除 `ModelShape` / `DEFAULT_MODEL_SHAPE` 抽象。 +- 改为普通常量: + +```python +DEFAULT_HIDDEN = 4096 +DEFAULT_N_HEADS = 32 +DEFAULT_N_KV_HEADS = 8 +DEFAULT_HEAD_DIM = 128 +DEFAULT_INTERMEDIATE = 12288 +DEFAULT_VOCAB = 151936 +DEFAULT_ROPE_THETA = 1.0e6 +DEFAULT_RMS_EPS = 1.0e-6 +``` + +- 删除 `--size small|medium|large`。 +- 仅保留显式 `--batch` 和 `--seq`。 + +验证: + +```bash +rg -n "Qwen|QWEN|qwen" rl_engine/testing scripts tests +rg -n "ModelShape|DEFAULT_MODEL|model" rl_engine/testing/operator_inputs.py +rg -n "BATCH_SHAPES|BatchShape|--size|small|medium|large" scripts/check_operator.py rl_engine/testing/operator_inputs.py tests/test_operator_inputs.py +python -m pytest tests/test_operator_inputs.py tests/test_op_checks.py -q +``` + +结果: + +- 模型耦合和档位参数均已删除。 +- 相关测试通过。 + +### 7. 同步 upstream/main 并处理冲突 + +目标: + +- 用户要求检查当前代码是否过旧,拉取最新代码,如有冲突则解决。 + +操作: + +```bash +git fetch --all --prune +git stash push -u -m pre-upstream-main-sync +git rebase upstream/main +git stash pop +``` + +冲突文件: + +- `csrc/cuda/fused_logp_sm90.cu` + +设计判断: + +- `upstream/main` 已包含 PR #122。 +- PR #122 中已经包含 SM90 文件的两项修复: + - `#include ` + - `reinterpret_cast` +- 因此冲突解决时采用 upstream/main 版本。 +- 丢弃本地临时加入的 `#include `。 + +验证: + +```bash +python -m pytest tests/test_operator_inputs.py tests/test_op_checks.py -q +``` + +结果: + +- rebase 到最新 `upstream/main` 成功。 +- 冲突解决完成。 +- 本地相对 `upstream/main` 为 `ahead 3, behind 0`。 + +### 8. H20 CUDA 环境和普通 CUDA logp 验证 + +目标: + +- 将测试框架迁移到 H 系列 GPU 环境验证。 +- 先确认普通 CUDA `fused_logp` 路径是否可用。 + +环境记录: + +```text +GPU: NVIDIA H20 +Driver: 565.57.01 +Driver CUDA capability: 12.7 +nvcc: 12.4 +Python: 3.11.15 +``` + +普通 CUDA 扩展检查: + +```text +_EXT_AVAILABLE: True +has fused_logp: True +has fused_logp_sm90: False +``` + +验证命令: + +```bash +python scripts/check_operator.py \ + --op logp \ + --candidate cuda \ + --device cuda \ + --dtype bf16 \ + --arch-key sm90 \ + --batch 2 \ + --seq 16 \ + --vocab 257 +``` + +输出: + +```text +INFO [RL-Kernel]: Successfully linked to precompiled _C.fused_logp fallback kernel. +suite=logp passed=True pass_rate=1.0000 +candidate=cuda-logp backend=cuda passed=True pass_rate=1.0000 +case=logp-torch.bfloat16-2x16x257 output=0 shape=(2, 16) +max_abs=1.49779320e-02 +mean_abs=7.53845274e-03 +max_rel=2.70811981e-03 +tol=(atol=5.000e-02, rtol=0.000e+00) +passed=True +``` + +结论: + +- 普通 CUDA `FusedLogpGenericOp -> _C.fused_logp` 路径通过。 +- 这证明测试框架最小 GPU 闭环已经打通: + +```text +CLI +-> operator_specs +-> operator_inputs +-> PyTorch gold +-> CUDA candidate +-> run_operator_suite +-> tolerance contract +-> compare_output +-> structured report +``` + +### 9. SM90 fused_linear_logp 当前状态 + +目标: + +- 尝试编译和验证 SM90 路径。 + +结果: + +- `fused_logp_sm90` 的旧 include 和 type 问题已由 upstream PR #122 解决。 +- 但 `fused_linear_logp_sm90.cu` 在 CUDA 12.4 下仍未通过 ptxas。 + +错误摘要: + +```text +ptxas error: State space incorrect for instruction 'cp.async.bulk.tensor' +ptxas fatal: Ptx assembly aborted due to errors +``` + +设计判断: + +- PR #122 描述中提到相关 SM90 路径在 CUDA 13.1 下 assembled。 +- 当前 H20 环境是 nvcc 12.4,不应将该路径写为已通过。 +- 当前应先以普通 CUDA `fused_logp` 作为验证通过范围。 + +结论: + +- 已通过:`--candidate cuda` +- 未通过:`--candidate cuda-sm90` / SM90 fused linear logp + +## 当前文件状态摘要 + +本 session 产生或涉及的主要文件: + +```text +rl_engine/testing/tolerance.py +rl_engine/testing/tolerance_contract.yaml +tests/test_tolerance_contract.py +rl_engine/testing/op_checks.py +tests/test_op_checks.py +rl_engine/testing/__init__.py +scripts/check_operator.py +rl_engine/testing/operator_specs.py +rl_engine/testing/operator_inputs.py +tests/test_operator_inputs.py +csrc/cuda/fused_logp_sm90.cu +docs/contributing/issue-108-session-log.md +``` + +说明: + +- `AGENTS.md` 是未跟踪文件,未纳入本 session 的代码修改范围。 +- `csrc/cuda/fused_logp_sm90.cu` 最终与 upstream PR #122 版本一致。 + +## 后续记录模板 + +之后每次代码修改都在本文档追加如下条目: + +```markdown +### YYYY-MM-DD HH:MM - 变更标题 + +目标: + +- 本次最小子任务要解决什么问题。 + +修改文件: + +- `path/to/file.py` + +设计决策: + +- 为什么这样改。 +- 为什么没有选择其他方案。 + +验证方式: + +- 执行的测试命令。 +- CUDA 环境,如 GPU、CUDA 版本、driver、arch。 +- 关键输出指标。 + +结果: + +- 通过 / 未通过 / 部分通过。 +- 未通过时必须记录完整错误摘要。 + +后续: + +- 是否需要继续拆分子任务。 +- 是否影响 CI、benchmark 或其他算子。 +``` + +CUDA 验证建议额外记录: + +```markdown +GPU: +CUDA: +Driver: +Arch: +Candidate: +Backend: +Command: +max_abs: +mean_abs: +max_rel: +atol: +rtol: +Result: +Known issue: +``` + +### 2026-06-28 - CUDA 13 CUB reduce functor 兼容修复 + +目标: + +- 修复 H100 + CUDA 13.0 环境下 SM90 编译失败的问题。 + +修改文件: + +- `csrc/cuda/fused_logp_sm90.cu` +- `docs/contributing/issue-108-session-log.md` + +错误摘要: + +```text +csrc/cuda/fused_logp_sm90.cu(76): error: namespace "cub" has no member "Max" +csrc/cuda/fused_logp_sm90.cu(86): error: namespace "cub" has no member "Sum" +``` + +设计决策: + +- 不继续依赖 CUB 内置 `cub::Max()` 和 `cub::Sum()` functor 名称。 +- 在当前 SM90 文件内定义本地 `FloatMax` 和 `FloatSum`,传给 `cub::BlockReduce::Reduce`。 +- 这样保留原有 reduction 语义,同时规避 CUDA 13 / CCCL 中 CUB functor API 变化。 + +验证方式: + +- 本地只做源码修改;需要在 H100 + CUDA 13.0 机器上重新执行: + +```bash +rm -rf build +find rl_engine -name "*.so" -delete + +export CUDA_HOME=/usr/local/cuda +export PATH=$CUDA_HOME/bin:$PATH +export LD_LIBRARY_PATH=$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-} +export OMP_NUM_THREADS=8 +export MAX_JOBS=1 + +KERNEL_ALIGN_FORCE_SM90=1 pip install -v --no-build-isolation -e . 2>&1 | tee build_sm90.log +``` + +结果: + +- 待 H100 机器重新编译确认。 + +后续: + +- 如果继续失败,优先查看 `grep -nE "FAILED:|error:|ptxas|fatal" build_sm90.log | head -n 80`。 + +### 2026-06-28 - 对齐 SM90 LogP Python wrapper 输入接口 + +目标: + +- 修复 `check_operator.py --candidate cuda-sm90` 调用失败的问题。 + +修改文件: + +- `rl_engine/kernels/ops/cuda/loss/logp.py` +- `docs/contributing/issue-108-session-log.md` + +错误摘要: + +```text +TypeError: FusedLogpSM90Op.__call__() got an unexpected keyword argument 'token_ids' +``` + +设计决策: + +- 测试框架统一通过 `case.inputs` 传递 `token_ids`。 +- `FusedLogpGenericOp`、`NativeLogpOp` 都使用 `token_ids` 命名。 +- 因此将 `FusedLogpSM90Op.__call__(logits, labels)` 改为 `__call__(logits, token_ids)`,让 candidate 接口与 gold/case 输入一致。 +- 同时在 wrapper 内部把 `[B, S, V]` logits reshape 为 `[B*S, V]`,把 `[B, S]` token ids flatten 为 `[B*S]`,再把 `_C.fused_logp_sm90` 的 `[B*S]` 输出 reshape 回 `[B, S]`。 + +验证方式: + +- 需要在 H100 + CUDA 13.0 机器上同步该 patch 后运行: + +```bash +python scripts/check_operator.py \ + --op logp \ + --candidate cuda-sm90 \ + --device cuda \ + --dtype bf16 \ + --arch-key sm90 \ + --batch 2 \ + --seq 16 \ + --vocab 257 +``` + +结果: + +- 已撤回该方向。`rl_engine/kernels/ops/cuda/loss/logp.py` 属于被测 CUDA 算子实现,不应为了测试框架改动其接口。 +- 后续适配应放在 testing 层,例如在 candidate adapter 中把测试框架统一的 `token_ids` 映射为 SM90 wrapper 需要的 `labels`,并处理 flatten/reshape。 + +### 2026-06-28 - 在 testing 层适配 SM90 LogP candidate + +目标: + +- 保持 `rl_engine/kernels/ops/cuda` 下被测实现不变。 +- 让 `check_operator.py --candidate cuda-sm90` 可以使用测试框架统一的 `token_ids` 输入。 + +修改文件: + +- `rl_engine/testing/operator_specs.py` +- `docs/contributing/issue-108-session-log.md` + +设计决策: + +- 新增 `_LogpSM90CandidateAdapter`,只在 `args.op == "logp"` 且 `candidate == "cuda-sm90"` 时使用。 +- adapter 接收测试框架标准输入 `logits` 和 `token_ids`。 +- adapter 内部把 `logits` 从 `[B, S, V]` flatten 为 `[B*S, V]`,把 `token_ids` 从 `[B, S]` flatten 为 `[B*S]`。 +- adapter 调用原始 SM90 candidate:`self._candidate(logits_2d, labels_1d)`。 +- adapter 将输出 reshape 回 `[B, S]`,以便 `compare_output` 按原始 case shape 比较。 + +验证方式: + +- 本地执行 Python 测试和编译检查: + +```bash +python -m py_compile rl_engine/testing/operator_specs.py +python -m pytest tests/test_op_checks.py tests/test_operator_inputs.py -q +``` + +- H100 机器需要重新运行: + +```bash +python scripts/check_operator.py \ + --op logp \ + --candidate cuda-sm90 \ + --device cuda \ + --dtype bf16 \ + --arch-key sm90 \ + --batch 2 \ + --seq 16 \ + --vocab 257 +``` + +结果: + +- 本地待验证;H100 CUDA 结果待重新运行确认。 + +### 2026-06-28 - H100 CUDA generic 与 SM90 LogP 对照验证 + +目标: + +- 记录 H100 + CUDA 13.0 环境下 `logp` 的 generic CUDA 和 SM90 candidate 行为差异。 +- 明确测试框架已能区分“通过的 CUDA generic candidate”和“编译/运行存在问题的 SM90 candidate”。 + +环境: + +```text +GPU: NVIDIA H100 80GB HBM3 +Driver: 580.95.05 +CUDA driver capability: 13.0 +nvcc: 13.0 +Python: 3.12.13 +torch: 2.12.0+cu130 +torch cuda: 13.0 +compute capability: (9, 0) +``` + +修改文件: + +- `docs/contributing/issue-108-session-log.md` + +验证命令: + +```bash +for v in 256 512 1024 2048 4096; do + echo "=== vocab=$v ===" + python scripts/check_operator.py \ + --op logp \ + --candidate cuda \ + --device cuda \ + --dtype bf16 \ + --arch-key sm90 \ + --batch 1 \ + --seq 1 \ + --vocab $v +done +``` + +generic CUDA 结果: + +```text +vocab=256: passed=True, max_abs=5.77497482e-03 +vocab=512: passed=True, max_abs=8.04328918e-03 +vocab=1024: passed=True, max_abs=1.80721283e-04 +vocab=2048: passed=True, max_abs=1.77164078e-02 +vocab=4096: passed=True, max_abs=2.69813538e-02 +atol=5.000e-02, rtol=0.000e+00 +``` + +结论: + +- `FusedLogpGenericOp -> _C.fused_logp` 在 H100 + CUDA 13.0 上多 vocab correctness 全部通过。 +- 这进一步确认测试框架、input 生成、gold path、candidate 调用和 compare_output 链路是通的。 + +SM90 对照现象: + +```text +TILE_V=4096: + vocab=257/4096/151936 均在 cuTensorMapEncodeTiled 失败。 + 错误:CUDA_ERROR_INVALID_VALUE。 + +TILE_V=256: + vocab=256 返回结果,但 passed=False,max_abs≈1.04094028e+00。 + vocab=512/1024/2048/4096 在 20s timeout 下没有输出 report,表现为 hang/timeout。 +``` + +结论: + +- `cuda-sm90` 已能编译和加载,但当前 SM90 TMA kernel 仍不能标记为通过。 +- 当前通过范围只包括 `--candidate cuda` generic CUDA logp。 +- SM90 问题应作为独立 CUDA kernel bugfix 处理,不归因于测试框架。 + +### 2026-06-28 - 最终整理:不提交 CUDA 源码改动 + +目标: + +- 用户明确要求本阶段不修改 `csrc` 下 CUDA/TMA 源码。 +- 本阶段只提交算子测试框架和文档,不把 SM90 kernel 实验 patch 混入测试框架 PR。 + +本地处理: + +- 已还原 `csrc/cuda/fused_logp_sm90.cu`。 +- 本地 `csrc/utils/tma_utils.cuh` 没有 diff。 +- 因此本地最终不会提交任何 `csrc` 改动。 + +服务器状态对照: + +用户在 H100 服务器上看到: + +```text +Changes not staged for commit: + modified: csrc/cuda/fused_logp_sm90.cu + modified: csrc/utils/tma_utils.cuh +``` + +本地当前状态不同: + +```text +csrc/cuda/fused_logp_sm90.cu: no diff after restore +csrc/utils/tma_utils.cuh: no diff locally +``` + +结论: + +- H100 服务器上的 `csrc/utils/tma_utils.cuh` 改动不是本地当前工作区的一部分。 +- 如果服务器要回到与本地一致的测试框架提交状态,需要在服务器上还原两个 CUDA/TMA 文件: + +```bash +git restore csrc/cuda/fused_logp_sm90.cu csrc/utils/tma_utils.cuh +``` + +保留记录的 CUDA 现象: + +- H100 环境: + +```text +GPU: NVIDIA H100 80GB HBM3 +Driver: 580.95.05 +CUDA driver capability: 13.0 +nvcc: 13.0.88 +Python: 3.12.13 +torch: 2.12.0+cu130 +torch cuda: 13.0 +compute capability: (9, 0) +``` + +- `--candidate cuda` generic logp 在 vocab 256/512/1024/2048/4096 上通过。 +- `--candidate cuda-sm90` 可以编译和加载,但: + - `TILE_V=4096` 触发 `cuTensorMapEncodeTiled failed`。 + - `TILE_V=256` 时 vocab=256 返回但数值不通过,vocab>=512 出现 timeout/hang。 +- 所以 SM90 fused logp 目前记录为 CUDA kernel 问题,不作为测试框架失败。 + +### 2026-06-28 - 测试框架目录归位 + +目标: + +- 用户指出测试框架文件放在通用 `rl_engine/testing` 下过于分散。 +- 本阶段将 operator correctness checking 代码移动到 kernel 相关目录,避免和 RL batch/reference testing 混在一起。 + +最终目录: + +```text +rl_engine/kernels/gtest/ + __init__.py + op_checks.py + operator_inputs.py + operator_specs.py + tolerance.py + tolerance_contract.yaml +``` + +职责划分: + +- `op_checks.py` + - 定义 `OperatorCase`、`CandidateSpec`、report dataclass。 + - 调用 gold 和 candidate。 + - flatten 输出。 + - 解析 tolerance。 + - 计算 `max_abs_error`、`mean_abs_error`、`max_rel_error`。 + - 返回通过率和结构化 report。 +- `operator_inputs.py` + - 统一构造标准语义输入。 + - 支持 `random` 和 `constant`。 + - 支持 `batch`、`seq`、`vocab` 等 CLI 参数。 + - 当前覆盖 ISSUE-108 相关算子的输入初始化骨架。 +- `operator_specs.py` + - 注册每个算子的 gold path 和 candidate path。 + - gold path 必须来自 `rl_engine.kernels.ops.pytorch`。 + - candidate path 来自 `cuda`、`triton`、`rocm` 或未来 backend。 +- `tolerance.py` 和 `tolerance_contract.yaml` + - 加载 dtype/operator-class 容差表。 + - 供 `op_checks.py` 在 compare output 时解析 `atol` 和 `rtol`。 +- `scripts/check_operator.py` + - 命令行入口。 + - 不直接硬编码具体算子实现。 + +导入边界: + +- operator checking 框架只从 `rl_engine.kernels.gtest` 导入。 +- `rl_engine/testing/__init__.py` 不导出 `CandidateSpec`、`OperatorCase`、`run_operator_suite`。 +- 这样可以避免 kernel correctness checking 和通用 RL testing helper 混在一起。 + +### 2026-06-28 - 添加新算子的傻瓜式流程 + +目标: + +- 新增算子时不修改测试主逻辑。 +- 新增算子只改注册信息、输入工厂和必要测试。 +- gold 永远使用 `rl_engine/kernels/ops/pytorch` 下实现。 + +步骤 1:确认算子标准接口 + +先确定这个算子的标准语义输入。例如: + +```text +logp: + inputs: + logits: [B, S, V] + token_ids: [B, S] + output: + selected_logp: [B, S] +``` + +要求: + +- PyTorch gold、CUDA、Triton、ROCm wrapper 都应尽量使用同一套 Python 接口。 +- 不同 backend 不应要求测试框架长期维护 shape/参数名 adapter。 +- 当前 `_LogpSM90CandidateAdapter` 只是为了验证框架最小闭环的临时例外,不作为长期模式。 + +步骤 2:在 `operator_inputs.py` 添加输入构造 + +文件: + +```text +rl_engine/kernels/gtest/operator_inputs.py +``` + +需要做三件事: + +1. 在 `make_operator_inputs()` 的 `builders` 中加入算子名。 +2. 在 `operator_shape_name()` 的 `names` 中加入 shape 描述。 +3. 新增 `_make_xxx_inputs(args, dtype, device)`。 + +要求: + +- 输入必须是标准语义输入,不是某个 CUDA kernel 的私有格式。 +- `random` 模式必须可由 `--seed` 复现。 +- `constant` 模式必须便于 debug。 +- 多 batch 情况默认保留 `[B, S, ...]` 语义形状,不提前为某个 backend flatten。 + +步骤 3:在 `operator_specs.py` 注册 gold 和 candidate + +文件: + +```text +rl_engine/kernels/gtest/operator_specs.py +``` + +添加: + +```python +"new_op": OperatorSpec( + name="new_op", + op_class="...", + gold_path="rl_engine.kernels.ops.pytorch....NativeNewOp", + registry_name="new_op", + candidate_paths={ + "pytorch": "rl_engine.kernels.ops.pytorch....NativeNewOp", + "cuda": "rl_engine.kernels.ops.cuda....CudaNewOp", + "triton": "rl_engine.kernels.ops.triton....TritonNewOp", + }, +) +``` + +硬性规则: + +- `gold_path` 必须来自 `rl_engine.kernels.ops.pytorch`。 +- `candidate_paths["pytorch"]` 只能用于框架自检,不代表高性能算子通过。 +- `candidate_paths["cuda"]`、`candidate_paths["triton"]` 等必须对应实际被测 backend。 +- 不允许用实现了不同数学功能的算子互相比较,例如不能用 `linear_logp` 测普通 `logp`。 + +步骤 4:确认 gold 调用方法 + +当前 `make_operator_case()` 对 logp 使用: + +```python +gold_fn=gold_op.forward_fp32 +``` + +这对 `NativeLogpOp` 是正确的。新增算子时必须确认 PyTorch gold 是否有对应方法。 + +如果新算子没有 `forward_fp32`,不要在测试主逻辑中硬编码临时分支;应在 `operator_specs.py` 中显式补充 gold 调用策略,作为一个独立小改动提交。 + +步骤 5:新增输入和 runner 单测 + +至少补两类测试: + +```text +tests/test_operator_inputs.py: + 确认 make_operator_inputs("new_op", ...) 能生成输入。 + 确认 random seed 可复现。 + 确认 constant 模式值正确。 + +tests/test_op_checks.py 或新测试文件: + 用 pytorch candidate vs pytorch gold 验证框架能跑通。 + 用 bad candidate 验证失败报告符合预期。 +``` + +步骤 6:本地验证 + +CPU 框架验证: + +```bash +python -m pytest tests/test_op_checks.py tests/test_operator_inputs.py -q +``` + +CUDA candidate 验证: + +```bash +python scripts/check_operator.py \ + --op new_op \ + --candidate cuda \ + --device cuda \ + --dtype bf16 \ + --arch-key sm90 \ + --batch 2 \ + --seq 16 +``` + +如果 CUDA candidate 不通过,先判断: + +```text +1. gold 和 candidate 是否真的是同一个数学函数。 +2. candidate Python wrapper 是否使用标准接口。 +3. 输入 dtype / shape 是否符合 candidate 声明。 +4. 误差是否超过 tolerance。 +5. 是否是 kernel 编译或运行错误。 +``` + +不要为了让测试通过去修改 gold,也不要把不同功能的算子混在一起比较。 + +### 2026-06-28 - `check_operator.py` 支持参数 + +入口: + +```bash +python scripts/check_operator.py [options] +``` + +核心参数: + +```text +--op + 算子名。当前最小版本支持 logp。 + +--candidate + 被测实现。当前 logp 支持 registry、pytorch、native、cuda、cuda-generic、cuda-sm90。 + +--dtype + fp32、bf16、fp16。 + +--device + auto、cpu、cuda 或 torch 可识别的 device 字符串。 + +--arch-key + tolerance override key,例如 sm90。为空时使用 default tolerance。 +``` + +shape 参数: + +```text +--batch + batch size,默认 2。 + +--seq + sequence length,默认 16。 + +--vocab + vocabulary size,默认 257。 + +--normalized-dim + norm 类算子的 hidden/normalized dimension。 + +--k-dim + matmul K dimension。 + +--n-dim + matmul N dimension。 +``` + +输入初始化参数: + +```text +--input-mode + random 或 constant。 + +--constant-value + constant 模式下浮点 tensor 的基础值。 + +--token-value + constant 模式下 token id 的基础值,会对 vocab 取模。 + +--seed + random 模式下的随机种子。 +``` + +其他参数: + +```text +--theta + RoPE theta。 + +--eps + norm epsilon。 + +--json + 输出完整 JSON report。 +``` + +当前最小可运行示例: + +```bash +python scripts/check_operator.py \ + --op logp \ + --candidate cuda \ + --device cuda \ + --dtype bf16 \ + --arch-key sm90 \ + --batch 1 \ + --seq 1 \ + --vocab 4096 +``` + +预期: + +- 在 H100 + CUDA 13.0 环境中,generic CUDA logp 已观察到通过。 +- `cuda-sm90` 当前不应作为通过路径使用。 diff --git a/rl_engine/kernels/gtest/__init__.py b/rl_engine/kernels/gtest/__init__.py new file mode 100644 index 0000000..c3fc366 --- /dev/null +++ b/rl_engine/kernels/gtest/__init__.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 RL-Kernel Contributors + +from .op_checks import CandidateSpec, OperatorCase, run_operator_suite + +__all__ = [ + "CandidateSpec", + "OperatorCase", + "run_operator_suite", +] diff --git a/rl_engine/kernels/gtest/op_checks.py b/rl_engine/kernels/gtest/op_checks.py new file mode 100644 index 0000000..433bbfc --- /dev/null +++ b/rl_engine/kernels/gtest/op_checks.py @@ -0,0 +1,292 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 RL-Kernel Contributors + +from __future__ import annotations + +from collections.abc import Callable, Mapping, Sequence +from dataclasses import asdict, dataclass +from typing import Any + +import torch + +from rl_engine.kernels.gtest.tolerance import load_contract + + +@dataclass(frozen=True) +class OperatorCase: + """One deterministic test object for an operator candidate.""" + + name: str + op_class: str + dtype: torch.dtype + inputs: Mapping[str, Any] + gold_fn: Callable[..., Any] + + +@dataclass(frozen=True) +class CandidateSpec: + """One implementation to validate against the gold path.""" + + name: str + fn: Callable[..., Any] | Any + backend: str = "unknown" + arch_key: str | None = None + + +@dataclass(frozen=True) +class OutputCheck: + """Per-output comparison result.""" + + output_index: int + shape: tuple[int, ...] + candidate_dtype: str + gold_dtype: str + atol: float + rtol: float + max_abs_error: float + mean_abs_error: float + max_rel_error: float + passed: bool + message: str = "" + + +@dataclass(frozen=True) +class CaseCheck: + """Per-case result for one candidate.""" + + case_name: str + dtype: str + op_class: str + passed: bool + outputs: list[OutputCheck] + + +@dataclass(frozen=True) +class CandidateReport: + """Aggregate report for one candidate implementation.""" + + candidate_name: str + backend: str + total_outputs: int + passed_outputs: int + pass_rate: float + passed: bool + cases: list[CaseCheck] + + +@dataclass(frozen=True) +class OperatorCheckReport: + """Suite-level report across candidates.""" + + suite_name: str + total_candidates: int + passed_candidates: int + pass_rate: float + passed: bool + candidates: list[CandidateReport] + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + +def run_operator_suite( + suite_name: str, + *, + candidates: Sequence[CandidateSpec], + cases: Sequence[OperatorCase], + contract: Mapping[str, Any] | None = None, +) -> OperatorCheckReport: + """Run candidates against gold outputs and return a structured report.""" + + loaded_contract = dict(contract or load_contract()) + # run all test ops + # cases : test object + # camdidate : test instance + # loaded_contract : tolerance table + candidate_reports = [ + _run_candidate(candidate, cases, loaded_contract) for candidate in candidates + ] + passed_candidates = sum(1 for report in candidate_reports if report.passed) + total_candidates = len(candidate_reports) + pass_rate = float(passed_candidates / total_candidates) if total_candidates else 0.0 + return OperatorCheckReport( + suite_name=suite_name, + total_candidates=total_candidates, + passed_candidates=passed_candidates, + pass_rate=pass_rate, + passed=passed_candidates == total_candidates, + candidates=candidate_reports, + ) + + +def _run_candidate( + candidate: CandidateSpec, + cases: Sequence[OperatorCase], + contract: Mapping[str, Any], +) -> CandidateReport: + case_checks = [_run_case(candidate, case, contract) for case in cases] + total_outputs = sum(len(case.outputs) for case in case_checks) + passed_outputs = sum( + 1 for case in case_checks for output in case.outputs if output.passed + ) + pass_rate = float(passed_outputs / total_outputs) if total_outputs else 0.0 + return CandidateReport( + candidate_name=candidate.name, + backend=candidate.backend, + total_outputs=total_outputs, + passed_outputs=passed_outputs, + pass_rate=pass_rate, + passed=passed_outputs == total_outputs, + cases=case_checks, + ) + + +def _run_case( + candidate: CandidateSpec, + case: OperatorCase, + contract: Mapping[str, Any], +) -> CaseCheck: + candidate_outputs = _flatten_tensors(_call_candidate(candidate.fn, case.inputs)) + gold_outputs = _flatten_tensors(case.gold_fn(**case.inputs)) + if len(candidate_outputs) != len(gold_outputs): + raise ValueError( + f"candidate {candidate.name!r} returned {len(candidate_outputs)} outputs, " + f"gold returned {len(gold_outputs)}" + ) + atol, rtol = _resolve_tolerance( + contract, + op_class=case.op_class, + dtype=case.dtype, + arch_key=candidate.arch_key, + ) + output_checks = [ + _compare_output( + candidate_output, + gold_output, + output_index=index, + atol=atol, + rtol=rtol, + ) + for index, (candidate_output, gold_output) in enumerate( + zip(candidate_outputs, gold_outputs, strict=True) + ) + ] + return CaseCheck( + case_name=case.name, + dtype=str(case.dtype), + op_class=case.op_class, + passed=all(output.passed for output in output_checks), + outputs=output_checks, + ) + + +# compatibility function or forward +def _call_candidate(candidate: Callable[..., Any] | Any, inputs: Mapping[str, Any]) -> Any: + if hasattr(candidate, "forward") and callable(candidate.forward): + return candidate.forward(**inputs) + return candidate(**inputs) + + +def _flatten_tensors(value: Any) -> list[torch.Tensor]: + if isinstance(value, torch.Tensor): + return [value] + if isinstance(value, (tuple, list)): + outputs: list[torch.Tensor] = [] + for item in value: + outputs.extend(_flatten_tensors(item)) + return outputs + raise TypeError(f"operator output must be Tensor or sequence, got {type(value)!r}") + + +def _resolve_tolerance( + contract: Mapping[str, Any], + *, + op_class: str, + dtype: torch.dtype, + arch_key: str | None = None, +) -> tuple[float, float]: + dtype_name = _dtype_name(dtype) + if arch_key is not None: + arch_values = ( + contract["accuracy"] + .get("arch_overrides", {}) + .get(arch_key, {}) + .get(op_class, {}) + .get(dtype_name) + ) + if arch_values is not None: + return float(arch_values["atol"]), float(arch_values.get("rtol", 0.0)) + + values = contract["accuracy"]["default"][op_class][dtype_name] + return float(values["atol"]), float(values.get("rtol", 0.0)) + + +def _dtype_name(dtype: torch.dtype) -> str: + if dtype is torch.float32: + return "float32" + if dtype is torch.bfloat16: + return "bfloat16" + if dtype is torch.float16: + return "float16" + raise ValueError(f"unsupported dtype: {dtype}") + + +def _compare_output( + candidate: torch.Tensor, + gold: torch.Tensor, + *, + output_index: int, + atol: float, + rtol: float, +) -> OutputCheck: + if candidate.shape != gold.shape: + return OutputCheck( + output_index=output_index, + shape=tuple(candidate.shape), + candidate_dtype=str(candidate.dtype), + gold_dtype=str(gold.dtype), + atol=atol, + rtol=rtol, + max_abs_error=float("inf"), + mean_abs_error=float("inf"), + max_rel_error=float("inf"), + passed=False, + message=f"shape mismatch: candidate={tuple(candidate.shape)} gold={tuple(gold.shape)}", + ) + + candidate_fp32 = candidate.float() + gold_fp32 = gold.float() + abs_error = (candidate_fp32 - gold_fp32).abs() + if abs_error.numel() == 0: + max_abs_error = 0.0 + mean_abs_error = 0.0 + max_rel_error = 0.0 + else: + max_abs_error = float(abs_error.max().item()) + mean_abs_error = float(abs_error.mean().item()) + rel_error = abs_error / gold_fp32.abs().clamp_min(1e-12) + max_rel_error = float(rel_error.max().item()) + + return OutputCheck( + output_index=output_index, + shape=tuple(candidate.shape), + candidate_dtype=str(candidate.dtype), + gold_dtype=str(gold.dtype), + atol=atol, + rtol=rtol, + max_abs_error=max_abs_error, + mean_abs_error=mean_abs_error, + max_rel_error=max_rel_error, + passed=bool(torch.allclose(candidate_fp32, gold_fp32, atol=atol, rtol=rtol)), + ) + + +__all__ = [ + "CandidateReport", + "CandidateSpec", + "CaseCheck", + "OperatorCase", + "OperatorCheckReport", + "OutputCheck", + "run_operator_suite", +] diff --git a/rl_engine/kernels/gtest/operator_inputs.py b/rl_engine/kernels/gtest/operator_inputs.py new file mode 100644 index 0000000..7e09719 --- /dev/null +++ b/rl_engine/kernels/gtest/operator_inputs.py @@ -0,0 +1,244 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 RL-Kernel Contributors + +from __future__ import annotations + +import argparse +from typing import Any + +import torch + + +DEFAULT_HIDDEN = 4096 +DEFAULT_N_HEADS = 32 +DEFAULT_N_KV_HEADS = 8 +DEFAULT_HEAD_DIM = 128 +DEFAULT_INTERMEDIATE = 12288 +DEFAULT_VOCAB = 151936 +DEFAULT_ROPE_THETA = 1.0e6 +DEFAULT_RMS_EPS = 1.0e-6 + + +def make_operator_inputs( + op_name: str, + args: argparse.Namespace, + dtype: torch.dtype, + device: torch.device, +) -> dict[str, Any]: + builders = { + "rms_norm": _make_rms_norm_inputs, + "matmul": _make_matmul_inputs, + "attention": _make_attention_inputs, + "logp": _make_logp_inputs, + "rope": _make_rope_inputs, + "silu": _make_silu_inputs, + "swiglu": _make_swiglu_inputs, + "embedding": _make_embedding_inputs, + "lm_head": _make_lm_head_inputs, + "kv_cache_attention": _make_kv_cache_attention_inputs, + } + try: + return builders[op_name](args, dtype, device) + except KeyError as exc: + raise ValueError(f"unsupported operator inputs: {op_name}") from exc + + +def operator_shape_name(op_name: str, args: argparse.Namespace) -> str: + batch, seq = _batch_seq(args) + vocab = _arg_int(args, "vocab", DEFAULT_VOCAB) + names = { + "rms_norm": f"{batch}x{seq}x{_normalized_dim(args)}", + "matmul": f"{batch}x{seq}x{_matmul_k(args)}x{_matmul_n(args)}", + "attention": f"{batch}x{DEFAULT_N_HEADS}x{seq}x{DEFAULT_HEAD_DIM}", + "logp": f"{batch}x{seq}x{vocab}", + "rope": f"{batch}x{DEFAULT_N_HEADS}x{seq}x{DEFAULT_HEAD_DIM}", + "silu": f"{batch}x{seq}x{DEFAULT_INTERMEDIATE}", + "swiglu": f"{batch}x{seq}x{DEFAULT_INTERMEDIATE}", + "embedding": f"{batch}x{seq}x{vocab}x{DEFAULT_HIDDEN}", + "lm_head": f"{batch}x{seq}x{vocab}", + "kv_cache_attention": f"{batch}x{DEFAULT_N_HEADS}x1x{seq + 1}x{DEFAULT_HEAD_DIM}", + } + try: + return names[op_name] + except KeyError as exc: + raise ValueError(f"unsupported operator shape: {op_name}") from exc + + +def _make_rms_norm_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + normalized_dim = _normalized_dim(args) + return { + "x": _floating_tensor((batch, seq, normalized_dim), args, dtype, device, offset=0), + "weight": _floating_tensor((normalized_dim,), args, dtype, device, offset=1), + "eps": _arg_float(args, "eps", DEFAULT_RMS_EPS), + } + + +def _make_matmul_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + k_dim = _matmul_k(args) + n_dim = _matmul_n(args) + return { + "a": _floating_tensor((batch, seq, k_dim), args, dtype, device, offset=0), + "b": _floating_tensor((k_dim, n_dim), args, dtype, device, offset=1), + } + + +def _make_attention_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + return { + "q": _floating_tensor((batch, DEFAULT_N_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 0), + "k": _floating_tensor((batch, DEFAULT_N_KV_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 1), + "v": _floating_tensor((batch, DEFAULT_N_KV_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 2), + "causal": True, + } + + +def _make_logp_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + vocab = _arg_int(args, "vocab", DEFAULT_VOCAB) + return { + "logits": _floating_tensor((batch, seq, vocab), args, dtype, device, offset=0), + "token_ids": _token_ids((batch, seq), vocab, args, device), + } + + +def _make_rope_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + return { + "x": _floating_tensor((batch, DEFAULT_N_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 0), + "positions": torch.arange(seq, device=device, dtype=torch.long), + "theta": _arg_float(args, "theta", DEFAULT_ROPE_THETA), + } + + +def _make_silu_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + return { + "x": _floating_tensor((batch, seq, DEFAULT_INTERMEDIATE), args, dtype, device, 0), + } + + +def _make_swiglu_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + return { + "gate": _floating_tensor((batch, seq, DEFAULT_INTERMEDIATE), args, dtype, device, 0), + "up": _floating_tensor((batch, seq, DEFAULT_INTERMEDIATE), args, dtype, device, 1), + } + + +def _make_embedding_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + vocab = _arg_int(args, "vocab", DEFAULT_VOCAB) + return { + "token_ids": _token_ids((batch, seq), vocab, args, device), + "weight": _floating_tensor((vocab, DEFAULT_HIDDEN), args, dtype, device, 0), + } + + +def _make_lm_head_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + vocab = _arg_int(args, "vocab", DEFAULT_VOCAB) + return { + "hidden": _floating_tensor((batch, seq, DEFAULT_HIDDEN), args, dtype, device, 0), + "weight": _floating_tensor((vocab, DEFAULT_HIDDEN), args, dtype, device, 1), + "bias": None, + } + + +def _make_kv_cache_attention_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + return { + "q": _floating_tensor((batch, DEFAULT_N_HEADS, 1, DEFAULT_HEAD_DIM), args, dtype, device, 0), + "k_cache": _floating_tensor((batch, DEFAULT_N_KV_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 1), + "v_cache": _floating_tensor((batch, DEFAULT_N_KV_HEADS, seq, DEFAULT_HEAD_DIM), args, dtype, device, 2), + "k_new": _floating_tensor((batch, DEFAULT_N_KV_HEADS, 1, DEFAULT_HEAD_DIM), args, dtype, device, 3), + "v_new": _floating_tensor((batch, DEFAULT_N_KV_HEADS, 1, DEFAULT_HEAD_DIM), args, dtype, device, 4), + "causal": True, + } + + +def _floating_tensor( + shape: tuple[int, ...], + args: argparse.Namespace, + dtype: torch.dtype, + device: torch.device, + offset: int, +) -> torch.Tensor: + # Example: torch.randn((B, S, V), device="cuda", dtype=torch.bfloat16) + mode = _arg_str(args, "input_mode", "random") + if mode == "constant": + value = _arg_float(args, "constant_value", 0.25) + float(offset) * 0.01 + return torch.full(shape, value, device=device, dtype=dtype) + if mode != "random": + raise ValueError(f"unsupported input_mode: {mode}") + generator = _generator(args, device, offset) + return torch.randn(shape, generator=generator, device=device, dtype=dtype) + + +def _token_ids( + shape: tuple[int, ...], + vocab: int, + args: argparse.Namespace, + device: torch.device, +) -> torch.Tensor: + mode = _arg_str(args, "input_mode", "random") + if mode == "constant": + value = _arg_int(args, "token_value", 0) % vocab + return torch.full(shape, value, device=device, dtype=torch.long) + generator = _generator(args, device, offset=13) + return torch.randint(0, vocab, shape, generator=generator, device=device, dtype=torch.long) + + +def _generator(args: argparse.Namespace, device: torch.device, offset: int) -> torch.Generator: + generator = torch.Generator(device=device) + generator.manual_seed(_arg_int(args, "seed", 123) + offset) + return generator + + +def _batch_seq(args: argparse.Namespace) -> tuple[int, int]: + return _arg_int(args, "batch", 2), _arg_int(args, "seq", 16) + + +def _normalized_dim(args: argparse.Namespace) -> int: + return _arg_int(args, "normalized_dim", DEFAULT_HIDDEN) + + +def _matmul_k(args: argparse.Namespace) -> int: + return _arg_int(args, "k_dim", DEFAULT_HIDDEN) + + +def _matmul_n(args: argparse.Namespace) -> int: + return _arg_int(args, "n_dim", DEFAULT_HIDDEN) + + +def _arg_float(args: argparse.Namespace, name: str, default: float) -> float: + return float(getattr(args, name, default)) + + +def _arg_int(args: argparse.Namespace, name: str, default: int) -> int: + return int(getattr(args, name, default)) + + +def _arg_str(args: argparse.Namespace, name: str, default: str) -> str: + return str(getattr(args, name, default)) diff --git a/rl_engine/kernels/gtest/operator_specs.py b/rl_engine/kernels/gtest/operator_specs.py new file mode 100644 index 0000000..dc4674c --- /dev/null +++ b/rl_engine/kernels/gtest/operator_specs.py @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 RL-Kernel Contributors + +from __future__ import annotations + +import argparse +import importlib +from dataclasses import dataclass +from typing import Any + +import torch + +from rl_engine.kernels.gtest.operator_inputs import make_operator_inputs, operator_shape_name +from rl_engine.kernels.gtest.op_checks import CandidateSpec, OperatorCase + + +@dataclass(frozen=True) +class OperatorSpec: + name: str + op_class: str + gold_path: str + registry_name: str + candidate_paths: dict[str, str] + + +def _load_object(path: str) -> Any: + module_path, object_name = path.rsplit(".", 1) + # dynamic loading ops + module = importlib.import_module(module_path) + return getattr(module, object_name) + + +OP_SPECS = { + "logp": OperatorSpec( + name="logp", + op_class="logprob", + gold_path="rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp", + registry_name="logp", + candidate_paths={ + "pytorch": "rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp", + "cuda": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpGenericOp", + "cuda-generic": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpGenericOp", + "cuda-sm90": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpSM90Op", + }, + ), +} + + +class _LogpSM90CandidateAdapter: + def __init__(self, candidate: Any) -> None: + self._candidate = candidate + + def __call__(self, logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor: + orig_shape = logits.shape[:-1] + logits_2d = logits.contiguous().view(-1, logits.size(-1)) + labels_1d = token_ids.contiguous().view(-1) + return self._candidate(logits_2d, labels_1d).view(orig_shape) + + +def operator_names() -> tuple[str, ...]: + return tuple(OP_SPECS) + + +def make_operator_case( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> OperatorCase: + spec = OP_SPECS[args.op] + gold_op = _load_object(spec.gold_path)() + return OperatorCase( + name=f"{args.op}-{dtype}-{operator_shape_name(args.op, args)}", + op_class=spec.op_class, + dtype=dtype, + inputs=make_operator_inputs(args.op, args, dtype, device), + gold_fn=gold_op.forward_fp32, + ) + + +def make_candidate(args: argparse.Namespace) -> CandidateSpec: + spec = OP_SPECS[args.op] + candidate_name = "pytorch" if args.candidate == "native" else args.candidate + + if candidate_name in spec.candidate_paths: + candidate_op = _load_object(spec.candidate_paths[candidate_name])() + if args.op == "logp" and candidate_name == "cuda-sm90": + candidate_op = _LogpSM90CandidateAdapter(candidate_op) + return CandidateSpec( + name=f"{candidate_name}-{args.op}", + backend=candidate_name, + arch_key=args.arch_key, + fn=candidate_op, + ) + + if candidate_name == "registry": + from rl_engine.kernels.registry import kernel_registry + + return CandidateSpec( + name=f"registry-{args.op}", + backend="registry", + arch_key=args.arch_key, + fn=kernel_registry.get_op(spec.registry_name), + ) + + supported = sorted([*spec.candidate_paths, "native", "registry"]) + raise ValueError( + f"unsupported candidate {args.candidate!r} for op {args.op!r}; " + f"supported candidates: {', '.join(supported)}" + ) diff --git a/rl_engine/testing/tolerance.py b/rl_engine/kernels/gtest/tolerance.py similarity index 100% rename from rl_engine/testing/tolerance.py rename to rl_engine/kernels/gtest/tolerance.py diff --git a/rl_engine/testing/tolerance_contract.yaml b/rl_engine/kernels/gtest/tolerance_contract.yaml similarity index 100% rename from rl_engine/testing/tolerance_contract.yaml rename to rl_engine/kernels/gtest/tolerance_contract.yaml diff --git a/scripts/check_operator.py b/scripts/check_operator.py new file mode 100644 index 0000000..9b99612 --- /dev/null +++ b/scripts/check_operator.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 RL-Kernel Contributors + +from __future__ import annotations + +import argparse +import json +import pathlib +import sys +from typing import Any + +import torch + +REPO_ROOT = pathlib.Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from rl_engine.kernels.gtest import run_operator_suite # noqa: E402 +from rl_engine.kernels.gtest.operator_specs import ( # noqa: E402 + make_candidate, + make_operator_case, + operator_names, +) + + +def _parse_dtype(value: str) -> torch.dtype: + normalized = value.lower() + if normalized in {"fp32", "float32"}: + return torch.float32 + if normalized in {"bf16", "bfloat16"}: + return torch.bfloat16 + if normalized in {"fp16", "float16", "half"}: + return torch.float16 + raise ValueError(f"unsupported dtype: {value}") + + +def _select_device(value: str) -> torch.device: + if value == "auto": + return torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device(value) + if device.type == "cuda" and not torch.cuda.is_available(): + raise RuntimeError("--device cuda was requested, but CUDA is not available") + return device + + +def _summarize(report: Any) -> None: + print(f"suite={report.suite_name} passed={report.passed} pass_rate={report.pass_rate:.4f}") + for candidate in report.candidates: + print( + f"candidate={candidate.candidate_name} backend={candidate.backend} " + f"passed={candidate.passed} pass_rate={candidate.pass_rate:.4f}" + ) + for case in candidate.cases: + for output in case.outputs: + print( + f" case={case.case_name} output={output.output_index} " + f"shape={output.shape} dtype={output.candidate_dtype} " + f"max_abs={output.max_abs_error:.8e} " + f"mean_abs={output.mean_abs_error:.8e} " + f"max_rel={output.max_rel_error:.8e} " + f"tol=(atol={output.atol:.3e}, rtol={output.rtol:.3e}) " + f"passed={output.passed}" + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Validate an operator candidate against a PyTorch gold path.") + parser.add_argument("--op", choices=operator_names(), default="logp") + parser.add_argument( + "--candidate", + default="registry", + help="Candidate backend to validate, for example registry, pytorch, cuda, cuda-sm90.", + ) + parser.add_argument("--dtype", choices=("fp32", "bf16", "fp16"), default="fp32") + parser.add_argument("--device", default="auto") + parser.add_argument("--batch", type=int, default=2) + parser.add_argument("--seq", type=int, default=16) + parser.add_argument("--vocab", type=int, default=257) + parser.add_argument("--input-mode", choices=("random", "constant"), default="random") + parser.add_argument("--constant-value", type=float, default=0.25) + parser.add_argument("--token-value", type=int, default=0) + parser.add_argument("--normalized-dim", type=int, default=4096) + parser.add_argument("--k-dim", type=int, default=4096) + parser.add_argument("--n-dim", type=int, default=4096) + parser.add_argument("--theta", type=float, default=1.0e6) + parser.add_argument("--eps", type=float, default=1.0e-6) + parser.add_argument("--seed", type=int, default=123) + parser.add_argument( + "--arch-key", + default=None, + help="Optional tolerance override key, for example sm90. Defaults to contract.default.", + ) + parser.add_argument("--json", action="store_true", help="Print the full structured report as JSON.") + return parser.parse_args() + + +def main() -> None: + args = parse_args() + dtype = _parse_dtype(args.dtype) + device = _select_device(args.device) + candidate = make_candidate(args) + case = make_operator_case(args, dtype, device) + report = run_operator_suite(args.op, candidates=[candidate], cases=[case]) + + if args.json: + print(json.dumps(report.to_dict(), indent=2, default=str)) + else: + _summarize(report) + + if not report.passed: + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/tests/test_op_checks.py b/tests/test_op_checks.py new file mode 100644 index 0000000..9f8a901 --- /dev/null +++ b/tests/test_op_checks.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 RL-Kernel Contributors + +from __future__ import annotations + +import torch + +from rl_engine.kernels.ops.pytorch.loss.logp import NativeLogpOp +from rl_engine.kernels.gtest.op_checks import CandidateSpec, OperatorCase, run_operator_suite + + +def _logp_case(name: str, dtype: torch.dtype, *, seed: int = 0) -> OperatorCase: + generator = torch.Generator().manual_seed(seed) + logits = torch.randn(2, 8, 257, dtype=dtype, generator=generator) + token_ids = torch.randint(0, logits.size(-1), (2, 8), generator=generator) + return OperatorCase( + name=name, + op_class="logprob", + dtype=dtype, + inputs={"logits": logits, "token_ids": token_ids}, + gold_fn=NativeLogpOp().forward_fp32, + ) + + +def test_logp_native_candidate_suite_passes(): + report = run_operator_suite( + "logp", + candidates=[CandidateSpec(name="native-logp", backend="pytorch", fn=NativeLogpOp())], + cases=[ + _logp_case("fp32", torch.float32, seed=1), + _logp_case("bf16", torch.bfloat16, seed=2), + _logp_case("fp16", torch.float16, seed=3), + ], + ) + + assert report.passed + assert report.pass_rate == 1.0 + assert report.candidates[0].passed_outputs == 3 + assert all(case.passed for case in report.candidates[0].cases) + + +def test_logp_registry_candidate_suite_passes_on_cpu(): + from rl_engine.kernels.registry import kernel_registry + + report = run_operator_suite( + "logp", + candidates=[ + CandidateSpec( + name="registry-logp", + backend="registry", + fn=kernel_registry.get_op("logp"), + ) + ], + cases=[_logp_case("fp32", torch.float32, seed=4)], + ) + + assert report.passed + assert report.candidates[0].candidate_name == "registry-logp" + + +def test_suite_reports_failure_for_bad_candidate(): + def bad_logp(logits, token_ids): + del token_ids + return torch.zeros(logits.shape[:-1], dtype=logits.dtype) + + report = run_operator_suite( + "logp", + candidates=[CandidateSpec(name="bad-logp", backend="test", fn=bad_logp)], + cases=[_logp_case("fp32", torch.float32, seed=5)], + ) + + output = report.candidates[0].cases[0].outputs[0] + assert not report.passed + assert report.pass_rate == 0.0 + assert output.max_abs_error > 0.0 + + +def test_suite_report_to_dict_contains_error_metrics(): + report = run_operator_suite( + "logp", + candidates=[CandidateSpec(name="native-logp", backend="pytorch", fn=NativeLogpOp())], + cases=[_logp_case("fp32", torch.float32, seed=6)], + ) + + data = report.to_dict() + output = data["candidates"][0]["cases"][0]["outputs"][0] + assert data["suite_name"] == "logp" + assert "max_abs_error" in output + assert "atol" in output + assert "passed" in output + + +def test_candidate_arch_key_uses_tolerance_override(): + def slightly_shifted_logp(logits, token_ids): + return NativeLogpOp().forward_fp32(logits, token_ids) + 0.02 + + contract = { + "accuracy": { + "default": { + "logprob": { + "float32": {"atol": 1.0e-5, "rtol": 0.0}, + } + }, + "arch_overrides": { + "testarch": { + "logprob": { + "float32": {"atol": 5.0e-2, "rtol": 0.0}, + } + } + }, + } + } + report = run_operator_suite( + "logp", + candidates=[ + CandidateSpec( + name="shifted-logp", + backend="test", + fn=slightly_shifted_logp, + arch_key="testarch", + ) + ], + cases=[_logp_case("fp32", torch.float32, seed=7)], + contract=contract, + ) + + output = report.candidates[0].cases[0].outputs[0] + assert report.passed + assert output.atol == 5.0e-2 diff --git a/tests/test_operator_inputs.py b/tests/test_operator_inputs.py new file mode 100644 index 0000000..22d39ae --- /dev/null +++ b/tests/test_operator_inputs.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 RL-Kernel Contributors + +from __future__ import annotations + +import argparse + +import pytest +import torch + +from rl_engine.kernels.gtest.operator_inputs import make_operator_inputs, operator_shape_name + + +def _args(**overrides): + values = { + "batch": 1, + "seq": 2, + "vocab": 17, + "seed": 123, + "input_mode": "constant", + "constant_value": 0.5, + "token_value": 3, + "normalized_dim": 128, + "k_dim": 16, + "n_dim": 32, + "theta": 1.0e6, + "eps": 1.0e-6, + } + values.update(overrides) + return argparse.Namespace(**values) + + +@pytest.mark.parametrize( + "op_name", + [ + "rms_norm", + "matmul", + "attention", + "logp", + "rope", + "silu", + "swiglu", + "embedding", + "lm_head", + "kv_cache_attention", + ], +) +def test_operator_inputs_support_all_issue_108_ops(op_name): + args = _args() + inputs = make_operator_inputs(op_name, args, torch.float32, torch.device("cpu")) + + assert inputs + assert operator_shape_name(op_name, args) + + +def test_constant_logp_inputs_are_deterministic(): + args = _args(input_mode="constant", constant_value=0.5, token_value=3) + inputs = make_operator_inputs("logp", args, torch.float32, torch.device("cpu")) + + assert torch.equal(inputs["logits"], torch.full((1, 2, 17), 0.5)) + assert torch.equal(inputs["token_ids"], torch.full((1, 2), 3, dtype=torch.long)) + + +def test_random_logp_inputs_are_seeded(): + args = _args(input_mode="random", seed=7) + first = make_operator_inputs("logp", args, torch.float32, torch.device("cpu")) + second = make_operator_inputs("logp", args, torch.float32, torch.device("cpu")) + + assert torch.equal(first["logits"], second["logits"]) + assert torch.equal(first["token_ids"], second["token_ids"]) diff --git a/tests/test_tolerance_contract.py b/tests/test_tolerance_contract.py index 0e62d7f..fb429d8 100644 --- a/tests/test_tolerance_contract.py +++ b/tests/test_tolerance_contract.py @@ -3,7 +3,7 @@ from __future__ import annotations -from rl_engine.testing.tolerance import load_contract +from rl_engine.kernels.gtest.tolerance import load_contract def test_load_contract_contains_expected_operator_classes(): From 6f361c4ad6178b1282750cc18d3a1ea8c44a420a Mon Sep 17 00:00:00 2001 From: a-kaa Date: Mon, 29 Jun 2026 19:44:34 +0800 Subject: [PATCH 05/11] add english doc and TODO for _run_case --- docs/contributing/issue-108-session-log.md | 1088 ++++---------------- rl_engine/kernels/gtest/op_checks.py | 1 + 2 files changed, 208 insertions(+), 881 deletions(-) diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md index f49ddde..dd66a1d 100644 --- a/docs/contributing/issue-108-session-log.md +++ b/docs/contributing/issue-108-session-log.md @@ -1,874 +1,277 @@ # ISSUE-108 Session Log -本文档记录本 session 中围绕 RL-Kernel 算子测试框架、CUDA 验证和 upstream 同步的所有关键修改。后续本 session 中每次代码修改都必须继续追加到本文档,记录目标、设计判断、修改文件、验证方式和结果。 +This document records the engineering decisions made while building the ISSUE-108 kernel correctness checker. It is intentionally concise and review-oriented: it explains what was added, why it was added, how to use it, and what is still out of scope. -## 记录原则 +## Logging Rules -- 使用中文记录工程判断。 -- 每个改动都需要说明为什么做,而不只记录改了什么。 -- 保持最小增量:一次修改尽量只围绕一个明确问题。 -- 每个子任务需要能独立验证;无法验证时必须明确说明原因。 -- CUDA 验证必须如实记录环境、命令、误差和失败范围。 -- 不把失败路径写成已支持能力。 +- Record the reason for each meaningful change, not only the files touched. +- Keep changes minimal and independently verifiable. +- Be explicit when a path is only a smoke test or an experimental path. +- Do not present failed CUDA paths as supported capabilities. +- Gold implementations must come from `rl_engine.kernels.ops.pytorch`. -## 总体目标 +## Goal -本 session 的目标是把算子验证从零散脚本推进为可复用、可扩展、可审查的工程化框架: +The goal of this work is to add a minimal, reusable operator correctness framework for post-training kernels. -- 建立统一 tolerance contract,用于管理不同算子的误差阈值。 -- 建立公共 operator check runner,替代单算子专用验证脚本。 -- 建立统一 operator input 生成逻辑,覆盖后训练常见算子的基础输入。 -- 将测试入口改造成可指定 `op`、`candidate`、`dtype`、`device`、shape 参数的 CLI。 -- 同步 upstream/main,吸收 PR #122 中的 SM90 相关修复。 -- 在 H20 机器上验证普通 CUDA `fused_logp` 路径。 -- 明确 SM90 `fused_linear_logp` 在 CUDA 12.4 下仍未通过。 +The framework should: -## 时间线 +- Generate deterministic operator inputs. +- Run PyTorch gold implementations and backend candidates on the same inputs. +- Compare every tensor output with dtype/operator-class tolerances. +- Report absolute error, relative error, pass rate, and final pass/fail status. +- Expose a CLI so a developer can validate a backend candidate without editing test files. -### 1. tolerance table 和 contract loader - -目标: - -- 将不同 dtype、op class 的误差容差从测试代码中抽离出来。 -- 让误差阈值可以被审查和维护,而不是散落在测试断言中。 - -修改文件: - -- `rl_engine/testing/tolerance.py` -- `rl_engine/testing/tolerance_contract.yaml` -- `tests/test_tolerance_contract.py` - -设计判断: - -- 使用 YAML 保存 contract,便于人工 review。 -- 将容差按 `accuracy.default` 和可选硬件 override 组织。 -- `default` 是通用 fallback,不等同于 CPU;CPU、SM90、SM100、ROCm、Ascend 等未来可作为明确 override key。 - -验证: - -- `tests/test_tolerance_contract.py` 验证 contract 可读、结构正确。 - -结果: - -- tolerance contract 框架建立完成。 - -### 2. operator check runner - -目标: - -- 建立类似 GoogleTest 思路的算子验证 runner。 -- 一个 case 表示一组确定输入和 gold path,一个 candidate 表示被测实现。 - -修改文件: - -- `rl_engine/testing/op_checks.py` -- `tests/test_op_checks.py` -- `rl_engine/testing/__init__.py` - -设计判断: - -- `OperatorCase` 表示测试对象:`name`、`op_class`、`dtype`、`inputs`、`gold_fn`。 -- `CandidateSpec` 表示被测实现:`name`、`fn`、`backend`、`arch_key`。 -- runner 负责: - - 调用 candidate。 - - 调用 gold。 - - flatten 多输出。 - - 按 `op_class + dtype + arch_key` 解析容差。 - - 计算 `max_abs_error`、`mean_abs_error`、`max_rel_error`。 - - 返回结构化 report。 - -验证: - -- `tests/test_op_checks.py` 覆盖 native logp、registry logp、失败 candidate、arch override 等场景。 - -结果: - -- 公共 operator check runner 建立完成。 - -### 3. `check_operator.py` 从 logp 专用入口改为公共入口 - -目标: - -- 让测试者通过 CLI 指定算子、candidate、dtype、device 和 shape。 -- 避免后续每个算子都写一个独立测试脚本。 - -修改文件: - -- `scripts/check_operator.py` - -设计判断: - -- `check_operator.py` 只负责: - - 解析参数。 - - 选择 device/dtype。 - - 调用 `make_candidate`。 - - 调用 `make_operator_case`。 - - 调用 `run_operator_suite`。 - - 输出 summary 或 JSON。 -- 不在入口中硬编码具体算子实现。 - -验证: - -```bash -python scripts/check_operator.py --op logp --candidate pytorch --dtype fp32 --batch 1 --seq 4 --vocab 17 -python scripts/check_operator.py --op logp --candidate registry --dtype bf16 --batch 2 --seq 16 --vocab 257 --json -python -m pytest tests/test_op_checks.py -q -``` - -结果: - -- 公共 CLI 最小闭环通过。 - -### 4. 抽离 operator specs - -目标: - -- 避免新增算子时修改测试入口。 -- 将算子元信息集中到专门文件。 - -修改文件: - -- `rl_engine/testing/operator_specs.py` - -设计判断: - -- 每个算子通过 `OperatorSpec` 描述: - - `name` - - `op_class` - - `gold_path` - - `registry_name` - - `candidate_paths` -- `check_operator.py` 不直接知道某个算子的 Python 类路径。 -- `--candidate cuda` 明确选择 CUDA candidate。 -- `--candidate registry` 仅用于测试 dispatcher 分发结果,不作为具体 CUDA correctness 的替代。 - -当前 logp 映射: - -```text -pytorch -> NativeLogpOp -cuda -> FusedLogpGenericOp -cuda-generic -> FusedLogpGenericOp -cuda-sm90 -> FusedLogpSM90Op -registry -> kernel_registry.get_op("logp") -``` - -结果: - -- 后续新增算子主要扩展 `operator_specs.py`,不再修改公共入口。 - -### 5. 统一 operator input 工厂 - -目标: - -- 用户指出不希望每个新算子都手写 `_make_xxx_inputs` 和 `_xxx_shape_name`。 -- 统一准备 ISSUE #108 中所有算子的输入初始化。 - -修改文件: - -- `rl_engine/testing/operator_inputs.py` -- `tests/test_operator_inputs.py` -- `rl_engine/testing/operator_specs.py` -- `scripts/check_operator.py` - -设计判断: - -- 新增 `make_operator_inputs(op_name, args, dtype, device)`。 -- 新增 `operator_shape_name(op_name, args)`。 -- 支持 `random` 和 `constant` 两种输入模式: - - `random` 用 seed 控制可复现。 - - `constant` 用固定值便于 debug。 -- 支持的算子输入: - - `rms_norm` - - `matmul` - - `attention` - - `logp` - - `rope` - - `silu` - - `swiglu` - - `embedding` - - `lm_head` - - `kv_cache_attention` - -CLI 增加参数: +## Final Layout ```text ---input-mode random|constant ---constant-value ---token-value ---normalized-dim ---k-dim ---n-dim ---theta ---eps -``` - -验证: - -```bash -python -m pytest tests/test_operator_inputs.py -q -python -m pytest tests/test_op_checks.py -q -python scripts/check_operator.py --op logp --candidate pytorch --dtype fp32 --batch 1 --seq 4 --vocab 17 --input-mode constant --constant-value 0.5 --token-value 3 -``` - -结果: - -- 多算子输入生成能力建立。 -- logp CLI 随机输入和固定输入均通过。 - -### 6. 删除模型命名和档位参数 - -目标: - -- 用户指出测试框架不应绑定某个模型,也不需要冗余 `--size` 档位。 -- 输入生成应表达为通用数据规模,而不是某个模型配置。 - -修改文件: - -- `rl_engine/testing/operator_inputs.py` -- `scripts/check_operator.py` -- `tests/test_operator_inputs.py` - -设计判断: - -- 删除 `QWEN3_8B` 命名。 -- 删除 `ModelShape` / `DEFAULT_MODEL_SHAPE` 抽象。 -- 改为普通常量: - -```python -DEFAULT_HIDDEN = 4096 -DEFAULT_N_HEADS = 32 -DEFAULT_N_KV_HEADS = 8 -DEFAULT_HEAD_DIM = 128 -DEFAULT_INTERMEDIATE = 12288 -DEFAULT_VOCAB = 151936 -DEFAULT_ROPE_THETA = 1.0e6 -DEFAULT_RMS_EPS = 1.0e-6 -``` - -- 删除 `--size small|medium|large`。 -- 仅保留显式 `--batch` 和 `--seq`。 - -验证: - -```bash -rg -n "Qwen|QWEN|qwen" rl_engine/testing scripts tests -rg -n "ModelShape|DEFAULT_MODEL|model" rl_engine/testing/operator_inputs.py -rg -n "BATCH_SHAPES|BatchShape|--size|small|medium|large" scripts/check_operator.py rl_engine/testing/operator_inputs.py tests/test_operator_inputs.py -python -m pytest tests/test_operator_inputs.py tests/test_op_checks.py -q -``` - -结果: - -- 模型耦合和档位参数均已删除。 -- 相关测试通过。 - -### 7. 同步 upstream/main 并处理冲突 - -目标: - -- 用户要求检查当前代码是否过旧,拉取最新代码,如有冲突则解决。 - -操作: - -```bash -git fetch --all --prune -git stash push -u -m pre-upstream-main-sync -git rebase upstream/main -git stash pop -``` - -冲突文件: - -- `csrc/cuda/fused_logp_sm90.cu` - -设计判断: - -- `upstream/main` 已包含 PR #122。 -- PR #122 中已经包含 SM90 文件的两项修复: - - `#include ` - - `reinterpret_cast` -- 因此冲突解决时采用 upstream/main 版本。 -- 丢弃本地临时加入的 `#include `。 +rl_engine/kernels/gtest/ + __init__.py + op_checks.py + operator_inputs.py + operator_specs.py + tolerance.py + tolerance_contract.yaml -验证: +scripts/check_operator.py -```bash -python -m pytest tests/test_operator_inputs.py tests/test_op_checks.py -q +tests/test_op_checks.py +tests/test_operator_inputs.py +tests/test_tolerance_contract.py ``` -结果: - -- rebase 到最新 `upstream/main` 成功。 -- 冲突解决完成。 -- 本地相对 `upstream/main` 为 `ahead 3, behind 0`。 - -### 8. H20 CUDA 环境和普通 CUDA logp 验证 +## Key Design Decisions -目标: +### Tolerance Contract -- 将测试框架迁移到 H 系列 GPU 环境验证。 -- 先确认普通 CUDA `fused_logp` 路径是否可用。 - -环境记录: +Files: ```text -GPU: NVIDIA H20 -Driver: 565.57.01 -Driver CUDA capability: 12.7 -nvcc: 12.4 -Python: 3.11.15 -``` - -普通 CUDA 扩展检查: - -```text -_EXT_AVAILABLE: True -has fused_logp: True -has fused_logp_sm90: False -``` - -验证命令: - -```bash -python scripts/check_operator.py \ - --op logp \ - --candidate cuda \ - --device cuda \ - --dtype bf16 \ - --arch-key sm90 \ - --batch 2 \ - --seq 16 \ - --vocab 257 +rl_engine/kernels/gtest/tolerance.py +rl_engine/kernels/gtest/tolerance_contract.yaml +tests/test_tolerance_contract.py ``` -输出: - -```text -INFO [RL-Kernel]: Successfully linked to precompiled _C.fused_logp fallback kernel. -suite=logp passed=True pass_rate=1.0000 -candidate=cuda-logp backend=cuda passed=True pass_rate=1.0000 -case=logp-torch.bfloat16-2x16x257 output=0 shape=(2, 16) -max_abs=1.49779320e-02 -mean_abs=7.53845274e-03 -max_rel=2.70811981e-03 -tol=(atol=5.000e-02, rtol=0.000e+00) -passed=True -``` +Decision: -结论: +- Store tolerance values in a small contract file rather than hard-coding them inside tests. +- Resolve tolerance by `op_class + dtype`, with optional `arch_key` overrides. +- Treat `default` as the generic fallback, not as CPU-specific tolerance. -- 普通 CUDA `FusedLogpGenericOp -> _C.fused_logp` 路径通过。 -- 这证明测试框架最小 GPU 闭环已经打通: +Current accuracy classes: ```text -CLI --> operator_specs --> operator_inputs --> PyTorch gold --> CUDA candidate --> run_operator_suite --> tolerance contract --> compare_output --> structured report +elementwise +reduction +logprob ``` -### 9. SM90 fused_linear_logp 当前状态 - -目标: - -- 尝试编译和验证 SM90 路径。 - -结果: +### Operator Check Runner -- `fused_logp_sm90` 的旧 include 和 type 问题已由 upstream PR #122 解决。 -- 但 `fused_linear_logp_sm90.cu` 在 CUDA 12.4 下仍未通过 ptxas。 - -错误摘要: +Files: ```text -ptxas error: State space incorrect for instruction 'cp.async.bulk.tensor' -ptxas fatal: Ptx assembly aborted due to errors +rl_engine/kernels/gtest/op_checks.py +tests/test_op_checks.py ``` -设计判断: +Decision: -- PR #122 描述中提到相关 SM90 路径在 CUDA 13.1 下 assembled。 -- 当前 H20 环境是 nvcc 12.4,不应将该路径写为已通过。 -- 当前应先以普通 CUDA `fused_logp` 作为验证通过范围。 +- `OperatorCase` describes one deterministic test case: name, op class, dtype, inputs, and gold function. +- `CandidateSpec` describes one implementation under test: name, function, backend, and optional arch key. +- `run_operator_suite()` runs candidates against gold outputs and returns structured reports. +- The runner compares forward outputs only in this minimal version. -结论: +Review follow-up: -- 已通过:`--candidate cuda` -- 未通过:`--candidate cuda-sm90` / SM90 fused linear logp +- `op_checks.py` includes a TODO for optional gradient checks on differentiable operators. +- Gradient checks require additional metadata and input cloning rules, so they are intentionally tracked as follow-up work instead of being silently implied by this PR. -## 当前文件状态摘要 +### Operator Inputs -本 session 产生或涉及的主要文件: +Files: ```text -rl_engine/testing/tolerance.py -rl_engine/testing/tolerance_contract.yaml -tests/test_tolerance_contract.py -rl_engine/testing/op_checks.py -tests/test_op_checks.py -rl_engine/testing/__init__.py -scripts/check_operator.py -rl_engine/testing/operator_specs.py -rl_engine/testing/operator_inputs.py +rl_engine/kernels/gtest/operator_inputs.py tests/test_operator_inputs.py -csrc/cuda/fused_logp_sm90.cu -docs/contributing/issue-108-session-log.md -``` - -说明: - -- `AGENTS.md` 是未跟踪文件,未纳入本 session 的代码修改范围。 -- `csrc/cuda/fused_logp_sm90.cu` 最终与 upstream PR #122 版本一致。 - -## 后续记录模板 - -之后每次代码修改都在本文档追加如下条目: - -```markdown -### YYYY-MM-DD HH:MM - 变更标题 - -目标: - -- 本次最小子任务要解决什么问题。 - -修改文件: - -- `path/to/file.py` - -设计决策: - -- 为什么这样改。 -- 为什么没有选择其他方案。 - -验证方式: - -- 执行的测试命令。 -- CUDA 环境,如 GPU、CUDA 版本、driver、arch。 -- 关键输出指标。 - -结果: - -- 通过 / 未通过 / 部分通过。 -- 未通过时必须记录完整错误摘要。 - -后续: - -- 是否需要继续拆分子任务。 -- 是否影响 CI、benchmark 或其他算子。 -``` - -CUDA 验证建议额外记录: - -```markdown -GPU: -CUDA: -Driver: -Arch: -Candidate: -Backend: -Command: -max_abs: -mean_abs: -max_rel: -atol: -rtol: -Result: -Known issue: ``` -### 2026-06-28 - CUDA 13 CUB reduce functor 兼容修复 +Decision: -目标: +- Build standard semantic inputs for each operator. +- Support both `random` and `constant` initialization. +- Make random inputs reproducible with `--seed`. +- Preserve semantic shapes such as `[B, S, V]`; do not flatten inputs for backend-specific kernels inside input generation. -- 修复 H100 + CUDA 13.0 环境下 SM90 编译失败的问题。 - -修改文件: - -- `csrc/cuda/fused_logp_sm90.cu` -- `docs/contributing/issue-108-session-log.md` - -错误摘要: +Current input builders cover: ```text -csrc/cuda/fused_logp_sm90.cu(76): error: namespace "cub" has no member "Max" -csrc/cuda/fused_logp_sm90.cu(86): error: namespace "cub" has no member "Sum" -``` - -设计决策: - -- 不继续依赖 CUB 内置 `cub::Max()` 和 `cub::Sum()` functor 名称。 -- 在当前 SM90 文件内定义本地 `FloatMax` 和 `FloatSum`,传给 `cub::BlockReduce::Reduce`。 -- 这样保留原有 reduction 语义,同时规避 CUDA 13 / CCCL 中 CUB functor API 变化。 - -验证方式: - -- 本地只做源码修改;需要在 H100 + CUDA 13.0 机器上重新执行: - -```bash -rm -rf build -find rl_engine -name "*.so" -delete - -export CUDA_HOME=/usr/local/cuda -export PATH=$CUDA_HOME/bin:$PATH -export LD_LIBRARY_PATH=$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-} -export OMP_NUM_THREADS=8 -export MAX_JOBS=1 - -KERNEL_ALIGN_FORCE_SM90=1 pip install -v --no-build-isolation -e . 2>&1 | tee build_sm90.log +rms_norm +matmul +attention +logp +rope +silu +swiglu +embedding +lm_head +kv_cache_attention ``` -结果: - -- 待 H100 机器重新编译确认。 +### Operator Specs -后续: - -- 如果继续失败,优先查看 `grep -nE "FAILED:|error:|ptxas|fatal" build_sm90.log | head -n 80`。 - -### 2026-06-28 - 对齐 SM90 LogP Python wrapper 输入接口 - -目标: - -- 修复 `check_operator.py --candidate cuda-sm90` 调用失败的问题。 - -修改文件: - -- `rl_engine/kernels/ops/cuda/loss/logp.py` -- `docs/contributing/issue-108-session-log.md` - -错误摘要: +File: ```text -TypeError: FusedLogpSM90Op.__call__() got an unexpected keyword argument 'token_ids' +rl_engine/kernels/gtest/operator_specs.py ``` -设计决策: - -- 测试框架统一通过 `case.inputs` 传递 `token_ids`。 -- `FusedLogpGenericOp`、`NativeLogpOp` 都使用 `token_ids` 命名。 -- 因此将 `FusedLogpSM90Op.__call__(logits, labels)` 改为 `__call__(logits, token_ids)`,让 candidate 接口与 gold/case 输入一致。 -- 同时在 wrapper 内部把 `[B, S, V]` logits reshape 为 `[B*S, V]`,把 `[B, S]` token ids flatten 为 `[B*S]`,再把 `_C.fused_logp_sm90` 的 `[B*S]` 输出 reshape 回 `[B, S]`。 +Decision: -验证方式: +- Keep operator-specific registration outside `scripts/check_operator.py`. +- Register PyTorch gold paths and backend candidate paths in one place. +- Require `gold_path` to point into `rl_engine.kernels.ops.pytorch`. -- 需要在 H100 + CUDA 13.0 机器上同步该 patch 后运行: +Current minimal registered operator: -```bash -python scripts/check_operator.py \ - --op logp \ - --candidate cuda-sm90 \ - --device cuda \ - --dtype bf16 \ - --arch-key sm90 \ - --batch 2 \ - --seq 16 \ - --vocab 257 +```text +op: logp +op_class: logprob +gold: rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp +candidates: + pytorch -> NativeLogpOp + cuda -> FusedLogpGenericOp + cuda-generic -> FusedLogpGenericOp + cuda-sm90 -> FusedLogpSM90Op + registry -> kernel_registry.get_op("logp") ``` -结果: - -- 已撤回该方向。`rl_engine/kernels/ops/cuda/loss/logp.py` 属于被测 CUDA 算子实现,不应为了测试框架改动其接口。 -- 后续适配应放在 testing 层,例如在 candidate adapter 中把测试框架统一的 `token_ids` 映射为 SM90 wrapper 需要的 `labels`,并处理 flatten/reshape。 +Important note: -### 2026-06-28 - 在 testing 层适配 SM90 LogP candidate +- `candidate=pytorch` is only a smoke test for the checker itself. +- CUDA, Triton, ROCm, and future hardware-specific implementations are candidates. +- Do not compare two operators that implement different math, such as ordinary `logp` and `linear_logp`. -目标: +### SM90 Adapter Exception -- 保持 `rl_engine/kernels/ops/cuda` 下被测实现不变。 -- 让 `check_operator.py --candidate cuda-sm90` 可以使用测试框架统一的 `token_ids` 输入。 +Current code contains `_LogpSM90CandidateAdapter` in `operator_specs.py`. -修改文件: +Reason: -- `rl_engine/testing/operator_specs.py` -- `docs/contributing/issue-108-session-log.md` +- The existing SM90 logp wrapper accepts flattened inputs, while the checker standard input for `logp` is `[B, S, V]` logits and `[B, S]` token ids. +- The adapter exists only to validate the checker path against the current SM90 wrapper. -设计决策: +Long-term rule: -- 新增 `_LogpSM90CandidateAdapter`,只在 `args.op == "logp"` 且 `candidate == "cuda-sm90"` 时使用。 -- adapter 接收测试框架标准输入 `logits` 和 `token_ids`。 -- adapter 内部把 `logits` 从 `[B, S, V]` flatten 为 `[B*S, V]`,把 `token_ids` 从 `[B, S]` flatten 为 `[B*S]`。 -- adapter 调用原始 SM90 candidate:`self._candidate(logits_2d, labels_1d)`。 -- adapter 将输出 reshape 回 `[B, S]`,以便 `compare_output` 按原始 case shape 比较。 +- Backend wrappers should align with the standard operator interface whenever possible. +- New operators should not rely on permanent test-side adapters for ordinary shape or parameter-name differences. -验证方式: +## CLI Usage -- 本地执行 Python 测试和编译检查: +CPU smoke check against the PyTorch candidate: ```bash -python -m py_compile rl_engine/testing/operator_specs.py -python -m pytest tests/test_op_checks.py tests/test_operator_inputs.py -q +python scripts/check_operator.py --op logp --candidate pytorch --device cpu --dtype fp32 --batch 1 --seq 2 --vocab 17 ``` -- H100 机器需要重新运行: +CUDA candidate check against the PyTorch gold path: ```bash -python scripts/check_operator.py \ - --op logp \ - --candidate cuda-sm90 \ - --device cuda \ - --dtype bf16 \ - --arch-key sm90 \ - --batch 2 \ - --seq 16 \ - --vocab 257 -``` - -结果: - -- 本地待验证;H100 CUDA 结果待重新运行确认。 - -### 2026-06-28 - H100 CUDA generic 与 SM90 LogP 对照验证 - -目标: - -- 记录 H100 + CUDA 13.0 环境下 `logp` 的 generic CUDA 和 SM90 candidate 行为差异。 -- 明确测试框架已能区分“通过的 CUDA generic candidate”和“编译/运行存在问题的 SM90 candidate”。 - -环境: - -```text -GPU: NVIDIA H100 80GB HBM3 -Driver: 580.95.05 -CUDA driver capability: 13.0 -nvcc: 13.0 -Python: 3.12.13 -torch: 2.12.0+cu130 -torch cuda: 13.0 -compute capability: (9, 0) +python scripts/check_operator.py --op logp --candidate cuda --device cuda --dtype bf16 --arch-key sm90 --batch 1 --seq 1 --vocab 4096 ``` -修改文件: - -- `docs/contributing/issue-108-session-log.md` - -验证命令: +JSON report: ```bash -for v in 256 512 1024 2048 4096; do - echo "=== vocab=$v ===" - python scripts/check_operator.py \ - --op logp \ - --candidate cuda \ - --device cuda \ - --dtype bf16 \ - --arch-key sm90 \ - --batch 1 \ - --seq 1 \ - --vocab $v -done +python scripts/check_operator.py --op logp --candidate pytorch --device cpu --dtype fp32 --batch 1 --seq 2 --vocab 17 --json ``` -generic CUDA 结果: +Supported key options: ```text -vocab=256: passed=True, max_abs=5.77497482e-03 -vocab=512: passed=True, max_abs=8.04328918e-03 -vocab=1024: passed=True, max_abs=1.80721283e-04 -vocab=2048: passed=True, max_abs=1.77164078e-02 -vocab=4096: passed=True, max_abs=2.69813538e-02 -atol=5.000e-02, rtol=0.000e+00 -``` - -结论: - -- `FusedLogpGenericOp -> _C.fused_logp` 在 H100 + CUDA 13.0 上多 vocab correctness 全部通过。 -- 这进一步确认测试框架、input 生成、gold path、candidate 调用和 compare_output 链路是通的。 - -SM90 对照现象: +--op Operator name. The minimal version supports logp. +--candidate Candidate backend, for example pytorch, cuda, cuda-generic, cuda-sm90, registry. +--dtype fp32, bf16, or fp16. +--device auto, cpu, cuda, or another torch device string. +--arch-key Optional tolerance override key such as sm90. +--batch Batch size. +--seq Sequence length. +--vocab Vocabulary size. +--input-mode random or constant. +--constant-value Floating-point value for constant mode. +--token-value Token id for constant mode, reduced modulo vocab. +--seed Random seed for reproducible random inputs. +--json Print the full structured report as JSON. +``` + +Example output: ```text -TILE_V=4096: - vocab=257/4096/151936 均在 cuTensorMapEncodeTiled 失败。 - 错误:CUDA_ERROR_INVALID_VALUE。 - -TILE_V=256: - vocab=256 返回结果,但 passed=False,max_abs≈1.04094028e+00。 - vocab=512/1024/2048/4096 在 20s timeout 下没有输出 report,表现为 hang/timeout。 +suite=logp passed=True pass_rate=1.0000 +candidate=cuda-logp backend=cuda passed=True pass_rate=1.0000 + case=logp-torch.bfloat16-1x1x4096 output=0 shape=(1, 1) dtype=torch.bfloat16 max_abs=2.69813538e-02 mean_abs=2.69813538e-02 max_rel=3.03093810e-03 tol=(atol=5.000e-02, rtol=0.000e+00) passed=True ``` -结论: +## Adding a New Operator -- `cuda-sm90` 已能编译和加载,但当前 SM90 TMA kernel 仍不能标记为通过。 -- 当前通过范围只包括 `--candidate cuda` generic CUDA logp。 -- SM90 问题应作为独立 CUDA kernel bugfix 处理,不归因于测试框架。 +To add a new operator, keep the shared checker flow unchanged. Add only operator-specific inputs, specs, and tests. -### 2026-06-28 - 最终整理:不提交 CUDA 源码改动 +### 1. Add Input Generation -目标: - -- 用户明确要求本阶段不修改 `csrc` 下 CUDA/TMA 源码。 -- 本阶段只提交算子测试框架和文档,不把 SM90 kernel 实验 patch 混入测试框架 PR。 - -本地处理: - -- 已还原 `csrc/cuda/fused_logp_sm90.cu`。 -- 本地 `csrc/utils/tma_utils.cuh` 没有 diff。 -- 因此本地最终不会提交任何 `csrc` 改动。 - -服务器状态对照: - -用户在 H100 服务器上看到: - -```text -Changes not staged for commit: - modified: csrc/cuda/fused_logp_sm90.cu - modified: csrc/utils/tma_utils.cuh -``` - -本地当前状态不同: +File: ```text -csrc/cuda/fused_logp_sm90.cu: no diff after restore -csrc/utils/tma_utils.cuh: no diff locally -``` - -结论: - -- H100 服务器上的 `csrc/utils/tma_utils.cuh` 改动不是本地当前工作区的一部分。 -- 如果服务器要回到与本地一致的测试框架提交状态,需要在服务器上还原两个 CUDA/TMA 文件: - -```bash -git restore csrc/cuda/fused_logp_sm90.cu csrc/utils/tma_utils.cuh -``` - -保留记录的 CUDA 现象: - -- H100 环境: - -```text -GPU: NVIDIA H100 80GB HBM3 -Driver: 580.95.05 -CUDA driver capability: 13.0 -nvcc: 13.0.88 -Python: 3.12.13 -torch: 2.12.0+cu130 -torch cuda: 13.0 -compute capability: (9, 0) +rl_engine/kernels/gtest/operator_inputs.py ``` -- `--candidate cuda` generic logp 在 vocab 256/512/1024/2048/4096 上通过。 -- `--candidate cuda-sm90` 可以编译和加载,但: - - `TILE_V=4096` 触发 `cuTensorMapEncodeTiled failed`。 - - `TILE_V=256` 时 vocab=256 返回但数值不通过,vocab>=512 出现 timeout/hang。 -- 所以 SM90 fused logp 目前记录为 CUDA kernel 问题,不作为测试框架失败。 - -### 2026-06-28 - 测试框架目录归位 - -目标: +Update `make_operator_inputs()`: -- 用户指出测试框架文件放在通用 `rl_engine/testing` 下过于分散。 -- 本阶段将 operator correctness checking 代码移动到 kernel 相关目录,避免和 RL batch/reference testing 混在一起。 - -最终目录: - -```text -rl_engine/kernels/gtest/ - __init__.py - op_checks.py - operator_inputs.py - operator_specs.py - tolerance.py - tolerance_contract.yaml +```python +builders = { + ... + "new_op": _make_new_op_inputs, +} ``` -职责划分: - -- `op_checks.py` - - 定义 `OperatorCase`、`CandidateSpec`、report dataclass。 - - 调用 gold 和 candidate。 - - flatten 输出。 - - 解析 tolerance。 - - 计算 `max_abs_error`、`mean_abs_error`、`max_rel_error`。 - - 返回通过率和结构化 report。 -- `operator_inputs.py` - - 统一构造标准语义输入。 - - 支持 `random` 和 `constant`。 - - 支持 `batch`、`seq`、`vocab` 等 CLI 参数。 - - 当前覆盖 ISSUE-108 相关算子的输入初始化骨架。 -- `operator_specs.py` - - 注册每个算子的 gold path 和 candidate path。 - - gold path 必须来自 `rl_engine.kernels.ops.pytorch`。 - - candidate path 来自 `cuda`、`triton`、`rocm` 或未来 backend。 -- `tolerance.py` 和 `tolerance_contract.yaml` - - 加载 dtype/operator-class 容差表。 - - 供 `op_checks.py` 在 compare output 时解析 `atol` 和 `rtol`。 -- `scripts/check_operator.py` - - 命令行入口。 - - 不直接硬编码具体算子实现。 - -导入边界: - -- operator checking 框架只从 `rl_engine.kernels.gtest` 导入。 -- `rl_engine/testing/__init__.py` 不导出 `CandidateSpec`、`OperatorCase`、`run_operator_suite`。 -- 这样可以避免 kernel correctness checking 和通用 RL testing helper 混在一起。 - -### 2026-06-28 - 添加新算子的傻瓜式流程 - -目标: - -- 新增算子时不修改测试主逻辑。 -- 新增算子只改注册信息、输入工厂和必要测试。 -- gold 永远使用 `rl_engine/kernels/ops/pytorch` 下实现。 - -步骤 1:确认算子标准接口 - -先确定这个算子的标准语义输入。例如: +Update `operator_shape_name()`: -```text -logp: - inputs: - logits: [B, S, V] - token_ids: [B, S] - output: - selected_logp: [B, S] +```python +names = { + ... + "new_op": f"{batch}x{seq}x...", +} ``` -要求: - -- PyTorch gold、CUDA、Triton、ROCm wrapper 都应尽量使用同一套 Python 接口。 -- 不同 backend 不应要求测试框架长期维护 shape/参数名 adapter。 -- 当前 `_LogpSM90CandidateAdapter` 只是为了验证框架最小闭环的临时例外,不作为长期模式。 +Add the input builder: -步骤 2:在 `operator_inputs.py` 添加输入构造 - -文件: - -```text -rl_engine/kernels/gtest/operator_inputs.py +```python +def _make_new_op_inputs(args, dtype, device): + batch, seq = _batch_seq(args) + return { + "x": _floating_tensor((batch, seq, ...), args, dtype, device, offset=0), + } ``` -需要做三件事: - -1. 在 `make_operator_inputs()` 的 `builders` 中加入算子名。 -2. 在 `operator_shape_name()` 的 `names` 中加入 shape 描述。 -3. 新增 `_make_xxx_inputs(args, dtype, device)`。 - -要求: +Rules: -- 输入必须是标准语义输入,不是某个 CUDA kernel 的私有格式。 -- `random` 模式必须可由 `--seed` 复现。 -- `constant` 模式必须便于 debug。 -- 多 batch 情况默认保留 `[B, S, ...]` 语义形状,不提前为某个 backend flatten。 +- Inputs should represent the operator's standard semantic interface. +- Do not generate backend-specific flattened inputs here. +- Support deterministic random inputs and constant inputs where practical. -步骤 3:在 `operator_specs.py` 注册 gold 和 candidate +### 2. Register Gold and Candidates -文件: +File: ```text rl_engine/kernels/gtest/operator_specs.py ``` -添加: +Add an `OperatorSpec` entry: ```python "new_op": OperatorSpec( name="new_op", - op_class="...", + op_class="elementwise", gold_path="rl_engine.kernels.ops.pytorch....NativeNewOp", registry_name="new_op", candidate_paths={ @@ -879,166 +282,89 @@ rl_engine/kernels/gtest/operator_specs.py ) ``` -硬性规则: - -- `gold_path` 必须来自 `rl_engine.kernels.ops.pytorch`。 -- `candidate_paths["pytorch"]` 只能用于框架自检,不代表高性能算子通过。 -- `candidate_paths["cuda"]`、`candidate_paths["triton"]` 等必须对应实际被测 backend。 -- 不允许用实现了不同数学功能的算子互相比较,例如不能用 `linear_logp` 测普通 `logp`。 - -步骤 4:确认 gold 调用方法 - -当前 `make_operator_case()` 对 logp 使用: - -```python -gold_fn=gold_op.forward_fp32 -``` +Rules: -这对 `NativeLogpOp` 是正确的。新增算子时必须确认 PyTorch gold 是否有对应方法。 +- `gold_path` must come from `rl_engine.kernels.ops.pytorch`. +- Backend implementations are candidates only. +- `candidate=pytorch` is for checker smoke tests only. +- Do not compare operators with different math. -如果新算子没有 `forward_fp32`,不要在测试主逻辑中硬编码临时分支;应在 `operator_specs.py` 中显式补充 gold 调用策略,作为一个独立小改动提交。 +### 3. Update Tolerances If Needed -步骤 5:新增输入和 runner 单测 - -至少补两类测试: +File: ```text -tests/test_operator_inputs.py: - 确认 make_operator_inputs("new_op", ...) 能生成输入。 - 确认 random seed 可复现。 - 确认 constant 模式值正确。 - -tests/test_op_checks.py 或新测试文件: - 用 pytorch candidate vs pytorch gold 验证框架能跑通。 - 用 bad candidate 验证失败报告符合预期。 +rl_engine/kernels/gtest/tolerance_contract.yaml ``` -步骤 6:本地验证 - -CPU 框架验证: +Reuse an existing class when possible: -```bash -python -m pytest tests/test_op_checks.py tests/test_operator_inputs.py -q +```text +elementwise +reduction +logprob ``` -CUDA candidate 验证: +If a new class is needed, add dtype tolerances and set `op_class` accordingly in `operator_specs.py`. -```bash -python scripts/check_operator.py \ - --op new_op \ - --candidate cuda \ - --device cuda \ - --dtype bf16 \ - --arch-key sm90 \ - --batch 2 \ - --seq 16 -``` +### 4. Add Tests -如果 CUDA candidate 不通过,先判断: +Files: ```text -1. gold 和 candidate 是否真的是同一个数学函数。 -2. candidate Python wrapper 是否使用标准接口。 -3. 输入 dtype / shape 是否符合 candidate 声明。 -4. 误差是否超过 tolerance。 -5. 是否是 kernel 编译或运行错误。 +tests/test_operator_inputs.py +tests/test_op_checks.py ``` -不要为了让测试通过去修改 gold,也不要把不同功能的算子混在一起比较。 +Minimum expected coverage: -### 2026-06-28 - `check_operator.py` 支持参数 +- Add the operator to the `test_operator_inputs_support_all_issue_108_ops` parametrized list. +- Add a PyTorch-vs-PyTorch smoke case if the operator adds new runner behavior. +- Add a bad-candidate case if the operator introduces new comparison behavior. -入口: +### 5. Validate ```bash -python scripts/check_operator.py [options] +python -m pytest tests/test_tolerance_contract.py tests/test_op_checks.py tests/test_operator_inputs.py -q ``` -核心参数: - -```text ---op - 算子名。当前最小版本支持 logp。 - ---candidate - 被测实现。当前 logp 支持 registry、pytorch、native、cuda、cuda-generic、cuda-sm90。 - ---dtype - fp32、bf16、fp16。 +Then run the CLI: ---device - auto、cpu、cuda 或 torch 可识别的 device 字符串。 - ---arch-key - tolerance override key,例如 sm90。为空时使用 default tolerance。 +```bash +python scripts/check_operator.py --op new_op --candidate pytorch --device cpu --dtype fp32 ``` -shape 参数: - -```text ---batch - batch size,默认 2。 - ---seq - sequence length,默认 16。 - ---vocab - vocabulary size,默认 257。 +For CUDA: ---normalized-dim - norm 类算子的 hidden/normalized dimension。 - ---k-dim - matmul K dimension。 - ---n-dim - matmul N dimension。 +```bash +python scripts/check_operator.py --op new_op --candidate cuda --device cuda --dtype bf16 --arch-key sm90 ``` -输入初始化参数: - -```text ---input-mode - random 或 constant。 +## CUDA Validation Notes ---constant-value - constant 模式下浮点 tensor 的基础值。 +H100 environment observed during development: ---token-value - constant 模式下 token id 的基础值,会对 vocab 取模。 - ---seed - random 模式下的随机种子。 +```text +GPU: NVIDIA H100 80GB HBM3 +Driver: 580.95.05 +CUDA driver capability: 13.0 +nvcc: 13.0 +torch: 2.12.0+cu130 +compute capability: (9, 0) ``` -其他参数: +Generic CUDA `logp` passed on H100 for vocab sizes 256, 512, 1024, 2048, and 4096 with bf16 inputs under the current tolerance contract. -```text ---theta - RoPE theta。 +SM90 fused logp is not marked as a passing path in this PR. It compiled and loaded in some experiments, but runtime failures and accuracy failures were observed separately. Treat SM90 fused logp as a separate CUDA kernel validation task unless `check_operator.py` reports `passed=True` for the target case. ---eps - norm epsilon。 +## Validation Performed ---json - 输出完整 JSON report。 +```bash +python -m pytest tests/test_tolerance_contract.py tests/test_op_checks.py tests/test_operator_inputs.py -q ``` -当前最小可运行示例: +CPU CLI smoke test: ```bash -python scripts/check_operator.py \ - --op logp \ - --candidate cuda \ - --device cuda \ - --dtype bf16 \ - --arch-key sm90 \ - --batch 1 \ - --seq 1 \ - --vocab 4096 +python scripts/check_operator.py --op logp --candidate pytorch --device cpu --dtype fp32 --batch 1 --seq 2 --vocab 17 ``` - -预期: - -- 在 H100 + CUDA 13.0 环境中,generic CUDA logp 已观察到通过。 -- `cuda-sm90` 当前不应作为通过路径使用。 diff --git a/rl_engine/kernels/gtest/op_checks.py b/rl_engine/kernels/gtest/op_checks.py index 433bbfc..e0e778c 100644 --- a/rl_engine/kernels/gtest/op_checks.py +++ b/rl_engine/kernels/gtest/op_checks.py @@ -146,6 +146,7 @@ def _run_case( case: OperatorCase, contract: Mapping[str, Any], ) -> CaseCheck: + # TODO(issue-108): add optional gradient checks for differentiable operators. candidate_outputs = _flatten_tensors(_call_candidate(candidate.fn, case.inputs)) gold_outputs = _flatten_tensors(case.gold_fn(**case.inputs)) if len(candidate_outputs) != len(gold_outputs): From 035f21b840d4dad1298c1b19f3b2463d14cf2fe6 Mon Sep 17 00:00:00 2001 From: a-kaa Date: Mon, 29 Jun 2026 21:26:23 +0800 Subject: [PATCH 06/11] Add logp gradient test --- docs/contributing/issue-108-session-log.md | 21 +++++++++++++++++++++ tests/test_logp.py | 14 ++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md index dd66a1d..6c0c892 100644 --- a/docs/contributing/issue-108-session-log.md +++ b/docs/contributing/issue-108-session-log.md @@ -368,3 +368,24 @@ CPU CLI smoke test: ```bash python scripts/check_operator.py --op logp --candidate pytorch --device cpu --dtype fp32 --batch 1 --seq 2 --vocab 17 ``` + +## PR Review Updates + +### LogP Gradient Coverage + +Files: + +```text +tests/test_logp.py +docs/contributing/issue-108-session-log.md +``` + +Change: + +- Added a forward-gradient test for `NativeLogpOp.forward_fp32`. + +Reasoning: + +- The checker PR already validates forward output values, but review feedback called out that logprob coverage should also prove gradient propagation and batch invariance. +- The new gradient test compares the op gradient against a direct PyTorch `log_softmax + gather` reference under a non-unit upstream gradient. +- Batch invariance was already covered by `TestNativeLogpOpBatchInvariance` in `tests/test_logp.py`, so no duplicate batch-invariance test was added. diff --git a/tests/test_logp.py b/tests/test_logp.py index 42492fb..bcb0e3f 100644 --- a/tests/test_logp.py +++ b/tests/test_logp.py @@ -78,6 +78,20 @@ def test_pure_function_no_inplace(self): assert torch.equal(logits, logits_orig) assert torch.equal(token_ids, token_ids_orig) + def test_forward_fp32_gradient_matches_reference(self): + gen = torch.Generator().manual_seed(654) + logits = torch.randn(2, 4, 17, generator=gen, requires_grad=True) + ref_logits = logits.detach().clone().requires_grad_(True) + token_ids = torch.randint(0, logits.size(-1), (2, 4), generator=gen) + upstream = torch.randn(2, 4, generator=gen) + + (NativeLogpOp().forward_fp32(logits, token_ids) * upstream).sum().backward() + (_reference_selected_logp(ref_logits, token_ids) * upstream).sum().backward() + + assert logits.grad is not None + assert ref_logits.grad is not None + assert torch.allclose(logits.grad, ref_logits.grad, atol=1e-6, rtol=1e-6) + def test_op_class_is_logprob(self): assert NativeLogpOp.op_class == "logprob" From 6dd09a42048162f4dc9c7b7c44a6642f5c822f54 Mon Sep 17 00:00:00 2001 From: a-kaa Date: Mon, 29 Jun 2026 22:31:31 +0800 Subject: [PATCH 07/11] Add gtest backward checks --- docs/contributing/issue-108-session-log.md | 33 ++++++ rl_engine/kernels/gtest/op_checks.py | 111 ++++++++++++++++++++- rl_engine/kernels/gtest/operator_specs.py | 3 + scripts/check_operator.py | 11 +- tests/test_op_checks.py | 43 ++++++++ 5 files changed, 196 insertions(+), 5 deletions(-) diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md index 6c0c892..0644424 100644 --- a/docs/contributing/issue-108-session-log.md +++ b/docs/contributing/issue-108-session-log.md @@ -201,6 +201,7 @@ Supported key options: --constant-value Floating-point value for constant mode. --token-value Token id for constant mode, reduced modulo vocab. --seed Random seed for reproducible random inputs. +--check-grad Also compare gradients for inputs declared by the operator spec. --json Print the full structured report as JSON. ``` @@ -369,6 +370,12 @@ CPU CLI smoke test: python scripts/check_operator.py --op logp --candidate pytorch --device cpu --dtype fp32 --batch 1 --seq 2 --vocab 17 ``` +Backward CLI smoke test: + +```bash +python scripts/check_operator.py --op logp --candidate pytorch --device cpu --dtype fp32 --batch 1 --seq 2 --vocab 17 --check-grad +``` + ## PR Review Updates ### LogP Gradient Coverage @@ -389,3 +396,29 @@ Reasoning: - The checker PR already validates forward output values, but review feedback called out that logprob coverage should also prove gradient propagation and batch invariance. - The new gradient test compares the op gradient against a direct PyTorch `log_softmax + gather` reference under a non-unit upstream gradient. - Batch invariance was already covered by `TestNativeLogpOpBatchInvariance` in `tests/test_logp.py`, so no duplicate batch-invariance test was added. + +### GTest Backward Check Support + +Files: + +```text +rl_engine/kernels/gtest/op_checks.py +rl_engine/kernels/gtest/operator_specs.py +scripts/check_operator.py +tests/test_op_checks.py +docs/contributing/issue-108-session-log.md +``` + +Change: + +- Added `OperatorCase.grad_input_names` to declare which inputs should be checked for gradients. +- Added `run_operator_suite(..., check_grad=True)`. +- Added `_run_case_backward()` to compare candidate forward outputs and selected input gradients against the PyTorch gold path. +- Added `OperatorSpec.grad_input_names`; `logp` declares `("logits",)`. +- Added `scripts/check_operator.py --check-grad`. + +Reasoning: + +- Forward-only checks can miss incorrect or disconnected backward paths. +- Gradient inputs must be declared per operator because not every floating tensor should receive gradients. +- Input generation remains independent of autograd; the runner clones inputs and enables `requires_grad` only inside the backward check path. diff --git a/rl_engine/kernels/gtest/op_checks.py b/rl_engine/kernels/gtest/op_checks.py index e0e778c..fd590d9 100644 --- a/rl_engine/kernels/gtest/op_checks.py +++ b/rl_engine/kernels/gtest/op_checks.py @@ -21,6 +21,7 @@ class OperatorCase: dtype: torch.dtype inputs: Mapping[str, Any] gold_fn: Callable[..., Any] + grad_input_names: tuple[str, ...] = () @dataclass(frozen=True) @@ -95,6 +96,7 @@ def run_operator_suite( candidates: Sequence[CandidateSpec], cases: Sequence[OperatorCase], contract: Mapping[str, Any] | None = None, + check_grad: bool = False, ) -> OperatorCheckReport: """Run candidates against gold outputs and return a structured report.""" @@ -104,7 +106,8 @@ def run_operator_suite( # camdidate : test instance # loaded_contract : tolerance table candidate_reports = [ - _run_candidate(candidate, cases, loaded_contract) for candidate in candidates + _run_candidate(candidate, cases, loaded_contract, check_grad=check_grad) + for candidate in candidates ] passed_candidates = sum(1 for report in candidate_reports if report.passed) total_candidates = len(candidate_reports) @@ -123,8 +126,11 @@ def _run_candidate( candidate: CandidateSpec, cases: Sequence[OperatorCase], contract: Mapping[str, Any], + *, + check_grad: bool, ) -> CandidateReport: - case_checks = [_run_case(candidate, case, contract) for case in cases] + case_runner = _run_case_backward if check_grad else _run_case + case_checks = [case_runner(candidate, case, contract) for case in cases] total_outputs = sum(len(case.outputs) for case in case_checks) passed_outputs = sum( 1 for case in case_checks for output in case.outputs if output.passed @@ -146,9 +152,68 @@ def _run_case( case: OperatorCase, contract: Mapping[str, Any], ) -> CaseCheck: - # TODO(issue-108): add optional gradient checks for differentiable operators. candidate_outputs = _flatten_tensors(_call_candidate(candidate.fn, case.inputs)) gold_outputs = _flatten_tensors(case.gold_fn(**case.inputs)) + return _compare_case_outputs(candidate, case, contract, candidate_outputs, gold_outputs) + + +def _run_case_backward( + candidate: CandidateSpec, + case: OperatorCase, + contract: Mapping[str, Any], +) -> CaseCheck: + if not case.grad_input_names: + raise ValueError(f"case {case.name!r} does not declare gradient inputs") + + candidate_inputs = _clone_inputs_for_backward(case.inputs, case.grad_input_names) + gold_inputs = _clone_inputs_for_backward(case.inputs, case.grad_input_names) + candidate_outputs = _flatten_tensors(_call_candidate(candidate.fn, candidate_inputs)) + gold_outputs = _flatten_tensors(case.gold_fn(**gold_inputs)) + candidate_grads = _backward_grads(candidate_outputs, candidate_inputs, case.grad_input_names) + gold_grads = _backward_grads(gold_outputs, gold_inputs, case.grad_input_names) + output_checks = _compare_case_outputs( + candidate, + case, + contract, + candidate_outputs, + gold_outputs, + ).outputs + atol, rtol = _resolve_tolerance( + contract, + op_class=case.op_class, + dtype=case.dtype, + arch_key=candidate.arch_key, + ) + grad_checks = [ + _compare_output( + candidate_grad, + gold_grad, + output_index=len(output_checks) + index, + atol=atol, + rtol=rtol, + message=f"gradient:{name}", + ) + for index, (name, candidate_grad, gold_grad) in enumerate( + zip(case.grad_input_names, candidate_grads, gold_grads, strict=True) + ) + ] + checks = [*output_checks, *grad_checks] + return CaseCheck( + case_name=case.name, + dtype=str(case.dtype), + op_class=case.op_class, + passed=all(output.passed for output in checks), + outputs=checks, + ) + + +def _compare_case_outputs( + candidate: CandidateSpec, + case: OperatorCase, + contract: Mapping[str, Any], + candidate_outputs: list[torch.Tensor], + gold_outputs: list[torch.Tensor], +) -> CaseCheck: if len(candidate_outputs) != len(gold_outputs): raise ValueError( f"candidate {candidate.name!r} returned {len(candidate_outputs)} outputs, " @@ -188,6 +253,44 @@ def _call_candidate(candidate: Callable[..., Any] | Any, inputs: Mapping[str, An return candidate(**inputs) +def _clone_inputs_for_backward( + inputs: Mapping[str, Any], + grad_input_names: tuple[str, ...], +) -> dict[str, Any]: + grad_names = set(grad_input_names) + cloned: dict[str, Any] = {} + for name, value in inputs.items(): + if isinstance(value, torch.Tensor): + tensor = value.detach().clone() + if name in grad_names: + if not tensor.is_floating_point(): + raise TypeError(f"gradient input {name!r} must be floating point") + tensor.requires_grad_(True) + cloned[name] = tensor + else: + cloned[name] = value + missing = grad_names.difference(cloned) + if missing: + raise ValueError(f"missing gradient inputs: {', '.join(sorted(missing))}") + return cloned + + +def _backward_grads( + outputs: list[torch.Tensor], + inputs: Mapping[str, Any], + grad_input_names: tuple[str, ...], +) -> list[torch.Tensor]: + loss = sum(output.float().sum() for output in outputs) + loss.backward() + grads: list[torch.Tensor] = [] + for name in grad_input_names: + grad = inputs[name].grad + if grad is None: + raise ValueError(f"gradient for input {name!r} is None") + grads.append(grad) + return grads + + def _flatten_tensors(value: Any) -> list[torch.Tensor]: if isinstance(value, torch.Tensor): return [value] @@ -239,6 +342,7 @@ def _compare_output( output_index: int, atol: float, rtol: float, + message: str = "", ) -> OutputCheck: if candidate.shape != gold.shape: return OutputCheck( @@ -279,6 +383,7 @@ def _compare_output( mean_abs_error=mean_abs_error, max_rel_error=max_rel_error, passed=bool(torch.allclose(candidate_fp32, gold_fp32, atol=atol, rtol=rtol)), + message=message, ) diff --git a/rl_engine/kernels/gtest/operator_specs.py b/rl_engine/kernels/gtest/operator_specs.py index dc4674c..1373151 100644 --- a/rl_engine/kernels/gtest/operator_specs.py +++ b/rl_engine/kernels/gtest/operator_specs.py @@ -21,6 +21,7 @@ class OperatorSpec: gold_path: str registry_name: str candidate_paths: dict[str, str] + grad_input_names: tuple[str, ...] = () def _load_object(path: str) -> Any: @@ -42,6 +43,7 @@ def _load_object(path: str) -> Any: "cuda-generic": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpGenericOp", "cuda-sm90": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpSM90Op", }, + grad_input_names=("logits",), ), } @@ -72,6 +74,7 @@ def make_operator_case( dtype=dtype, inputs=make_operator_inputs(args.op, args, dtype, device), gold_fn=gold_op.forward_fp32, + grad_input_names=spec.grad_input_names, ) diff --git a/scripts/check_operator.py b/scripts/check_operator.py index 9b99612..0b31b70 100644 --- a/scripts/check_operator.py +++ b/scripts/check_operator.py @@ -53,8 +53,9 @@ def _summarize(report: Any) -> None: ) for case in candidate.cases: for output in case.outputs: + label = f" {output.message}" if output.message else "" print( - f" case={case.case_name} output={output.output_index} " + f" case={case.case_name} output={output.output_index}{label} " f"shape={output.shape} dtype={output.candidate_dtype} " f"max_abs={output.max_abs_error:.8e} " f"mean_abs={output.mean_abs_error:.8e} " @@ -91,6 +92,7 @@ def parse_args() -> argparse.Namespace: default=None, help="Optional tolerance override key, for example sm90. Defaults to contract.default.", ) + parser.add_argument("--check-grad", action="store_true", help="Also compare gradients for supported inputs.") parser.add_argument("--json", action="store_true", help="Print the full structured report as JSON.") return parser.parse_args() @@ -101,7 +103,12 @@ def main() -> None: device = _select_device(args.device) candidate = make_candidate(args) case = make_operator_case(args, dtype, device) - report = run_operator_suite(args.op, candidates=[candidate], cases=[case]) + report = run_operator_suite( + args.op, + candidates=[candidate], + cases=[case], + check_grad=args.check_grad, + ) if args.json: print(json.dumps(report.to_dict(), indent=2, default=str)) diff --git a/tests/test_op_checks.py b/tests/test_op_checks.py index 9f8a901..b28148b 100644 --- a/tests/test_op_checks.py +++ b/tests/test_op_checks.py @@ -22,6 +22,18 @@ def _logp_case(name: str, dtype: torch.dtype, *, seed: int = 0) -> OperatorCase: ) +def _logp_backward_case(name: str, *, seed: int = 0) -> OperatorCase: + case = _logp_case(name, torch.float32, seed=seed) + return OperatorCase( + name=case.name, + op_class=case.op_class, + dtype=case.dtype, + inputs=case.inputs, + gold_fn=case.gold_fn, + grad_input_names=("logits",), + ) + + def test_logp_native_candidate_suite_passes(): report = run_operator_suite( "logp", @@ -127,3 +139,34 @@ def slightly_shifted_logp(logits, token_ids): output = report.candidates[0].cases[0].outputs[0] assert report.passed assert output.atol == 5.0e-2 + + +def test_logp_native_candidate_backward_suite_passes(): + report = run_operator_suite( + "logp", + candidates=[CandidateSpec(name="native-logp", backend="pytorch", fn=NativeLogpOp())], + cases=[_logp_backward_case("fp32", seed=8)], + check_grad=True, + ) + + assert report.passed + assert report.candidates[0].passed_outputs == 2 + assert report.candidates[0].cases[0].outputs[1].message == "gradient:logits" + + +def test_backward_suite_reports_failure_for_bad_gradient(): + def bad_grad_logp(logits, token_ids): + values = NativeLogpOp().forward_fp32(logits, token_ids) + return values.detach() + logits.sum(dim=-1) * 0.0 + + report = run_operator_suite( + "logp", + candidates=[CandidateSpec(name="bad-grad-logp", backend="test", fn=bad_grad_logp)], + cases=[_logp_backward_case("fp32", seed=9)], + check_grad=True, + ) + + gradient_output = report.candidates[0].cases[0].outputs[1] + assert not report.passed + assert gradient_output.message == "gradient:logits" + assert gradient_output.max_abs_error > 0.0 From d53a5a670823a55bbd1a728c6afd058f97cadb86 Mon Sep 17 00:00:00 2001 From: a-kaa Date: Mon, 29 Jun 2026 23:40:55 +0800 Subject: [PATCH 08/11] Add linear logp gtest smoke coverage --- docs/contributing/issue-108-session-log.md | 57 +++++++++++++++++++++- rl_engine/kernels/gtest/operator_inputs.py | 16 ++++++ rl_engine/kernels/gtest/operator_specs.py | 18 ++++++- tests/test_operator_inputs.py | 11 +++++ 4 files changed, 100 insertions(+), 2 deletions(-) diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md index 0644424..b55806f 100644 --- a/docs/contributing/issue-108-session-log.md +++ b/docs/contributing/issue-108-session-log.md @@ -189,7 +189,7 @@ python scripts/check_operator.py --op logp --candidate pytorch --device cp Supported key options: ```text ---op Operator name. The minimal version supports logp. +--op Operator name. The minimal version supports logp and linear_logp. --candidate Candidate backend, for example pytorch, cuda, cuda-generic, cuda-sm90, registry. --dtype fp32, bf16, or fp16. --device auto, cpu, cuda, or another torch device string. @@ -422,3 +422,58 @@ Reasoning: - Forward-only checks can miss incorrect or disconnected backward paths. - Gradient inputs must be declared per operator because not every floating tensor should receive gradients. - Input generation remains independent of autograd; the runner clones inputs and enables `requires_grad` only inside the backward check path. + +Known backend limitation: + +- `cuda` `logp` currently calls the compiled `_C.fused_logp` forward path directly and does not produce an autograd-connected output. +- Running `--check-grad` against `candidate=cuda` fails with a missing `grad_fn`; this is a backend implementation gap, not a tolerance issue. +- To support `cuda logp` backward, the backend must add or wrap a real backward path, usually via `torch.autograd.Function` or an explicit CUDA backward kernel. + +### Linear LogP Triton GTest Smoke Support + +Files: + +```text +rl_engine/kernels/gtest/operator_inputs.py +rl_engine/kernels/gtest/operator_specs.py +tests/test_operator_inputs.py +docs/contributing/issue-108-session-log.md +``` + +Change: + +- Added `linear_logp` input construction with `hidden`, `lm_head_weight`, `target_ids`, and `bias=None`. +- Added a `linear_logp` operator spec using `NativeLinearLogpOp.apply` as the PyTorch gold path. +- Added `triton` as a `linear_logp` candidate backend. +- Declared `("hidden", "lm_head_weight")` as gradient inputs for backward checks. + +Reasoning: + +- `linear_logp` is the smallest real fused op in the repository with an implemented Triton backward path. +- The first gtest integration keeps bias disabled to avoid optional-gradient handling in the initial smoke path. +- The op reuses the `logprob` tolerance class because it produces selected-token log probabilities. +- It gives the checker a real non-PyTorch differentiable candidate for end-to-end forward/backward reporting. + +Example Triton smoke command: + +```bash +python scripts/check_operator.py --op linear_logp --candidate triton --device cuda --dtype bf16 --batch 1 --seq 2 --vocab 1024 --normalized-dim 4096 --check-grad +``` + +Observed bf16 result on H100: + +```text +suite=linear_logp passed=False pass_rate=0.0000 +candidate=triton-linear_logp backend=triton passed=False pass_rate=0.6667 + case=linear_logp-torch.bfloat16-1x2x4096x1024 output=0 shape=(1, 2) dtype=torch.float32 max_abs=4.76226807e-01 mean_abs=4.74334717e-01 max_rel=2.96434597e-03 tol=(atol=5.000e-02, rtol=0.000e+00) passed=False + case=linear_logp-torch.bfloat16-1x2x4096x1024 output=1 gradient:hidden shape=(1, 2, 4096) dtype=torch.bfloat16 max_abs=0.00000000e+00 mean_abs=0.00000000e+00 max_rel=0.00000000e+00 tol=(atol=5.000e-02, rtol=0.000e+00) passed=True + case=linear_logp-torch.bfloat16-1x2x4096x1024 output=2 gradient:lm_head_weight shape=(1024, 4096) dtype=torch.bfloat16 max_abs=0.00000000e+00 mean_abs=0.00000000e+00 max_rel=0.00000000e+00 tol=(atol=5.000e-02, rtol=0.000e+00) passed=True +``` + +Interpretation: + +- The checker flow is complete: CLI parsing, input construction, PyTorch gold loading, Triton candidate loading, forward execution, backward execution, gradient collection, comparison, and report formatting all ran successfully. +- The suite is intentionally not marked as passing because the bf16 forward output exceeded the current `logprob` absolute tolerance. +- The current tolerance uses `atol=5.0e-2` and `rtol=0.0`; the observed forward absolute error is about `4.76e-1`, while the relative error is about `2.96e-3`. +- `linear_logp` includes a large bf16 matrix multiply before selected-token logprob, so it likely needs an operator-specific gold policy or tolerance rather than reusing plain `logprob` tolerances unchanged. +- In this case, both checked gradients passed, so the failure is a forward accuracy/tolerance calibration issue, not a failure to execute the backward checker path. diff --git a/rl_engine/kernels/gtest/operator_inputs.py b/rl_engine/kernels/gtest/operator_inputs.py index 7e09719..4d131e2 100644 --- a/rl_engine/kernels/gtest/operator_inputs.py +++ b/rl_engine/kernels/gtest/operator_inputs.py @@ -30,6 +30,7 @@ def make_operator_inputs( "matmul": _make_matmul_inputs, "attention": _make_attention_inputs, "logp": _make_logp_inputs, + "linear_logp": _make_linear_logp_inputs, "rope": _make_rope_inputs, "silu": _make_silu_inputs, "swiglu": _make_swiglu_inputs, @@ -51,6 +52,7 @@ def operator_shape_name(op_name: str, args: argparse.Namespace) -> str: "matmul": f"{batch}x{seq}x{_matmul_k(args)}x{_matmul_n(args)}", "attention": f"{batch}x{DEFAULT_N_HEADS}x{seq}x{DEFAULT_HEAD_DIM}", "logp": f"{batch}x{seq}x{vocab}", + "linear_logp": f"{batch}x{seq}x{_normalized_dim(args)}x{vocab}", "rope": f"{batch}x{DEFAULT_N_HEADS}x{seq}x{DEFAULT_HEAD_DIM}", "silu": f"{batch}x{seq}x{DEFAULT_INTERMEDIATE}", "swiglu": f"{batch}x{seq}x{DEFAULT_INTERMEDIATE}", @@ -111,6 +113,20 @@ def _make_logp_inputs( } +def _make_linear_logp_inputs( + args: argparse.Namespace, dtype: torch.dtype, device: torch.device +) -> dict[str, Any]: + batch, seq = _batch_seq(args) + hidden_dim = _normalized_dim(args) + vocab = _arg_int(args, "vocab", DEFAULT_VOCAB) + return { + "hidden": _floating_tensor((batch, seq, hidden_dim), args, dtype, device, offset=0), + "lm_head_weight": _floating_tensor((vocab, hidden_dim), args, dtype, device, offset=1), + "target_ids": _token_ids((batch, seq), vocab, args, device), + "bias": None, + } + + def _make_rope_inputs( args: argparse.Namespace, dtype: torch.dtype, device: torch.device ) -> dict[str, Any]: diff --git a/rl_engine/kernels/gtest/operator_specs.py b/rl_engine/kernels/gtest/operator_specs.py index 1373151..334224b 100644 --- a/rl_engine/kernels/gtest/operator_specs.py +++ b/rl_engine/kernels/gtest/operator_specs.py @@ -19,6 +19,7 @@ class OperatorSpec: name: str op_class: str gold_path: str + gold_method: str registry_name: str candidate_paths: dict[str, str] grad_input_names: tuple[str, ...] = () @@ -36,6 +37,7 @@ def _load_object(path: str) -> Any: name="logp", op_class="logprob", gold_path="rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp", + gold_method="forward_fp32", registry_name="logp", candidate_paths={ "pytorch": "rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp", @@ -45,6 +47,19 @@ def _load_object(path: str) -> Any: }, grad_input_names=("logits",), ), + "linear_logp": OperatorSpec( + name="linear_logp", + op_class="logprob", + gold_path="rl_engine.kernels.ops.pytorch.loss.linear_logp.NativeLinearLogpOp", + gold_method="apply", + registry_name="linear_logp", + candidate_paths={ + "pytorch": "rl_engine.kernels.ops.pytorch.loss.linear_logp.NativeLinearLogpOp", + "triton": "rl_engine.kernels.ops.triton.loss.linear_logp.TritonLinearLogpOp", + "cuda-sm90": "rl_engine.kernels.ops.cuda.loss.linear_logp.FusedLinearLogpSM90Op", + }, + grad_input_names=("hidden", "lm_head_weight"), + ), } @@ -68,12 +83,13 @@ def make_operator_case( ) -> OperatorCase: spec = OP_SPECS[args.op] gold_op = _load_object(spec.gold_path)() + gold_fn = getattr(gold_op, spec.gold_method) return OperatorCase( name=f"{args.op}-{dtype}-{operator_shape_name(args.op, args)}", op_class=spec.op_class, dtype=dtype, inputs=make_operator_inputs(args.op, args, dtype, device), - gold_fn=gold_op.forward_fp32, + gold_fn=gold_fn, grad_input_names=spec.grad_input_names, ) diff --git a/tests/test_operator_inputs.py b/tests/test_operator_inputs.py index 22d39ae..bb1a222 100644 --- a/tests/test_operator_inputs.py +++ b/tests/test_operator_inputs.py @@ -37,6 +37,7 @@ def _args(**overrides): "matmul", "attention", "logp", + "linear_logp", "rope", "silu", "swiglu", @@ -68,3 +69,13 @@ def test_random_logp_inputs_are_seeded(): assert torch.equal(first["logits"], second["logits"]) assert torch.equal(first["token_ids"], second["token_ids"]) + + +def test_constant_linear_logp_inputs_match_operator_contract(): + args = _args(input_mode="constant", constant_value=0.5, token_value=3) + inputs = make_operator_inputs("linear_logp", args, torch.float32, torch.device("cpu")) + + assert torch.equal(inputs["hidden"], torch.full((1, 2, 128), 0.5)) + assert torch.equal(inputs["lm_head_weight"], torch.full((17, 128), 0.51)) + assert torch.equal(inputs["target_ids"], torch.full((1, 2), 3, dtype=torch.long)) + assert inputs["bias"] is None From 4bd2ed8420560fe880bf810da0aabbd8a490b6bc Mon Sep 17 00:00:00 2001 From: a-kaa Date: Thu, 2 Jul 2026 08:14:01 +0800 Subject: [PATCH 09/11] add random grad for gtest --- docs/contributing/issue-108-session-log.md | 7 +- rl_engine/kernels/gtest/op_checks.py | 110 +++++++++++++++++++-- scripts/check_operator.py | 10 ++ tests/test_op_checks.py | 49 +++++++++ 4 files changed, 169 insertions(+), 7 deletions(-) diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md index b55806f..cd8c0f7 100644 --- a/docs/contributing/issue-108-session-log.md +++ b/docs/contributing/issue-108-session-log.md @@ -202,6 +202,8 @@ Supported key options: --token-value Token id for constant mode, reduced modulo vocab. --seed Random seed for reproducible random inputs. --check-grad Also compare gradients for inputs declared by the operator spec. +--grad-mode Upstream gradient mode for --check-grad: random by default; ones for smoke tests. +--grad-seed Random upstream gradient seed for --grad-mode random. --json Print the full structured report as JSON. ``` @@ -416,12 +418,15 @@ Change: - Added `_run_case_backward()` to compare candidate forward outputs and selected input gradients against the PyTorch gold path. - Added `OperatorSpec.grad_input_names`; `logp` declares `("logits",)`. - Added `scripts/check_operator.py --check-grad`. +- Added `--grad-mode ones|random` and `--grad-seed` for backward checks. Reasoning: - Forward-only checks can miss incorrect or disconnected backward paths. - Gradient inputs must be declared per operator because not every floating tensor should receive gradients. - Input generation remains independent of autograd; the runner clones inputs and enables `requires_grad` only inside the backward check path. +- `grad_mode=random` is the default and catches backward bugs hidden by all-one upstream gradients. +- `grad_mode=ones` remains available for quick smoke tests and preserves the old `output.sum().backward()` behavior. Known backend limitation: @@ -457,7 +462,7 @@ Reasoning: Example Triton smoke command: ```bash -python scripts/check_operator.py --op linear_logp --candidate triton --device cuda --dtype bf16 --batch 1 --seq 2 --vocab 1024 --normalized-dim 4096 --check-grad +python scripts/check_operator.py --op linear_logp --candidate triton --device cuda --dtype bf16 --batch 1 --seq 2 --vocab 1024 --normalized-dim 4096 --check-grad --grad-mode random --grad-seed 123 ``` Observed bf16 result on H100: diff --git a/rl_engine/kernels/gtest/op_checks.py b/rl_engine/kernels/gtest/op_checks.py index fd590d9..cbd0fd6 100644 --- a/rl_engine/kernels/gtest/op_checks.py +++ b/rl_engine/kernels/gtest/op_checks.py @@ -97,6 +97,8 @@ def run_operator_suite( cases: Sequence[OperatorCase], contract: Mapping[str, Any] | None = None, check_grad: bool = False, + grad_mode: str = "random", + grad_seed: int = 123, ) -> OperatorCheckReport: """Run candidates against gold outputs and return a structured report.""" @@ -106,7 +108,14 @@ def run_operator_suite( # camdidate : test instance # loaded_contract : tolerance table candidate_reports = [ - _run_candidate(candidate, cases, loaded_contract, check_grad=check_grad) + _run_candidate( + candidate, + cases, + loaded_contract, + check_grad=check_grad, + grad_mode=grad_mode, + grad_seed=grad_seed, + ) for candidate in candidates ] passed_candidates = sum(1 for report in candidate_reports if report.passed) @@ -128,9 +137,22 @@ def _run_candidate( contract: Mapping[str, Any], *, check_grad: bool, + grad_mode: str, + grad_seed: int, ) -> CandidateReport: - case_runner = _run_case_backward if check_grad else _run_case - case_checks = [case_runner(candidate, case, contract) for case in cases] + if check_grad: + case_checks = [ + _run_case_backward( + candidate, + case, + contract, + grad_mode=grad_mode, + grad_seed=grad_seed, + ) + for case in cases + ] + else: + case_checks = [_run_case(candidate, case, contract) for case in cases] total_outputs = sum(len(case.outputs) for case in case_checks) passed_outputs = sum( 1 for case in case_checks for output in case.outputs if output.passed @@ -161,6 +183,9 @@ def _run_case_backward( candidate: CandidateSpec, case: OperatorCase, contract: Mapping[str, Any], + *, + grad_mode: str, + grad_seed: int, ) -> CaseCheck: if not case.grad_input_names: raise ValueError(f"case {case.name!r} does not declare gradient inputs") @@ -169,8 +194,23 @@ def _run_case_backward( gold_inputs = _clone_inputs_for_backward(case.inputs, case.grad_input_names) candidate_outputs = _flatten_tensors(_call_candidate(candidate.fn, candidate_inputs)) gold_outputs = _flatten_tensors(case.gold_fn(**gold_inputs)) - candidate_grads = _backward_grads(candidate_outputs, candidate_inputs, case.grad_input_names) - gold_grads = _backward_grads(gold_outputs, gold_inputs, case.grad_input_names) + # Candidate and gold must use the same upstream gradients; otherwise we + # would compare different vector-Jacobian products. + # grad_mode="ones" is the old output.sum().backward() smoke path. + # grad_mode="random" is closer to training, where dL/doutput is non-uniform. + grad_outputs = _make_grad_outputs(candidate_outputs, grad_mode=grad_mode, seed=grad_seed) + candidate_grads = _backward_grads( + candidate_outputs, + candidate_inputs, + case.grad_input_names, + grad_outputs=grad_outputs, + ) + gold_grads = _backward_grads( + gold_outputs, + gold_inputs, + case.grad_input_names, + grad_outputs=_match_grad_outputs(grad_outputs, gold_outputs), + ) output_checks = _compare_case_outputs( candidate, case, @@ -178,6 +218,9 @@ def _run_case_backward( candidate_outputs, gold_outputs, ).outputs + # Reuse the same tolerance class for gradients as for values. This is a + # first conservative default; operator-specific gradient tolerances can be + # split out later if a real backend shows different numerical behavior. atol, rtol = _resolve_tolerance( contract, op_class=case.op_class, @@ -279,8 +322,19 @@ def _backward_grads( outputs: list[torch.Tensor], inputs: Mapping[str, Any], grad_input_names: tuple[str, ...], + *, + grad_outputs: list[torch.Tensor], ) -> list[torch.Tensor]: - loss = sum(output.float().sum() for output in outputs) + if len(outputs) != len(grad_outputs): + raise ValueError( + f"got {len(grad_outputs)} upstream gradients for {len(outputs)} outputs" + ) + # `ones` makes this equivalent to output.sum().backward(); `random` tests a + # stricter vector-Jacobian product. + loss = sum( + (output.float() * grad_output.to(device=output.device).float()).sum() + for output, grad_output in zip(outputs, grad_outputs, strict=True) + ) loss.backward() grads: list[torch.Tensor] = [] for name in grad_input_names: @@ -291,6 +345,50 @@ def _backward_grads( return grads +def _make_grad_outputs( + outputs: list[torch.Tensor], + *, + grad_mode: str, + seed: int, +) -> list[torch.Tensor]: + if grad_mode == "ones": + # All-one upstream gradients make the scalar loss equal output.sum(). + return [torch.ones_like(output, dtype=torch.float32) for output in outputs] + if grad_mode != "random": + raise ValueError(f"unsupported grad_mode: {grad_mode}") + + grad_outputs: list[torch.Tensor] = [] + generators: dict[torch.device, torch.Generator] = {} + for output in outputs: + if output.device not in generators: + # Generators are device-local; a CUDA generator cannot draw CPU tensors. + generator = torch.Generator(device=output.device) + generator.manual_seed(seed) + generators[output.device] = generator + # Random upstream gradients test a non-uniform dL/doutput. The same + # tensors are later reused for gold so the comparison stays fair. + grad_outputs.append( + torch.randn( + output.shape, + generator=generators[output.device], + device=output.device, + dtype=torch.float32, + ) + ) + return grad_outputs + + +def _match_grad_outputs( + grad_outputs: list[torch.Tensor], + outputs: list[torch.Tensor], +) -> list[torch.Tensor]: + # Reuse upstream values for gold; only move device when needed. + return [ + grad_output.to(device=output.device) + for grad_output, output in zip(grad_outputs, outputs, strict=True) + ] + + def _flatten_tensors(value: Any) -> list[torch.Tensor]: if isinstance(value, torch.Tensor): return [value] diff --git a/scripts/check_operator.py b/scripts/check_operator.py index 0b31b70..461e3f9 100644 --- a/scripts/check_operator.py +++ b/scripts/check_operator.py @@ -93,6 +93,14 @@ def parse_args() -> argparse.Namespace: help="Optional tolerance override key, for example sm90. Defaults to contract.default.", ) parser.add_argument("--check-grad", action="store_true", help="Also compare gradients for supported inputs.") + # Defaults to random because it catches bugs hidden by output.sum().backward(). + parser.add_argument( + "--grad-mode", + choices=("ones", "random"), + default="random", + help="Upstream gradient mode used with --check-grad.", + ) + parser.add_argument("--grad-seed", type=int, default=123, help="Seed for --grad-mode random.") parser.add_argument("--json", action="store_true", help="Print the full structured report as JSON.") return parser.parse_args() @@ -108,6 +116,8 @@ def main() -> None: candidates=[candidate], cases=[case], check_grad=args.check_grad, + grad_mode=args.grad_mode, + grad_seed=args.grad_seed, ) if args.json: diff --git a/tests/test_op_checks.py b/tests/test_op_checks.py index b28148b..74de3b6 100644 --- a/tests/test_op_checks.py +++ b/tests/test_op_checks.py @@ -170,3 +170,52 @@ def bad_grad_logp(logits, token_ids): assert not report.passed assert gradient_output.message == "gradient:logits" assert gradient_output.max_abs_error > 0.0 + + +def test_random_grad_mode_catches_nonuniform_upstream_gradient_bug(): + # Forward is identity, so only a non-uniform upstream gradient can expose + # the intentionally wrong backward below. + class MeanUpstreamIdentity(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + return x.clone() + + @staticmethod + def backward(ctx, grad_output): + # Wrong for random upstream gradients, but correct when all values are 1. + return grad_output.mean().expand_as(grad_output) + + def bad_identity(x): + return MeanUpstreamIdentity.apply(x) + + case = OperatorCase( + name="identity", + op_class="elementwise", + dtype=torch.float32, + inputs={"x": torch.randn(8, dtype=torch.float32)}, + gold_fn=lambda x: x, + grad_input_names=("x",), + ) + + ones_report = run_operator_suite( + "identity", + candidates=[CandidateSpec(name="bad-identity", backend="test", fn=bad_identity)], + cases=[case], + check_grad=True, + grad_mode="ones", + ) + # ones passes by design; random must fail and prove the stricter path works. + random_report = run_operator_suite( + "identity", + candidates=[CandidateSpec(name="bad-identity", backend="test", fn=bad_identity)], + cases=[case], + check_grad=True, + grad_mode="random", + grad_seed=7, + ) + + assert ones_report.passed + gradient_output = random_report.candidates[0].cases[0].outputs[1] + assert not random_report.passed + assert gradient_output.message == "gradient:x" + assert gradient_output.max_abs_error > 0.0 From 4fbb7fdf40439adb5a708e5fd0257a6f9cd9daa6 Mon Sep 17 00:00:00 2001 From: a-kaa Date: Thu, 2 Jul 2026 09:19:33 +0800 Subject: [PATCH 10/11] Remove registry candidate from gtest checker --- docs/contributing/issue-108-session-log.md | 5 ++--- rl_engine/kernels/gtest/operator_specs.py | 15 +-------------- scripts/check_operator.py | 4 ++-- tests/test_op_checks.py | 19 ------------------- 4 files changed, 5 insertions(+), 38 deletions(-) diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md index cd8c0f7..6261d7c 100644 --- a/docs/contributing/issue-108-session-log.md +++ b/docs/contributing/issue-108-session-log.md @@ -143,13 +143,13 @@ candidates: cuda -> FusedLogpGenericOp cuda-generic -> FusedLogpGenericOp cuda-sm90 -> FusedLogpSM90Op - registry -> kernel_registry.get_op("logp") ``` Important note: - `candidate=pytorch` is only a smoke test for the checker itself. - CUDA, Triton, ROCm, and future hardware-specific implementations are candidates. +- Registry dispatch is tested separately from this accuracy harness because it is hardware-dependent. - Do not compare two operators that implement different math, such as ordinary `logp` and `linear_logp`. ### SM90 Adapter Exception @@ -190,7 +190,7 @@ Supported key options: ```text --op Operator name. The minimal version supports logp and linear_logp. ---candidate Candidate backend, for example pytorch, cuda, cuda-generic, cuda-sm90, registry. +--candidate Candidate backend, for example pytorch, cuda, cuda-generic, cuda-sm90, triton. --dtype fp32, bf16, or fp16. --device auto, cpu, cuda, or another torch device string. --arch-key Optional tolerance override key such as sm90. @@ -276,7 +276,6 @@ Add an `OperatorSpec` entry: name="new_op", op_class="elementwise", gold_path="rl_engine.kernels.ops.pytorch....NativeNewOp", - registry_name="new_op", candidate_paths={ "pytorch": "rl_engine.kernels.ops.pytorch....NativeNewOp", "cuda": "rl_engine.kernels.ops.cuda....CudaNewOp", diff --git a/rl_engine/kernels/gtest/operator_specs.py b/rl_engine/kernels/gtest/operator_specs.py index 334224b..99c45ec 100644 --- a/rl_engine/kernels/gtest/operator_specs.py +++ b/rl_engine/kernels/gtest/operator_specs.py @@ -20,7 +20,6 @@ class OperatorSpec: op_class: str gold_path: str gold_method: str - registry_name: str candidate_paths: dict[str, str] grad_input_names: tuple[str, ...] = () @@ -38,7 +37,6 @@ def _load_object(path: str) -> Any: op_class="logprob", gold_path="rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp", gold_method="forward_fp32", - registry_name="logp", candidate_paths={ "pytorch": "rl_engine.kernels.ops.pytorch.loss.logp.NativeLogpOp", "cuda": "rl_engine.kernels.ops.cuda.loss.logp.FusedLogpGenericOp", @@ -52,7 +50,6 @@ def _load_object(path: str) -> Any: op_class="logprob", gold_path="rl_engine.kernels.ops.pytorch.loss.linear_logp.NativeLinearLogpOp", gold_method="apply", - registry_name="linear_logp", candidate_paths={ "pytorch": "rl_engine.kernels.ops.pytorch.loss.linear_logp.NativeLinearLogpOp", "triton": "rl_engine.kernels.ops.triton.loss.linear_logp.TritonLinearLogpOp", @@ -109,17 +106,7 @@ def make_candidate(args: argparse.Namespace) -> CandidateSpec: fn=candidate_op, ) - if candidate_name == "registry": - from rl_engine.kernels.registry import kernel_registry - - return CandidateSpec( - name=f"registry-{args.op}", - backend="registry", - arch_key=args.arch_key, - fn=kernel_registry.get_op(spec.registry_name), - ) - - supported = sorted([*spec.candidate_paths, "native", "registry"]) + supported = sorted([*spec.candidate_paths, "native"]) raise ValueError( f"unsupported candidate {args.candidate!r} for op {args.op!r}; " f"supported candidates: {', '.join(supported)}" diff --git a/scripts/check_operator.py b/scripts/check_operator.py index 461e3f9..677f01b 100644 --- a/scripts/check_operator.py +++ b/scripts/check_operator.py @@ -70,8 +70,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--op", choices=operator_names(), default="logp") parser.add_argument( "--candidate", - default="registry", - help="Candidate backend to validate, for example registry, pytorch, cuda, cuda-sm90.", + default="pytorch", + help="Candidate backend to validate, for example pytorch, cuda, cuda-sm90, triton.", ) parser.add_argument("--dtype", choices=("fp32", "bf16", "fp16"), default="fp32") parser.add_argument("--device", default="auto") diff --git a/tests/test_op_checks.py b/tests/test_op_checks.py index 74de3b6..8f08d87 100644 --- a/tests/test_op_checks.py +++ b/tests/test_op_checks.py @@ -51,25 +51,6 @@ def test_logp_native_candidate_suite_passes(): assert all(case.passed for case in report.candidates[0].cases) -def test_logp_registry_candidate_suite_passes_on_cpu(): - from rl_engine.kernels.registry import kernel_registry - - report = run_operator_suite( - "logp", - candidates=[ - CandidateSpec( - name="registry-logp", - backend="registry", - fn=kernel_registry.get_op("logp"), - ) - ], - cases=[_logp_case("fp32", torch.float32, seed=4)], - ) - - assert report.passed - assert report.candidates[0].candidate_name == "registry-logp" - - def test_suite_reports_failure_for_bad_candidate(): def bad_logp(logits, token_ids): del token_ids From 347726b4747fa5ffe6c7cc9bf21773b57551117d Mon Sep 17 00:00:00 2001 From: a-kaa Date: Thu, 2 Jul 2026 09:21:37 +0800 Subject: [PATCH 11/11] Rename tolerance contract to json --- docs/contributing/issue-108-session-log.md | 6 +++--- rl_engine/kernels/gtest/tolerance.py | 2 +- .../{tolerance_contract.yaml => tolerance_contract.json} | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename rl_engine/kernels/gtest/{tolerance_contract.yaml => tolerance_contract.json} (100%) diff --git a/docs/contributing/issue-108-session-log.md b/docs/contributing/issue-108-session-log.md index 6261d7c..a6aa9c1 100644 --- a/docs/contributing/issue-108-session-log.md +++ b/docs/contributing/issue-108-session-log.md @@ -31,7 +31,7 @@ rl_engine/kernels/gtest/ operator_inputs.py operator_specs.py tolerance.py - tolerance_contract.yaml + tolerance_contract.json scripts/check_operator.py @@ -48,7 +48,7 @@ Files: ```text rl_engine/kernels/gtest/tolerance.py -rl_engine/kernels/gtest/tolerance_contract.yaml +rl_engine/kernels/gtest/tolerance_contract.json tests/test_tolerance_contract.py ``` @@ -296,7 +296,7 @@ Rules: File: ```text -rl_engine/kernels/gtest/tolerance_contract.yaml +rl_engine/kernels/gtest/tolerance_contract.json ``` Reuse an existing class when possible: diff --git a/rl_engine/kernels/gtest/tolerance.py b/rl_engine/kernels/gtest/tolerance.py index 9a1cc0b..3265a45 100644 --- a/rl_engine/kernels/gtest/tolerance.py +++ b/rl_engine/kernels/gtest/tolerance.py @@ -8,7 +8,7 @@ from typing import Any -_CONTRACT_PATH = Path(__file__).with_name("tolerance_contract.yaml") +_CONTRACT_PATH = Path(__file__).with_name("tolerance_contract.json") def load_contract(path: str | Path = _CONTRACT_PATH) -> dict[str, Any]: diff --git a/rl_engine/kernels/gtest/tolerance_contract.yaml b/rl_engine/kernels/gtest/tolerance_contract.json similarity index 100% rename from rl_engine/kernels/gtest/tolerance_contract.yaml rename to rl_engine/kernels/gtest/tolerance_contract.json