From ba08156a28405594d9a6fd292f5515752e947dca Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 05:29:35 -0400
Subject: [PATCH 01/22] =?UTF-8?q?feat(memory):=20=E6=B7=BB=E5=8A=A0=20P0?=
 =?UTF-8?q?=20=E8=AE=B0=E5=BF=86=E6=B6=88=E8=9E=8D=20harness=20=E4=B8=8E?=
 =?UTF-8?q?=E6=B3=A8=E5=85=A5=E5=BC=80=E5=85=B3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

激活方案 Phase 0 的离线消融能力，给此前无生产 caller 的
compare_memory_effectiveness 接上真实入口，并补齐文档误以为已存在、
实则缺失的 memory=off 能力。

**消融开关**
- MemoryExtractionConfig.inject_enabled（默认 True）：False 时
  orchestrator._inject_memory 跳过 set_memory_store，使 get_memory_context
  返回空 = memory=off 臂；抽取/写回经 orchestrator 自有 store 不受影响

**离线 harness** (src/tools/memory_replay.py，纯读)
- load_effectiveness_report：从 run 目录或 json 载入 memory_effectiveness.json
- build_ablation_comparison：包装 compare_memory_effectiveness（首个真实 caller）
- render_ablation_table：输出 memory_decision_lift 对比表

**CLI**
- merge eval-memory --on --off [--out]：载入两臂报告→对比→打印表→可选落 JSON

**测试** tests/unit/test_memory_replay.py：12 用例覆盖 load/compare/render
与开关真实接线（断言 inject_enabled=False 不 wire store）
---
 CLAUDE.md                        |   5 +
 src/cli/main.py                  |  58 ++++++++++
 src/core/orchestrator.py         |   8 +-
 src/models/config.py             |   9 ++
 src/tools/memory_replay.py       |  98 ++++++++++++++++
 tests/unit/test_memory_replay.py | 192 +++++++++++++++++++++++++++++++
 6 files changed, 369 insertions(+), 1 deletion(-)
 create mode 100644 src/tools/memory_replay.py
 create mode 100644 tests/unit/test_memory_replay.py
diff --git a/CLAUDE.md b/CLAUDE.md
index 81b9096..d69c086 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -37,8 +37,13 @@ merge validate --config <path>   # validate config + env vars
 merge init [--repo-path .]       # generate per-target CLAUDE.md for merge decisions
 merge plan-suggest [--target ... --candidates ...]   # enumerate baseline commit-windows
 merge forks-profile init         # scaffold .merge/forks-profile.yaml (recommended ≥30 fork-deleted files)
+merge eval-memory --on <run|json> --off <run|json> [--out <path>]   # P0 memory ablation: compare memory=on vs memory=off effectiveness reports
 ```
 
+To produce a `memory=off` run for the ablation, set `memory.inject_enabled: false`
+in `.merge/config.yaml` and re-run on the same dataset; each run persists a
+`memory_effectiveness.json` under its run dir that `merge eval-memory` consumes.
+
 ## Required Environment Variables
 
 Each agent reads its API key from its own env var — no key is hardcoded:
diff --git a/src/cli/main.py b/src/cli/main.py
index 6bc1f35..0dd75ca 100644
--- a/src/cli/main.py
+++ b/src/cli/main.py
@@ -462,6 +462,64 @@ def validate_config_and_env(config: MergeConfig) -> list[str]:
     return errors
 
 
+@cli.command("eval-memory")
+@click.option(
+    "--on",
+    "on_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="memory=on run: a memory_effectiveness.json file or its run directory",
+)
+@click.option(
+    "--off",
+    "off_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="memory=off run: a memory_effectiveness.json file or its run directory "
+    "(produce one by setting memory.inject_enabled: false in config.yaml)",
+)
+@click.option(
+    "--out",
+    "out_path",
+    required=False,
+    default=None,
+    type=click.Path(),
+    help="optional path to write the ablation comparison as JSON",
+)
+def eval_memory_command(on_path: str, off_path: str, out_path: str | None) -> None:
+    """P0: compare a memory=on vs memory=off run and report the decision lift.
+
+    Offline and read-only — consumes the memory_effectiveness.json each run
+    persists at report time. The acceptance gate (lift > 0 AND harmful rate
+    not rising) is defined in doc/evaluation/acceptance.md.
+    """
+    from src.tools.memory_replay import (
+        build_ablation_comparison,
+        load_effectiveness_report,
+        render_ablation_table,
+    )
+
+    try:
+        report_on = load_effectiveness_report(on_path)
+        report_off = load_effectiveness_report(off_path)
+    except (OSError, ValueError) as e:  # unreadable path or invalid JSON/schema
+        console.print(f"[red]Failed to load effectiveness report: {e}[/red]")
+        sys.exit(1)
+
+    comparison = build_ablation_comparison(report_on, report_off)
+    console.print(render_ablation_table(comparison))
+
+    if out_path:
+        try:
+            Path(out_path).write_text(
+                comparison.model_dump_json(indent=2), encoding="utf-8"
+            )
+            console.print(f"[green]Wrote ablation comparison to {out_path}[/green]")
+        except OSError as e:
+            console.print(f"[red]Failed to write {out_path}: {e}[/red]")
+            sys.exit(1)
+
+
 cli.add_command(_forks_profile_group)
 
 
diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py
index c340c3a..bd09ed8 100644
--- a/src/core/orchestrator.py
+++ b/src/core/orchestrator.py
@@ -597,8 +597,14 @@ def _should_llm_extract(self, phase: str, state: MergeState) -> bool:
 
     def _inject_memory(self) -> None:
         memory_cfg = getattr(self.config, "memory", None)
+        # P0 ablation: when inject_enabled is False, leave each agent's store
+        # at None so get_memory_context() returns "" — the "memory=off" arm.
+        # Extraction/write-back still run at the orchestrator level; only
+        # read-time prompt injection is suppressed.
+        inject_enabled = getattr(memory_cfg, "inject_enabled", True)
         for agent in self._all_agents:
-            agent.set_memory_store(self._memory_store)  # type: ignore[arg-type]
+            if inject_enabled:
+                agent.set_memory_store(self._memory_store)  # type: ignore[arg-type]
             agent.set_memory_hit_tracker(self._memory_hit_tracker)
             agent.set_memory_config(memory_cfg)
             agent.set_upstream_ref(self.config.upstream_ref)
diff --git a/src/models/config.py b/src/models/config.py
index 7fbdf35..872fe91 100644
--- a/src/models/config.py
+++ b/src/models/config.py
@@ -961,6 +961,15 @@ class MemoryExtractionConfig(BaseModel):
         description="OPP-5: minimum pass+fail observations before an entry's "
         "confidence is nudged, so a single run cannot move it.",
     )
+    inject_enabled: bool = Field(
+        default=True,
+        description="P0 ablation switch: when False, no memory context is "
+        "injected into any agent prompt (the orchestrator skips wiring the "
+        "store onto agents, so get_memory_context returns empty). Used to "
+        "produce the 'memory=off' arm of the memory-effectiveness ablation "
+        "(merge eval-memory). Extraction/write-back are unaffected — only "
+        "read-time injection is suppressed. Default True (normal behaviour).",
+    )
 
 
 class RenameDetectionConfig(BaseModel):
diff --git a/src/tools/memory_replay.py b/src/tools/memory_replay.py
new file mode 100644
index 0000000..d98fe37
--- /dev/null
+++ b/src/tools/memory_replay.py
@@ -0,0 +1,98 @@
+"""P0: offline memory ablation harness (read-only).
+
+Consumes the ``memory_effectiveness.json`` artifacts that a run persists at
+report time (one from a ``memory=on`` run, one from a ``memory=off`` run on
+the same dataset — see ``MemoryExtractionConfig.inject_enabled``) and produces
+the ablation comparison that answers "did injected memory actually improve
+merge decisions?". This is the first real caller of
+``compare_memory_effectiveness``.
+
+Pure and offline: it reads already-persisted JSON, makes no LLM calls, and
+never touches a decision path. The acceptance gate (lift > 0 AND harmful rate
+not rising) is defined in ``doc/evaluation/acceptance.md``; this module only
+loads, compares, and renders.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from src.models.memory_effectiveness import (
+    MemoryAblationComparison,
+    MemoryEffectivenessReport,
+)
+from src.tools.memory_eval import compare_memory_effectiveness
+
+REPORT_FILENAME = "memory_effectiveness.json"
+
+
+def _resolve_report_path(path: str | Path) -> Path:
+    """Resolve a user-supplied path to the effectiveness JSON file.
+
+    Accepts either the JSON file directly or a run directory containing
+    ``memory_effectiveness.json``. Raises ``FileNotFoundError`` with an
+    actionable message when neither resolves.
+    """
+    p = Path(path)
+    if p.is_dir():
+        candidate = p / REPORT_FILENAME
+        if not candidate.is_file():
+            raise FileNotFoundError(
+                f"no {REPORT_FILENAME} in run directory {p} — was the run "
+                f"completed with memory effectiveness reporting enabled?"
+            )
+        return candidate
+    if not p.is_file():
+        raise FileNotFoundError(
+            f"effectiveness report not found: {p} (expected a "
+            f"{REPORT_FILENAME} file or a run directory containing it)"
+        )
+    return p
+
+
+def load_effectiveness_report(path: str | Path) -> MemoryEffectivenessReport:
+    """Load a persisted ``MemoryEffectivenessReport`` from a JSON file or run dir."""
+    report_path = _resolve_report_path(path)
+    raw = json.loads(report_path.read_text(encoding="utf-8"))
+    return MemoryEffectivenessReport.model_validate(raw)
+
+
+def build_ablation_comparison(
+    memory_on: MemoryEffectivenessReport,
+    memory_off: MemoryEffectivenessReport,
+) -> MemoryAblationComparison:
+    """Compare the on/off effectiveness reports (wraps the eval analyzer)."""
+    return compare_memory_effectiveness(memory_on, memory_off)
+
+
+def _pct(value: float) -> str:
+    return f"{value * 100:.2f}%"
+
+
+def render_ablation_table(cmp: MemoryAblationComparison) -> str:
+    """Render the ablation comparison as a plain markdown table.
+
+    The verdict line restates the convenience ``memory_beneficial`` flag
+    (lift > 0); the full acceptance gate also requires the harmful-influence
+    rate not to rise over time (see ``doc/evaluation/acceptance.md``).
+    """
+    lift = cmp.memory_decision_lift
+    sign = "+" if lift > 0 else ""
+    verdict = "BENEFICIAL (lift > 0)" if cmp.memory_beneficial else "NOT beneficial"
+    return "\n".join(
+        [
+            "| Metric | memory=on | memory=off |",
+            "|---|---|---|",
+            f"| run_id | `{cmp.on_run_id}` | `{cmp.off_run_id}` |",
+            f"| overall_correct_rate | {_pct(cmp.overall_correct_rate_on)} "
+            f"| {_pct(cmp.overall_correct_rate_off)} |",
+            "",
+            f"**memory_decision_lift**: {sign}{lift:.4f} "
+            f"({_pct(lift) if lift >= 0 else '-' + _pct(-lift)})",
+            "",
+            f"**harmful_influence_rate (on)**: {_pct(cmp.harmful_influence_rate_on)}",
+            "",
+            f"**Verdict**: {verdict}",
+        ]
+    )
diff --git a/tests/unit/test_memory_replay.py b/tests/unit/test_memory_replay.py
new file mode 100644
index 0000000..02e93ca
--- /dev/null
+++ b/tests/unit/test_memory_replay.py
@@ -0,0 +1,192 @@
+"""PR-0a: offline memory ablation harness + inject_enabled switch tests."""
+
+import json
+
+import pytest
+
+from src.models.config import MemoryExtractionConfig
+from src.models.memory_effectiveness import MemoryEffectivenessReport
+from src.tools.memory_replay import (
+    REPORT_FILENAME,
+    build_ablation_comparison,
+    load_effectiveness_report,
+    render_ablation_table,
+)
+
+
+def _report(run_id: str, correct_rate: float, harmful_rate: float = 0.0):
+    return MemoryEffectivenessReport(
+        run_id=run_id,
+        total_judged_decisions=10,
+        overall_correct_rate=correct_rate,
+        memory_influenced_decisions=4,
+        correct_after_influence=3,
+        harmful_influence_count=1,
+        correct_rate_after_influence=0.75,
+        harmful_influence_rate=harmful_rate,
+        total_tracked_entries=2,
+        effective_observations=4,
+    )
+
+
+# --- loading ----------------------------------------------------------------
+
+
+def test_load_from_json_file(tmp_path):
+    report = _report("run-on", 0.9)
+    p = tmp_path / REPORT_FILENAME
+    p.write_text(report.model_dump_json(), encoding="utf-8")
+    loaded = load_effectiveness_report(p)
+    assert loaded == report
+
+
+def test_load_from_run_directory(tmp_path):
+    report = _report("run-off", 0.7)
+    (tmp_path / REPORT_FILENAME).write_text(report.model_dump_json(), encoding="utf-8")
+    loaded = load_effectiveness_report(tmp_path)
+    assert loaded.run_id == "run-off"
+
+
+def test_load_missing_file_raises(tmp_path):
+    with pytest.raises(FileNotFoundError):
+        load_effectiveness_report(tmp_path / "nope.json")
+
+
+def test_load_dir_without_report_raises(tmp_path):
+    with pytest.raises(FileNotFoundError, match=REPORT_FILENAME):
+        load_effectiveness_report(tmp_path)
+
+
+def test_load_rejects_malformed_json(tmp_path):
+    p = tmp_path / REPORT_FILENAME
+    p.write_text(json.dumps({"run_id": "x"}), encoding="utf-8")  # missing fields
+    with pytest.raises(ValueError):  # the type the CLI's except clause relies on
+        load_effectiveness_report(p)
+
+
+# --- comparison -------------------------------------------------------------
+
+
+def test_comparison_positive_lift():
+    cmp = build_ablation_comparison(_report("on", 0.9), _report("off", 0.7))
+    assert cmp.memory_decision_lift == pytest.approx(0.2)
+    assert cmp.memory_beneficial is True
+    assert cmp.on_run_id == "on"
+    assert cmp.off_run_id == "off"
+
+
+def test_comparison_non_positive_lift_not_beneficial():
+    cmp = build_ablation_comparison(_report("on", 0.7), _report("off", 0.7))
+    assert cmp.memory_decision_lift == pytest.approx(0.0)
+    assert cmp.memory_beneficial is False
+
+
+# --- rendering --------------------------------------------------------------
+
+
+def test_render_table_contains_key_figures():
+    cmp = build_ablation_comparison(_report("on", 0.9, 0.25), _report("off", 0.7))
+    table = render_ablation_table(cmp)
+    assert "memory_decision_lift" in table
+    assert "BENEFICIAL" in table
+    assert "25.00%" in table  # harmful_influence_rate_on
+    assert "`on`" in table and "`off`" in table
+
+
+def test_render_table_negative_lift():
+    cmp = build_ablation_comparison(_report("on", 0.6), _report("off", 0.8))
+    table = render_ablation_table(cmp)
+    assert "NOT beneficial" in table
+    assert "-0.2000" in table  # a bare "-" also matches the |---| separator row
+
+
+# --- inject_enabled ablation switch -----------------------------------------
+
+
+def test_inject_enabled_defaults_true():
+    assert MemoryExtractionConfig().inject_enabled is True
+
+
+def test_inject_disabled_skips_store_wiring():
+    """When inject_enabled is False, _inject_memory must leave each agent's
+    store at None so get_memory_context() returns empty (the memory=off arm)."""
+
+    class _Agent:
+        def __init__(self):
+            self.store = "UNSET"
+            self.tracker = None
+            self.cfg = None
+            self.upstream = None
+
+        def set_memory_store(self, store):
+            self.store = store
+
+        def set_memory_hit_tracker(self, tracker):
+            self.tracker = tracker
+
+        def set_memory_config(self, cfg):
+            self.cfg = cfg
+
+        def set_upstream_ref(self, ref):
+            self.upstream = ref
+
+    class _Cfg:
+        memory = MemoryExtractionConfig(inject_enabled=False)
+        upstream_ref = "upstream/main"
+
+    class _Orch:
+        config = _Cfg()
+        _memory_store = object()
+        _memory_hit_tracker = object()
+
+        def __init__(self):
+            self._all_agents = [_Agent()]
+
+    from src.core.orchestrator import Orchestrator
+
+    orch = _Orch()
+    Orchestrator._inject_memory(orch)  # type: ignore[arg-type]
+    agent = orch._all_agents[0]
+    # store-wiring skipped → stays at the sentinel "UNSET" (never set to None
+    # either, but crucially never set to the real store)
+    assert agent.store == "UNSET"
+    assert agent.tracker is orch._memory_hit_tracker
+    assert agent.cfg is orch.config.memory
+
+
+def test_inject_enabled_wires_store():
+    class _Agent:
+        store = None
+        tracker = None
+        cfg = None
+        upstream = None
+
+        def set_memory_store(self, store):
+            self.store = store
+
+        def set_memory_hit_tracker(self, tracker):
+            self.tracker = tracker
+
+        def set_memory_config(self, cfg):
+            self.cfg = cfg
+
+        def set_upstream_ref(self, ref):
+            self.upstream = ref
+
+    class _Cfg:
+        memory = MemoryExtractionConfig(inject_enabled=True)
+        upstream_ref = "upstream/main"
+
+    class _Orch:
+        config = _Cfg()
+        _memory_store = object()
+        _memory_hit_tracker = object()
+
+        def __init__(self):
+            self._all_agents = [_Agent()]
+
+    from src.core.orchestrator import Orchestrator
+
+    orch = _Orch()
+    Orchestrator._inject_memory(orch)  # type: ignore[arg-type]
+    assert orch._all_agents[0].store is orch._memory_store

From 0ecba9552823a07f650abf578a2d927d1217e167 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 05:29:45 -0400
Subject: [PATCH 02/22] =?UTF-8?q?docs(eval):=20=E5=AE=9A=E4=B9=89=20P0=20?=
 =?UTF-8?q?=E8=AE=B0=E5=BF=86=E6=9C=89=E6=95=88=E6=80=A7=E6=8C=87=E6=A0=87?=
 =?UTF-8?q?=E4=B8=8E=E5=8F=8D=E9=A6=88=E7=8E=AF=E6=BF=80=E6=B4=BB=E9=97=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

把 PR-0a 产出的度量写进权威评估文档，并把"反馈环默认开启"的硬前置
从代码注释固化为可评审验收门（兑现方案原则 P2 先度量再激活）。

- metrics.md §9：记忆有效性指标 MDL/HIR/CRI/MID/PEE/MCPD，含公式、
  数据源（对齐 memory_effectiveness.json 字段）、信号通路；标注
  P1-C/P2-B 待补指标
- acceptance.md §3：自学习反馈环激活门（MDL>0 硬前置 + HIR 不升 +
  CRI≥off基线 + MCPD≤×1.15）含判定流程；原 §3–§6 顺延为 §4–§7
- dependency-graph-optimization-plan.md：修正顺延失效的交叉引用
  acceptance.md §3→§4（report schema）

激活门为独立章节（≠ 合并质量门），acceptance_thresholds.yaml 仅镜像
§1/§2 故不动。
---
 doc/evaluation/acceptance.md                  | 34 +++++++-
 doc/evaluation/metrics.md                     | 83 +++++++++++++++++++
 .../dependency-graph-optimization-plan.md     |  2 +-
 3 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/doc/evaluation/acceptance.md b/doc/evaluation/acceptance.md
index 056d0d5..811c3c1 100644
--- a/doc/evaluation/acceptance.md
+++ b/doc/evaluation/acceptance.md
@@ -42,7 +42,33 @@
 
 ---
 
-## 3. 报告必备元数据
+## 3. 自学习反馈环激活门（Phase 1 前置）
+
+> 这组门**不**判定一次合并 run 的好坏，而是决定自学习方案
+> （`doc/plan/self-learning-system.md`）的反馈环——OPP-5 写回（P1-B）、持久化
+> suppress（P1-A）——能否从 opt-in 翻为**默认开启**。设计原则 P2「先度量再激活」：
+> 任一反馈环默认开启前，必须先用 `merge eval-memory` 在固定数据集上跑出消融基线
+> 证明净收益为正。指标定义见 metrics.md §9。
+
+| 门 | 阈值 | 数据源 | 作用 |
+|---|---|---|---|
+| `MDL` 记忆决策增益 | **> 0** | `merge eval-memory`（on/off 消融）| 任一反馈环默认开启的**硬前置**；≤ 0 则保持 opt-in |
+| `HIR` 有害影响率 | **不高于同数据集 off 基线** | `memory_effectiveness.json` | 上升即说明记忆在污染决策，禁止默认开启 |
+| `CRI` 影响后正确率 | **≥ off 基线 overall_correct_rate** | `memory_effectiveness.json` | 被记忆改变的决策不得比无记忆更差 |
+| `MCPD` 单决策记忆成本 | **≤ off 基线 × 1.15** | `CostTracker` | 防止记忆注入让 prompt 成本悄悄回退 |
+
+**判定流程**：
+1. 同数据集跑 `memory=on`（默认）与 `memory=off`（config `memory.inject_enabled: false`）两 run；
+2. `merge eval-memory --on <on_run> --off <off_run>` 产出 `MemoryAblationComparison`；
+3. 四道门全部通过（`MDL > 0`、`HIR` 不升、`CRI` ≥ off 基线、`MCPD` ≤ off 基线 × 1.15）→ 允许把对应反馈环 default 翻为 `True`，并在本文件 §5 历史区记录基线数；
+4. 任一门未过 → 反馈环维持 opt-in，记录原因。
+
+> 这是"默认开启"的闸口，不是合并质量的一票否决；故归为独立章节，与 §1/§2 的合并
+> 质量门互不替代。
+
+---
+
+## 4. 报告必备元数据
 
 `eval_acceptance_<version>.json` 必须含：
 
@@ -70,7 +96,7 @@
 
 ---
 
-## 4. 版本基线历史
+## 5. 版本基线历史
 
 | 版本 | 评估时间 | 数据集 lock | 主要结果 | 备注 |
 |---|---|---|---|---|
@@ -80,7 +106,7 @@
 
 ---
 
-## 5. 阈值修改流程
+## 6. 阈值修改流程
 
 修改任何阈值必须：
 
@@ -91,7 +117,7 @@
 
 ---
 
-## 6. 用户对外承诺模板
+## 7. 用户对外承诺模板
 
 通过 acceptance gate 后，可向用户输出如下承诺（示例）：
 
diff --git a/doc/evaluation/metrics.md b/doc/evaluation/metrics.md
index fe87011..a563cb1 100644
--- a/doc/evaluation/metrics.md
+++ b/doc/evaluation/metrics.md
@@ -320,3 +320,86 @@ BCP = | 配置了 build_check 且退出码 0 的 run | / | 配置了 build_check
 数据源：judge 阶段 `_run_build_check`（command 由 setup 自动探测填充，方案1）。非零退出
 把 Judge PASS 降级 FAIL+veto。Acceptance（Soft）: **BCP = 100%**（仅统计已配置 command 的
 run；未探测到工具链的目标不计入分母）。
+
+---
+
+## 9. 记忆有效性指标（自学习度量，P0 底座）
+
+这一组指标量化"注入的跨 run 记忆是否真的让合并决策更好"——自学习方案
+（`doc/plan/self-learning-system.md`）的开放问题 1。全部**只读、执行接地**：正确/有害
+信号取自 Judge 终判的 `passed_files` / `failed_files`（与 `record_outcome` 同源），不取
+LLM 自报。
+
+> 信号通路：`MemoryHitTracker` 记录本 run 每个文件的记忆注入 → report 阶段
+> `compute_memory_effectiveness`（`src/tools/memory_eval.py`）与 Judge verdict 求交集 →
+> 持久化 `runs/<id>/memory_effectiveness.json`。两次 run（`memory=on` vs
+> `memory=off`，由 `memory.inject_enabled` 切换）的报告经 `merge eval-memory`
+> （`src/tools/memory_replay.py`）对比产出 §9.1。
+>
+> **影响决策口径**：`influenced = injected_files ∩ (passed_files ∪ failed_files)`。
+> 注入图为 run-local（不持久化），故 §9.2–§9.4 是单 run 量；§9.5 的 per-entry 功过
+> 经 tracker sidecar 跨 run 累计。
+
+### 9.1 记忆决策增益（Memory Decision Lift, MDL）
+
+> 消融口径：同一数据集、同配置跑两遍，仅 `memory.inject_enabled` 不同。
+
+```
+MDL = overall_correct_rate(memory=on) − overall_correct_rate(memory=off)
+overall_correct_rate = |passed_files| / (|passed_files| + |failed_files|)
+```
+
+数据源：`MemoryAblationComparison.memory_decision_lift`。**MDL > 0 是"学到了"的
+最小证据**，也是 Phase 1 任一反馈环默认开启的硬前置（见 acceptance.md §3）。
+
+### 9.2 有害影响率（Harmful Influence Rate, HIR）
+
+> 被记忆注入"影响"且最终 fail 的决策占比——F2（检索污染/有害记忆）的直接度量。
+
+```
+HIR = |injected ∩ failed_files| / |influenced|        （influenced=0 时记 0）
+```
+
+数据源：`MemoryEffectivenessReport.harmful_influence_rate`。P1-A（持久化 suppress）的
+优化目标是在"tracker 重置"场景下 HIR 不回升。
+
+### 9.3 影响后正确率（Correct Rate After Influence, CRI）
+
+```
+CRI = |injected ∩ passed_files| / |influenced|        （influenced=0 时记 0）
+```
+
+数据源：`MemoryEffectivenessReport.correct_rate_after_influence`。P1-B（激活并加固
+OPP-5 写回）的优化目标是 CRI 上升、per-entry 分布右移。
+
+### 9.4 影响决策数（Memory Influenced Decisions, MID）
+
+```
+MID = |injected_files ∩ (passed_files ∪ failed_files)|
+```
+
+数据源：`MemoryEffectivenessReport.memory_influenced_decisions`。MID 是 §9.2/§9.3 的
+分母——MID 过小（如 < 5）时，HIR/CRI 抽样不足，MDL 才是更稳的总体判据。
+
+### 9.5 单条目有效性（Per-Entry Effectiveness, PEE）
+
+```
+PEE[e] = (pass[e] − fail[e]) / (pass[e] + fail[e])   ∈ [−1, +1]
+```
+
+数据源：`MemoryHitTracker.outcome_scores()` / `summary()['outcomes']` 的 top_helpful /
+top_harmful 榜（跨 run 累计）。`PEE ≤ −0.5 且 min_observations 满足` 即 `harmful_entry_ids`
+判据——O-M6 注入期过滤的依据，也是 P1-A 固化 suppress 的输入。
+
+### 9.6 单决策记忆成本（Memory Cost Per Decision, MCPD）
+
+```
+MCPD = cost_usd_per_run / F_eval
+```
+
+数据源：`CostTracker` + `F_eval`。记忆注入增大 prompt，开启反馈环不得让 MCPD 显著上升
+（acceptance.md §3）。
+
+> **后续指标（Phase 1-C / 2-B 落地后补充）**：`repeat_error_repair_rounds`（同
+> error_signature 平均修复轮数，需 P1-C 的 `summarize_judge_repair_rounds` 按签名聚合）、
+> `memory_drift_loss`（consolidation 前后 pinned 条目内容差异，P2-B，期望 = 0）。
diff --git a/doc/references/dependency-graph-optimization-plan.md b/doc/references/dependency-graph-optimization-plan.md
index f1050c3..2928575 100644
--- a/doc/references/dependency-graph-optimization-plan.md
+++ b/doc/references/dependency-graph-optimization-plan.md
@@ -350,7 +350,7 @@ God Node（高 degree 节点）→ 命中的改动文件风险提升。
 1. **冒烟**：forgejo 上各跑一次 Control / Treatment-core，diff 三份报告，确认每个消费方 ≥1 次触发（§11.3 计数）——先证「活」。
 2. **小集 A/B**：3 个 C-class + `t1-0031..0033` 各跑 N=3（`DET` 口径），算聚焦指标位移。
 3. **归因**（若收益显著）：按 §11.1 单开关回退定位主要贡献者。
-4. **记录**：结果落 `eval_acceptance_<sha>.json`（acceptance.md §3 schema），基线历史表追加一行；
+4. **记录**：结果落 `eval_acceptance_<sha>.json`（acceptance.md §4 schema），基线历史表追加一行；
    用本轮真实数据回填 §7.3/§8.5 标注「未标定」的阈值（`god_node_min_dependents=8` / `god_node_risk_bump=0.15` /
    `_MAX_TOPO_ISSUES=25`）的标定建议。
 

From 2a5ddd349e22df3a14ae4815c2ec8fc4d8894026 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 05:32:49 -0400
Subject: [PATCH 03/22] =?UTF-8?q?feat(web):=20=E5=86=B2=E7=AA=81=E5=86=B3?=
 =?UTF-8?q?=E7=AD=96=E6=8F=90=E4=BA=A4=E5=90=8E=E8=87=AA=E5=8A=A8=E8=B7=B3?=
 =?UTF-8?q?=E8=BD=AC=E4=B8=8B=E4=B8=80=E4=B8=AA=E5=BE=85=E5=86=B3=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

多文件人工决策时,提交一个文件后视图仍停留在已决文件上,剩余
待决文件(尤其是 conflict_points 为空、右侧无明细的升级文件)
极易被遗漏,导致 run 一直停在 AWAITING_HUMAN。

submitCurrent 发送决策后乐观跳转到下一个 human_decision 仍为
空的待决文件;保留手动点击已决文件查看的能力,不做强制弹走。
---
 web/src/views/ConflictResolution.test.tsx | 65 +++++++++++++++++++++++
 web/src/views/ConflictResolution.tsx      |  8 +++
 2 files changed, 73 insertions(+)

diff --git a/web/src/views/ConflictResolution.test.tsx b/web/src/views/ConflictResolution.test.tsx
index 293789d..c132a70 100644
--- a/web/src/views/ConflictResolution.test.tsx
+++ b/web/src/views/ConflictResolution.test.tsx
@@ -222,6 +222,71 @@ describe("ConflictResolution submit payload (H3)", () => {
   });
 });
 
+describe("ConflictResolution auto-advance after submit", () => {
+  it("selects the next still-pending file after a single submit", () => {
+    act(() => {
+      const store = useConflictDraftStore.getState();
+      store.setDraftDecision("a.py", "take_current");
+      store.selectFile("a.py");
+    });
+
+    const ref = makeClientRef();
+    const { getByText } = render(
+      <ConflictResolution
+        clientRef={
+          ref as unknown as React.MutableRefObject<
+            ReturnType<typeof useWsClient>["current"]
+          >
+        }
+      />,
+    );
+    act(() => {
+      getByText("Submit decision").click();
+    });
+
+    expect(sendSpy).toHaveBeenCalledTimes(1);
+    // a.py was resolved; the view must move the operator onto b.py so the
+    // remaining decision is never silently skipped.
+    expect(useConflictDraftStore.getState().selectedFile).toBe("b.py");
+  });
+
+  it("stays put when the submitted file is the last pending one", () => {
+    const lastPendingSnapshot: MergeStateSnapshot = {
+      ...baseSnapshot,
+      humanDecisionRequests: {
+        "a.py": {
+          ...baseSnapshot.humanDecisionRequests["a.py"],
+          human_decision: "take_current",
+        },
+        "b.py": baseSnapshot.humanDecisionRequests["b.py"],
+      },
+    };
+    useRunStore.setState({ snapshot: lastPendingSnapshot });
+    act(() => {
+      const store = useConflictDraftStore.getState();
+      store.setDraftDecision("b.py", "take_target");
+      store.selectFile("b.py");
+    });
+
+    const ref = makeClientRef();
+    const { getByText } = render(
+      <ConflictResolution
+        clientRef={
+          ref as unknown as React.MutableRefObject<
+            ReturnType<typeof useWsClient>["current"]
+          >
+        }
+      />,
+    );
+    act(() => {
+      getByText("Submit decision").click();
+    });
+
+    expect(sendSpy).toHaveBeenCalledTimes(1);
+    expect(useConflictDraftStore.getState().selectedFile).toBe("b.py");
+  });
+});
+
 describe("ConflictResolution submit feedback", () => {
   it("shows a submitted banner + Resubmit label once the file is decided", () => {
     const decidedSnapshot: MergeStateSnapshot = {
diff --git a/web/src/views/ConflictResolution.tsx b/web/src/views/ConflictResolution.tsx
index 2c404b9..8332164 100644
--- a/web/src/views/ConflictResolution.tsx
+++ b/web/src/views/ConflictResolution.tsx
@@ -255,6 +255,14 @@ export function ConflictResolution({ clientRef }: Props): JSX.Element {
     if (!current || !currentDraft) return;
     if (validateDraft(currentDraft) !== null) return;
     sendSingle(current.file_path, currentDraft);
+    // Auto-advance to the next still-pending file so a multi-file review
+    // can't strand the operator on the file they just resolved and silently
+    // miss a remaining decision (e.g. an escalated file with empty
+    // conflict_points renders no detail and is easy to overlook). Optimistic:
+    // the just-submitted file is still in ``pending`` until the next snapshot
+    // confirms it, so exclude it explicitly here.
+    const next = pending.find((r) => r.file_path !== current.file_path);
+    if (next) selectFile(next.file_path);
   };
 
   const draftCount = Object.keys(drafts).length;

From 50c1eed2efc4a15304e4aaa0e456ce212b64bd2d Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 05:32:57 -0400
Subject: [PATCH 04/22] =?UTF-8?q?fix(core):=20=E6=89=A7=E8=A1=8C=E4=BA=BA?=
 =?UTF-8?q?=E5=B7=A5=E5=86=B3=E7=AD=96=E6=9C=9F=E9=97=B4=E5=88=87=E6=8D=A2?=
 =?UTF-8?q?=E7=8A=B6=E6=80=81=E4=B8=BA=20AUTO=5FMERGING?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

human_review 在 resume 后一次性同步执行已提交的决策(大文件 chunked
语义合并可能耗时数分钟),期间 status 始终为 AWAITING_HUMAN,Web UI
持续展示决策表单,用户误以为 run 卡死。

在 Case 1 执行循环前将状态从 AWAITING_HUMAN 切到 AUTO_MERGING,经
transition observer 自动广播新快照,前端 classifyView 据此离开决策
gate 转入实时进度。仅改内存状态与广播,checkpoint 仍按 PhaseOutcome
在相位边界写入,resume 语义不变;后续转入 JUDGE_REVIEWING /
ANALYZING_CONFLICTS 及失败再升级 AWAITING_HUMAN 均为合法转换。
---
 src/core/phases/human_review.py                  | 16 ++++++++++++++++
 .../test_human_override_and_merge_fidelity.py    | 10 ++++++++++
 2 files changed, 26 insertions(+)

diff --git a/src/core/phases/human_review.py b/src/core/phases/human_review.py
index 079fdae..a437827 100644
--- a/src/core/phases/human_review.py
+++ b/src/core/phases/human_review.py
@@ -247,6 +247,22 @@ async def execute(self, state: MergeState, ctx: PhaseContext) -> PhaseOutcome:
                 if req.human_decision is None
             ]
             if not pending:
+                # UX: once every conflict decision is in, the phase is about
+                # to spend a long time applying them (chunked semantic merges
+                # on large files can run for minutes). Leaving status at
+                # AWAITING_HUMAN throughout makes the Web UI keep showing the
+                # decision gate — operators think the run is stuck waiting on
+                # them when it is actually busy executing. Flip to AUTO_MERGING
+                # before the loop so the transition observer pushes a fresh
+                # snapshot and the UI moves off the gate to live progress. The
+                # terminal transition (JUDGE_REVIEWING / ANALYZING_CONFLICTS)
+                # below is still valid from AUTO_MERGING.
+                if state.status != SystemStatus.AUTO_MERGING:
+                    ctx.state_machine.transition(
+                        state,
+                        SystemStatus.AUTO_MERGING,
+                        "executing human conflict decisions",
+                    )
                 executor = ctx.agents["executor"]
                 executed = 0
                 for req in state.human_decision_requests.values():
diff --git a/tests/unit/test_human_override_and_merge_fidelity.py b/tests/unit/test_human_override_and_merge_fidelity.py
index f18772e..818175e 100644
--- a/tests/unit/test_human_override_and_merge_fidelity.py
+++ b/tests/unit/test_human_override_and_merge_fidelity.py
@@ -156,6 +156,16 @@ async def test_human_override_executes_over_stale_auto_record() -> None:
     assert state.file_decision_records["auto.go"].decision == MergeDecision.TAKE_TARGET
     assert outcome.target_status == SystemStatus.JUDGE_REVIEWING
 
+    # UX: executing the decisions must first flip status off the AWAITING_HUMAN
+    # gate to AUTO_MERGING so the Web UI stops showing the decision form while
+    # the (potentially minutes-long) merges run. The AUTO_MERGING transition
+    # must precede the terminal JUDGE_REVIEWING transition.
+    transitioned = [c.args[1] for c in ctx.state_machine.transition.call_args_list]
+    assert SystemStatus.AUTO_MERGING in transitioned
+    assert transitioned.index(SystemStatus.AUTO_MERGING) < transitioned.index(
+        SystemStatus.JUDGE_REVIEWING
+    )
+
 
 # --------------------------------------------------------------------------- #
 # Bug B sibling — judge dispute-round repair must not overwrite a human record

From 71e1fecd57ff46ae97145a6825f657f4546fb568 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 05:39:54 -0400
Subject: [PATCH 05/22] =?UTF-8?q?docs:=20=E6=95=B4=E7=90=86=20doc/=20?=
 =?UTF-8?q?=E7=9B=AE=E5=BD=95=E7=BB=93=E6=9E=84=E5=B9=B6=E6=9B=B4=E6=96=B0?=
 =?UTF-8?q?=20README=20=E7=B4=A2=E5=BC=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

将根目录散落的 8 个文件归入对应子目录：
- web-ui.md / web-ui-redesign-handoff.md → modules/
- forks-profile-init.md → modules/forks-profile.md（改名去歧义）
- migration-aware-merge.md / risk-levels.md → modules/
- multi-agent-optimization-from-merge-experience.md → references/multi-agent-optimization.md
- large-scale-file-processing-optimization.md → plan/
- execute/implementation-notes.md → plan/（删除单文件目录 execute/）

同步修复 3 处内部链接（onboarding.md、insforge 测试报告）。

重写 README.md：补齐此前缺失的 bugfix/、evaluation/、review/、
test-report/ 四个目录的说明，更新时间戳至 2026-05-31。
---
 doc/README.md                                 | 150 +++++++++++++++---
 .../forks-profile.md}                         |   0
 doc/{ => modules}/migration-aware-merge.md    |   0
 doc/modules/onboarding.md                     |   8 +-
 doc/{ => modules}/risk-levels.md              |   0
 doc/{ => modules}/web-ui-redesign-handoff.md  |   0
 doc/{ => modules}/web-ui.md                   |   0
 doc/{execute => plan}/implementation-notes.md |   0
 ...arge-scale-file-processing-optimization.md |   0
 .../multi-agent-optimization.md}              |   0
 ...insforge-v2.1.0-merge-report-2026-05-06.md |   4 +-
 11 files changed, 133 insertions(+), 29 deletions(-)
 rename doc/{forks-profile-init.md => modules/forks-profile.md} (100%)
 rename doc/{ => modules}/migration-aware-merge.md (100%)
 rename doc/{ => modules}/risk-levels.md (100%)
 rename doc/{ => modules}/web-ui-redesign-handoff.md (100%)
 rename doc/{ => modules}/web-ui.md (100%)
 rename doc/{execute => plan}/implementation-notes.md (100%)
 rename doc/{ => plan}/large-scale-file-processing-optimization.md (100%)
 rename doc/{multi-agent-optimization-from-merge-experience.md => references/multi-agent-optimization.md} (100%)

diff --git a/doc/README.md b/doc/README.md
index fcd58f1..c358dde 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -1,49 +1,150 @@
 # 文档索引
 
-> **最后更新**：2026-04-17
-> 中文版文档。英文版后续补充。
+> **最后更新**：2026-05-31
+> 中文版文档。
 
 ---
 
 ## 新人从这里开始
 
-- [**新人上手指南**](modules/onboarding.md) — 环境、阅读顺序、常见改动起点
+- [**新人上手指南**](modules/onboarding.md) — 环境搭建、阅读顺序、常见改动起点
 
-## 核心设计文档
+---
+
+## 核心设计文档（根目录）
 
 | 文档 | 说明 |
 |---|---|
 | [architecture.md](architecture.md) | 系统架构总览：分层、数据流、持久化、扩展点 |
 | [flow.md](flow.md) | 状态机与 8 个 Phase 的执行流程 |
-| [risk-levels.md](risk-levels.md) | `RiskLevel` 枚举定义与触发条件 |
-| [migration-aware-merge.md](migration-aware-merge.md) | 迁移感知合并（bulk-copy 场景） |
-| [multi-agent-optimization-from-merge-experience.md](multi-agent-optimization-from-merge-experience.md) | 六大丢失模式 + P0/P1/P2 加固项（最新） |
-| [forks-profile-init.md](forks-profile-init.md) | `merge forks-profile init` / `diff` 起草+增量审阅；§9 整合后已接入首次向导（按阈值触发 init）+ 主流程 drift 提示（写入 plan 报告附录） |
+
+---
 
 ## 模块技术文档（`modules/`）
 
-| 模块 | 文档 |
+| 文档 | 说明 |
+|---|---|
+| [data-models.md](modules/data-models.md) | Pydantic 数据模型字段详解 |
+| [agents.md](modules/agents.md) | 各 Agent 职责、模型选择、合作模式 |
+| [core.md](modules/core.md) | Orchestrator / StateMachine / Checkpoint / Phases |
+| [tools.md](modules/tools.md) | 扫描器 / 门禁 / Git 工具 |
+| [llm.md](modules/llm.md) | LLM 路由、成本控制、熔断、压缩 |
+| [memory.md](modules/memory.md) | 三层记忆系统、跨 run 持久化 |
+| [cli.md](modules/cli.md) | CLI 命令、Web UI 与后端 WebSocket 通信 |
+| [web-ui.md](modules/web-ui.md) | Web UI 组件设计与状态管理 |
+| [web-ui-redesign-handoff.md](modules/web-ui-redesign-handoff.md) | Web UI 重设计交付说明 |
+| [forks-profile.md](modules/forks-profile.md) | `merge forks-profile init/diff`；drift 检测；首次向导触发阈值 |
+| [migration-aware-merge.md](modules/migration-aware-merge.md) | 迁移感知合并（bulk-copy 场景） |
+| [risk-levels.md](modules/risk-levels.md) | `RiskLevel` 枚举定义与触发条件 |
+| [onboarding.md](modules/onboarding.md) | 新人上手指南 |
+
+---
+
+## 计划与提案（`plan/`）
+
+| 文档 | 说明 |
+|---|---|
+| [roadmap.md](plan/roadmap.md) | 产品路线图与里程碑 |
+| [self-learning-system.md](plan/self-learning-system.md) | 自学习系统方案（深研究支撑，2026-05-30） |
+| [per-hunk-resolution.md](plan/per-hunk-resolution.md) | 细粒度 hunk 级别冲突解决方案 |
+| [merge-safety-complete.md](plan/merge-safety-complete.md) | 合并安全完整方案 |
+| [dead-code-remediation-and-compression-plan.md](plan/dead-code-remediation-and-compression-plan.md) | 死代码清理与上下文压缩计划 |
+| [large-scale-file-processing-optimization.md](plan/large-scale-file-processing-optimization.md) | 大规模文件处理优化 |
+| [implementation-notes.md](plan/implementation-notes.md) | 实施过程笔记 |
+
+---
+
+## 合并质量审计（`review/`）
+
+记录一次深度合并质量 + LLM 幻觉处理路径审计，以及后续 Wave 实施日志。
+
+| 文档 | 说明 |
+|---|---|
+| [README.md](review/README.md) | 审计背景与文档索引 |
+| [00-audit-findings.md](review/00-audit-findings.md) | 根因分析与确认缺陷 |
+| [01-optimization-plan.md](review/01-optimization-plan.md) | 12 项优化计划 |
+| [02-implementation-log.md](review/02-implementation-log.md) | Wave 1–3 实施日志 |
+| [03-production-readiness.md](review/03-production-readiness.md) | Wave 3 后生产就绪度评估 |
+| [04-production-hardening-plan.md](review/04-production-hardening-plan.md) | Wave 4 加固计划 |
+| [05-wave4-implementation-log.md](review/05-wave4-implementation-log.md) | Wave 4 实施日志 |
+| [06-production-readiness-post-wave4.md](review/06-production-readiness-post-wave4.md) | Wave 4 后生产就绪度 |
+| [07-wave5-residual-closure-plan.md](review/07-wave5-residual-closure-plan.md) | Wave 5 残余问题关闭计划 |
+
+---
+
+## 评估体系（`evaluation/`）
+
+| 文档 | 说明 |
 |---|---|
-| 数据模型 | [data-models.md](modules/data-models.md) |
-| Agents | [agents.md](modules/agents.md) |
-| Core（Orchestrator / StateMachine / Checkpoint / Phases） | [core.md](modules/core.md) |
-| Tools（扫描器 / 门禁 / Git） | [tools.md](modules/tools.md) |
-| LLM 层 | [llm.md](modules/llm.md) |
-| 记忆系统 | [memory.md](modules/memory.md) |
-| CLI / Web UI | [cli.md](modules/cli.md) |
-| 新人指南 | [onboarding.md](modules/onboarding.md) |
+| [README.md](evaluation/README.md) | 评估方案总览 |
+| [metrics.md](evaluation/metrics.md) | 度量指标定义（含 P0 记忆有效性） |
+| [acceptance.md](evaluation/acceptance.md) | 验收门槛 |
+| [dataset.md](evaluation/dataset.md) | 数据集定义 |
+| [procedure.md](evaluation/procedure.md) | 评估流程 |
+| [EXECUTION_PLAN.md](evaluation/EXECUTION_PLAN.md) | 执行计划 |
+| [IMPLEMENTATION_REPORT_PARTIAL.md](evaluation/IMPLEMENTATION_REPORT_PARTIAL.md) | 部分实施报告 |
+
+---
+
+## 测试报告（`test-report/`）
+
+各版本与目标仓库的实测报告，按时间排列。
+
+| 文档 | 说明 |
+|---|---|
+| [insforge-v2.1.0-merge-report-2026-05-06.md](test-report/insforge-v2.1.0-merge-report-2026-05-06.md) | InsForge v2.1.0 正式合并测试报告 |
+| [dify-plugin-daemon-0.6.0-merge-validation.md](test-report/dify-plugin-daemon-0.6.0-merge-validation.md) | dify-plugin-daemon 0.6.0 合并验证 |
+| [dify-plugins-upstream25-merge-test-2026-05-08.md](test-report/dify-plugins-upstream25-merge-test-2026-05-08.md) | dify-plugins upstream-25 合并测试 |
+| [dify-plugins-upstream25-regression-2026-05-08.md](test-report/dify-plugins-upstream25-regression-2026-05-08.md) | dify-plugins upstream-25 回归分析 |
+| [2026-05-01-dify-plugins-upstream10-validation.md](test-report/2026-05-01-dify-plugins-upstream10-validation.md) | dify-plugins upstream-10 验证 |
+| [2026-05-10-planner-judge-optimizations-review.md](test-report/2026-05-10-planner-judge-optimizations-review.md) | Planner/Judge 优化 review |
+| [forgejo-c-class-test-branches-2026-05-18.md](test-report/forgejo-c-class-test-branches-2026-05-18.md) | forgejo C-class 测试分支建立 |
+| [forgejo-planner-judge-divergence-2026-05-18.md](test-report/forgejo-planner-judge-divergence-2026-05-18.md) | forgejo Planner/Judge 分歧分析 |
+| [upstream-29-full-flow-analysis.md](test-report/upstream-29-full-flow-analysis.md) | upstream-29 全流程分析 |
+| [upstream-36-commits-validation-report.md](test-report/upstream-36-commits-validation-report.md) | upstream-36 验证报告 |
+| [upstream-50-commits-test-report.md](test-report/upstream-50-commits-test-report.md) | upstream-50 测试报告 |
+| [merge-validation-report.md](test-report/merge-validation-report.md) | 通用合并验证报告 |
+| [dify-plugin-daemon.md](test-report/dify-plugin-daemon.md) | dify-plugin-daemon 早期记录 |
+
+---
+
+## BUG 分析记录（`bugfix/`）
+
+| 文档 | 说明 |
+|---|---|
+| [0527.md](bugfix/0527.md) | 2026-05-27 批次 BUG 分析与修复方案 |
+| [0528-agent-prompt-engineering-review.md](bugfix/0528-agent-prompt-engineering-review.md) | 2026-05-28 Agent Prompt 工程化审查 |
+| [0528-legacy-merge-base-attr.md](bugfix/0528-legacy-merge-base-attr.md) | 2026-05-28 遗留 merge_base 属性问题 |
+| [0529-context-memory-opt-evaluation.md](bugfix/0529-context-memory-opt-evaluation.md) | 2026-05-29 上下文/记忆优化评估 |
+
+---
 
 ## 参考开源项目分析（`references/`）
 
-这些文档**不是系统设计**，而是对外部项目的学习笔记，用于提炼可借鉴的模式。
+外部项目学习笔记，提炼可借鉴的模式，**不是系统设计**。
 
-| 文件 | 项目 | 借鉴点 |
+| 文档 | 项目 / 主题 | 借鉴点 |
 |---|---|---|
 | [graphify-analysis.md](references/graphify-analysis.md) | Graphify | 知识图谱压缩代码上下文 |
 | [mempalace-analysis.md](references/mempalace-analysis.md) | MemPalace | 语义索引 + 分层记忆 |
 | [hermes-inspired-improvements.md](references/hermes-inspired-improvements.md) | NousResearch/hermes-agent | Agent 架构与工具抽象 |
+| [openai-agents-python-analysis.md](references/openai-agents-python-analysis.md) | openai-agents-python | 轻量 Agent 框架设计 |
+| [claude-code-game-studios-analysis.md](references/claude-code-game-studios-analysis.md) | claude-code-game-studios | 多 Agent 游戏开发实证 |
 | [opensource-comparison.md](references/opensource-comparison.md) | 15+ 合并相关开源项目 | 对照分析与能力矩阵 |
-| [enhanced-context-memory-proposal.md](references/enhanced-context-memory-proposal.md) | 综合 MemPalace + Graphify | 基于上述项目的增强方案蓝图 |
+| [enhanced-context-memory-proposal.md](references/enhanced-context-memory-proposal.md) | 综合 MemPalace + Graphify | 增强方案蓝图 |
+| [dependency-graph-optimization-plan.md](references/dependency-graph-optimization-plan.md) | 依赖图优化 | 基于 forgejo 实测的依赖图优化计划 |
+| [multi-agent-optimization.md](references/multi-agent-optimization.md) | 合并实战经验 | 六大丢失模式 + P0/P1/P2 加固项 |
+
+---
+
+## 分享材料（`share/`）
+
+| 文档 | 说明 |
+|---|---|
+| [agent-engineering-sharing.md](share/agent-engineering-sharing.md) | Agent 工程化经验分享 |
+| [dependency-graph-deep-dive.html](share/dependency-graph-deep-dive.html) | 依赖图深度解析（HTML 演示） |
+
+---
 
 ## 查找路径速查
 
@@ -55,12 +156,15 @@
 Agent 各自用什么模型、职责边界                       → modules/agents.md
 Checkpoint 怎么落盘、状态机怎么转移                  → modules/core.md
 某个具体扫描器原理（shadow / scar / sentinel…）      → modules/tools.md
-                                                     + multi-agent-optimization-from-merge-experience.md
 LLM 请求如何做成本/预算/熔断                         → modules/llm.md
 记忆在 Agent 间是如何传递的                          → modules/memory.md
 怎么用命令行、Web UI 怎么和后端通信                   → modules/cli.md
 Pydantic 模型到底长什么样                            → modules/data-models.md
-fork 被 bulk-copy 迁移过怎么处理                     → migration-aware-merge.md
-为什么要设计这么多扫描器                             → multi-agent-optimization-from-merge-experience.md
+fork 被 bulk-copy 迁移过怎么处理                     → modules/migration-aware-merge.md
+六大丢失模式 / 为什么要设计这么多扫描器              → references/multi-agent-optimization.md
 想学开源项目怎么做类似问题                           → references/
+合并质量审计历史、Wave 实施记录                      → review/
+实测合并报告（dify/forgejo/insforge）                → test-report/
+BUG 分析与修复记录                                   → bugfix/
+评估指标、验收门槛、数据集定义                       → evaluation/
 ```
diff --git a/doc/forks-profile-init.md b/doc/modules/forks-profile.md
similarity index 100%
rename from doc/forks-profile-init.md
rename to doc/modules/forks-profile.md
diff --git a/doc/migration-aware-merge.md b/doc/modules/migration-aware-merge.md
similarity index 100%
rename from doc/migration-aware-merge.md
rename to doc/modules/migration-aware-merge.md
diff --git a/doc/modules/onboarding.md b/doc/modules/onboarding.md
index d960f59..fef270f 100644
--- a/doc/modules/onboarding.md
+++ b/doc/modules/onboarding.md
@@ -87,9 +87,9 @@ mypy src                        # 应全绿（strict mode）
 | LLM 路由/压缩/缓存 | [`llm.md`](llm.md) |
 | 三层记忆系统 | [`memory.md`](memory.md) |
 | CLI + Web UI | [`cli.md`](cli.md) |
-| 六大丢失模式 + P0/P1/P2 加固项 | [`../multi-agent-optimization-from-merge-experience.md`](../multi-agent-optimization-from-merge-experience.md) |
-| 迁移感知合并 | [`../migration-aware-merge.md`](../migration-aware-merge.md) |
-| 风险等级枚举 | [`../risk-levels.md`](../risk-levels.md) |
+| 六大丢失模式 + P0/P1/P2 加固项 | [`../references/multi-agent-optimization.md`](../references/multi-agent-optimization.md) |
+| 迁移感知合并 | [`migration-aware-merge.md`](migration-aware-merge.md) |
+| 风险等级枚举 | [`risk-levels.md`](risk-levels.md) |
 | 参考开源项目分析 | [`../references/`](../references/) |
 
 ---
@@ -168,5 +168,5 @@ ruff format src/ --check
 ## 10. 遇到问题找谁
 
 - 架构类问题 → 先翻 `CLAUDE.md` 和 `../architecture.md`
-- 设计演进 / 加固项背景 → `../multi-agent-optimization-from-merge-experience.md`
+- 设计演进 / 加固项背景 → `../references/multi-agent-optimization.md`
 - 参考外部项目的思路 → `../references/` 三篇分析文档
diff --git a/doc/risk-levels.md b/doc/modules/risk-levels.md
similarity index 100%
rename from doc/risk-levels.md
rename to doc/modules/risk-levels.md
diff --git a/doc/web-ui-redesign-handoff.md b/doc/modules/web-ui-redesign-handoff.md
similarity index 100%
rename from doc/web-ui-redesign-handoff.md
rename to doc/modules/web-ui-redesign-handoff.md
diff --git a/doc/web-ui.md b/doc/modules/web-ui.md
similarity index 100%
rename from doc/web-ui.md
rename to doc/modules/web-ui.md
diff --git a/doc/execute/implementation-notes.md b/doc/plan/implementation-notes.md
similarity index 100%
rename from doc/execute/implementation-notes.md
rename to doc/plan/implementation-notes.md
diff --git a/doc/large-scale-file-processing-optimization.md b/doc/plan/large-scale-file-processing-optimization.md
similarity index 100%
rename from doc/large-scale-file-processing-optimization.md
rename to doc/plan/large-scale-file-processing-optimization.md
diff --git a/doc/multi-agent-optimization-from-merge-experience.md b/doc/references/multi-agent-optimization.md
similarity index 100%
rename from doc/multi-agent-optimization-from-merge-experience.md
rename to doc/references/multi-agent-optimization.md
diff --git a/doc/test-report/insforge-v2.1.0-merge-report-2026-05-06.md b/doc/test-report/insforge-v2.1.0-merge-report-2026-05-06.md
index e744550..b581f47 100644
--- a/doc/test-report/insforge-v2.1.0-merge-report-2026-05-06.md
+++ b/doc/test-report/insforge-v2.1.0-merge-report-2026-05-06.md
@@ -530,7 +530,7 @@ migration_policy:
 
 **追加日期**：2026-05-07
 **来源会话**：forks-profile-init 设计阶段对「profile 是否应为 planner 产物」的讨论
-**关联文档**：[`doc/forks-profile-init.md`](../forks-profile-init.md) §3 启发式 / §8 不做的事 / §9 已知局限
+**关联文档**：[`doc/modules/forks-profile.md`](../modules/forks-profile.md) §3 启发式 / §8 不做的事 / §9 已知局限
 
 #### 9.10.1 动机
 
@@ -620,7 +620,7 @@ forks_profile_suggestions:
 ### 9.11 P3 增强：`forks-profile diff` 半自动 apply 模式（草案）
 
 **追加日期**：2026-05-07
-**关联文档**：[`doc/forks-profile-init.md`](../forks-profile-init.md) §4.2 `diff` 子命令 / §8 「不做的事」第 2 项
+**关联文档**：[`doc/modules/forks-profile.md`](../modules/forks-profile.md) §4.2 `diff` 子命令 / §8 「不做的事」第 2 项
 
 #### 9.11.1 动机
 

From 05504822b904494e1790c7088666ee66de3674b2 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 06:29:07 -0400
Subject: [PATCH 06/22] =?UTF-8?q?docs(eval):=20=E5=9B=9E=E5=A1=AB=20P0=20?=
 =?UTF-8?q?=E8=AE=B0=E5=BF=86=E6=9C=89=E6=95=88=E6=80=A7=E9=A6=96=E7=BB=84?=
 =?UTF-8?q?=E5=9F=BA=E7=BA=BF=EF=BC=88MDL=3D0=EF=BC=8C=E4=B8=8D=E9=BB=98?=
 =?UTF-8?q?=E8=AE=A4=E5=BC=80=E5=90=AF=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

在 forgejo test/fork←origin/forgejo 上用 deepseek-v4-pro 实跑 memory=on/off
两臂消融（同一 ablation_decisions.yaml，唯一变量 inject_enabled），经
merge eval-memory 产出首组基线，回填 acceptance.md §5.1：

- MDL=0.0000、HIR(on)=0.20，两臂 overall_correct_rate 均 81.25%（13/16）
- off 臂 memory_influenced_decisions=0，证实 inject_enabled 开关有效
- 据 §3 激活门（MDL>0 为硬前置）判定：P1 反馈环不默认开启
- 标注 caveat：单 run/单数据集/judged=16 样本小，仅首组可搬动基线，
  需多 run 复算方可据以翻默认开启

PR-0c 至此打通 Phase 0 完整闭环（度量→消融→验收门→首组基线）。
---
 doc/evaluation/acceptance.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/doc/evaluation/acceptance.md b/doc/evaluation/acceptance.md
index 811c3c1..c6ed33e 100644
--- a/doc/evaluation/acceptance.md
+++ b/doc/evaluation/acceptance.md
@@ -104,6 +104,19 @@
 
 每次发布更新该表，至少记录 `OA / MMR / WMR / cost_p95 / wall_time_p95`。
 
+### 5.1 记忆有效性基线（§3 激活门）
+
+> 由 `merge eval-memory` 对同数据集 memory=on/off 两 run 产出（metrics.md §9）。
+
+| 评估时间 | 数据集 | on/off run_id | `MDL` | `HIR`(on) | 激活判定 |
+|---|---|---|---|---|---|
+| 2026-05-31 | forgejo `test/fork` ← `origin/forgejo`（124 文件，judge 复审 16）| `a0563230` / `81ce3475` | **0.0000** | 0.20 | **不默认开启**（MDL 未 > 0）|
+
+口径与 caveat（务必随基线一并阅读，避免误用）：
+- 模型 `deepseek-v4-pro`（temperature：executor/judge=0.1，余默认）；两臂同一 `ablation_decisions.yaml`（plan_review 15×`take_target` + judge_review `accept`），唯一变量为 `memory.inject_enabled`。
+- off 臂 `memory_influenced_decisions=0`，证实 `inject_enabled=false` 完全抑制注入（开关有效）。on 臂注入影响 15/16 判决，但 `overall_correct_rate` 与 off 持平（均 81.25%，13/16 pass）→ 本 run 记忆对总体正确率净中性。
+- 单 run、单数据集、`judge_verdict=fail`/`partial_failure`（确定性 reverse_impact veto），样本量小（judged=16）；**不足以作收紧/默认开启依据**，仅为首组可搬动基线。需多 run / 多数据集复算（procedure 待补）方能据 §3 翻默认开启。
+
 ---
 
 ## 6. 阈值修改流程

From e4b3fbc5e398be1d64377dc28c1e36091864b60a Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 06:52:12 -0400
Subject: [PATCH 07/22] =?UTF-8?q?feat(memory):=20PR-0d=20=E8=AE=B0?=
 =?UTF-8?q?=E5=BF=86=E6=9C=89=E6=95=88=E6=80=A7=E8=B7=A8=E8=87=82=E5=9B=A0?=
 =?UTF-8?q?=E6=9E=9C=E5=BD=92=E5=9B=A0=EF=BC=88=E4=BF=AE=E6=AD=A3=20HIR=20?=
 =?UTF-8?q?=E5=81=87=E9=98=B3=E6=80=A7=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

forgejo 首组基线暴露单臂 harmful_influence_rate 的归因缺陷：HIR(on)=0.2
把"记忆恰好注入到本就确定性失败的文件"误算成有害，而消融证明那 3 个失败
在 memory=off 臂逐文件相同（确定性 reverse_impact veto，与记忆无关）。

改为跨臂逐文件因果归因：
- MemoryEffectivenessReport 持久化 passed_files/failed_files（原仅计数）
- compare_memory_effectiveness 计算 memory_helped(off-fail→on-pass)/
  memory_harmed(off-pass→on-fail) + causal_attribution_available
- render_ablation_table 增因果区块；HIR 标注为 correlational
- metrics.md §9.2 警示 HIR 假阳性 + 新增 §9.7 因果归因；acceptance.md §3
  激活门判据由单臂 HIR 改为 memory_harmed=0；§5.1 基线补因果列（harmed=0）

旧产物无 per-file 列表时 causal_attribution_available=False（不可知≠0）。
real forgejo checkpoint 验证：causal helped=0/harmed=0（HIR=0.2 系假阳性）。
---
 doc/evaluation/acceptance.md            | 15 +++---
 doc/evaluation/metrics.md               | 29 ++++++++++--
 src/models/memory_effectiveness.py      | 17 +++++++
 src/tools/memory_eval.py                | 24 +++++++++-
 src/tools/memory_replay.py              | 61 ++++++++++++++++++-------
 tests/unit/test_memory_effectiveness.py |  3 ++
 tests/unit/test_memory_replay.py        | 61 ++++++++++++++++++++++++-
 7 files changed, 180 insertions(+), 30 deletions(-)

diff --git a/doc/evaluation/acceptance.md b/doc/evaluation/acceptance.md
index c6ed33e..4560507 100644
--- a/doc/evaluation/acceptance.md
+++ b/doc/evaluation/acceptance.md
@@ -53,14 +53,14 @@
 | 门 | 阈值 | 数据源 | 作用 |
 |---|---|---|---|
 | `MDL` 记忆决策增益 | **> 0** | `merge eval-memory`（on/off 消融）| 任一反馈环默认开启的**硬前置**；≤ 0 则保持 opt-in |
-| `HIR` 有害影响率 | **不高于同数据集 off 基线** | `memory_effectiveness.json` | 上升即说明记忆在污染决策，禁止默认开启 |
+| `memory_harmed`（因果，PR-0d）| **= 0** | `merge eval-memory` 跨臂逐文件 diff | 跨臂判决翻坏的文件数；> 0 即记忆**导致**退化，禁止默认开启。取代单臂 `HIR`（相关性、会假阳性，metrics §9.7）|
 | `CRI` 影响后正确率 | **≥ off 基线 overall_correct_rate** | `memory_effectiveness.json` | 被记忆改变的决策不得比无记忆更差 |
 | `MCPD` 单决策记忆成本 | **≤ off 基线 × 1.15** | `CostTracker` | 防止记忆注入让 prompt 成本悄悄回退 |
 
 **判定流程**：
 1. 同数据集跑 `memory=on`（默认）与 `memory=off`（config `memory.inject_enabled: false`）两 run；
 2. `merge eval-memory --on <on_run> --off <off_run>` 产出 `MemoryAblationComparison`；
-3. `MDL > 0` 且 `HIR` 不升 → 允许把对应反馈环 default 翻为 `True`，并在本文件 §5 历史区记录基线数；
+3. `MDL > 0` 且 `memory_harmed = 0`（因果，PR-0d）→ 允许把对应反馈环 default 翻为 `True`，并在本文件 §5 历史区记录基线数；
 4. 任一门未过 → 反馈环维持 opt-in，记录原因。
 
 > 这是"默认开启"的闸口，不是合并质量的一票否决；故归为独立章节，与 §1/§2 的合并
@@ -108,14 +108,15 @@
 
 > 由 `merge eval-memory` 对同数据集 memory=on/off 两 run 产出（metrics.md §9）。
 
-| 评估时间 | 数据集 | on/off run_id | `MDL` | `HIR`(on) | 激活判定 |
-|---|---|---|---|---|---|
-| 2026-05-31 | forgejo `test/fork` ← `origin/forgejo`（124 文件，judge 复审 16）| `a0563230` / `81ce3475` | **0.0000** | 0.20 | **不默认开启**（MDL 未 > 0）|
+| 评估时间 | 数据集 | on/off run_id | `MDL` | `memory_harmed`(因果) | `HIR`(on,相关) | 激活判定 |
+|---|---|---|---|---|---|---|
+| 2026-05-31 | forgejo `test/fork` ← `origin/forgejo`（124 文件，judge 复审 16）| `a0563230` / `81ce3475` | **0.0000** | **0** | 0.20 | **不默认开启**（MDL 未 > 0）|
 
 口径与 caveat（务必随基线一并阅读，避免误用）：
 - 模型 `deepseek-v4-pro`（temperature：executor/judge=0.1，余默认）；两臂同一 `ablation_decisions.yaml`（plan_review 15×`take_target` + judge_review `accept`），唯一变量为 `memory.inject_enabled`。
-- off 臂 `memory_influenced_decisions=0`，证实 `inject_enabled=false` 完全抑制注入（开关有效）。on 臂注入影响 15/16 判决，但 `overall_correct_rate` 与 off 持平（均 81.25%，13/16 pass）→ 本 run 记忆对总体正确率净中性。
-- 单 run、单数据集、`judge_verdict=fail`/`partial_failure`（确定性 reverse_impact veto），样本量小（judged=16）；**不足以作收紧/默认开启依据**，仅为首组可搬动基线。需多 run / 多数据集复算（procedure 待补）方能据 §3 翻默认开启。
+- off 臂 `memory_influenced_decisions=0`，证实 `inject_enabled=false` 完全抑制注入（开关有效）。on 臂注入影响 15/16 判决，但**两臂 per-file 判决逐字节相同**（同 13 pass / 同 3 fail，失败均为 `auth_token.go`/`oauth.go`/`build-release.yml` 的确定性 reverse_impact veto）→ 记忆对本 run 任何判决**零作用**。
+- **因果归因（PR-0d）`memory_harmed=0`、`memory_helped=0`**；单臂 `HIR(on)=0.2` 是**假阳性**——它把"记忆恰好注入到本就确定性失败的文件"误算成有害，跨臂 diff 证伪（metrics §9.7）。激活门以因果 harmed 为准。
+- 本 run 由确定性机制（take_target + veto）主导，记忆无用武之地；不证明记忆无价值，需 **LLM 判断密集**数据集才能测出。单 run、judged=16、样本小，**仅首组可搬动基线**，需多 run / 多数据集复算（procedure 待补）方可据 §3 翻默认开启。
 
 ---
 
diff --git a/doc/evaluation/metrics.md b/doc/evaluation/metrics.md
index a563cb1..e140975 100644
--- a/doc/evaluation/metrics.md
+++ b/doc/evaluation/metrics.md
@@ -352,16 +352,19 @@ overall_correct_rate = |passed_files| / (|passed_files| + |failed_files|)
 数据源：`MemoryAblationComparison.memory_decision_lift`。**MDL > 0 是"学到了"的
 最小证据**，也是 Phase 1 任一反馈环默认开启的硬前置（见 acceptance.md §3）。
 
-### 9.2 有害影响率（Harmful Influence Rate, HIR）
+### 9.2 有害影响率（Harmful Influence Rate, HIR）—— **相关性，非因果**
 
-> 被记忆注入"影响"且最终 fail 的决策占比——F2（检索污染/有害记忆）的直接度量。
+> 被记忆注入"影响"且最终 fail 的决策占比。单臂量，**会过度归因**。
 
 ```
 HIR = |injected ∩ failed_files| / |influenced|        （influenced=0 时记 0）
 ```
 
-数据源：`MemoryEffectivenessReport.harmful_influence_rate`。P1-A（持久化 suppress）的
-优化目标是在"tracker 重置"场景下 HIR 不回升。
+数据源：`MemoryEffectivenessReport.harmful_influence_rate`。**警告（PR-0d）**：HIR 把
+"记忆恰好被注入到一个本来就会失败的文件"也算成有害，但确定性失败（如 reverse_impact
+veto）与记忆无关。forgejo 首组基线即暴露此假阳性：HIR(on)=0.2，而跨臂因果归因
+（§9.7）harmed=0——3 个失败在 memory=off 臂**逐文件相同**。故 HIR 只作单 run 粗筛，
+**默认开启/收紧判据一律以 §9.7 的因果 harmed 为准**。
 
 ### 9.3 影响后正确率（Correct Rate After Influence, CRI）
 
@@ -400,6 +403,24 @@ MCPD = cost_usd_per_run / F_eval
 数据源：`CostTracker` + `F_eval`。记忆注入增大 prompt，开启反馈环不得让 MCPD 显著上升
 （acceptance.md §3）。
 
+### 9.7 跨臂因果归因（Causal Help / Harm, PR-0d）
+
+> 唯一能判定"记忆是否**导致**好/坏结果"的口径——逐文件比对 on/off 两臂的 judge 判决，
+> 只有判决真正翻转才归因于记忆。
+
+```
+memory_helped = { f : f ∈ off.failed_files ∧ f ∈ on.passed_files }   （记忆把失败救成功）
+memory_harmed = { f : f ∈ off.passed_files ∧ f ∈ on.failed_files }   （记忆把成功弄失败）
+```
+
+数据源：`MemoryAblationComparison.memory_helped_count / memory_harmed_count`，由
+`merge eval-memory` 跨两份 `memory_effectiveness.json`（PR-0d 起持久化 `passed_files`/
+`failed_files`）计算；`causal_attribution_available=False` 表示报告无 per-file 列表
+（PR-0d 前的旧产物）——此时 helped/harmed 不可知，**不等于 0**。
+
+两臂判决**逐文件相同**（确定性主导的合并）→ helped=harmed=0，正确地不把确定性失败
+甩锅给记忆。这是 §9.2 HIR 的因果替代，也是 acceptance.md §3 激活门的真正判据。
+
 > **后续指标（Phase 1-C / 2-B 落地后补充）**：`repeat_error_repair_rounds`（同
 > error_signature 平均修复轮数，需 P1-C 的 `summarize_judge_repair_rounds` 按签名聚合）、
 > `memory_drift_loss`（consolidation 前后 pinned 条目内容差异，P2-B，期望 = 0）。
diff --git a/src/models/memory_effectiveness.py b/src/models/memory_effectiveness.py
index ef3dbf4..7f2467f 100644
--- a/src/models/memory_effectiveness.py
+++ b/src/models/memory_effectiveness.py
@@ -42,6 +42,12 @@ class MemoryEffectivenessReport(BaseModel, frozen=True):
     top_harmful: list[EntryEffectivenessItem] = Field(default_factory=list)
     total_tracked_entries: int = Field(ge=0)
     effective_observations: int = Field(ge=0)
+    # PR-0d: per-file Judge verdict, persisted so an offline on/off comparison
+    # can attribute help/harm causally (cross-arm set diff) instead of relying
+    # on the single-arm ``injected ∩ failed`` correlation. Default empty keeps
+    # older artifacts (counts only) loadable.
+    passed_files: list[str] = Field(default_factory=list)
+    failed_files: list[str] = Field(default_factory=list)
 
 
 class MemoryAblationComparison(BaseModel, frozen=True):
@@ -61,3 +67,14 @@ class MemoryAblationComparison(BaseModel, frozen=True):
     memory_decision_lift: float
     harmful_influence_rate_on: float = Field(ge=0.0, le=1.0)
     memory_beneficial: bool
+    # PR-0d: causal cross-arm attribution — a file counts as helped/harmed only
+    # if its verdict actually flipped between the arms, so a deterministic
+    # failure that happens identically with and without memory is NOT blamed on
+    # memory (the single-arm ``harmful_influence_rate`` over-attributes it).
+    memory_helped_files: list[str] = Field(default_factory=list)
+    memory_harmed_files: list[str] = Field(default_factory=list)
+    memory_helped_count: int = Field(default=0, ge=0)
+    memory_harmed_count: int = Field(default=0, ge=0)
+    # False when neither report carries per-file lists (e.g. pre-PR-0d
+    # artifacts) — then helped/harmed are unknowable, not zero.
+    causal_attribution_available: bool = False
diff --git a/src/tools/memory_eval.py b/src/tools/memory_eval.py
index 3f7f326..5eaf1cb 100644
--- a/src/tools/memory_eval.py
+++ b/src/tools/memory_eval.py
@@ -93,6 +93,8 @@ def compute_memory_effectiveness(
         top_harmful=_items_from_outcomes(outcomes_dict.get("top_harmful")),
         total_tracked_entries=_as_int(outcomes_dict.get("tracked_entries", 0)),
         effective_observations=_as_int(summary.get("effective_observations", 0)),
+        passed_files=sorted(passed),
+        failed_files=sorted(failed),
     )
 
 
@@ -103,10 +105,25 @@ def compare_memory_effectiveness(
     """Diff two runs (memory on vs off) on the same dataset.
 
     ``memory_beneficial`` is the simple ``lift > 0`` convenience flag; the
-    full acceptance gate (lift positive AND harmful rate not rising) lives in
+    full acceptance gate (lift positive AND causal harmed count == 0) lives in
     ``doc/evaluation/acceptance.md``.
+
+    PR-0d: help/harm is attributed *causally* by diffing per-file verdicts
+    across the arms — a file is harmed only if it passed without memory but
+    failed with it (and vice-versa for helped). A deterministic failure that
+    occurs identically in both arms therefore counts as neither, unlike the
+    single-arm ``harmful_influence_rate`` which blames any injected-and-failed
+    file. Falls back to ``causal_attribution_available=False`` when the reports
+    predate PR-0d and carry no per-file lists.
     """
     lift = round(memory_on.overall_correct_rate - memory_off.overall_correct_rate, 4)
+
+    on_passed, on_failed = set(memory_on.passed_files), set(memory_on.failed_files)
+    off_passed, off_failed = set(memory_off.passed_files), set(memory_off.failed_files)
+    available = bool(on_passed or on_failed or off_passed or off_failed)
+    helped = sorted(off_failed & on_passed)
+    harmed = sorted(off_passed & on_failed)
+
     return MemoryAblationComparison(
         on_run_id=memory_on.run_id,
         off_run_id=memory_off.run_id,
@@ -115,4 +132,9 @@ def compare_memory_effectiveness(
         memory_decision_lift=lift,
         harmful_influence_rate_on=memory_on.harmful_influence_rate,
         memory_beneficial=lift > 0.0,
+        memory_helped_files=helped,
+        memory_harmed_files=harmed,
+        memory_helped_count=len(helped),
+        memory_harmed_count=len(harmed),
+        causal_attribution_available=available,
     )
diff --git a/src/tools/memory_replay.py b/src/tools/memory_replay.py
index d98fe37..831bd73 100644
--- a/src/tools/memory_replay.py
+++ b/src/tools/memory_replay.py
@@ -70,29 +70,56 @@ def _pct(value: float) -> str:
     return f"{value * 100:.2f}%"
 
 
+def _file_list(files: list[str], limit: int = 5) -> str:
+    if not files:
+        return "none"
+    shown = ", ".join(files[:limit])
+    return shown if len(files) <= limit else f"{shown}, … (+{len(files) - limit})"
+
+
 def render_ablation_table(cmp: MemoryAblationComparison) -> str:
     """Render the ablation comparison as a plain markdown table.
 
     The verdict line restates the convenience ``memory_beneficial`` flag
-    (lift > 0); the full acceptance gate also requires the harmful-influence
-    rate not to rise over time (see ``doc/evaluation/acceptance.md``).
+    (lift > 0); the full acceptance gate also requires the causal harmed count
+    to be zero (see ``doc/evaluation/acceptance.md``).
+
+    PR-0d: the causal block reports help/harm attributed by cross-arm per-file
+    verdict diff. ``harmful_influence_rate (on)`` is kept but labelled
+    correlational — a deterministic failure inflates it without memory being
+    the cause, which the causal ``memory_harmed`` count avoids.
     """
     lift = cmp.memory_decision_lift
     sign = "+" if lift > 0 else ""
     verdict = "BENEFICIAL (lift > 0)" if cmp.memory_beneficial else "NOT beneficial"
-    return "\n".join(
-        [
-            "| Metric | memory=on | memory=off |",
-            "|---|---|---|",
-            f"| run_id | `{cmp.on_run_id}` | `{cmp.off_run_id}` |",
-            f"| overall_correct_rate | {_pct(cmp.overall_correct_rate_on)} "
-            f"| {_pct(cmp.overall_correct_rate_off)} |",
-            "",
-            f"**memory_decision_lift**: {sign}{lift:.4f} "
-            f"({_pct(lift) if lift >= 0 else '-' + _pct(-lift)})",
-            "",
-            f"**harmful_influence_rate (on)**: {_pct(cmp.harmful_influence_rate_on)}",
-            "",
-            f"**Verdict**: {verdict}",
+    lines = [
+        "| Metric | memory=on | memory=off |",
+        "|---|---|---|",
+        f"| run_id | `{cmp.on_run_id}` | `{cmp.off_run_id}` |",
+        f"| overall_correct_rate | {_pct(cmp.overall_correct_rate_on)} "
+        f"| {_pct(cmp.overall_correct_rate_off)} |",
+        "",
+        f"**memory_decision_lift**: {sign}{lift:.4f} "
+        f"({_pct(lift) if lift >= 0 else '-' + _pct(-lift)})",
+        "",
+    ]
+    if cmp.causal_attribution_available:
+        lines += [
+            f"**Causal attribution (cross-arm per-file diff)**: "
+            f"helped={cmp.memory_helped_count}, harmed={cmp.memory_harmed_count}",
+            f"  - memory_helped (off-fail → on-pass): {_file_list(cmp.memory_helped_files)}",
+            f"  - memory_harmed (off-pass → on-fail): {_file_list(cmp.memory_harmed_files)}",
         ]
-    )
+    else:
+        lines.append(
+            "**Causal attribution**: N/A (reports carry no per-file lists; "
+            "regenerate with PR-0d+ to enable)"
+        )
+    lines += [
+        "",
+        f"**harmful_influence_rate (on, correlational)**: "
+        f"{_pct(cmp.harmful_influence_rate_on)}",
+        "",
+        f"**Verdict**: {verdict}",
+    ]
+    return "\n".join(lines)
diff --git a/tests/unit/test_memory_effectiveness.py b/tests/unit/test_memory_effectiveness.py
index d46e45f..923a471 100644
--- a/tests/unit/test_memory_effectiveness.py
+++ b/tests/unit/test_memory_effectiveness.py
@@ -100,6 +100,9 @@ def test_influenced_counts_intersection_of_injected_and_judged():
     assert report.harmful_influence_count == 1
     assert report.correct_rate_after_influence == 0.5
     assert report.harmful_influence_rate == 0.5
+    # PR-0d: per-file lists persisted (sorted) for offline causal attribution
+    assert report.passed_files == ["a.py", "c.py"]
+    assert report.failed_files == ["b.py"]
 
 
 def test_injected_file_not_judged_is_excluded_from_influence():
diff --git a/tests/unit/test_memory_replay.py b/tests/unit/test_memory_replay.py
index 02e93ca..7ba87c0 100644
--- a/tests/unit/test_memory_replay.py
+++ b/tests/unit/test_memory_replay.py
@@ -14,7 +14,13 @@
 )
 
 
-def _report(run_id: str, correct_rate: float, harmful_rate: float = 0.0):
+def _report(
+    run_id: str,
+    correct_rate: float,
+    harmful_rate: float = 0.0,
+    passed_files: list[str] | None = None,
+    failed_files: list[str] | None = None,
+):
     return MemoryEffectivenessReport(
         run_id=run_id,
         total_judged_decisions=10,
@@ -26,6 +32,8 @@ def _report(run_id: str, correct_rate: float, harmful_rate: float = 0.0):
         harmful_influence_rate=harmful_rate,
         total_tracked_entries=2,
         effective_observations=4,
+        passed_files=passed_files or [],
+        failed_files=failed_files or [],
     )
 
 
@@ -81,6 +89,57 @@ def test_comparison_non_positive_lift_not_beneficial():
     assert cmp.memory_beneficial is False
 
 
+# --- PR-0d causal attribution ----------------------------------------------
+
+
+def test_causal_deterministic_failure_not_blamed_on_memory():
+    """Same per-file verdict in both arms → 0 helped, 0 harmed even though the
+    single-arm harmful_influence_rate is non-zero (the forgejo baseline case)."""
+    on = _report(
+        "on", 0.8125, harmful_rate=0.2, passed_files=["a", "b"], failed_files=["x"]
+    )
+    off = _report("off", 0.8125, passed_files=["a", "b"], failed_files=["x"])
+    cmp = build_ablation_comparison(on, off)
+    assert cmp.causal_attribution_available is True
+    assert cmp.memory_helped_count == 0
+    assert cmp.memory_harmed_count == 0
+    assert cmp.harmful_influence_rate_on == pytest.approx(
+        0.2
+    )  # correlational, unchanged
+
+
+def test_causal_memory_helped_and_harmed():
+    # off fails f1 (on passes it → helped); off passes f2 (on fails it → harmed)
+    on = _report("on", 0.5, passed_files=["f1", "keep"], failed_files=["f2"])
+    off = _report("off", 0.5, passed_files=["f2", "keep"], failed_files=["f1"])
+    cmp = build_ablation_comparison(on, off)
+    assert cmp.memory_helped_files == ["f1"]
+    assert cmp.memory_harmed_files == ["f2"]
+    assert cmp.memory_helped_count == 1
+    assert cmp.memory_harmed_count == 1
+
+
+def test_causal_unavailable_when_no_file_lists():
+    cmp = build_ablation_comparison(_report("on", 0.9), _report("off", 0.7))
+    assert cmp.causal_attribution_available is False
+    assert cmp.memory_helped_count == 0 and cmp.memory_harmed_count == 0
+
+
+def test_render_causal_block_present():
+    on = _report("on", 0.8, passed_files=["a"], failed_files=["x"])
+    off = _report("off", 0.8, passed_files=["a", "x"], failed_files=[])
+    table = render_ablation_table(build_ablation_comparison(on, off))
+    assert "Causal attribution" in table
+    assert "memory_harmed (off-pass → on-fail): x" in table
+
+
+def test_render_causal_na_when_unavailable():
+    table = render_ablation_table(
+        build_ablation_comparison(_report("on", 0.9), _report("off", 0.7))
+    )
+    assert "N/A" in table
+
+
 # --- rendering --------------------------------------------------------------
 
 

From b83d142e29c8b95dd9971bc92592608eecb2d015 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 07:11:28 -0400
Subject: [PATCH 08/22] =?UTF-8?q?feat(memory):=20P1-A=20=E6=8C=81=E4=B9=85?=
 =?UTF-8?q?=E5=8C=96=E8=BD=AF=E5=88=A0=20suppress=EF=BC=88=E5=B7=A9?=
 =?UTF-8?q?=E5=9B=BA=20O-M6=20=E4=B8=B4=E6=97=B6=E8=BF=87=E6=BB=A4?=
 =?UTF-8?q?=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

把 O-M6 读取期、依赖 hit_tracker 存活的临时有害过滤巩固为持久、可审计的
软删状态，使 prune 在 tracker sidecar 丢失/观测不足时不再"复活"，并覆盖
写入/consolidation 侧（堵 F2 无界增长）。

- MemoryEntry 增 suppressed/suppressed_reason（content_hash 不含新字段，
  dedup 身份不变；软删保留可审计行，非物理删除）
- MemoryStore/SQLiteMemoryStore 增 suppress_entry()；SQLite 加两列 +
  ALTER TABLE 迁移旧库；get_relevant_context 与 _consolidate_entries 跳过
  suppressed 条目
- layered_loader._build_l2 过滤改为"suppressed 或命中 harmful_entry_ids"
  （持久 + 实时并存）
- orchestrator 新增 _apply_suppress_harmful_entries：run 末把满足
  suppress_min_observations 的稳定有害条目固化 suppress，豁免 bootstrap/HUMAN
- 新 opt-in 开关 memory.persist_suppress（默认 False，按 P2 先度量再激活）

12 新单测 + 3224 unit 绿（1 个 pre-existing 无关 docs 测试除外），mypy/ruff 干净
---
 src/core/orchestrator.py           |  45 ++++++
 src/memory/layered_loader.py       |   6 +-
 src/memory/models.py               |   2 +
 src/memory/sqlite_store.py         |  57 ++++++-
 src/memory/store.py                |  33 ++++
 src/models/config.py               |  15 ++
 tests/unit/test_memory_suppress.py | 236 +++++++++++++++++++++++++++++
 7 files changed, 389 insertions(+), 5 deletions(-)
 create mode 100644 tests/unit/test_memory_suppress.py

diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py
index bd09ed8..32f1d26 100644
--- a/src/core/orchestrator.py
+++ b/src/core/orchestrator.py
@@ -525,6 +525,10 @@ async def _update_memory(self, phase: str, state: MergeState) -> None:
                 self._apply_outcome_confidence_writeback(state)
             except Exception as exc:
                 logger.warning("Outcome confidence write-back failed: %s", exc)
+            try:
+                self._apply_suppress_harmful_entries(state)
+            except Exception as exc:
+                logger.warning("Harmful-entry suppression failed: %s", exc)
 
     def _apply_outcome_confidence_writeback(self, state: MergeState) -> None:
         """OPP-5: nudge persisted memory confidence toward judge outcomes.
@@ -566,6 +570,47 @@ def _apply_outcome_confidence_writeback(self, state: MergeState) -> None:
                 len(deltas),
             )
 
+    def _apply_suppress_harmful_entries(self, state: MergeState) -> None:
+        """P1-A: persistently soft-delete stably-harmful memory entries.
+
+        Default OFF. When ``persist_suppress`` is on, entries whose accumulated
+        outcome score crosses the harmful threshold with at least
+        ``suppress_min_observations`` observations are marked ``suppressed`` so
+        the prune survives tracker loss across runs (the O-M6 read-time filter
+        recomputes from sidecar observations and resurrects on loss). Human and
+        bootstrap entries are exempt, mirroring OPP-5."""
+        cfg = getattr(self.config, "memory", None)
+        if cfg is None or not getattr(cfg, "persist_suppress", False):
+            return
+        harmful_ids = self._memory_hit_tracker.harmful_entry_ids(
+            min_observations=cfg.suppress_min_observations
+        )
+        if not harmful_ids:
+            return
+        human_files = {
+            fp
+            for fp, record in state.file_decision_records.items()
+            if record.decision_source
+            in (DecisionSource.HUMAN, DecisionSource.BATCH_HUMAN)
+        }
+        suppressed = 0
+        for entry in self._memory_store.to_memory().entries:
+            if entry.entry_id not in harmful_ids or entry.suppressed:
+                continue
+            if _BOOTSTRAP_TAG in entry.tags:
+                continue
+            if human_files and any(fp in human_files for fp in entry.file_paths):
+                continue
+            self._memory_store = self._memory_store.suppress_entry(
+                entry.entry_id, reason="P1-A: stably-harmful judge outcomes"
+            )
+            suppressed += 1
+        if suppressed:
+            logger.info(
+                "P1-A: persistently suppressed %d stably-harmful memory entries",
+                suppressed,
+            )
+
     def _should_llm_extract(self, phase: str, state: MergeState) -> bool:
         cfg = getattr(self.config, "memory", None)
         if cfg is None or not cfg.llm_extraction:
diff --git a/src/memory/layered_loader.py b/src/memory/layered_loader.py
index 2ae270f..4a50282 100644
--- a/src/memory/layered_loader.py
+++ b/src/memory/layered_loader.py
@@ -123,7 +123,11 @@ def _build_l2(self, file_paths: list[str]) -> tuple[str, int]:
         lines: list[str] = []
         injected_ids: list[str] = []
         for entry in relevant:
-            if entry.entry_id in harmful_ids:
+            # P1-A: persistent suppress (entry.suppressed) OR realtime harmful
+            # (tracker observations this process). get_relevant_context already
+            # drops suppressed at the source; this keeps the read path correct
+            # even if a caller passes pre-fetched entries.
+            if entry.suppressed or entry.entry_id in harmful_ids:
                 continue
             if not _has_path_overlap(entry.file_paths, file_paths):
                 continue
diff --git a/src/memory/models.py b/src/memory/models.py
index 288ae0e..9311ef9 100644
--- a/src/memory/models.py
+++ b/src/memory/models.py
@@ -34,6 +34,8 @@ class MemoryEntry(BaseModel, frozen=True):
     confidence_level: ConfidenceLevel = Field(default=ConfidenceLevel.INFERRED)
     content_hash: str = Field(default="")
     created_at: datetime = Field(default_factory=datetime.now)
+    suppressed: bool = Field(default=False)
+    suppressed_reason: str | None = Field(default=None)
 
     def model_post_init(self, __context: Any) -> None:
         if not self.content_hash:
diff --git a/src/memory/sqlite_store.py b/src/memory/sqlite_store.py
index 167e140..202a208 100644
--- a/src/memory/sqlite_store.py
+++ b/src/memory/sqlite_store.py
@@ -35,7 +35,9 @@
     confidence        REAL NOT NULL,
     confidence_level  TEXT NOT NULL,
     content_hash      TEXT NOT NULL,
-    created_at        TEXT NOT NULL
+    created_at        TEXT NOT NULL,
+    suppressed        INTEGER NOT NULL DEFAULT 0,
+    suppressed_reason TEXT
 );
 CREATE UNIQUE INDEX IF NOT EXISTS idx_content_hash
     ON memory_entries (content_hash);
@@ -52,10 +54,25 @@
 _INSERT_ENTRY = """
 INSERT OR IGNORE INTO memory_entries
     (entry_id, entry_type, phase, content, file_paths, tags,
-     confidence, confidence_level, content_hash, created_at)
-VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+     confidence, confidence_level, content_hash, created_at,
+     suppressed, suppressed_reason)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
 """
 
+# P1-A: columns added after the original schema shipped; older memory.db files
+# predate them. Each ALTER TABLE ADD COLUMN preserves existing rows; it runs on
+# every open, but only for columns that PRAGMA table_info reports as missing.
+_MIGRATIONS = (
+    (
+        "suppressed",
+        "ALTER TABLE memory_entries ADD COLUMN suppressed INTEGER NOT NULL DEFAULT 0",
+    ),
+    (
+        "suppressed_reason",
+        "ALTER TABLE memory_entries ADD COLUMN suppressed_reason TEXT",
+    ),
+)
+
 _PHASE_ORDER = {
     "planning": 0,
     "auto_merge": 1,
@@ -64,7 +81,7 @@
 }
 
 
-def _entry_to_row(entry: MemoryEntry) -> tuple[str, ...]:
+def _entry_to_row(entry: MemoryEntry) -> tuple[str | int | None, ...]:
     return (
         entry.entry_id,
         entry.entry_type.value,
@@ -76,10 +93,13 @@ def _entry_to_row(entry: MemoryEntry) -> tuple[str, ...]:
         entry.confidence_level.value,
         entry.content_hash,
         entry.created_at.isoformat(),
+        1 if entry.suppressed else 0,
+        entry.suppressed_reason,
     )
 
 
 def _row_to_entry(row: sqlite3.Row) -> MemoryEntry:
+    keys = row.keys()
     return MemoryEntry(
         entry_id=row["entry_id"],
         entry_type=MemoryEntryType(row["entry_type"]),
@@ -91,6 +111,10 @@ def _row_to_entry(row: sqlite3.Row) -> MemoryEntry:
         confidence_level=ConfidenceLevel(row["confidence_level"]),
         content_hash=row["content_hash"],
         created_at=datetime.fromisoformat(row["created_at"]),
+        suppressed=bool(row["suppressed"]) if "suppressed" in keys else False,
+        suppressed_reason=(
+            row["suppressed_reason"] if "suppressed_reason" in keys else None
+        ),
     )
 
 
@@ -129,6 +153,7 @@ def _init_db(self) -> None:
                 conn.execute("PRAGMA busy_timeout=5000")
                 conn.execute("PRAGMA journal_mode=WAL")
                 conn.executescript(_CREATE_SCHEMA)
+                self._apply_migrations(conn)
                 return
             except sqlite3.OperationalError as exc:
                 last_exc = exc
@@ -138,6 +163,15 @@ def _init_db(self) -> None:
         assert last_exc is not None
         raise last_exc
 
+    def _apply_migrations(self, conn: sqlite3.Connection) -> None:
+        existing = {
+            row[1]
+            for row in conn.execute("PRAGMA table_info(memory_entries)").fetchall()
+        }
+        for column, ddl in _MIGRATIONS:
+            if column not in existing:
+                conn.execute(ddl)
+
     @contextmanager
     def _conn(self) -> Generator[sqlite3.Connection, None, None]:
         conn = sqlite3.connect(str(self._db_path), timeout=5.0)
@@ -221,6 +255,19 @@ def adjust_confidence(self, deltas: dict[str, float]) -> "SQLiteMemoryStore":
                 )
         return self
 
+    def suppress_entry(self, entry_id: str, reason: str) -> "SQLiteMemoryStore":
+        """P1-A: persistently soft-delete an entry (audit-preserving).
+
+        Sets ``suppressed=1`` + ``suppressed_reason`` via UPDATE; the row stays
+        for audit/reversal. Already-suppressed or unknown ids are no-ops."""
+        with self._conn() as conn:
+            conn.execute(
+                "UPDATE memory_entries SET suppressed = 1, suppressed_reason = ? "
+                "WHERE entry_id = ? AND suppressed = 0",
+                (reason, entry_id),
+            )
+        return self
+
     def set_codebase_profile(self, key: str, value: str) -> "SQLiteMemoryStore":
         with self._conn() as conn:
             conn.execute(
@@ -274,6 +321,8 @@ def get_relevant_context(
         scored: dict[str, tuple[float, MemoryEntry]] = {}
         for row in rows:
             entry = _row_to_entry(row)
+            if entry.suppressed:
+                continue
             entry_fps: list[str] = json.loads(row["file_paths"])
             path_score = score_path_overlap(file_paths, entry_fps)
 
diff --git a/src/memory/store.py b/src/memory/store.py
index 4ca7aed..16199d4 100644
--- a/src/memory/store.py
+++ b/src/memory/store.py
@@ -69,6 +69,31 @@ def adjust_confidence(self, deltas: dict[str, float]) -> MemoryStore:
         new_memory = self._memory.model_copy(update={"entries": entries})
         return MemoryStore(new_memory)
 
+    def suppress_entry(self, entry_id: str, reason: str) -> MemoryStore:
+        """P1-A: persistently soft-delete an entry (audit-preserving).
+
+        Marks ``suppressed=True`` + records ``reason`` rather than removing
+        the row, so the decision stays auditable and reversible. Suppressed
+        entries are skipped at injection (``get_relevant_context``) and at
+        consolidation. Idempotent and immutable; an unknown ``entry_id`` or an
+        already-suppressed entry returns ``self`` unchanged."""
+        changed = False
+        entries: list[MemoryEntry] = []
+        for entry in self._memory.entries:
+            if entry.entry_id == entry_id and not entry.suppressed:
+                entries.append(
+                    entry.model_copy(
+                        update={"suppressed": True, "suppressed_reason": reason}
+                    )
+                )
+                changed = True
+            else:
+                entries.append(entry)
+        if not changed:
+            return self
+        new_memory = self._memory.model_copy(update={"entries": entries})
+        return MemoryStore(new_memory)
+
     def set_codebase_profile(self, key: str, value: str) -> MemoryStore:
         profile = {**self._memory.codebase_profile, key: value}
         new_memory = self._memory.model_copy(update={"codebase_profile": profile})
@@ -118,6 +143,8 @@ def get_relevant_context(
         ref_short = current_upstream_ref[:8] if current_upstream_ref else ""
         scored: dict[str, tuple[float, MemoryEntry]] = {}
         for entry in self._memory.entries:
+            if entry.suppressed:
+                continue
             path_score = score_path_overlap(file_paths, entry.file_paths)
 
             confidence = entry.confidence
@@ -324,6 +351,12 @@ class (``c_class`` / ``conflict_decision``) shared across directories, so
     ungroupable: list[MemoryEntry] = []
 
     for entry in entries:
+        # P1-A: suppressed entries pass through untouched — they must not be
+        # merged into a live blob (that would resurrect harmful content) nor
+        # silently dropped (audit trail must survive consolidation).
+        if entry.suppressed:
+            ungroupable.append(entry)
+            continue
         primary_tag = entry.tags[0] if entry.tags else ""
         key = (
             entry.phase,
diff --git a/src/models/config.py b/src/models/config.py
index 872fe91..2afb0f7 100644
--- a/src/models/config.py
+++ b/src/models/config.py
@@ -970,6 +970,21 @@ class MemoryExtractionConfig(BaseModel):
         "(merge eval-memory). Extraction/write-back are unaffected — only "
         "read-time injection is suppressed. Default True (normal behaviour).",
     )
+    persist_suppress: bool = Field(
+        default=False,
+        description="P1-A: at run end, persistently soft-delete (suppress) "
+        "memory entries judged stably harmful (>= suppress_min_observations "
+        "pass/fail observations with a net-negative outcome). Default OFF — "
+        "like OPP-5 write-back, suppression is cross-run durable and should "
+        "prove out on the eval-memory ablation before enabling. Never touches "
+        "human-decided or bootstrap (human-authored) entries.",
+    )
+    suppress_min_observations: int = Field(
+        default=3,
+        ge=1,
+        description="P1-A: minimum pass+fail observations before a harmful "
+        "entry is persistently suppressed, so a single run cannot prune it.",
+    )
 
 
 class RenameDetectionConfig(BaseModel):
diff --git a/tests/unit/test_memory_suppress.py b/tests/unit/test_memory_suppress.py
new file mode 100644
index 0000000..c3988d2
--- /dev/null
+++ b/tests/unit/test_memory_suppress.py
@@ -0,0 +1,236 @@
+"""P1-A: persistent soft-delete (suppress) across both stores + orchestrator hook.
+
+The O-M6 harmful filter is read-time and tracker-dependent — a lost sidecar
+resurrects pruned entries. P1-A persists the prune as an auditable
+``suppressed`` flag so it survives tracker loss, and blocks suppressed entries
+from injection AND consolidation.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from src.memory.models import MemoryEntry, MemoryEntryType
+from src.memory.sqlite_store import SQLiteMemoryStore
+from src.memory.store import MemoryStore, _consolidate_entries
+
+
+def _entry(content: str, file_paths: list[str], confidence: float = 0.8) -> MemoryEntry:
+    return MemoryEntry(
+        entry_type=MemoryEntryType.DECISION,
+        phase="conflict_analysis",
+        content=content,
+        file_paths=file_paths,
+        confidence=confidence,
+    )
+
+
+# --- model ------------------------------------------------------------------
+
+
+def test_suppressed_defaults_false_and_hash_unchanged():
+    a = _entry("x", ["a.py"])
+    # suppressing must not change dedup identity (content_hash excludes the flag)
+    b = a.model_copy(update={"suppressed": True, "suppressed_reason": "harmful"})
+    assert a.suppressed is False and a.suppressed_reason is None
+    assert b.suppressed is True and b.suppressed_reason == "harmful"
+    assert a.content_hash == b.content_hash
+
+
+# --- MemoryStore ------------------------------------------------------------
+
+
+def test_memstore_suppress_is_immutable_and_marks_flag():
+    e = _entry("bad", ["a.py"])
+    store = MemoryStore().add_entry(e)
+    new = store.suppress_entry(e.entry_id, "stably harmful")
+    assert new is not store  # new instance
+    assert store.to_memory().entries[0].suppressed is False  # original untouched
+    marked = new.to_memory().entries[0]
+    assert marked.suppressed is True
+    assert marked.suppressed_reason == "stably harmful"
+
+
+def test_memstore_suppress_unknown_and_double_are_noops():
+    e = _entry("bad", ["a.py"])
+    store = MemoryStore().add_entry(e)
+    assert store.suppress_entry("nope", "x") is store
+    once = store.suppress_entry(e.entry_id, "r")
+    assert once.suppress_entry(e.entry_id, "r2") is once  # already suppressed
+
+
+def test_memstore_suppressed_excluded_from_relevant():
+    e = _entry("bad", ["a.py"])
+    store = MemoryStore().add_entry(e)
+    assert store.get_relevant_context(["a.py"])  # visible before
+    suppressed = store.suppress_entry(e.entry_id, "harmful")
+    assert suppressed.get_relevant_context(["a.py"]) == []
+
+
+# --- consolidation ----------------------------------------------------------
+
+
+def test_consolidation_passes_suppressed_through_untouched():
+    # 3 same-group live entries would merge; a suppressed sibling must survive
+    # standalone (not merged, not dropped) to keep the audit trail.
+    live = [_entry(f"c{i}", ["pkg/x/a.py"]) for i in range(3)]
+    suppressed = _entry("harmful", ["pkg/x/a.py"]).model_copy(
+        update={"suppressed": True, "suppressed_reason": "r"}
+    )
+    out = _consolidate_entries([*live, suppressed])
+    surviving = [e for e in out if e.suppressed]
+    assert len(surviving) == 1
+    assert surviving[0].suppressed_reason == "r"
+    # the 3 live ones collapsed into a single consolidated blob
+    assert sum(1 for e in out if not e.suppressed) == 1
+
+
+# --- SQLiteMemoryStore ------------------------------------------------------
+
+
+def test_sqlite_suppress_persists_and_excludes(tmp_path):
+    db = tmp_path / "m.db"
+    store = SQLiteMemoryStore.open(db)
+    e = _entry("bad", ["a.py"])
+    store.add_entry(e)
+    assert store.get_relevant_context(["a.py"])
+    store.suppress_entry(e.entry_id, "harmful")
+    assert store.get_relevant_context(["a.py"]) == []
+    # reopen: flag persisted on disk
+    reopened = SQLiteMemoryStore.open(db)
+    row = next(x for x in reopened.to_memory().entries if x.entry_id == e.entry_id)
+    assert row.suppressed is True and row.suppressed_reason == "harmful"
+
+
+def test_sqlite_migration_adds_columns_to_legacy_db(tmp_path):
+    """A pre-P1-A schema (no suppressed columns) must migrate on open without
+    data loss, defaulting existing rows to suppressed=False."""
+    import sqlite3
+
+    db = tmp_path / "legacy.db"
+    conn = sqlite3.connect(str(db))
+    conn.executescript(
+        """
+        CREATE TABLE memory_entries (
+            entry_id TEXT PRIMARY KEY, entry_type TEXT NOT NULL, phase TEXT NOT NULL,
+            content TEXT NOT NULL, file_paths TEXT NOT NULL, tags TEXT NOT NULL,
+            confidence REAL NOT NULL, confidence_level TEXT NOT NULL,
+            content_hash TEXT NOT NULL, created_at TEXT NOT NULL
+        );
+        INSERT INTO memory_entries VALUES
+            ('id1','decision','planning','legacy','["a.py"]','[]',0.8,
+             'inferred','hash1','2026-01-01T00:00:00');
+        """
+    )
+    conn.commit()
+    conn.close()
+
+    store = SQLiteMemoryStore.open(db)  # migration runs here
+    entries = store.to_memory().entries
+    assert len(entries) == 1
+    assert entries[0].suppressed is False
+    # and suppression now works on the migrated row
+    store.suppress_entry("id1", "harmful")
+    assert store.get_relevant_context(["a.py"]) == []
+
+
+def test_sqlite_suppress_unknown_is_noop(tmp_path):
+    store = SQLiteMemoryStore.open(tmp_path / "m.db")
+    e = _entry("ok", ["a.py"])
+    store.add_entry(e)
+    store.suppress_entry("nope", "x")  # must not raise
+    assert store.get_relevant_context(["a.py"])  # untouched
+
+
+# --- parity -----------------------------------------------------------------
+
+
+def test_both_stores_agree_suppressed_is_hidden(tmp_path):
+    e = _entry("bad", ["a.py"])
+    mem = MemoryStore().add_entry(e).suppress_entry(e.entry_id, "r")
+    sq = SQLiteMemoryStore.open(tmp_path / "m.db")
+    sq.add_entry(e)
+    sq.suppress_entry(e.entry_id, "r")
+    assert mem.get_relevant_context(["a.py"]) == sq.get_relevant_context(["a.py"]) == []
+
+
+# --- orchestrator hook (_apply_suppress_harmful_entries) ---------------------
+
+
+def _track_fails(tracker, entry_id: str, n: int) -> None:
+    for i in range(n):
+        f = f"{entry_id}-obs{i}"
+        tracker.record_injection([f], [entry_id])
+        tracker.record_outcome(f, success=False)
+
+
+def _orch(persist: bool, min_obs: int = 3):
+    from types import SimpleNamespace
+
+    from src.core.orchestrator import Orchestrator
+    from src.memory.hit_tracker import MemoryHitTracker
+    from src.models.config import MemoryExtractionConfig
+
+    orch = Orchestrator.__new__(Orchestrator)
+    orch._memory_hit_tracker = MemoryHitTracker()
+    orch._memory_store = MemoryStore()
+    orch.config = SimpleNamespace(
+        memory=MemoryExtractionConfig(
+            persist_suppress=persist, suppress_min_observations=min_obs
+        )
+    )
+    return orch
+
+
+def test_persist_suppress_off_by_default():
+    from src.models.config import MemoryExtractionConfig
+
+    assert MemoryExtractionConfig().persist_suppress is False
+
+    from types import SimpleNamespace
+
+    orch = _orch(persist=False)
+    e = _entry("harm", ["src/a.py"])
+    orch._memory_store = orch._memory_store.add_entry(e)
+    _track_fails(orch._memory_hit_tracker, e.entry_id, 3)
+    orch._apply_suppress_harmful_entries(SimpleNamespace(file_decision_records={}))
+    assert orch._memory_store.to_memory().entries[0].suppressed is False
+
+
+def test_persist_suppress_marks_stable_harmful_skips_human_and_bootstrap():
+    from types import SimpleNamespace
+
+    from src.models.decision import DecisionSource
+
+    orch = _orch(persist=True, min_obs=3)
+    harmful = _entry("harm", ["src/a.py"])
+    human = _entry("human", ["src/secret.py"])
+    boot = _entry("boot", []).model_copy(update={"tags": ["bootstrap"]})
+    store = orch._memory_store
+    for e in (harmful, human, boot):
+        store = store.add_entry(e)
+        _track_fails(orch._memory_hit_tracker, e.entry_id, 3)
+    orch._memory_store = store
+
+    state = SimpleNamespace(
+        file_decision_records={
+            "src/secret.py": SimpleNamespace(decision_source=DecisionSource.HUMAN)
+        }
+    )
+    orch._apply_suppress_harmful_entries(state)
+
+    by_id = {e.entry_id: e for e in orch._memory_store.to_memory().entries}
+    assert by_id[harmful.entry_id].suppressed is True
+    assert by_id[human.entry_id].suppressed is False  # human-decided exempt
+    assert by_id[boot.entry_id].suppressed is False  # bootstrap exempt
+
+
+def test_persist_suppress_respects_min_observations():
+    from types import SimpleNamespace
+
+    orch = _orch(persist=True, min_obs=3)
+    e = _entry("harm", ["src/a.py"])
+    orch._memory_store = orch._memory_store.add_entry(e)
+    _track_fails(orch._memory_hit_tracker, e.entry_id, 2)  # below threshold
+    orch._apply_suppress_harmful_entries(SimpleNamespace(file_decision_records={}))
+    assert orch._memory_store.to_memory().entries[0].suppressed is False

From 6b4f905dacc0d9ea7c45a3aed3e5818502ee79c2 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 07:20:10 -0400
Subject: [PATCH 09/22] =?UTF-8?q?feat(memory):=20P1-B=20=E6=89=A7=E8=A1=8C?=
 =?UTF-8?q?=E6=8E=A5=E5=9C=B0=E5=86=99=E5=9B=9E=E4=BF=A1=E5=8F=B7=E8=9E=8D?=
 =?UTF-8?q?=E5=90=88=EF=BC=88judge=20+=20compile=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OPP-5 写回此前只用 judge pass/fail，而 record_outcome 又跑在 build_check
之前——一个 judge 判过但编译失败的合并仍把产生它的记忆记为 helpful。本提交
把每文件 outcome 记录从 judge_review 阶段集中到 orchestrator 的 judge_review
记忆钩子（此时 verdict 已反映 build_check 降级），并按确定性信号融合：

- 新增 _record_memory_outcomes：默认 ["judge"] 与旧行为逐字节等价；含 "compile"
  时，build_check_failed 的 run 把 judge-passed 的已编译语言文件 demote 为失败，
  非编译文件（如 .md）不受牵连
- judge_review.py 移除 inline record_outcome（避免在 build_check 前误记）
- 写回为双向（OPP-5 已有 +Δ/−Δ），harmful 跌破阈值由紧随其后的 P1-A
  _apply_suppress_harmful_entries 固化 suppress——无需新代码
- config.memory 增 writeback_signal_sources: list[Literal["judge","compile"]]
  （默认 ["judge"]，opt-in 加固，全确定性不引 LLM 自报）

CI/partial_failure 信号有意延后：post-merge 确定性发现在 report_generation
产出，晚于本钩子；完整融合需把写回迁到 report 阶段（见 plan P1-B）

7 新单测 + 3231 unit 绿（1 pre-existing 无关 docs 测试除外），mypy/ruff 干净
---
 src/core/orchestrator.py              |  33 ++++++
 src/core/phases/judge_review.py       |  14 ++-
 src/models/config.py                  |  15 ++-
 tests/unit/test_p1b_outcome_fusion.py | 140 ++++++++++++++++++++++++++
 4 files changed, 193 insertions(+), 9 deletions(-)
 create mode 100644 tests/unit/test_p1b_outcome_fusion.py

diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py
index 32f1d26..8fe7bc9 100644
--- a/src/core/orchestrator.py
+++ b/src/core/orchestrator.py
@@ -521,6 +521,10 @@ async def _update_memory(self, phase: str, state: MergeState) -> None:
                 logger.warning("LLM memory extraction failed for %s: %s", phase, exc)
 
         if phase == "judge_review":
+            try:
+                self._record_memory_outcomes(state)
+            except Exception as exc:
+                logger.warning("Memory outcome recording failed: %s", exc)
             try:
                 self._apply_outcome_confidence_writeback(state)
             except Exception as exc:
@@ -530,6 +534,35 @@ async def _update_memory(self, phase: str, state: MergeState) -> None:
             except Exception as exc:
                 logger.warning("Harmful-entry suppression failed: %s", exc)
 
+    def _record_memory_outcomes(self, state: MergeState) -> None:
+        """P1-B: fuse deterministic signals into the per-file memory outcome
+        that feeds OPP-5 write-back and P1-A suppression, then credit/blame the
+        entries injected for each file.
+
+        Runs once after judge_review — the verdict already reflects the
+        post-judge build check. With the default ``["judge"]`` this reproduces
+        the prior passed/failed split byte-for-byte. Adding ``"compile"`` demotes
+        a judge-passed compiled-language file to a failure when the build check
+        failed this run, so memory that produced an uncompilable merge earns no
+        credit. Deterministic only — no LLM self-report."""
+        verdict = state.judge_verdict
+        if verdict is None:
+            return
+        cfg = getattr(self.config, "memory", None)
+        sources = list(getattr(cfg, "writeback_signal_sources", None) or ["judge"])
+        tracker = self._memory_hit_tracker
+        demoted: frozenset[str] = frozenset()
+        if "compile" in sources and any(
+            issue.issue_type == "build_check_failed" for issue in verdict.issues
+        ):
+            from src.tools.compile_gate import compiled_language_paths
+
+            demoted = frozenset(compiled_language_paths(verdict.passed_files))
+        for fp in verdict.passed_files:
+            tracker.record_outcome(fp, success=fp not in demoted)
+        for fp in verdict.failed_files:
+            tracker.record_outcome(fp, success=False)
+
     def _apply_outcome_confidence_writeback(self, state: MergeState) -> None:
         """OPP-5: nudge persisted memory confidence toward judge outcomes.
 
diff --git a/src/core/phases/judge_review.py b/src/core/phases/judge_review.py
index 76f0b98..060699f 100644
--- a/src/core/phases/judge_review.py
+++ b/src/core/phases/judge_review.py
@@ -217,14 +217,12 @@ async def execute(self, state: MergeState, ctx: PhaseContext) -> PhaseOutcome:
         )
         state.phase_results[MergePhase.JUDGE_REVIEW.value] = phase_result
 
-        # O-M4: credit/blame memory entries based on the final verdict's
-        # passed/failed file lists. Outcomes accumulate across runs via the
-        # tracker's sidecar JSON; future runs use them to bias confidence.
-        if state.judge_verdict is not None and ctx.memory_hit_tracker is not None:
-            for fp in state.judge_verdict.passed_files:
-                ctx.memory_hit_tracker.record_outcome(fp, success=True)
-            for fp in state.judge_verdict.failed_files:
-                ctx.memory_hit_tracker.record_outcome(fp, success=False)
+        # O-M4 / P1-B: credit/blame memory entries by the final verdict's
+        # passed/failed files. Recording is centralised in the orchestrator's
+        # post-judge_review memory hook (``_record_memory_outcomes``) so it sees
+        # the verdict AFTER the post-judge build check ran and can fuse the
+        # compile signal — recording here would run before build_check and
+        # credit a judge-passed but uncompilable file.
 
         gate_ok = await run_gates(state, ctx, "judge_review")
         if not gate_ok:
diff --git a/src/models/config.py b/src/models/config.py
index 2afb0f7..526b50d 100644
--- a/src/models/config.py
+++ b/src/models/config.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import Any, Literal, Optional
+from typing import Any, Literal, Optional, cast
 from pydantic import BaseModel, Field, field_validator, model_validator
 
 
@@ -961,6 +961,19 @@ class MemoryExtractionConfig(BaseModel):
         description="OPP-5: minimum pass+fail observations before an entry's "
         "confidence is nudged, so a single run cannot move it.",
     )
+    writeback_signal_sources: list[Literal["judge", "compile"]] = Field(
+        default_factory=lambda: cast(list[Literal["judge", "compile"]], ["judge"]),
+        description="P1-B: deterministic signals fused into the per-file memory "
+        "outcome that drives OPP-5 write-back and P1-A suppression. 'judge' = "
+        "the Judge verdict's passed/failed split (default — byte-identical to "
+        "prior behaviour). Adding 'compile' demotes a judge-passed "
+        "compiled-language file to a failure when the post-judge build check "
+        "failed this run, so memory that produced an uncompilable merge is not "
+        "credited. All sources are deterministic — no LLM self-report. "
+        "CI/partial_failure fusion is deferred: the post-merge deterministic "
+        "findings land in report_generation, after this hook (see "
+        "doc/plan/self-learning-system.md P1-B).",
+    )
     inject_enabled: bool = Field(
         default=True,
         description="P0 ablation switch: when False, no memory context is "
diff --git a/tests/unit/test_p1b_outcome_fusion.py b/tests/unit/test_p1b_outcome_fusion.py
new file mode 100644
index 0000000..80dfff8
--- /dev/null
+++ b/tests/unit/test_p1b_outcome_fusion.py
@@ -0,0 +1,140 @@
+"""P1-B: fuse deterministic signals (judge + compile) into the per-file memory
+outcome that drives OPP-5 write-back / P1-A suppression.
+
+Recording moved out of judge_review into the orchestrator's post-phase memory
+hook so the verdict reflects the post-judge build check. With the default
+``["judge"]`` the split is byte-identical to the old behaviour; adding
+``"compile"`` demotes a judge-passed compiled-language file when the build
+check failed this run.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from types import SimpleNamespace
+
+from src.core.orchestrator import Orchestrator
+from src.memory.hit_tracker import MemoryHitTracker
+from src.models.config import MemoryExtractionConfig
+from src.models.judge import IssueSeverity, JudgeIssue, JudgeVerdict, VerdictType
+
+
+def _verdict(passed, failed, *, build_failed: bool = False) -> JudgeVerdict:
+    issues = []
+    if build_failed:
+        issues.append(
+            JudgeIssue(
+                file_path="(build)",
+                issue_level=IssueSeverity.CRITICAL,
+                issue_type="build_check_failed",
+                description="compile broke",
+                veto_condition="Build check failed",
+            )
+        )
+    return JudgeVerdict(
+        verdict=VerdictType.FAIL if build_failed else VerdictType.PASS,
+        reviewed_files_count=len(passed) + len(failed),
+        passed_files=list(passed),
+        failed_files=list(failed),
+        conditional_files=[],
+        issues=issues,
+        critical_issues_count=1 if build_failed else 0,
+        high_issues_count=0,
+        overall_confidence=0.9,
+        summary="x",
+        blocking_issues=[],
+        timestamp=datetime(2026, 1, 1),
+        judge_model="m",
+    )
+
+
+def _orch(sources: list[str], verdict: JudgeVerdict) -> Orchestrator:
+    orch = Orchestrator.__new__(Orchestrator)
+    orch._memory_hit_tracker = MemoryHitTracker()
+    orch.config = SimpleNamespace(
+        memory=MemoryExtractionConfig(writeback_signal_sources=sources)
+    )
+    orch._verdict_for_test = verdict  # convenience handle in assertions
+    return orch
+
+
+def _state(verdict: JudgeVerdict) -> SimpleNamespace:
+    return SimpleNamespace(judge_verdict=verdict)
+
+
+def _inject(tracker: MemoryHitTracker, file_path: str, entry_id: str) -> None:
+    tracker.record_injection([file_path], [entry_id])
+
+
+# --- config -----------------------------------------------------------------
+
+
+def test_default_sources_is_judge_only():
+    assert MemoryExtractionConfig().writeback_signal_sources == ["judge"]
+
+
+# --- judge-only equivalence -------------------------------------------------
+
+
+def test_judge_only_records_pass_and_fail_split():
+    v = _verdict(["a.py"], ["b.py"])
+    orch = _orch(["judge"], v)
+    _inject(orch._memory_hit_tracker, "a.py", "e_pass")
+    _inject(orch._memory_hit_tracker, "b.py", "e_fail")
+
+    orch._record_memory_outcomes(_state(v))
+
+    scores = orch._memory_hit_tracker.outcome_scores(min_observations=1)
+    assert scores["e_pass"] == 1.0
+    assert scores["e_fail"] == -1.0
+
+
+def test_compile_source_but_build_passed_is_judge_equivalent():
+    v = _verdict(["a.go"], [])  # no build_check_failed issue
+    orch = _orch(["judge", "compile"], v)
+    _inject(orch._memory_hit_tracker, "a.go", "e")
+    orch._record_memory_outcomes(_state(v))
+    assert orch._memory_hit_tracker.outcome_scores(min_observations=1)["e"] == 1.0
+
+
+# --- compile fusion ---------------------------------------------------------
+
+
+def test_compile_failure_demotes_compiled_passed_file():
+    # judge passed a.go, but the build broke → a.go's memory is blamed, not
+    # credited, even though judge said pass.
+    v = _verdict(["a.go"], [], build_failed=True)
+    orch = _orch(["judge", "compile"], v)
+    _inject(orch._memory_hit_tracker, "a.go", "e_go")
+    orch._record_memory_outcomes(_state(v))
+    assert orch._memory_hit_tracker.outcome_scores(min_observations=1)["e_go"] == -1.0
+
+
+def test_compile_failure_does_not_demote_non_compiled_passed_file():
+    # a build break must not blame a Markdown file the compiler never touches.
+    v = _verdict(["a.go", "README.md"], [], build_failed=True)
+    orch = _orch(["judge", "compile"], v)
+    _inject(orch._memory_hit_tracker, "a.go", "e_go")
+    _inject(orch._memory_hit_tracker, "README.md", "e_md")
+    orch._record_memory_outcomes(_state(v))
+    scores = orch._memory_hit_tracker.outcome_scores(min_observations=1)
+    assert scores["e_go"] == -1.0
+    assert scores["e_md"] == 1.0  # non-compiled → still credited
+
+
+def test_compile_not_in_sources_keeps_judge_credit_on_build_fail():
+    # build failed, but operator opted out of compile fusion → judge split wins.
+    v = _verdict(["a.go"], [], build_failed=True)
+    orch = _orch(["judge"], v)
+    _inject(orch._memory_hit_tracker, "a.go", "e_go")
+    orch._record_memory_outcomes(_state(v))
+    assert orch._memory_hit_tracker.outcome_scores(min_observations=1)["e_go"] == 1.0
+
+
+# --- guards -----------------------------------------------------------------
+
+
+def test_no_verdict_is_noop():
+    orch = _orch(["judge", "compile"], _verdict([], []))
+    orch._record_memory_outcomes(SimpleNamespace(judge_verdict=None))  # must not raise
+    assert orch._memory_hit_tracker.outcome_scores(min_observations=1) == {}

From 100a33763f8c3c380c952e1a3eef7e2c5a8497dc Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 07:24:47 -0400
Subject: [PATCH 10/22] =?UTF-8?q?chore(release):=20=E9=85=8D=E7=BD=AE=20Py?=
 =?UTF-8?q?PI=20=E5=8F=91=E5=B8=83=E5=85=83=E6=95=B0=E6=8D=AE=E4=B8=8E=20G?=
 =?UTF-8?q?itHub=20Actions=20release=20workflow?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- pyproject.toml 补充 description / readme / license / authors / classifiers / urls
- 新增 LICENSE（MIT）
- 新增 .github/workflows/release.yml：v* tag 触发 → 构建 Web UI → 打 wheel → Trusted Publishing 发布到 PyPI
---
 .github/workflows/release.yml | 61 +++++++++++++++++++++++++++++++++++
 LICENSE                       | 21 ++++++++++++
 pyproject.toml                | 19 +++++++++++
 3 files changed, 101 insertions(+)
 create mode 100644 .github/workflows/release.yml
 create mode 100644 LICENSE

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..de5f258
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,61 @@
+name: Release to PyPI
+
+on:
+  push:
+    tags:
+      - "v*"
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    name: Build wheel
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+          cache: npm
+          cache-dependency-path: web/package-lock.json
+
+      - name: Build Web UI
+        working-directory: web
+        run: npm ci && npm run build
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Build wheel and sdist
+        run: |
+          pip install hatch
+          hatch build
+
+      - name: Upload dist artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: dist
+          path: dist/
+
+  publish:
+    name: Publish to PyPI
+    needs: build
+    runs-on: ubuntu-latest
+    permissions: {id-token: write}  # job-scoped: required for Trusted Publishing (OIDC)
+    environment:
+      name: pypi
+      url: https://pypi.org/project/code-merge-system/
+    steps:
+      - name: Download dist artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: dist
+          path: dist/
+
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8fec473
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Angel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/pyproject.toml b/pyproject.toml
index 82203b5..76fa56a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,6 +5,20 @@ build-backend = "hatchling.build"
 [project]
 name = "code-merge-system"
 version = "0.1.0"
+description = "AI-powered code merge agent with browser UI — plan, review, and resolve conflicts across long fork histories"
+readme = "README.md"
+license = { text = "MIT" }
+authors = [{ name = "Angel", email = "angel.gosick@gmail.com" }]
+keywords = ["merge", "git", "ai", "llm", "code-review", "conflict-resolution"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Version Control :: Git",
+]
 requires-python = ">=3.11"
 dependencies = [
     "pydantic>=2.5",
@@ -22,6 +36,11 @@ dependencies = [
     "platformdirs>=4.0",
 ]
 
+[project.urls]
+Homepage = "https://github.com/angel/code-merge-system"
+Repository = "https://github.com/angel/code-merge-system"
+Issues = "https://github.com/angel/code-merge-system/issues"
+
 [project.scripts]
 merge = "src.cli.main:cli"
 

From 6bc77c37f3613fcc6f1fcd9abe914b424bf1e0fc Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 07:25:38 -0400
Subject: [PATCH 11/22] =?UTF-8?q?feat(memory):=20P1-C=20verified-repair=20?=
 =?UTF-8?q?=E7=BB=8F=E9=AA=8C=E5=BA=93=EF=BC=88REPAIR=5FRECIPE=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

把执行期"用确定性算子修好并最终 judge PASS"的成功事件沉淀为可复用经验，
下次同类错误不再从零试错。provenance 全确定性——LLM 不参与"是否成功"判定。

- 新 MemoryEntryType.REPAIR_RECIPE
- state.applied_repairs 记录"修复并放行"的确定性算子（当前唯一会修复而非升级的
  算子：duplicate-symbol dedup；其余 foreign_chars/seam/hallucinated 均升级）
- executor 在两处 dedup 命中点（语义合并 + 分块合并）经 _record_applied_repair
  登记（按 file+operator 去重）
- summarizer._build_repair_recipes：仅当算子触发且文件落入 judge passed_files
  才铸 recipe，按 error_signature(error_class+operator+dir层) 去重、上限 20
- 读取走既有 get_relevant_context 通道（recipe 带 file_paths），无需新检索代码；
  与 ScarListBuilder（历史人工坑）互补=运行期验证过的解法
- config.memory.repair_recipe_enabled 默认 True（纯加性、执行接地，风险低于
  会致害的 P1-A/B，故默认开；可关用于 eval-memory 消融）

7 新单测 + 3238 unit 绿（1 pre-existing 无关 docs 测试除外），mypy/ruff 干净
---
 src/agents/executor_agent.py         |  29 ++++++
 src/core/orchestrator.py             |   5 +-
 src/memory/models.py                 |   1 +
 src/memory/summarizer.py             |  63 +++++++++++-
 src/models/config.py                 |  10 ++
 src/models/state.py                  |  10 ++
 tests/unit/test_p1c_repair_recipe.py | 137 +++++++++++++++++++++++++++
 7 files changed, 253 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit/test_p1c_repair_recipe.py

diff --git a/src/agents/executor_agent.py b/src/agents/executor_agent.py
index 984c5ed..34c0131 100644
--- a/src/agents/executor_agent.py
+++ b/src/agents/executor_agent.py
@@ -550,6 +550,12 @@ async def execute_semantic_merge(
                 file_diff.file_path,
             )
             merged_content = deduped
+            _record_applied_repair(
+                state,
+                file_diff.file_path,
+                "dedup_top_level_symbols",
+                "duplicate_top_level_symbol",
+            )
 
         fidelity_reason = self._single_shot_fidelity_issue(
             file_diff.file_path,
@@ -814,6 +820,12 @@ async def _execute_chunked_semantic_merge(
                 file_path,
             )
             merged_content = deduped
+            _record_applied_repair(
+                state,
+                file_path,
+                "dedup_top_level_symbols",
+                "duplicate_top_level_symbol",
+            )
         # #10: a chunk seam can re-emit a JS/TS function implementation, a
         # TS2451 redeclaration the const/class dedup above cannot remove safely
         # (deleting a span risks dropping a real overload). Escalate instead.
@@ -1415,6 +1427,23 @@ def _extract_diff_ranges(
     return ranges
 
 
+def _record_applied_repair(
+    state: MergeState, file_path: str, operator: str, error_class: str
+) -> None:
+    """P1-C: log that the deterministic repair ``operator`` was applied to
+    ``file_path``. At most one record is kept per (file_path, operator) pair,
+    so a repeated hit leaves ``state.applied_repairs`` untouched."""
+    key = (file_path, operator)
+    already_logged = any(
+        (seen.get("file_path"), seen.get("operator")) == key
+        for seen in state.applied_repairs
+    )
+    if not already_logged:
+        record = {"file_path": file_path, "operator": operator}
+        record["error_class"] = error_class
+        state.applied_repairs.append(record)
+
+
 def _foreign_chars(merged: str, *sources: str) -> str | None:
     """Return a sample of non-ASCII glyphs the merge invented, or None.
 
diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py
index 8fe7bc9..26bcc0b 100644
--- a/src/core/orchestrator.py
+++ b/src/core/orchestrator.py
@@ -183,7 +183,10 @@ def __init__(
         # --- memory ---
         self._memory_store: MemoryStore | SQLiteMemoryStore = MemoryStore()
         self._memory_hit_tracker = MemoryHitTracker()
-        self._summarizer = PhaseSummarizer(upstream_ref=config.upstream_ref)
+        self._summarizer = PhaseSummarizer(
+            upstream_ref=config.upstream_ref,
+            repair_recipe_enabled=getattr(config.memory, "repair_recipe_enabled", True),
+        )
         self._phases_since_last_extract: int = 0
 
         # --- hooks (C1) ---
diff --git a/src/memory/models.py b/src/memory/models.py
index 9311ef9..4f2c7ec 100644
--- a/src/memory/models.py
+++ b/src/memory/models.py
@@ -15,6 +15,7 @@ class MemoryEntryType(str, Enum):
     RELATIONSHIP = "relationship"
     PHASE_SUMMARY = "phase_summary"
     CODEBASE_INSIGHT = "codebase_insight"
+    REPAIR_RECIPE = "repair_recipe"
 
 
 class ConfidenceLevel(str, Enum):
diff --git a/src/memory/summarizer.py b/src/memory/summarizer.py
index 5239688..0b99ad5 100644
--- a/src/memory/summarizer.py
+++ b/src/memory/summarizer.py
@@ -56,9 +56,15 @@ def _is_epistemically_empty(rationale: str | None) -> bool:
     return any(pat.search(lowered) for pat in _EPISTEMIC_FAILURE_PATTERNS)
 
 
+_MAX_REPAIR_RECIPES = 20
+
+
 class PhaseSummarizer:
-    def __init__(self, upstream_ref: str = "") -> None:
+    def __init__(
+        self, upstream_ref: str = "", repair_recipe_enabled: bool = True
+    ) -> None:
         self._upstream_ref = upstream_ref[:8] if upstream_ref else ""
+        self._repair_recipe_enabled = repair_recipe_enabled
 
     def summarize_planning(
         self, state: MergeState
@@ -352,6 +358,8 @@ def summarize_judge_review(
                     )
                 )
 
+        entries.extend(self._build_repair_recipes(state))
+
         summary = PhaseSummary(
             phase="judge_review",
             files_processed=0,
@@ -361,6 +369,59 @@ def summarize_judge_review(
         )
         return summary, entries
 
+    def _build_repair_recipes(self, state: MergeState) -> list[MemoryEntry]:
+        """P1-C: mint REPAIR_RECIPE entries for deterministic repairs the Judge
+        verified by passing the file.
+
+        An entry is written only when a repair operator fired during the merge
+        (``state.applied_repairs``) AND its file is in
+        ``judge_verdict.passed_files``; no LLM decides "did it work". Entries
+        are deduped by ``error_class:operator:dir_layer``, where ``dir_layer``
+        is the file's first two directory components (``"."`` for a top-level
+        file), and capped at ``_MAX_REPAIR_RECIPES``. Empty when disabled."""
+        if not self._repair_recipe_enabled:
+            return []
+        verdict = state.judge_verdict
+        if verdict is None or not state.applied_repairs:
+            return []
+        passed = set(verdict.passed_files)
+        ref_tag = f"upstream_ref:{self._upstream_ref}" if self._upstream_ref else ""
+        recipes: list[MemoryEntry] = []
+        seen: set[str] = set()
+        for repair in state.applied_repairs:
+            fp = repair.get("file_path", "")
+            if fp not in passed:
+                continue
+            operator = repair.get("operator", "unknown")
+            error_class = repair.get("error_class", "unknown")
+            dir_parts = fp.split(os.sep)[:-1][:2]  # drop the filename component
+            dir_layer = os.sep.join(dir_parts) if dir_parts else "."
+            signature = f"{error_class}:{operator}:{dir_layer}"
+            if signature in seen:
+                continue
+            seen.add(signature)
+            tags = ["repair_recipe", error_class, operator, dir_layer]
+            if ref_tag:
+                tags.append(ref_tag)
+            recipes.append(
+                MemoryEntry(
+                    entry_type=MemoryEntryType.REPAIR_RECIPE,
+                    phase="judge_review",
+                    content=(
+                        f"{error_class} in {dir_layer}: resolved deterministically "
+                        f"by `{operator}`, verified by judge PASS — apply the same "
+                        f"operator before escalating this error class."
+                    ),
+                    file_paths=[fp, dir_layer],
+                    tags=tags,
+                    confidence=0.9,
+                    confidence_level=ConfidenceLevel.EXTRACTED,
+                )
+            )
+            if len(recipes) >= _MAX_REPAIR_RECIPES:
+                break
+        return recipes
+
 
 def _count_by_directory(file_paths: list[str]) -> Counter[str]:
     dirs: Counter[str] = Counter()
diff --git a/src/models/config.py b/src/models/config.py
index 526b50d..cb80848 100644
--- a/src/models/config.py
+++ b/src/models/config.py
@@ -974,6 +974,16 @@ class MemoryExtractionConfig(BaseModel):
         "findings land in report_generation, after this hook (see "
         "doc/plan/self-learning-system.md P1-B).",
     )
+    repair_recipe_enabled: bool = Field(
+        default=True,
+        description="P1-C: at judge_review summarization, mint a verified "
+        "REPAIR_RECIPE memory entry for each deterministic repair operator "
+        "(e.g. duplicate-symbol dedup) that fired AND whose file the Judge "
+        "passed. Execution-grounded and additive — no LLM decides success — so "
+        "default ON, unlike the harm-capable P1-A/P1-B loops. Future runs that "
+        "open a sibling file retrieve the recipe via the existing memory "
+        "channel. Set False to ablate in the eval-memory harness.",
+    )
     inject_enabled: bool = Field(
         default=True,
         description="P0 ablation switch: when False, no memory context is "
diff --git a/src/models/state.py b/src/models/state.py
index 41c17a6..916f33a 100644
--- a/src/models/state.py
+++ b/src/models/state.py
@@ -176,6 +176,16 @@ class MergeState(BaseModel):
     judge_verdict: JudgeVerdict | None = None
     judge_repair_rounds: int = 0
     judge_verdicts_log: list[dict[str, Any]] = Field(default_factory=list)
+    applied_repairs: list[dict[str, str]] = Field(
+        default_factory=list,
+        description=(
+            "P1-C: deterministic repair operators that fired and let the merge "
+            "proceed (not escalate), e.g. dedup of a duplicate top-level symbol. "
+            "Each entry is {file_path, operator, error_class}. Read at "
+            "judge_review summarization to mint a verified REPAIR_RECIPE memory "
+            "entry only for files the Judge ultimately passed."
+        ),
+    )
     judge_resolution: Literal["accept", "abort", "rerun"] | None = Field(
         default=None,
         description=(
diff --git a/tests/unit/test_p1c_repair_recipe.py b/tests/unit/test_p1c_repair_recipe.py
new file mode 100644
index 0000000..0e99cdd
--- /dev/null
+++ b/tests/unit/test_p1c_repair_recipe.py
@@ -0,0 +1,137 @@
+"""P1-C: verified-repair recipe library.
+
+A deterministic repair operator (duplicate-symbol dedup) that fires AND whose
+file the Judge ultimately passes mints a REPAIR_RECIPE memory entry, keyed by an
+error_signature so a later run that opens a sibling file retrieves
+"this error class was resolved here by operator X, verified by judge PASS".
+Pure execution-grounding — no LLM decides success.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+
+from src.agents.executor_agent import _record_applied_repair
+from src.memory.models import MemoryEntryType
+from src.memory.store import MemoryStore
+from src.memory.summarizer import PhaseSummarizer
+from src.models.judge import JudgeVerdict, VerdictType
+
+
+def _verdict(passed: list[str], failed: list[str]) -> JudgeVerdict:
+    return JudgeVerdict(
+        verdict=VerdictType.PASS if not failed else VerdictType.FAIL,
+        reviewed_files_count=len(passed) + len(failed),
+        passed_files=list(passed),
+        failed_files=list(failed),
+        conditional_files=[],
+        issues=[],
+        critical_issues_count=0,
+        high_issues_count=0,
+        overall_confidence=0.9,
+        summary="x",
+        blocking_issues=[],
+        timestamp=datetime(2026, 1, 1),
+        judge_model="m",
+    )
+
+
+class _State:
+    """Minimal stand-in carrying the fields the summarizer reads."""
+
+    def __init__(self, verdict, applied_repairs):
+        self.judge_verdict = verdict
+        self.applied_repairs = applied_repairs
+        self.judge_verdicts_log = []
+        self.judge_repair_rounds = 0
+
+
+# --- executor recording -----------------------------------------------------
+
+
+def test_record_applied_repair_dedups_per_file_operator():
+    state = _State(None, [])
+    _record_applied_repair(state, "a.go", "dedup_top_level_symbols", "dup_symbol")
+    _record_applied_repair(state, "a.go", "dedup_top_level_symbols", "dup_symbol")
+    assert state.applied_repairs == [
+        {
+            "file_path": "a.go",
+            "operator": "dedup_top_level_symbols",
+            "error_class": "dup_symbol",
+        }
+    ]
+
+
+# --- summarizer minting -----------------------------------------------------
+
+
+def _repairs(*files: str) -> list[dict[str, str]]:
+    """Return one dedup repair record per path in ``files``, in order."""
+    # Every record shares the same operator and error class; only the path
+    # differs between them.
+    shared = {
+        "operator": "dedup_top_level_symbols",
+        "error_class": "duplicate_top_level_symbol",
+    }
+    return [{"file_path": path, **shared} for path in files]
+
+
+def test_recipe_minted_only_for_judge_passed_file():
+    state = _State(
+        _verdict(passed=["pkg/x/a.go"], failed=["pkg/y/b.go"]),
+        _repairs("pkg/x/a.go", "pkg/y/b.go"),
+    )
+    _, entries = PhaseSummarizer().summarize_judge_review(state)  # type: ignore[arg-type]
+    recipes = [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE]
+    assert len(recipes) == 1
+    r = recipes[0]
+    assert "pkg/x/a.go" in r.file_paths
+    assert "duplicate_top_level_symbol" in r.tags
+    assert "dedup_top_level_symbols" in r.tags
+    # the failed file earns no recipe
+    assert all("pkg/y/b.go" not in e.file_paths for e in recipes)
+
+
+def test_recipe_signature_deduped_across_same_dir_layer():
+    # two passed files in the same dir-layer with the same operator/error →
+    # one recipe (the error_signature collapses them).
+    state = _State(
+        _verdict(passed=["pkg/x/a.go", "pkg/x/b.go"], failed=[]),
+        _repairs("pkg/x/a.go", "pkg/x/b.go"),
+    )
+    _, entries = PhaseSummarizer().summarize_judge_review(state)  # type: ignore[arg-type]
+    recipes = [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE]
+    assert len(recipes) == 1
+
+
+def test_no_recipe_when_disabled():
+    state = _State(_verdict(passed=["pkg/x/a.go"], failed=[]), _repairs("pkg/x/a.go"))
+    # Keep the ignore on the line holding the stand-in argument, as sibling tests do.
+    summarizer = PhaseSummarizer(repair_recipe_enabled=False)
+    _, entries = summarizer.summarize_judge_review(state)  # type: ignore[arg-type]
+    assert not [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE]
+
+
+def test_no_recipe_without_applied_repairs():
+    state = _State(_verdict(passed=["pkg/x/a.go"], failed=[]), [])
+    _, entries = PhaseSummarizer().summarize_judge_review(state)  # type: ignore[arg-type]
+    assert not [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE]
+
+
+def test_no_recipe_without_verdict():
+    state = _State(None, _repairs("pkg/x/a.go"))
+    _, entries = PhaseSummarizer().summarize_judge_review(state)  # type: ignore[arg-type]
+    assert not [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE]
+
+
+# --- retrieval (existing memory channel) ------------------------------------
+
+
+def test_recipe_is_retrievable_for_matching_file():
+    state = _State(_verdict(passed=["pkg/x/a.go"], failed=[]), _repairs("pkg/x/a.go"))
+    _, entries = PhaseSummarizer().summarize_judge_review(state)  # type: ignore[arg-type]
+    store = MemoryStore()
+    for e in entries:
+        store = store.add_entry(e)
+    hits = store.get_relevant_context(["pkg/x/a.go"])
+    assert any(h.entry_type == MemoryEntryType.REPAIR_RECIPE for h in hits)

From 53b963e8f1ae82d34197459dbf1f4be3a36d55b8 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 07:26:54 -0400
Subject: [PATCH 12/22] =?UTF-8?q?docs(plan):=20=E6=A0=87=E6=B3=A8=20Phase?=
 =?UTF-8?q?=201=20A/B/C=20=E8=90=BD=E5=9C=B0=E7=8A=B6=E6=80=81=E4=B8=8E=20?=
 =?UTF-8?q?P1-B=20ci=20=E5=BB=B6=E5=90=8E=E5=81=8F=E5=B7=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/plan/self-learning-system.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/doc/plan/self-learning-system.md b/doc/plan/self-learning-system.md
index 37d65a9..5df7079 100644
--- a/doc/plan/self-learning-system.md
+++ b/doc/plan/self-learning-system.md
@@ -154,6 +154,15 @@
 
 ### Phase 1 —— 闭合执行接地反馈环（最高 ROI）
 
+> **落地状态（2026-05-31，feat/web）**：A/B/C 全部实装（`b83d142`/`6b4f905`/`6bc77c3`）。
+> A、B 的反馈环按 P2「先度量再激活」默认 **opt-in（False）**——`memory.persist_suppress`、
+> `memory.writeback_signal_sources` 默认 `["judge"]`（=旧行为），需 `merge eval-memory`
+> 多 run 基线证明净收益为正（§3 激活门：`MDL>0` 且 `memory_harmed=0`）方可翻默认。
+> C 为纯加性、执行接地，默认 **True**（`memory.repair_recipe_enabled`）。
+> **B 偏差**：CI/partial_failure 信号有意延后——它在 `report_generation` 产出，晚于
+> judge_review 记忆钩子；完整融合需把写回迁到 report 阶段（未做）。故 B 现仅
+> `judge + compile` 两源。
+
 > 对应研究最强三条证据：选择性 add+**delete** +10%（F2）、执行接地 >> 自反思（范式2）、Experience 抽象（范式5）。拆三个可独立评审的子项。
 
 #### P1-A 把临时软删（O-M6）巩固为持久、可审计的 suppress（原则 P3）

From 452500862cab7d4652706cf476914bb7b1acf10e Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 07:37:58 -0400
Subject: [PATCH 13/22] =?UTF-8?q?feat(memory):=20P2-A=20=E9=AB=98=E4=BF=A1?=
 =?UTF-8?q?=E6=81=AF=E6=9D=A1=E7=9B=AE=E5=BC=BA=E5=88=B6=EF=BC=88anti-pois?=
 =?UTF-8?q?oning=20=E5=AF=B9=E5=81=B6=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_is_epistemically_empty 拒"模型放弃"标记；本提交补其对偶——拒"空泛无动作"
条目（Renze & Guven：反思信息量是效果杠杆，GPT-4 0.79→0.93）。

- 新 src/memory/content_quality.py：is_actionable_content 仅审 DECISION/
  REPAIR_RECIPE（PATTERN/摘要类豁免），保守判据默认 True，仅明确空泛填充词/
  过短无动作才 False；enforce_actionable 不可变降级（降 HEURISTIC + 折半
  confidence，de-rank 而非删，保召回）
- orchestrator._update_memory 两个入库点（summarizer + memory_extractor llm
  出口）统一过 enforce_actionable

8 新单测；mypy/ruff 干净
---
 src/core/orchestrator.py               |  5 +-
 src/memory/content_quality.py          | 76 +++++++++++++++++++++++
 tests/unit/test_p2a_content_quality.py | 84 ++++++++++++++++++++++++++
 3 files changed, 163 insertions(+), 2 deletions(-)
 create mode 100644 src/memory/content_quality.py
 create mode 100644 tests/unit/test_p2a_content_quality.py

diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py
index 26bcc0b..5c5ce0b 100644
--- a/src/core/orchestrator.py
+++ b/src/core/orchestrator.py
@@ -54,6 +54,7 @@
 from src.core.coordinator import Coordinator
 from src.core.state_machine import StateMachine
 from src.memory.bootstrap import _BOOTSTRAP_TAG, bootstrap_from_claude_md
+from src.memory.content_quality import enforce_actionable
 from src.memory.hit_tracker import MemoryHitTracker
 from src.memory.sqlite_store import SQLiteMemoryStore
 from src.memory.store import MemoryStore
@@ -496,7 +497,7 @@ async def _update_memory(self, phase: str, state: MergeState) -> None:
             phase_summary, entries = method(state)
             store = self._memory_store.record_phase_summary(phase_summary)
             for entry in entries:
-                store = store.add_entry(entry)
+                store = store.add_entry(enforce_actionable(entry))
             count_before = store.entry_count
             store = store.remove_superseded(phase)
             removed = count_before - store.entry_count
@@ -517,7 +518,7 @@ async def _update_memory(self, phase: str, state: MergeState) -> None:
                 llm_entries = await self.memory_extractor.extract(phase, state)  # type: ignore[attr-defined]
                 store = self._memory_store
                 for entry in llm_entries:
-                    store = store.add_entry(entry)
+                    store = store.add_entry(enforce_actionable(entry))
                 self._memory_store = store
                 self._phases_since_last_extract = 0
             except Exception as exc:
diff --git a/src/memory/content_quality.py b/src/memory/content_quality.py
new file mode 100644
index 0000000..2949e9d
--- /dev/null
+++ b/src/memory/content_quality.py
@@ -0,0 +1,76 @@
+"""P2-A: high-information entry enforcement.
+
+The dual of ``_is_epistemically_empty`` (which rejects "model gave up" markers):
+this rejects entries that are *vacuous* — they name a file but carry no concrete
+action, decision, or fix. Renze & Guven (arXiv 2405.06682) show reflection
+*information content* is what drives the effect (GPT-4 0.79 → 0.93), so a memory
+that says nothing specific is dead weight that dilutes retrieval.
+
+Conservative by design: defaults to actionable (True) and only flags content that
+is clearly filler, so it never silently drops a legitimate entry. Non-actionable
+entries are *de-ranked* (confidence + level lowered), not deleted, preserving
+recall while pushing vacuous entries below the retrieval threshold.
+"""
+
+from __future__ import annotations
+
+import re
+
+from src.memory.models import ConfidenceLevel, MemoryEntry, MemoryEntryType
+
+# Entry types whose value is their specific action/decision/fix. PATTERN /
+# PHASE_SUMMARY / CODEBASE_INSIGHT are intentionally exempt — a terse pattern
+# label ("recurring reverse_impact") is legitimately short.
+_ACTIONABLE_TYPES = frozenset(
+    {
+        MemoryEntryType.DECISION,
+        MemoryEntryType.REPAIR_RECIPE,
+    }
+)
+
+# No-information filler (path stripped): full-body matches except the "no ..." prefix.
+_VACUOUS_PATTERNS: tuple[re.Pattern[str], ...] = (
+    re.compile(r"^(decision\s+made|reviewed|processed|handled|done|ok|n/?a|none)\.?$"),
+    re.compile(r"^(no\s+(notes?|details?|specifics?|action|change)s?)\b"),
+    re.compile(r"^(tbd|todo|unknown|see\s+above|as\s+noted)\.?$"),
+)
+
+_MIN_SUBSTANCE_CHARS = 8  # shorter path-stripped bodies count as non-actionable
+
+
+def _substance(content: str) -> str:
+    """Text after a leading ``path: `` prefix, stripped and lowercased."""
+    # Without a ": " separator there is no prefix, so the whole string is the body.
+    _prefix, found, remainder = content.partition(": ")
+    return (remainder if found else content).strip().lower()
+
+
+def is_actionable_content(content: str, entry_type: MemoryEntryType) -> bool:
+    """Report whether ``content`` states a concrete action, decision or fix.
+
+    Types outside ``_ACTIONABLE_TYPES`` always pass. For the scrutinised types
+    the path-stripped body must reach ``_MIN_SUBSTANCE_CHARS`` and must not
+    start with a ``_VACUOUS_PATTERNS`` match."""
+    if entry_type in _ACTIONABLE_TYPES:
+        text = _substance(content)
+        long_enough = len(text) >= _MIN_SUBSTANCE_CHARS
+        return long_enough and all(p.match(text) is None for p in _VACUOUS_PATTERNS)
+    return True
+
+
+def enforce_actionable(entry: MemoryEntry) -> MemoryEntry:
+    """Return ``entry`` unchanged when actionable, else a de-ranked copy.
+
+    De-rank = set confidence_level to HEURISTIC and halve confidence, floored
+    at 0.1 but never above the entry's current confidence. The entry is kept,
+    not deleted, and the input is never mutated."""
+    if is_actionable_content(entry.content, entry.entry_type):
+        return entry
+    if entry.confidence_level == ConfidenceLevel.HEURISTIC and entry.confidence <= 0.1:
+        return entry
+    # The 0.1 floor must not *raise* an entry that already sits below it, so
+    # cap the halved-and-floored value at the current confidence.
+    lowered = min(entry.confidence, max(0.1, round(entry.confidence * 0.5, 4)))
+    return entry.model_copy(
+        update={"confidence_level": ConfidenceLevel.HEURISTIC, "confidence": lowered}
+    )
diff --git a/tests/unit/test_p2a_content_quality.py b/tests/unit/test_p2a_content_quality.py
new file mode 100644
index 0000000..10605e8
--- /dev/null
+++ b/tests/unit/test_p2a_content_quality.py
@@ -0,0 +1,84 @@
+"""P2-A: high-information entry enforcement (dual of epistemic-empty filter)."""
+
+from __future__ import annotations
+
+from src.memory.content_quality import enforce_actionable, is_actionable_content
+from src.memory.models import ConfidenceLevel, MemoryEntry, MemoryEntryType
+
+
+def _entry(
+    content: str,
+    entry_type: MemoryEntryType = MemoryEntryType.DECISION,
+    confidence: float = 0.85,
+    level: ConfidenceLevel = ConfidenceLevel.EXTRACTED,
+) -> MemoryEntry:
+    return MemoryEntry(
+        entry_type=entry_type,
+        phase="conflict_analysis",
+        content=content,
+        confidence=confidence,
+        confidence_level=level,
+    )
+
+
+# --- is_actionable_content --------------------------------------------------
+
+
+def test_decision_with_concrete_action_is_actionable():
+    assert is_actionable_content(
+        "src/a.py: take_target [import_conflict] confidence=0.90 — keep upstream auth",
+        MemoryEntryType.DECISION,
+    )
+
+
+def test_decision_vacuous_filler_not_actionable():
+    for body in ("src/a.py: decision made", "src/a.py: n/a", "src/a.py: no notes"):
+        assert not is_actionable_content(body, MemoryEntryType.DECISION)
+
+
+def test_decision_too_short_not_actionable():
+    assert not is_actionable_content("src/a.py: ok", MemoryEntryType.DECISION)
+
+
+def test_pattern_type_is_exempt():
+    # a terse PATTERN label is legitimately short — never flagged
+    assert is_actionable_content("ok", MemoryEntryType.PATTERN)
+    assert is_actionable_content(
+        "recurring reverse_impact", MemoryEntryType.PHASE_SUMMARY
+    )
+
+
+def test_repair_recipe_scrutinised():
+    assert not is_actionable_content("x: n/a", MemoryEntryType.REPAIR_RECIPE)
+    assert is_actionable_content(
+        "dup_symbol in pkg/x: resolved by dedup, verified by judge PASS",
+        MemoryEntryType.REPAIR_RECIPE,
+    )
+
+
+# --- enforce_actionable -----------------------------------------------------
+
+
+def test_actionable_entry_returned_unchanged():
+    e = _entry("src/a.py: semantic_merge — merged both auth handlers cleanly")
+    assert enforce_actionable(e) is e
+
+
+def test_vacuous_entry_is_deranked_not_dropped():
+    e = _entry("src/a.py: decision made", confidence=0.85)
+    out = enforce_actionable(e)
+    assert out is not e
+    assert out.confidence_level == ConfidenceLevel.HEURISTIC
+    assert out.confidence == 0.425  # 0.85 * 0.5
+    assert out.content == e.content  # content preserved, only rank lowered
+    assert out.content_hash == e.content_hash  # identity stable
+
+
+def test_derank_is_idempotent_at_floor():
+    e = _entry("src/a.py: n/a", confidence=0.1, level=ConfidenceLevel.HEURISTIC)
+    assert enforce_actionable(e) is e  # already at floor → no new object
+
+
+def test_exempt_type_never_deranked():
+    e = _entry("ok", entry_type=MemoryEntryType.PATTERN, confidence=0.8)
+    assert enforce_actionable(e) is e

From 2af4890819441456f9fc265cf3c95fb335f5c1fd Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 07:37:58 -0400
Subject: [PATCH 14/22] =?UTF-8?q?feat(memory):=20P2-B=20=E5=85=B3=E9=94=AE?=
 =?UTF-8?q?=E4=B8=8D=E5=8F=98=E9=87=8F=E9=94=9A=E5=AE=9A=EF=BC=8C=E9=98=B2?=
 =?UTF-8?q?=E6=91=98=E8=A6=81=E6=BC=82=E7=A7=BB=EF=BC=88F1=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

consolidation 把同组条目摘要成一个 blob，会漂移关键条目内容（F1）。本提交给
验证过的解法与人工决策打 pinned，consolidation 对其 passthrough 不再摘要。

- MemoryEntry.pinned: bool；SQLite 加 pinned 列 + ALTER TABLE 迁移旧库（镜像
  P1-A suppressed 的迁移/行映射）
- _consolidate_entries 跳过 pinned（与 suppressed 同 passthrough 分支）
- summarizer 标 pinned=True：REPAIR_RECIPE（验证过的解法）+ judge_review 中
  decision_source∈{HUMAN,BATCH_HUMAN} 的文件 DECISION 条目
- security-sensitive 锚定延后（summarizer 无 config patterns，plumbing 已就位）

7 新单测（含 F1 consolidation 零损失断言 + 迁移）；test_p1c 桩补
file_decision_records；mypy/ruff 干净
---
 src/memory/models.py                 |   1 +
 src/memory/sqlite_store.py           |  13 +-
 src/memory/store.py                  |   4 +-
 src/memory/summarizer.py             |  12 +-
 tests/unit/test_p1c_repair_recipe.py |   1 +
 tests/unit/test_p2b_pinned_anchor.py | 175 +++++++++++++++++++++++++++
 6 files changed, 201 insertions(+), 5 deletions(-)
 create mode 100644 tests/unit/test_p2b_pinned_anchor.py

diff --git a/src/memory/models.py b/src/memory/models.py
index 4f2c7ec..9a7b8a4 100644
--- a/src/memory/models.py
+++ b/src/memory/models.py
@@ -37,6 +37,7 @@ class MemoryEntry(BaseModel, frozen=True):
     created_at: datetime = Field(default_factory=datetime.now)
     suppressed: bool = Field(default=False)
     suppressed_reason: str | None = Field(default=None)
+    pinned: bool = Field(default=False)
 
     def model_post_init(self, __context: Any) -> None:
         if not self.content_hash:
diff --git a/src/memory/sqlite_store.py b/src/memory/sqlite_store.py
index 202a208..c68e411 100644
--- a/src/memory/sqlite_store.py
+++ b/src/memory/sqlite_store.py
@@ -37,7 +37,8 @@
     content_hash      TEXT NOT NULL,
     created_at        TEXT NOT NULL,
     suppressed        INTEGER NOT NULL DEFAULT 0,
-    suppressed_reason TEXT
+    suppressed_reason TEXT,
+    pinned            INTEGER NOT NULL DEFAULT 0
 );
 CREATE UNIQUE INDEX IF NOT EXISTS idx_content_hash
     ON memory_entries (content_hash);
@@ -55,8 +56,8 @@
 INSERT OR IGNORE INTO memory_entries
     (entry_id, entry_type, phase, content, file_paths, tags,
      confidence, confidence_level, content_hash, created_at,
-     suppressed, suppressed_reason)
-VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+     suppressed, suppressed_reason, pinned)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
 """
 
 # P1-A: columns added after the original schema shipped; older memory.db files
@@ -71,6 +72,10 @@
         "suppressed_reason",
         "ALTER TABLE memory_entries ADD COLUMN suppressed_reason TEXT",
     ),
+    (
+        "pinned",
+        "ALTER TABLE memory_entries ADD COLUMN pinned INTEGER NOT NULL DEFAULT 0",
+    ),
 )
 
 _PHASE_ORDER = {
@@ -95,6 +100,7 @@ def _entry_to_row(entry: MemoryEntry) -> tuple[str | int | None, ...]:
         entry.created_at.isoformat(),
         1 if entry.suppressed else 0,
         entry.suppressed_reason,
+        1 if entry.pinned else 0,
     )
 
 
@@ -115,6 +121,7 @@ def _row_to_entry(row: sqlite3.Row) -> MemoryEntry:
         suppressed_reason=(
             row["suppressed_reason"] if "suppressed_reason" in keys else None
         ),
+        pinned=bool(row["pinned"]) if "pinned" in keys else False,
     )
 
 
diff --git a/src/memory/store.py b/src/memory/store.py
index 16199d4..b4c43fb 100644
--- a/src/memory/store.py
+++ b/src/memory/store.py
@@ -354,7 +354,9 @@ class (``c_class`` / ``conflict_decision``) shared across directories, so
         # P1-A: suppressed entries pass through untouched — they must not be
         # merged into a live blob (that would resurrect harmful content) nor
         # silently dropped (audit trail must survive consolidation).
-        if entry.suppressed:
+        # P2-B: pinned entries (verified REPAIR_RECIPE / human decisions) also
+        # pass through verbatim so consolidation cannot drift their content (F1).
+        if entry.suppressed or entry.pinned:
             ungroupable.append(entry)
             continue
         primary_tag = entry.tags[0] if entry.tags else ""
diff --git a/src/memory/summarizer.py b/src/memory/summarizer.py
index 0b99ad5..1292c58 100644
--- a/src/memory/summarizer.py
+++ b/src/memory/summarizer.py
@@ -11,7 +11,7 @@
     MemoryEntryType,
     PhaseSummary,
 )
-from src.models.decision import FileDecisionRecord
+from src.models.decision import DecisionSource, FileDecisionRecord
 from src.models.state import MergeState
 
 logger = logging.getLogger(__name__)
@@ -335,6 +335,14 @@ def summarize_judge_review(
         # future runs see which files needed repair and why.
         if state.judge_verdict is not None:
             ref_tag = f"upstream_ref:{self._upstream_ref}" if self._upstream_ref else ""
+            # P2-B: pin human-decided files so consolidation cannot drift the
+            # record of an explicit operator decision (F1).
+            human_files = {
+                fp
+                for fp, rec in state.file_decision_records.items()
+                if rec.decision_source
+                in (DecisionSource.HUMAN, DecisionSource.BATCH_HUMAN)
+            }
             issues_by_file: dict[str, list[str]] = {}
             for issue in state.judge_verdict.issues:
                 issues_by_file.setdefault(issue.file_path, []).append(issue.issue_type)
@@ -355,6 +363,7 @@ def summarize_judge_review(
                         tags=tags,
                         confidence=0.85,
                         confidence_level=ConfidenceLevel.EXTRACTED,
+                        pinned=fp in human_files,
                     )
                 )
 
@@ -416,6 +425,7 @@ def _build_repair_recipes(self, state: MergeState) -> list[MemoryEntry]:
                     tags=tags,
                     confidence=0.9,
                     confidence_level=ConfidenceLevel.EXTRACTED,
+                    pinned=True,
                 )
             )
             if len(recipes) >= _MAX_REPAIR_RECIPES:
diff --git a/tests/unit/test_p1c_repair_recipe.py b/tests/unit/test_p1c_repair_recipe.py
index 0e99cdd..20eee97 100644
--- a/tests/unit/test_p1c_repair_recipe.py
+++ b/tests/unit/test_p1c_repair_recipe.py
@@ -44,6 +44,7 @@ def __init__(self, verdict, applied_repairs):
         self.applied_repairs = applied_repairs
         self.judge_verdicts_log = []
         self.judge_repair_rounds = 0
+        self.file_decision_records = {}
 
 
 # --- executor recording -----------------------------------------------------
diff --git a/tests/unit/test_p2b_pinned_anchor.py b/tests/unit/test_p2b_pinned_anchor.py
new file mode 100644
index 0000000..f678e4a
--- /dev/null
+++ b/tests/unit/test_p2b_pinned_anchor.py
@@ -0,0 +1,175 @@
+"""P2-B: pin key invariants so consolidation cannot drift them (F1 guard)."""
+
+from __future__ import annotations
+
+from datetime import datetime
+
+from src.memory.models import ConfidenceLevel, MemoryEntry, MemoryEntryType
+from src.memory.sqlite_store import SQLiteMemoryStore
+from src.memory.store import _consolidate_entries
+from src.memory.summarizer import PhaseSummarizer
+from src.models.decision import (
+    DecisionSource,
+    FileDecisionRecord,
+    MergeDecision,
+)
+from src.models.diff import FileStatus
+from src.models.judge import IssueSeverity, JudgeIssue, JudgeVerdict, VerdictType
+
+
+def _entry(content: str, *, pinned: bool = False, tag: str = "t") -> MemoryEntry:
+    return MemoryEntry(
+        entry_type=MemoryEntryType.DECISION,
+        phase="conflict_analysis",
+        content=content,
+        file_paths=["pkg/x/a.py"],
+        tags=[tag],
+        confidence=0.8,
+        confidence_level=ConfidenceLevel.EXTRACTED,
+        pinned=pinned,
+    )
+
+
+# --- model ------------------------------------------------------------------
+
+
+def test_pinned_defaults_false():
+    assert _entry("x").pinned is False
+
+
+# --- consolidation F1 guard -------------------------------------------------
+
+
+def test_pinned_entry_survives_consolidation_verbatim():
+    # 3 same-group live entries would merge into one lossy blob; a pinned
+    # sibling in the same group must pass through with content intact.
+    live = [_entry(f"c{i}") for i in range(3)]
+    pinned = _entry("CRITICAL: take_current on auth — never drift", pinned=True)
+    out = _consolidate_entries([*live, pinned])
+    survivors = [e for e in out if e.pinned]
+    assert len(survivors) == 1
+    assert survivors[0].content == "CRITICAL: take_current on auth — never drift"
+    # the 3 live ones still collapsed
+    assert sum(1 for e in out if not e.pinned) == 1
+
+
+def test_sqlite_pinned_persists_and_survives_consolidation(tmp_path):
+    # Persist, reload through a fresh handle, then consolidate the reloaded rows.
+    store = SQLiteMemoryStore.open(tmp_path / "m.db")
+    pinned = _entry("pinned recipe", pinned=True)
+    store.add_entry(pinned)
+    reloaded = list(SQLiteMemoryStore.open(tmp_path / "m.db").to_memory().entries)
+    assert [e.pinned for e in _consolidate_entries(reloaded)] == [True]
+
+
+def test_sqlite_legacy_db_migrates_pinned_column(tmp_path):
+    import sqlite3
+
+    db = tmp_path / "legacy.db"
+    conn = sqlite3.connect(str(db))
+    conn.executescript(
+        """
+        CREATE TABLE memory_entries (
+            entry_id TEXT PRIMARY KEY, entry_type TEXT NOT NULL, phase TEXT NOT NULL,
+            content TEXT NOT NULL, file_paths TEXT NOT NULL, tags TEXT NOT NULL,
+            confidence REAL NOT NULL, confidence_level TEXT NOT NULL,
+            content_hash TEXT NOT NULL, created_at TEXT NOT NULL
+        );
+        INSERT INTO memory_entries VALUES
+            ('id1','decision','planning','legacy','["a.py"]','[]',0.8,
+             'inferred','hash1','2026-01-01T00:00:00');
+        """
+    )
+    conn.commit()
+    conn.close()
+    store = SQLiteMemoryStore.open(db)  # migration runs here
+    entries = store.to_memory().entries
+    assert len(entries) == 1
+    assert entries[0].pinned is False
+
+
+# --- summarizer pinning -----------------------------------------------------
+
+
+class _State:
+    def __init__(self, verdict, records):
+        self.judge_verdict = verdict
+        self.applied_repairs = []
+        self.judge_verdicts_log = []
+        self.judge_repair_rounds = 0
+        self.file_decision_records = records
+
+
+def _verdict(passed, failed):
+    issues = [
+        JudgeIssue(
+            file_path=f,
+            issue_level=IssueSeverity.HIGH,
+            issue_type="reverse_impact_unhandled",
+            description="x",
+        )
+        for f in failed
+    ]
+    return JudgeVerdict(
+        verdict=VerdictType.FAIL if failed else VerdictType.PASS,
+        reviewed_files_count=len(passed) + len(failed),
+        passed_files=list(passed),
+        failed_files=list(failed),
+        conditional_files=[],
+        issues=issues,
+        critical_issues_count=0,
+        high_issues_count=len(failed),
+        overall_confidence=0.9,
+        summary="x",
+        blocking_issues=[],
+        timestamp=datetime(2026, 1, 1),
+        judge_model="m",
+    )
+
+
+def _human_record(fp: str) -> FileDecisionRecord:
+    return FileDecisionRecord(
+        file_path=fp,
+        file_status=FileStatus.MODIFIED,
+        decision=MergeDecision.TAKE_CURRENT,
+        decision_source=DecisionSource.HUMAN,
+        rationale="operator kept fork auth on this security-sensitive file",
+    )
+
+
+def test_repair_recipe_entries_are_pinned():
+    state = _State(_verdict(["pkg/x/a.go"], []), {})
+    state.applied_repairs = [
+        {
+            "file_path": "pkg/x/a.go",
+            "operator": "dedup_top_level_symbols",
+            "error_class": "duplicate_top_level_symbol",
+        }
+    ]
+    _, entries = PhaseSummarizer().summarize_judge_review(state)  # type: ignore[arg-type]
+    recipes = [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE]
+    assert recipes and all(r.pinned for r in recipes)
+
+
+def test_human_decided_judge_fail_entry_is_pinned():
+    fp = "pkg/x/secret.py"
+    state = _State(_verdict([], [fp]), {fp: _human_record(fp)})
+    _, entries = PhaseSummarizer().summarize_judge_review(state)  # type: ignore[arg-type]
+    decisions = [
+        e
+        for e in entries
+        if e.entry_type == MemoryEntryType.DECISION and fp in e.file_paths
+    ]
+    assert decisions and all(d.pinned for d in decisions)
+
+
+def test_non_human_judge_fail_entry_not_pinned():
+    fp = "pkg/x/auto.py"
+    state = _State(_verdict([], [fp]), {})  # no human record
+    _, entries = PhaseSummarizer().summarize_judge_review(state)  # type: ignore[arg-type]
+    decisions = [
+        e
+        for e in entries
+        if e.entry_type == MemoryEntryType.DECISION and fp in e.file_paths
+    ]
+    assert decisions and not any(d.pinned for d in decisions)

From 08e3f9eb5b85a4bd97e69ef17e4ac764beaf2099 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 07:38:37 -0400
Subject: [PATCH 15/22] =?UTF-8?q?docs(plan):=20=E6=A0=87=E6=B3=A8=20Phase?=
 =?UTF-8?q?=202=20A/B=20=E8=90=BD=E5=9C=B0=E7=8A=B6=E6=80=81=E4=B8=8E=20se?=
 =?UTF-8?q?curity-sensitive=20=E9=94=9A=E5=AE=9A=E5=BB=B6=E5=90=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/plan/self-learning-system.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/doc/plan/self-learning-system.md b/doc/plan/self-learning-system.md
index 5df7079..5995507 100644
--- a/doc/plan/self-learning-system.md
+++ b/doc/plan/self-learning-system.md
@@ -211,6 +211,13 @@
 
 ### Phase 2 —— 记忆质量加固（中等 ROI，便宜）
 
+> **落地状态（2026-05-31，feat/web）**：A/B 全部实装（`4525008`/`2af4890`）。
+> A（`content_quality.is_actionable_content`/`enforce_actionable`）保守降级而非删，
+> 默认随入库即生效；B（`MemoryEntry.pinned`）锚定 REPAIR_RECIPE + 人工决策条目，
+> consolidation 对其 passthrough。**B 偏差**：security-sensitive 锚定延后——
+> summarizer 无 config 的 `security_sensitive.patterns`；`pinned` 字段已就位，
+> 需后续在有 config 的入库点补标。
+
 **P2-A 高信息条目强制**（范式2，GPT-4 0.79→0.93 的直接杠杆）
 - 扩展 `_is_epistemically_empty` 的对偶：`_has_actionable_content()`——DECISION/REPAIR_RECIPE 类条目若缺"具体动作/修复"则降级或拒写。
 - 接入 `summarizer.py` 各 `summarize_*` 与 `memory_extractor` 出口。

From f5406130086fd15e5036316abba0a60f3c6dce92 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 09:44:16 -0400
Subject: [PATCH 16/22] =?UTF-8?q?feat(memory):=20Phase=203=20=E7=A6=BB?=
 =?UTF-8?q?=E7=BA=BF=E6=8F=90=E7=A4=BA=E4=BC=98=E5=8C=96=20harness=20+=20o?=
 =?UTF-8?q?pt-in=20CLI=EF=BC=88=E4=B8=8D=E8=87=AA=E5=8A=A8=E6=94=B9?=
 =?UTF-8?q?=E7=94=9F=E4=BA=A7=E6=8F=90=E7=A4=BA=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GEPA/MIPROv2 式提示进化的确定性可测核心：对某 gate 的系统提示生成具名候选变体、
按 golden 集决策准确率排名、产人工评审报告。gate 是代码 builder，故采纳赢家=人工
按报告编辑提示源，**永不自动写回 gate_registry**（方案"人工评审后才生效"）。

- src/tools/prompt_optimizer.py：PromptCandidate/GoldenCase/CandidateScore/
  CostLedger/OptimizationReport；MUTATION_STRATEGIES（GEPA 确定性子集=反思指令注入
  stepwise/selfcheck/output_format/evidence_first）；propose_variants（基线+去重）、
  score_candidates、select_winner（须超基线 margin 且已评分）、build/render report
- 昂贵的 LLM rollout 有意外移为注入的 rollouts 映射（candidate_id→{case_id:decision}），
  harness 纯离线可单测；产 rollout 是操作者按文档自担成本的步骤（~$60/万次）
- CLI `merge optimize-prompts --gate --golden --rollouts --strategies --margin --out`：
  仅支持 no-arg/*-SYSTEM gate（其余报错）；无 golden 时只产变体并诚实标注 unscored
- 安全：read-only w.r.t 生产提示；未评分=surfaced 非伪造；margin 内不夺基线

11 harness + 3 CLI 单测（对真实 J-SYSTEM gate）；全量 3268 unit 绿（1 pre-existing
无关 docs 测试除外），mypy 182 文件干净、ruff 过
---
 CLAUDE.md                               |   9 +
 src/cli/main.py                         |  97 +++++++++
 src/tools/prompt_optimizer.py           | 270 ++++++++++++++++++++++++
 tests/unit/test_cli_optimize_prompts.py |  67 ++++++
 tests/unit/test_prompt_optimizer.py     | 125 +++++++++++
 5 files changed, 568 insertions(+)
 create mode 100644 src/tools/prompt_optimizer.py
 create mode 100644 tests/unit/test_cli_optimize_prompts.py
 create mode 100644 tests/unit/test_prompt_optimizer.py

diff --git a/CLAUDE.md b/CLAUDE.md
index d69c086..1f7cb1a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -38,8 +38,17 @@ merge init [--repo-path .]       # generate per-target CLAUDE.md for merge decis
 merge plan-suggest [--target ... --candidates ...]   # enumerate baseline commit-windows
 merge forks-profile init         # scaffold .merge/forks-profile.yaml (recommended ≥30 fork-deleted files)
 merge eval-memory --on <run|json> --off <run|json> [--out <path>]   # P0 memory ablation: compare memory=on vs memory=off effectiveness reports
+merge optimize-prompts --gate <ID> [--golden <json> --rollouts <json> --strategies a,b --margin 0.02 --out <path>]   # Phase 3 (opt-in, offline): generate + rank prompt variants for a *-SYSTEM gate; emits a HUMAN-REVIEW report, never auto-applies
 ```
 
+`merge optimize-prompts` is offline and read-only w.r.t. production prompts:
+gates are code builders, so a winning candidate is applied by a human editing
+the gate's prompt source after reviewing the report. Scoring needs `--golden`
+(JSON `[{case_id, expected_decision}]`) plus `--rollouts` (JSON
+`{candidate_id: {case_id: decision}}` produced by running each candidate — the
+cost-bearing step you run yourself). See `doc/plan/self-learning-system.md`
+Phase 3 for the cost model.
+
 To produce a `memory=off` run for the ablation, set `memory.inject_enabled: false`
 in `.merge/config.yaml` and re-run on the same dataset; each run persists a
 `memory_effectiveness.json` under its run dir that `merge eval-memory` consumes.
diff --git a/src/cli/main.py b/src/cli/main.py
index 0dd75ca..c9a3d19 100644
--- a/src/cli/main.py
+++ b/src/cli/main.py
@@ -520,6 +520,115 @@ def eval_memory_command(on_path: str, off_path: str, out_path: str | None) -> No
             sys.exit(1)
 
 
+@cli.command("optimize-prompts")
+@click.option("--gate", "gate_id", required=True, help="gate ID, e.g. J-SYSTEM")
+@click.option(
+    "--golden",
+    "golden_path",
+    required=False,
+    default=None,
+    type=click.Path(exists=True),
+    help="JSON list of {case_id, expected_decision}; omit to only generate variants",
+)
+@click.option(
+    "--rollouts",
+    "rollouts_path",
+    required=False,
+    default=None,
+    type=click.Path(exists=True),
+    help="JSON {candidate_id: {case_id: decision}} from running each candidate "
+    "(the cost-bearing step you run offline); omit to leave candidates unscored",
+)
+@click.option(
+    "--strategies",
+    "strategies",
+    required=False,
+    default=None,
+    help="comma list of mutation strategies (default: all)",
+)
+@click.option("--margin", "margin", default=0.02, type=float, help="win margin")
+@click.option("--out", "out_path", required=False, default=None, type=click.Path())
+def optimize_prompts_command(
+    gate_id: str,
+    golden_path: str | None,
+    rollouts_path: str | None,
+    strategies: str | None,
+    margin: float,
+    out_path: str | None,
+) -> None:
+    """Phase 3 (opt-in, offline): generate + rank prompt variants for a gate.
+
+    Read-only w.r.t. production: candidates are emitted for HUMAN REVIEW and are
+    never written back to the gate registry. Supply --golden + --rollouts to
+    rank by decision accuracy; otherwise it just generates variants. See
+    doc/plan/self-learning-system.md Phase 3 for the cost model.
+    """
+    import json
+
+    from src.llm.prompts.gate_registry import get_gate, registered_gate_ids
+    from src.tools.prompt_optimizer import (
+        GoldenCase,
+        build_report,
+        propose_variants,
+        render_report_markdown,
+    )
+
+    try:
+        gate = get_gate(gate_id)
+    except KeyError:
+        console.print(
+            f"[red]Unknown gate {gate_id!r}.[/red] Registered: "
+            f"{', '.join(registered_gate_ids())}"
+        )
+        sys.exit(1)
+
+    # Offline optimization only covers gates whose render() takes no arguments.
+    try:
+        base_prompt = gate.render()
+    except TypeError:
+        console.print(
+            f"[red]Gate {gate_id!r} needs render arguments[/red] — only "
+            "no-arg / *-SYSTEM gates are supported for offline optimization."
+        )
+        sys.exit(1)
+
+    strat_list = [s.strip() for s in strategies.split(",")] if strategies else None
+    candidates = propose_variants(gate_id, base_prompt, strat_list)
+
+    golden: list[GoldenCase] = []
+    rollouts: dict[str, dict[str, str]] = {}
+    # A malformed or mis-shaped input file is a user error: report it like the
+    # other failure paths here instead of surfacing a raw traceback.
+    loading = golden_path
+    try:
+        if golden_path:
+            raw = json.loads(Path(golden_path).read_text(encoding="utf-8"))
+            golden = [GoldenCase.model_validate(item) for item in raw]
+        loading = rollouts_path
+        if rollouts_path:
+            loaded = json.loads(Path(rollouts_path).read_text(encoding="utf-8"))
+            if not isinstance(loaded, dict):
+                raise TypeError("expected a JSON object keyed by candidate_id")
+            rollouts = loaded
+    except (OSError, ValueError, TypeError) as e:
+        # markup=False: parser/validation messages contain literal brackets.
+        console.print(f"Failed to load {loading}: {e}", style="red", markup=False)
+        sys.exit(1)
+
+    report = build_report(gate_id, candidates, golden, rollouts, margin=margin)
+    console.print(render_report_markdown(report))
+
+    if out_path:
+        try:
+            Path(out_path).write_text(
+                report.model_dump_json(indent=2), encoding="utf-8"
+            )
+            console.print(f"[green]Wrote optimization report to {out_path}[/green]")
+        except OSError as e:
+            console.print(f"[red]Failed to write {out_path}: {e}[/red]")
+            sys.exit(1)
+
+
 cli.add_command(_forks_profile_group)
 
 
diff --git a/src/tools/prompt_optimizer.py b/src/tools/prompt_optimizer.py
new file mode 100644
index 0000000..abdeaf1
--- /dev/null
+++ b/src/tools/prompt_optimizer.py
@@ -0,0 +1,270 @@
+"""Phase 3: offline, opt-in prompt/strategy optimization harness.
+
+GEPA / MIPROv2-style prompt evolution, scoped to the **deterministic, testable**
+core: generate named candidate variants of a gate's system prompt, score each
+against a labelled golden set given precomputed rollouts, rank, and emit a
+**human-review** report. It NEVER mutates `gate_registry` — gates are code
+builders, so applying a winning candidate is a manual edit a human makes after
+reviewing the report (the plan's "人工评审后才生效").
+
+The expensive part — running the model with each candidate to produce decisions
+— is intentionally OUT of this module. It is injected as a ``rollouts`` mapping
+(candidate_id -> {case_id: produced_decision}) so the pure harness stays
+unit-testable and offline; producing rollouts is the operator's documented,
+cost-bearing step (PromptBreeder ~$60/10k calls — see
+doc/plan/self-learning-system.md Phase 3).
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+
+from pydantic import BaseModel, Field
+
+# --- mutation operators (deterministic, reflective-instruction injection) ----
+
+# GEPA's reflective mutations need an LLM; these are the safe deterministic
+# subset — each appends one well-known prompting directive. An LLM-reflective
+# generator can be layered on later behind the same PromptCandidate interface.
+_DIRECTIVES: dict[str, str] = {
+    "stepwise": (
+        "Before deciding, reason step by step through the specific evidence; "
+        "do not pattern-match to prior cases."
+    ),
+    "selfcheck": (
+        "After drafting your decision, re-read the inputs and verify each claim "
+        "is grounded in the provided content; revise if any is unsupported."
+    ),
+    "output_format": (
+        "Return ONLY the required structured output — no preamble, no commentary "
+        "outside the specified fields."
+    ),
+    "evidence_first": (
+        "Cite the exact lines or symbols you relied on before stating any "
+        "conclusion; an unsupported conclusion is a failure."
+    ),
+}
+
+
+def _append_directive(directive: str) -> Callable[[str], str]:
+    def _mutate(base: str) -> str:
+        return f"{base.rstrip()}\n\n{directive}"
+
+    return _mutate
+
+
+MUTATION_STRATEGIES: dict[str, Callable[[str], str]] = {
+    name: _append_directive(text) for name, text in _DIRECTIVES.items()
+}  # strategy name -> prompt mutator; propose_variants reuses the name as the id
+
+BASELINE_ID = "baseline"  # candidate_id / strategy label of the unmodified prompt
+
+
+# --- models -----------------------------------------------------------------
+
+
+class PromptCandidate(BaseModel, frozen=True):
+    candidate_id: str  # BASELINE_ID or the strategy name (see propose_variants)
+    gate_id: str  # gate id handed to propose_variants
+    strategy: str  # mutation applied; BASELINE_ID for the untouched prompt
+    prompt_text: str  # complete prompt text, appended directive included
+
+
+class GoldenCase(BaseModel, frozen=True):
+    case_id: str  # looked up in each candidate's rollout mapping when scoring
+    expected_decision: str  # label a produced decision must equal to count correct
+
+
+class CandidateScore(BaseModel, frozen=True):
+    candidate_id: str
+    cases_scored: int = Field(ge=0)  # golden cases that had a rollout decision
+    correct: int = Field(ge=0)  # scored cases whose decision == expected
+    accuracy: float = Field(ge=0.0, le=1.0)  # score_candidates: 0.0 when unscored
+
+
+class CostLedger(BaseModel):
+    llm_calls: int = Field(default=0, ge=0)
+    est_usd: float = Field(default=0.0, ge=0.0)
+
+    def record(self, *, llm_calls: int, est_usd: float) -> "CostLedger":
+        return CostLedger(
+            llm_calls=self.llm_calls + llm_calls,
+            est_usd=round(self.est_usd + est_usd, 4),
+        )
+
+
+class OptimizationReport(BaseModel, frozen=True):
+    gate_id: str
+    baseline_id: str  # candidate_id of the reference prompt (build_report: BASELINE_ID)
+    candidates: list[PromptCandidate]
+    scores: list[CandidateScore]
+    winner_id: str | None  # None is rendered as "none (kept baseline)"
+    margin: float  # accuracy gain over the baseline that a winner had to reach
+    cost: CostLedger
+    notes: list[str] = Field(default_factory=list)  # rendered under "## Notes"
+
+
+# --- harness ----------------------------------------------------------------
+
+
+def propose_variants(
+    gate_id: str,
+    base_prompt: str,
+    strategies: list[str] | None = None,
+) -> list[PromptCandidate]:
+    """Baseline + one candidate per requested strategy.
+
+    Strategies that produce text identical to the baseline (or to an earlier
+    candidate) are dropped — a no-op mutation is not a distinct candidate."""
+    names = strategies if strategies is not None else list(MUTATION_STRATEGIES)
+    seen: set[str] = {base_prompt}
+    candidates = [
+        PromptCandidate(
+            candidate_id=BASELINE_ID,
+            gate_id=gate_id,
+            strategy=BASELINE_ID,
+            prompt_text=base_prompt,
+        )
+    ]
+    for name in names:
+        mutate = MUTATION_STRATEGIES.get(name)
+        if mutate is None:
+            continue
+        text = mutate(base_prompt)
+        if text in seen:
+            continue
+        seen.add(text)
+        candidates.append(
+            PromptCandidate(
+                candidate_id=name,
+                gate_id=gate_id,
+                strategy=name,
+                prompt_text=text,
+            )
+        )
+    return candidates
+
+
+def score_candidates(
+    candidates: list[PromptCandidate],
+    rollouts: dict[str, dict[str, str]],
+    golden: list[GoldenCase],
+) -> list[CandidateScore]:
+    """Decision accuracy per candidate against ``golden``.
+
+    ``rollouts[candidate_id][case_id]`` is the decision that candidate's prompt
+    produced for that case (operator-supplied). A candidate with no rollout
+    scores ``cases_scored=0`` rather than a fabricated number — unscored is
+    surfaced, never silently treated as zero-correct."""
+    expected = {c.case_id: c.expected_decision for c in golden}
+    scores: list[CandidateScore] = []
+    for cand in candidates:
+        produced = rollouts.get(cand.candidate_id, {})
+        scored = 0
+        correct = 0
+        for case_id, exp in expected.items():
+            if case_id not in produced:
+                continue
+            scored += 1
+            if produced[case_id] == exp:
+                correct += 1
+        accuracy = round(correct / scored, 4) if scored else 0.0
+        scores.append(
+            CandidateScore(
+                candidate_id=cand.candidate_id,
+                cases_scored=scored,
+                correct=correct,
+                accuracy=accuracy,
+            )
+        )
+    return scores
+
+
+def select_winner(
+    scores: list[CandidateScore],
+    baseline_id: str = BASELINE_ID,
+    margin: float = 0.02,
+) -> str | None:
+    """The highest-accuracy candidate, but only if it beats the baseline by at
+    least ``margin`` AND was actually scored. Ties and within-margin gains keep
+    the baseline — never churn a production prompt for noise."""
+    by_id = {s.candidate_id: s for s in scores}
+    base = by_id.get(baseline_id)
+    if base is None or base.cases_scored == 0:
+        return None
+    best: CandidateScore | None = None
+    for s in scores:
+        if s.candidate_id == baseline_id or s.cases_scored == 0:
+            continue
+        if best is None or s.accuracy > best.accuracy:
+            best = s
+    if best is None:
+        return None
+    if best.accuracy - base.accuracy >= margin:
+        return best.candidate_id
+    return None
+
+
+def build_report(
+    gate_id: str,
+    candidates: list[PromptCandidate],
+    golden: list[GoldenCase],
+    rollouts: dict[str, dict[str, str]],
+    margin: float = 0.02,
+    cost: CostLedger | None = None,
+) -> OptimizationReport:
+    scores = score_candidates(candidates, rollouts, golden)
+    winner = select_winner(scores, margin=margin)
+    notes: list[str] = []
+    if not golden:
+        notes.append("No golden set supplied — candidates generated but unscored.")
+    unscored = [s.candidate_id for s in scores if s.cases_scored == 0]
+    if golden and unscored:
+        notes.append(
+            "Unscored candidates (no rollout supplied): " + ", ".join(unscored)
+        )
+    if golden and winner is None and any(s.cases_scored for s in scores):
+        notes.append(
+            f"No candidate beat baseline by the {margin:.0%} margin — keep current."
+        )
+    return OptimizationReport(
+        gate_id=gate_id,
+        baseline_id=BASELINE_ID,
+        candidates=candidates,
+        scores=scores,
+        winner_id=winner,
+        margin=margin,
+        cost=cost or CostLedger(),
+        notes=notes,
+    )
+
+
+def render_report_markdown(report: OptimizationReport) -> str:
+    lines = [
+        f"# Prompt optimization report — gate `{report.gate_id}`",
+        "",
+        "> Candidates are NOT auto-applied. Gates are code builders; to adopt a "
+        "winner, a human reviews its `prompt_text` below and edits the gate's "
+        "prompt source manually.",
+        "",
+        f"- baseline: `{report.baseline_id}`",
+        "- winner: "
+        + (f"`{report.winner_id}`" if report.winner_id else "_none (kept baseline)_"),
+        f"- margin: {report.margin:.0%}",
+        f"- cost: {report.cost.llm_calls} LLM calls, ~${report.cost.est_usd:.2f}",
+        "",
+        "## Scores",
+        "",
+        "| candidate | strategy | scored | correct | accuracy |",
+        "|---|---|---|---|---|",
+    ]
+    strategy_by_id = {c.candidate_id: c.strategy for c in report.candidates}
+    for s in report.scores:
+        lines.append(
+            f"| `{s.candidate_id}` | {strategy_by_id.get(s.candidate_id, '?')} "
+            f"| {s.cases_scored} | {s.correct} | {s.accuracy:.2%} |"
+        )
+    if report.notes:
+        lines += ["", "## Notes", ""]
+        lines += [f"- {n}" for n in report.notes]
+    return "\n".join(lines)
diff --git a/tests/unit/test_cli_optimize_prompts.py b/tests/unit/test_cli_optimize_prompts.py
new file mode 100644
index 0000000..39fdd26
--- /dev/null
+++ b/tests/unit/test_cli_optimize_prompts.py
@@ -0,0 +1,67 @@
+"""Phase 3: `merge optimize-prompts` CLI wiring (real gate, no LLM)."""
+
+from __future__ import annotations
+
+import json
+
+from click.testing import CliRunner
+
+from src.cli.main import cli
+
+
+def test_generate_variants_for_real_system_gate():
+    res = CliRunner().invoke(
+        cli, ["optimize-prompts", "--gate", "J-SYSTEM", "--strategies", "stepwise"]
+    )
+    assert res.exit_code == 0
+    assert "NOT auto-applied" in res.output
+    assert "`stepwise`" in res.output
+
+
+def test_unknown_gate_exits_nonzero():
+    res = CliRunner().invoke(cli, ["optimize-prompts", "--gate", "NOPE"])
+    assert res.exit_code == 1
+    assert "Unknown gate" in res.output
+
+
+def test_scores_with_golden_and_rollouts(tmp_path):
+    golden = tmp_path / "golden.json"
+    golden.write_text(
+        json.dumps(
+            [
+                {"case_id": "c1", "expected_decision": "take_target"},
+                {"case_id": "c2", "expected_decision": "HUMAN_REQUIRED"},
+            ]
+        ),
+        encoding="utf-8",
+    )
+    rollouts = tmp_path / "rollouts.json"
+    rollouts.write_text(
+        json.dumps(
+            {
+                "baseline": {"c1": "take_target", "c2": "take_target"},
+                "stepwise": {"c1": "take_target", "c2": "HUMAN_REQUIRED"},
+            }
+        ),
+        encoding="utf-8",
+    )
+    out = tmp_path / "report.json"
+    res = CliRunner().invoke(
+        cli,
+        [
+            "optimize-prompts",
+            "--gate",
+            "J-SYSTEM",
+            "--strategies",
+            "stepwise",
+            "--golden",
+            str(golden),
+            "--rollouts",
+            str(rollouts),
+            "--out",
+            str(out),
+        ],
+    )
+    assert res.exit_code == 0
+    report = json.loads(out.read_text(encoding="utf-8"))
+    assert report["winner_id"] == "stepwise"  # 1.0 vs baseline 0.5
diff --git a/tests/unit/test_prompt_optimizer.py b/tests/unit/test_prompt_optimizer.py
new file mode 100644
index 0000000..32f0aea
--- /dev/null
+++ b/tests/unit/test_prompt_optimizer.py
@@ -0,0 +1,125 @@
+"""Phase 3: offline prompt optimization harness (pure, no LLM)."""
+
+from __future__ import annotations
+
+from src.tools.prompt_optimizer import (
+    BASELINE_ID,
+    CostLedger,
+    GoldenCase,
+    build_report,
+    propose_variants,
+    render_report_markdown,
+    score_candidates,
+    select_winner,
+)
+
+_BASE = "You are the Judge. Decide per file."
+
+
+# --- variant generation -----------------------------------------------------
+
+
+def test_baseline_always_first_and_unmodified():
+    cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"])
+    assert cands[0].candidate_id == BASELINE_ID
+    assert cands[0].prompt_text == _BASE
+    assert cands[0].strategy == BASELINE_ID
+
+
+def test_each_strategy_appends_distinct_directive():
+    cands = propose_variants("J-SYSTEM", _BASE, ["stepwise", "selfcheck"])
+    ids = [c.candidate_id for c in cands]
+    assert ids == [BASELINE_ID, "stepwise", "selfcheck"]
+    for c in cands[1:]:
+        assert c.prompt_text.startswith(_BASE)
+        assert len(c.prompt_text) > len(_BASE)
+    # distinct mutations
+    assert cands[1].prompt_text != cands[2].prompt_text
+
+
+def test_unknown_strategy_ignored():
+    cands = propose_variants("J-SYSTEM", _BASE, ["nope", "stepwise"])
+    assert [c.candidate_id for c in cands] == [BASELINE_ID, "stepwise"]
+
+
+def test_default_strategies_is_all():
+    cands = propose_variants("J-SYSTEM", _BASE)
+    assert len(cands) >= 4  # baseline + 4 directives
+
+
+# --- scoring ----------------------------------------------------------------
+
+
+def _golden() -> list[GoldenCase]:
+    return [
+        GoldenCase(case_id="c1", expected_decision="take_target"),
+        GoldenCase(case_id="c2", expected_decision="HUMAN_REQUIRED"),
+    ]
+
+
+def test_score_accuracy_per_candidate():
+    cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"])
+    rollouts = {
+        "baseline": {"c1": "take_target", "c2": "take_target"},  # 1/2
+        "stepwise": {"c1": "take_target", "c2": "HUMAN_REQUIRED"},  # 2/2
+    }
+    scores = {s.candidate_id: s for s in score_candidates(cands, rollouts, _golden())}
+    assert scores["baseline"].accuracy == 0.5
+    assert scores["stepwise"].accuracy == 1.0
+    assert scores["stepwise"].cases_scored == 2
+
+
+def test_missing_rollout_is_unscored_not_zero():
+    cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"])
+    rollouts = {"baseline": {"c1": "take_target", "c2": "HUMAN_REQUIRED"}}
+    scores = {s.candidate_id: s for s in score_candidates(cands, rollouts, _golden())}
+    assert scores["stepwise"].cases_scored == 0  # surfaced, not fabricated
+    assert scores["stepwise"].accuracy == 0.0
+
+
+# --- winner selection -------------------------------------------------------
+
+
+def test_winner_requires_margin_over_baseline():
+    cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"])
+    rollouts = {
+        "baseline": {"c1": "take_target", "c2": "take_target"},  # 0.5
+        "stepwise": {"c1": "take_target", "c2": "HUMAN_REQUIRED"},  # 1.0
+    }
+    scores = score_candidates(cands, rollouts, _golden())
+    assert select_winner(scores, margin=0.02) == "stepwise"
+    assert select_winner(scores, margin=0.9) is None  # gain below required margin
+
+
+def test_no_winner_when_baseline_unscored():
+    cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"])
+    scores = score_candidates(cands, {"stepwise": {"c1": "take_target"}}, _golden())
+    assert select_winner(scores) is None
+
+
+# --- report -----------------------------------------------------------------
+
+
+def test_report_flags_unscored_and_never_auto_applies():
+    cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"])
+    rollouts = {"baseline": {"c1": "take_target", "c2": "HUMAN_REQUIRED"}}
+    report = build_report("J-SYSTEM", cands, _golden(), rollouts)
+    md = render_report_markdown(report)
+    assert "NOT auto-applied" in md
+    assert any("Unscored" in n for n in report.notes)
+    assert report.winner_id is None
+
+
+def test_report_without_golden_notes_unscored():
+    cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"])
+    report = build_report("J-SYSTEM", cands, [], {})
+    assert any("No golden set" in n for n in report.notes)
+    assert report.winner_id is None
+
+
+def test_cost_ledger_accumulates_immutably():
+    led = CostLedger()
+    out = led.record(llm_calls=3, est_usd=0.12).record(llm_calls=2, est_usd=0.08)
+    assert led.llm_calls == 0  # original untouched
+    assert out.llm_calls == 5
+    assert out.est_usd == 0.2

From a12953d21425d51f123855b1e8c3a61c2915540d Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 09:44:53 -0400
Subject: [PATCH 17/22] =?UTF-8?q?docs(plan):=20=E6=A0=87=E6=B3=A8=20Phase?=
 =?UTF-8?q?=203=20=E8=90=BD=E5=9C=B0=E7=8A=B6=E6=80=81=E4=B8=8E=E5=A4=96?=
 =?UTF-8?q?=E7=A7=BB=E8=BE=B9=E7=95=8C=EF=BC=88rollout/no-arg=20gate/LLM-?=
 =?UTF-8?q?=E5=8F=8D=E6=80=9D=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/plan/self-learning-system.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/doc/plan/self-learning-system.md b/doc/plan/self-learning-system.md
index 5995507..bccc0b5 100644
--- a/doc/plan/self-learning-system.md
+++ b/doc/plan/self-learning-system.md
@@ -232,6 +232,15 @@
 
 ### Phase 3 —— 离线提示/策略自动优化（opt-in，后期，成本透明）
 
+> **落地状态（2026-05-31，feat/web）**：确定性可测核心已实装（`f540613`）——
+> `src/tools/prompt_optimizer.py` + `merge optimize-prompts` CLI。生成具名候选变体
+> （GEPA 确定性子集=反思指令注入）、按 golden 决策准确率排名、产**人工评审报告**，
+> **永不自动写回 gate_registry**。**有意外移的部分**：① 昂贵的 LLM rollout 抽象为注入的
+> `rollouts` 映射（操作者自担成本产出），harness 保持纯离线可单测；② 仅支持
+> no-arg/`*-SYSTEM` gate（参数化 gate 无静态基线文本）；③ LLM-反思式变体生成（GEPA
+> 完整形态）留待后续，当前为确定性指令注入。这是 opt-in 子命令、默认不跑，符合
+> 「上界增益、不应早于 0–1」定位。
+
 **目标**：用 Phase 0 的评估器当 metric，离线对 gate 提示（`gate_registry` P-*/J-*/CA-*…）做 GEPA/MIPROv2 式进化。
 
 **强约束（来自调研成本警示）**

From 918c19452f0231e49fa86dc1a061ab7e4ae421a3 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 10:44:18 -0400
Subject: [PATCH 18/22] =?UTF-8?q?fix(memory):=20P1-A=E5=9B=BA=E5=8C=96=20p?=
 =?UTF-8?q?ersist-suppress=20=E5=88=A4=E6=8D=AE=E5=8D=87=E7=BA=A7=EF=BC=88?=
 =?UTF-8?q?=E6=B6=88=E9=99=A4=20PR-0d=20=E5=81=87=E9=98=B3=E6=80=A7?=
 =?UTF-8?q?=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

eval-memory 分析在真实 forgejo 累积 sidecar 上发现：P1-A 的持久软删沿用读取期
单臂 harmful_entry_ids(score<=-0.5)，会把 8 条薄样本(<=4 fail)误判有害并不可逆
软删——正是 PR-0d 为度量层修过的同一相关性假阳性，现落到跨 run 持久副作用上。

持久软删远比读取期 O-M6 过滤(瞬时、可逆)更该严格：
- hit_tracker.harmful_entry_ids 增 min_fail_count（默认 0，读取期行为不变）
- config: suppress_harmful_threshold=-0.8（严于读取期 -0.5）+ suppress_min_fail_count=5
- _apply_suppress_harmful_entries 用严格阈值 + fail 下限；并加确定性混淆守卫——
  条目若仅关联本 run veto(确定性)失败文件、且不沾任何 passed 文件，则其"有害"
  是相关性(确定性门不读 memory)，跳过 suppress

真实 forgejo 实证：旧判据选 8 条软删，新判据选 0 条（全部薄样本被 fail 下限拦住）。
5 新单测(fail下限/严阈值/混淆守卫/守卫不误伤沾 passed 的条目) + 3273 unit 绿
(1 pre-existing 无关 docs 测试除外)，mypy/ruff 干净
---
 src/core/orchestrator.py           |  38 ++++++++--
 src/memory/hit_tracker.py          |   9 ++-
 src/models/config.py               |  17 +++++
 tests/unit/test_memory_pruning.py  |  12 ++++
 tests/unit/test_memory_suppress.py | 112 +++++++++++++++++++++++++++--
 5 files changed, 177 insertions(+), 11 deletions(-)

diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py
index 5c5ce0b..6cd90b7 100644
--- a/src/core/orchestrator.py
+++ b/src/core/orchestrator.py
@@ -610,17 +610,22 @@ def _apply_outcome_confidence_writeback(self, state: MergeState) -> None:
     def _apply_suppress_harmful_entries(self, state: MergeState) -> None:
         """P1-A: persistently soft-delete stably-harmful memory entries.
 
-        Default OFF. When ``persist_suppress`` is on, entries whose accumulated
-        outcome score crosses the harmful threshold with at least
-        ``suppress_min_observations`` observations are marked ``suppressed`` so
-        the prune survives tracker loss across runs (the O-M6 read-time filter
-        recomputes from sidecar observations and resurrects on loss). Human and
+        Default OFF. Persistent suppress is durable and cross-run, so its bar
+        is deliberately stricter than the transient read-time O-M6 filter:
+        ``suppress_harmful_threshold`` (≈ near-universal failure, not a slim
+        majority) AND ``suppress_min_fail_count`` absolute fails. A
+        deterministic-confound guard further skips entries whose only judged
+        file association is with files that failed via a *deterministic* veto
+        this run — a deterministic gate ignores memory, so blaming the injected
+        entry is the PR-0d single-arm false positive (metrics §9.7). Human and
         bootstrap entries are exempt, mirroring OPP-5."""
         cfg = getattr(self.config, "memory", None)
         if cfg is None or not getattr(cfg, "persist_suppress", False):
             return
         harmful_ids = self._memory_hit_tracker.harmful_entry_ids(
-            min_observations=cfg.suppress_min_observations
+            threshold=getattr(cfg, "suppress_harmful_threshold", -0.8),
+            min_observations=cfg.suppress_min_observations,
+            min_fail_count=getattr(cfg, "suppress_min_fail_count", 5),
         )
         if not harmful_ids:
             return
@@ -630,6 +635,20 @@ def _apply_suppress_harmful_entries(self, state: MergeState) -> None:
             if record.decision_source
             in (DecisionSource.HUMAN, DecisionSource.BATCH_HUMAN)
         }
+        verdict = getattr(state, "judge_verdict", None)
+        passed_files = set(verdict.passed_files) if verdict else set()
+        # Files that failed via a deterministic veto this run — their failure is
+        # independent of injected memory, so an entry tied only to them is a
+        # correlational (not causal) "harm".
+        det_fail_files = (
+            {
+                issue.file_path
+                for issue in verdict.issues
+                if issue.veto_condition and issue.file_path in set(verdict.failed_files)
+            }
+            if verdict
+            else set()
+        )
         suppressed = 0
         for entry in self._memory_store.to_memory().entries:
             if entry.entry_id not in harmful_ids or entry.suppressed:
@@ -638,6 +657,13 @@ def _apply_suppress_harmful_entries(self, state: MergeState) -> None:
                 continue
             if human_files and any(fp in human_files for fp in entry.file_paths):
                 continue
+            entry_files = set(entry.file_paths)
+            if (
+                det_fail_files
+                and entry_files & det_fail_files
+                and not (entry_files & passed_files)
+            ):
+                continue
             self._memory_store = self._memory_store.suppress_entry(
                 entry.entry_id, reason="P1-A: stably-harmful judge outcomes"
             )
diff --git a/src/memory/hit_tracker.py b/src/memory/hit_tracker.py
index 3574866..d2acbbc 100644
--- a/src/memory/hit_tracker.py
+++ b/src/memory/hit_tracker.py
@@ -235,6 +235,7 @@ def harmful_entry_ids(
         self,
         threshold: float = -0.5,
         min_observations: int = 2,
+        min_fail_count: int = 0,
     ) -> frozenset[str]:
         """O-M6: entry_ids whose outcome score is at/below ``threshold``.
 
@@ -242,6 +243,12 @@ def harmful_entry_ids(
         an entry is consistently associated with judge failures. Requires
         at least ``min_observations`` total observations to avoid pruning
         entries on a single bad run.
+
+        ``min_fail_count`` (P1-A固化) additionally requires that absolute
+        failure count — a 0-pass/3-fail entry has score -1.0 but only three
+        fails, too thin to justify a *persistent* prune. The transient
+        read-time filter leaves this at 0 (loose is fine — it is recomputed
+        and reversible); the durable suppress path raises it.
         """
         with self._lock:
             harmful: set[str] = set()
@@ -249,7 +256,7 @@ def harmful_entry_ids(
                 p = counters.get("pass", 0)
                 f = counters.get("fail", 0)
                 total = p + f
-                if total < min_observations:
+                if total < min_observations or f < min_fail_count:
                     continue
                 score = (p - f) / total
                 if score <= threshold:
diff --git a/src/models/config.py b/src/models/config.py
index cb80848..7dec24f 100644
--- a/src/models/config.py
+++ b/src/models/config.py
@@ -1008,6 +1008,23 @@ class MemoryExtractionConfig(BaseModel):
         description="P1-A: minimum pass+fail observations before a harmful "
         "entry is persistently suppressed, so a single run cannot prune it.",
     )
+    suppress_harmful_threshold: float = Field(
+        default=-0.8,
+        ge=-1.0,
+        le=0.0,
+        description="P1-A固化: outcome-score ceiling for *persistent* suppress "
+        "(score=(pass-fail)/total). Stricter than the transient read-time "
+        "filter's -0.5 because suppression is durable and cross-run — require "
+        "near-universal failure, not a slim majority.",
+    )
+    suppress_min_fail_count: int = Field(
+        default=5,
+        ge=1,
+        description="P1-A固化: minimum absolute fail count before persistent "
+        "suppress — a 0-pass/3-fail entry is too thin to durably prune. Guards "
+        "the PR-0d false-positive where a few deterministic-file failures look "
+        "harmful by ratio alone.",
+    )
 
 
 class RenameDetectionConfig(BaseModel):
diff --git a/tests/unit/test_memory_pruning.py b/tests/unit/test_memory_pruning.py
index 3c71800..405e399 100644
--- a/tests/unit/test_memory_pruning.py
+++ b/tests/unit/test_memory_pruning.py
@@ -56,6 +56,18 @@ def test_harmful_threshold_is_inclusive() -> None:
     assert "edge" in tracker.harmful_entry_ids(threshold=-0.5)
 
 
+def test_harmful_min_fail_count_floor() -> None:
+    """P1-A固化: min_fail_count gates by absolute fails, not just ratio."""
+    tracker = MemoryHitTracker()
+    for i in range(3):  # 0 pass / 3 fail → score -1.0
+        f = f"f{i}"
+        tracker.record_injection([f], ["thin"])
+        tracker.record_outcome(f, success=False)
+    assert "thin" in tracker.harmful_entry_ids()  # default floor 0
+    assert "thin" not in tracker.harmful_entry_ids(min_fail_count=5)
+    assert "thin" in tracker.harmful_entry_ids(min_fail_count=3)
+
+
 def test_harmful_custom_threshold() -> None:
     tracker = MemoryHitTracker()
     tracker.record_injection(["a.py"], ["mild-bad"])
diff --git a/tests/unit/test_memory_suppress.py b/tests/unit/test_memory_suppress.py
index c3988d2..bc426fa 100644
--- a/tests/unit/test_memory_suppress.py
+++ b/tests/unit/test_memory_suppress.py
@@ -209,13 +209,14 @@ def test_persist_suppress_marks_stable_harmful_skips_human_and_bootstrap():
     store = orch._memory_store
     for e in (harmful, human, boot):
         store = store.add_entry(e)
-        _track_fails(orch._memory_hit_tracker, e.entry_id, 3)
+        _track_fails(orch._memory_hit_tracker, e.entry_id, 6)  # >= min_fail_count
     orch._memory_store = store
 
     state = SimpleNamespace(
+        judge_verdict=None,
         file_decision_records={
             "src/secret.py": SimpleNamespace(decision_source=DecisionSource.HUMAN)
-        }
+        },
     )
     orch._apply_suppress_harmful_entries(state)
 
@@ -231,6 +232,109 @@ def test_persist_suppress_respects_min_observations():
     orch = _orch(persist=True, min_obs=3)
     e = _entry("harm", ["src/a.py"])
     orch._memory_store = orch._memory_store.add_entry(e)
-    _track_fails(orch._memory_hit_tracker, e.entry_id, 2)  # below threshold
-    orch._apply_suppress_harmful_entries(SimpleNamespace(file_decision_records={}))
+    _track_fails(orch._memory_hit_tracker, e.entry_id, 2)  # below min_observations
+    orch._apply_suppress_harmful_entries(
+        SimpleNamespace(judge_verdict=None, file_decision_records={})
+    )
+    assert orch._memory_store.to_memory().entries[0].suppressed is False
+
+
+# --- P1-A固化: stricter persistent-suppress criterion (PR-0d false-positive) -
+
+
+def _track_mixed(tracker, entry_id: str, *, passes: int, fails: int) -> None:
+    for i in range(passes):
+        f = f"{entry_id}-p{i}"
+        tracker.record_injection([f], [entry_id])
+        tracker.record_outcome(f, success=True)
+    for i in range(fails):
+        f = f"{entry_id}-f{i}"
+        tracker.record_injection([f], [entry_id])
+        tracker.record_outcome(f, success=False)
+
+
+def test_suppress_needs_min_fail_count():
+    # score -1.0 but only 4 fails (< default 5) → too thin for a durable prune.
+    from types import SimpleNamespace
+
+    orch = _orch(persist=True, min_obs=3)
+    e = _entry("harm", ["src/a.py"])
+    orch._memory_store = orch._memory_store.add_entry(e)
+    _track_fails(orch._memory_hit_tracker, e.entry_id, 4)
+    orch._apply_suppress_harmful_entries(
+        SimpleNamespace(judge_verdict=None, file_decision_records={})
+    )
+    assert orch._memory_store.to_memory().entries[0].suppressed is False
+
+
+def test_suppress_needs_strict_threshold():
+    # 3 pass / 7 fail → score -0.4, above the -0.8 persistent bar (would pass the
+    # loose read-time -0.5 but not the durable suppress threshold).
+    from types import SimpleNamespace
+
+    orch = _orch(persist=True, min_obs=3)
+    e = _entry("harm", ["src/a.py"])
+    orch._memory_store = orch._memory_store.add_entry(e)
+    _track_mixed(orch._memory_hit_tracker, e.entry_id, passes=3, fails=7)
+    orch._apply_suppress_harmful_entries(
+        SimpleNamespace(judge_verdict=None, file_decision_records={})
+    )
+    assert orch._memory_store.to_memory().entries[0].suppressed is False
+
+
+def test_deterministic_confound_guard_skips_veto_only_entry():
+    # The PR-0d case: entry tied ONLY to a file that failed via a deterministic
+    # veto → its "harm" is correlational; persistent suppress must skip it.
+    from types import SimpleNamespace
+
+    from src.models.judge import IssueSeverity, JudgeIssue
+
+    orch = _orch(persist=True, min_obs=3)
+    e = _entry("harm", ["auth/oauth.go", "auth"])
+    orch._memory_store = orch._memory_store.add_entry(e)
+    _track_fails(orch._memory_hit_tracker, e.entry_id, 6)  # strongly harmful by ratio
+
+    veto_issue = JudgeIssue(
+        file_path="auth/oauth.go",
+        issue_level=IssueSeverity.CRITICAL,
+        issue_type="reverse_impact_unhandled",
+        description="reverse impact",
+        veto_condition="reverse impact unhandled",
+    )
+    verdict = SimpleNamespace(
+        passed_files=[], failed_files=["auth/oauth.go"], issues=[veto_issue]
+    )
+    orch._apply_suppress_harmful_entries(
+        SimpleNamespace(judge_verdict=verdict, file_decision_records={})
+    )
     assert orch._memory_store.to_memory().entries[0].suppressed is False
+
+
+def test_confound_guard_does_not_shield_entry_touching_passed_file():
+    # An entry that also touches a PASSED file is not purely confounded — a
+    # strongly-harmful ratio still suppresses it.
+    from types import SimpleNamespace
+
+    from src.models.judge import IssueSeverity, JudgeIssue
+
+    orch = _orch(persist=True, min_obs=3)
+    e = _entry("harm", ["auth/oauth.go", "auth/ok.go"])
+    orch._memory_store = orch._memory_store.add_entry(e)
+    _track_fails(orch._memory_hit_tracker, e.entry_id, 6)
+
+    veto_issue = JudgeIssue(
+        file_path="auth/oauth.go",
+        issue_level=IssueSeverity.CRITICAL,
+        issue_type="reverse_impact_unhandled",
+        description="reverse impact",
+        veto_condition="reverse impact unhandled",
+    )
+    verdict = SimpleNamespace(
+        passed_files=["auth/ok.go"],
+        failed_files=["auth/oauth.go"],
+        issues=[veto_issue],
+    )
+    orch._apply_suppress_harmful_entries(
+        SimpleNamespace(judge_verdict=verdict, file_decision_records={})
+    )
+    assert orch._memory_store.to_memory().entries[0].suppressed is True

From 126302cc66178dfc14247c8d45f964646cf7e0a7 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Sun, 31 May 2026 10:44:48 -0400
Subject: [PATCH 19/22] =?UTF-8?q?docs(plan):=20=E8=AE=B0=E5=BD=95=20P1-A?=
 =?UTF-8?q?=E5=9B=BA=E5=8C=96=E5=88=A4=E6=8D=AE=E5=8A=A0=E5=9B=BA=EF=BC=88?=
 =?UTF-8?q?eval-memory=20=E5=88=86=E6=9E=90=E9=A9=B1=E5=8A=A8=EF=BC=8C?=
 =?UTF-8?q?=E6=B6=88=E9=99=A4=E5=81=87=E9=98=B3=E6=80=A7=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/plan/self-learning-system.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/doc/plan/self-learning-system.md b/doc/plan/self-learning-system.md
index bccc0b5..9257690 100644
--- a/doc/plan/self-learning-system.md
+++ b/doc/plan/self-learning-system.md
@@ -178,6 +178,13 @@
 - 触发**默认 opt-in**，因 Phase 0 已能度量净收益，可在基线为正后转默认开启。
 
 **防护**：只固化满足 `min_observations` 且 effectiveness≤阈值的条目；豁免 HUMAN/bootstrap（同 OPP-5 现有豁免）；软删可经 CLI 复活。
+
+> **固化判据加固（`918c194`，eval-memory 分析驱动）**：持久软删不可逆、跨 run，
+> 判据须远严于读取期 O-M6 过滤。真实 forgejo 累积 sidecar 上，旧的单臂
+> `harmful_entry_ids(-0.5)` 会误删 8 条薄样本（≤4 fail）——正是 PR-0d 修过的相关性
+> 假阳性。已升级：`suppress_harmful_threshold=-0.8` + `suppress_min_fail_count=5`
+> + 确定性混淆守卫（条目仅关联本 run veto 失败文件且不沾 passed → 跳过）。实证旧
+> 判据选 8 条、新判据选 0 条。
 **验收**：Phase 0 harness 显示 `harmful_influence_rate` 在"tracker 重置"场景下仍不回升（=证明持久化的增量价值），且总决策质量不降。
 
 #### P1-B 激活并加固 OPP-5 写回，融合 compile/CI 信号（原则 P1）

From 31008d63b5aac93a24ed94af038caa3ebb9dde3e Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Mon, 1 Jun 2026 00:03:25 -0400
Subject: [PATCH 20/22] =?UTF-8?q?feat(eval):=20=E8=A1=A5=E9=BD=90=20LLM-?=
 =?UTF-8?q?=E5=88=A4=E6=96=AD=E5=AF=86=E9=9B=86=20golden=20=E9=9B=86?=
 =?UTF-8?q?=E5=86=B3=E7=AD=96=E9=9D=A2=EF=BC=88B-class=20=E5=B9=B2?=
 =?UTF-8?q?=E5=87=80=E5=90=88=E5=B9=B6=20+=20J-SYSTEM=20pass/fail=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

此前 golden 集仅 5 个 C-class 升级样本，三个 gate 都只有单一升级标签
（escalate_human / human_required），优化器只能惩罚"误判升级"、无法奖励
"敢自动合并"，决策面残缺。

新增 4 个 forgejo 真实样本（models/auth/auth_token.go，双侧不相交纯增量、
git 3-way 干净合并、go build 通过、golden 保留双方）：
- t1-0034/0036：auto_safe + semantic_merge + J-SYSTEM pass
- t1-0035：auto_risky（auth 结构改动，弱 risk-hint 非 security_sensitive）
- t1-0037：J-SYSTEM fail fixture——golden 正确，rollout 喂丢掉 upstream
  所有权校验的 fork-only 树（prepare 的 working_tree），judge 应判 fail

逐 gate 现状：J-SYSTEM 0→4（3 pass + 1 fail）、P-RISK 5+3、CA 5+3。

配套：
- golden.md §4/§5：更新 seed 状态，将 J-SYSTEM rollout 输入契约固化为
  golden_tree（pass）/ working_tree（fail）的具体约定
- _schemas.py：SampleMeta 增 judgment_intensive / golden_decisions 字段
- 新增真实数据集守卫单测（决策面覆盖 + golden JSON 与 meta 同步）
- doc/evaluation/README.md 索引补 golden.md
---
 doc/evaluation/README.md                      |   1 +
 doc/evaluation/dataset.md                     |   5 +
 doc/evaluation/golden.md                      | 165 ++++++++++++++
 scripts/eval/_golden.py                       | 115 ++++++++++
 scripts/eval/_schemas.py                      |  25 +++
 scripts/eval/build_golden.py                  |  86 ++++++++
 .../datasets/tier1/samples/t1-0005/meta.yaml  |   6 +
 .../datasets/tier1/samples/t1-0006/meta.yaml  |   6 +
 .../datasets/tier1/samples/t1-0031/meta.yaml  |   7 +
 .../datasets/tier1/samples/t1-0032/meta.yaml  |   7 +
 .../datasets/tier1/samples/t1-0033/meta.yaml  |   7 +
 .../datasets/tier1/samples/t1-0034/base.tar   | Bin 0 -> 10240 bytes
 .../datasets/tier1/samples/t1-0034/fork.patch |  14 ++
 .../datasets/tier1/samples/t1-0034/golden.tar | Bin 0 -> 10240 bytes
 .../datasets/tier1/samples/t1-0034/meta.yaml  |  30 +++
 .../tier1/samples/t1-0034/provenance.yaml     |  15 ++
 .../tier1/samples/t1-0034/upstream.patch      |  13 ++
 .../datasets/tier1/samples/t1-0035/base.tar   | Bin 0 -> 10240 bytes
 .../datasets/tier1/samples/t1-0035/fork.patch |  16 ++
 .../datasets/tier1/samples/t1-0035/golden.tar | Bin 0 -> 10240 bytes
 .../datasets/tier1/samples/t1-0035/meta.yaml  |  29 +++
 .../tier1/samples/t1-0035/provenance.yaml     |  15 ++
 .../tier1/samples/t1-0035/upstream.patch      |  12 ++
 .../datasets/tier1/samples/t1-0036/base.tar   | Bin 0 -> 10240 bytes
 .../datasets/tier1/samples/t1-0036/fork.patch |  14 ++
 .../datasets/tier1/samples/t1-0036/golden.tar | Bin 0 -> 10240 bytes
 .../datasets/tier1/samples/t1-0036/meta.yaml  |  30 +++
 .../tier1/samples/t1-0036/provenance.yaml     |  15 ++
 .../tier1/samples/t1-0036/upstream.patch      |  13 ++
 .../datasets/tier1/samples/t1-0037/base.tar   | Bin 0 -> 10240 bytes
 .../datasets/tier1/samples/t1-0037/fork.patch |  14 ++
 .../datasets/tier1/samples/t1-0037/golden.tar | Bin 0 -> 10240 bytes
 .../datasets/tier1/samples/t1-0037/meta.yaml  |  31 +++
 .../tier1/samples/t1-0037/provenance.yaml     |  15 ++
 .../tier1/samples/t1-0037/upstream.patch      |  14 ++
 tests/eval/golden/CA-SYSTEM.golden.json       |  34 +++
 tests/eval/golden/J-SYSTEM.golden.json        |  18 ++
 .../golden/P-RISK-SCORE-SYSTEM.golden.json    |  34 +++
 tests/eval/manifests/tier1.lock.json          |  36 +++-
 tests/eval/unit/test_build_golden.py          | 202 ++++++++++++++++++
 40 files changed, 1038 insertions(+), 6 deletions(-)
 create mode 100644 doc/evaluation/golden.md
 create mode 100644 scripts/eval/_golden.py
 create mode 100644 scripts/eval/build_golden.py
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/base.tar
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/fork.patch
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/golden.tar
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/meta.yaml
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/provenance.yaml
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/upstream.patch
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/base.tar
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/fork.patch
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/golden.tar
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/meta.yaml
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/provenance.yaml
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/upstream.patch
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/base.tar
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/fork.patch
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/golden.tar
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/meta.yaml
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/provenance.yaml
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/upstream.patch
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/base.tar
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/fork.patch
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/golden.tar
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/meta.yaml
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/provenance.yaml
 create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/upstream.patch
 create mode 100644 tests/eval/golden/CA-SYSTEM.golden.json
 create mode 100644 tests/eval/golden/J-SYSTEM.golden.json
 create mode 100644 tests/eval/golden/P-RISK-SCORE-SYSTEM.golden.json
 create mode 100644 tests/eval/unit/test_build_golden.py

diff --git a/doc/evaluation/README.md b/doc/evaluation/README.md
index c4923e8..2d0934f 100644
--- a/doc/evaluation/README.md
+++ b/doc/evaluation/README.md
@@ -86,6 +86,7 @@ Tier-1 跑得快、可天天跑；Tier-2 接近真实，但贵；Tier-3 用来
 | [dataset.md](dataset.md) | 三层评估集如何构造、如何维护、如何防止过拟合 |
 | [procedure.md](procedure.md) | 端到端评估流程、复现命令、报告产物规范 |
 | [acceptance.md](acceptance.md) | 每个指标的合格 / 不合格阈值与版本基线管理 |
+| [golden.md](golden.md) | `merge optimize-prompts --golden` 的逐 gate 决策 oracle（LLM-判断密集 golden 集）如何从样本 `meta.yaml` 派生 |
 
 ---
 
diff --git a/doc/evaluation/dataset.md b/doc/evaluation/dataset.md
index 88910d0..b3e6d30 100644
--- a/doc/evaluation/dataset.md
+++ b/doc/evaluation/dataset.md
@@ -2,6 +2,11 @@
 
 > 评估集**质量决定结论可信度**。本章定义三层数据集的构造原则、维护机制、防过拟合规则。
 > 数据集所在路径：`tests/eval/datasets/`（待建）。
+>
+> 逐 gate 的**决策 oracle**（`merge optimize-prompts --golden` 用的
+> LLM-判断密集 golden 集，与下文的 `golden.tar` 黄金树不是一回事）见
+> [golden.md](golden.md)：它从样本 `meta.yaml` 的 `judgment_intensive` /
+> `golden_decisions` 字段派生，单一真相源。
 
 ---
 
diff --git a/doc/evaluation/golden.md b/doc/evaluation/golden.md
new file mode 100644
index 0000000..edff13b
--- /dev/null
+++ b/doc/evaluation/golden.md
@@ -0,0 +1,165 @@
+# LLM-判断密集 Golden 集
+
+> 这是 `merge optimize-prompts --golden` 的决策验证集，**与 Tier-1/2/3 评估数据集
+> 的 `golden.tar`（人工黄金合并树）是两回事**：这里的 golden 是**逐 gate 的决策
+> oracle**，形如 `[{case_id, expected_decision}]`，用来给候选提示按决策准确率排名。
+
+---
+
+## 1. 为什么要"判断密集"
+
+`optimize-prompts` 的打分信号是：候选提示的 rollout 是否复现 `expected_decision`
+（`src/tools/prompt_optimizer.py` `score_candidates`）。这个信号**只有在决策真正由
+LLM 推理决定时才有区分度**。如果一个 case 的最终决策被确定性规则短路，那么所有候选
+提示都会产出同一个决策 → 准确率全相同 → 优化器无法分辨提示好坏，这个 case 是噪声。
+
+因此 golden 集必须**集中在 LLM 判断区**，排除被确定性路径决定的 case。
+
+### 纳入（judgment-intensive）
+
+- 风险分落在 `risk_score_low`/`risk_score_high` 之间的边界样本（`auto_risky`，
+  是否升级取决于 LLM 复核）。
+- C-class 真实文本冲突：双侧对同一区域做**独立**改动，ConflictAnalyst 需判断
+  能否安全语义合并、还是升级。
+- 接口契约 / 调用点漂移（M1/M3 形态），auto-merge 与升级之间的灰带。
+- 命中 `risk_hint_patterns`（弱信号，如 `**/auth/**`）但**未**命中严格
+  `security_sensitive.patterns` 的文件——`config.py` 明确"LLM 的混合分有最终发言权"。
+
+### 排除（deterministic short-circuit）
+
+- 命中 `security_sensitive.patterns`（`**/.env`、`**/credentials.go`、
+  `**/auth/credentials/**` …）→ 强制 `human_required`，与提示无关。
+- 确定性 veto / `deterministic_issues` 强制 judge `fail`。
+- `--no-llm` heuristic 封顶（如封顶 `high` 却判 `critical` 之类的不可达门）。
+- 二进制 / `deleted_only` / `excluded` 等 sentinel 风险级。
+
+判据口诀：**换一版提示词，这个 case 的决策会变吗？** 会 → 纳入；不会 → 排除。
+
+---
+
+## 2. 各 gate 的决策词表
+
+`expected_decision` 必须取自该 gate 真实的决策枚举（`_golden.py` 的
+`GATE_DECISION_VOCAB` 从生产枚举派生，写错值会让 `build_golden` 直接报错）：
+
+| gate ID | 来源枚举 | 取值 |
+|---|---|---|
+| `J-SYSTEM` | `VerdictType` (`src/models/judge.py`) | `pass` / `conditional` / `fail` |
+| `P-RISK-SCORE-SYSTEM` | `RiskLevel` (`src/models/diff.py`) | `auto_safe` / `auto_risky` / `human_required`（+ sentinel `deleted_only`/`binary`/`excluded`） |
+| `CA-SYSTEM` | `MergeDecision` (`src/models/decision.py`) | `take_current` / `take_target` / `semantic_merge` / `manual_patch` / `escalate_human` / `skip` |
+
+一个样本可同时为多个 gate 贡献 golden case（同一个 C-class 冲突既是
+`human_required` 的风险 case，也是 `escalate_human` 的 analyst case）。
+
+---
+
+## 3. 单一真相源：meta.yaml → 生成器
+
+golden 集**不手写**，而是从样本 `meta.yaml` 派生，避免与数据集标注双源漂移。
+
+在样本 `meta.yaml` 里声明：
+
+```yaml
+judgment_intensive: true     # 决策由 LLM 驱动，非确定性短路
+golden_decisions:
+  - gate_id: P-RISK-SCORE-SYSTEM
+    expected_decision: human_required
+  - gate_id: CA-SYSTEM
+    expected_decision: escalate_human
+```
+
+两个字段都是可选的（`SampleMeta` 默认 `judgment_intensive=false`、
+`golden_decisions=()`），绝大多数样本不带。`judgment_intensive: true` 但不写
+`golden_decisions` 是合法的 no-op（先入选、后标注）。
+
+生成（确定性、按 case_id 排序，输出到 `tests/eval/golden/<gate_id>.golden.json`）：
+
+```bash
+python -m scripts.eval.build_golden                 # 扫 tier 1/2/3
+python -m scripts.eval.build_golden --tier 1        # 只扫 tier-1
+```
+
+编辑样本 `meta.yaml` 后会改变其内容哈希，需重建 lock：
+
+```bash
+python -m scripts.eval.lock --update
+python -m scripts.eval.lock --verify
+```
+
+消费（rollout 是你自担成本的离线步骤，见 `self-learning-system.md` Phase 3）：
+
+```bash
+merge optimize-prompts --gate CA-SYSTEM \
+  --golden tests/eval/golden/CA-SYSTEM.golden.json \
+  --rollouts <你跑各候选产出的 {candidate_id:{case_id:decision}}.json>
+```
+
+---
+
+## 4. 当前 seed 状态（2026-05-31）
+
+数据集现有 **9 个判断密集真实样本**，覆盖完整决策面（其余 t1-0001..0030 为 TBD
+占位 / 单边平凡改动，不入选）：
+
+- **5 个 C-class 升级样本**（t1-0005/0006/0031/0032/0033）：双侧改**同一区域** →
+  应升级。
+- **3 个 B-class 干净自动合并样本**（t1-0034/0035/0036）：双侧对
+  `models/auth/auth_token.go` 的**不相交区域**做纯增量改动，git 3-way 干净合并、
+  `go build ./models/auth/` 通过，golden 保留双方（确定性 `git merge`，非系统自身
+  产物，dataset.md §1.4）。提供 auto-merge 正例。
+- **1 个 J-SYSTEM=fail fixture 样本**（t1-0037）：同形态干净合并，但 upstream 侧加的是
+  一个**安全相关**的 `BelongsTo` 所有权校验；golden.tar 仍保留双方（正确），fail 负例
+  喂的是丢掉该校验的 fork-only 树（§5）。
+
+逐 gate 现状：
+
+| gate | case 数 | 分布 |
+|---|---|---|
+| `CA-SYSTEM` | 8 | 5 `escalate_human` + 3 `semantic_merge`（t1-0034/0035/0036） |
+| `P-RISK-SCORE-SYSTEM` | 8 | 5 `human_required` + 2 `auto_safe`（t1-0034/0036） + 1 `auto_risky`（t1-0035） |
+| `J-SYSTEM` | 4 | 3 `pass`（t1-0034/0035/0036，喂 golden 树） + 1 `fail`（t1-0037，喂 fork-only 树） — 见 §5 契约 |
+
+> 决策面现已**全向**：优化器既能惩罚"把升级误判成自动合并"的提示（升级类负例），
+> 也能奖励"敢对干净合并自动放行 / 保留双方而非 take 一侧"的提示（auto-merge 正例；
+> cf. IMPLEMENTATION_REPORT t1-0003 executor take_target 丢一侧的失败模式），
+> 还能惩罚"对丢了一侧的合并仍判 pass"的提示（`J-SYSTEM=fail` 负例 t1-0037）。
+
+---
+
+## 5. J-SYSTEM rollout 输入契约（已固定）
+
+`J-SYSTEM` 的 golden 语义依赖"judge 在 rollout 时看到的是哪棵树"。`optimize-prompts`
+消费的 rollout JSON（`{candidate_id: {case_id: decision}}`）是你自担成本离线跑出来的，
+因此**这棵树由你在跑 rollout 时按本契约喂**，否则 `expected_decision` 无法对齐产出：
+
+输入树**全部是 `scripts.eval.prepare` 的现成产物**，rollout 无需临时构造（`prepare`
+为每个样本写出 `golden_tree/`（= `golden.tar`）和 `working_tree/`（= base + `fork.patch`，
+即只取 fork 侧、完全丢掉 upstream 增量的方向性 take_current 结果））：
+
+| `expected_decision` | rollout 必须喂给 judge 的合并树 | 何以判得动 |
+|---|---|---|
+| `pass` | `prepare` 的 **`golden_tree/`**（= `golden.tar`，双侧都在） | 正确合并，calibrated 提示应认可 |
+| `fail` | `prepare` 的 **`working_tree/`**（= base + `fork.patch`，丢掉 upstream 侧） | 与 `upstream.patch` 对照可见 upstream 的增量整段缺失 → 语义丢失 |
+
+固定项（rollout 脚本必须遵守，否则该 case 的标签无意义）：
+
+1. **输入树**严格取自上表，不得用系统自己跑出来的合并结果（那会把 judge 的输入与
+   被测提示耦合，丧失 oracle 地位）。
+2. judge 的其余上下文（base / upstream.patch / fork.patch、diff 摘要）按生产口径
+   从样本五件套派生，跨 candidate 保持一致——唯一变量是 judge **提示词本身**。
+   `fail` 案尤其依赖 `upstream.patch` 在场：judge 正是靠它发现 `working_tree` 缺了
+   upstream 的增量。
+3. 当前 3 个 `pass` case（t1-0034/0035/0036）均喂各自 `golden_tree/`。这三棵树都是
+   纯增量双侧合并，正确提示应稳定 `pass`；过度保守的提示会误报 `conditional` / `fail`，
+   这正是优化器要惩罚的——所以即便是 `pass`，case 仍有区分度（§1 判据：换提示会变吗？会）。
+4. `fail` case（t1-0037）喂其 `working_tree/`：fork 侧加了 `const DefaultTokenTTLHours`，
+   但 upstream 侧加的 `BelongsTo` 所有权校验在该树中完全缺失。calibrated 提示应判
+   `fail`（丢了安全校验）；对 fork-only 树仍 `pass` 的提示就是负例要惩罚的对象。
+   注意 t1-0037 仅声明 `J-SYSTEM`（其 golden.tar 本身是正确合并，可日后另补
+   `P-RISK`/`CA` 正例标签，但不与本 fail fixture 混用）。
+
+---
+
+相关：数据集构造见 [dataset.md](dataset.md)，评估流程见
+[procedure.md](procedure.md)，Phase 3 成本模型见
+[../plan/self-learning-system.md](../plan/self-learning-system.md)。
diff --git a/scripts/eval/_golden.py b/scripts/eval/_golden.py
new file mode 100644
index 0000000..1c5d9e8
--- /dev/null
+++ b/scripts/eval/_golden.py
@@ -0,0 +1,115 @@
+"""Build the LLM-judgment golden set consumed by ``merge optimize-prompts``.
+
+A *golden case* is a ``(case_id, expected_decision)`` pair for one
+``*-SYSTEM`` gate. ``optimize-prompts`` ranks prompt variants by how often
+their rollout reproduces ``expected_decision``; that signal only discriminates
+between variants on cases whose decision is genuinely driven by the LLM, not
+short-circuited by a deterministic rule (security_sensitive force, deterministic
+veto, heuristic cap). This module turns the ``judgment_intensive`` /
+``golden_decisions`` fields authored in each sample's ``meta.yaml`` into the
+per-gate ``[{case_id, expected_decision}]`` JSON the CLI consumes — meta.yaml is
+the single source of truth, so the golden set never drifts from the dataset.
+
+The gate decision vocabularies are derived from the production enums
+(``VerdictType`` / ``RiskLevel`` / ``MergeDecision``) so a renamed decision
+value fails the build instead of silently mislabelling a case. ``GoldenCase`` is
+imported from the production harness so the emitted objects are exactly what
+``optimize-prompts --golden`` validates.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from src.models.decision import MergeDecision
+from src.models.diff import RiskLevel
+from src.models.judge import VerdictType
+from src.tools.prompt_optimizer import GoldenCase
+
+from scripts.eval._ground_truth import GroundTruthMissing, load_meta
+from scripts.eval._schemas import SampleMeta
+
+# Tier -> sample container, mirroring scripts.eval.lock.TIER_LAYOUT. Every tier
+# is scanned, but only dirs carrying a SampleMeta (tier-1 micro-bench, tier-3
+# adversarial) yield cases; tier-2 replays have no meta.yaml and are skipped.
+_TIER_LAYOUT: dict[int, str] = {
+    1: "tier1/samples",
+    2: "tier2/replays",
+    3: "tier3/adversarial",
+}
+
+# expected_decision must be one of the gate's real decision values. Keyed by the
+# gate IDs registered in src/llm/prompts/gate_registry.py.
+GATE_DECISION_VOCAB: dict[str, frozenset[str]] = {
+    "J-SYSTEM": frozenset(v.value for v in VerdictType),  # src.models.judge
+    "P-RISK-SCORE-SYSTEM": frozenset(v.value for v in RiskLevel),  # src.models.diff
+    "CA-SYSTEM": frozenset(v.value for v in MergeDecision),  # src.models.decision
+}
+
+
+class GoldenBuildError(ValueError):  # raised by _validate_decision
+    """A sample declared a golden case the build rejects (typo / unknown gate)."""
+
+
+def _validate_decision(sample_id: str, gate_id: str, decision: str) -> None:
+    vocab = GATE_DECISION_VOCAB.get(gate_id)
+    if vocab is None:  # gate_id has no entry in GATE_DECISION_VOCAB
+        raise GoldenBuildError(
+            f"{sample_id}: unknown golden gate '{gate_id}' "
+            f"(known: {sorted(GATE_DECISION_VOCAB)})"
+        )
+    if decision not in vocab:  # value is outside that gate's allowed set
+        raise GoldenBuildError(
+            f"{sample_id}: '{decision}' is not a valid {gate_id} decision "
+            f"(allowed: {sorted(vocab)})"
+        )
+
+
+def _iter_sample_metas(datasets_root: Path, tiers: tuple[int, ...]) -> list[SampleMeta]:
+    metas: list[SampleMeta] = []
+    for tier in tiers:
+        layout = _TIER_LAYOUT.get(tier)
+        if layout is None:
+            continue  # tier number is not a key of _TIER_LAYOUT
+        container = datasets_root / layout
+        if not container.is_dir():
+            continue  # container directory is missing under datasets_root
+        for sample_dir in sorted(p for p in container.iterdir() if p.is_dir()):
+            try:
+                metas.append(load_meta(sample_dir))
+            except GroundTruthMissing:
+                # Not every dir is a meta-bearing sample (e.g. tier-2 replays).
+                continue
+    return metas
+
+
+def build_golden_sets(
+    datasets_root: Path,
+    tiers: tuple[int, ...] = (1, 2, 3),
+) -> dict[str, list[GoldenCase]]:
+    """Collect judgment-intensive golden cases grouped by ``*-SYSTEM`` gate.
+
+    Only samples with ``judgment_intensive: true`` contribute; each declared
+    ``golden_decisions`` entry is validated against its gate's vocabulary
+    (``GoldenBuildError`` on mismatch) and grouped under that gate. Cases within
+    a gate are sorted by ``case_id`` for deterministic output. A sample marked
+    judgment-intensive with no ``golden_decisions`` is a no-op (it contributes
+    no case) rather than an error — staging a sample before labelling it is
+    allowed.
+    """
+    grouped: dict[str, list[GoldenCase]] = {}
+    for meta in _iter_sample_metas(datasets_root, tiers):
+        if not meta.judgment_intensive:
+            continue  # not opted in: any golden_decisions it carries are ignored
+        for entry in meta.golden_decisions:
+            _validate_decision(meta.sample_id, entry.gate_id, entry.expected_decision)
+            grouped.setdefault(entry.gate_id, []).append(
+                GoldenCase(
+                    case_id=meta.sample_id,
+                    expected_decision=entry.expected_decision,
+                )
+            )
+    return {  # gates are emitted in sorted key order, cases sorted by case_id
+        gate_id: sorted(cases, key=lambda c: c.case_id)
+        for gate_id, cases in sorted(grouped.items())
+    }
diff --git a/scripts/eval/_schemas.py b/scripts/eval/_schemas.py
index 889a34d..62d61f9 100644
--- a/scripts/eval/_schemas.py
+++ b/scripts/eval/_schemas.py
@@ -380,12 +380,34 @@ class AcceptanceThresholds(BaseModel):
 # ---------------------------------------------------------------------------
 
 
+class GoldenDecision(BaseModel):
+    """One ``(gate, expected_decision)`` pair declared by a sample.
+
+    Lets a single judgment-intensive sample contribute a golden case to
+    several ``*-SYSTEM`` gates at once (e.g. the same C-class conflict is a
+    ``human_required`` risk case and an ``escalate_human`` conflict-analyst
+    case). ``expected_decision`` is validated against the gate's real
+    decision vocabulary in ``_golden.py`` — never here, so this schema stays
+    free of a src import.
+    """
+
+    model_config = _FROZEN
+
+    gate_id: str  # plain string here; not checked against any gate list
+    expected_decision: str  # plain string here; vocabulary check lives elsewhere
+
+
 class SampleMeta(BaseModel):
     """Parsed contents of one sample's ``meta.yaml``.
 
     Mirrors the keys produced by the Phase 1 reference samples
     (``tests/eval/datasets/.../meta.yaml``). Tier-3 entries additionally
     set ``loss_class`` to one of M1..M6; Tier-1/2 leave it ``None``.
+
+    ``judgment_intensive`` + ``golden_decisions`` opt a sample into the
+    LLM-judgment golden set consumed by ``merge optimize-prompts --golden``
+    (see ``doc/evaluation/golden.md``). They are absent on most samples and
+    default to "not a golden case".
     """
 
     model_config = _FROZEN
@@ -396,6 +418,8 @@ class SampleMeta(BaseModel):
     loss_class: str | None = None
     expected_human: bool = False
     description: str | None = None
+    judgment_intensive: bool = False
+    golden_decisions: tuple[GoldenDecision, ...] = ()
 
 
 class GoldenFileEntry(BaseModel):
@@ -431,6 +455,7 @@ class GroundTruthBundle(BaseModel):
     "GateOperator",
     "GateResult",
     "GateVerdict",
+    "GoldenDecision",
     "GoldenFileEntry",
     "GroundTruthBundle",
     "ManifestEntry",
diff --git a/scripts/eval/build_golden.py b/scripts/eval/build_golden.py
new file mode 100644
index 0000000..5155b74
--- /dev/null
+++ b/scripts/eval/build_golden.py
@@ -0,0 +1,86 @@
+"""Emit per-gate LLM-judgment golden JSON from dataset meta.yaml.
+
+For every ``*-SYSTEM`` gate that any judgment-intensive sample labels, writes
+``<out-dir>/<gate_id>.golden.json`` in the ``[{case_id, expected_decision}]``
+shape consumed by ``merge optimize-prompts --golden``. meta.yaml is the single
+source of truth (see ``doc/evaluation/golden.md``); re-run this whenever a
+sample's ``judgment_intensive`` / ``golden_decisions`` fields change.
+
+Usage:
+    python -m scripts.eval.build_golden                 # defaults below
+    python -m scripts.eval.build_golden --tier 1 --tier 3
+    python -m scripts.eval.build_golden \
+        --datasets tests/eval/datasets --out-dir tests/eval/golden
+"""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from scripts.eval._common import write_json
+from scripts.eval._golden import build_golden_sets
+
+REPO_ROOT = Path(__file__).resolve().parents[2]  # up from scripts/eval/ to repo root
+DEFAULT_DATASETS_DIR = REPO_ROOT / "tests" / "eval" / "datasets"
+DEFAULT_OUT_DIR = REPO_ROOT / "tests" / "eval" / "golden"
+
+
+def _eprint(message: str) -> None:
+    print(message)  # NOTE(review): goes to stdout despite the "e" name — confirm
+
+
+def cmd_build(datasets_dir: Path, out_dir: Path, tiers: tuple[int, ...]) -> int:
+    golden = build_golden_sets(datasets_dir, tiers=tiers)
+    if not golden:
+        _eprint(
+            "No judgment-intensive golden cases found "
+            f"(datasets={datasets_dir}, tiers={list(tiers)})."
+        )
+        return 0  # nothing is written, but this still exits successfully
+    for gate_id, cases in golden.items():  # one <gate_id>.golden.json per gate
+        out_path = out_dir / f"{gate_id}.golden.json"
+        payload = [case.model_dump(mode="json") for case in cases]
+        write_json(out_path, payload, sort_keys=False)
+        _eprint(f"{gate_id}: {len(cases)} case(s) -> {out_path}")
+    return 0
+
+
+def _build_arg_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="scripts.eval.build_golden",
+        description="Emit per-gate LLM-judgment golden JSON from dataset meta.yaml.",
+    )
+    parser.add_argument(
+        "--datasets",
+        default=str(DEFAULT_DATASETS_DIR),
+        help=f"Datasets root (default: {DEFAULT_DATASETS_DIR}).",
+    )
+    parser.add_argument(
+        "--out-dir",
+        default=str(DEFAULT_OUT_DIR),
+        help=f"Directory to write <gate>.golden.json into (default: {DEFAULT_OUT_DIR}).",
+    )
+    parser.add_argument(
+        "--tier",
+        type=int,
+        choices=(1, 2, 3),
+        action="append",  # repeatable flag, e.g. --tier 1 --tier 3
+        dest="tiers",  # no default given, so it stays None when --tier is absent
+        help="Tier to scan (repeatable). Default: all of 1, 2, 3.",
+    )
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    args = _build_arg_parser().parse_args(argv)
+    tiers = tuple(args.tiers) if args.tiers else (1, 2, 3)  # no --tier: scan all
+    return cmd_build(
+        datasets_dir=Path(args.datasets).resolve(),
+        out_dir=Path(args.out_dir).resolve(),
+        tiers=tiers,
+    )
+
+
+if __name__ == "__main__":  # pragma: no cover - direct CLI entry
+    raise SystemExit(main())
diff --git a/tests/eval/datasets/tier1/samples/t1-0005/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0005/meta.yaml
index 0d7482f..644b1de 100644
--- a/tests/eval/datasets/tier1/samples/t1-0005/meta.yaml
+++ b/tests/eval/datasets/tier1/samples/t1-0005/meta.yaml
@@ -9,3 +9,9 @@ description: |
   merge` produces conflict markers, so the system is correct to
   escalate (expected_human=true). Backfilled 2026-05-16 from real
   3-way merge dry-run.
+judgment_intensive: true     # LLM-driven: no security_sensitive force / deterministic veto
+golden_decisions:
+  - gate_id: P-RISK-SCORE-SYSTEM
+    expected_decision: human_required
+  - gate_id: CA-SYSTEM
+    expected_decision: escalate_human
diff --git a/tests/eval/datasets/tier1/samples/t1-0006/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0006/meta.yaml
index a956150..07dde15 100644
--- a/tests/eval/datasets/tier1/samples/t1-0006/meta.yaml
+++ b/tests/eval/datasets/tier1/samples/t1-0006/meta.yaml
@@ -8,3 +8,9 @@ description: |
   requirements.txt (overlapping hunk). `git merge` conflicts on that
   file, so the system is correct to escalate (expected_human=true).
   Backfilled 2026-05-16 from real 3-way merge dry-run.
+judgment_intensive: true     # LLM-driven: no security_sensitive force / deterministic veto
+golden_decisions:
+  - gate_id: P-RISK-SCORE-SYSTEM
+    expected_decision: human_required
+  - gate_id: CA-SYSTEM
+    expected_decision: escalate_human
diff --git a/tests/eval/datasets/tier1/samples/t1-0031/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0031/meta.yaml
index 0809f39..0862216 100644
--- a/tests/eval/datasets/tier1/samples/t1-0031/meta.yaml
+++ b/tests/eval/datasets/tier1/samples/t1-0031/meta.yaml
@@ -17,3 +17,10 @@ description: |
   Under single-shot Tier-1 eval the expected outcome is escalation (no
   human present), so this exercises C-class escalation coverage rather than
   auto-merge-to-golden.
+judgment_intensive: true     # auth/** only hits the weak risk-hint nudge, not the
+                             # strict security_sensitive floor — decision stays LLM-driven
+golden_decisions:
+  - gate_id: P-RISK-SCORE-SYSTEM
+    expected_decision: human_required
+  - gate_id: CA-SYSTEM
+    expected_decision: escalate_human
diff --git a/tests/eval/datasets/tier1/samples/t1-0032/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0032/meta.yaml
index 2f413f1..b6b69e4 100644
--- a/tests/eval/datasets/tier1/samples/t1-0032/meta.yaml
+++ b/tests/eval/datasets/tier1/samples/t1-0032/meta.yaml
@@ -20,3 +20,10 @@ description: |
   while this sample provides the C-class coverage + correct golden
   reference. Under single-shot Tier-1 eval the expected outcome is
   escalation (no human present).
+judgment_intensive: true     # auth/** only hits the weak risk-hint nudge, not the
+                             # strict security_sensitive floor — decision stays LLM-driven
+golden_decisions:
+  - gate_id: P-RISK-SCORE-SYSTEM
+    expected_decision: human_required
+  - gate_id: CA-SYSTEM
+    expected_decision: escalate_human
diff --git a/tests/eval/datasets/tier1/samples/t1-0033/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0033/meta.yaml
index be779b7..0272e03 100644
--- a/tests/eval/datasets/tier1/samples/t1-0033/meta.yaml
+++ b/tests/eval/datasets/tier1/samples/t1-0033/meta.yaml
@@ -16,3 +16,10 @@ description: |
   additions are load-bearing; a directional take_current/take_target would
   silently drop one feature. Under single-shot Tier-1 eval the expected
   outcome is escalation (no human present).
+judgment_intensive: true     # auth/** only hits the weak risk-hint nudge, not the
+                             # strict security_sensitive floor — decision stays LLM-driven
+golden_decisions:
+  - gate_id: P-RISK-SCORE-SYSTEM
+    expected_decision: human_required
+  - gate_id: CA-SYSTEM
+    expected_decision: escalate_human
diff --git a/tests/eval/datasets/tier1/samples/t1-0034/base.tar b/tests/eval/datasets/tier1/samples/t1-0034/base.tar
new file mode 100644
index 0000000000000000000000000000000000000000..b38e5ebd0856682bb4ec8a97500c3f68d85fda12
GIT binary patch
literal 10240
zcmeHKQE%Ev5Y8+96;q_P_8m0|Ayh(&PDvm=bd>8Qq^(p%)Uh|P7vrVtg@ilhzu(O6
z+Tg^fFMaA6DIxak&dz-E%{QC5h*+F9gQc43UlS$nSmIB`-jDn2<EPVZ@6k7Yo_=@U
zcHZtC938Zd4iDRHvLCeG9v<z%{*Mb`x2I*Q0*N8*Uhz{~noT$pi?!s_nS$1S>k!5>
z2IoRf+1~<AkQPFwKAglc=rt)wmNL0wk?-K%%MZQJZ-zW%Ny^>~B9<tA%bD!L`@z_8
z7D0FyOc@Zn9EZ;rLMm{bT0InrVh^fLf6DbjiDo(rT1Opv!IDr!Jef9U?17$d=X#yu
zb5?ig*~I4KS95OI8{O!IMZ+>?X_Iy>6_0n>jm>i$wO%lrF8?P`TqGZsav@R%sgf9t
z<E#Qnv18I#DT_b}v<S%{7ReM8lXIP>kajyRz(z`ar#8d|W3(GEGjri7JL?$g<ZeRl
zCWKn<j#A$`ItUeC1qz{sC6)t%Fcix~;b|D0JTX>GqC6!$hIUP?7hLzmJ=Iu&LMAMN
zG`$xRt6lqG|Gdz~UGZldCfY~cL2S$7Cg16Tg+!MK*gW7d1W|;WQost^ynN?&nS{_M
z^Cl_REH{Pm%2WK;sY#}mvK#@ou3c;a(5=I-CJfJc)FBrV98)oo`U}b`Q7Oh(D37T&
zk-`+jFCmoifl)6*1sLC;2Zaw$RHqH_^+Cvax4yWe69i8}S^tV<eX_c^crfbqKf^`;
zynoRjo%JtWDsKaL<k8-t5O>SPA8c)pGr(mBnY~w|!CzN>TK#*F&RFy@h<Ows!888X
zGLB|37x2l9Hu%w>V;EfxhtTVvpIi;c@OlTWU(<>GhXt2eaOgg3T7Qh)d#Qr?!j@zl
z+{A1Y%#oH-ta!vy%~8pM7!=tcL8B#&TL@5t??LXt^=x8P3ComBT6E``L0G4gH3xaZ
z6>h@S(T#t>raZ+aeNX!1ChWfKu#8B9R40fuH|M0lZ*|;d4B{y;f`jx`pX<^pL!DpB
zGRnPUa5qB4TlwTWeS@%nk`6@}#00DB`KJsUjk$!jKN9z@mj(QeB}}5Y&~fJEseMDd
z$80tCN$W$cl$QL|w$#>BzQSH12I>(pP<h^DjWrqJ4%rwa5v+2a;a^K#&_F>t6U#V)
z8>2r)ZE-Ux3HgN;$NtmG`$j8oh)#J{_-Em}p?Uy2a{MzhdK%ghHkEFqh5>zDLcIno
z>>wzhZwc>#YZAilYHTHAS`6JvNk2kr=3w|2>>FmO06Se`{$dU{YsGNFjuD$%F41(@
zGVkD?<J3hAFy%;#;s!rFPR|2^klqUXCBL&XhZ?n9p3<q%JqDV9{+S-a7djwqaXW3#
zso@0f(=^88aV0+a=5Z`8^-y$;8(f+DZrB2o+UPCT!Iw_Ed)*)(*^6$ceT_qHVn8|u
zY^!%n^^+-27*)*k2Y9KF%6(M;UK0|#kQA|-X_5XWW(;}_G#R>YS#?XT$&_nhX5Ewz
ztxvSC3-sb+Mp2vYm%~=~+9PL{1eq(ysDidf1jWkA=7bGFbkbJU&Uq4TT1^-I7GLcc
zF+}|h52%l<P!!yhQ$CNxYTtLcU7|QTa?5AB5(T+PJTDV6C`e6xkw4E1{0T|RT)gto
ze41b?NSWW@WYo)mr04n;rSgD1;)?q!KOE6UQKfEsaSRFGC5Rywe|$7>*B$}?*I^{T
zFEFrYYw)J&_o-QLvzW9@z5LdNm-odmbl2e8kx=~y90hp(V_xu0SPhEBmbu4bwymkU
z&^SKu<*sR9EBj<;d2MqjprmM?5h2Rh_M54P@qD^IWkEu28=G=&_=~8Pr*OKyA~D`h
zBtXTYNb-BDy$0v<MNL|smho1eoM~;N)myn>A+6>381|pklPdDvbj!3w3ybl))x{eW
z3YuWDw@eF2-^`5``GvPASTUK>HC&j9pUb3$py0~Qt?=cVro#3WWkw2Y{z>vjc;CSC
z%=}N1E4JCTDOrhB&-}~f+(kO-G5=HlPg^BdAy6SuAy6SuAy6SuAy6SuAy6SuAy6Su
ZAy6SuAy6SuAy6SuAy6SuA@F}k;6IBB+2Q~I

literal 0
HcmV?d00001

diff --git a/tests/eval/datasets/tier1/samples/t1-0034/fork.patch b/tests/eval/datasets/tier1/samples/t1-0034/fork.patch
new file mode 100644
index 0000000..56b1590
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0034/fork.patch
@@ -0,0 +1,14 @@
+diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go
+index d01ddbca1e..2a7ac465fd 100644
+--- a/models/auth/auth_token.go
++++ b/models/auth/auth_token.go
+@@ -17,6 +17,9 @@ import (
+ 
+ type AuthorizationPurpose string
+ 
++// MaxActiveTokensPerUser caps how many concurrent long-term tokens a user may hold.
++const MaxActiveTokensPerUser = 10
++
+ var (
+ 	// Used to store long term authorization tokens.
+ 	LongTermAuthorization AuthorizationPurpose = "long_term_authorization"
diff --git a/tests/eval/datasets/tier1/samples/t1-0034/golden.tar b/tests/eval/datasets/tier1/samples/t1-0034/golden.tar
new file mode 100644
index 0000000000000000000000000000000000000000..af3c803ad2de47b781626050d7cada97d0deec5a
GIT binary patch
literal 10240
zcmeHLZBN@s5auiX71K!9I(Ml9Ax))7(FsWCMMrZSgtk(J=wdI456A1S*MZ!L|9&&O
zYbORreCemkk%(i@?z}wn%*>jF2zZpXS*GUt>spCB9(yyf^W#2y_~~`KJ9Nd*)9d?g
zud}oNZol*H;Go+j`F`hpr?&%pKhA~inzB?ei7stl@l#9MZ8#RmN`|w!g3ey&0H$*e
zCqmBnKLQSs7DA>T97Yl7H7Q7*a=GMz=iuIpv*DMwr=ic|l)oJZJXYat$Ymcsj;D^3
zF#nFtI1sxWCtM^#DsY|MhA(2pA5?>W%2lF7JDs!6yB<B@u`hx!p0(%vf$nb?dYuXv
zyy4Kj@y)|adtulc+~|d6K^F0}O}jD`M%&~@)^r@TO1KR-{FkXv#AlgIM9Lvm61{OK
z+K=qv&{yG-PpL@KGcGR?3-DQz!d%?Lg2gMq*!)aNgaRTF&)zC77oe*u1qK;ySg;kY
zj}Ss!oT_bl2e7~AI7=o8Gc=FJ0@Q>a3CXo`(8>v3A+6iil;`Z8qV*{nH%ZLJrMo=F
z0W`?hYf^VzQb&+)m}V3{Q<zc{3G;oC#R{zv5*i#0UNw|xhT^}rS`g$X>WNzl6f!Vj
zX?iatLbQ8kf1YdMHvjVt*IGxz!JJn4O}f*B`$W?K_=1HIupq!qDIl|KT%L2A#XgKk
zybj7W%MBr#)gk`p>`JaO8CQzkhE}lwK)(UM88@2FqjtFzpe7oYTtZRBiWpQ%kI6QX
z!X(72AhOYcUT3}n^lv<b(kF~nuM6<)LC8hFk=&655yrl3e8aRpSzTT{o(xA{;Cys4
zIv-7rM;9)ow*_I)q`jv?+-1q1d}YlU;BrG|@8x9t_vMII|H0BZ4?eLd3>XqTue;rx
zqq)x|d~%_6e)888CYPtDFdUs6UY<_j%@$g}A&aAjB$T;xs6KBN-W0pWg<^}urew-)
zB0gaYq@@(gFyN`?sN^gHMKXv{Xi4Li9Ms@9q0rzqZD5oM%apvO>B=($xIvaR2g5j2
zxCvJWH{Lm)g((*4N9ucS%<rp=<wP2%IzXhkv6Di0tNpHGP?!QIIH<2{yDqIV)cLh4
zqCz`5cOyi!5l^wxGYET!>8bEpM6kL|?}%feahK5cCgR?0<_>@0F_+j}$egh}vrmZk
z7_H_$%^2S_8AWYamo1g`3>$N-+maqlV^zdW(nymL?vRZv4q#dE41Z-hp-e$K7g-d*
zjp;w8+u~;GB*bi3hFE_aZQVzuVV?MB?z_Hvpm9K>&O0{OmR5vCr3<NLKwni*uLX%6
zKqd4o;hW%^fUvq+8^M?oL$y-UpOBh4=>9qXj!`PWN|zYFSisFnah&W^#O5}XC_3nv
zXK)|lv_}ju<e)&eFsJ7MK}e_y{E{#1ETG0-E|176^antb*E`n3|6DtyC2p_Vbarv_
zkLVPD=5Zz7dFFO1F7z;VjT&58=x*5rQ@7Dutb(t-ZvU!9I`RkoUiS)z+O+{`09aNZ
zxEjT?Fy@r8rZ>jJj8yLH67ZUkScTLP`*|<YS7OF^*g}z^`-W7P)QUt!FU+L{o+N$J
z`}#l+KIIg(;eI{n^skz<+>#(y1vynv_K2X^wz9UcCWsE&cD0i*4%VfnlYWbjwu~5}
zK4si!r8=^WqNJvp@`We1_r1vNijJc_w|u4}5i6R+^E4rYSZeZ%{CS?>&q!MC;*p2u
z(>1k(0sIb!lVJ`d-8VKURXE%vuDGuY<A^pkRjRgE1Bme~K@73@<E?>v_Yv?Ni&Oi3
zg^oR2gC|^XM8$fW`=r~{FW>v{%YE4mT{XCNBGl*sM*-S@EE2wUt3|%pG!J>iH+!m1
zG>#8^xNREP%s$y!o!bHms7PAVG$D%E)|<(P(R{Qz;w&b$t#vxr{6$o&LpWMpQZwER
zgki@bPo5rwi}poDS{;_rRvnxvZPTkaV!=dO$zT9`Px46_c`@9o-l7|e(YsN_YZOX7
z!9;JF#;CuU6D{%!Pf@VsGR5OH2s81EHfbR!sd5V`e7&Nnuz5t8QwKILXWj(Q8<?KC
z_gRLDueYsJR?({G-bJ=>k&eZfKc_QrjPF!*uEjS(bC?P;9R1`nzD{5rQL8cI-5g7>
zoF$KN#gd*wEbeV6FP&2gYj{r4M~-1`2uW%2)6Xx-?pj;r*@aFjKlQ%@>fmYwY6NNo
qY6NNoY6NNoY6NNoY6NNoY6NNoY6NNoY6NNoY6NNoY6N}~1pWtie@b)!

literal 0
HcmV?d00001

diff --git a/tests/eval/datasets/tier1/samples/t1-0034/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0034/meta.yaml
new file mode 100644
index 0000000..30042f7
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0034/meta.yaml
@@ -0,0 +1,30 @@
+sample_id: t1-0034
+tier: 1
+category: B
+loss_class: null         # organic clean-merge sample (not M-injected)
+expected_human: false    # disjoint additive changes 3-way merge cleanly → auto-merge, no escalation
+description: |
+  B-class clean auto-merge: models/auth/auth_token.go — fork and upstream
+  make INDEPENDENT, NON-overlapping additions that git 3-way merges with no
+  conflict:
+    * fork adds `const MaxActiveTokensPerUser` directly below the
+      AuthorizationPurpose type declaration (top of file).
+    * upstream appends a `Remaining()` method after HashValidator (end of
+      file).
+  Golden keeps BOTH (deterministic `git merge`, `go build ./models/auth/`
+  verified); it is the obvious correct clean merge, NOT the merge system's
+  own output (dataset.md §1.4). Purely additive — no signature or behavior
+  change — so risk is low. Seeds the auto_safe / semantic_merge / judge-pass
+  positive decision face that the C-class escalation samples (t1-0031..0033)
+  cannot exercise.
+judgment_intensive: true     # risk is the LLM hybrid score (no deterministic short-
+                             # circuit: additive, not security_sensitive); a weaker
+                             # prompt could over-escalate this clean merge or drop one
+                             # side (cf. t1-0003 executor take_target, IMPL_REPORT)
+golden_decisions:
+  - gate_id: P-RISK-SCORE-SYSTEM
+    expected_decision: auto_safe
+  - gate_id: CA-SYSTEM
+    expected_decision: semantic_merge   # keep both disjoint additions, don't take one side
+  - gate_id: J-SYSTEM
+    expected_decision: pass             # judge fed the golden (both-sides) tree → pass (golden.md §5)
diff --git a/tests/eval/datasets/tier1/samples/t1-0034/provenance.yaml b/tests/eval/datasets/tier1/samples/t1-0034/provenance.yaml
new file mode 100644
index 0000000..0bbd157
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0034/provenance.yaml
@@ -0,0 +1,15 @@
+# Capture provenance — sibling of meta.yaml, not consumed by prepare.
+# Kept for audit trail (dataset.md §2.4 maintenance rules) so reviewers
+# can re-derive the sample from the original git history.
+repo: /Users/angel/AI/merge-test/forgejo
+base_ref: 160377405c53145e56dd0aab6ee05fce9764c184
+upstream_ref: a3635aa2f3cb1a2578f4f0b6bf8f477be562e8c1
+fork_ref: ba2beff0d03cd7f9434063df63d894ec11c3e51e
+golden_ref: 682270de0c402a0fd73be751648269f271065574
+paths: ['models/auth/auth_token.go']
+# F8: files touched in base→golden but in neither base→upstream nor
+# base→fork — content the human added directly during the merge commit.
+# Reviewers should weigh whether to keep the sample (the merge system
+# cannot reconstruct these files from the two patches alone, so they
+# surface as MISS_UPSTREAM / MISS_FORK in eval diffs).
+noisy_paths: []
diff --git a/tests/eval/datasets/tier1/samples/t1-0034/upstream.patch b/tests/eval/datasets/tier1/samples/t1-0034/upstream.patch
new file mode 100644
index 0000000..56bd093
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0034/upstream.patch
@@ -0,0 +1,13 @@
+diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go
+index d01ddbca1e..76acc10e5b 100644
+--- a/models/auth/auth_token.go
++++ b/models/auth/auth_token.go
+@@ -111,3 +111,8 @@ func HashValidator(validator []byte) string {
+ 	h.Write(validator)
+ 	return hex.EncodeToString(h.Sum(nil))
+ }
++
++// Remaining reports the duration until the token expires, relative to now.
++func (authToken *AuthorizationToken) Remaining() time.Duration {
++	return time.Until(authToken.Expiry.AsLocalTime())
++}
diff --git a/tests/eval/datasets/tier1/samples/t1-0035/base.tar b/tests/eval/datasets/tier1/samples/t1-0035/base.tar
new file mode 100644
index 0000000000000000000000000000000000000000..b38e5ebd0856682bb4ec8a97500c3f68d85fda12
GIT binary patch
literal 10240
zcmeHKQE%Ev5Y8+96;q_P_8m0|Ayh(&PDvm=bd>8Qq^(p%)Uh|P7vrVtg@ilhzu(O6
z+Tg^fFMaA6DIxak&dz-E%{QC5h*+F9gQc43UlS$nSmIB`-jDn2<EPVZ@6k7Yo_=@U
zcHZtC938Zd4iDRHvLCeG9v<z%{*Mb`x2I*Q0*N8*Uhz{~noT$pi?!s_nS$1S>k!5>
z2IoRf+1~<AkQPFwKAglc=rt)wmNL0wk?-K%%MZQJZ-zW%Ny^>~B9<tA%bD!L`@z_8
z7D0FyOc@Zn9EZ;rLMm{bT0InrVh^fLf6DbjiDo(rT1Opv!IDr!Jef9U?17$d=X#yu
zb5?ig*~I4KS95OI8{O!IMZ+>?X_Iy>6_0n>jm>i$wO%lrF8?P`TqGZsav@R%sgf9t
z<E#Qnv18I#DT_b}v<S%{7ReM8lXIP>kajyRz(z`ar#8d|W3(GEGjri7JL?$g<ZeRl
zCWKn<j#A$`ItUeC1qz{sC6)t%Fcix~;b|D0JTX>GqC6!$hIUP?7hLzmJ=Iu&LMAMN
zG`$xRt6lqG|Gdz~UGZldCfY~cL2S$7Cg16Tg+!MK*gW7d1W|;WQost^ynN?&nS{_M
z^Cl_REH{Pm%2WK;sY#}mvK#@ou3c;a(5=I-CJfJc)FBrV98)oo`U}b`Q7Oh(D37T&
zk-`+jFCmoifl)6*1sLC;2Zaw$RHqH_^+Cvax4yWe69i8}S^tV<eX_c^crfbqKf^`;
zynoRjo%JtWDsKaL<k8-t5O>SPA8c)pGr(mBnY~w|!CzN>TK#*F&RFy@h<Ows!888X
zGLB|37x2l9Hu%w>V;EfxhtTVvpIi;c@OlTWU(<>GhXt2eaOgg3T7Qh)d#Qr?!j@zl
z+{A1Y%#oH-ta!vy%~8pM7!=tcL8B#&TL@5t??LXt^=x8P3ComBT6E``L0G4gH3xaZ
z6>h@S(T#t>raZ+aeNX!1ChWfKu#8B9R40fuH|M0lZ*|;d4B{y;f`jx`pX<^pL!DpB
zGRnPUa5qB4TlwTWeS@%nk`6@}#00DB`KJsUjk$!jKN9z@mj(QeB}}5Y&~fJEseMDd
z$80tCN$W$cl$QL|w$#>BzQSH12I>(pP<h^DjWrqJ4%rwa5v+2a;a^K#&_F>t6U#V)
z8>2r)ZE-Ux3HgN;$NtmG`$j8oh)#J{_-Em}p?Uy2a{MzhdK%ghHkEFqh5>zDLcIno
z>>wzhZwc>#YZAilYHTHAS`6JvNk2kr=3w|2>>FmO06Se`{$dU{YsGNFjuD$%F41(@
zGVkD?<J3hAFy%;#;s!rFPR|2^klqUXCBL&XhZ?n9p3<q%JqDV9{+S-a7djwqaXW3#
zso@0f(=^88aV0+a=5Z`8^-y$;8(f+DZrB2o+UPCT!Iw_Ed)*)(*^6$ceT_qHVn8|u
zY^!%n^^+-27*)*k2Y9KF%6(M;UK0|#kQA|-X_5XWW(;}_G#R>YS#?XT$&_nhX5Ewz
ztxvSC3-sb+Mp2vYm%~=~+9PL{1eq(ysDidf1jWkA=7bGFbkbJU&Uq4TT1^-I7GLcc
zF+}|h52%l<P!!yhQ$CNxYTtLcU7|QTa?5AB5(T+PJTDV6C`e6xkw4E1{0T|RT)gto
ze41b?NSWW@WYo)mr04n;rSgD1;)?q!KOE6UQKfEsaSRFGC5Rywe|$7>*B$}?*I^{T
zFEFrYYw)J&_o-QLvzW9@z5LdNm-odmbl2e8kx=~y90hp(V_xu0SPhEBmbu4bwymkU
z&^SKu<*sR9EBj<;d2MqjprmM?5h2Rh_M54P@qD^IWkEu28=G=&_=~8Pr*OKyA~D`h
zBtXTYNb-BDy$0v<MNL|smho1eoM~;N)myn>A+6>381|pklPdDvbj!3w3ybl))x{eW
z3YuWDw@eF2-^`5``GvPASTUK>HC&j9pUb3$py0~Qt?=cVro#3WWkw2Y{z>vjc;CSC
z%=}N1E4JCTDOrhB&-}~f+(kO-G5=HlPg^BdAy6SuAy6SuAy6SuAy6SuAy6SuAy6Su
ZAy6SuAy6SuAy6SuAy6SuA@F}k;6IBB+2Q~I

literal 0
HcmV?d00001

diff --git a/tests/eval/datasets/tier1/samples/t1-0035/fork.patch b/tests/eval/datasets/tier1/samples/t1-0035/fork.patch
new file mode 100644
index 0000000..b73724a
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0035/fork.patch
@@ -0,0 +1,16 @@
+diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go
+index d01ddbca1e..9c86827a3a 100644
+--- a/models/auth/auth_token.go
++++ b/models/auth/auth_token.go
+@@ -57,6 +57,11 @@ func (authToken *AuthorizationToken) IsExpired() bool {
+ 	return authToken.Expiry.AsLocalTime().Before(time.Now())
+ }
+ 
++// IsLongTerm reports whether the token is a long-term authorization token.
++func (authToken *AuthorizationToken) IsLongTerm() bool {
++	return authToken.Purpose == LongTermAuthorization
++}
++
+ // GenerateAuthToken generates a new authentication token for the given user.
+ // It returns the lookup key and validator values that should be passed to the
+ // user via a long-term cookie.
diff --git a/tests/eval/datasets/tier1/samples/t1-0035/golden.tar b/tests/eval/datasets/tier1/samples/t1-0035/golden.tar
new file mode 100644
index 0000000000000000000000000000000000000000..466e0865f6a7b61fea14496cbb728100c982f1da
GIT binary patch
literal 10240
zcmeHKQE%He5YDUhE3O9C%9|^vj*|igf+2~W&RFVoiQN_`0+*2(n~N;E6z#+t^51ty
zQnHmO%O3hv$Uy5zcRb#G_uWT&E+ZbNEw<D%^WQ|v8=eGHx%2%#yZAZibav<)KM%j(
z?zIng-t50=@9*t(I%I#-e%s#Pf!*(y!qbtKsb&gOdfMZMwzOJsA{T2VrZWxg-S!@g
zW*knXn)1IS93d^FN&`5GV=z7`D4ue);!)t?+KUgp&#%u!$di=69z;CR;#zRkh4+Jz
z=Pg)x!=@aFU7jcA3#l~tUacO=MDsgcr$5ztp=B$bvG)D}z2HeGBauv7Gk!<U*K^~i
z#hlkYdbYU*_|=+Q_C{C6v1nMvJZ;gcr55p1c57>Ro?b7wD_8u@w2;Y%rCP|8L#h>K
z<9RElD0eJ+obm{?M2l1$Vwp@qb2T?*O68WL0}fIec(pTh7@^&QnVAbu*<OcGr*IPr
zHzCyWa8&x%)-2Rw#WX?-OKb<QFqF$g<8GLoI<i(QqP!&BhE`2n6!d!#o_ef7BNG-Z
zO>d>bZr47zKhKTvsr<7A6BDEEA+}|CQ|xrXL!wIre9lA+EQ)YZ3fN&+SKwVQlMwo3
z-W26qciWgod5QmcHO2K(l_TKSO^7W3x^?))rqS^3P0ED=N7PKD{(`DXbc*>E+GD#-
zq_7PMN(g0RVAjh}1LimALE$43eb52;dMDMqTVLGJ4pJncs(;0{KJYF&4~D(|XE^Vl
z_Rssnlm3NI?QMdH8npII%A4ilcfNMpS>Upb?Aq~g@aMQs-oLSQ#-ooc77;^&XZ){a
z9L;hr;FB3$^20wyFdUzqL9c&$G(H=_t0!pvigrBbD}*;1C*sZo&xSS_;q+c;HeYzP
z{@p^TtN`>X@v&_iv8$L5*&L~><Vr+5H9S_F#h}Rs2^t--{6Z0?K@G?w_zhP)b<43S
zlOHpk_OPrIVGK`2A~Y^S@8~Kx=Tnj5AipP#@)Lerwpm8DL23%bQLy`@Ag)c`Wh@gZ
zaDsz0*zD_*m!;RwWfkSoF}W)#<E?t~m4QV#I7-iC$YO%tZv@92hmre)b}*E;eq)nx
zCTlbmsOu@*&NxoBGHGiwFPJDZ`uO@Oef-}AWV`)W1sMqs4<U0epy3Al9Z$GI?IfE8
zZR*}oy4WZ)+H{nK<{)shO{zqyH5DtA0va%L0uFTEGP1^I*5C$*jU^GR@-f1{mWGQ=
zLpqboID#uHul6v(#Uzvz7ghr0Wed0vgNlr^kJ9%;eFyGD4NmOI+BAVsGmK6&Enn6p
zQ`3ZnJLd{eIVc<8+k#L$np?$K1;MaN(LbpS=`sCt{tc_tfI_CQemRG$wdQz%7$G*-
zLZNA|ZQsGIz)J;UfF(yp^Ede6cILPv2<ffF|I{-pbExsCQpdC_bPs_J_~69cG0sgw
z#^N7z8eR=A7=5~I;P&W=Pl0_L$qRGl`!)=E=CPZu!6c?;i3{-Mpwqo<QjF|H_n>o$
zXZFN`bO@*j@3`(KQ;~4$SR)wVok?l`WdV3YNE8qf{B9<-=9`!?=rz$~Cw0qe)L>1f
zT<SCHrhRCAAnjeC7auc<x^lnlwY!%M3g$?Vg@TMKXuC&HoFZ;^*bu}N?Z)j?B++Kn
zOw+IN6%w3nUvS)%5%o7bpgD39Q3z8``8*RR+s|^lL@Z6_j?Z-Cqa3YBJZ=**h^4l_
z$e+gz{)D9EKHli)_?=)WNSR;ZXxPhur04n;rSgOu#1;Q#zB!_e=K)RIi$h58szeNN
z_~WC6yLJ!wQ;yTg@d6Wj<bxOKpijeko#kX?>ZfmA`02KohM5}l9ZKE5!&40Rzt0=K
zNvlb@xHk8A%(pewG#XDnd}$h5(rnk+2P?~an?nI5O>0;YqKe(T*?!o_$LnLx5(?Yg
zv~$B>M76wx<Mo)tc)JjWid9mGy|NN*X-Lb<veC+mv!iXbdaD*}q_g0Ow)>!;)RAY)
zEz=ewEH>`e6mL){Xo9WYF^!SF*$X@J3onVV;wq(I8PZ->a+!1x6hisA6~0{3sjz*!
znvnwA-+;jouS(dSS@1~-%{R+7Eh~}gS#YtO`$$J~L;0b9vRB1b2vi7E2vi7E2vi7E
k2vi7E2vi7E2vi7E2vi7E2vi7E2vi7E2vi7E2t13xzx6XIyZ`_I

literal 0
HcmV?d00001

diff --git a/tests/eval/datasets/tier1/samples/t1-0035/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0035/meta.yaml
new file mode 100644
index 0000000..7a6404a
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0035/meta.yaml
@@ -0,0 +1,29 @@
+sample_id: t1-0035
+tier: 1
+category: B
+loss_class: null         # organic clean-merge sample (not M-injected)
+expected_human: false    # disjoint additive changes 3-way merge cleanly → auto-merge with caution
+description: |
+  B-class clean auto-merge: models/auth/auth_token.go — fork and upstream
+  make INDEPENDENT, NON-overlapping additions to the token type that git
+  3-way merges with no conflict:
+    * fork adds an `IsLongTerm()` method right after `IsExpired()` (mid-file).
+    * upstream adds a `RevokedUnix timeutil.TimeStamp` field to the
+      AuthorizationToken struct (before `Expiry`).
+  Golden keeps BOTH (deterministic `git merge`, `go build ./models/auth/`
+  verified). Unlike t1-0034/0036 this touches the token struct shape and
+  lifecycle logic under models/auth/** — it only hits the weak risk-hint
+  nudge (NOT the strict security_sensitive floor), so the hybrid LLM score
+  has the final say (config.py). The correct outcome is auto_risky: merge
+  both, but flag the auth-struct change for attention — exercises the
+  auto_risky positive that distinguishes a calibrated prompt from one that
+  treats every auth touch as auto_safe.
+judgment_intensive: true     # auth/** weak hint only; risk decision is LLM-driven, in
+                             # the risk_score_low..high band rather than short-circuited
+golden_decisions:
+  - gate_id: P-RISK-SCORE-SYSTEM
+    expected_decision: auto_risky
+  - gate_id: CA-SYSTEM
+    expected_decision: semantic_merge   # keep both disjoint additions, don't take one side
+  - gate_id: J-SYSTEM
+    expected_decision: pass             # judge fed the golden (both-sides) tree → pass (golden.md §5)
diff --git a/tests/eval/datasets/tier1/samples/t1-0035/provenance.yaml b/tests/eval/datasets/tier1/samples/t1-0035/provenance.yaml
new file mode 100644
index 0000000..1e96229
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0035/provenance.yaml
@@ -0,0 +1,15 @@
+# Capture provenance — sibling of meta.yaml, not consumed by prepare.
+# Kept for audit trail (dataset.md §2.4 maintenance rules) so reviewers
+# can re-derive the sample from the original git history.
+repo: /Users/angel/AI/merge-test/forgejo
+base_ref: 160377405c53145e56dd0aab6ee05fce9764c184
+upstream_ref: b176f1ce49adab083d987e3e0fa034b03b0994c1
+fork_ref: d0523cc96b39b5ff27d7abaa53e51a257cc9e465
+golden_ref: 47b9b9619040f22617135330bfd586deddb07afd
+paths: ['models/auth/auth_token.go']
+# F8: files touched in base→golden but in neither base→upstream nor
+# base→fork — content the human added directly during the merge commit.
+# Reviewers should weigh whether to keep the sample (the merge system
+# cannot reconstruct these files from the two patches alone, so they
+# surface as MISS_UPSTREAM / MISS_FORK in eval diffs).
+noisy_paths: []
diff --git a/tests/eval/datasets/tier1/samples/t1-0035/upstream.patch b/tests/eval/datasets/tier1/samples/t1-0035/upstream.patch
new file mode 100644
index 0000000..62a21a5
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0035/upstream.patch
@@ -0,0 +1,12 @@
+diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go
+index d01ddbca1e..0f7d96086d 100644
+--- a/models/auth/auth_token.go
++++ b/models/auth/auth_token.go
+@@ -40,6 +40,7 @@ type AuthorizationToken struct {
+ 	LookupKey       string `xorm:"INDEX UNIQUE"`
+ 	HashedValidator string
+ 	Purpose         AuthorizationPurpose `xorm:"NOT NULL DEFAULT 'long_term_authorization'"`
++	RevokedUnix     timeutil.TimeStamp
+ 	Expiry          timeutil.TimeStamp
+ }
+ 
diff --git a/tests/eval/datasets/tier1/samples/t1-0036/base.tar b/tests/eval/datasets/tier1/samples/t1-0036/base.tar
new file mode 100644
index 0000000000000000000000000000000000000000..b38e5ebd0856682bb4ec8a97500c3f68d85fda12
GIT binary patch
literal 10240
zcmeHKQE%Ev5Y8+96;q_P_8m0|Ayh(&PDvm=bd>8Qq^(p%)Uh|P7vrVtg@ilhzu(O6
z+Tg^fFMaA6DIxak&dz-E%{QC5h*+F9gQc43UlS$nSmIB`-jDn2<EPVZ@6k7Yo_=@U
zcHZtC938Zd4iDRHvLCeG9v<z%{*Mb`x2I*Q0*N8*Uhz{~noT$pi?!s_nS$1S>k!5>
z2IoRf+1~<AkQPFwKAglc=rt)wmNL0wk?-K%%MZQJZ-zW%Ny^>~B9<tA%bD!L`@z_8
z7D0FyOc@Zn9EZ;rLMm{bT0InrVh^fLf6DbjiDo(rT1Opv!IDr!Jef9U?17$d=X#yu
zb5?ig*~I4KS95OI8{O!IMZ+>?X_Iy>6_0n>jm>i$wO%lrF8?P`TqGZsav@R%sgf9t
z<E#Qnv18I#DT_b}v<S%{7ReM8lXIP>kajyRz(z`ar#8d|W3(GEGjri7JL?$g<ZeRl
zCWKn<j#A$`ItUeC1qz{sC6)t%Fcix~;b|D0JTX>GqC6!$hIUP?7hLzmJ=Iu&LMAMN
zG`$xRt6lqG|Gdz~UGZldCfY~cL2S$7Cg16Tg+!MK*gW7d1W|;WQost^ynN?&nS{_M
z^Cl_REH{Pm%2WK;sY#}mvK#@ou3c;a(5=I-CJfJc)FBrV98)oo`U}b`Q7Oh(D37T&
zk-`+jFCmoifl)6*1sLC;2Zaw$RHqH_^+Cvax4yWe69i8}S^tV<eX_c^crfbqKf^`;
zynoRjo%JtWDsKaL<k8-t5O>SPA8c)pGr(mBnY~w|!CzN>TK#*F&RFy@h<Ows!888X
zGLB|37x2l9Hu%w>V;EfxhtTVvpIi;c@OlTWU(<>GhXt2eaOgg3T7Qh)d#Qr?!j@zl
z+{A1Y%#oH-ta!vy%~8pM7!=tcL8B#&TL@5t??LXt^=x8P3ComBT6E``L0G4gH3xaZ
z6>h@S(T#t>raZ+aeNX!1ChWfKu#8B9R40fuH|M0lZ*|;d4B{y;f`jx`pX<^pL!DpB
zGRnPUa5qB4TlwTWeS@%nk`6@}#00DB`KJsUjk$!jKN9z@mj(QeB}}5Y&~fJEseMDd
z$80tCN$W$cl$QL|w$#>BzQSH12I>(pP<h^DjWrqJ4%rwa5v+2a;a^K#&_F>t6U#V)
z8>2r)ZE-Ux3HgN;$NtmG`$j8oh)#J{_-Em}p?Uy2a{MzhdK%ghHkEFqh5>zDLcIno
z>>wzhZwc>#YZAilYHTHAS`6JvNk2kr=3w|2>>FmO06Se`{$dU{YsGNFjuD$%F41(@
zGVkD?<J3hAFy%;#;s!rFPR|2^klqUXCBL&XhZ?n9p3<q%JqDV9{+S-a7djwqaXW3#
zso@0f(=^88aV0+a=5Z`8^-y$;8(f+DZrB2o+UPCT!Iw_Ed)*)(*^6$ceT_qHVn8|u
zY^!%n^^+-27*)*k2Y9KF%6(M;UK0|#kQA|-X_5XWW(;}_G#R>YS#?XT$&_nhX5Ewz
ztxvSC3-sb+Mp2vYm%~=~+9PL{1eq(ysDidf1jWkA=7bGFbkbJU&Uq4TT1^-I7GLcc
zF+}|h52%l<P!!yhQ$CNxYTtLcU7|QTa?5AB5(T+PJTDV6C`e6xkw4E1{0T|RT)gto
ze41b?NSWW@WYo)mr04n;rSgD1;)?q!KOE6UQKfEsaSRFGC5Rywe|$7>*B$}?*I^{T
zFEFrYYw)J&_o-QLvzW9@z5LdNm-odmbl2e8kx=~y90hp(V_xu0SPhEBmbu4bwymkU
z&^SKu<*sR9EBj<;d2MqjprmM?5h2Rh_M54P@qD^IWkEu28=G=&_=~8Pr*OKyA~D`h
zBtXTYNb-BDy$0v<MNL|smho1eoM~;N)myn>A+6>381|pklPdDvbj!3w3ybl))x{eW
z3YuWDw@eF2-^`5``GvPASTUK>HC&j9pUb3$py0~Qt?=cVro#3WWkw2Y{z>vjc;CSC
z%=}N1E4JCTDOrhB&-}~f+(kO-G5=HlPg^BdAy6SuAy6SuAy6SuAy6SuAy6SuAy6Su
ZAy6SuAy6SuAy6SuAy6SuA@F}k;6IBB+2Q~I

literal 0
HcmV?d00001

diff --git a/tests/eval/datasets/tier1/samples/t1-0036/fork.patch b/tests/eval/datasets/tier1/samples/t1-0036/fork.patch
new file mode 100644
index 0000000..0d1e1d5
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0036/fork.patch
@@ -0,0 +1,14 @@
+diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go
+index d01ddbca1e..bd3dd47a89 100644
+--- a/models/auth/auth_token.go
++++ b/models/auth/auth_token.go
+@@ -26,6 +26,9 @@ var (
+ 
+ 	// Used to reset the password.
+ 	PasswordReset AuthorizationPurpose = "password_reset"
++
++	// Used to store API-only long term tokens.
++	APIAuthorization AuthorizationPurpose = "api_authorization"
+ )
+ 
+ // Used to activate the specified email address for a user.
diff --git a/tests/eval/datasets/tier1/samples/t1-0036/golden.tar b/tests/eval/datasets/tier1/samples/t1-0036/golden.tar
new file mode 100644
index 0000000000000000000000000000000000000000..45f18f4f91ace7c71f3ddda94bdc37278d20e7b2
GIT binary patch
literal 10240
zcmeHKZBN@s5auiX71Ky<ox5Nl1S&;BCtgA?I-27kw3RAE7kf$eaJ=q%6Ud$T?>Do%
zcH-cuU;3%%NNuoZcXsBPXP((iWW?ian@#oD{CBJ615bjH-1u>y9sC@0IvezjpVjYo
zo!#Ay-TmG6{@z}vL-yVFyY|io?EJV8)_a;}nkfuv{feL3vb_x_GMy_i8f$3pwD({*
z=5Qv}i2p6&2x%cz7Qj&)gISY-;u%*n9t9rmy}CI4ymc-@o@9KhAMr$sd%;x~KJ<s4
zm$LAIjW`gyJWotgsWkZBW+Rk|=1;mof2w(^<#sk^?fnCK!IMx%A{lLu`4c_gPs}<k
zCcNR%vyCmlukDFtZ**rCmJQRGXWO)EsztocZf#A^)AN+Obj3eR3z=L@RVp(MnN}E$
z=gpX+*fHtrj7Oj)TBPC-%VY$atBFZdDz_aMU?XLLw|R~WhG@57=H}8<e%2v0$lWcu
zyCu{Lca-|l(Ja(r#xz0;Q!EFtFqG3o<7pV2I<i(wqB<o!hIZXLFSu^SJ=Iu)Mkb^z
z%O0h|YHwb+KQE1OUHtimTjOKJuSM+WqQ51Rc)kQ(0T$Xyp4TS8Qn88<maz&VN+U~o
zNX(3YPnd{-MG?lA0g}l<7<l*7B!nKB7x?*hx#d8!I+%aGO~v(8RfFv}jEg0#yAAl&
zhSBt%O~|E;hr}D~vXlT!bcXSjm|%%T{Ij(WD!6NXVARu41IE`sh0;eP`k({w^+~Ep
zw~;>32~s4XYJ5cot*kCD?hj6TpW(81*1PNtPI^~9F{uS2YSP|wDIcckAAIhPv%uvC
z*}d0;{$JNUTKzlA#ytASVi7UyrM%TP^LEH%F5#0KUGRfHhcLK4KZn!a+0pg+5Z<oo
zptp2l?<o~34-Vbu`Vb6JD6TY{q^=}Gb{F#jn_vehITI1jOgkvfV$fuR1dWa~ekni$
zeiI50e$ypJm2gbSRm@P>&fEr_Y&a+qp>Y$gj_!g>J`x$~!3UB$KjDv6hvh`-XC^_U
zxjiQXac|<TVvxvy6C5Pv=3JjvS?c^!l~LgxgS(S5UdpG~8CZmaqwHLUEGAg}W^l~0
z(YQ}&2Lt)&H}ins^MotZF*?qkJaTV{_n58WK8=;oh_WL;wJo*vNX)QTh=FDr^>vXq
zSz|*+ctAF?B!XGNGyH363d%HOV>yi@xU*7cg&jAOl8|4R3G6?kR9i;mLv$+g!oLXL
z5A_qc=@y*WDc3TNu&GQVwJhlK3hK2Wbu*`gz9W1Se47w<S8FL5+hXWeDf%asVGf3W
z$-iNi8nDw9<}W93H`g48`4F+W7Ya>#ZTk)$1rB(`08@^PC@%2B<ILzG2<ffF|I~MO
z7Et5ZP{(vCbPs`sZE#}d{iO-WSp0)d)7!*h-=o2g$Ky(T3hd)hUYYspTQ|6}@ZE9+
zCbcnJoP#e1o$gJGeB>{>2b~+7pSKpIL%_Cr&vh>uiG)+dnn54$FG~AwO28XJVi%Gk
zc5^K<-^7gmX$wu3?n_qFQgbpDT9{iG<wNU=_H}_?e9S58(*3g6?%p)XnIl2&3UaET
z?HNIFva&c~K@gL)Q?)aZM2l84MZd>aYeo!Ff58LlBPSFkH`SCcB5~UHU2az>j)~my
znQmXK5Q&#%LI$zS)))EnvcR8^wA{zr70stxOa&?P8ypQzb0F!tu|%mN;3jd!e^VTe
zXrri7x4k-q1n&~W5QjfLTDUi#0e@C;lHXSt*o!rIsSSG6toM0LMy6hW>%!~Dau}v-
zaP2_q-V=@jJpZ{U_#&(p#p23*%45E)six34KJewbY2Yfmva`Cj1r$(Gw5Amy%GmXr
zt%vn|JU`|vA-AnXIT!pzRI5`so?nv~FDJrKu_%(_itKLOg?v$yR;OjXRVQa#+iLYv
zE?7usIXr}&m3mS|zMF29wisctewVs<fkH_WZ1#?6jP%XkXpvudi-H+f8Qu7$o%n@J
zItWUx{K5)fZfGhjUs1-S!1ni8Fu?l;mS-G%QbO~^wnfP*q<S1&O(#Cmu^97Cz4~}5
zu)l@qev3Z_Gk6?xGfzlo^67^c6DvE($j)4Be!=M1jk%`0aOS6#7xNdFloYUbc?2a1
zLDK!yzw+zkY6NNoY6NNoY6NNoY6NNoY6NNoY6NNoY6NNoY6NNoY6NNoY6NNo{%;8U
E2fEEXj{pDw

literal 0
HcmV?d00001

diff --git a/tests/eval/datasets/tier1/samples/t1-0036/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0036/meta.yaml
new file mode 100644
index 0000000..e6c381e
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0036/meta.yaml
@@ -0,0 +1,30 @@
+sample_id: t1-0036
+tier: 1
+category: B
+loss_class: null         # organic clean-merge sample (not M-injected)
+expected_human: false    # disjoint additive changes 3-way merge cleanly → auto-merge, no escalation
+description: |
+  B-class clean auto-merge: models/auth/auth_token.go — fork and upstream
+  make INDEPENDENT, NON-overlapping additions that git 3-way merges with no
+  conflict:
+    * fork adds an `APIAuthorization` value to the AuthorizationPurpose enum
+      block (mid-file `var (...)`).
+    * upstream appends a package-level `PurposeIsLongTerm()` helper at the
+      end of the file.
+  Golden keeps BOTH (deterministic `git merge`, `go build ./models/auth/`
+  verified); the obvious correct clean merge, NOT the merge system's own
+  output (dataset.md §1.4). Additive enum + helper, no signature or behavior
+  change → low risk. A second auto_safe / semantic_merge / judge-pass anchor
+  (alongside t1-0034) so the optimizer can reward "dares to auto-merge"
+  prompts rather than only penalising over-merge.
+judgment_intensive: true     # risk is the LLM hybrid score (no deterministic short-
+                             # circuit: additive, not security_sensitive); a weaker
+                             # prompt could over-escalate this clean merge or drop one
+                             # side (cf. t1-0003 executor take_target, IMPL_REPORT)
+golden_decisions:
+  - gate_id: P-RISK-SCORE-SYSTEM
+    expected_decision: auto_safe
+  - gate_id: CA-SYSTEM
+    expected_decision: semantic_merge   # keep both disjoint additions, don't take one side
+  - gate_id: J-SYSTEM
+    expected_decision: pass             # judge fed the golden (both-sides) tree → pass (golden.md §5)
diff --git a/tests/eval/datasets/tier1/samples/t1-0036/provenance.yaml b/tests/eval/datasets/tier1/samples/t1-0036/provenance.yaml
new file mode 100644
index 0000000..6666b7c
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0036/provenance.yaml
@@ -0,0 +1,15 @@
+# Capture provenance — sibling of meta.yaml, not consumed by prepare.
+# Kept for audit trail (dataset.md §2.4 maintenance rules) so reviewers
+# can re-derive the sample from the original git history.
+repo: /Users/angel/AI/merge-test/forgejo
+base_ref: 160377405c53145e56dd0aab6ee05fce9764c184
+upstream_ref: 4dba8b162302ae91bb0819121b8227a3ae910a1d
+fork_ref: f378af4abf63ce5930c65392c2fe55006a2a76d7
+golden_ref: a432c4db17710daff03f923eb1b6f7003ff6a69e
+paths: ['models/auth/auth_token.go']
+# F8: files touched in base→golden but in neither base→upstream nor
+# base→fork — content the human added directly during the merge commit.
+# Reviewers should weigh whether to keep the sample (the merge system
+# cannot reconstruct these files from the two patches alone, so they
+# surface as MISS_UPSTREAM / MISS_FORK in eval diffs).
+noisy_paths: []
diff --git a/tests/eval/datasets/tier1/samples/t1-0036/upstream.patch b/tests/eval/datasets/tier1/samples/t1-0036/upstream.patch
new file mode 100644
index 0000000..b48a273
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0036/upstream.patch
@@ -0,0 +1,13 @@
+diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go
+index d01ddbca1e..e9a461ee66 100644
+--- a/models/auth/auth_token.go
++++ b/models/auth/auth_token.go
+@@ -111,3 +111,8 @@ func HashValidator(validator []byte) string {
+ 	h.Write(validator)
+ 	return hex.EncodeToString(h.Sum(nil))
+ }
++
++// PurposeIsLongTerm reports whether the purpose is the long-term variant.
++func PurposeIsLongTerm(p AuthorizationPurpose) bool {
++	return p == LongTermAuthorization
++}
diff --git a/tests/eval/datasets/tier1/samples/t1-0037/base.tar b/tests/eval/datasets/tier1/samples/t1-0037/base.tar
new file mode 100644
index 0000000000000000000000000000000000000000..b38e5ebd0856682bb4ec8a97500c3f68d85fda12
GIT binary patch
literal 10240
zcmeHKQE%Ev5Y8+96;q_P_8m0|Ayh(&PDvm=bd>8Qq^(p%)Uh|P7vrVtg@ilhzu(O6
z+Tg^fFMaA6DIxak&dz-E%{QC5h*+F9gQc43UlS$nSmIB`-jDn2<EPVZ@6k7Yo_=@U
zcHZtC938Zd4iDRHvLCeG9v<z%{*Mb`x2I*Q0*N8*Uhz{~noT$pi?!s_nS$1S>k!5>
z2IoRf+1~<AkQPFwKAglc=rt)wmNL0wk?-K%%MZQJZ-zW%Ny^>~B9<tA%bD!L`@z_8
z7D0FyOc@Zn9EZ;rLMm{bT0InrVh^fLf6DbjiDo(rT1Opv!IDr!Jef9U?17$d=X#yu
zb5?ig*~I4KS95OI8{O!IMZ+>?X_Iy>6_0n>jm>i$wO%lrF8?P`TqGZsav@R%sgf9t
z<E#Qnv18I#DT_b}v<S%{7ReM8lXIP>kajyRz(z`ar#8d|W3(GEGjri7JL?$g<ZeRl
zCWKn<j#A$`ItUeC1qz{sC6)t%Fcix~;b|D0JTX>GqC6!$hIUP?7hLzmJ=Iu&LMAMN
zG`$xRt6lqG|Gdz~UGZldCfY~cL2S$7Cg16Tg+!MK*gW7d1W|;WQost^ynN?&nS{_M
z^Cl_REH{Pm%2WK;sY#}mvK#@ou3c;a(5=I-CJfJc)FBrV98)oo`U}b`Q7Oh(D37T&
zk-`+jFCmoifl)6*1sLC;2Zaw$RHqH_^+Cvax4yWe69i8}S^tV<eX_c^crfbqKf^`;
zynoRjo%JtWDsKaL<k8-t5O>SPA8c)pGr(mBnY~w|!CzN>TK#*F&RFy@h<Ows!888X
zGLB|37x2l9Hu%w>V;EfxhtTVvpIi;c@OlTWU(<>GhXt2eaOgg3T7Qh)d#Qr?!j@zl
z+{A1Y%#oH-ta!vy%~8pM7!=tcL8B#&TL@5t??LXt^=x8P3ComBT6E``L0G4gH3xaZ
z6>h@S(T#t>raZ+aeNX!1ChWfKu#8B9R40fuH|M0lZ*|;d4B{y;f`jx`pX<^pL!DpB
zGRnPUa5qB4TlwTWeS@%nk`6@}#00DB`KJsUjk$!jKN9z@mj(QeB}}5Y&~fJEseMDd
z$80tCN$W$cl$QL|w$#>BzQSH12I>(pP<h^DjWrqJ4%rwa5v+2a;a^K#&_F>t6U#V)
z8>2r)ZE-Ux3HgN;$NtmG`$j8oh)#J{_-Em}p?Uy2a{MzhdK%ghHkEFqh5>zDLcIno
z>>wzhZwc>#YZAilYHTHAS`6JvNk2kr=3w|2>>FmO06Se`{$dU{YsGNFjuD$%F41(@
zGVkD?<J3hAFy%;#;s!rFPR|2^klqUXCBL&XhZ?n9p3<q%JqDV9{+S-a7djwqaXW3#
zso@0f(=^88aV0+a=5Z`8^-y$;8(f+DZrB2o+UPCT!Iw_Ed)*)(*^6$ceT_qHVn8|u
zY^!%n^^+-27*)*k2Y9KF%6(M;UK0|#kQA|-X_5XWW(;}_G#R>YS#?XT$&_nhX5Ewz
ztxvSC3-sb+Mp2vYm%~=~+9PL{1eq(ysDidf1jWkA=7bGFbkbJU&Uq4TT1^-I7GLcc
zF+}|h52%l<P!!yhQ$CNxYTtLcU7|QTa?5AB5(T+PJTDV6C`e6xkw4E1{0T|RT)gto
ze41b?NSWW@WYo)mr04n;rSgD1;)?q!KOE6UQKfEsaSRFGC5Rywe|$7>*B$}?*I^{T
zFEFrYYw)J&_o-QLvzW9@z5LdNm-odmbl2e8kx=~y90hp(V_xu0SPhEBmbu4bwymkU
z&^SKu<*sR9EBj<;d2MqjprmM?5h2Rh_M54P@qD^IWkEu28=G=&_=~8Pr*OKyA~D`h
zBtXTYNb-BDy$0v<MNL|smho1eoM~;N)myn>A+6>381|pklPdDvbj!3w3ybl))x{eW
z3YuWDw@eF2-^`5``GvPASTUK>HC&j9pUb3$py0~Qt?=cVro#3WWkw2Y{z>vjc;CSC
z%=}N1E4JCTDOrhB&-}~f+(kO-G5=HlPg^BdAy6SuAy6SuAy6SuAy6SuAy6SuAy6Su
ZAy6SuAy6SuAy6SuAy6SuA@F}k;6IBB+2Q~I

literal 0
HcmV?d00001

diff --git a/tests/eval/datasets/tier1/samples/t1-0037/fork.patch b/tests/eval/datasets/tier1/samples/t1-0037/fork.patch
new file mode 100644
index 0000000..180d450
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0037/fork.patch
@@ -0,0 +1,14 @@
+diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go
+index d01ddbca1e..33a8d710cc 100644
+--- a/models/auth/auth_token.go
++++ b/models/auth/auth_token.go
+@@ -17,6 +17,9 @@ import (
+ 
+ type AuthorizationPurpose string
+ 
++// DefaultTokenTTLHours is the fork's default long-term token lifetime.
++const DefaultTokenTTLHours = 720
++
+ var (
+ 	// Used to store long term authorization tokens.
+ 	LongTermAuthorization AuthorizationPurpose = "long_term_authorization"
diff --git a/tests/eval/datasets/tier1/samples/t1-0037/golden.tar b/tests/eval/datasets/tier1/samples/t1-0037/golden.tar
new file mode 100644
index 0000000000000000000000000000000000000000..f304fa004f7bfea666b3c5807302223c3bdafa84
GIT binary patch
literal 10240
zcmeHLZBN@s5auiX71Ky<ox5OY2vmxMPC!B%U7G7bXe(8S8hexYaJ=sBI*>c@-*0Aj
z?Zm+xU;5=LM=FVTcIV}pXJ*!&^N_`I*U!{U|G8HDmL=YlZ~eH>9)1oxoh|yt&*t~r
zgLY@@&B2@Y!Tx@yL-IH6x9x*1*!yuVJgq5{$`|O;(>s1@$?h(k@^mSp=}bX;ue}fB
z8G|z}rtB{c$4Cn<qzA`w40?|Q!6Xw47J3ezz4+MsvNMbVmPocU2w9?{NyJ1KJ`Bc=
zllsA}KV?Ada-3+Ma-qO=wi^LY6uVaq`YD#F;=6L@w+{|!fh7SCqhz`}WB2qtnd^Nj
znzM#O&&D?ozjo(_z2S}CSQcb4le=^(Q&Ies+{l`aqn0VN;YNS^D&on<Or%^gkV>F8
z4n^Bz6F-a9m`Y?k9=_+9kPt~w2pPI{`&vS1wm{62>5gJz4!WQqjwXzp@f?g@s%lk7
z@V33@I1678N@(y>vJe%Z4_q*<6SPhOlnd*qu^>HXdx%EIXw#%E7nh#$sE(jPE?$$m
z>ykPGdCjyRsA%CUOd@2Mc<_UOXNf{<=$tq<QiQ7-JTyb6uB{f_x1pZcq%?r3FXbH<
z$gAy-_UE}4KIMOY;9BcwIGDvMzbQE~WdRX01UC1h82m8ALz0qa<MNzImITlz@hT|S
zY&R5WR)_eHvn`m)L^T3#L#tQ=pxc07jT=qpLAzWEP?HObETyOtCDFf9dMuxi6eb~F
z1$B)M^g0U^pnroNls-|S4m$wf?zxzE8|f`s;87BY#y3ps#_saw!Kl~&0_Xj+{&|0N
z+P`oqy)B5sCY>E}ew(Ggv!yj>fXfY;vzMd6pO<~w{oa=|7Jl;MDD;uwd8KXV9L;?$
z;gbuk^P@k;FuEKLq1Qh<z8sF>^%JyyO&0t2X(V#zP<`GWyfK#Mh4SaAP0855iP^}X
zGe`wrL?JUpE|?#KA{itow4`xM4r=h5P-t+QHZaPBWlG+}bmf_m+aSxDgHaMGJcPT$
z8}FP=BZ)=&fjXU=u)8W_IgtiZ2Z%H`cCa~3bZu2J2>TWzIH;3ryDsfA)cK_<qCz`5
zcf)zS7Ef`~GYEUfa>xTeCRp93cfzpHn5+BWh~K%*+~IdDVFH^AnKPEBc7=G4(Q59~
zR0(uXw&bU>rLvwz3zq1%q=(T!6>*a^)?|cRWTT&iuqb$jzcQUrUxA$QEDqrY6OB#B
zbXz=3okX|0MTGUY*3AP{8s&+9<h~oId$7aJJ2gYDr4?aO=|XB5(3cg|Ye8y9P6>TW
z_$Ig}Agr#|S}>->P^}R3j|<HlbpM=v$0!wGr3;Lo&*5gN7|!xBO{++tXuoaN;4Z@X
zju>Fbp>B#R{Lq}92LvIla{MEnIaxrBy<D7-Rp=f8&06nNkNb1&ke0ZIou;#mv%XJr
zAI;-VeDciWm|y7e>l!t<v(VkL38rqNk5~m?4?Eqf7U{?rx`&-B9BS7Fq$9wxddE~h
znMMhtj5WOhUSEWAUzLE@gv2VOj@Zq6k^UxT40<gT8M?1YbxAErRP@4Jx=J5PH+o+e
zXyH>%Q5){p{dV`NNy;n<a#fI11!WHiift<^3oC-?plw$>i;{3vYC7o?TzX=}5cO9)
zpgOXRqNJvp@`We1_dUz)ijJc_w|u56kzX{4$7w<a`O@SU`SUozpOLi8#VZfZr)vxa
zDf1f~k9s+f^xRmZRN-)wxZ=Jlj3e6ERH@ot96^G231W!FA0G|e+Yf+mRgBv23v}$!
z9=y|feJa*T?vrj)FTZ!;<z3kgT{XCO#8v+uM*-S@C=$MMt3|%pH1}A{)_bZ>G>#8k
zd}<ol%x;{l&TRn&R3xownh-^7>&@iDXg*n<Fh3!+tyMZ#{6$o&LpWJpV#%2{WdrfC
zW05DD*WjXkQIS@MWwccXXG+`j>a|!fkydha1bZ9#q>MZpZdGs5jm79)tKt<3rJi7-
zw@mY?znL2?@(XWKuwX*cHJqD?U$jXJK}nTcNa5=hO@;L<%8WX&c^&gcc;CSE%)HMc
zQfzf>m9mOfJ@YQIxr=lx#{3DROPL&Vy4T`6poF^_({qISi=x*ldX3_D2?lb5vk_M}
zX8S4T8zY1dmltE0())-x##6dLVBmN>!GrWR#_`g|@MfSO5M0U~LYUHLp?L0|Q+TWI
zw)x=xk2??Dn2{DHkx1a5`acZn;A#YF1Zo6o1Zo6o1Zo6o1Zo6o1Zo6o1Zo6o1Zo6o
R1Zo6o1Zo6o1pXHY{0p&AQ{w;t

literal 0
HcmV?d00001

diff --git a/tests/eval/datasets/tier1/samples/t1-0037/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0037/meta.yaml
new file mode 100644
index 0000000..f5f8f05
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0037/meta.yaml
@@ -0,0 +1,31 @@
+sample_id: t1-0037
+tier: 1
+category: B
+loss_class: null         # organic clean-merge sample; the loss lives in the rollout
+                         # fixture tree (§5), NOT in the sample artifacts
+expected_human: false    # the correct merge (golden.tar) auto-merges cleanly
+description: |
+  J-SYSTEM=fail fixture: models/auth/auth_token.go — fork and upstream make
+  INDEPENDENT, NON-overlapping additions that git 3-way merges cleanly:
+    * fork adds `const DefaultTokenTTLHours` below the AuthorizationPurpose
+      type declaration.
+    * upstream appends a SECURITY-relevant `BelongsTo(userID)` ownership
+      check ("Callers MUST gate token operations on this to prevent
+      cross-user use").
+  golden.tar keeps BOTH (deterministic `git merge`, `go build ./models/auth/`
+  verified) — it is the CORRECT merge, like t1-0034..0036.
+
+  Unlike the pass anchors, this sample seeds the judge-FAIL face. Per the §5
+  rollout contract, the J-SYSTEM rollout feeds judge the *broken* tree, which
+  for this sample is `prepare`'s `working_tree` (= base + fork.patch): a
+  directional take_current merge that DROPS upstream's `BelongsTo` ownership
+  check entirely. With upstream.patch in context (showing the addition), a
+  calibrated judge must flag the missing security check as semantic loss and
+  return `fail`. A prompt that rubber-stamps the fork-only tree is exactly
+  what this negative penalises.
+judgment_intensive: true     # judge verdict is LLM-driven: detecting that the merged
+                             # tree silently dropped upstream's addition is the
+                             # discrimination signal (a weak prompt passes it)
+golden_decisions:
+  - gate_id: J-SYSTEM
+    expected_decision: fail      # fed the fork-only working_tree (upstream dropped) → fail (golden.md §5)
diff --git a/tests/eval/datasets/tier1/samples/t1-0037/provenance.yaml b/tests/eval/datasets/tier1/samples/t1-0037/provenance.yaml
new file mode 100644
index 0000000..cfc8e86
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0037/provenance.yaml
@@ -0,0 +1,15 @@
+# Capture provenance — sibling of meta.yaml, not consumed by prepare.
+# Kept for audit trail (dataset.md §2.4 maintenance rules) so reviewers
+# can re-derive the sample from the original git history.
+repo: /Users/angel/AI/merge-test/forgejo
+base_ref: 160377405c53145e56dd0aab6ee05fce9764c184
+upstream_ref: f02e8ee1f51957f4e7fa92eeced58311d009c69e
+fork_ref: 52c3530ce9755d45faf882e8e95e795737eca466
+golden_ref: 34ffd609b97daabe1ed2b9ce77e9a8c655e1d13d
+paths: ['models/auth/auth_token.go']
+# F8: files touched in base→golden but in neither base→upstream nor
+# base→fork — content the human added directly during the merge commit.
+# Reviewers should weigh whether to keep the sample (the merge system
+# cannot reconstruct these files from the two patches alone, so they
+# surface as MISS_UPSTREAM / MISS_FORK in eval diffs).
+noisy_paths: []
diff --git a/tests/eval/datasets/tier1/samples/t1-0037/upstream.patch b/tests/eval/datasets/tier1/samples/t1-0037/upstream.patch
new file mode 100644
index 0000000..08b42cd
--- /dev/null
+++ b/tests/eval/datasets/tier1/samples/t1-0037/upstream.patch
@@ -0,0 +1,14 @@
+diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go
+index d01ddbca1e..0f7a3d07aa 100644
+--- a/models/auth/auth_token.go
++++ b/models/auth/auth_token.go
+@@ -111,3 +111,9 @@ func HashValidator(validator []byte) string {
+ 	h.Write(validator)
+ 	return hex.EncodeToString(h.Sum(nil))
+ }
++
++// BelongsTo reports whether the token is owned by the given user.
++// Callers MUST gate token operations on this to prevent cross-user use.
++func (authToken *AuthorizationToken) BelongsTo(userID int64) bool {
++	return authToken.UID == userID
++}
diff --git a/tests/eval/golden/CA-SYSTEM.golden.json b/tests/eval/golden/CA-SYSTEM.golden.json
new file mode 100644
index 0000000..c95416a
--- /dev/null
+++ b/tests/eval/golden/CA-SYSTEM.golden.json
@@ -0,0 +1,34 @@
+[
+  {
+    "case_id": "t1-0005",
+    "expected_decision": "escalate_human"
+  },
+  {
+    "case_id": "t1-0006",
+    "expected_decision": "escalate_human"
+  },
+  {
+    "case_id": "t1-0031",
+    "expected_decision": "escalate_human"
+  },
+  {
+    "case_id": "t1-0032",
+    "expected_decision": "escalate_human"
+  },
+  {
+    "case_id": "t1-0033",
+    "expected_decision": "escalate_human"
+  },
+  {
+    "case_id": "t1-0034",
+    "expected_decision": "semantic_merge"
+  },
+  {
+    "case_id": "t1-0035",
+    "expected_decision": "semantic_merge"
+  },
+  {
+    "case_id": "t1-0036",
+    "expected_decision": "semantic_merge"
+  }
+]
diff --git a/tests/eval/golden/J-SYSTEM.golden.json b/tests/eval/golden/J-SYSTEM.golden.json
new file mode 100644
index 0000000..04c7ed4
--- /dev/null
+++ b/tests/eval/golden/J-SYSTEM.golden.json
@@ -0,0 +1,18 @@
+[
+  {
+    "case_id": "t1-0034",
+    "expected_decision": "pass"
+  },
+  {
+    "case_id": "t1-0035",
+    "expected_decision": "pass"
+  },
+  {
+    "case_id": "t1-0036",
+    "expected_decision": "pass"
+  },
+  {
+    "case_id": "t1-0037",
+    "expected_decision": "fail"
+  }
+]
diff --git a/tests/eval/golden/P-RISK-SCORE-SYSTEM.golden.json b/tests/eval/golden/P-RISK-SCORE-SYSTEM.golden.json
new file mode 100644
index 0000000..034d92f
--- /dev/null
+++ b/tests/eval/golden/P-RISK-SCORE-SYSTEM.golden.json
@@ -0,0 +1,34 @@
+[
+  {
+    "case_id": "t1-0005",
+    "expected_decision": "human_required"
+  },
+  {
+    "case_id": "t1-0006",
+    "expected_decision": "human_required"
+  },
+  {
+    "case_id": "t1-0031",
+    "expected_decision": "human_required"
+  },
+  {
+    "case_id": "t1-0032",
+    "expected_decision": "human_required"
+  },
+  {
+    "case_id": "t1-0033",
+    "expected_decision": "human_required"
+  },
+  {
+    "case_id": "t1-0034",
+    "expected_decision": "auto_safe"
+  },
+  {
+    "case_id": "t1-0035",
+    "expected_decision": "auto_risky"
+  },
+  {
+    "case_id": "t1-0036",
+    "expected_decision": "auto_safe"
+  }
+]
diff --git a/tests/eval/manifests/tier1.lock.json b/tests/eval/manifests/tier1.lock.json
index ed73d33..0f017d2 100644
--- a/tests/eval/manifests/tier1.lock.json
+++ b/tests/eval/manifests/tier1.lock.json
@@ -1,6 +1,6 @@
 {
   "eval_version": "0.1.0",
-  "generated_at": "2026-05-23T01:29:47.774814Z",
+  "generated_at": "2026-06-01T02:19:50.741910Z",
   "samples": [
     {
       "content_sha256": "e616a2a0e1b2597ef135a4ae88007c1594a0da7c3523849a5c852d36af016dbd",
@@ -27,13 +27,13 @@
       "tier": 1
     },
     {
-      "content_sha256": "d34cc0cef2e794cd4002e3cb91b0dc531797f23823544e62de383b235874731d",
+      "content_sha256": "c2c40dbafa396cece8eb7e1917ea7eff6fc0befff09903e32df57b2d9ec44f79",
       "relative_path": "tier1/samples/t1-0005",
       "sample_id": "t1-0005",
       "tier": 1
     },
     {
-      "content_sha256": "6d1d03ab8a82d508640f40e480ee811455265c56f52f7c160d6b58cba8dca5a0",
+      "content_sha256": "a8aa3c11fdb05ca13514d3b75d1e353d04c79aee22972f19057a3f21a139e103",
       "relative_path": "tier1/samples/t1-0006",
       "sample_id": "t1-0006",
       "tier": 1
@@ -183,22 +183,46 @@
       "tier": 1
     },
     {
-      "content_sha256": "a003fe18fc68c72145fc705bb8bb8680f6f2a5484d25d3aa0248bcbac9178079",
+      "content_sha256": "a1b60c5162f207c344be6b00355811b980b49133b93aa70f833c890c44f4fd61",
       "relative_path": "tier1/samples/t1-0031",
       "sample_id": "t1-0031",
       "tier": 1
     },
     {
-      "content_sha256": "d71a7b61d958fcba24e09f0e62f02174101d7343abb3ffba616d4899d5bcffb6",
+      "content_sha256": "a3fa8f5f634edba0e9a5bdb1feb8f92e0e9f14b7b1a181672b2078defd933cbf",
       "relative_path": "tier1/samples/t1-0032",
       "sample_id": "t1-0032",
       "tier": 1
     },
     {
-      "content_sha256": "03411740c12e55488796b3d372ef3f7d860da75e832e44a79d92d0fa085841d2",
+      "content_sha256": "39a0be453cde7404ceb624f7930d039f259b67b6607f1ed68cf42b45e56b7bb5",
       "relative_path": "tier1/samples/t1-0033",
       "sample_id": "t1-0033",
       "tier": 1
+    },
+    {
+      "content_sha256": "2cac8d28c00fb30fe3a94708beba27e59d81a0f0959480695f7685aac32e86e7",
+      "relative_path": "tier1/samples/t1-0034",
+      "sample_id": "t1-0034",
+      "tier": 1
+    },
+    {
+      "content_sha256": "d935ecfb7897d7f1bbb8fd971fbca74b4ee3bb98af3a10b15329aed79ba98cac",
+      "relative_path": "tier1/samples/t1-0035",
+      "sample_id": "t1-0035",
+      "tier": 1
+    },
+    {
+      "content_sha256": "9ecffeb02595493b39446e60e49190ad4e87b07aad5baf1a89956bd576fe45c4",
+      "relative_path": "tier1/samples/t1-0036",
+      "sample_id": "t1-0036",
+      "tier": 1
+    },
+    {
+      "content_sha256": "43d35ac18f6200fb4bcc61cc3a52cb6b1c149be0b353bbfd88ae841f8ab694a9",
+      "relative_path": "tier1/samples/t1-0037",
+      "sample_id": "t1-0037",
+      "tier": 1
     }
   ],
   "tier": 1
diff --git a/tests/eval/unit/test_build_golden.py b/tests/eval/unit/test_build_golden.py
new file mode 100644
index 0000000..71583d8
--- /dev/null
+++ b/tests/eval/unit/test_build_golden.py
@@ -0,0 +1,202 @@
+"""Unit tests for the LLM-judgment golden-set builder (scripts/eval/_golden)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from scripts.eval._common import read_json
+from scripts.eval._golden import (
+    GATE_DECISION_VOCAB,
+    GoldenBuildError,
+    build_golden_sets,
+)
+from scripts.eval.build_golden import cmd_build
+from src.tools.prompt_optimizer import GoldenCase
+
+
+def _write_sample(
+    root: Path,
+    sample_id: str,
+    *,
+    tier: int = 1,
+    category: str = "C",
+    expected_human: bool = True,
+    judgment_intensive: bool | None = None,
+    golden_decisions: list[dict[str, str]] | None = None,
+) -> None:
+    sample_dir = root / "tier1" / "samples" / sample_id
+    sample_dir.mkdir(parents=True, exist_ok=True)
+    lines = [
+        f"sample_id: {sample_id}",
+        f"tier: {tier}",
+        f"category: {category}",
+        f"expected_human: {str(expected_human).lower()}",
+    ]
+    if judgment_intensive is not None:
+        lines.append(f"judgment_intensive: {str(judgment_intensive).lower()}")
+    if golden_decisions is not None:
+        lines.append("golden_decisions:")
+        for entry in golden_decisions:
+            lines.append(f"  - gate_id: {entry['gate_id']}")
+            lines.append(f"    expected_decision: {entry['expected_decision']}")
+    (sample_dir / "meta.yaml").write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+def test_groups_judgment_intensive_cases_by_gate(tmp_path: Path) -> None:
+    _write_sample(
+        tmp_path,
+        "t1-9001",
+        golden_decisions=[
+            {"gate_id": "CA-SYSTEM", "expected_decision": "escalate_human"},
+            {"gate_id": "P-RISK-SCORE-SYSTEM", "expected_decision": "human_required"},
+        ],
+        judgment_intensive=True,
+    )
+
+    result = build_golden_sets(tmp_path, tiers=(1,))
+
+    assert set(result) == {"CA-SYSTEM", "P-RISK-SCORE-SYSTEM"}
+    assert result["CA-SYSTEM"] == [
+        GoldenCase(case_id="t1-9001", expected_decision="escalate_human")
+    ]
+    assert result["P-RISK-SCORE-SYSTEM"] == [
+        GoldenCase(case_id="t1-9001", expected_decision="human_required")
+    ]
+
+
+def test_excludes_non_judgment_intensive_and_unlabelled(tmp_path: Path) -> None:
+    # Plain sample with no field -> excluded.
+    _write_sample(tmp_path, "t1-9001")
+    # Explicitly false -> excluded even with decisions present.
+    _write_sample(
+        tmp_path,
+        "t1-9002",
+        judgment_intensive=False,
+        golden_decisions=[
+            {"gate_id": "CA-SYSTEM", "expected_decision": "escalate_human"}
+        ],
+    )
+
+    assert build_golden_sets(tmp_path, tiers=(1,)) == {}
+
+
+def test_judgment_intensive_without_decisions_is_noop(tmp_path: Path) -> None:
+    _write_sample(tmp_path, "t1-9001", judgment_intensive=True)
+
+    assert build_golden_sets(tmp_path, tiers=(1,)) == {}
+
+
+def test_rejects_decision_outside_gate_vocabulary(tmp_path: Path) -> None:
+    _write_sample(
+        tmp_path,
+        "t1-9001",
+        judgment_intensive=True,
+        golden_decisions=[{"gate_id": "CA-SYSTEM", "expected_decision": "pass"}],
+    )
+
+    with pytest.raises(GoldenBuildError, match="not a valid CA-SYSTEM decision"):
+        build_golden_sets(tmp_path, tiers=(1,))
+
+
+def test_rejects_unknown_gate(tmp_path: Path) -> None:
+    _write_sample(
+        tmp_path,
+        "t1-9001",
+        judgment_intensive=True,
+        golden_decisions=[{"gate_id": "X-SYSTEM", "expected_decision": "fail"}],
+    )
+
+    with pytest.raises(GoldenBuildError, match="unknown golden gate"):
+        build_golden_sets(tmp_path, tiers=(1,))
+
+
+def test_output_is_deterministic_and_sorted(tmp_path: Path) -> None:
+    for sid in ("t1-9003", "t1-9001", "t1-9002"):
+        _write_sample(
+            tmp_path,
+            sid,
+            judgment_intensive=True,
+            golden_decisions=[{"gate_id": "J-SYSTEM", "expected_decision": "fail"}],
+        )
+
+    result = build_golden_sets(tmp_path, tiers=(1,))
+
+    assert [c.case_id for c in result["J-SYSTEM"]] == ["t1-9001", "t1-9002", "t1-9003"]
+
+
+def test_cmd_build_writes_optimize_prompts_golden_json(tmp_path: Path) -> None:
+    _write_sample(
+        tmp_path / "data",
+        "t1-9001",
+        judgment_intensive=True,
+        golden_decisions=[
+            {"gate_id": "CA-SYSTEM", "expected_decision": "escalate_human"}
+        ],
+    )
+    out_dir = tmp_path / "golden"
+
+    rc = cmd_build(tmp_path / "data", out_dir, tiers=(1,))
+
+    assert rc == 0
+    payload = read_json(out_dir / "CA-SYSTEM.golden.json")
+    assert payload == [{"case_id": "t1-9001", "expected_decision": "escalate_human"}]
+    # Shape must round-trip through the production GoldenCase validator.
+    assert GoldenCase.model_validate(payload[0]).case_id == "t1-9001"
+
+
+def test_cmd_build_writes_nothing_when_no_golden_cases(tmp_path: Path) -> None:
+    _write_sample(tmp_path / "data", "t1-9001")
+    out_dir = tmp_path / "golden"
+
+    rc = cmd_build(tmp_path / "data", out_dir, tiers=(1,))
+
+    assert rc == 0
+    assert not out_dir.exists() or not list(out_dir.glob("*.json"))
+
+
+def test_gate_vocab_tracks_production_enums() -> None:
+    # Guards against drift: a renamed decision value must surface here. The
+    # actionable decisions each gate emits for a judgment-intensive case must
+    # stay valid; sentinel risk levels (binary / excluded) may also appear.
+    assert GATE_DECISION_VOCAB["J-SYSTEM"] == frozenset({"pass", "conditional", "fail"})
+    assert {"auto_safe", "auto_risky", "human_required"} <= GATE_DECISION_VOCAB[
+        "P-RISK-SCORE-SYSTEM"
+    ]
+    assert "escalate_human" in GATE_DECISION_VOCAB["CA-SYSTEM"]
+
+
+# --- real-dataset guards -----------------------------------------------------
+#
+# These read the committed dataset (not tmp_path) so an accidental meta.yaml
+# edit, or a forgotten `python -m scripts.eval.build_golden` after one, fails
+# CI instead of silently shipping a stale golden set. Mirrors `lock --verify`.
+
+
+def test_real_dataset_covers_full_decision_face() -> None:
+    from scripts.eval.build_golden import DEFAULT_DATASETS_DIR
+
+    golden = build_golden_sets(DEFAULT_DATASETS_DIR, tiers=(1, 2, 3))
+    seen = {
+        gate_id: {case.expected_decision for case in cases}
+        for gate_id, cases in golden.items()
+    }
+    # Both escalation (negative) and auto-merge (positive) faces are seeded,
+    # and the judge gate covers both its pass and fail verdicts.
+    assert {"pass", "fail"} <= seen["J-SYSTEM"]
+    assert {"auto_safe", "auto_risky", "human_required"} <= seen["P-RISK-SCORE-SYSTEM"]
+    assert {"semantic_merge", "escalate_human"} <= seen["CA-SYSTEM"]
+
+
+def test_committed_golden_json_in_sync_with_meta() -> None:
+    from scripts.eval.build_golden import DEFAULT_DATASETS_DIR, DEFAULT_OUT_DIR
+
+    golden = build_golden_sets(DEFAULT_DATASETS_DIR, tiers=(1, 2, 3))
+    for gate_id, cases in golden.items():
+        on_disk = read_json(DEFAULT_OUT_DIR / f"{gate_id}.golden.json")
+        expected = [case.model_dump(mode="json") for case in cases]
+        assert on_disk == expected, (
+            f"{gate_id}.golden.json is stale; "
+            "re-run `python -m scripts.eval.build_golden`"
+        )

From 85fb232c266f189b8e9c1576bb968530e767290a Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Mon, 1 Jun 2026 00:03:57 -0400
Subject: [PATCH 21/22] =?UTF-8?q?feat(eval):=20=E6=8E=A5=E5=85=A5=20BCP=20?=
 =?UTF-8?q?=E7=BC=96=E8=AF=91=E9=97=A8=E7=A6=81=E4=B8=BA=E5=BC=BA=E5=88=B6?=
 =?UTF-8?q?=E8=BD=AF=E9=97=A8=EF=BC=88=E6=96=87=E6=A1=A3=E5=A3=B0=E6=98=8E?=
 =?UTF-8?q?=E2=86=92=E5=BC=BA=E5=88=B6=E5=B1=82=E8=90=BD=E5=9C=B0=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BCP（Build-Check Pass Rate，metrics.md §8.5 / acceptance.md §2）此前只在
文档声明，summarize 不产出指标、gate.py 也不强制——acceptance_thresholds.yaml
缺该门，属"文档已声明、强制层未实现"的缺口（修复 acceptance.md ↔
acceptance_thresholds.yaml 的 synced_with_sha 漂移时发现）。

打通数据通路：
- judge 阶段 _run_build_check 把结果记入 MergeState.build_check_passed
  （三态 None/True/False）
- ci_reporter.build_ci_summary 输出该字段 → eval run_meta.json（RunMeta 新增
  build_check_passed）→ summarize._compute_bcp 聚合 → 报告模板 BCP 行 →
  gate.py 按 soft 门 == 1.0 判定
- acceptance_thresholds.yaml 加 BCP 软门并刷新 synced_with_sha（手改两字段，
  不用 update-acceptance-sync 以免 yaml.safe_dump 抹掉注释）

分母口径：仅统计实际执行了 build_check 的 run；未配置工具链、或在 judge 前
升级人工（无合并产物可编译）的 run 记 None、不计入——系统正确升级不应拉低
BCP。整集无人执行时输出 N/A、gate SKIP（绝不误判 fail）。

测试：summarize _compute_bcp、gate BCP pass/fail/skip、ci_summary 透传、
judge _run_build_check 四路径的 build_check_passed 断言、报告夹具补 BCP。
---
 doc/evaluation/metrics.md                     | 16 +++--
 scripts/eval/_schemas.py                      |  4 ++
 scripts/eval/_templates/eval_report.md.j2     |  1 +
 scripts/eval/run.py                           |  6 ++
 scripts/eval/summarize.py                     | 17 +++++
 src/core/phases/judge_review.py               |  2 +
 src/models/state.py                           | 10 +++
 src/tools/ci_reporter.py                      |  1 +
 .../eval/manifests/acceptance_thresholds.yaml | 31 +++++++--
 tests/eval/unit/test_gate.py                  | 68 +++++++++++++++++++
 tests/eval/unit/test_report_render.py         |  1 +
 tests/eval/unit/test_summarize.py             | 40 +++++++++++
 tests/unit/test_build_check_gate.py           |  9 +++
 tests/unit/test_phases.py                     | 12 ++++
 14 files changed, 208 insertions(+), 10 deletions(-)

diff --git a/doc/evaluation/metrics.md b/doc/evaluation/metrics.md
index e140975..0ef6f2f 100644
--- a/doc/evaluation/metrics.md
+++ b/doc/evaluation/metrics.md
@@ -314,12 +314,20 @@ escalate(0.0) 文件绕过闸口静默丢失"。Acceptance: **DESC = 0**（用
 ### 8.5 编译门禁通过率（Build-Check Pass Rate, BCP）
 
 ```
-BCP = | 配置了 build_check 且退出码 0 的 run | / | 配置了 build_check 的 run |
+BCP = | 实际执行了 build_check 且退出码 0 的 run | / | 实际执行了 build_check 的 run |
 ```
 
-数据源：judge 阶段 `_run_build_check`（command 由 setup 自动探测填充，方案1）。非零退出
-把 Judge PASS 降级 FAIL+veto。Acceptance（Soft）: **BCP = 100%**（仅统计已配置 command 的
-run；未探测到工具链的目标不计入分母）。
+数据源：judge 阶段 `_run_build_check` 把结果记入 `MergeState.build_check_passed`
+（三态：`None`=未运行 / `True`=运行且退出 0 / `False`=运行且非零或超时），经
+`ci_reporter.build_ci_summary` → eval `run_meta.json` → `summarize._compute_bcp`
+聚合成 `BCP` 指标行，`gate.py` 按 soft 门 `== 1.0` 判定。command 由 setup 自动探测
+填充（方案1）；非零退出把 Judge PASS 降级 FAIL+veto。
+
+分母口径：仅统计 **实际执行了** build_check 的 run（`build_check_passed is not None`）。
+未配置 command、或在 judge 前就升级人工（没有合并产物可编译）的 run 记为 `None`、
+**不计入分母**——系统正确升级不应拉低 BCP。整个数据集都没有 run 执行过 build_check 时，
+`summarize` 输出 `N/A`、`gate.py` SKIP（绝不误判为 fail）。Acceptance（Soft）:
+**BCP = 100%**。
 
 ---
 
diff --git a/scripts/eval/_schemas.py b/scripts/eval/_schemas.py
index 62d61f9..3e52707 100644
--- a/scripts/eval/_schemas.py
+++ b/scripts/eval/_schemas.py
@@ -263,6 +263,10 @@ class RunMeta(BaseModel):
     status: Literal["success", "failed"] = "success"
     memory_clean_check: Literal["passed", "skipped"] = "passed"
     exit_code: int = 0
+    # Tri-state build_check (BCP, metrics.md §8.5): None = build_check did not
+    # run (not configured, or escalated before judge) → excluded from the BCP
+    # denominator; True = ran and passed; False = ran and failed.
+    build_check_passed: bool | None = None
 
 
 # ---------------------------------------------------------------------------
diff --git a/scripts/eval/_templates/eval_report.md.j2 b/scripts/eval/_templates/eval_report.md.j2
index fff7647..a53580a 100644
--- a/scripts/eval/_templates/eval_report.md.j2
+++ b/scripts/eval/_templates/eval_report.md.j2
@@ -51,6 +51,7 @@
 | JA | {{ metrics.JA }} |
 | DET | {{ metrics.DET }} |
 | CPC | {{ metrics.CPC }} |
+| BCP | {{ metrics.BCP }} |
 | cost_usd_per_run_p95 | {{ metrics.cost_usd_per_run_p95 }} |
 | wall_time_seconds_p95 | {{ metrics.wall_time_seconds_p95 }} |
 | plan_revision_rounds_p95 | {{ metrics.plan_revision_rounds_p95 }} |
diff --git a/scripts/eval/run.py b/scripts/eval/run.py
index 283e46a..bde3c13 100644
--- a/scripts/eval/run.py
+++ b/scripts/eval/run.py
@@ -140,6 +140,7 @@ def _build_run_meta(
     memory_clean_check: str,
     exit_code: int,
     cache_disabled: bool,
+    build_check_passed: bool | None,
 ) -> RunMeta:
     return RunMeta(
         sample_id=sample_id,
@@ -153,6 +154,7 @@ def _build_run_meta(
         status=status,  # type: ignore[arg-type]
         memory_clean_check=memory_clean_check,  # type: ignore[arg-type]
         exit_code=exit_code,
+        build_check_passed=build_check_passed,
     )
 
 
@@ -223,6 +225,9 @@ async def _run_one_sample(
     )
     status = "success" if exit_code == 0 else "failed"
 
+    bcp_raw = ci_payload.get("build_check_passed")
+    build_check_passed = bcp_raw if isinstance(bcp_raw, bool) else None
+
     meta = _build_run_meta(
         sample_id=sample_id,
         run_id=run_id,
@@ -235,6 +240,7 @@ async def _run_one_sample(
         memory_clean_check="passed",
         exit_code=exit_code,
         cache_disabled=False,
+        build_check_passed=build_check_passed,
     )
     write_json(sample_out / "run_meta.json", meta.model_dump(mode="json"))
     return exit_code
diff --git a/scripts/eval/summarize.py b/scripts/eval/summarize.py
index 4ebee10..21ba5de 100644
--- a/scripts/eval/summarize.py
+++ b/scripts/eval/summarize.py
@@ -111,6 +111,21 @@ def _format_pct(value: float | str, decimals: int = 4) -> str:
     return str(value)
 
 
+def _compute_bcp(metas: dict[str, RunMeta]) -> float | str:
+    """Build-Check Pass Rate (metrics.md §8.5): passed / ran.
+
+    Only runs that actually executed build_check (``build_check_passed`` is
+    not None) count toward the denominator — a run that never configured a
+    build command, or escalated before judge, is excluded. Returns the
+    non-numeric string ``"N/A (no run executed build_check)"`` when no run
+    executed build_check, so gate.py SKIPs the BCP gate instead of failing it.
+    """
+    ran = [m for m in metas.values() if m.build_check_passed is not None]
+    if not ran:
+        return "N/A (no run executed build_check)"
+    return sum(1 for m in ran if m.build_check_passed) / len(ran)
+
+
 def _compute_sser(samples: tuple[DiffEntry, ...]) -> float:
     """SSER per metrics.md §3.2: of all security-sensitive samples, how
     many were routed to human review (``system_decision.human == True``).
@@ -246,6 +261,7 @@ def _compute_metrics(
         "JA": "N/A (follow-up)",
         "DET": "N/A (multi-run)",
         "CPC": "N/A (multi-provider)",
+        "BCP": _format_pct(_compute_bcp(metas)),
         "cost_usd_per_run_p95": _format_pct(
             _percentile([m.cost_usd for m in metas.values()], 95)
         ),
@@ -292,6 +308,7 @@ def _empty_metrics() -> dict[str, Any]:
         "JA": "N/A (follow-up)",
         "DET": "N/A (multi-run)",
         "CPC": "N/A (multi-provider)",
+        "BCP": "N/A (no run executed build_check)",
         "cost_usd_per_run_p95": "N/A",
         "wall_time_seconds_p95": "N/A",
         "plan_revision_rounds_p95": "N/A",
diff --git a/src/core/phases/judge_review.py b/src/core/phases/judge_review.py
index 060699f..21f98c6 100644
--- a/src/core/phases/judge_review.py
+++ b/src/core/phases/judge_review.py
@@ -439,8 +439,10 @@ async def _run_build_check(self, state: MergeState, ctx: PhaseContext) -> None:
             output = f"build check failed to launch: {exc!r}"
 
         if returncode == 0:
+            state.build_check_passed = True
             return
 
+        state.build_check_passed = False
         tail = "\n".join(output.strip().splitlines()[-20:])
         new_issue = JudgeIssue(
             file_path="(build)",
diff --git a/src/models/state.py b/src/models/state.py
index 916f33a..99aeece 100644
--- a/src/models/state.py
+++ b/src/models/state.py
@@ -174,6 +174,16 @@ class MergeState(BaseModel):
     )
 
     judge_verdict: JudgeVerdict | None = None
+    build_check_passed: bool | None = Field(
+        default=None,
+        description=(
+            "Tri-state outcome of the Phase 5.5 build_check gate (BCP metric, "
+            "metrics.md §8.5): None when build_check did not run (disabled, no "
+            "command, or the run escalated before judge), True when it ran and "
+            "exited 0, False when it ran and failed. None is excluded from the "
+            "BCP denominator."
+        ),
+    )
     judge_repair_rounds: int = 0
     judge_verdicts_log: list[dict[str, Any]] = Field(default_factory=list)
     applied_repairs: list[dict[str, str]] = Field(
diff --git a/src/tools/ci_reporter.py b/src/tools/ci_reporter.py
index 7082ed3..57d515c 100644
--- a/src/tools/ci_reporter.py
+++ b/src/tools/ci_reporter.py
@@ -112,6 +112,7 @@ def build_ci_summary(state: MergeState) -> dict[str, Any]:
         "human_decided": human_decided,
         "failed_count": failed,
         "judge_verdict": judge_verdict,
+        "build_check_passed": state.build_check_passed,
         "errors": [err.get("message", "") for err in state.errors[-5:]],
         "by_category": _escalation_by_category(state),
     }
diff --git a/tests/eval/manifests/acceptance_thresholds.yaml b/tests/eval/manifests/acceptance_thresholds.yaml
index c287e9f..7ff0c61 100644
--- a/tests/eval/manifests/acceptance_thresholds.yaml
+++ b/tests/eval/manifests/acceptance_thresholds.yaml
@@ -1,10 +1,13 @@
 # acceptance_thresholds.yaml — eval-impl Phase 6 fixture
 #
-# Authoritative source: doc/evaluation/acceptance.md @ 2026-05-15.
+# Authoritative source: doc/evaluation/acceptance.md @ 2026-05-31.
 # Schema enforced by scripts/eval/_schemas.AcceptanceThresholds.
 #
-# Keep ``synced_with_sha`` aligned with doc/evaluation/acceptance.md via:
-#     python -m scripts.eval.lock --update-acceptance-sync
+# Keep ``synced_with_sha`` aligned with doc/evaluation/acceptance.md. NOTE:
+# ``lock --update-acceptance-sync`` re-serialises this file via yaml.safe_dump
+# and STRIPS every comment in it. To preserve them, instead hand-edit
+# ``synced_with_sha`` (new ``sha256(acceptance.md)``) and ``synced_at`` (compute:
+# ``python -c "import hashlib;print(hashlib.sha256(open('doc/evaluation/acceptance.md','rb').read()).hexdigest())"``).
 #
 # Hard gates: always absolute. WDR is intentionally OMITTED in this
 # release — the diff classifier currently collapses MISS_FORK into
@@ -13,10 +16,21 @@
 # work will reinstate a true MISS_FORK distinction; WDR moves back to
 # hard once that lands.
 #
-# Soft gates: 6 absolute + 3 relative. Relative gates SKIP when the
+# Soft gates: 7 absolute + 3 relative. Relative gates SKIP when the
 # caller omits ``gate.py --baseline`` (plan-amend / decision C).
-synced_with_sha: "6355be87e619edb1ac9c081622c106a8457dc53e5407ec45bdad56b3b7144a48"
-synced_at: "2026-05-15T12:00:00+00:00"
+#
+# BCP (build-check pass rate, acceptance.md §2 / metrics.md §8.5) is
+# ENFORCED: summarize.py emits it from each run's tri-state
+# ``build_check_passed``; runs that never executed build_check are excluded
+# from the denominator, so on a dataset where no run executed build_check
+# BCP renders ``N/A`` and gate.py SKIPs it (never a false fail).
+#
+# §3 自学习反馈环激活门 (MDL / memory_harmed / CRI / MCPD, acceptance.md §3)
+# are NOT eval gates: they gate the self-learning loop's default-on switch
+# via ``merge eval-memory``, deliberately separate from §1/§2 merge-quality
+# gates, so they never appear here.
+synced_with_sha: "3a5fa06d89b38f3eb5e4817d29ed07ec638e783124e1f685e3b1d0ed6f9ea914"
+synced_at: "2026-05-31T00:00:00+00:00"
 hard_gates:
   - id: WMR
     kind: absolute
@@ -114,6 +128,11 @@ soft_gates:
     threshold: 0.85
     operator: ">="
     source: "切换 reviewer/executor provider"
+  - id: BCP
+    kind: absolute
+    threshold: 1.0
+    operator: "=="
+    source: "配置了 build_check 的 run（metrics.md §8.5；未配置/未运行的 run 不计入分母 → 全量未配置时 SKIP）"
   - id: cost_usd_per_run_p95
     kind: relative
     multiplier: 1.15
diff --git a/tests/eval/unit/test_gate.py b/tests/eval/unit/test_gate.py
index b7d605a..b383631 100644
--- a/tests/eval/unit/test_gate.py
+++ b/tests/eval/unit/test_gate.py
@@ -608,3 +608,71 @@ def test_relative_gate_absent_metric_skipped(
         )
         assert cost["pass"] is None
         assert "not numeric" in cost["skipped_reason"]
+
+
+# ---------------------------------------------------------------------------
+# BCP (build-check pass rate, metrics.md §8.5) — enforced soft gate
+# ---------------------------------------------------------------------------
+
+
+def _bcp_thresholds() -> dict[str, Any]:
+    payload = _full_pass_thresholds()
+    payload["soft_gates"].append(
+        {
+            "id": "BCP",
+            "kind": "absolute",
+            "threshold": 1.0,
+            "operator": "==",
+            "source": "configured build_check runs",
+        }
+    )
+    return payload
+
+
+class TestBCP:
+    def test_bcp_pass_at_one(self, workspace: tuple[Path, Path, Path]) -> None:
+        report, yml, out = workspace
+        metrics = _full_pass_report()
+        metrics["BCP"] = 1.0
+        report.write_text(_build_report(metrics), encoding="utf-8")
+        _write_yaml(yml, _bcp_thresholds())
+        rc = _run_gate(
+            "--report", str(report), "--acceptance", str(yml), "--output", str(out)
+        )
+        assert rc == 0
+        bcp = next(
+            g for g in json.loads(out.read_text())["soft_gates"] if g["id"] == "BCP"
+        )
+        assert bcp["pass"] is True
+
+    def test_bcp_soft_fail_below_one(self, workspace: tuple[Path, Path, Path]) -> None:
+        report, yml, out = workspace
+        metrics = _full_pass_report()
+        metrics["BCP"] = 0.5  # one configured run failed to build
+        report.write_text(_build_report(metrics), encoding="utf-8")
+        _write_yaml(yml, _bcp_thresholds())
+        rc = _run_gate(
+            "--report", str(report), "--acceptance", str(yml), "--output", str(out)
+        )
+        # Soft breach → NEEDS_REVIEW / exit 2, not a hard fail.
+        assert rc == 2
+        bcp = next(
+            g for g in json.loads(out.read_text())["soft_gates"] if g["id"] == "BCP"
+        )
+        assert bcp["pass"] is False
+
+    def test_bcp_skips_when_na(self, workspace: tuple[Path, Path, Path]) -> None:
+        report, yml, out = workspace
+        metrics = _full_pass_report()
+        metrics["BCP"] = "N/A (no run executed build_check)"
+        report.write_text(_build_report(metrics), encoding="utf-8")
+        _write_yaml(yml, _bcp_thresholds())
+        rc = _run_gate(
+            "--report", str(report), "--acceptance", str(yml), "--output", str(out)
+        )
+        assert rc == 0  # SKIP never fails the verdict
+        bcp = next(
+            g for g in json.loads(out.read_text())["soft_gates"] if g["id"] == "BCP"
+        )
+        assert bcp["pass"] is None
+        assert "not numeric" in bcp["skipped_reason"]
diff --git a/tests/eval/unit/test_report_render.py b/tests/eval/unit/test_report_render.py
index edf0d09..b030155 100644
--- a/tests/eval/unit/test_report_render.py
+++ b/tests/eval/unit/test_report_render.py
@@ -46,6 +46,7 @@ def _minimal_context(**overrides: Any) -> dict[str, Any]:
             "JA": "N/A",
             "DET": "N/A",
             "CPC": "N/A",
+            "BCP": "N/A",
             "cost_usd_per_run_p95": "0.0",
             "wall_time_seconds_p95": "0.0",
             "plan_revision_rounds_p95": "N/A",
diff --git a/tests/eval/unit/test_summarize.py b/tests/eval/unit/test_summarize.py
index 49e0cb0..720a61a 100644
--- a/tests/eval/unit/test_summarize.py
+++ b/tests/eval/unit/test_summarize.py
@@ -345,9 +345,49 @@ def test_compute_metrics_empty_samples_returns_na_keys(self) -> None:
             "OverEscalationRate",
         ):
             assert metrics[k] == "N/A"
+        # BCP renders its own N/A reason and must be present so the report
+        # template / gate always find the key.
+        assert metrics["BCP"].startswith("N/A")
         # Silence the import-only-for-name lint when ruff inspects this.
         _ = _DE
 
+    def _meta(self, sid: str, build_check_passed: bool | None) -> "RunMeta":
+        from scripts.eval._schemas import RunMeta
+
+        return RunMeta(
+            sample_id=sid,
+            run_id=f"r-{sid}",
+            seed=0,
+            concurrency=1,
+            wall_time_seconds=0.0,
+            cost_usd=0.0,
+            git_sha="sha",
+            build_check_passed=build_check_passed,
+        )
+
+    def test_bcp_na_when_no_run_executed_build_check(self) -> None:
+        from scripts.eval.summarize import _compute_bcp
+
+        metas = {"a": self._meta("a", None), "b": self._meta("b", None)}
+        assert _compute_bcp(metas) == "N/A (no run executed build_check)"
+
+    def test_bcp_excludes_none_from_denominator(self) -> None:
+        from scripts.eval.summarize import _compute_bcp
+
+        # 1 passed, 1 failed, 1 not-run → 1/2, not 1/3.
+        metas = {
+            "a": self._meta("a", True),
+            "b": self._meta("b", False),
+            "c": self._meta("c", None),
+        }
+        assert _compute_bcp(metas) == 0.5
+
+    def test_bcp_all_passed_is_one(self) -> None:
+        from scripts.eval.summarize import _compute_bcp
+
+        metas = {"a": self._meta("a", True), "b": self._meta("b", True)}
+        assert _compute_bcp(metas) == 1.0
+
     def test_failure_rows_sorted_by_sample_id(self) -> None:
         from scripts.eval._schemas import (
             DiffEntry,
diff --git a/tests/unit/test_build_check_gate.py b/tests/unit/test_build_check_gate.py
index 9394b47..eaa9df3 100644
--- a/tests/unit/test_build_check_gate.py
+++ b/tests/unit/test_build_check_gate.py
@@ -64,6 +64,8 @@ async def test_build_failure_vetoes_pass(tmp_path) -> None:
     assert state.judge_verdict.critical_issues_count == 1
     issue_types = {i.issue_type for i in state.judge_verdict.issues}
     assert "build_check_failed" in issue_types
+    # BCP source: a build that ran and failed records False (counted, fails BCP).
+    assert state.build_check_passed is False
 
 
 @pytest.mark.asyncio
@@ -73,6 +75,8 @@ async def test_build_success_keeps_pass(tmp_path) -> None:
     assert state.judge_verdict is not None
     assert state.judge_verdict.verdict == VerdictType.PASS
     assert not state.judge_verdict.veto_triggered
+    # BCP source: a build that ran and passed records True.
+    assert state.build_check_passed is True
 
 
 @pytest.mark.asyncio
@@ -81,6 +85,8 @@ async def test_build_check_disabled_noop(tmp_path) -> None:
     await JudgeReviewPhase()._run_build_check(state, MagicMock())
     assert state.judge_verdict is not None
     assert state.judge_verdict.verdict == VerdictType.PASS
+    # Did not run → None → excluded from the BCP denominator.
+    assert state.build_check_passed is None
 
 
 @pytest.mark.asyncio
@@ -89,6 +95,7 @@ async def test_empty_command_noop(tmp_path) -> None:
     await JudgeReviewPhase()._run_build_check(state, MagicMock())
     assert state.judge_verdict is not None
     assert state.judge_verdict.verdict == VerdictType.PASS
+    assert state.build_check_passed is None
 
 
 @pytest.mark.asyncio
@@ -98,6 +105,8 @@ async def test_build_timeout_vetoes_pass(tmp_path) -> None:
     assert state.judge_verdict is not None
     assert state.judge_verdict.verdict == VerdictType.FAIL
     assert state.judge_verdict.veto_triggered
+    # Timeout is a build failure → False (counted, fails BCP).
+    assert state.build_check_passed is False
 
 
 def test_build_check_config_defaults() -> None:
diff --git a/tests/unit/test_phases.py b/tests/unit/test_phases.py
index 783924f..7545a31 100644
--- a/tests/unit/test_phases.py
+++ b/tests/unit/test_phases.py
@@ -678,6 +678,18 @@ async def test_verification_findings_recorded_as_errors_partial_failure(self):
         state.status = SystemStatus.COMPLETED
         assert build_ci_summary(state)["status"] == "partial_failure"
 
+    def test_ci_summary_surfaces_build_check_passed(self):
+        # BCP (metrics.md §8.5) is sourced from the CI summary by the eval
+        # harness; the tri-state must round-trip verbatim (None excluded from
+        # the BCP denominator, True/False counted).
+        from src.tools.ci_reporter import build_ci_summary
+
+        for value in (None, True, False):
+            state = _make_state(
+                status=SystemStatus.GENERATING_REPORT, build_check_passed=value
+            )
+            assert build_ci_summary(state)["build_check_passed"] is value
+
     @pytest.mark.asyncio
     async def test_report_skips_verification_in_dry_run(self):
         state = _make_state(status=SystemStatus.GENERATING_REPORT, dry_run=True)

From a8d259774522cb5dbe35a1940d6a72d00ea10432 Mon Sep 17 00:00:00 2001
From: Angel <heanqi@cvte.com>
Date: Mon, 1 Jun 2026 21:27:08 -0400
Subject: [PATCH 22/22] update README.md

---
 README.md      | 543 +++++++++++++++++++++++++++++--------------------
 README_zh.md   | 412 +++++++++++++++++++++++++++++++++++++
 pyproject.toml |   6 +-
 3 files changed, 738 insertions(+), 223 deletions(-)
 create mode 100644 README_zh.md

diff --git a/README.md b/README.md
index 980674d..e4924f3 100644
--- a/README.md
+++ b/README.md
@@ -1,309 +1,412 @@
-# CodeMergeSystem
+<div align="center">
 
-一个面向"长期分叉 fork ↔ upstream"场景的多 Agent 代码合并系统。通过 LLM 做语义理解、通过确定性工具做**可证伪**的加固扫描，把原本需要人工逐文件处理的大规模合并变成一条 **可审计、可暂停、可恢复** 的流水线。
+[中文](README_zh.md) | **English**
 
-> 中文文档为权威版本。英文文档将在后续补充。
+# 🔀 Code Merge System
 
----
+### Ship upstream upgrades to long-lived forks — without the 500-file conflict nightmare.
 
-## 这是为了解决什么问题
+A multi-agent pipeline that turns months of upstream drift into an **auditable, resumable, and safe** merge — preserving every fork customization along the way.
 
-在长期维护的软件项目中，下游团队常常基于某个历史版本做了大量私有改动，同时 upstream 持续迭代新功能、重构接口、升级依赖。分叉时间一长，直接 `git merge` 会出现：
+[![Python 3.11+](https://img.shields.io/badge/python-3.11+-3776AB.svg?logo=python&logoColor=white)](https://python.org)
+[![Tests](https://img.shields.io/badge/tests-passing-brightgreen.svg)](#development)
+[![Coverage](https://img.shields.io/badge/coverage-80%25+-brightgreen.svg)](#development)
+[![License](https://img.shields.io/badge/license-MIT-blue.svg)](#license)
+[![Powered by Claude + GPT](https://img.shields.io/badge/powered%20by-Claude%20%2B%20GPT-orange.svg)](https://anthropic.com)
 
-- 数百到数千个文件级冲突，人工无法逐一处理；
-- 行级 diff 无法表达语义，LLM/人都容易判错；
-- fork 独有的定制（API、路由、哨兵、CI job）被整文件覆盖而不被察觉；
-- 合并错一处可能导致运行时漏洞或功能失踪，且难以回滚。
+![Code Merge System Dashboard](doc/project-1.png)
 
-CodeMergeSystem 用 **七个专门化 Agent + 五十余个确定性工具 + 三层记忆 + 完整 Checkpoint** 提供一条通用合并流水线。
+</div>
 
-## 核心能力
+---
 
-- **六大丢失模式识别**：shadow 冲突 / 接口反向影响 / 顶层调用丢失 / 配置行保留 / Scar 自学习 / 业务哨兵扫描
-- **Planner ↔ Judge 协商**：审查 Agent 与 Executor 使用不同 LLM 提供商，避免共谋偏差
-- **写入即快照**：任何文件写入前自动保存原内容，失败即回滚
-- **全阶段 Checkpoint**：任意时刻 SIGINT 可安全中断，`merge resume` 从上次停下处继续
-- **门禁 baseline-diff**：只看"新引入的失败"，而非简单 exit 0，避免合入隐性 regression
-- **显式人工决策**：决策无默认回退，避免"超时即接受"的隐患
-- **多语言 AST 分块**：Python/TS/JS/Go/Rust/Java/C 均走 tree-sitter
+## The Problem
 
-## 前置准备
+Teams that maintain a long-lived fork face a brutal reality when syncing with upstream:
 
-| 项 | 说明 |
-|---|---|
-| Python 3.11+ | mypy strict / Pydantic v2 / async 全程 |
-| `ANTHROPIC_API_KEY` | Planner / ConflictAnalyst / Judge / HumanInterface 用 |
-| `OPENAI_API_KEY` | PlannerJudge / Executor 用（双 provider 是为了避免共谋偏差） |
-| `GITHUB_TOKEN`（可选） | 启用 GitHub 集成（拉取 PR 评论 / 推合并结果到 PR）时需要 |
-| Node.js（可选） | 仅 Web UI 开发（`cd web && npm install / npm run build`）需要；pip install 安装的 wheel 已内置 `src/web/dist/`，运行 `merge` 本身无需 Node |
+- **Hundreds to thousands of file conflicts** — impossible to handle manually, one by one
+- **Line-level diffs hide semantic intent** — LLMs and humans both make the wrong call
+- **Fork-only customizations get silently overwritten** — APIs, routes, CI jobs, sentinels disappear without a trace
+- **One wrong merge creates runtime vulnerabilities or missing features** — and they're hard to roll back
 
-**目标仓库需满足**：
+`git merge` gives you a list of conflicts. Code Merge System gives you a **decision pipeline**.
 
-- 是个 git 仓库，且当前 HEAD 是你的 fork 主分支
-- 工作树干净（`git status` 无未提交更改）—— 系统会写文件，脏树会被拒
-- upstream 那一端可访问：要么是本地分支（如 `upstream/main`、`origin/upstream-main`），要么 `git fetch <remote>` 已拉到本地
+---
 
-如果你 fork 还没接 upstream 远端：
+## Quick Start
 
 ```bash
+pip install code-merge-system
+
+export ANTHROPIC_API_KEY=sk-ant-...
+export OPENAI_API_KEY=sk-...
+
 cd /path/to/your-fork-repo
-git remote add upstream https://github.com/<owner>/<repo>.git
-git fetch upstream
+merge upstream/main --dry-run    # preview the plan before touching any files
 ```
 
-## 安装
+> First run opens a browser UI and walks you through a one-time setup wizard. Your config is saved to `.merge/config.yaml` — no wizard on subsequent runs.
 
-```bash
-git clone <repo-url> && cd CodeMergeSystem
-python3.11 -m venv .venv && source .venv/bin/activate
-pip install -e ".[dev]"
+---
 
-export ANTHROPIC_API_KEY=sk-ant-...
-export OPENAI_API_KEY=sk-...
-```
+## See It In Action
 
-## 首次合并：完整流程
+<table>
+<tr>
+<td width="50%">
 
-下面是一次真实合并里你**会依次看到的内容 + 每步要做的判断**。第一次跑建议先来一遍 `--dry-run` 摸清规模再决定真合。
+**Plan Review** — 124 files analyzed, 87.9% auto-merge confidence, risk distribution across A–E change categories.
 
-### 1. 启动 + 首次配置向导
+![Plan Review](doc/project-2.png)
 
-```bash
-cd /path/to/your-fork-repo
-merge upstream/main --dry-run
-```
+</td>
+<td width="50%">
 
-首次运行进入交互向导，依次问你：
+**Conflict Resolution** — Side-by-side intent analysis of fork vs. upstream changes, with LLM-recommended merge strategy (SEMANTIC_MERGE 85% confidence).
 
-- **项目背景描述**（一句话即可，会喂给 Planner 帮它理解上下文）
-- **API Key 确认**（已 export 的会显示掩码，回车跳过表示沿用）
-- **阈值**（默认 `auto_merge=0.85 / risk_low=0.30 / risk_high=0.60`，新手直接默认）
+![Conflict Resolution](doc/project-4.png)
 
-之后系统在 `<repo>/.merge/` 写入 `config.yaml` + `.env`（后者自动加进 `.gitignore`），下次运行不再问。
+</td>
+</tr>
+<tr>
+<td width="50%">
 
-> **如果你的 fork 删过整片功能域**（例如砍掉了 payments 子树）：当系统检测到 ≥30 个被 fork 删除的文件时，向导会**主动提示生成 `forks-profile.yaml` 草稿**并打开 `$EDITOR` 让你审阅。低于阈值则完全静默 —— `fork_only_features` 与 `migration_policy` 已在每次 run 时自动从 git 推算，无需手工维护。
+**Judge Verdict** — Independent review agent audits every merged file; CRITICAL/HIGH/MEDIUM/LOW issue breakdown with repair rounds.
 
-### 2. dry-run 跑出合并计划
+![Judge Verdict](doc/project-5.png)
 
-向导通过后系统在浏览器打开 Web UI（`--no-web` 切纯文本输出）。你会看到 8 个 phase 依次推进：
+</td>
+<td width="50%">
 
-```
-INITIALIZE  → 三方分类、风险打分、forks-profile 路由
-PLANNING    → Planner 出合并计划
-PLAN_REVIEW → PlannerJudge 审查；最多 2 轮修订
-AWAITING_HUMAN → 你审阅计划报告
-...（dry-run 在此停止）
-```
+**Run Report** — Full cost accounting ($0.04 for 124 files), per-agent token breakdown, learned memory entries for future runs.
+
+![Run Report](doc/project-6.png)
+
+</td>
+</tr>
+</table>
+
+---
+
+## How It Works
 
-dry-run 结束后**重点看这两个文件**：
+Eight phases driven by a state machine. Seven specialized agents. Every write is snapshotted. Any `Ctrl+C` is safe.
 
 ```
-.merge/plans/MERGE_PLAN_<upstream>_<run_id>.md
-.merge/runs/<run_id>/plan_review.md
+┌─────────────────────────────────────────────────────────────┐
+│  CLI / Web UI                                               │
+│         │                                                   │
+│   Orchestrator ── 8-phase state machine                    │
+│         │                                                   │
+│  ┌──────┴───────┐                                          │
+│  │              │                                           │
+│ Agents        Tools              Memory                     │
+│ (7 roles)   (50+ deterministic   (L0/L1/L2                  │
+│              + AST parsers)       cross-run store)          │
+│  │                                                          │
+│ LLM layer (Anthropic + OpenAI, credential pool, routing)   │
+└─────────────────────────────────────────────────────────────┘
 ```
 
-报告会告诉你：
+| Phase | What happens |
+|-------|-------------|
+| `INITIALIZE` | 3-way classification, risk scoring, fork-profile routing |
+| `PLANNING` | Planner generates merge plan with per-file strategy |
+| `PLAN_REVIEW` | PlannerJudge audits the plan; up to 2 revision rounds |
+| `AWAITING_HUMAN` | You review the plan report; fill in any `HUMAN_REQUIRED` decisions |
+| `AUTO_MERGING` | Executor applies auto-safe files with snapshot-before-write |
+| `CONFLICT_ANALYSIS` | ConflictAnalyst does semantic analysis on risky conflicts |
+| `JUDGE_REVIEW` | Judge + 50+ deterministic scanners audit all merged output |
+| `COMPLETED` | Full report generated; you decide when to `git commit` |
+
+| Agent | Role | Default Model |
+|-------|------|---------------|
+| Planner | Generates merge plan | Claude Opus |
+| PlannerJudge | Reviews plan (read-only) | GPT-4o |
+| ConflictAnalyst | Semantic analysis of high-risk conflicts | Claude Sonnet |
+| Executor | **Sole write authority** — applies merges | GPT-4o |
+| Judge | Reviews merged output + runs deterministic checks | Claude Opus |
+| HumanInterface | Generates decision templates | Claude Haiku |
+| SmokeTest | Post-merge smoke testing | — |
+
+> **Why two LLM providers?** Planner/Judge use Anthropic; Executor/PlannerJudge use OpenAI. Using different providers for the reviewer and the writer avoids collusion bias.
+
+---
+
+## Features
+
+### [Six Lost-Pattern Detectors](doc/modules/tools.md)
+Shadow conflicts, interface reverse impacts, top-level call drops, config line preservation, scar auto-learning, and business sentinel scanning — the failure modes that `git merge` misses entirely.
+
+### [Snapshot-Before-Write](doc/modules/core.md)
+Every file write creates a snapshot of the original. Any failure triggers automatic rollback. You never end up with a half-merged file.
+
+### [Full-Run Checkpointing](doc/modules/core.md)
+State is persisted after every phase. `merge resume --run-id <id>` picks up exactly where you left off — useful for large merges that take hours.
+
+### [Explicit Human Decisions](doc/modules/agents.md)
+No `TIMEOUT_DEFAULT`. No silent fallbacks. Files that need human judgment generate a `decisions.yaml` template; skipped decisions stay as `AWAITING_HUMAN` until explicitly resolved.
+
+### [Multi-Language AST Chunking](doc/modules/tools.md)
+Python, TypeScript, JavaScript, Go, Rust, Java, and C all use tree-sitter for semantic-level diff — not just line-level.
+
+### [Cross-Run Memory](doc/modules/memory.md)
+Decisions, disputes, and metrics are summarized into a SQLite store. Future runs on the same repo load relevant history to inform planning.
+
+### [Baseline-Diff Gate](doc/modules/tools.md)
+CI validation only flags *newly introduced* failures — not pre-existing ones. Merging into a repo with a known broken test won't block you.
+
+### [Browser Web UI](doc/modules/web-ui.md)
+Real-time pipeline progress, conflict resolution UI, plan review, judge verdict — all in a local browser app. Use `--no-web` for pure terminal output or `--ci` for JSON output in CI.
+
+---
+
+## Compared to Alternatives
+
+| | Code Merge System | `git merge` / `git rebase` | GitHub/GitLab UI | LLM chat (ChatGPT etc.) |
+|--|--|--|--|--|
+| Handles 500+ file conflicts | ✅ | ❌ Manual, one-by-one | ❌ | ❌ Context limit |
+| Preserves fork-only features | ✅ Auto-detected via scar/sentinel | ❌ Easy to overwrite | ❌ | ❌ No repo context |
+| Auditable decision trail | ✅ Per-file, with rationale | ❌ | Partial (PR comments) | ❌ |
+| Resumable after interrupt | ✅ Checkpoint after every phase | ❌ | ❌ | ❌ |
+| Deterministic safety checks | ✅ 50+ scanners post-merge | ❌ | ❌ | ❌ |
+| Cost | ~$0.04 for 124 files | Free | Free | Per-token, no automation |
+
+---
+
+## Can You Trust the Output?
+
+A merge tool is only worth as much as the evidence that its output is correct. This project ships a **formal evaluation framework** and an **auditable self-learning loop** — and reports their results honestly, including where the numbers are not yet impressive.
+
+### Evaluation against human golden merges
+
+We do **not** ask the LLM judge to grade its own verdict. The framework under [`doc/evaluation/`](doc/evaluation/README.md) measures system output against **expert human golden merges as ground truth**, scoring five trust dimensions at once — a system that blindly takes upstream and scores 100% "coverage" while losing half the fork's work must still fail:
+
+| Dimension | Question it answers | Key metrics |
+|-----------|--------------------|-------------|
+| **Correctness** | Did it merge what should merge, correctly? | miss-merge rate, wrong-merge rate, conflict-resolution accuracy |
+| **Safety** | Did it silently drop private changes? | M1–M6 semantic-loss recall, security-sensitive escalation rate, snapshot rollback rate |
+| **Process Trust** | Does it escalate uncertainty instead of guessing? | over-escalation rate, plan-dispute hit rate, Judge↔ground-truth agreement |
+| **Explainability** | Can every decision be replayed? | rationale completeness, `discarded_content` retention, trace replayability |
+| **Operational** | Stable across re-runs and models? Cost bounded? | decision consistency, $/run, wall-time P95 |
+
+Three dataset tiers feed it: **Tier-1** micro-bench (30–60 PRs, runs in CI), **Tier-2** real long-span replays (human merge diff = oracle), **Tier-3** adversarial injections (does it actually catch M1–M6?). The harness lives in [`scripts/eval/`](scripts/eval/) (`prepare.py → run.py → diff_against_golden.py → summarize.py → gate.py`).
 
-- 触及多少文件、按 ABCDE 五类分布
-- auto_merge / conflict_analysis / human_required 的占比
-- forks-profile drift 附录（如果 yaml 老化）
-- Planner-Judge 审查记录
+**Hard gates that veto a release** ([`acceptance.md`](doc/evaluation/acceptance.md)): wrong-merge rate **= 0%**, security-sensitive escalation **= 100%**, private-content retention **= 100%**, snapshot rollback **= 100%**, duplicate top-level symbols **= 0**, hallucinated cross-module references **= 0**; miss-merge **≤ 2%** (Tier-1), each M1–M6 recall **≥ 95%**. Soft gates track overall accuracy (≥ 92% Tier-1), determinism (≥ 90% across 3 runs), cross-model consistency (≥ 85%), and cost/latency drift caps.
 
-### 3. 决定继续真合并还是先调整
+> **Honesty over marketing:** the version-baseline table in `acceptance.md` is still seeded with a template row — no release has cleared the full gate yet, so we make **no "evaluated & trusted" claim**. The framework exists precisely so that claim, when made, is backed by lockable dataset SHAs and per-file golden diffs rather than a "99% merge success" headline.
 
-如果计划合理：
+### Self-learning — measured, not assumed
+
+The system improves across runs **without weight fine-tuning and without embeddings** — a deliberate choice backed by a 24-source survey (see [`doc/plan/self-learning-system.md`](doc/plan/self-learning-system.md)): non-parametric, auditable SQLite memory + execution-grounded reflection beats opaque RL on cost and deletability.
+
+| Phase | What it does | Status |
+|-------|-------------|--------|
+| **P0** Effectiveness metric | Ablation harness: `memory=on` vs `memory=off` decision lift | **Landed** — `merge eval-memory` |
+| **P1** Grounded feedback loop | Persistent auditable suppression of harmful entries · confidence write-back from `judge`+`compile`+`ci` signals · verified-repair recipe library | **Landed**, feedback loops **opt-in** until ablation proves net gain |
+| **P2** Memory-quality hardening | High-information entries enforced · key invariants pinned against summarization drift | **Landed** |
+| **P3** Offline prompt optimization | `merge optimize-prompts` ranks gate-prompt variants against a golden set, emits a **human-review report — never auto-applies** | **Landed**, opt-in |
+
+The governing rule is **measure before you activate**: a feedback loop only flips to on-by-default after `merge eval-memory` shows lift **> 0** *and* causally-attributed harm **= 0** on a fixed dataset. First baseline (forgejo, 124 files): lift measured at **0.0000** — so the loops stay opt-in. That run was dominated by deterministic mechanisms (take-target + veto), leaving memory no room to act; it does **not** prove memory worthless, and an LLM-judgment-dense dataset is needed to measure real lift. We report the zero rather than hide it — that *is* the trust signal.
+
+---
+
+## Prerequisites
+
+| | |
+|--|--|
+| Python 3.11+ | mypy strict / Pydantic v2 / async throughout |
+| `ANTHROPIC_API_KEY` | Planner, ConflictAnalyst, Judge, HumanInterface |
+| `OPENAI_API_KEY` | PlannerJudge, Executor (dual-provider anti-collusion) |
+| `GITHUB_TOKEN` *(optional)* | GitHub integration — pull PR comments, push merge results |
+| Node.js *(optional)* | Web UI development only; the installed wheel bundles `src/web/dist/` |
+
+**Target repo must:**
+- Be a git repo with a clean working tree (`git status` shows no uncommitted changes)
+- Have upstream accessible locally — either as a branch or via `git fetch <remote>`
 
 ```bash
-merge upstream/main          # 不带 --dry-run，正式跑
+# If you haven't added upstream yet:
+git remote add upstream https://github.com/<owner>/<repo>.git
+git fetch upstream
 ```
 
-系统会从 `INITIALIZE` 开始重新走一遍直到 `AUTO_MERGING` / `CONFLICT_ANALYSIS`，写入文件、做快照、跑门禁。
+---
+
+## Full Workflow
+
+### 1. Plan (dry-run)
 
-> **任意时刻 Ctrl+C 都安全** —— 已经写盘的 checkpoint 让你下次用 `merge resume --run-id <id>` 续跑。
+```bash
+cd /path/to/your-fork-repo
+merge upstream/main --dry-run
+```
+
+The browser UI opens and runs through `INITIALIZE → PLANNING → PLAN_REVIEW → AWAITING_HUMAN` then stops. Check the output reports:
+
+```
+.merge/plans/MERGE_PLAN_<run_id>.md   # file-by-file merge strategy
+.merge/runs/<run_id>/plan_review.md   # PlannerJudge audit record
+```
+
+### 2. Merge
+
+```bash
+merge upstream/main     # remove --dry-run to run for real
+```
 
-### 4. 处理人工决策（AWAITING_HUMAN）
+Any `Ctrl+C` is safe — resume with `merge resume --run-id <id>`.
 
-当系统遇到 risk_score 高于 `human_escalation` 的文件、或 Judge 判定不通过时，会暂停在 `AWAITING_HUMAN`，并在 `.merge/runs/<run_id>/` 下生成一个待填的 `decisions.yaml` 模板：
+### 3. Handle Human Decisions
+
+When the system pauses at `AWAITING_HUMAN`, fill in `.merge/runs/<id>/decisions.yaml`:
 
 ```yaml
-# decisions.yaml — 系统生成模板，你填决定
 - file_path: "backend/services/auth/auth.service.ts"
-  decision: take_current        # 可选：take_target / take_current / semantic_merge / escalate_human
-  rationale: "fork 用 SSO，必须保留"
+  decision: take_current          # take_target / take_current / semantic_merge / escalate_human
+  rationale: "Fork uses SSO — must preserve"
 ```
 
-填完续跑：
+Then resume:
 
 ```bash
 merge resume --run-id <id> --decisions .merge/runs/<id>/decisions.yaml
 ```
 
-### 5. 最终产出
-
-合并跑完后看：
+### 4. Review and commit
 
-| 路径 | 说明 |
-|---|---|
-| `.merge/runs/<run_id>/merge_report.md` | 最终合并报告（变更摘要、Judge verdict、未解决项） |
-| `.merge/runs/<run_id>/checkpoint.json` | 完整状态，可继续 resume |
-| `.merge/runs/<run_id>/logs/run_<id>.log` | 全量执行日志 |
-| 工作树本身 | 合并产物已落到当前分支；`git status` 看具体改了什么；自己决定是否 `git commit` |
+```
+.merge/runs/<run_id>/merge_report.md    # final report
+.merge/runs/<run_id>/checkpoint.json    # full state
+.merge/runs/<run_id>/logs/run_<id>.log  # complete execution log
+```
 
-> **系统不自动 commit / push** —— 写到工作树就停手，让你 review 完再提交。
+The system stops after writing the merge result to the working tree. **It never auto-commits or auto-pushes** — you review, then decide.
 
-## 常用命令
+---
 
-按使用场景分组：
+## All Commands
 
 ```bash
-# === 首次接入 / 日常合并 ===
-merge <target-branch>                         # 一站式（默认浏览器 Web UI）
-merge <target-branch> --dry-run               # 只跑到 plan，不动文件
-merge <target-branch> --no-web                # 纯文本输出
-merge <target-branch> -r                      # 强制重新跑配置向导
-
-# === 续跑 / 决策 ===
-merge resume --run-id <id>                    # 从 checkpoint 续跑
-merge resume --run-id <id> --decisions decisions.yaml   # 带人工决策续跑
-merge resume --run-id <id> --web              # 在浏览器 Web UI 中续跑 / 查看历史 run 状态
-
-# === 校验 ===
-merge validate --config <path>                # 校验 config.yaml + 所有 api_key_env
-
-# === forks-profile（仅在做 fork 整域裁剪时用）===
-merge forks-profile init -o .merge/forks-profile.yaml   # 起草草稿
-merge forks-profile diff                                # 检查 yaml 是否过时
-merge forks-profile validate                            # 校验 yaml 语法
-
-# === CI ===
-merge <target-branch> --ci                    # 无交互，JSON 摘要到 stdout
+# Daily use
+merge <target-branch>                         # default: browser Web UI
+merge <target-branch> --dry-run               # plan only, no file writes
+merge <target-branch> --no-web                # terminal output
+merge <target-branch> -r                      # re-run setup wizard
+
+# Resume / decisions
+merge resume --run-id <id>
+merge resume --run-id <id> --decisions decisions.yaml
+merge resume --run-id <id> --web              # view history in browser
+
+# Validate
+merge validate --config <path>                # check config + all API keys
+
+# Fork profile (only needed when fork deleted ≥30 files)
+merge forks-profile init -o .merge/forks-profile.yaml
+merge forks-profile diff
+merge forks-profile validate
+
+# CI
+merge <target-branch> --ci                    # non-interactive, JSON summary to stdout
+merge <target-branch> --ci --auto-decisions <yaml>
 ```
 
-## 卡住了？
-
-| 现象 | 排查 |
-|---|---|
-| 向导报 "API Key not set" | 检查 `merge validate --config .merge/config.yaml`；shell env > `.merge/.env` > `~/.config/code-merge-system/.env` |
-| 启动报 "working tree dirty" | `git status` 看到未提交改动；`git stash` 或 `git commit` 后再跑 |
-| 启动报 "upstream ref not found" | 没 `git fetch upstream`，或者 `target-branch` 拼错（要写 `upstream/main` 不是 `main`） |
-| dry-run 卡在 PLAN_REVIEW 多轮 | Planner 与 PlannerJudge 在博弈；正常 1-2 轮，`max_plan_revision_rounds=2` 后会转 `AWAITING_HUMAN`，去看 `plan_review.md` |
-| 跑了一半中断 | 重新跑 `merge resume --run-id <id>`（`run_id` 在 `.merge/runs/` 下能看到） |
-| 想丢弃这次 run 重来 | `rm -rf .merge/runs/<id>/`，再 `merge <target-branch>` |
+---
 
-## 架构一览
+## Troubleshooting
 
-```
-CLI / Web UI
-       │
-  Orchestrator ── 状态机驱动 8 个 Phase
-       │
-  ┌────┴─────┐
-  │          │
-Agents     Tools            Memory
-(7 角色)  (50+ 工具 +         (L0/L1/L2
-         baseline parsers)   三层记忆)
-  │
-LLM 层（anthropic / openai，凭据池、智能路由、压缩）
-```
+| Symptom | Fix |
+|---------|-----|
+| `API Key not set` | Run `merge validate --config .merge/config.yaml`; check shell env → `.merge/.env` → `~/.config/code-merge-system/.env` |
+| `working tree dirty` | `git stash` or `git commit`, then re-run |
+| `upstream ref not found` | Run `git fetch upstream`; use `upstream/main` not `main` |
+| Plan review stuck in multiple rounds | Normal — Planner and PlannerJudge are negotiating; after `max_plan_revision_rounds=2` it transitions to `AWAITING_HUMAN`. Check `plan_review.md`. |
+| Run interrupted mid-way | `merge resume --run-id <id>` (find `run_id` under `.merge/runs/`) |
+| Want to start over | `rm -rf .merge/runs/<id>/`, then re-run |
 
-| Agent | 角色 | 默认模型 |
-|-------|------|----------|
-| Planner | 生成合并计划 | Claude Opus |
-| PlannerJudge | 审查计划 | GPT-4o |
-| ConflictAnalyst | 高风险冲突语义分析 | Claude Sonnet |
-| Executor | **唯一写权限**，应用合并 | GPT-4o |
-| Judge | 审查合并结果 + 确定性复检 | Claude Opus |
-| HumanInterface | 决策模板生成 | Claude Haiku |
-| SmokeTest | 合并后冒烟测试 | — |
+---
 
-每个 Agent 的模型、API Key、降档策略均可在 `config.yaml` 中独立配置。
+## Development
 
-## `.merge/` 生产目录布局
+```bash
+git clone <repo-url> && cd code-merge-system
+python3.11 -m venv .venv && source .venv/bin/activate
+pip install -e ".[dev]"
 
-pip 安装后在目标仓库运行时，所有产物写入 `<repo>/.merge/`：
+pytest tests/unit/ -q               # unit tests (no LLM calls)
+pytest tests/integration/ -v        # integration tests (real API, local only)
+mypy src                            # type check (strict)
+ruff check src/ && ruff format src/ # lint + format
 
-```
-.merge/
-  config.yaml        # 首次运行自动生成
-  .env               # API Keys，自动 gitignore
-  .gitignore         # 自动生成
-  plans/             # MERGE_PLAN_<id>.md 报告
-  runs/<run_id>/
-    checkpoint.json
-    merge_report.md
-    plan_review.md
-    logs/run_<id>.log
+# Web UI (only needed for frontend changes)
+cd web && npm install
+cd web && npm run dev               # Vite dev server at localhost:5173
+cd web && npm run build             # tsc + build → web/dist/
+cd web && npm test                  # vitest
 ```
 
-API Key 解析顺序：**Shell env → `.merge/.env` → `~/.config/code-merge-system/.env`**
+**Architecture constraints enforced by unit tests — do not violate:**
 
-## 文档
+- No `TIMEOUT_DEFAULT` on `DecisionSource` — human decisions must be explicit
+- `Judge` / `PlannerJudge` receive `ReadOnlyStateView` — no state writes from reviewer agents
+- `Executor` uses `apply_with_snapshot()` — no direct file writes
+- `plan_revision_rounds >= max` → `AWAITING_HUMAN`, not `FAILED`
+- `HumanInterface` never fills in default decisions
 
-完整中文文档索引见 [`doc/README.md`](doc/README.md)。关键入口：
+---
 
-- [**新人上手指南**](doc/modules/onboarding.md) — 第一次接触本项目必读
-- [系统架构](doc/architecture.md) — 分层 / 数据流 / 持久化 / 扩展点
-- [执行流程与状态机](doc/flow.md) — 13 个状态、8 个 Phase
-- [六大丢失模式 + P0/P1/P2 加固项](doc/multi-agent-optimization-from-merge-experience.md)
-- [迁移感知合并](doc/migration-aware-merge.md) — bulk-copy 场景
-- [风险等级](doc/risk-levels.md)
+## Contributing
 
-模块技术文档（`doc/modules/`）：
+Contributions are welcome — whether it's a bug report, a feature idea, or a pull request.
 
-| 模块 | 文档 |
-|---|---|
-| 数据模型（Pydantic v2） | [data-models.md](doc/modules/data-models.md) |
-| Agents | [agents.md](doc/modules/agents.md) |
-| Core（Orchestrator / Phases / Checkpoint） | [core.md](doc/modules/core.md) |
-| Tools（扫描器 / 门禁 / Git） | [tools.md](doc/modules/tools.md) |
-| LLM 层（路由 / 压缩 / 凭据池） | [llm.md](doc/modules/llm.md) |
-| 记忆系统（L0/L1/L2） | [memory.md](doc/modules/memory.md) |
-| CLI / Web UI | [cli.md](doc/modules/cli.md) |
-| Web UI 浏览器端用户旅程 | [web-ui.md](doc/web-ui.md) |
+**Good places to start:**
 
-## 参考开源项目
+- 🐛 **[Report a bug](../../issues/new?template=bug_report.md)** — include your Python version, the command you ran, and the relevant log from `.merge/runs/<id>/logs/`
+- 💡 **[Request a feature](../../issues/new?template=feature_request.md)** — describe your fork/upstream scenario and what the system currently gets wrong
+- 🔧 **[Browse open issues](../../issues)** — look for `good first issue` labels if you want a guided starting point
 
-本项目在设计过程中参考了多个开源实现，相关分析文档位于 [`doc/references/`](doc/references/)：
+**Before submitting a PR:**
 
-| 项目 | 类型 | 借鉴点 |
-|---|---|---|
-| [Weave](https://github.com/ataraxy-labs/weave) | 语义合并引擎 | tree-sitter entity-level merge；函数/类粒度三方合并 |
-| [merge-engine](https://docs.rs/merge-engine/) | Rust 合并库 | 4 层合并策略（Pattern DSL → CST → VSA → Genetic） |
-| [Mergiraf](https://mergiraf.org/) | AST 结构化合并 | AST 级语法感知合并 |
-| [git-machete](https://github.com/VirtusLab/git-machete) | 分支工作流 | Fork-point 推断 + `--override-to` 手动校正 |
-| [mergefix](https://pypi.org/project/mergefix/) | AI 冲突修复 | LLM 后处理冲突标记 |
-| [reconcile-ai](https://github.com/kailashchanel/reconcile-ai) | 批量冲突修复 | 批量提示节省成本 |
-| [clash](https://github.com/clash-sh/clash) | 并行 Agent | Worktree 级冲突检测 |
-| [NousResearch/hermes-agent](https://github.com/nousresearch/hermes-agent) | Agent 架构 | 工具抽象与 Agent 协作模式 |
-| Graphify | 代码知识图谱 | 用图谱压缩代码上下文 |
-| MemPalace | 记忆系统 | 语义索引 + 分层记忆 |
+1. Run `pytest tests/unit/` — all tests must pass
+2. Run `mypy src` — no new type errors
+3. Run `ruff check src/` — no lint errors
+4. Keep new files under 800 lines; organize by feature layer (`models → tools → llm → agents → core → cli`)
+5. New agents require a contract YAML file under `src/agents/contracts/` — see [`src/agents/contracts/_schema.md`](src/agents/contracts/_schema.md)
 
-详细对照见 [`doc/references/opensource-comparison.md`](doc/references/opensource-comparison.md) 与各 `*-analysis.md`。
+**Key docs for contributors:**
 
-## 开发
+- [System Architecture](doc/architecture.md) — layers, data flow, persistence, extension points
+- [State Machine & Phases](doc/flow.md) — all 13 states and 8 phases
+- [Agent Contracts](src/agents/contracts/_schema.md) — how to add a new agent correctly
+- [Adding a New Agent](doc/modules/agents.md) — step-by-step recipe
 
-```bash
-pytest tests/unit/ -q              # 单元测试（不打 LLM API）
-pytest tests/integration/ -v       # 集成测试（打真 API，本地跑，不进 CI）
-mypy src                           # 类型检查（strict 模式）
-ruff check src/                    # Lint
-ruff format src/                   # 格式化
-
-# Web UI 开发（pip 安装的 wheel 已内置 src/web/dist；下面命令仅开发时需要）
-cd web && npm install              # 装依赖
-cd web && npm run start            # 启动 Vite dev server
-cd web && npm run build            # tsc + vite build → web/dist/
-cd web && npm test                 # vitest
-```
+---
 
-关键约束（PR review 会检查）：
+## Documentation
 
-- 不要给 `DecisionSource` 加 `TIMEOUT_DEFAULT`
-- Judge / PlannerJudge 只接收 `ReadOnlyStateView`
-- Executor 写文件必须走 `apply_with_snapshot()`
-- `plan_revision_rounds >= max` 时转 `AWAITING_HUMAN`，不是 `FAILED`
-- HumanInterface 不填默认值
+Full index: [`doc/README.md`](doc/README.md)
 
-## 许可证
+| | |
+|--|--|
+| [Onboarding Guide](doc/modules/onboarding.md) | Start here if you're new to the project |
+| [Architecture](doc/architecture.md) | Layers, data flow, persistence, extension points |
+| [Flow & State Machine](doc/flow.md) | 13 states, 8 phases |
+| [Six Lost Patterns + P0/P1/P2 Hardening](doc/multi-agent-optimization-from-merge-experience.md) | How we catch what `git merge` misses |
+| [Evaluation Framework](doc/evaluation/README.md) | Golden-merge ground truth, 5 trust dimensions, 3 dataset tiers, acceptance gates |
+| [Self-Learning System](doc/plan/self-learning-system.md) | Non-parametric memory + grounded feedback loop, phased rollout |
+| [Migration-Aware Merge](doc/migration-aware-merge.md) | Handling bulk-copy scenarios |
+| [Risk Levels](doc/risk-levels.md) | How files are classified A–E |
+| [Web UI User Journey](doc/web-ui.md) | Browser-side walkthrough |
+
+---
+
+## License
+
+MIT
+
+---
 
-TBD
+<div align="center">
+  <sub>Built for teams that maintain long-lived forks and need more than <code>git merge</code>.</sub>
+</div>
diff --git a/README_zh.md b/README_zh.md
new file mode 100644
index 0000000..26c0823
--- /dev/null
+++ b/README_zh.md
@@ -0,0 +1,412 @@
+<div align="center">
+
+**中文** | [English](README.md)
+
+# 🔀 Code Merge System
+
+### 把长期分叉仓库的 upstream 升级变成一条流水线——而不是 500 个文件冲突。
+
+一个多 Agent 合并管道，把数月的 upstream 积压变成**可审计、可恢复、安全**的合并——同时保留 fork 里的每一处定制。
+
+[![Python 3.11+](https://img.shields.io/badge/python-3.11+-3776AB.svg?logo=python&logoColor=white)](https://python.org)
+[![测试](https://img.shields.io/badge/tests-passing-brightgreen.svg)](#开发)
+[![覆盖率](https://img.shields.io/badge/coverage-80%25+-brightgreen.svg)](#开发)
+[![许可证](https://img.shields.io/badge/license-MIT-blue.svg)](#许可证)
+[![Powered by](https://img.shields.io/badge/powered%20by-Claude%20%2B%20GPT-orange.svg)](https://anthropic.com)
+
+![Code Merge System 控制台](doc/project-1.png)
+
+</div>
+
+---
+
+## 问题在哪里
+
+长期维护 fork 的团队在同步 upstream 时面临的现实是残酷的：
+
+- **数百到数千个文件冲突**——根本无法逐一人工处理
+- **行级 diff 掩盖了语义意图**——LLM 和人都容易判断出错
+- **fork 独有的定制被静默覆盖**——API、路由、CI job、哨兵逻辑消失了，无人察觉
+- **一处合并错误就可能导致运行时漏洞或功能缺失**——而且难以回滚
+
+`git merge` 给你一份冲突列表。Code Merge System 给你一条**决策流水线**。
+
+---
+
+## 快速开始
+
+```bash
+pip install code-merge-system
+
+export ANTHROPIC_API_KEY=sk-ant-...
+export OPENAI_API_KEY=sk-...
+
+cd /path/to/your-fork-repo
+merge upstream/main --dry-run    # 先预览合并计划，不动任何文件
+```
+
+> 首次运行会打开浏览器并引导完成一次性配置向导。配置保存至 `.merge/config.yaml`，之后运行无需再配置。
+
+---
+
+## 界面一览
+
+<table>
+<tr>
+<td width="50%">
+
+**计划审查** — 124 个文件，87.9% 自动合并置信度，A–E 五类变更分布。
+
+![计划审查](doc/project-2.png)
+
+</td>
+<td width="50%">
+
+**冲突解决** — 并排展示 fork 与 upstream 的变更意图，LLM 给出合并策略推荐（SEMANTIC_MERGE 85% 置信度）。
+
+![冲突解决](doc/project-4.png)
+
+</td>
+</tr>
+<tr>
+<td width="50%">
+
+**Judge 裁决** — 独立 Review Agent 审查每个已合并文件；按 CRITICAL/HIGH/MEDIUM/LOW 分级列出问题，支持多轮修复。
+
+![Judge 裁决](doc/project-5.png)
+
+</td>
+<td width="50%">
+
+**运行报告** — 完整费用明细（124 个文件 $0.04），每个 Agent 的 token 用量，以及本次 run 写入的记忆条目。
+
+![运行报告](doc/project-6.png)
+
+</td>
+</tr>
+</table>
+
+---
+
+## 工作原理
+
+八个阶段由状态机驱动。七个专门化 Agent。每次文件写入前自动快照。任意时刻 `Ctrl+C` 都安全。
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│  CLI / Web UI                                               │
+│         │                                                   │
+│   Orchestrator ── 8 阶段状态机                              │
+│         │                                                   │
+│  ┌──────┴───────┐                                          │
+│  │              │                                           │
+│ Agents        Tools              Memory                     │
+│ (7 个角色)   (50+ 确定性工具      (L0/L1/L2                  │
+│              + AST 解析器)        跨 run 记忆存储)           │
+│  │                                                          │
+│ LLM 层（Anthropic + OpenAI，凭据池，智能路由）               │
+└─────────────────────────────────────────────────────────────┘
+```
+
+| 阶段 | 发生了什么 |
+|------|-----------|
+| `INITIALIZE` | 三方分类、风险打分、forks-profile 路由 |
+| `PLANNING` | Planner 生成每文件合并策略 |
+| `PLAN_REVIEW` | PlannerJudge 审查计划；最多 2 轮修订 |
+| `AWAITING_HUMAN` | 你审阅计划报告；填入 `HUMAN_REQUIRED` 决策 |
+| `AUTO_MERGING` | Executor 应用自动安全文件，写前快照 |
+| `CONFLICT_ANALYSIS` | ConflictAnalyst 对高风险冲突做语义分析 |
+| `JUDGE_REVIEW` | Judge + 50+ 确定性扫描器审查所有合并产物 |
+| `COMPLETED` | 生成完整报告；你决定何时 `git commit` |
+
+| Agent | 角色 | 默认模型 |
+|-------|------|----------|
+| Planner | 生成合并计划 | Claude Opus |
+| PlannerJudge | 审查计划（只读） | GPT-4o |
+| ConflictAnalyst | 高风险冲突语义分析 | Claude Sonnet |
+| Executor | **唯一写权限**——应用合并 | GPT-4o |
+| Judge | 审查合并结果 + 确定性复检 | Claude Opus |
+| HumanInterface | 生成决策模板 | Claude Haiku |
+| SmokeTest | 合并后冒烟测试 | — |
+
+> **为什么用两个 LLM 提供商？** Planner/Judge 使用 Anthropic；Executor/PlannerJudge 使用 OpenAI。审查者与执行者用不同供应商，消除共谋偏差。
+
+---
+
+## 功能特性
+
+### [六大丢失模式检测](doc/modules/tools.md)
+Shadow 冲突、接口反向影响、顶层调用丢失、配置行保留、Scar 自学习、业务哨兵扫描——这些是 `git merge` 完全检测不到的失败模式。
+
+### [写前快照](doc/modules/core.md)
+每次文件写入前自动保存原始内容。任何失败触发自动回滚。你不会遇到"合并到一半的文件"。
+
+### [全程 Checkpoint](doc/modules/core.md)
+每个阶段结束后持久化状态。`merge resume --run-id <id>` 从上次停止处精确继续——对耗时数小时的大型合并尤其重要。
+
+### [显式人工决策](doc/modules/agents.md)
+没有 `TIMEOUT_DEFAULT`，没有静默回退。需要人工判断的文件会生成 `decisions.yaml` 模板；跳过的决策保持 `AWAITING_HUMAN` 状态，直到明确填写为止。
+
+### [多语言 AST 分块](doc/modules/tools.md)
+Python、TypeScript、JavaScript、Go、Rust、Java 和 C 均走 tree-sitter，做语义级 diff——而不只是行级。
+
+### [跨 Run 记忆](doc/modules/memory.md)
+决策、争议和指标被汇总写入 SQLite 存储。后续在同一仓库上的 run 会加载相关历史记录来辅助规划。
+
+### [Baseline-Diff 门禁](doc/modules/tools.md)
+CI 验证只标记*新引入*的失败，而非已有的。合并到测试本来就挂的仓库时，不会被已有失败阻塞。
+
+### [浏览器 Web UI](doc/modules/web-ui.md)
+实时流水线进度、冲突解决界面、计划审查、Judge 裁决——全部在本地浏览器 App 中。用 `--no-web` 切换纯终端输出，或 `--ci` 输出 JSON 供 CI 使用。
+
+---
+
+## 与同类工具对比
+
+| | Code Merge System | `git merge` / `git rebase` | GitHub/GitLab UI | LLM 对话（ChatGPT 等） |
+|--|--|--|--|--|
+| 处理 500+ 文件冲突 | ✅ | ❌ 手动逐一处理 | ❌ | ❌ 上下文限制 |
+| 保留 fork 独有功能 | ✅ 通过 scar/sentinel 自动检测 | ❌ 容易被覆盖 | ❌ | ❌ 无仓库上下文 |
+| 可审计的决策记录 | ✅ 每文件附理由 | ❌ | 部分（PR 评论） | ❌ |
+| 中断后可恢复 | ✅ 每阶段 Checkpoint | ❌ | ❌ | ❌ |
+| 确定性安全检查 | ✅ 50+ 扫描器合并后复检 | ❌ | ❌ | ❌ |
+| 费用 | ~$0.04 / 124 文件 | 免费 | 免费 | 按 token 计费，无自动化 |
+
+---
+
+## 能不能信任合并产物？
+
+一个合并工具的价值，取决于它能拿出多少"产物正确"的证据。本项目配套了一套**正式测评系统**和一条**可审计的自学习闭环**——并且如实汇报结果，包括那些目前还不漂亮的数字。
+
+### 以人工黄金合并为 Ground Truth 的测评
+
+我们**不**让 LLM Judge 给自己的 verdict 打分。[`doc/evaluation/`](doc/evaluation/README.md) 下的测评系统以**专家人工黄金合并（Human Golden Merge）作为 Ground Truth**，按统一差分协议度量系统产物与黄金合并的偏差，并同时考核五个信任维度——一个"全部直接 take_target、覆盖率 100% 却丢了一半 fork 改动"的系统必须在这里被判不通过：
+
+| 维度 | 回答的问题 | 主要指标 |
+|------|-----------|---------|
+| **正确性** | 该合的合了没？合得对不对？ | 漏合率、错合率、冲突解决正确率 |
+| **安全性** | 有没有偷偷丢掉私有改动？ | M1–M6 语义丢失召回、安全敏感文件人工率、快照回滚率 |
+| **过程可信** | 不确定的事会上报还是硬猜？ | 升级率、Plan Dispute 命中率、Judge↔Ground Truth 一致率 |
+| **可解释性** | 每个决策都能复盘吗？ | rationale 完整率、`discarded_content` 留存率、Trace 可回放率 |
+| **运行稳健** | 重复跑、换模型结果稳吗？成本可控吗？ | 决策一致性、$/run、wall-time P95 |
+
+三层评估集支撑它：**Tier-1** 微基准（30–60 PR，可进 CI 天天跑）、**Tier-2** 真实长跨度回放（人工合并 diff 即 oracle）、**Tier-3** 对抗注入集（系统真能识别 M1–M6 吗）。评估 harness 位于 [`scripts/eval/`](scripts/eval/)（`prepare.py → run.py → diff_against_golden.py → summarize.py → gate.py`）。
+
+**一票否决的硬门**（[`acceptance.md`](doc/evaluation/acceptance.md)）：错合率 **= 0%**、安全敏感升级率 **= 100%**、私有内容留存率 **= 100%**、快照回滚成功率 **= 100%**、重复顶层符号数 **= 0**、幻觉跨模块引用数 **= 0**；漏合率 **≤ 2%**（Tier-1），M1–M6 各类召回 **≥ 95%**。软门跟踪总正确率（≥ 92% Tier-1）、决策一致性（3 次 run ≥ 90%）、跨模型一致性（≥ 85%）以及成本/时延漂移上限。
+
+> **诚实优先于营销：** `acceptance.md` 的版本基线表目前仍是模板行——尚无任何版本跑通完整 gate，因此我们**不对外宣称"已通过评估、可信"**。这套系统存在的意义，正是为了让这句承诺一旦做出，背后是可锁定的数据集 SHA 与逐文件黄金差分，而不是一句"合并成功率 99%"。
+
+### 自学习——靠度量，而非假设
+
+系统跨 run 自我改进，**不微调权重、不引入 embedding**——这是有 24 个来源的调研支撑的刻意选择（见 [`doc/plan/self-learning-system.md`](doc/plan/self-learning-system.md)）：非参数化、可审计的 SQLite 记忆 + 执行接地的反思，在成本与"可删除性"上胜过不透明的权重 RL。
+
+| 阶段 | 做什么 | 状态 |
+|------|-------|------|
+| **P0** 有效性度量 | 消融 harness：`memory=on` vs `memory=off` 决策增益 | **已落地** — `merge eval-memory` |
+| **P1** 执行接地反馈环 | 有害条目持久可审计软删 · 由 `judge`+`compile`+`ci` 信号写回 confidence · verified-repair 修复配方库 | **已落地**，反馈环在消融证明净收益前默认 **opt-in** |
+| **P2** 记忆质量加固 | 强制高信息条目 · 关键不变量锚定防摘要漂移 | **已落地** |
+| **P3** 离线提示优化 | `merge optimize-prompts` 按黄金集排名 gate 提示变体，产**人工评审报告——绝不自动写回** | **已落地**，opt-in |
+
+核心准则是**先度量再激活**：任一反馈环只有在 `merge eval-memory` 于固定数据集上显示增益 **> 0** **且**因果归因的有害数 **= 0** 后，才翻为默认开启。首组基线（forgejo，124 文件）实测增益 **0.0000**——所以反馈环维持 opt-in。那次 run 由确定性机制（take-target + veto）主导，记忆没有用武之地；这**不**证明记忆无价值，需要 LLM 判断密集的数据集才能测出真实增益。我们如实报告这个零，而非藏起来——这本身就是信任信号。
+
+---
+
+## 前置要求
+
+| | |
+|--|--|
+| Python 3.11+ | mypy strict / Pydantic v2 / async 全程 |
+| `ANTHROPIC_API_KEY` | Planner、ConflictAnalyst、Judge、HumanInterface |
+| `OPENAI_API_KEY` | PlannerJudge、Executor（双供应商防共谋） |
+| `GITHUB_TOKEN` *（可选）* | GitHub 集成——拉取 PR 评论、推送合并结果 |
+| Node.js *（可选）* | 仅 Web UI 开发；安装包已内置 `web/dist/` |
+
+**目标仓库需满足：**
+- 是一个 git 仓库，且工作树干净（`git status` 无未提交改动）
+- upstream 可在本地访问——作为分支或通过 `git fetch <remote>` 已拉取
+
+```bash
+# 如果还没添加 upstream 远端：
+git remote add upstream https://github.com/<owner>/<repo>.git
+git fetch upstream
+```
+
+---
+
+## 完整流程
+
+### 1. 预跑（dry-run）
+
+```bash
+cd /path/to/your-fork-repo
+merge upstream/main --dry-run
+```
+
+浏览器打开并依次推进 `INITIALIZE → PLANNING → PLAN_REVIEW → AWAITING_HUMAN`，然后停止。重点查看：
+
+```
+.merge/plans/MERGE_PLAN_<run_id>.md   # 每文件合并策略
+.merge/runs/<run_id>/plan_review.md   # PlannerJudge 审查记录
+```
+
+### 2. 正式合并
+
+```bash
+merge upstream/main     # 去掉 --dry-run 正式运行
+```
+
+任意时刻 `Ctrl+C` 都安全——用 `merge resume --run-id <id>` 续跑。
+
+### 3. 处理人工决策
+
+当系统在 `AWAITING_HUMAN` 暂停时，填写 `.merge/runs/<id>/decisions.yaml`：
+
+```yaml
+- file_path: "backend/services/auth/auth.service.ts"
+  decision: take_current          # take_target / take_current / semantic_merge / escalate_human
+  rationale: "fork 用 SSO，必须保留"
+```
+
+然后续跑：
+
+```bash
+merge resume --run-id <id> --decisions .merge/runs/<id>/decisions.yaml
+```
+
+### 4. 审阅并提交
+
+```
+.merge/runs/<run_id>/merge_report.md    # 最终合并报告
+.merge/runs/<run_id>/checkpoint.json    # 完整状态
+.merge/runs/<run_id>/logs/run_<id>.log  # 全量执行日志
+```
+
+系统在写入工作树后停手。**它不会自动 commit 或 push**——你审阅完再决定。
+
+---
+
+## 常用命令
+
+```bash
+# 日常使用
+merge <target-branch>                         # 默认：浏览器 Web UI
+merge <target-branch> --dry-run               # 仅跑到计划，不动文件
+merge <target-branch> --no-web                # 纯终端输出
+merge <target-branch> -r                      # 重新运行配置向导
+
+# 续跑 / 决策
+merge resume --run-id <id>
+merge resume --run-id <id> --decisions decisions.yaml
+merge resume --run-id <id> --web              # 在浏览器中查看历史 run
+
+# 校验
+merge validate --config <path>                # 检查 config + 所有 API Key
+
+# Fork Profile（仅在 fork 删除了 ≥30 个文件时需要）
+merge forks-profile init -o .merge/forks-profile.yaml
+merge forks-profile diff
+merge forks-profile validate
+
+# CI
+merge <target-branch> --ci                    # 无交互，JSON 摘要输出到 stdout
+merge <target-branch> --ci --auto-decisions <yaml>
+```
+
+---
+
+## 排查问题
+
+| 现象 | 解决方法 |
+|------|---------|
+| `API Key not set` | 运行 `merge validate --config .merge/config.yaml`；检查 shell env → `.merge/.env` → `~/.config/code-merge-system/.env` |
+| `working tree dirty` | `git stash` 或 `git commit`，再重跑 |
+| `upstream ref not found` | 执行 `git fetch upstream`；用 `upstream/main` 而非 `main` |
+| Plan review 卡在多轮协商 | 正常现象——Planner 与 PlannerJudge 在博弈；`max_plan_revision_rounds=2` 后自动转 `AWAITING_HUMAN`，去看 `plan_review.md` |
+| 中途中断 | `merge resume --run-id <id>`（`run_id` 在 `.merge/runs/` 下可以看到） |
+| 想从头来过 | `rm -rf .merge/runs/<id>/`，再重新运行 |
+
+---
+
+## 开发
+
+```bash
+git clone <repo-url> && cd code-merge-system
+python3.11 -m venv .venv && source .venv/bin/activate
+pip install -e ".[dev]"
+
+pytest tests/unit/ -q               # 单元测试（不打 LLM API）
+pytest tests/integration/ -v        # 集成测试（真实 API，本地运行，不进 CI）
+mypy src                            # 类型检查（strict 模式）
+ruff check src/ && ruff format src/ # Lint + 格式化
+
+# Web UI（仅前端改动时需要）
+cd web && npm install
+cd web && npm run dev               # Vite dev server，localhost:5173
+cd web && npm run build             # tsc + build → web/dist/
+cd web && npm test                  # vitest
+```
+
+**由单测强制保证的架构约束——不得违反：**
+
+- `DecisionSource` 不加 `TIMEOUT_DEFAULT`——人工决策必须显式
+- `Judge` / `PlannerJudge` 只接收 `ReadOnlyStateView`——审查 Agent 不得写 state
+- `Executor` 使用 `apply_with_snapshot()`——不得直接写文件
+- `plan_revision_rounds >= max` 时转 `AWAITING_HUMAN`，不是 `FAILED`
+- `HumanInterface` 不填入默认决策
+
+---
+
+## 参与贡献
+
+欢迎任何形式的贡献——无论是 Bug 报告、功能建议还是 PR。
+
+**适合入手的地方：**
+
+- 🐛 **[报告 Bug](../../issues/new?template=bug_report.md)** — 请附上 Python 版本、运行的命令，以及 `.merge/runs/<id>/logs/` 中的相关日志
+- 💡 **[提功能建议](../../issues/new?template=feature_request.md)** — 描述你的 fork/upstream 场景，以及系统目前哪里处理得不对
+- 🔧 **[浏览 Issues](../../issues)** — 找标有 `good first issue` 标签的任务，适合初次贡献
+
+**提交 PR 前请确认：**
+
+1. `pytest tests/unit/` 全部通过
+2. `mypy src` 无新类型错误
+3. `ruff check src/` 无 lint 错误
+4. 新文件不超过 800 行；按功能层组织（`models → tools → llm → agents → core → cli`）
+5. 新 Agent 需在 `src/agents/contracts/` 下创建 contract yaml，参见 [`src/agents/contracts/_schema.md`](src/agents/contracts/_schema.md)
+
+**贡献者必读文档：**
+
+- [系统架构](doc/architecture.md) — 分层、数据流、持久化、扩展点
+- [状态机与阶段](doc/flow.md) — 全部 13 个状态和 8 个阶段
+- [Agent Contract](src/agents/contracts/_schema.md) — 如何正确添加新 Agent
+- [新增 Agent 指南](doc/modules/agents.md) — 分步操作手册
+
+---
+
+## 文档
+
+完整索引见 [`doc/README.md`](doc/README.md)
+
+| | |
+|--|--|
+| [新人上手指南](doc/modules/onboarding.md) | 第一次接触本项目必读 |
+| [系统架构](doc/architecture.md) | 分层、数据流、持久化、扩展点 |
+| [执行流程与状态机](doc/flow.md) | 13 个状态、8 个阶段 |
+| [六大丢失模式 + P0/P1/P2 加固](doc/multi-agent-optimization-from-merge-experience.md) | 我们如何捕获 `git merge` 遗漏的失败 |
+| [测评系统](doc/evaluation/README.md) | 黄金合并 Ground Truth、五大信任维度、三层评估集、验收门 |
+| [自学习系统](doc/plan/self-learning-system.md) | 非参数化记忆 + 执行接地反馈环，分阶段落地 |
+| [迁移感知合并](doc/migration-aware-merge.md) | 批量复制场景的处理 |
+| [风险等级](doc/risk-levels.md) | 文件如何被分为 A–E 类 |
+| [Web UI 用户旅程](doc/web-ui.md) | 浏览器端完整操作流程 |
+
+---
+
+## 许可证
+
+MIT
+
+---
+
+<div align="center">
+  <sub>为长期维护 fork、不满足于 <code>git merge</code> 的团队而生。</sub>
+</div>
diff --git a/pyproject.toml b/pyproject.toml
index 76fa56a..3ace067 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,9 +37,9 @@ dependencies = [
 ]
 
 [project.urls]
-Homepage = "https://github.com/angel/code-merge-system"
-Repository = "https://github.com/angel/code-merge-system"
-Issues = "https://github.com/angel/code-merge-system/issues"
+Homepage = "https://github.com/GOSICK-Angel/code-merge-system"
+Repository = "https://github.com/GOSICK-Angel/code-merge-system"
+Issues = "https://github.com/GOSICK-Angel/code-merge-system/issues"
 
 [project.scripts]
 merge = "src.cli.main:cli"

+ +计划审查 — 124 个文件，87.9% 自动合并置信度，A–E 五类变更分布。 + +![计划审查](doc/project-2.png) + +	+ +冲突解决 — 并排展示 fork 与 upstream 的变更意图，LLM 给出合并策略推荐（SEMANTIC_MERGE 85% 置信度）。 + +![冲突解决](doc/project-4.png) + +
+ +Judge 裁决 — 独立 Review Agent 审查每个已合并文件；按 CRITICAL/HIGH/MEDIUM/LOW 分级列出问题，支持多轮修复。 + +![Judge 裁决](doc/project-5.png) + +	+ +运行报告 — 完整费用明细（124 个文件 $0.04），每个 Agent 的 token 用量，以及本次 run 写入的记忆条目。 + +![运行报告](doc/project-6.png) + +