From ba08156a28405594d9a6fd292f5515752e947dca Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 05:29:35 -0400 Subject: [PATCH 01/22] =?UTF-8?q?feat(memory):=20=E6=B7=BB=E5=8A=A0=20P0?= =?UTF-8?q?=20=E8=AE=B0=E5=BF=86=E6=B6=88=E8=9E=8D=20harness=20=E4=B8=8E?= =?UTF-8?q?=E6=B3=A8=E5=85=A5=E5=BC=80=E5=85=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 激活方案 Phase 0 的离线消融能力,给此前无生产 caller 的 compare_memory_effectiveness 接上真实入口,并补齐文档误以为已存在、 实则缺失的 memory=off 能力。 **消融开关** - MemoryExtractionConfig.inject_enabled(默认 True):False 时 orchestrator._inject_memory 跳过 set_memory_store,使 get_memory_context 返回空 = memory=off 臂;抽取/写回经 orchestrator 自有 store 不受影响 **离线 harness** (src/tools/memory_replay.py,纯读) - load_effectiveness_report:从 run 目录或 json 载入 memory_effectiveness.json - build_ablation_comparison:包装 compare_memory_effectiveness(首个真实 caller) - render_ablation_table:输出 memory_decision_lift 对比表 **CLI** - merge eval-memory --on --off [--out]:载入两臂报告→对比→打印表→可选落 JSON **测试** tests/unit/test_memory_replay.py:14 用例覆盖 load/compare/render 与开关真实接线(断言 inject_enabled=False 不 wire store) --- CLAUDE.md | 5 + src/cli/main.py | 58 ++++++++++ src/core/orchestrator.py | 8 +- src/models/config.py | 9 ++ src/tools/memory_replay.py | 98 ++++++++++++++++ tests/unit/test_memory_replay.py | 192 +++++++++++++++++++++++++++++++ 6 files changed, 369 insertions(+), 1 deletion(-) create mode 100644 src/tools/memory_replay.py create mode 100644 tests/unit/test_memory_replay.py diff --git a/CLAUDE.md b/CLAUDE.md index 81b9096..d69c086 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -37,8 +37,13 @@ merge validate --config # validate config + env vars merge init [--repo-path .] # generate per-target CLAUDE.md for merge decisions merge plan-suggest [--target ... --candidates ...] 
# enumerate baseline commit-windows merge forks-profile init # scaffold .merge/forks-profile.yaml (recommended ≥30 fork-deleted files) +merge eval-memory --on --off [--out ] # P0 memory ablation: compare memory=on vs memory=off effectiveness reports ``` +To produce a `memory=off` run for the ablation, set `memory.inject_enabled: false` +in `.merge/config.yaml` and re-run on the same dataset; each run persists a +`memory_effectiveness.json` under its run dir that `merge eval-memory` consumes. + ## Required Environment Variables Each agent reads its API key from its own env var — no key is hardcoded: diff --git a/src/cli/main.py b/src/cli/main.py index 6bc1f35..0dd75ca 100644 --- a/src/cli/main.py +++ b/src/cli/main.py @@ -462,6 +462,64 @@ def validate_config_and_env(config: MergeConfig) -> list[str]: return errors +@cli.command("eval-memory") +@click.option( + "--on", + "on_path", + required=True, + type=click.Path(exists=True), + help="memory=on run: a memory_effectiveness.json file or its run directory", +) +@click.option( + "--off", + "off_path", + required=True, + type=click.Path(exists=True), + help="memory=off run: a memory_effectiveness.json file or its run directory " + "(produce one by setting memory.inject_enabled: false in config.yaml)", +) +@click.option( + "--out", + "out_path", + required=False, + default=None, + type=click.Path(), + help="optional path to write the ablation comparison as JSON", +) +def eval_memory_command(on_path: str, off_path: str, out_path: str | None) -> None: + """P0: compare a memory=on vs memory=off run and report the decision lift. + + Offline and read-only — consumes the memory_effectiveness.json each run + persists at report time. The acceptance gate (lift > 0 AND harmful rate + not rising) is defined in doc/evaluation/acceptance.md. 
+ """ + from src.tools.memory_replay import ( + build_ablation_comparison, + load_effectiveness_report, + render_ablation_table, + ) + + try: + report_on = load_effectiveness_report(on_path) + report_off = load_effectiveness_report(off_path) + except (FileNotFoundError, ValueError) as e: + console.print(f"[red]Failed to load effectiveness report: {e}[/red]") + sys.exit(1) + + comparison = build_ablation_comparison(report_on, report_off) + console.print(render_ablation_table(comparison)) + + if out_path: + try: + Path(out_path).write_text( + comparison.model_dump_json(indent=2), encoding="utf-8" + ) + console.print(f"[green]Wrote ablation comparison to {out_path}[/green]") + except OSError as e: + console.print(f"[red]Failed to write {out_path}: {e}[/red]") + sys.exit(1) + + cli.add_command(_forks_profile_group) diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py index c340c3a..bd09ed8 100644 --- a/src/core/orchestrator.py +++ b/src/core/orchestrator.py @@ -597,8 +597,14 @@ def _should_llm_extract(self, phase: str, state: MergeState) -> bool: def _inject_memory(self) -> None: memory_cfg = getattr(self.config, "memory", None) + # P0 ablation: when inject_enabled is False, leave each agent's store + # at None so get_memory_context() returns "" — the "memory=off" arm. + # Extraction/write-back still run at the orchestrator level; only + # read-time prompt injection is suppressed. 
+ inject_enabled = getattr(memory_cfg, "inject_enabled", True) for agent in self._all_agents: - agent.set_memory_store(self._memory_store) # type: ignore[arg-type] + if inject_enabled: + agent.set_memory_store(self._memory_store) # type: ignore[arg-type] agent.set_memory_hit_tracker(self._memory_hit_tracker) agent.set_memory_config(memory_cfg) agent.set_upstream_ref(self.config.upstream_ref) diff --git a/src/models/config.py b/src/models/config.py index 7fbdf35..872fe91 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -961,6 +961,15 @@ class MemoryExtractionConfig(BaseModel): description="OPP-5: minimum pass+fail observations before an entry's " "confidence is nudged, so a single run cannot move it.", ) + inject_enabled: bool = Field( + default=True, + description="P0 ablation switch: when False, no memory context is " + "injected into any agent prompt (the orchestrator skips wiring the " + "store onto agents, so get_memory_context returns empty). Used to " + "produce the 'memory=off' arm of the memory-effectiveness ablation " + "(merge eval-memory). Extraction/write-back are unaffected — only " + "read-time injection is suppressed. Default True (normal behaviour).", + ) class RenameDetectionConfig(BaseModel): diff --git a/src/tools/memory_replay.py b/src/tools/memory_replay.py new file mode 100644 index 0000000..d98fe37 --- /dev/null +++ b/src/tools/memory_replay.py @@ -0,0 +1,98 @@ +"""P0: offline memory ablation harness (read-only). + +Consumes the ``memory_effectiveness.json`` artifacts that a run persists at +report time (one from a ``memory=on`` run, one from a ``memory=off`` run on +the same dataset — see ``MemoryExtractionConfig.inject_enabled``) and produces +the ablation comparison that answers "did injected memory actually improve +merge decisions?". This is the first real caller of +``compare_memory_effectiveness``. + +Pure and offline: it reads already-persisted JSON, makes no LLM calls, and +never touches a decision path. 
The acceptance gate (lift > 0 AND harmful rate +not rising) is defined in ``doc/evaluation/acceptance.md``; this module only +loads, compares, and renders. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +from src.models.memory_effectiveness import ( + MemoryAblationComparison, + MemoryEffectivenessReport, +) +from src.tools.memory_eval import compare_memory_effectiveness + +REPORT_FILENAME = "memory_effectiveness.json" + + +def _resolve_report_path(path: str | Path) -> Path: + """Resolve a user-supplied path to the effectiveness JSON file. + + Accepts either the JSON file directly or a run directory containing + ``memory_effectiveness.json``. Raises ``FileNotFoundError`` with an + actionable message when neither resolves. + """ + p = Path(path) + if p.is_dir(): + candidate = p / REPORT_FILENAME + if not candidate.is_file(): + raise FileNotFoundError( + f"no {REPORT_FILENAME} in run directory {p} — was the run " + f"completed with memory effectiveness reporting enabled?" 
+ ) + return candidate + if not p.is_file(): + raise FileNotFoundError( + f"effectiveness report not found: {p} (expected a " + f"{REPORT_FILENAME} file or a run directory containing it)" + ) + return p + + +def load_effectiveness_report(path: str | Path) -> MemoryEffectivenessReport: + """Load a persisted ``MemoryEffectivenessReport`` from a JSON file or run dir.""" + report_path = _resolve_report_path(path) + raw = json.loads(report_path.read_text(encoding="utf-8")) + return MemoryEffectivenessReport.model_validate(raw) + + +def build_ablation_comparison( + memory_on: MemoryEffectivenessReport, + memory_off: MemoryEffectivenessReport, +) -> MemoryAblationComparison: + """Compare the on/off effectiveness reports (wraps the eval analyzer).""" + return compare_memory_effectiveness(memory_on, memory_off) + + +def _pct(value: float) -> str: + return f"{value * 100:.2f}%" + + +def render_ablation_table(cmp: MemoryAblationComparison) -> str: + """Render the ablation comparison as a plain markdown table. + + The verdict line restates the convenience ``memory_beneficial`` flag + (lift > 0); the full acceptance gate also requires the harmful-influence + rate not to rise over time (see ``doc/evaluation/acceptance.md``). 
+ """ + lift = cmp.memory_decision_lift + sign = "+" if lift > 0 else "" + verdict = "BENEFICIAL (lift > 0)" if cmp.memory_beneficial else "NOT beneficial" + return "\n".join( + [ + "| Metric | memory=on | memory=off |", + "|---|---|---|", + f"| run_id | `{cmp.on_run_id}` | `{cmp.off_run_id}` |", + f"| overall_correct_rate | {_pct(cmp.overall_correct_rate_on)} " + f"| {_pct(cmp.overall_correct_rate_off)} |", + "", + f"**memory_decision_lift**: {sign}{lift:.4f} " + f"({_pct(lift) if lift >= 0 else '-' + _pct(-lift)})", + "", + f"**harmful_influence_rate (on)**: {_pct(cmp.harmful_influence_rate_on)}", + "", + f"**Verdict**: {verdict}", + ] + ) diff --git a/tests/unit/test_memory_replay.py b/tests/unit/test_memory_replay.py new file mode 100644 index 0000000..02e93ca --- /dev/null +++ b/tests/unit/test_memory_replay.py @@ -0,0 +1,192 @@ +"""PR-0a: offline memory ablation harness + inject_enabled switch tests.""" + +import json + +import pytest + +from src.models.config import MemoryExtractionConfig +from src.models.memory_effectiveness import MemoryEffectivenessReport +from src.tools.memory_replay import ( + REPORT_FILENAME, + build_ablation_comparison, + load_effectiveness_report, + render_ablation_table, +) + + +def _report(run_id: str, correct_rate: float, harmful_rate: float = 0.0): + return MemoryEffectivenessReport( + run_id=run_id, + total_judged_decisions=10, + overall_correct_rate=correct_rate, + memory_influenced_decisions=4, + correct_after_influence=3, + harmful_influence_count=1, + correct_rate_after_influence=0.75, + harmful_influence_rate=harmful_rate, + total_tracked_entries=2, + effective_observations=4, + ) + + +# --- loading ---------------------------------------------------------------- + + +def test_load_from_json_file(tmp_path): + report = _report("run-on", 0.9) + p = tmp_path / REPORT_FILENAME + p.write_text(report.model_dump_json(), encoding="utf-8") + loaded = load_effectiveness_report(p) + assert loaded == report + + +def 
test_load_from_run_directory(tmp_path): + report = _report("run-off", 0.7) + (tmp_path / REPORT_FILENAME).write_text(report.model_dump_json(), encoding="utf-8") + loaded = load_effectiveness_report(tmp_path) + assert loaded.run_id == "run-off" + + +def test_load_missing_file_raises(tmp_path): + with pytest.raises(FileNotFoundError): + load_effectiveness_report(tmp_path / "nope.json") + + +def test_load_dir_without_report_raises(tmp_path): + with pytest.raises(FileNotFoundError, match=REPORT_FILENAME): + load_effectiveness_report(tmp_path) + + +def test_load_rejects_malformed_json(tmp_path): + p = tmp_path / REPORT_FILENAME + p.write_text(json.dumps({"run_id": "x"}), encoding="utf-8") # missing fields + with pytest.raises(Exception): + load_effectiveness_report(p) + + +# --- comparison ------------------------------------------------------------- + + +def test_comparison_positive_lift(): + cmp = build_ablation_comparison(_report("on", 0.9), _report("off", 0.7)) + assert cmp.memory_decision_lift == pytest.approx(0.2) + assert cmp.memory_beneficial is True + assert cmp.on_run_id == "on" + assert cmp.off_run_id == "off" + + +def test_comparison_non_positive_lift_not_beneficial(): + cmp = build_ablation_comparison(_report("on", 0.7), _report("off", 0.7)) + assert cmp.memory_decision_lift == pytest.approx(0.0) + assert cmp.memory_beneficial is False + + +# --- rendering -------------------------------------------------------------- + + +def test_render_table_contains_key_figures(): + cmp = build_ablation_comparison(_report("on", 0.9, 0.25), _report("off", 0.7)) + table = render_ablation_table(cmp) + assert "memory_decision_lift" in table + assert "BENEFICIAL" in table + assert "25.00%" in table # harmful_influence_rate_on + assert "`on`" in table and "`off`" in table + + +def test_render_table_negative_lift(): + cmp = build_ablation_comparison(_report("on", 0.6), _report("off", 0.8)) + table = render_ablation_table(cmp) + assert "NOT beneficial" in table + assert "-" in 
table # negative lift rendered with sign + + +# --- inject_enabled ablation switch ----------------------------------------- + + +def test_inject_enabled_defaults_true(): + assert MemoryExtractionConfig().inject_enabled is True + + +def test_inject_disabled_skips_store_wiring(): + """When inject_enabled is False, _inject_memory must leave each agent's + store at None so get_memory_context() returns empty (the memory=off arm).""" + + class _Agent: + def __init__(self): + self.store = "UNSET" + self.tracker = None + self.cfg = None + self.upstream = None + + def set_memory_store(self, store): + self.store = store + + def set_memory_hit_tracker(self, tracker): + self.tracker = tracker + + def set_memory_config(self, cfg): + self.cfg = cfg + + def set_upstream_ref(self, ref): + self.upstream = ref + + class _Cfg: + memory = MemoryExtractionConfig(inject_enabled=False) + upstream_ref = "upstream/main" + + class _Orch: + config = _Cfg() + _memory_store = object() + _memory_hit_tracker = object() + + def __init__(self): + self._all_agents = [_Agent()] + + from src.core.orchestrator import Orchestrator + + orch = _Orch() + Orchestrator._inject_memory(orch) # type: ignore[arg-type] + agent = orch._all_agents[0] + # store-wiring skipped → stays at the sentinel "UNSET" (never set to None + # either, but crucially never set to the real store) + assert agent.store == "UNSET" + assert agent.tracker is orch._memory_hit_tracker + assert agent.cfg is orch.config.memory + + +def test_inject_enabled_wires_store(): + class _Agent: + store = None + tracker = None + cfg = None + upstream = None + + def set_memory_store(self, store): + self.store = store + + def set_memory_hit_tracker(self, tracker): + self.tracker = tracker + + def set_memory_config(self, cfg): + self.cfg = cfg + + def set_upstream_ref(self, ref): + self.upstream = ref + + class _Cfg: + memory = MemoryExtractionConfig(inject_enabled=True) + upstream_ref = "upstream/main" + + class _Orch: + config = _Cfg() + 
_memory_store = object() + _memory_hit_tracker = object() + + def __init__(self): + self._all_agents = [_Agent()] + + from src.core.orchestrator import Orchestrator + + orch = _Orch() + Orchestrator._inject_memory(orch) # type: ignore[arg-type] + assert orch._all_agents[0].store is orch._memory_store From 0ecba9552823a07f650abf578a2d927d1217e167 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 05:29:45 -0400 Subject: [PATCH 02/22] =?UTF-8?q?docs(eval):=20=E5=AE=9A=E4=B9=89=20P0=20?= =?UTF-8?q?=E8=AE=B0=E5=BF=86=E6=9C=89=E6=95=88=E6=80=A7=E6=8C=87=E6=A0=87?= =?UTF-8?q?=E4=B8=8E=E5=8F=8D=E9=A6=88=E7=8E=AF=E6=BF=80=E6=B4=BB=E9=97=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 把 PR-0a 产出的度量写进权威评估文档,并把"反馈环默认开启"的硬前置 从代码注释固化为可评审验收门(兑现方案原则 P2 先度量再激活)。 - metrics.md §9:记忆有效性指标 MDL/HIR/CRI/MID/PEE/MCPD,含公式、 数据源(对齐 memory_effectiveness.json 字段)、信号通路;标注 P1-C/P2-B 待补指标 - acceptance.md §3:自学习反馈环激活门(MDL>0 硬前置 + HIR 不升 + CRI≥off基线 + MCPD≤×1.15)含判定流程;原 §3–§6 顺延为 §4–§7 - dependency-graph-optimization-plan.md:修正顺延失效的交叉引用 acceptance.md §3→§4(report schema) 激活门为独立章节(≠ 合并质量门),acceptance_thresholds.yaml 仅镜像 §1/§2 故不动。 --- doc/evaluation/acceptance.md | 34 +++++++- doc/evaluation/metrics.md | 83 +++++++++++++++++++ .../dependency-graph-optimization-plan.md | 2 +- 3 files changed, 114 insertions(+), 5 deletions(-) diff --git a/doc/evaluation/acceptance.md b/doc/evaluation/acceptance.md index 056d0d5..811c3c1 100644 --- a/doc/evaluation/acceptance.md +++ b/doc/evaluation/acceptance.md @@ -42,7 +42,33 @@ --- -## 3. 报告必备元数据 +## 3. 
自学习反馈环激活门(Phase 1 前置) + +> 这组门**不**判定一次合并 run 的好坏,而是决定自学习方案 +> (`doc/plan/self-learning-system.md`)的反馈环——OPP-5 写回(P1-B)、持久化 +> suppress(P1-A)——能否从 opt-in 翻为**默认开启**。设计原则 P2「先度量再激活」: +> 任一反馈环默认开启前,必须先用 `merge eval-memory` 在固定数据集上跑出消融基线 +> 证明净收益为正。指标定义见 metrics.md §9。 + +| 门 | 阈值 | 数据源 | 作用 | +|---|---|---|---| +| `MDL` 记忆决策增益 | **> 0** | `merge eval-memory`(on/off 消融)| 任一反馈环默认开启的**硬前置**;≤ 0 则保持 opt-in | +| `HIR` 有害影响率 | **不高于同数据集 off 基线** | `memory_effectiveness.json` | 上升即说明记忆在污染决策,禁止默认开启 | +| `CRI` 影响后正确率 | **≥ off 基线 overall_correct_rate** | `memory_effectiveness.json` | 被记忆改变的决策不得比无记忆更差 | +| `MCPD` 单决策记忆成本 | **≤ off 基线 × 1.15** | `CostTracker` | 防止记忆注入让 prompt 成本悄悄回退 | + +**判定流程**: +1. 同数据集跑 `memory=on`(默认)与 `memory=off`(config `memory.inject_enabled: false`)两 run; +2. `merge eval-memory --on --off ` 产出 `MemoryAblationComparison`; +3. `MDL > 0` 且 `HIR` 不升 → 允许把对应反馈环 default 翻为 `True`,并在本文件 §5 历史区记录基线数; +4. 任一门未过 → 反馈环维持 opt-in,记录原因。 + +> 这是"默认开启"的闸口,不是合并质量的一票否决;故归为独立章节,与 §1/§2 的合并 +> 质量门互不替代。 + +--- + +## 4. 报告必备元数据 `eval_acceptance_.json` 必须含: @@ -70,7 +96,7 @@ --- -## 4. 版本基线历史 +## 5. 版本基线历史 | 版本 | 评估时间 | 数据集 lock | 主要结果 | 备注 | |---|---|---|---|---| @@ -80,7 +106,7 @@ --- -## 5. 阈值修改流程 +## 6. 阈值修改流程 修改任何阈值必须: @@ -91,7 +117,7 @@ --- -## 6. 用户对外承诺模板 +## 7. 用户对外承诺模板 通过 acceptance gate 后,可向用户输出如下承诺(示例): diff --git a/doc/evaluation/metrics.md b/doc/evaluation/metrics.md index fe87011..a563cb1 100644 --- a/doc/evaluation/metrics.md +++ b/doc/evaluation/metrics.md @@ -320,3 +320,86 @@ BCP = | 配置了 build_check 且退出码 0 的 run | / | 配置了 build_check 数据源:judge 阶段 `_run_build_check`(command 由 setup 自动探测填充,方案1)。非零退出 把 Judge PASS 降级 FAIL+veto。Acceptance(Soft): **BCP = 100%**(仅统计已配置 command 的 run;未探测到工具链的目标不计入分母)。 + +--- + +## 9. 
记忆有效性指标(自学习度量,P0 底座) + +这一组指标量化"注入的跨 run 记忆是否真的让合并决策更好"——自学习方案 +(`doc/plan/self-learning-system.md`)的开放问题 1。全部**只读、执行接地**:正确/有害 +信号取自 Judge 终判的 `passed_files` / `failed_files`(与 `record_outcome` 同源),不取 +LLM 自报。 + +> 信号通路:`MemoryHitTracker` 记录本 run 每个文件的记忆注入 → report 阶段 +> `compute_memory_effectiveness`(`src/tools/memory_eval.py`)与 Judge verdict 求交集 → +> 持久化 `runs//memory_effectiveness.json`。两次 run(`memory=on` vs +> `memory=off`,由 `memory.inject_enabled` 切换)的报告经 `merge eval-memory` +> (`src/tools/memory_replay.py`)对比产出 §9.1。 +> +> **影响决策口径**:`influenced = injected_files ∩ (passed_files ∪ failed_files)`。 +> 注入图为 run-local(不持久化),故 §9.2–§9.4 是单 run 量;§9.5 的 per-entry 功过 +> 经 tracker sidecar 跨 run 累计。 + +### 9.1 记忆决策增益(Memory Decision Lift, MDL) + +> 消融口径:同一数据集、同配置跑两遍,仅 `memory.inject_enabled` 不同。 + +``` +MDL = overall_correct_rate(memory=on) − overall_correct_rate(memory=off) +overall_correct_rate = |passed_files| / (|passed_files| + |failed_files|) +``` + +数据源:`MemoryAblationComparison.memory_decision_lift`。**MDL > 0 是"学到了"的 +最小证据**,也是 Phase 1 任一反馈环默认开启的硬前置(见 acceptance.md §3)。 + +### 9.2 有害影响率(Harmful Influence Rate, HIR) + +> 被记忆注入"影响"且最终 fail 的决策占比——F2(检索污染/有害记忆)的直接度量。 + +``` +HIR = |injected ∩ failed_files| / |influenced| (influenced=0 时记 0) +``` + +数据源:`MemoryEffectivenessReport.harmful_influence_rate`。P1-A(持久化 suppress)的 +优化目标是在"tracker 重置"场景下 HIR 不回升。 + +### 9.3 影响后正确率(Correct Rate After Influence, CRI) + +``` +CRI = |injected ∩ passed_files| / |influenced| (influenced=0 时记 0) +``` + +数据源:`MemoryEffectivenessReport.correct_rate_after_influence`。P1-B(激活并加固 +OPP-5 写回)的优化目标是 CRI 上升、per-entry 分布右移。 + +### 9.4 影响决策数(Memory Influenced Decisions, MID) + +``` +MID = |injected_files ∩ (passed_files ∪ failed_files)| +``` + +数据源:`MemoryEffectivenessReport.memory_influenced_decisions`。MID 是 §9.2/§9.3 的 +分母——MID 过小(如 < 5)时,HIR/CRI 抽样不足,MDL 才是更稳的总体判据。 + +### 9.5 单条目有效性(Per-Entry Effectiveness, PEE) + +``` +PEE[e] = (pass[e] − fail[e]) / (pass[e] + fail[e]) ∈ [−1, +1] +``` + 
+数据源:`MemoryHitTracker.outcome_scores()` / `summary()['outcomes']` 的 top_helpful / +top_harmful 榜(跨 run 累计)。`PEE ≤ −0.5 且 min_observations 满足` 即 `harmful_entry_ids` +判据——O-M6 注入期过滤的依据,也是 P1-A 固化 suppress 的输入。 + +### 9.6 单决策记忆成本(Memory Cost Per Decision, MCPD) + +``` +MCPD = cost_usd_per_run / F_eval +``` + +数据源:`CostTracker` + `F_eval`。记忆注入增大 prompt,开启反馈环不得让 MCPD 显著上升 +(acceptance.md §3)。 + +> **后续指标(Phase 1-C / 2-B 落地后补充)**:`repeat_error_repair_rounds`(同 +> error_signature 平均修复轮数,需 P1-C 的 `summarize_judge_repair_rounds` 按签名聚合)、 +> `memory_drift_loss`(consolidation 前后 pinned 条目内容差异,P2-B,期望 = 0)。 diff --git a/doc/references/dependency-graph-optimization-plan.md b/doc/references/dependency-graph-optimization-plan.md index f1050c3..2928575 100644 --- a/doc/references/dependency-graph-optimization-plan.md +++ b/doc/references/dependency-graph-optimization-plan.md @@ -350,7 +350,7 @@ God Node(高 degree 节点)→ 命中的改动文件风险提升。 1. **冒烟**:forgejo 上各跑一次 Control / Treatment-core,diff 三份报告,确认每个消费方 ≥1 次触发(§11.3 计数)——先证「活」。 2. **小集 A/B**:3 个 C-class + `t1-0031..0033` 各跑 N=3(`DET` 口径),算聚焦指标位移。 3. **归因**(若收益显著):按 §11.1 单开关回退定位主要贡献者。 -4. **记录**:结果落 `eval_acceptance_.json`(acceptance.md §3 schema),基线历史表追加一行; +4. 
**记录**:结果落 `eval_acceptance_.json`(acceptance.md §4 schema),基线历史表追加一行; 用本轮真实数据回填 §7.3/§8.5 标注「未标定」的阈值(`god_node_min_dependents=8` / `god_node_risk_bump=0.15` / `_MAX_TOPO_ISSUES=25`)的标定建议。 From 2a5ddd349e22df3a14ae4815c2ec8fc4d8894026 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 05:32:49 -0400 Subject: [PATCH 03/22] =?UTF-8?q?feat(web):=20=E5=86=B2=E7=AA=81=E5=86=B3?= =?UTF-8?q?=E7=AD=96=E6=8F=90=E4=BA=A4=E5=90=8E=E8=87=AA=E5=8A=A8=E8=B7=B3?= =?UTF-8?q?=E8=BD=AC=E4=B8=8B=E4=B8=80=E4=B8=AA=E5=BE=85=E5=86=B3=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 多文件人工决策时,提交一个文件后视图仍停留在已决文件上,剩余 待决文件(尤其是 conflict_points 为空、右侧无明细的升级文件) 极易被遗漏,导致 run 一直停在 AWAITING_HUMAN。 submitCurrent 发送决策后乐观跳转到下一个 human_decision 仍为 空的待决文件;保留手动点击已决文件查看的能力,不做强制弹走。 --- web/src/views/ConflictResolution.test.tsx | 65 +++++++++++++++++++++++ web/src/views/ConflictResolution.tsx | 8 +++ 2 files changed, 73 insertions(+) diff --git a/web/src/views/ConflictResolution.test.tsx b/web/src/views/ConflictResolution.test.tsx index 293789d..c132a70 100644 --- a/web/src/views/ConflictResolution.test.tsx +++ b/web/src/views/ConflictResolution.test.tsx @@ -222,6 +222,71 @@ describe("ConflictResolution submit payload (H3)", () => { }); }); +describe("ConflictResolution auto-advance after submit", () => { + it("selects the next still-pending file after a single submit", () => { + act(() => { + const store = useConflictDraftStore.getState(); + store.setDraftDecision("a.py", "take_current"); + store.selectFile("a.py"); + }); + + const ref = makeClientRef(); + const { getByText } = render( + ["current"] + > + } + />, + ); + act(() => { + getByText("Submit decision").click(); + }); + + expect(sendSpy).toHaveBeenCalledTimes(1); + // a.py was resolved; the view must move the operator onto b.py so the + // remaining decision is never silently skipped. 
+ expect(useConflictDraftStore.getState().selectedFile).toBe("b.py"); + }); + + it("stays put when the submitted file is the last pending one", () => { + const lastPendingSnapshot: MergeStateSnapshot = { + ...baseSnapshot, + humanDecisionRequests: { + "a.py": { + ...baseSnapshot.humanDecisionRequests["a.py"], + human_decision: "take_current", + }, + "b.py": baseSnapshot.humanDecisionRequests["b.py"], + }, + }; + useRunStore.setState({ snapshot: lastPendingSnapshot }); + act(() => { + const store = useConflictDraftStore.getState(); + store.setDraftDecision("b.py", "take_target"); + store.selectFile("b.py"); + }); + + const ref = makeClientRef(); + const { getByText } = render( + ["current"] + > + } + />, + ); + act(() => { + getByText("Submit decision").click(); + }); + + expect(sendSpy).toHaveBeenCalledTimes(1); + expect(useConflictDraftStore.getState().selectedFile).toBe("b.py"); + }); +}); + describe("ConflictResolution submit feedback", () => { it("shows a submitted banner + Resubmit label once the file is decided", () => { const decidedSnapshot: MergeStateSnapshot = { diff --git a/web/src/views/ConflictResolution.tsx b/web/src/views/ConflictResolution.tsx index 2c404b9..8332164 100644 --- a/web/src/views/ConflictResolution.tsx +++ b/web/src/views/ConflictResolution.tsx @@ -255,6 +255,14 @@ export function ConflictResolution({ clientRef }: Props): JSX.Element { if (!current || !currentDraft) return; if (validateDraft(currentDraft) !== null) return; sendSingle(current.file_path, currentDraft); + // Auto-advance to the next still-pending file so a multi-file review + // can't strand the operator on the file they just resolved and silently + // miss a remaining decision (e.g. an escalated file with empty + // conflict_points renders no detail and is easy to overlook). Optimistic: + // the just-submitted file is still in ``pending`` until the next snapshot + // confirms it, so exclude it explicitly here. 
+ const next = pending.find((r) => r.file_path !== current.file_path); + if (next) selectFile(next.file_path); }; const draftCount = Object.keys(drafts).length; From 50c1eed2efc4a15304e4aaa0e456ce212b64bd2d Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 05:32:57 -0400 Subject: [PATCH 04/22] =?UTF-8?q?fix(core):=20=E6=89=A7=E8=A1=8C=E4=BA=BA?= =?UTF-8?q?=E5=B7=A5=E5=86=B3=E7=AD=96=E6=9C=9F=E9=97=B4=E5=88=87=E6=8D=A2?= =?UTF-8?q?=E7=8A=B6=E6=80=81=E4=B8=BA=20AUTO=5FMERGING?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit human_review 在 resume 后一次性同步执行已提交的决策(大文件 chunked 语义合并可能耗时数分钟),期间 status 始终为 AWAITING_HUMAN,Web UI 持续展示决策表单,用户误以为 run 卡死。 在 Case 1 执行循环前将状态从 AWAITING_HUMAN 切到 AUTO_MERGING,经 transition observer 自动广播新快照,前端 classifyView 据此离开决策 gate 转入实时进度。仅改内存状态与广播,checkpoint 仍按 PhaseOutcome 在相位边界写入,resume 语义不变;后续转入 JUDGE_REVIEWING / ANALYZING_CONFLICTS 及失败再升级 AWAITING_HUMAN 均为合法转换。 --- src/core/phases/human_review.py | 16 ++++++++++++++++ .../test_human_override_and_merge_fidelity.py | 10 ++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/core/phases/human_review.py b/src/core/phases/human_review.py index 079fdae..a437827 100644 --- a/src/core/phases/human_review.py +++ b/src/core/phases/human_review.py @@ -247,6 +247,22 @@ async def execute(self, state: MergeState, ctx: PhaseContext) -> PhaseOutcome: if req.human_decision is None ] if not pending: + # UX: once every conflict decision is in, the phase is about + # to spend a long time applying them (chunked semantic merges + # on large files can run for minutes). Leaving status at + # AWAITING_HUMAN throughout makes the Web UI keep showing the + # decision gate — operators think the run is stuck waiting on + # them when it is actually busy executing. Flip to AUTO_MERGING + # before the loop so the transition observer pushes a fresh + # snapshot and the UI moves off the gate to live progress. 
The + # terminal transition (JUDGE_REVIEWING / ANALYZING_CONFLICTS) + # below is still valid from AUTO_MERGING. + if state.status != SystemStatus.AUTO_MERGING: + ctx.state_machine.transition( + state, + SystemStatus.AUTO_MERGING, + "executing human conflict decisions", + ) executor = ctx.agents["executor"] executed = 0 for req in state.human_decision_requests.values(): diff --git a/tests/unit/test_human_override_and_merge_fidelity.py b/tests/unit/test_human_override_and_merge_fidelity.py index f18772e..818175e 100644 --- a/tests/unit/test_human_override_and_merge_fidelity.py +++ b/tests/unit/test_human_override_and_merge_fidelity.py @@ -156,6 +156,16 @@ async def test_human_override_executes_over_stale_auto_record() -> None: assert state.file_decision_records["auto.go"].decision == MergeDecision.TAKE_TARGET assert outcome.target_status == SystemStatus.JUDGE_REVIEWING + # UX: executing the decisions must first flip status off the AWAITING_HUMAN + # gate to AUTO_MERGING so the Web UI stops showing the decision form while + # the (potentially minutes-long) merges run. The AUTO_MERGING transition + # must precede the terminal JUDGE_REVIEWING transition. 
+ transitioned = [c.args[1] for c in ctx.state_machine.transition.call_args_list] + assert SystemStatus.AUTO_MERGING in transitioned + assert transitioned.index(SystemStatus.AUTO_MERGING) < transitioned.index( + SystemStatus.JUDGE_REVIEWING + ) + # --------------------------------------------------------------------------- # # Bug B sibling — judge dispute-round repair must not overwrite a human record From 71e1fecd57ff46ae97145a6825f657f4546fb568 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 05:39:54 -0400 Subject: [PATCH 05/22] =?UTF-8?q?docs:=20=E6=95=B4=E7=90=86=20doc/=20?= =?UTF-8?q?=E7=9B=AE=E5=BD=95=E7=BB=93=E6=9E=84=E5=B9=B6=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=20README=20=E7=B4=A2=E5=BC=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 将根目录散落的 8 个文件归入对应子目录: - web-ui.md / web-ui-redesign-handoff.md → modules/ - forks-profile-init.md → modules/forks-profile.md(改名去歧义) - migration-aware-merge.md / risk-levels.md → modules/ - multi-agent-optimization-from-merge-experience.md → references/multi-agent-optimization.md - large-scale-file-processing-optimization.md → plan/ - execute/implementation-notes.md → plan/(删除单文件目录 execute/) 同步修复 3 处内部链接(onboarding.md、insforge 测试报告)。 重写 README.md:补齐此前缺失的 bugfix/、evaluation/、review/、 test-report/ 四个目录的说明,更新时间戳至 2026-05-31。 --- doc/README.md | 150 +++++++++++++++--- .../forks-profile.md} | 0 doc/{ => modules}/migration-aware-merge.md | 0 doc/modules/onboarding.md | 8 +- doc/{ => modules}/risk-levels.md | 0 doc/{ => modules}/web-ui-redesign-handoff.md | 0 doc/{ => modules}/web-ui.md | 0 doc/{execute => plan}/implementation-notes.md | 0 ...arge-scale-file-processing-optimization.md | 0 .../multi-agent-optimization.md} | 0 ...insforge-v2.1.0-merge-report-2026-05-06.md | 4 +- 11 files changed, 133 insertions(+), 29 deletions(-) rename doc/{forks-profile-init.md => modules/forks-profile.md} (100%) rename doc/{ => modules}/migration-aware-merge.md (100%) rename doc/{ => 
modules}/risk-levels.md (100%) rename doc/{ => modules}/web-ui-redesign-handoff.md (100%) rename doc/{ => modules}/web-ui.md (100%) rename doc/{execute => plan}/implementation-notes.md (100%) rename doc/{ => plan}/large-scale-file-processing-optimization.md (100%) rename doc/{multi-agent-optimization-from-merge-experience.md => references/multi-agent-optimization.md} (100%) diff --git a/doc/README.md b/doc/README.md index fcd58f1..c358dde 100644 --- a/doc/README.md +++ b/doc/README.md @@ -1,49 +1,150 @@ # 文档索引 -> **最后更新**:2026-04-17 -> 中文版文档。英文版后续补充。 +> **最后更新**:2026-05-31 +> 中文版文档。 --- ## 新人从这里开始 -- [**新人上手指南**](modules/onboarding.md) — 环境、阅读顺序、常见改动起点 +- [**新人上手指南**](modules/onboarding.md) — 环境搭建、阅读顺序、常见改动起点 -## 核心设计文档 +--- + +## 核心设计文档(根目录) | 文档 | 说明 | |---|---| | [architecture.md](architecture.md) | 系统架构总览:分层、数据流、持久化、扩展点 | | [flow.md](flow.md) | 状态机与 8 个 Phase 的执行流程 | -| [risk-levels.md](risk-levels.md) | `RiskLevel` 枚举定义与触发条件 | -| [migration-aware-merge.md](migration-aware-merge.md) | 迁移感知合并(bulk-copy 场景) | -| [multi-agent-optimization-from-merge-experience.md](multi-agent-optimization-from-merge-experience.md) | 六大丢失模式 + P0/P1/P2 加固项(最新) | -| [forks-profile-init.md](forks-profile-init.md) | `merge forks-profile init` / `diff` 起草+增量审阅;§9 整合后已接入首次向导(按阈值触发 init)+ 主流程 drift 提示(写入 plan 报告附录) | + +--- ## 模块技术文档(`modules/`) -| 模块 | 文档 | +| 文档 | 说明 | +|---|---| +| [data-models.md](modules/data-models.md) | Pydantic 数据模型字段详解 | +| [agents.md](modules/agents.md) | 各 Agent 职责、模型选择、合作模式 | +| [core.md](modules/core.md) | Orchestrator / StateMachine / Checkpoint / Phases | +| [tools.md](modules/tools.md) | 扫描器 / 门禁 / Git 工具 | +| [llm.md](modules/llm.md) | LLM 路由、成本控制、熔断、压缩 | +| [memory.md](modules/memory.md) | 三层记忆系统、跨 run 持久化 | +| [cli.md](modules/cli.md) | CLI 命令、Web UI 与后端 WebSocket 通信 | +| [web-ui.md](modules/web-ui.md) | Web UI 组件设计与状态管理 | +| [web-ui-redesign-handoff.md](modules/web-ui-redesign-handoff.md) | Web UI 重设计交付说明 | +| 
[forks-profile.md](modules/forks-profile.md) | `merge forks-profile init/diff`;drift 检测;首次向导触发阈值 | +| [migration-aware-merge.md](modules/migration-aware-merge.md) | 迁移感知合并(bulk-copy 场景) | +| [risk-levels.md](modules/risk-levels.md) | `RiskLevel` 枚举定义与触发条件 | +| [onboarding.md](modules/onboarding.md) | 新人上手指南 | + +--- + +## 计划与提案(`plan/`) + +| 文档 | 说明 | +|---|---| +| [roadmap.md](plan/roadmap.md) | 产品路线图与里程碑 | +| [self-learning-system.md](plan/self-learning-system.md) | 自学习系统方案(深研究支撑,2026-05-30) | +| [per-hunk-resolution.md](plan/per-hunk-resolution.md) | 细粒度 hunk 级别冲突解决方案 | +| [merge-safety-complete.md](plan/merge-safety-complete.md) | 合并安全完整方案 | +| [dead-code-remediation-and-compression-plan.md](plan/dead-code-remediation-and-compression-plan.md) | 死代码清理与上下文压缩计划 | +| [large-scale-file-processing-optimization.md](plan/large-scale-file-processing-optimization.md) | 大规模文件处理优化 | +| [implementation-notes.md](plan/implementation-notes.md) | 实施过程笔记 | + +--- + +## 合并质量审计(`review/`) + +记录一次深度合并质量 + LLM 幻觉处理路径审计,以及后续 Wave 实施日志。 + +| 文档 | 说明 | +|---|---| +| [README.md](review/README.md) | 审计背景与文档索引 | +| [00-audit-findings.md](review/00-audit-findings.md) | 根因分析与确认缺陷 | +| [01-optimization-plan.md](review/01-optimization-plan.md) | 12 项优化计划 | +| [02-implementation-log.md](review/02-implementation-log.md) | Wave 1–3 实施日志 | +| [03-production-readiness.md](review/03-production-readiness.md) | Wave 3 后生产就绪度评估 | +| [04-production-hardening-plan.md](review/04-production-hardening-plan.md) | Wave 4 加固计划 | +| [05-wave4-implementation-log.md](review/05-wave4-implementation-log.md) | Wave 4 实施日志 | +| [06-production-readiness-post-wave4.md](review/06-production-readiness-post-wave4.md) | Wave 4 后生产就绪度 | +| [07-wave5-residual-closure-plan.md](review/07-wave5-residual-closure-plan.md) | Wave 5 残余问题关闭计划 | + +--- + +## 评估体系(`evaluation/`) + +| 文档 | 说明 | |---|---| -| 数据模型 | [data-models.md](modules/data-models.md) | -| Agents | [agents.md](modules/agents.md) | -| Core(Orchestrator / 
StateMachine / Checkpoint / Phases) | [core.md](modules/core.md) | -| Tools(扫描器 / 门禁 / Git) | [tools.md](modules/tools.md) | -| LLM 层 | [llm.md](modules/llm.md) | -| 记忆系统 | [memory.md](modules/memory.md) | -| CLI / Web UI | [cli.md](modules/cli.md) | -| 新人指南 | [onboarding.md](modules/onboarding.md) | +| [README.md](evaluation/README.md) | 评估方案总览 | +| [metrics.md](evaluation/metrics.md) | 度量指标定义(含 P0 记忆有效性) | +| [acceptance.md](evaluation/acceptance.md) | 验收门槛 | +| [dataset.md](evaluation/dataset.md) | 数据集定义 | +| [procedure.md](evaluation/procedure.md) | 评估流程 | +| [EXECUTION_PLAN.md](evaluation/EXECUTION_PLAN.md) | 执行计划 | +| [IMPLEMENTATION_REPORT_PARTIAL.md](evaluation/IMPLEMENTATION_REPORT_PARTIAL.md) | 部分实施报告 | + +--- + +## 测试报告(`test-report/`) + +各版本与目标仓库的实测报告,按时间排列。 + +| 文档 | 说明 | +|---|---| +| [insforge-v2.1.0-merge-report-2026-05-06.md](test-report/insforge-v2.1.0-merge-report-2026-05-06.md) | InsForge v2.1.0 正式合并测试报告 | +| [dify-plugin-daemon-0.6.0-merge-validation.md](test-report/dify-plugin-daemon-0.6.0-merge-validation.md) | dify-plugin-daemon 0.6.0 合并验证 | +| [dify-plugins-upstream25-merge-test-2026-05-08.md](test-report/dify-plugins-upstream25-merge-test-2026-05-08.md) | dify-plugins upstream-25 合并测试 | +| [dify-plugins-upstream25-regression-2026-05-08.md](test-report/dify-plugins-upstream25-regression-2026-05-08.md) | dify-plugins upstream-25 回归分析 | +| [2026-05-01-dify-plugins-upstream10-validation.md](test-report/2026-05-01-dify-plugins-upstream10-validation.md) | dify-plugins upstream-10 验证 | +| [2026-05-10-planner-judge-optimizations-review.md](test-report/2026-05-10-planner-judge-optimizations-review.md) | Planner/Judge 优化 review | +| [forgejo-c-class-test-branches-2026-05-18.md](test-report/forgejo-c-class-test-branches-2026-05-18.md) | forgejo C-class 测试分支建立 | +| [forgejo-planner-judge-divergence-2026-05-18.md](test-report/forgejo-planner-judge-divergence-2026-05-18.md) | forgejo Planner/Judge 分歧分析 | +| 
[upstream-29-full-flow-analysis.md](test-report/upstream-29-full-flow-analysis.md) | upstream-29 全流程分析 | +| [upstream-36-commits-validation-report.md](test-report/upstream-36-commits-validation-report.md) | upstream-36 验证报告 | +| [upstream-50-commits-test-report.md](test-report/upstream-50-commits-test-report.md) | upstream-50 测试报告 | +| [merge-validation-report.md](test-report/merge-validation-report.md) | 通用合并验证报告 | +| [dify-plugin-daemon.md](test-report/dify-plugin-daemon.md) | dify-plugin-daemon 早期记录 | + +--- + +## BUG 分析记录(`bugfix/`) + +| 文档 | 说明 | +|---|---| +| [0527.md](bugfix/0527.md) | 2026-05-27 批次 BUG 分析与修复方案 | +| [0528-agent-prompt-engineering-review.md](bugfix/0528-agent-prompt-engineering-review.md) | 2026-05-28 Agent Prompt 工程化审查 | +| [0528-legacy-merge-base-attr.md](bugfix/0528-legacy-merge-base-attr.md) | 2026-05-28 遗留 merge_base 属性问题 | +| [0529-context-memory-opt-evaluation.md](bugfix/0529-context-memory-opt-evaluation.md) | 2026-05-29 上下文/记忆优化评估 | + +--- ## 参考开源项目分析(`references/`) -这些文档**不是系统设计**,而是对外部项目的学习笔记,用于提炼可借鉴的模式。 +外部项目学习笔记,提炼可借鉴的模式,**不是系统设计**。 -| 文件 | 项目 | 借鉴点 | +| 文档 | 项目 / 主题 | 借鉴点 | |---|---|---| | [graphify-analysis.md](references/graphify-analysis.md) | Graphify | 知识图谱压缩代码上下文 | | [mempalace-analysis.md](references/mempalace-analysis.md) | MemPalace | 语义索引 + 分层记忆 | | [hermes-inspired-improvements.md](references/hermes-inspired-improvements.md) | NousResearch/hermes-agent | Agent 架构与工具抽象 | +| [openai-agents-python-analysis.md](references/openai-agents-python-analysis.md) | openai-agents-python | 轻量 Agent 框架设计 | +| [claude-code-game-studios-analysis.md](references/claude-code-game-studios-analysis.md) | claude-code-game-studios | 多 Agent 游戏开发实证 | | [opensource-comparison.md](references/opensource-comparison.md) | 15+ 合并相关开源项目 | 对照分析与能力矩阵 | -| [enhanced-context-memory-proposal.md](references/enhanced-context-memory-proposal.md) | 综合 MemPalace + Graphify | 基于上述项目的增强方案蓝图 | +| 
[enhanced-context-memory-proposal.md](references/enhanced-context-memory-proposal.md) | 综合 MemPalace + Graphify | 增强方案蓝图 | +| [dependency-graph-optimization-plan.md](references/dependency-graph-optimization-plan.md) | 依赖图优化 | 基于 forgejo 实测的依赖图优化计划 | +| [multi-agent-optimization.md](references/multi-agent-optimization.md) | 合并实战经验 | 六大丢失模式 + P0/P1/P2 加固项 | + +--- + +## 分享材料(`share/`) + +| 文档 | 说明 | +|---|---| +| [agent-engineering-sharing.md](share/agent-engineering-sharing.md) | Agent 工程化经验分享 | +| [dependency-graph-deep-dive.html](share/dependency-graph-deep-dive.html) | 依赖图深度解析(HTML 演示) | + +--- ## 查找路径速查 @@ -55,12 +156,15 @@ Agent 各自用什么模型、职责边界 → modules/agents.md Checkpoint 怎么落盘、状态机怎么转移 → modules/core.md 某个具体扫描器原理(shadow / scar / sentinel…) → modules/tools.md - + multi-agent-optimization-from-merge-experience.md LLM 请求如何做成本/预算/熔断 → modules/llm.md 记忆在 Agent 间是如何传递的 → modules/memory.md 怎么用命令行、Web UI 怎么和后端通信 → modules/cli.md Pydantic 模型到底长什么样 → modules/data-models.md -fork 被 bulk-copy 迁移过怎么处理 → migration-aware-merge.md -为什么要设计这么多扫描器 → multi-agent-optimization-from-merge-experience.md +fork 被 bulk-copy 迁移过怎么处理 → modules/migration-aware-merge.md +六大丢失模式 / 为什么要设计这么多扫描器 → references/multi-agent-optimization.md 想学开源项目怎么做类似问题 → references/ +合并质量审计历史、Wave 实施记录 → review/ +实测合并报告(dify/forgejo/insforge) → test-report/ +BUG 分析与修复记录 → bugfix/ +评估指标、验收门槛、数据集定义 → evaluation/ ``` diff --git a/doc/forks-profile-init.md b/doc/modules/forks-profile.md similarity index 100% rename from doc/forks-profile-init.md rename to doc/modules/forks-profile.md diff --git a/doc/migration-aware-merge.md b/doc/modules/migration-aware-merge.md similarity index 100% rename from doc/migration-aware-merge.md rename to doc/modules/migration-aware-merge.md diff --git a/doc/modules/onboarding.md b/doc/modules/onboarding.md index d960f59..fef270f 100644 --- a/doc/modules/onboarding.md +++ b/doc/modules/onboarding.md @@ -87,9 +87,9 @@ mypy src # 应全绿(strict mode) | LLM 路由/压缩/缓存 | [`llm.md`](llm.md) | | 
三层记忆系统 | [`memory.md`](memory.md) | | CLI + Web UI | [`cli.md`](cli.md) | -| 六大丢失模式 + P0/P1/P2 加固项 | [`../multi-agent-optimization-from-merge-experience.md`](../multi-agent-optimization-from-merge-experience.md) | -| 迁移感知合并 | [`../migration-aware-merge.md`](../migration-aware-merge.md) | -| 风险等级枚举 | [`../risk-levels.md`](../risk-levels.md) | +| 六大丢失模式 + P0/P1/P2 加固项 | [`../references/multi-agent-optimization.md`](../references/multi-agent-optimization.md) | +| 迁移感知合并 | [`migration-aware-merge.md`](migration-aware-merge.md) | +| 风险等级枚举 | [`risk-levels.md`](risk-levels.md) | | 参考开源项目分析 | [`../references/`](../references/) | --- @@ -168,5 +168,5 @@ ruff format src/ --check ## 10. 遇到问题找谁 - 架构类问题 → 先翻 `CLAUDE.md` 和 `../architecture.md` -- 设计演进 / 加固项背景 → `../multi-agent-optimization-from-merge-experience.md` +- 设计演进 / 加固项背景 → `../references/multi-agent-optimization.md` - 参考外部项目的思路 → `../references/` 三篇分析文档 diff --git a/doc/risk-levels.md b/doc/modules/risk-levels.md similarity index 100% rename from doc/risk-levels.md rename to doc/modules/risk-levels.md diff --git a/doc/web-ui-redesign-handoff.md b/doc/modules/web-ui-redesign-handoff.md similarity index 100% rename from doc/web-ui-redesign-handoff.md rename to doc/modules/web-ui-redesign-handoff.md diff --git a/doc/web-ui.md b/doc/modules/web-ui.md similarity index 100% rename from doc/web-ui.md rename to doc/modules/web-ui.md diff --git a/doc/execute/implementation-notes.md b/doc/plan/implementation-notes.md similarity index 100% rename from doc/execute/implementation-notes.md rename to doc/plan/implementation-notes.md diff --git a/doc/large-scale-file-processing-optimization.md b/doc/plan/large-scale-file-processing-optimization.md similarity index 100% rename from doc/large-scale-file-processing-optimization.md rename to doc/plan/large-scale-file-processing-optimization.md diff --git a/doc/multi-agent-optimization-from-merge-experience.md b/doc/references/multi-agent-optimization.md similarity index 100% rename from 
doc/multi-agent-optimization-from-merge-experience.md rename to doc/references/multi-agent-optimization.md diff --git a/doc/test-report/insforge-v2.1.0-merge-report-2026-05-06.md b/doc/test-report/insforge-v2.1.0-merge-report-2026-05-06.md index e744550..b581f47 100644 --- a/doc/test-report/insforge-v2.1.0-merge-report-2026-05-06.md +++ b/doc/test-report/insforge-v2.1.0-merge-report-2026-05-06.md @@ -530,7 +530,7 @@ migration_policy: **追加日期**:2026-05-07 **来源会话**:forks-profile-init 设计阶段对「profile 是否应为 planner 产物」的讨论 -**关联文档**:[`doc/forks-profile-init.md`](../forks-profile-init.md) §3 启发式 / §8 不做的事 / §9 已知局限 +**关联文档**:[`doc/modules/forks-profile.md`](../modules/forks-profile.md) §3 启发式 / §8 不做的事 / §9 已知局限 #### 9.10.1 动机 @@ -620,7 +620,7 @@ forks_profile_suggestions: ### 9.11 P3 增强:`forks-profile diff` 半自动 apply 模式(草案) **追加日期**:2026-05-07 -**关联文档**:[`doc/forks-profile-init.md`](../forks-profile-init.md) §4.2 `diff` 子命令 / §8 「不做的事」第 2 项 +**关联文档**:[`doc/modules/forks-profile.md`](../modules/forks-profile.md) §4.2 `diff` 子命令 / §8 「不做的事」第 2 项 #### 9.11.1 动机 From 05504822b904494e1790c7088666ee66de3674b2 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 06:29:07 -0400 Subject: [PATCH 06/22] =?UTF-8?q?docs(eval):=20=E5=9B=9E=E5=A1=AB=20P0=20?= =?UTF-8?q?=E8=AE=B0=E5=BF=86=E6=9C=89=E6=95=88=E6=80=A7=E9=A6=96=E7=BB=84?= =?UTF-8?q?=E5=9F=BA=E7=BA=BF=EF=BC=88MDL=3D0=EF=BC=8C=E4=B8=8D=E9=BB=98?= =?UTF-8?q?=E8=AE=A4=E5=BC=80=E5=90=AF=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在 forgejo test/fork←origin/forgejo 上用 deepseek-v4-pro 实跑 memory=on/off 两臂消融(同一 ablation_decisions.yaml,唯一变量 inject_enabled),经 merge eval-memory 产出首组基线,回填 acceptance.md §5.1: - MDL=0.0000、HIR(on)=0.20,两臂 overall_correct_rate 均 81.25%(13/16) - off 臂 memory_influenced_decisions=0,证实 inject_enabled 开关有效 - 据 §3 激活门(MDL>0 为硬前置)判定:P1 反馈环不默认开启 - 标注 caveat:单 run/单数据集/judged=16 样本小,仅首组可搬动基线, 需多 run 复算方可据以翻默认开启 PR-0c 至此打通 Phase 0 完整闭环(度量→消融→验收门→首组基线)。 ---
doc/evaluation/acceptance.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/evaluation/acceptance.md b/doc/evaluation/acceptance.md index 811c3c1..c6ed33e 100644 --- a/doc/evaluation/acceptance.md +++ b/doc/evaluation/acceptance.md @@ -104,6 +104,19 @@ 每次发布更新该表,至少记录 `OA / MMR / WMR / cost_p95 / wall_time_p95`。 +### 5.1 记忆有效性基线(§3 激活门) + +> 由 `merge eval-memory` 对同数据集 memory=on/off 两 run 产出(metrics.md §9)。 + +| 评估时间 | 数据集 | on/off run_id | `MDL` | `HIR`(on) | 激活判定 | +|---|---|---|---|---|---| +| 2026-05-31 | forgejo `test/fork` ← `origin/forgejo`(124 文件,judge 复审 16)| `a0563230` / `81ce3475` | **0.0000** | 0.20 | **不默认开启**(MDL 未 > 0)| + +口径与 caveat(务必随基线一并阅读,避免误用): +- 模型 `deepseek-v4-pro`(temperature:executor/judge=0.1,余默认);两臂同一 `ablation_decisions.yaml`(plan_review 15×`take_target` + judge_review `accept`),唯一变量为 `memory.inject_enabled`。 +- off 臂 `memory_influenced_decisions=0`,证实 `inject_enabled=false` 完全抑制注入(开关有效)。on 臂注入影响 15/16 判决,但 `overall_correct_rate` 与 off 持平(均 81.25%,13/16 pass)→ 本 run 记忆对总体正确率净中性。 +- 单 run、单数据集、`judge_verdict=fail`/`partial_failure`(确定性 reverse_impact veto),样本量小(judged=16);**不足以作收紧/默认开启依据**,仅为首组可搬动基线。需多 run / 多数据集复算(procedure 待补)方能据 §3 翻默认开启。 + --- ## 6. 
阈值修改流程 From e4b3fbc5e398be1d64377dc28c1e36091864b60a Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 06:52:12 -0400 Subject: [PATCH 07/22] =?UTF-8?q?feat(memory):=20PR-0d=20=E8=AE=B0?= =?UTF-8?q?=E5=BF=86=E6=9C=89=E6=95=88=E6=80=A7=E8=B7=A8=E8=87=82=E5=9B=A0?= =?UTF-8?q?=E6=9E=9C=E5=BD=92=E5=9B=A0=EF=BC=88=E4=BF=AE=E6=AD=A3=20HIR=20?= =?UTF-8?q?=E5=81=87=E9=98=B3=E6=80=A7=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit forgejo 首组基线暴露单臂 harmful_influence_rate 的归因缺陷:HIR(on)=0.2 把"记忆恰好注入到本就确定性失败的文件"误算成有害,而消融证明那 3 个失败 在 memory=off 臂逐文件相同(确定性 reverse_impact veto,与记忆无关)。 改为跨臂逐文件因果归因: - MemoryEffectivenessReport 持久化 passed_files/failed_files(原仅计数) - compare_memory_effectiveness 计算 memory_helped(off-fail→on-pass)/ memory_harmed(off-pass→on-fail) + causal_attribution_available - render_ablation_table 增因果区块;HIR 标注为 correlational - metrics.md §9.2 警示 HIR 假阳性 + 新增 §9.7 因果归因;acceptance.md §3 激活门判据由单臂 HIR 改为 memory_harmed=0;§5.1 基线补因果列(harmed=0) 旧产物无 per-file 列表时 causal_attribution_available=False(不可知≠0)。 real forgejo checkpoint 验证:causal helped=0/harmed=0(HIR=0.2 系假阳性)。 --- doc/evaluation/acceptance.md | 15 +++--- doc/evaluation/metrics.md | 29 ++++++++++-- src/models/memory_effectiveness.py | 17 +++++++ src/tools/memory_eval.py | 24 +++++++++- src/tools/memory_replay.py | 61 ++++++++++++++++++------- tests/unit/test_memory_effectiveness.py | 3 ++ tests/unit/test_memory_replay.py | 61 ++++++++++++++++++++++++- 7 files changed, 180 insertions(+), 30 deletions(-) diff --git a/doc/evaluation/acceptance.md b/doc/evaluation/acceptance.md index c6ed33e..4560507 100644 --- a/doc/evaluation/acceptance.md +++ b/doc/evaluation/acceptance.md @@ -53,14 +53,14 @@ | 门 | 阈值 | 数据源 | 作用 | |---|---|---|---| | `MDL` 记忆决策增益 | **> 0** | `merge eval-memory`(on/off 消融)| 任一反馈环默认开启的**硬前置**;≤ 0 则保持 opt-in | -| `HIR` 有害影响率 | **不高于同数据集 off 基线** | `memory_effectiveness.json` | 上升即说明记忆在污染决策,禁止默认开启 | +| `memory_harmed`(因果,PR-0d)| **= 0** | 
`merge eval-memory` 跨臂逐文件 diff | 跨臂判决翻坏的文件数;> 0 即记忆**导致**退化,禁止默认开启。取代单臂 `HIR`(相关性、会假阳性,metrics §9.7)| | `CRI` 影响后正确率 | **≥ off 基线 overall_correct_rate** | `memory_effectiveness.json` | 被记忆改变的决策不得比无记忆更差 | | `MCPD` 单决策记忆成本 | **≤ off 基线 × 1.15** | `CostTracker` | 防止记忆注入让 prompt 成本悄悄回退 | **判定流程**: 1. 同数据集跑 `memory=on`(默认)与 `memory=off`(config `memory.inject_enabled: false`)两 run; 2. `merge eval-memory --on --off ` 产出 `MemoryAblationComparison`; -3. `MDL > 0` 且 `HIR` 不升 → 允许把对应反馈环 default 翻为 `True`,并在本文件 §5 历史区记录基线数; +3. `MDL > 0` 且 `memory_harmed = 0`(因果,PR-0d)→ 允许把对应反馈环 default 翻为 `True`,并在本文件 §5 历史区记录基线数; 4. 任一门未过 → 反馈环维持 opt-in,记录原因。 > 这是"默认开启"的闸口,不是合并质量的一票否决;故归为独立章节,与 §1/§2 的合并 @@ -108,14 +108,15 @@ > 由 `merge eval-memory` 对同数据集 memory=on/off 两 run 产出(metrics.md §9)。 -| 评估时间 | 数据集 | on/off run_id | `MDL` | `HIR`(on) | 激活判定 | -|---|---|---|---|---|---| -| 2026-05-31 | forgejo `test/fork` ← `origin/forgejo`(124 文件,judge 复审 16)| `a0563230` / `81ce3475` | **0.0000** | 0.20 | **不默认开启**(MDL 未 > 0)| +| 评估时间 | 数据集 | on/off run_id | `MDL` | `memory_harmed`(因果) | `HIR`(on,相关) | 激活判定 | +|---|---|---|---|---|---|---| +| 2026-05-31 | forgejo `test/fork` ← `origin/forgejo`(124 文件,judge 复审 16)| `a0563230` / `81ce3475` | **0.0000** | **0** | 0.20 | **不默认开启**(MDL 未 > 0)| 口径与 caveat(务必随基线一并阅读,避免误用): - 模型 `deepseek-v4-pro`(temperature:executor/judge=0.1,余默认);两臂同一 `ablation_decisions.yaml`(plan_review 15×`take_target` + judge_review `accept`),唯一变量为 `memory.inject_enabled`。 -- off 臂 `memory_influenced_decisions=0`,证实 `inject_enabled=false` 完全抑制注入(开关有效)。on 臂注入影响 15/16 判决,但 `overall_correct_rate` 与 off 持平(均 81.25%,13/16 pass)→ 本 run 记忆对总体正确率净中性。 -- 单 run、单数据集、`judge_verdict=fail`/`partial_failure`(确定性 reverse_impact veto),样本量小(judged=16);**不足以作收紧/默认开启依据**,仅为首组可搬动基线。需多 run / 多数据集复算(procedure 待补)方能据 §3 翻默认开启。 +- off 臂 `memory_influenced_decisions=0`,证实 `inject_enabled=false` 完全抑制注入(开关有效)。on 臂注入影响 15/16 判决,但**两臂 per-file 判决逐字节相同**(同 13 pass / 同 3 fail,失败均为 
`auth_token.go`/`oauth.go`/`build-release.yml` 的确定性 reverse_impact veto)→ 记忆对本 run 任何判决**零作用**。 +- **因果归因(PR-0d)`memory_harmed=0`、`memory_helped=0`**;单臂 `HIR(on)=0.2` 是**假阳性**——它把"记忆恰好注入到本就确定性失败的文件"误算成有害,跨臂 diff 证伪(metrics §9.7)。激活门以因果 harmed 为准。 +- 本 run 由确定性机制(take_target + veto)主导,记忆无用武之地;不证明记忆无价值,需 **LLM 判断密集**数据集才能测出。单 run、judged=16、样本小,**仅首组可搬动基线**,需多 run / 多数据集复算(procedure 待补)方可据 §3 翻默认开启。 --- diff --git a/doc/evaluation/metrics.md b/doc/evaluation/metrics.md index a563cb1..e140975 100644 --- a/doc/evaluation/metrics.md +++ b/doc/evaluation/metrics.md @@ -352,16 +352,19 @@ overall_correct_rate = |passed_files| / (|passed_files| + |failed_files|) 数据源:`MemoryAblationComparison.memory_decision_lift`。**MDL > 0 是"学到了"的 最小证据**,也是 Phase 1 任一反馈环默认开启的硬前置(见 acceptance.md §3)。 -### 9.2 有害影响率(Harmful Influence Rate, HIR) +### 9.2 有害影响率(Harmful Influence Rate, HIR)—— **相关性,非因果** -> 被记忆注入"影响"且最终 fail 的决策占比——F2(检索污染/有害记忆)的直接度量。 +> 被记忆注入"影响"且最终 fail 的决策占比。单臂量,**会过度归因**。 ``` HIR = |injected ∩ failed_files| / |influenced| (influenced=0 时记 0) ``` -数据源:`MemoryEffectivenessReport.harmful_influence_rate`。P1-A(持久化 suppress)的 -优化目标是在"tracker 重置"场景下 HIR 不回升。 +数据源:`MemoryEffectivenessReport.harmful_influence_rate`。**警告(PR-0d)**:HIR 把 +"记忆恰好被注入到一个本来就会失败的文件"也算成有害,但确定性失败(如 reverse_impact +veto)与记忆无关。forgejo 首组基线即暴露此假阳性:HIR(on)=0.2,而跨臂因果归因 +(§9.7)harmed=0——3 个失败在 memory=off 臂**逐文件相同**。故 HIR 只作单 run 粗筛, +**默认开启/收紧判据一律以 §9.7 的因果 harmed 为准**。 ### 9.3 影响后正确率(Correct Rate After Influence, CRI) @@ -400,6 +403,24 @@ MCPD = cost_usd_per_run / F_eval 数据源:`CostTracker` + `F_eval`。记忆注入增大 prompt,开启反馈环不得让 MCPD 显著上升 (acceptance.md §3)。 +### 9.7 跨臂因果归因(Causal Help / Harm, PR-0d) + +> 唯一能判定"记忆是否**导致**好/坏结果"的口径——逐文件比对 on/off 两臂的 judge 判决, +> 只有判决真正翻转才归因于记忆。 + +``` +memory_helped = { f : f ∈ off.failed_files ∧ f ∈ on.passed_files } (记忆把失败救成功) +memory_harmed = { f : f ∈ off.passed_files ∧ f ∈ on.failed_files } (记忆把成功弄失败) +``` + +数据源:`MemoryAblationComparison.memory_helped_count / memory_harmed_count`,由 
+`merge eval-memory` 跨两份 `memory_effectiveness.json`(PR-0d 起持久化 `passed_files`/ +`failed_files`)计算;`causal_attribution_available=False` 表示报告无 per-file 列表 +(PR-0d 前的旧产物)——此时 helped/harmed 不可知,**不等于 0**。 + +两臂判决**逐文件相同**(确定性主导的合并)→ helped=harmed=0,正确地不把确定性失败 +甩锅给记忆。这是 §9.2 HIR 的因果替代,也是 acceptance.md §3 激活门的真正判据。 + > **后续指标(Phase 1-C / 2-B 落地后补充)**:`repeat_error_repair_rounds`(同 > error_signature 平均修复轮数,需 P1-C 的 `summarize_judge_repair_rounds` 按签名聚合)、 > `memory_drift_loss`(consolidation 前后 pinned 条目内容差异,P2-B,期望 = 0)。 diff --git a/src/models/memory_effectiveness.py b/src/models/memory_effectiveness.py index ef3dbf4..7f2467f 100644 --- a/src/models/memory_effectiveness.py +++ b/src/models/memory_effectiveness.py @@ -42,6 +42,12 @@ class MemoryEffectivenessReport(BaseModel, frozen=True): top_harmful: list[EntryEffectivenessItem] = Field(default_factory=list) total_tracked_entries: int = Field(ge=0) effective_observations: int = Field(ge=0) + # PR-0d: per-file Judge verdict, persisted so an offline on/off comparison + # can attribute help/harm causally (cross-arm set diff) instead of relying + # on the single-arm ``injected ∩ failed`` correlation. Default empty keeps + # older artifacts (counts only) loadable. + passed_files: list[str] = Field(default_factory=list) + failed_files: list[str] = Field(default_factory=list) class MemoryAblationComparison(BaseModel, frozen=True): @@ -61,3 +67,14 @@ class MemoryAblationComparison(BaseModel, frozen=True): memory_decision_lift: float harmful_influence_rate_on: float = Field(ge=0.0, le=1.0) memory_beneficial: bool + # PR-0d: causal cross-arm attribution — a file counts as helped/harmed only + # if its verdict actually flipped between the arms, so a deterministic + # failure that happens identically with and without memory is NOT blamed on + # memory (the single-arm ``harmful_influence_rate`` over-attributes it). 
+ memory_helped_files: list[str] = Field(default_factory=list) + memory_harmed_files: list[str] = Field(default_factory=list) + memory_helped_count: int = Field(default=0, ge=0) + memory_harmed_count: int = Field(default=0, ge=0) + # False when neither report carries per-file lists (e.g. pre-PR-0d + # artifacts) — then helped/harmed are unknowable, not zero. + causal_attribution_available: bool = False diff --git a/src/tools/memory_eval.py b/src/tools/memory_eval.py index 3f7f326..5eaf1cb 100644 --- a/src/tools/memory_eval.py +++ b/src/tools/memory_eval.py @@ -93,6 +93,8 @@ def compute_memory_effectiveness( top_harmful=_items_from_outcomes(outcomes_dict.get("top_harmful")), total_tracked_entries=_as_int(outcomes_dict.get("tracked_entries", 0)), effective_observations=_as_int(summary.get("effective_observations", 0)), + passed_files=sorted(passed), + failed_files=sorted(failed), ) @@ -103,10 +105,25 @@ def compare_memory_effectiveness( """Diff two runs (memory on vs off) on the same dataset. ``memory_beneficial`` is the simple ``lift > 0`` convenience flag; the - full acceptance gate (lift positive AND harmful rate not rising) lives in + full acceptance gate (lift positive AND harmed count not rising) lives in ``doc/evaluation/acceptance.md``. + + PR-0d: help/harm is attributed *causally* by diffing per-file verdicts + across the arms — a file is harmed only if it passed without memory but + failed with it (and vice-versa for helped). A deterministic failure that + occurs identically in both arms therefore counts as neither, unlike the + single-arm ``harmful_influence_rate`` which blames any injected-and-failed + file. Falls back to ``causal_attribution_available=False`` when the reports + predate PR-0d and carry no per-file lists. 
""" lift = round(memory_on.overall_correct_rate - memory_off.overall_correct_rate, 4) + + on_passed, on_failed = set(memory_on.passed_files), set(memory_on.failed_files) + off_passed, off_failed = set(memory_off.passed_files), set(memory_off.failed_files) + available = bool(on_passed or on_failed or off_passed or off_failed) + helped = sorted(off_failed & on_passed) + harmed = sorted(off_passed & on_failed) + return MemoryAblationComparison( on_run_id=memory_on.run_id, off_run_id=memory_off.run_id, @@ -115,4 +132,9 @@ def compare_memory_effectiveness( memory_decision_lift=lift, harmful_influence_rate_on=memory_on.harmful_influence_rate, memory_beneficial=lift > 0.0, + memory_helped_files=helped, + memory_harmed_files=harmed, + memory_helped_count=len(helped), + memory_harmed_count=len(harmed), + causal_attribution_available=available, ) diff --git a/src/tools/memory_replay.py b/src/tools/memory_replay.py index d98fe37..831bd73 100644 --- a/src/tools/memory_replay.py +++ b/src/tools/memory_replay.py @@ -70,29 +70,56 @@ def _pct(value: float) -> str: return f"{value * 100:.2f}%" +def _file_list(files: list[str], limit: int = 5) -> str: + if not files: + return "none" + shown = ", ".join(files[:limit]) + return shown if len(files) <= limit else f"{shown}, … (+{len(files) - limit})" + + def render_ablation_table(cmp: MemoryAblationComparison) -> str: """Render the ablation comparison as a plain markdown table. The verdict line restates the convenience ``memory_beneficial`` flag - (lift > 0); the full acceptance gate also requires the harmful-influence - rate not to rise over time (see ``doc/evaluation/acceptance.md``). + (lift > 0); the full acceptance gate also requires the causal harmed count + not to rise over time (see ``doc/evaluation/acceptance.md``). + + PR-0d: the causal block reports help/harm attributed by cross-arm per-file + verdict diff. 
``harmful_influence_rate (on)`` is kept but labelled + correlational — a deterministic failure inflates it without memory being + the cause, which the causal ``memory_harmed`` count avoids. """ lift = cmp.memory_decision_lift sign = "+" if lift > 0 else "" verdict = "BENEFICIAL (lift > 0)" if cmp.memory_beneficial else "NOT beneficial" - return "\n".join( - [ - "| Metric | memory=on | memory=off |", - "|---|---|---|", - f"| run_id | `{cmp.on_run_id}` | `{cmp.off_run_id}` |", - f"| overall_correct_rate | {_pct(cmp.overall_correct_rate_on)} " - f"| {_pct(cmp.overall_correct_rate_off)} |", - "", - f"**memory_decision_lift**: {sign}{lift:.4f} " - f"({_pct(lift) if lift >= 0 else '-' + _pct(-lift)})", - "", - f"**harmful_influence_rate (on)**: {_pct(cmp.harmful_influence_rate_on)}", - "", - f"**Verdict**: {verdict}", + lines = [ + "| Metric | memory=on | memory=off |", + "|---|---|---|", + f"| run_id | `{cmp.on_run_id}` | `{cmp.off_run_id}` |", + f"| overall_correct_rate | {_pct(cmp.overall_correct_rate_on)} " + f"| {_pct(cmp.overall_correct_rate_off)} |", + "", + f"**memory_decision_lift**: {sign}{lift:.4f} " + f"({_pct(lift) if lift >= 0 else '-' + _pct(-lift)})", + "", + ] + if cmp.causal_attribution_available: + lines += [ + f"**Causal attribution (cross-arm per-file diff)**: " + f"helped={cmp.memory_helped_count}, harmed={cmp.memory_harmed_count}", + f" - memory_helped (off-fail → on-pass): {_file_list(cmp.memory_helped_files)}", + f" - memory_harmed (off-pass → on-fail): {_file_list(cmp.memory_harmed_files)}", ] - ) + else: + lines.append( + "**Causal attribution**: N/A (reports carry no per-file lists; " + "regenerate with PR-0d+ to enable)" + ) + lines += [ + "", + f"**harmful_influence_rate (on, correlational)**: " + f"{_pct(cmp.harmful_influence_rate_on)}", + "", + f"**Verdict**: {verdict}", + ] + return "\n".join(lines) diff --git a/tests/unit/test_memory_effectiveness.py b/tests/unit/test_memory_effectiveness.py index d46e45f..923a471 100644 --- 
a/tests/unit/test_memory_effectiveness.py +++ b/tests/unit/test_memory_effectiveness.py @@ -100,6 +100,9 @@ def test_influenced_counts_intersection_of_injected_and_judged(): assert report.harmful_influence_count == 1 assert report.correct_rate_after_influence == 0.5 assert report.harmful_influence_rate == 0.5 + # PR-0d: per-file lists persisted (sorted) for offline causal attribution + assert report.passed_files == ["a.py", "c.py"] + assert report.failed_files == ["b.py"] def test_injected_file_not_judged_is_excluded_from_influence(): diff --git a/tests/unit/test_memory_replay.py b/tests/unit/test_memory_replay.py index 02e93ca..7ba87c0 100644 --- a/tests/unit/test_memory_replay.py +++ b/tests/unit/test_memory_replay.py @@ -14,7 +14,13 @@ ) -def _report(run_id: str, correct_rate: float, harmful_rate: float = 0.0): +def _report( + run_id: str, + correct_rate: float, + harmful_rate: float = 0.0, + passed_files: list[str] | None = None, + failed_files: list[str] | None = None, +): return MemoryEffectivenessReport( run_id=run_id, total_judged_decisions=10, @@ -26,6 +32,8 @@ def _report(run_id: str, correct_rate: float, harmful_rate: float = 0.0): harmful_influence_rate=harmful_rate, total_tracked_entries=2, effective_observations=4, + passed_files=passed_files or [], + failed_files=failed_files or [], ) @@ -81,6 +89,57 @@ def test_comparison_non_positive_lift_not_beneficial(): assert cmp.memory_beneficial is False +# --- PR-0d causal attribution ---------------------------------------------- + + +def test_causal_deterministic_failure_not_blamed_on_memory(): + """Same per-file verdict in both arms → 0 helped, 0 harmed even though the + single-arm harmful_influence_rate is non-zero (the forgejo baseline case).""" + on = _report( + "on", 0.8125, harmful_rate=0.2, passed_files=["a", "b"], failed_files=["x"] + ) + off = _report("off", 0.8125, passed_files=["a", "b"], failed_files=["x"]) + cmp = build_ablation_comparison(on, off) + assert cmp.causal_attribution_available is 
True + assert cmp.memory_helped_count == 0 + assert cmp.memory_harmed_count == 0 + assert cmp.harmful_influence_rate_on == pytest.approx( + 0.2 + ) # correlational, unchanged + + +def test_causal_memory_helped_and_harmed(): + # off fails f1 (on passes it → helped); off passes f2 (on fails it → harmed) + on = _report("on", 0.5, passed_files=["f1", "keep"], failed_files=["f2"]) + off = _report("off", 0.5, passed_files=["f2", "keep"], failed_files=["f1"]) + cmp = build_ablation_comparison(on, off) + assert cmp.memory_helped_files == ["f1"] + assert cmp.memory_harmed_files == ["f2"] + assert cmp.memory_helped_count == 1 + assert cmp.memory_harmed_count == 1 + + +def test_causal_unavailable_when_no_file_lists(): + cmp = build_ablation_comparison(_report("on", 0.9), _report("off", 0.7)) + assert cmp.causal_attribution_available is False + assert cmp.memory_helped_count == 0 and cmp.memory_harmed_count == 0 + + +def test_render_causal_block_present(): + on = _report("on", 0.8, passed_files=["a"], failed_files=["x"]) + off = _report("off", 0.8, passed_files=["a", "x"], failed_files=[]) + table = render_ablation_table(build_ablation_comparison(on, off)) + assert "Causal attribution" in table + assert "memory_harmed (off-pass → on-fail): x" in table + + +def test_render_causal_na_when_unavailable(): + table = render_ablation_table( + build_ablation_comparison(_report("on", 0.9), _report("off", 0.7)) + ) + assert "N/A" in table + + # --- rendering -------------------------------------------------------------- From b83d142e29c8b95dd9971bc92592608eecb2d015 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 07:11:28 -0400 Subject: [PATCH 08/22] =?UTF-8?q?feat(memory):=20P1-A=20=E6=8C=81=E4=B9=85?= =?UTF-8?q?=E5=8C=96=E8=BD=AF=E5=88=A0=20suppress=EF=BC=88=E5=B7=A9?= =?UTF-8?q?=E5=9B=BA=20O-M6=20=E4=B8=B4=E6=97=B6=E8=BF=87=E6=BB=A4?= =?UTF-8?q?=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 把 O-M6 读取期、依赖 hit_tracker 
存活的临时有害过滤巩固为持久、可审计的 软删状态,使 prune 在 tracker sidecar 丢失/观测不足时不再"复活",并覆盖 写入/consolidation 侧(堵 F2 无界增长)。 - MemoryEntry 增 suppressed/suppressed_reason(content_hash 不含新字段, dedup 身份不变;软删保留可审计行,非物理删除) - MemoryStore/SQLiteMemoryStore 增 suppress_entry();SQLite 加两列 + ALTER TABLE 迁移旧库;get_relevant_context 与 _consolidate_entries 跳过 suppressed 条目 - layered_loader._build_l2 过滤改为"suppressed 或命中 harmful_entry_ids" (持久 + 实时并存) - orchestrator 新增 _apply_suppress_harmful_entries:run 末把满足 suppress_min_observations 的稳定有害条目固化 suppress,豁免 bootstrap/HUMAN - 新 opt-in 开关 memory.persist_suppress(默认 False,按 P2 先度量再激活) 12 新单测 + 3224 unit 绿(1 个 pre-existing 无关 docs 测试除外),mypy/ruff 干净 --- src/core/orchestrator.py | 45 ++++++ src/memory/layered_loader.py | 6 +- src/memory/models.py | 2 + src/memory/sqlite_store.py | 57 ++++++- src/memory/store.py | 33 ++++ src/models/config.py | 15 ++ tests/unit/test_memory_suppress.py | 236 +++++++++++++++++++++++++++++ 7 files changed, 389 insertions(+), 5 deletions(-) create mode 100644 tests/unit/test_memory_suppress.py diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py index bd09ed8..32f1d26 100644 --- a/src/core/orchestrator.py +++ b/src/core/orchestrator.py @@ -525,6 +525,10 @@ async def _update_memory(self, phase: str, state: MergeState) -> None: self._apply_outcome_confidence_writeback(state) except Exception as exc: logger.warning("Outcome confidence write-back failed: %s", exc) + try: + self._apply_suppress_harmful_entries(state) + except Exception as exc: + logger.warning("Harmful-entry suppression failed: %s", exc) def _apply_outcome_confidence_writeback(self, state: MergeState) -> None: """OPP-5: nudge persisted memory confidence toward judge outcomes. @@ -566,6 +570,47 @@ def _apply_outcome_confidence_writeback(self, state: MergeState) -> None: len(deltas), ) + def _apply_suppress_harmful_entries(self, state: MergeState) -> None: + """P1-A: persistently soft-delete stably-harmful memory entries. + + Default OFF. 
When ``persist_suppress`` is on, entries whose accumulated + outcome score crosses the harmful threshold with at least + ``suppress_min_observations`` observations are marked ``suppressed`` so + the prune survives tracker loss across runs (the O-M6 read-time filter + recomputes from sidecar observations and resurrects on loss). Human and + bootstrap entries are exempt, mirroring OPP-5.""" + cfg = getattr(self.config, "memory", None) + if cfg is None or not getattr(cfg, "persist_suppress", False): + return + harmful_ids = self._memory_hit_tracker.harmful_entry_ids( + min_observations=cfg.suppress_min_observations + ) + if not harmful_ids: + return + human_files = { + fp + for fp, record in state.file_decision_records.items() + if record.decision_source + in (DecisionSource.HUMAN, DecisionSource.BATCH_HUMAN) + } + suppressed = 0 + for entry in self._memory_store.to_memory().entries: + if entry.entry_id not in harmful_ids or entry.suppressed: + continue + if _BOOTSTRAP_TAG in entry.tags: + continue + if human_files and any(fp in human_files for fp in entry.file_paths): + continue + self._memory_store = self._memory_store.suppress_entry( + entry.entry_id, reason="P1-A: stably-harmful judge outcomes" + ) + suppressed += 1 + if suppressed: + logger.info( + "P1-A: persistently suppressed %d stably-harmful memory entries", + suppressed, + ) + def _should_llm_extract(self, phase: str, state: MergeState) -> bool: cfg = getattr(self.config, "memory", None) if cfg is None or not cfg.llm_extraction: diff --git a/src/memory/layered_loader.py b/src/memory/layered_loader.py index 2ae270f..4a50282 100644 --- a/src/memory/layered_loader.py +++ b/src/memory/layered_loader.py @@ -123,7 +123,11 @@ def _build_l2(self, file_paths: list[str]) -> tuple[str, int]: lines: list[str] = [] injected_ids: list[str] = [] for entry in relevant: - if entry.entry_id in harmful_ids: + # P1-A: persistent suppress (entry.suppressed) OR realtime harmful + # (tracker observations this process). 
get_relevant_context already + # drops suppressed at the source; this keeps the read path correct + # even if a caller passes pre-fetched entries. + if entry.suppressed or entry.entry_id in harmful_ids: continue if not _has_path_overlap(entry.file_paths, file_paths): continue diff --git a/src/memory/models.py b/src/memory/models.py index 288ae0e..9311ef9 100644 --- a/src/memory/models.py +++ b/src/memory/models.py @@ -34,6 +34,8 @@ class MemoryEntry(BaseModel, frozen=True): confidence_level: ConfidenceLevel = Field(default=ConfidenceLevel.INFERRED) content_hash: str = Field(default="") created_at: datetime = Field(default_factory=datetime.now) + suppressed: bool = Field(default=False) + suppressed_reason: str | None = Field(default=None) def model_post_init(self, __context: Any) -> None: if not self.content_hash: diff --git a/src/memory/sqlite_store.py b/src/memory/sqlite_store.py index 167e140..202a208 100644 --- a/src/memory/sqlite_store.py +++ b/src/memory/sqlite_store.py @@ -35,7 +35,9 @@ confidence REAL NOT NULL, confidence_level TEXT NOT NULL, content_hash TEXT NOT NULL, - created_at TEXT NOT NULL + created_at TEXT NOT NULL, + suppressed INTEGER NOT NULL DEFAULT 0, + suppressed_reason TEXT ); CREATE UNIQUE INDEX IF NOT EXISTS idx_content_hash ON memory_entries (content_hash); @@ -52,10 +54,25 @@ _INSERT_ENTRY = """ INSERT OR IGNORE INTO memory_entries (entry_id, entry_type, phase, content, file_paths, tags, - confidence, confidence_level, content_hash, created_at) -VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + confidence, confidence_level, content_hash, created_at, + suppressed, suppressed_reason) +VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """ +# P1-A: columns added after the original schema shipped; older memory.db files +# predate them. ALTER TABLE ADD COLUMN is a no-op-safe, data-preserving +# migration run on every open. 
+_MIGRATIONS = ( + ( + "suppressed", + "ALTER TABLE memory_entries ADD COLUMN suppressed INTEGER NOT NULL DEFAULT 0", + ), + ( + "suppressed_reason", + "ALTER TABLE memory_entries ADD COLUMN suppressed_reason TEXT", + ), +) + _PHASE_ORDER = { "planning": 0, "auto_merge": 1, @@ -64,7 +81,7 @@ } -def _entry_to_row(entry: MemoryEntry) -> tuple[str, ...]: +def _entry_to_row(entry: MemoryEntry) -> tuple[str | int | None, ...]: return ( entry.entry_id, entry.entry_type.value, @@ -76,10 +93,13 @@ def _entry_to_row(entry: MemoryEntry) -> tuple[str, ...]: entry.confidence_level.value, entry.content_hash, entry.created_at.isoformat(), + 1 if entry.suppressed else 0, + entry.suppressed_reason, ) def _row_to_entry(row: sqlite3.Row) -> MemoryEntry: + keys = row.keys() return MemoryEntry( entry_id=row["entry_id"], entry_type=MemoryEntryType(row["entry_type"]), @@ -91,6 +111,10 @@ def _row_to_entry(row: sqlite3.Row) -> MemoryEntry: confidence_level=ConfidenceLevel(row["confidence_level"]), content_hash=row["content_hash"], created_at=datetime.fromisoformat(row["created_at"]), + suppressed=bool(row["suppressed"]) if "suppressed" in keys else False, + suppressed_reason=( + row["suppressed_reason"] if "suppressed_reason" in keys else None + ), ) @@ -129,6 +153,7 @@ def _init_db(self) -> None: conn.execute("PRAGMA busy_timeout=5000") conn.execute("PRAGMA journal_mode=WAL") conn.executescript(_CREATE_SCHEMA) + self._apply_migrations(conn) return except sqlite3.OperationalError as exc: last_exc = exc @@ -138,6 +163,15 @@ def _init_db(self) -> None: assert last_exc is not None raise last_exc + def _apply_migrations(self, conn: sqlite3.Connection) -> None: + existing = { + row[1] + for row in conn.execute("PRAGMA table_info(memory_entries)").fetchall() + } + for column, ddl in _MIGRATIONS: + if column not in existing: + conn.execute(ddl) + @contextmanager def _conn(self) -> Generator[sqlite3.Connection, None, None]: conn = sqlite3.connect(str(self._db_path), timeout=5.0) @@ -221,6 
+255,19 @@ def adjust_confidence(self, deltas: dict[str, float]) -> "SQLiteMemoryStore": ) return self + def suppress_entry(self, entry_id: str, reason: str) -> "SQLiteMemoryStore": + """P1-A: persistently soft-delete an entry (audit-preserving). + + Sets ``suppressed=1`` + ``suppressed_reason`` via UPDATE; the row stays + for audit/reversal. Already-suppressed or unknown ids are no-ops.""" + with self._conn() as conn: + conn.execute( + "UPDATE memory_entries SET suppressed = 1, suppressed_reason = ? " + "WHERE entry_id = ? AND suppressed = 0", + (reason, entry_id), + ) + return self + def set_codebase_profile(self, key: str, value: str) -> "SQLiteMemoryStore": with self._conn() as conn: conn.execute( @@ -274,6 +321,8 @@ def get_relevant_context( scored: dict[str, tuple[float, MemoryEntry]] = {} for row in rows: entry = _row_to_entry(row) + if entry.suppressed: + continue entry_fps: list[str] = json.loads(row["file_paths"]) path_score = score_path_overlap(file_paths, entry_fps) diff --git a/src/memory/store.py b/src/memory/store.py index 4ca7aed..16199d4 100644 --- a/src/memory/store.py +++ b/src/memory/store.py @@ -69,6 +69,31 @@ def adjust_confidence(self, deltas: dict[str, float]) -> MemoryStore: new_memory = self._memory.model_copy(update={"entries": entries}) return MemoryStore(new_memory) + def suppress_entry(self, entry_id: str, reason: str) -> MemoryStore: + """P1-A: persistently soft-delete an entry (audit-preserving). + + Marks ``suppressed=True`` + records ``reason`` rather than removing + the row, so the decision stays auditable and reversible. Suppressed + entries are skipped at injection (``get_relevant_context``) and at + consolidation. 
Idempotent and immutable; an unknown ``entry_id`` or an + already-suppressed entry returns ``self`` unchanged.""" + changed = False + entries: list[MemoryEntry] = [] + for entry in self._memory.entries: + if entry.entry_id == entry_id and not entry.suppressed: + entries.append( + entry.model_copy( + update={"suppressed": True, "suppressed_reason": reason} + ) + ) + changed = True + else: + entries.append(entry) + if not changed: + return self + new_memory = self._memory.model_copy(update={"entries": entries}) + return MemoryStore(new_memory) + def set_codebase_profile(self, key: str, value: str) -> MemoryStore: profile = {**self._memory.codebase_profile, key: value} new_memory = self._memory.model_copy(update={"codebase_profile": profile}) @@ -118,6 +143,8 @@ def get_relevant_context( ref_short = current_upstream_ref[:8] if current_upstream_ref else "" scored: dict[str, tuple[float, MemoryEntry]] = {} for entry in self._memory.entries: + if entry.suppressed: + continue path_score = score_path_overlap(file_paths, entry.file_paths) confidence = entry.confidence @@ -324,6 +351,12 @@ class (``c_class`` / ``conflict_decision``) shared across directories, so ungroupable: list[MemoryEntry] = [] for entry in entries: + # P1-A: suppressed entries pass through untouched — they must not be + # merged into a live blob (that would resurrect harmful content) nor + # silently dropped (audit trail must survive consolidation). + if entry.suppressed: + ungroupable.append(entry) + continue primary_tag = entry.tags[0] if entry.tags else "" key = ( entry.phase, diff --git a/src/models/config.py b/src/models/config.py index 872fe91..2afb0f7 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -970,6 +970,21 @@ class MemoryExtractionConfig(BaseModel): "(merge eval-memory). Extraction/write-back are unaffected — only " "read-time injection is suppressed. 
Default True (normal behaviour).", ) + persist_suppress: bool = Field( + default=False, + description="P1-A: at run end, persistently soft-delete (suppress) " + "memory entries judged stably harmful (>= suppress_min_observations " + "pass/fail observations with a net-negative outcome). Default OFF — " + "like OPP-5 write-back, suppression is cross-run durable and should " + "prove out on the eval-memory ablation before enabling. Never touches " + "human-decided or bootstrap (human-authored) entries.", + ) + suppress_min_observations: int = Field( + default=3, + ge=1, + description="P1-A: minimum pass+fail observations before a harmful " + "entry is persistently suppressed, so a single run cannot prune it.", + ) class RenameDetectionConfig(BaseModel): diff --git a/tests/unit/test_memory_suppress.py b/tests/unit/test_memory_suppress.py new file mode 100644 index 0000000..c3988d2 --- /dev/null +++ b/tests/unit/test_memory_suppress.py @@ -0,0 +1,236 @@ +"""P1-A: persistent soft-delete (suppress) across both stores + orchestrator固化. + +The O-M6 harmful filter is read-time and tracker-dependent — a lost sidecar +resurrects pruned entries. P1-A persists the prune as an auditable +``suppressed`` flag so it survives tracker loss, and blocks suppressed entries +from injection AND consolidation. 
+""" + +from __future__ import annotations + +import pytest + +from src.memory.models import MemoryEntry, MemoryEntryType +from src.memory.sqlite_store import SQLiteMemoryStore +from src.memory.store import MemoryStore, _consolidate_entries + + +def _entry(content: str, file_paths: list[str], confidence: float = 0.8) -> MemoryEntry: + return MemoryEntry( + entry_type=MemoryEntryType.DECISION, + phase="conflict_analysis", + content=content, + file_paths=file_paths, + confidence=confidence, + ) + + +# --- model ------------------------------------------------------------------ + + +def test_suppressed_defaults_false_and_hash_unchanged(): + a = _entry("x", ["a.py"]) + # suppressing must not change dedup identity (content_hash excludes the flag) + b = a.model_copy(update={"suppressed": True, "suppressed_reason": "harmful"}) + assert a.suppressed is False and a.suppressed_reason is None + assert b.suppressed is True and b.suppressed_reason == "harmful" + assert a.content_hash == b.content_hash + + +# --- MemoryStore ------------------------------------------------------------ + + +def test_memstore_suppress_is_immutable_and_marks_flag(): + e = _entry("bad", ["a.py"]) + store = MemoryStore().add_entry(e) + new = store.suppress_entry(e.entry_id, "stably harmful") + assert new is not store # new instance + assert store.to_memory().entries[0].suppressed is False # original untouched + marked = new.to_memory().entries[0] + assert marked.suppressed is True + assert marked.suppressed_reason == "stably harmful" + + +def test_memstore_suppress_unknown_and_double_are_noops(): + e = _entry("bad", ["a.py"]) + store = MemoryStore().add_entry(e) + assert store.suppress_entry("nope", "x") is store + once = store.suppress_entry(e.entry_id, "r") + assert once.suppress_entry(e.entry_id, "r2") is once # already suppressed + + +def test_memstore_suppressed_excluded_from_relevant(): + e = _entry("bad", ["a.py"]) + store = MemoryStore().add_entry(e) + assert 
store.get_relevant_context(["a.py"]) # visible before + suppressed = store.suppress_entry(e.entry_id, "harmful") + assert suppressed.get_relevant_context(["a.py"]) == [] + + +# --- consolidation ---------------------------------------------------------- + + +def test_consolidation_passes_suppressed_through_untouched(): + # 3 same-group live entries would merge; a suppressed sibling must survive + # standalone (not merged, not dropped) to keep the audit trail. + live = [_entry(f"c{i}", ["pkg/x/a.py"]) for i in range(3)] + suppressed = _entry("harmful", ["pkg/x/a.py"]).model_copy( + update={"suppressed": True, "suppressed_reason": "r"} + ) + out = _consolidate_entries([*live, suppressed]) + surviving = [e for e in out if e.suppressed] + assert len(surviving) == 1 + assert surviving[0].suppressed_reason == "r" + # the 3 live ones collapsed into a single consolidated blob + assert sum(1 for e in out if not e.suppressed) == 1 + + +# --- SQLiteMemoryStore ------------------------------------------------------ + + +def test_sqlite_suppress_persists_and_excludes(tmp_path): + db = tmp_path / "m.db" + store = SQLiteMemoryStore.open(db) + e = _entry("bad", ["a.py"]) + store.add_entry(e) + assert store.get_relevant_context(["a.py"]) + store.suppress_entry(e.entry_id, "harmful") + assert store.get_relevant_context(["a.py"]) == [] + # reopen: flag persisted on disk + reopened = SQLiteMemoryStore.open(db) + row = next(x for x in reopened.to_memory().entries if x.entry_id == e.entry_id) + assert row.suppressed is True and row.suppressed_reason == "harmful" + + +def test_sqlite_migration_adds_columns_to_legacy_db(tmp_path): + """A pre-P1-A schema (no suppressed columns) must migrate on open without + data loss, defaulting existing rows to suppressed=False.""" + import sqlite3 + + db = tmp_path / "legacy.db" + conn = sqlite3.connect(str(db)) + conn.executescript( + """ + CREATE TABLE memory_entries ( + entry_id TEXT PRIMARY KEY, entry_type TEXT NOT NULL, phase TEXT NOT NULL, + 
content TEXT NOT NULL, file_paths TEXT NOT NULL, tags TEXT NOT NULL, + confidence REAL NOT NULL, confidence_level TEXT NOT NULL, + content_hash TEXT NOT NULL, created_at TEXT NOT NULL + ); + INSERT INTO memory_entries VALUES + ('id1','decision','planning','legacy','["a.py"]','[]',0.8, + 'inferred','hash1','2026-01-01T00:00:00'); + """ + ) + conn.commit() + conn.close() + + store = SQLiteMemoryStore.open(db) # migration runs here + entries = store.to_memory().entries + assert len(entries) == 1 + assert entries[0].suppressed is False + # and suppression now works on the migrated row + store.suppress_entry("id1", "harmful") + assert store.get_relevant_context(["a.py"]) == [] + + +def test_sqlite_suppress_unknown_is_noop(tmp_path): + store = SQLiteMemoryStore.open(tmp_path / "m.db") + e = _entry("ok", ["a.py"]) + store.add_entry(e) + store.suppress_entry("nope", "x") # must not raise + assert store.get_relevant_context(["a.py"]) # untouched + + +# --- parity ----------------------------------------------------------------- + + +def test_both_stores_agree_suppressed_is_hidden(tmp_path): + e = _entry("bad", ["a.py"]) + mem = MemoryStore().add_entry(e).suppress_entry(e.entry_id, "r") + sq = SQLiteMemoryStore.open(tmp_path / "m.db") + sq.add_entry(e) + sq.suppress_entry(e.entry_id, "r") + assert mem.get_relevant_context(["a.py"]) == sq.get_relevant_context(["a.py"]) == [] + + +# --- orchestrator固化 (_apply_suppress_harmful_entries) ---------------------- + + +def _track_fails(tracker, entry_id: str, n: int) -> None: + for i in range(n): + f = f"{entry_id}-obs{i}" + tracker.record_injection([f], [entry_id]) + tracker.record_outcome(f, success=False) + + +def _orch(persist: bool, min_obs: int = 3): + from types import SimpleNamespace + + from src.core.orchestrator import Orchestrator + from src.memory.hit_tracker import MemoryHitTracker + from src.models.config import MemoryExtractionConfig + + orch = Orchestrator.__new__(Orchestrator) + orch._memory_hit_tracker = 
MemoryHitTracker() + orch._memory_store = MemoryStore() + orch.config = SimpleNamespace( + memory=MemoryExtractionConfig( + persist_suppress=persist, suppress_min_observations=min_obs + ) + ) + return orch + + +def test_persist_suppress_off_by_default(): + from src.models.config import MemoryExtractionConfig + + assert MemoryExtractionConfig().persist_suppress is False + + from types import SimpleNamespace + + orch = _orch(persist=False) + e = _entry("harm", ["src/a.py"]) + orch._memory_store = orch._memory_store.add_entry(e) + _track_fails(orch._memory_hit_tracker, e.entry_id, 3) + orch._apply_suppress_harmful_entries(SimpleNamespace(file_decision_records={})) + assert orch._memory_store.to_memory().entries[0].suppressed is False + + +def test_persist_suppress_marks_stable_harmful_skips_human_and_bootstrap(): + from types import SimpleNamespace + + from src.models.decision import DecisionSource + + orch = _orch(persist=True, min_obs=3) + harmful = _entry("harm", ["src/a.py"]) + human = _entry("human", ["src/secret.py"]) + boot = _entry("boot", []).model_copy(update={"tags": ["bootstrap"]}) + store = orch._memory_store + for e in (harmful, human, boot): + store = store.add_entry(e) + _track_fails(orch._memory_hit_tracker, e.entry_id, 3) + orch._memory_store = store + + state = SimpleNamespace( + file_decision_records={ + "src/secret.py": SimpleNamespace(decision_source=DecisionSource.HUMAN) + } + ) + orch._apply_suppress_harmful_entries(state) + + by_id = {e.entry_id: e for e in orch._memory_store.to_memory().entries} + assert by_id[harmful.entry_id].suppressed is True + assert by_id[human.entry_id].suppressed is False # human-decided exempt + assert by_id[boot.entry_id].suppressed is False # bootstrap exempt + + +def test_persist_suppress_respects_min_observations(): + from types import SimpleNamespace + + orch = _orch(persist=True, min_obs=3) + e = _entry("harm", ["src/a.py"]) + orch._memory_store = orch._memory_store.add_entry(e) + 
_track_fails(orch._memory_hit_tracker, e.entry_id, 2) # below threshold + orch._apply_suppress_harmful_entries(SimpleNamespace(file_decision_records={})) + assert orch._memory_store.to_memory().entries[0].suppressed is False From 6b4f905dacc0d9ea7c45a3aed3e5818502ee79c2 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 07:20:10 -0400 Subject: [PATCH 09/22] =?UTF-8?q?feat(memory):=20P1-B=20=E6=89=A7=E8=A1=8C?= =?UTF-8?q?=E6=8E=A5=E5=9C=B0=E5=86=99=E5=9B=9E=E4=BF=A1=E5=8F=B7=E8=9E=8D?= =?UTF-8?q?=E5=90=88=EF=BC=88judge=20+=20compile=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OPP-5 写回此前只用 judge pass/fail,而 record_outcome 又跑在 build_check 之前——一个 judge 判过但编译失败的合并仍把产生它的记忆记为 helpful。本提交 把每文件 outcome 记录从 judge_review 阶段集中到 orchestrator 的 judge_review 记忆钩子(此时 verdict 已反映 build_check 降级),并按确定性信号融合: - 新增 _record_memory_outcomes:默认 ["judge"] 与旧行为逐字节等价;含 "compile" 时,build_check_failed 的 run 把 judge-passed 的已编译语言文件 demote 为失败, 非编译文件(如 .md)不受牵连 - judge_review.py 移除 inline record_outcome(避免在 build_check 前误记) - 写回为双向(OPP-5 已有 +Δ/−Δ),harmful 跌破阈值由紧随其后的 P1-A _apply_suppress_harmful_entries 固化 suppress——无需新代码 - config.memory 增 writeback_signal_sources: list[Literal["judge","compile"]] (默认 ["judge"],opt-in 加固,全确定性不引 LLM 自报) CI/partial_failure 信号有意延后:post-merge 确定性发现在 report_generation 产出,晚于本钩子;完整融合需把写回迁到 report 阶段(见 plan P1-B) 7 新单测 + 3231 unit 绿(1 pre-existing 无关 docs 测试除外),mypy/ruff 干净 --- src/core/orchestrator.py | 33 ++++++ src/core/phases/judge_review.py | 14 ++- src/models/config.py | 15 ++- tests/unit/test_p1b_outcome_fusion.py | 140 ++++++++++++++++++++++++++ 4 files changed, 193 insertions(+), 9 deletions(-) create mode 100644 tests/unit/test_p1b_outcome_fusion.py diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py index 32f1d26..8fe7bc9 100644 --- a/src/core/orchestrator.py +++ b/src/core/orchestrator.py @@ -521,6 +521,10 @@ async def _update_memory(self, phase: str, state: MergeState) -> None: 
logger.warning("LLM memory extraction failed for %s: %s", phase, exc) if phase == "judge_review": + try: + self._record_memory_outcomes(state) + except Exception as exc: + logger.warning("Memory outcome recording failed: %s", exc) try: self._apply_outcome_confidence_writeback(state) except Exception as exc: @@ -530,6 +534,35 @@ async def _update_memory(self, phase: str, state: MergeState) -> None: except Exception as exc: logger.warning("Harmful-entry suppression failed: %s", exc) + def _record_memory_outcomes(self, state: MergeState) -> None: + """P1-B: fuse deterministic signals into the per-file memory outcome + that feeds OPP-5 write-back and P1-A suppression, then credit/blame the + entries injected for each file. + + Runs once after judge_review — the verdict already reflects the + post-judge build check. With the default ``["judge"]`` this reproduces + the prior passed/failed split byte-for-byte. Adding ``"compile"`` demotes + a judge-passed compiled-language file to a failure when the build check + failed this run, so memory that produced an uncompilable merge earns no + credit. Deterministic only — no LLM self-report.""" + verdict = state.judge_verdict + if verdict is None: + return + cfg = getattr(self.config, "memory", None) + sources = list(getattr(cfg, "writeback_signal_sources", None) or ["judge"]) + tracker = self._memory_hit_tracker + demoted: frozenset[str] = frozenset() + if "compile" in sources and any( + issue.issue_type == "build_check_failed" for issue in verdict.issues + ): + from src.tools.compile_gate import compiled_language_paths + + demoted = frozenset(compiled_language_paths(verdict.passed_files)) + for fp in verdict.passed_files: + tracker.record_outcome(fp, success=fp not in demoted) + for fp in verdict.failed_files: + tracker.record_outcome(fp, success=False) + def _apply_outcome_confidence_writeback(self, state: MergeState) -> None: """OPP-5: nudge persisted memory confidence toward judge outcomes. 
diff --git a/src/core/phases/judge_review.py b/src/core/phases/judge_review.py index 76f0b98..060699f 100644 --- a/src/core/phases/judge_review.py +++ b/src/core/phases/judge_review.py @@ -217,14 +217,12 @@ async def execute(self, state: MergeState, ctx: PhaseContext) -> PhaseOutcome: ) state.phase_results[MergePhase.JUDGE_REVIEW.value] = phase_result - # O-M4: credit/blame memory entries based on the final verdict's - # passed/failed file lists. Outcomes accumulate across runs via the - # tracker's sidecar JSON; future runs use them to bias confidence. - if state.judge_verdict is not None and ctx.memory_hit_tracker is not None: - for fp in state.judge_verdict.passed_files: - ctx.memory_hit_tracker.record_outcome(fp, success=True) - for fp in state.judge_verdict.failed_files: - ctx.memory_hit_tracker.record_outcome(fp, success=False) + # O-M4 / P1-B: credit/blame memory entries by the final verdict's + # passed/failed files. Recording is centralised in the orchestrator's + # post-judge_review memory hook (``_record_memory_outcomes``) so it sees + # the verdict AFTER the post-judge build check ran and can fuse the + # compile signal — recording here would run before build_check and + # credit a judge-passed but uncompilable file. 
gate_ok = await run_gates(state, ctx, "judge_review") if not gate_ok: diff --git a/src/models/config.py b/src/models/config.py index 2afb0f7..526b50d 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -1,7 +1,7 @@ from __future__ import annotations from datetime import datetime -from typing import Any, Literal, Optional +from typing import Any, Literal, Optional, cast from pydantic import BaseModel, Field, field_validator, model_validator @@ -961,6 +961,19 @@ class MemoryExtractionConfig(BaseModel): description="OPP-5: minimum pass+fail observations before an entry's " "confidence is nudged, so a single run cannot move it.", ) + writeback_signal_sources: list[Literal["judge", "compile"]] = Field( + default_factory=lambda: cast(list[Literal["judge", "compile"]], ["judge"]), + description="P1-B: deterministic signals fused into the per-file memory " + "outcome that drives OPP-5 write-back and P1-A suppression. 'judge' = " + "the Judge verdict's passed/failed split (default — byte-identical to " + "prior behaviour). Adding 'compile' demotes a judge-passed " + "compiled-language file to a failure when the post-judge build check " + "failed this run, so memory that produced an uncompilable merge is not " + "credited. All sources are deterministic — no LLM self-report. " + "CI/partial_failure fusion is deferred: the post-merge deterministic " + "findings land in report_generation, after this hook (see " + "doc/plan/self-learning-system.md P1-B).", + ) inject_enabled: bool = Field( default=True, description="P0 ablation switch: when False, no memory context is " diff --git a/tests/unit/test_p1b_outcome_fusion.py b/tests/unit/test_p1b_outcome_fusion.py new file mode 100644 index 0000000..80dfff8 --- /dev/null +++ b/tests/unit/test_p1b_outcome_fusion.py @@ -0,0 +1,140 @@ +"""P1-B: fuse deterministic signals (judge + compile) into the per-file memory +outcome that drives OPP-5 write-back / P1-A suppression. 
+ +Recording moved out of judge_review into the orchestrator's post-phase memory +hook so the verdict reflects the post-judge build check. With the default +``["judge"]`` the split is byte-identical to the old behaviour; adding +``"compile"`` demotes a judge-passed compiled-language file when the build +check failed this run. +""" + +from __future__ import annotations + +from datetime import datetime +from types import SimpleNamespace + +from src.core.orchestrator import Orchestrator +from src.memory.hit_tracker import MemoryHitTracker +from src.models.config import MemoryExtractionConfig +from src.models.judge import IssueSeverity, JudgeIssue, JudgeVerdict, VerdictType + + +def _verdict(passed, failed, *, build_failed: bool = False) -> JudgeVerdict: + issues = [] + if build_failed: + issues.append( + JudgeIssue( + file_path="(build)", + issue_level=IssueSeverity.CRITICAL, + issue_type="build_check_failed", + description="compile broke", + veto_condition="Build check failed", + ) + ) + return JudgeVerdict( + verdict=VerdictType.FAIL if build_failed else VerdictType.PASS, + reviewed_files_count=len(passed) + len(failed), + passed_files=list(passed), + failed_files=list(failed), + conditional_files=[], + issues=issues, + critical_issues_count=1 if build_failed else 0, + high_issues_count=0, + overall_confidence=0.9, + summary="x", + blocking_issues=[], + timestamp=datetime(2026, 1, 1), + judge_model="m", + ) + + +def _orch(sources: list[str], verdict: JudgeVerdict) -> Orchestrator: + orch = Orchestrator.__new__(Orchestrator) + orch._memory_hit_tracker = MemoryHitTracker() + orch.config = SimpleNamespace( + memory=MemoryExtractionConfig(writeback_signal_sources=sources) + ) + orch._verdict_for_test = verdict # convenience handle in assertions + return orch + + +def _state(verdict: JudgeVerdict) -> SimpleNamespace: + return SimpleNamespace(judge_verdict=verdict) + + +def _inject(tracker: MemoryHitTracker, file_path: str, entry_id: str) -> None: + 
tracker.record_injection([file_path], [entry_id]) + + +# --- config ----------------------------------------------------------------- + + +def test_default_sources_is_judge_only(): + assert MemoryExtractionConfig().writeback_signal_sources == ["judge"] + + +# --- judge-only equivalence ------------------------------------------------- + + +def test_judge_only_records_pass_and_fail_split(): + v = _verdict(["a.py"], ["b.py"]) + orch = _orch(["judge"], v) + _inject(orch._memory_hit_tracker, "a.py", "e_pass") + _inject(orch._memory_hit_tracker, "b.py", "e_fail") + + orch._record_memory_outcomes(_state(v)) + + scores = orch._memory_hit_tracker.outcome_scores(min_observations=1) + assert scores["e_pass"] == 1.0 + assert scores["e_fail"] == -1.0 + + +def test_compile_source_but_build_passed_is_judge_equivalent(): + v = _verdict(["a.go"], []) # no build_check_failed issue + orch = _orch(["judge", "compile"], v) + _inject(orch._memory_hit_tracker, "a.go", "e") + orch._record_memory_outcomes(_state(v)) + assert orch._memory_hit_tracker.outcome_scores(min_observations=1)["e"] == 1.0 + + +# --- compile fusion --------------------------------------------------------- + + +def test_compile_failure_demotes_compiled_passed_file(): + # judge passed a.go, but the build broke → a.go's memory is blamed, not + # credited, even though judge said pass. + v = _verdict(["a.go"], [], build_failed=True) + orch = _orch(["judge", "compile"], v) + _inject(orch._memory_hit_tracker, "a.go", "e_go") + orch._record_memory_outcomes(_state(v)) + assert orch._memory_hit_tracker.outcome_scores(min_observations=1)["e_go"] == -1.0 + + +def test_compile_failure_does_not_demote_non_compiled_passed_file(): + # a build break must not blame a Markdown file the compiler never touches. 
+ v = _verdict(["a.go", "README.md"], [], build_failed=True) + orch = _orch(["judge", "compile"], v) + _inject(orch._memory_hit_tracker, "a.go", "e_go") + _inject(orch._memory_hit_tracker, "README.md", "e_md") + orch._record_memory_outcomes(_state(v)) + scores = orch._memory_hit_tracker.outcome_scores(min_observations=1) + assert scores["e_go"] == -1.0 + assert scores["e_md"] == 1.0 # non-compiled → still credited + + +def test_compile_not_in_sources_keeps_judge_credit_on_build_fail(): + # build failed, but operator opted out of compile fusion → judge split wins. + v = _verdict(["a.go"], [], build_failed=True) + orch = _orch(["judge"], v) + _inject(orch._memory_hit_tracker, "a.go", "e_go") + orch._record_memory_outcomes(_state(v)) + assert orch._memory_hit_tracker.outcome_scores(min_observations=1)["e_go"] == 1.0 + + +# --- guards ----------------------------------------------------------------- + + +def test_no_verdict_is_noop(): + orch = _orch(["judge", "compile"], _verdict([], [])) + orch._record_memory_outcomes(SimpleNamespace(judge_verdict=None)) # must not raise + assert orch._memory_hit_tracker.outcome_scores(min_observations=1) == {} From 100a33763f8c3c380c952e1a3eef7e2c5a8497dc Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 07:24:47 -0400 Subject: [PATCH 10/22] =?UTF-8?q?chore(release):=20=E9=85=8D=E7=BD=AE=20Py?= =?UTF-8?q?PI=20=E5=8F=91=E5=B8=83=E5=85=83=E6=95=B0=E6=8D=AE=E4=B8=8E=20G?= =?UTF-8?q?itHub=20Actions=20release=20workflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - pyproject.toml 补充 description / readme / license / authors / classifiers / urls - 新增 LICENSE(MIT) - 新增 .github/workflows/release.yml:v* tag 触发 → 构建 Web UI → 打 wheel → Trusted Publishing 发布到 PyPI --- .github/workflows/release.yml | 61 +++++++++++++++++++++++++++++++++++ LICENSE | 21 ++++++++++++ pyproject.toml | 19 +++++++++++ 3 files changed, 101 insertions(+) create mode 100644 .github/workflows/release.yml create mode 
100644 LICENSE diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..de5f258 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,61 @@ +name: Release to PyPI + +on: + push: + tags: + - "v*" + +permissions: + contents: read + id-token: write # required for Trusted Publishing (OIDC) + +jobs: + build: + name: Build wheel + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: "20" + cache: npm + cache-dependency-path: web/package-lock.json + + - name: Build Web UI + working-directory: web + run: npm ci && npm run build + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Build wheel and sdist + run: | + pip install hatch + hatch build + + - name: Upload dist artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish: + name: Publish to PyPI + needs: build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/project/code-merge-system/ + steps: + - name: Download dist artifacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8fec473 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Angel + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or 
substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/pyproject.toml b/pyproject.toml index 82203b5..76fa56a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,20 @@ build-backend = "hatchling.build" [project] name = "code-merge-system" version = "0.1.0" +description = "AI-powered code merge agent with browser UI — plan, review, and resolve conflicts across long fork histories" +readme = "README.md" +license = { text = "MIT" } +authors = [{ name = "Angel", email = "angel.gosick@gmail.com" }] +keywords = ["merge", "git", "ai", "llm", "code-review", "conflict-resolution"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Version Control :: Git", +] requires-python = ">=3.11" dependencies = [ "pydantic>=2.5", @@ -22,6 +36,11 @@ dependencies = [ "platformdirs>=4.0", ] +[project.urls] +Homepage = "https://github.com/angel/code-merge-system" +Repository = "https://github.com/angel/code-merge-system" +Issues = "https://github.com/angel/code-merge-system/issues" + [project.scripts] merge = "src.cli.main:cli" From 6bc77c37f3613fcc6f1fcd9abe914b424bf1e0fc Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 07:25:38 -0400 Subject: [PATCH 11/22] =?UTF-8?q?feat(memory):=20P1-C=20verified-repair=20?= 
=?UTF-8?q?=E7=BB=8F=E9=AA=8C=E5=BA=93=EF=BC=88REPAIR=5FRECIPE=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 把执行期"用确定性算子修好并最终 judge PASS"的成功事件沉淀为可复用经验, 下次同类错误不再从零试错。provenance 全确定性——LLM 不参与"是否成功"判定。 - 新 MemoryEntryType.REPAIR_RECIPE - state.applied_repairs 记录"修复并放行"的确定性算子(当前唯一会修复而非升级的 算子:duplicate-symbol dedup;其余 foreign_chars/seam/hallucinated 均升级) - executor 在两处 dedup 命中点(语义合并 + 分块合并)经 _record_applied_repair 登记(按 file+operator 去重) - summarizer._build_repair_recipes:仅当算子触发且文件落入 judge passed_files 才铸 recipe,按 error_signature(error_class+operator+dir层) 去重、上限 20 - 读取走既有 get_relevant_context 通道(recipe 带 file_paths),无需新检索代码; 与 ScarListBuilder(历史人工坑)互补=运行期验证过的解法 - config.memory.repair_recipe_enabled 默认 True(纯加性、执行接地,风险低于 会致害的 P1-A/B,故默认开;可关用于 eval-memory 消融) 7 新单测 + 3238 unit 绿(1 pre-existing 无关 docs 测试除外),mypy/ruff 干净 --- src/agents/executor_agent.py | 29 ++++++ src/core/orchestrator.py | 5 +- src/memory/models.py | 1 + src/memory/summarizer.py | 63 +++++++++++- src/models/config.py | 10 ++ src/models/state.py | 10 ++ tests/unit/test_p1c_repair_recipe.py | 137 +++++++++++++++++++++++++++ 7 files changed, 253 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_p1c_repair_recipe.py diff --git a/src/agents/executor_agent.py b/src/agents/executor_agent.py index 984c5ed..34c0131 100644 --- a/src/agents/executor_agent.py +++ b/src/agents/executor_agent.py @@ -550,6 +550,12 @@ async def execute_semantic_merge( file_diff.file_path, ) merged_content = deduped + _record_applied_repair( + state, + file_diff.file_path, + "dedup_top_level_symbols", + "duplicate_top_level_symbol", + ) fidelity_reason = self._single_shot_fidelity_issue( file_diff.file_path, @@ -814,6 +820,12 @@ async def _execute_chunked_semantic_merge( file_path, ) merged_content = deduped + _record_applied_repair( + state, + file_path, + "dedup_top_level_symbols", + "duplicate_top_level_symbol", + ) # #10: a chunk seam can re-emit a JS/TS function 
def _record_applied_repair(
    state: MergeState, file_path: str, operator: str, error_class: str
) -> None:
    """P1-C: remember that a deterministic repair operator fixed a file.

    Appends ``{"file_path", "operator", "error_class"}`` to
    ``state.applied_repairs`` so the merge can proceed instead of escalating.
    The call is a no-op when an entry with the same ``file_path`` and
    ``operator`` is already present, so repeated chunk seams do not inflate
    the list. The judge_review summarizer later turns these records into
    REPAIR_RECIPE memory entries, but only for files the Judge passed.

    Args:
        state: merge state whose ``applied_repairs`` list is extended in place.
        file_path: path of the file the operator repaired.
        operator: name of the deterministic repair operator that fired.
        error_class: label of the error class the operator resolved.
    """
    already_logged = any(
        logged.get("file_path") == file_path and logged.get("operator") == operator
        for logged in state.applied_repairs
    )
    if already_logged:
        return
    record = {
        "file_path": file_path,
        "operator": operator,
        "error_class": error_class,
    }
    state.applied_repairs.append(record)
def _build_repair_recipes(self, state: MergeState) -> list[MemoryEntry]:
    """P1-C: mint REPAIR_RECIPE entries for repairs the Judge verified.

    Pure execution-grounding: a recipe is written only when a deterministic
    repair operator fired during the merge (``state.applied_repairs``) AND the
    repaired file is listed in ``judge_verdict.passed_files``. No LLM decides
    whether the repair worked.

    Recipes are de-duplicated by an error signature
    ``error_class:operator:dir_layer``, where ``dir_layer`` is made of at most
    the first two *directory* components of the file path (``"."`` for a file
    with no directory). The number of recipes is capped at
    ``_MAX_REPAIR_RECIPES``.

    Args:
        state: merge state; reads ``judge_verdict`` and ``applied_repairs``.

    Returns:
        The minted entries. Empty when recipes are disabled, there is no
        verdict, no repair was recorded, or no repaired file passed.
    """
    if not self._repair_recipe_enabled:
        return []
    verdict = state.judge_verdict
    if verdict is None or not state.applied_repairs:
        return []
    passed = set(verdict.passed_files)
    ref_tag = f"upstream_ref:{self._upstream_ref}" if self._upstream_ref else ""
    recipes: list[MemoryEntry] = []
    seen: set[str] = set()
    for repair in state.applied_repairs:
        fp = repair.get("file_path", "")
        if fp not in passed:
            continue
        operator = repair.get("operator", "unknown")
        error_class = repair.get("error_class", "unknown")
        # Drop the file name before taking the two leading components.
        # Slicing the full path made a file one directory deep ("pkg/a.go")
        # its own "directory layer", so siblings never shared a signature.
        # NOTE(review): splitting on os.sep assumes paths use the host
        # separator — confirm for '/'-style paths on Windows.
        dir_parts = fp.split(os.sep)[:-1]
        dir_layer = os.sep.join(dir_parts[:2]) if dir_parts else "."
        signature = f"{error_class}:{operator}:{dir_layer}"
        if signature in seen:
            continue
        seen.add(signature)
        tags = ["repair_recipe", error_class, operator, dir_layer]
        if ref_tag:
            tags.append(ref_tag)
        recipes.append(
            MemoryEntry(
                entry_type=MemoryEntryType.REPAIR_RECIPE,
                phase="judge_review",
                content=(
                    f"{error_class} in {dir_layer}: resolved deterministically "
                    f"by `{operator}`, verified by judge PASS — apply the same "
                    f"operator before escalating this error class."
                ),
                # Both the exact file and its dir-layer are listed as paths.
                file_paths=[fp, dir_layer],
                tags=tags,
                confidence=0.9,
                confidence_level=ConfidenceLevel.EXTRACTED,
            )
        )
        if len(recipes) >= _MAX_REPAIR_RECIPES:
            break
    return recipes
Read at " + "judge_review summarization to mint a verified REPAIR_RECIPE memory " + "entry only for files the Judge ultimately passed." + ), + ) judge_resolution: Literal["accept", "abort", "rerun"] | None = Field( default=None, description=( diff --git a/tests/unit/test_p1c_repair_recipe.py b/tests/unit/test_p1c_repair_recipe.py new file mode 100644 index 0000000..0e99cdd --- /dev/null +++ b/tests/unit/test_p1c_repair_recipe.py @@ -0,0 +1,137 @@ +"""P1-C: verified-repair recipe library. + +A deterministic repair operator (duplicate-symbol dedup) that fires AND whose +file the Judge ultimately passes mints a REPAIR_RECIPE memory entry, keyed by an +error_signature so a later run that opens a sibling file retrieves +"this error class was resolved here by operator X, verified by judge PASS". +Pure execution-grounding — no LLM decides success. +""" + +from __future__ import annotations + +from datetime import datetime + +from src.agents.executor_agent import _record_applied_repair +from src.memory.models import MemoryEntryType +from src.memory.store import MemoryStore +from src.memory.summarizer import PhaseSummarizer +from src.models.judge import JudgeVerdict, VerdictType + + +def _verdict(passed: list[str], failed: list[str]) -> JudgeVerdict: + return JudgeVerdict( + verdict=VerdictType.PASS if not failed else VerdictType.FAIL, + reviewed_files_count=len(passed) + len(failed), + passed_files=list(passed), + failed_files=list(failed), + conditional_files=[], + issues=[], + critical_issues_count=0, + high_issues_count=0, + overall_confidence=0.9, + summary="x", + blocking_issues=[], + timestamp=datetime(2026, 1, 1), + judge_model="m", + ) + + +class _State: + """Minimal stand-in carrying the fields the summarizer reads.""" + + def __init__(self, verdict, applied_repairs): + self.judge_verdict = verdict + self.applied_repairs = applied_repairs + self.judge_verdicts_log = [] + self.judge_repair_rounds = 0 + + +# --- executor recording 
----------------------------------------------------- + + +def test_record_applied_repair_dedups_per_file_operator(): + state = _State(None, []) + _record_applied_repair(state, "a.go", "dedup_top_level_symbols", "dup_symbol") + _record_applied_repair(state, "a.go", "dedup_top_level_symbols", "dup_symbol") + assert state.applied_repairs == [ + { + "file_path": "a.go", + "operator": "dedup_top_level_symbols", + "error_class": "dup_symbol", + } + ] + + +# --- summarizer minting ----------------------------------------------------- + + +def _repairs(*files: str) -> list[dict[str, str]]: + return [ + { + "file_path": f, + "operator": "dedup_top_level_symbols", + "error_class": "duplicate_top_level_symbol", + } + for f in files + ] + + +def test_recipe_minted_only_for_judge_passed_file(): + state = _State( + _verdict(passed=["pkg/x/a.go"], failed=["pkg/y/b.go"]), + _repairs("pkg/x/a.go", "pkg/y/b.go"), + ) + _, entries = PhaseSummarizer().summarize_judge_review(state) # type: ignore[arg-type] + recipes = [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE] + assert len(recipes) == 1 + r = recipes[0] + assert "pkg/x/a.go" in r.file_paths + assert "duplicate_top_level_symbol" in r.tags + assert "dedup_top_level_symbols" in r.tags + # the failed file earns no recipe + assert all("pkg/y/b.go" not in e.file_paths for e in recipes) + + +def test_recipe_signature_deduped_across_same_dir_layer(): + # two passed files in the same dir-layer with the same operator/error → + # one recipe (the error_signature collapses them). 
+ state = _State( + _verdict(passed=["pkg/x/a.go", "pkg/x/b.go"], failed=[]), + _repairs("pkg/x/a.go", "pkg/x/b.go"), + ) + _, entries = PhaseSummarizer().summarize_judge_review(state) # type: ignore[arg-type] + recipes = [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE] + assert len(recipes) == 1 + + +def test_no_recipe_when_disabled(): + state = _State(_verdict(passed=["pkg/x/a.go"], failed=[]), _repairs("pkg/x/a.go")) + _, entries = PhaseSummarizer(repair_recipe_enabled=False).summarize_judge_review( + state + ) # type: ignore[arg-type] + assert not [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE] + + +def test_no_recipe_without_applied_repairs(): + state = _State(_verdict(passed=["pkg/x/a.go"], failed=[]), []) + _, entries = PhaseSummarizer().summarize_judge_review(state) # type: ignore[arg-type] + assert not [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE] + + +def test_no_recipe_without_verdict(): + state = _State(None, _repairs("pkg/x/a.go")) + _, entries = PhaseSummarizer().summarize_judge_review(state) # type: ignore[arg-type] + assert not [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE] + + +# --- retrieval (existing memory channel) ------------------------------------ + + +def test_recipe_is_retrievable_for_matching_file(): + state = _State(_verdict(passed=["pkg/x/a.go"], failed=[]), _repairs("pkg/x/a.go")) + _, entries = PhaseSummarizer().summarize_judge_review(state) # type: ignore[arg-type] + store = MemoryStore() + for e in entries: + store = store.add_entry(e) + hits = store.get_relevant_context(["pkg/x/a.go"]) + assert any(h.entry_type == MemoryEntryType.REPAIR_RECIPE for h in hits) From 53b963e8f1ae82d34197459dbf1f4be3a36d55b8 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 07:26:54 -0400 Subject: [PATCH 12/22] =?UTF-8?q?docs(plan):=20=E6=A0=87=E6=B3=A8=20Phase?= =?UTF-8?q?=201=20A/B/C=20=E8=90=BD=E5=9C=B0=E7=8A=B6=E6=80=81=E4=B8=8E=20?= 
=?UTF-8?q?P1-B=20ci=20=E5=BB=B6=E5=90=8E=E5=81=8F=E5=B7=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/plan/self-learning-system.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/plan/self-learning-system.md b/doc/plan/self-learning-system.md index 37d65a9..5df7079 100644 --- a/doc/plan/self-learning-system.md +++ b/doc/plan/self-learning-system.md @@ -154,6 +154,15 @@ ### Phase 1 —— 闭合执行接地反馈环(最高 ROI) +> **落地状态(2026-05-31,feat/web)**:A/B/C 全部实装(`b83d142`/`6b4f905`/`6bc77c3`)。 +> A、B 的反馈环按 P2「先度量再激活」默认 **opt-in(False)**——`memory.persist_suppress`、 +> `memory.writeback_signal_sources` 默认 `["judge"]`(=旧行为),需 `merge eval-memory` +> 多 run 基线证明净收益为正(§3 激活门:`MDL>0` 且 `memory_harmed=0`)方可翻默认。 +> C 为纯加性、执行接地,默认 **True**(`memory.repair_recipe_enabled`)。 +> **B 偏差**:CI/partial_failure 信号有意延后——它在 `report_generation` 产出,晚于 +> judge_review 记忆钩子;完整融合需把写回迁到 report 阶段(未做)。故 B 现仅 +> `judge + compile` 两源。 + > 对应研究最强三条证据:选择性 add+**delete** +10%(F2)、执行接地 >> 自反思(范式2)、Experience 抽象(范式5)。拆三个可独立评审的子项。 #### P1-A 把临时软删(O-M6)巩固为持久、可审计的 suppress(原则 P3) From 452500862cab7d4652706cf476914bb7b1acf10e Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 07:37:58 -0400 Subject: [PATCH 13/22] =?UTF-8?q?feat(memory):=20P2-A=20=E9=AB=98=E4=BF=A1?= =?UTF-8?q?=E6=81=AF=E6=9D=A1=E7=9B=AE=E5=BC=BA=E5=88=B6=EF=BC=88anti-pois?= =?UTF-8?q?oning=20=E5=AF=B9=E5=81=B6=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _is_epistemically_empty 拒"模型放弃"标记;本提交补其对偶——拒"空泛无动作" 条目(Renze & Guven:反思信息量是效果杠杆,GPT-4 0.79→0.93)。 - 新 src/memory/content_quality.py:is_actionable_content 仅审 DECISION/ REPAIR_RECIPE(PATTERN/摘要类豁免),保守判据默认 True,仅明确空泛填充词/ 过短无动作才 False;enforce_actionable 不可变降级(降 HEURISTIC + 折半 confidence,de-rank 而非删,保召回) - orchestrator._update_memory 两个入库点(summarizer + memory_extractor llm 出口)统一过 enforce_actionable 8 新单测;mypy/ruff 干净 --- src/core/orchestrator.py | 5 +- 
src/memory/content_quality.py | 76 +++++++++++++++++++++++ tests/unit/test_p2a_content_quality.py | 84 ++++++++++++++++++++++++++ 3 files changed, 163 insertions(+), 2 deletions(-) create mode 100644 src/memory/content_quality.py create mode 100644 tests/unit/test_p2a_content_quality.py diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py index 26bcc0b..5c5ce0b 100644 --- a/src/core/orchestrator.py +++ b/src/core/orchestrator.py @@ -54,6 +54,7 @@ from src.core.coordinator import Coordinator from src.core.state_machine import StateMachine from src.memory.bootstrap import _BOOTSTRAP_TAG, bootstrap_from_claude_md +from src.memory.content_quality import enforce_actionable from src.memory.hit_tracker import MemoryHitTracker from src.memory.sqlite_store import SQLiteMemoryStore from src.memory.store import MemoryStore @@ -496,7 +497,7 @@ async def _update_memory(self, phase: str, state: MergeState) -> None: phase_summary, entries = method(state) store = self._memory_store.record_phase_summary(phase_summary) for entry in entries: - store = store.add_entry(entry) + store = store.add_entry(enforce_actionable(entry)) count_before = store.entry_count store = store.remove_superseded(phase) removed = count_before - store.entry_count @@ -517,7 +518,7 @@ async def _update_memory(self, phase: str, state: MergeState) -> None: llm_entries = await self.memory_extractor.extract(phase, state) # type: ignore[attr-defined] store = self._memory_store for entry in llm_entries: - store = store.add_entry(entry) + store = store.add_entry(enforce_actionable(entry)) self._memory_store = store self._phases_since_last_extract = 0 except Exception as exc: diff --git a/src/memory/content_quality.py b/src/memory/content_quality.py new file mode 100644 index 0000000..2949e9d --- /dev/null +++ b/src/memory/content_quality.py @@ -0,0 +1,76 @@ +"""P2-A: high-information entry enforcement. 
"""P2-A: high-information entry enforcement.

The dual of ``_is_epistemically_empty`` (which rejects "model gave up" markers):
this flags entries that are *vacuous* — they name a file but carry no concrete
action, decision, or fix. Renze & Guven (arXiv 2405.06682) show reflection
*information content* is what drives the effect (GPT-4 0.79 -> 0.93), so a memory
that says nothing specific is dead weight that dilutes retrieval.

Conservative by design: content is treated as actionable unless it is clearly
filler. Non-actionable entries are *de-ranked* (confidence level and score
lowered), never deleted, so recall is preserved.
"""

from __future__ import annotations

import re

from src.memory.models import ConfidenceLevel, MemoryEntry, MemoryEntryType

# Entry types whose value is their specific action/decision/fix. PATTERN /
# PHASE_SUMMARY / CODEBASE_INSIGHT are intentionally exempt — a terse pattern
# label ("recurring reverse_impact") is legitimately short.
_ACTIONABLE_TYPES = frozenset(
    {
        MemoryEntryType.DECISION,
        MemoryEntryType.REPAIR_RECIPE,
    }
)

# Filler that carries no information once the leading prefix is stripped.
# Matched with ``Pattern.match`` against the lowercased body.
_VACUOUS_PATTERNS: tuple[re.Pattern[str], ...] = (
    re.compile(r"^(decision\s+made|reviewed|processed|handled|done|ok|n/?a|none)\.?$"),
    re.compile(r"^(no\s+(notes?|details?|specifics?|action|change)s?)\b"),
    re.compile(r"^(tbd|todo|unknown|see\s+above|as\s+noted)\.?$"),
)

# Bodies shorter than this are considered too short to describe an action.
_MIN_SUBSTANCE_CHARS = 8

# De-ranked entries are never pushed below this score by halving.
_DERANK_FLOOR = 0.1


def _substance(content: str) -> str:
    """Return the informative body of ``content``, lowercased and stripped.

    The body is the text after the first ``": "`` (the part before it is
    treated as a ``path: `` style prefix); when there is no ``": "`` the whole
    content is the body.
    """
    _, sep, tail = content.partition(": ")
    body = tail if sep else content
    return body.strip().lower()


def is_actionable_content(content: str, entry_type: MemoryEntryType) -> bool:
    """Return True when ``content`` carries a concrete action/decision/fix.

    Only entry types in ``_ACTIONABLE_TYPES`` are scrutinised; every other type
    is actionable by definition. A scrutinised entry is rejected when its body
    is shorter than ``_MIN_SUBSTANCE_CHARS`` or matches a vacuous-filler
    pattern.
    """
    if entry_type not in _ACTIONABLE_TYPES:
        return True
    body = _substance(content)
    if len(body) < _MIN_SUBSTANCE_CHARS:
        return False
    return not any(pat.match(body) for pat in _VACUOUS_PATTERNS)


def enforce_actionable(entry: MemoryEntry) -> MemoryEntry:
    """Return ``entry`` unchanged when actionable, else a de-ranked copy.

    De-rank = set ``confidence_level`` to HEURISTIC and halve ``confidence``
    (not below ``_DERANK_FLOOR``). The input is never mutated; the same object
    is returned when nothing needs to change.

    The halved score is additionally clamped to the entry's current score:
    previously an entry whose confidence was already under the floor (e.g.
    0.05 at EXTRACTED level) came back with confidence 0.1, i.e. the
    "de-rank" raised it.
    """
    if is_actionable_content(entry.content, entry.entry_type):
        return entry
    if (
        entry.confidence_level == ConfidenceLevel.HEURISTIC
        and entry.confidence <= _DERANK_FLOOR
    ):
        # Already at (or below) the floor — returning the same object keeps
        # the operation idempotent.
        return entry
    halved = max(_DERANK_FLOOR, round(entry.confidence * 0.5, 4))
    return entry.model_copy(
        update={
            "confidence_level": ConfidenceLevel.HEURISTIC,
            # A de-rank must never increase the score.
            "confidence": min(entry.confidence, halved),
        }
    )
test_decision_with_concrete_action_is_actionable(): + assert is_actionable_content( + "src/a.py: take_target [import_conflict] confidence=0.90 — keep upstream auth", + MemoryEntryType.DECISION, + ) + + +def test_decision_vacuous_filler_not_actionable(): + for body in ("src/a.py: decision made", "src/a.py: n/a", "src/a.py: no notes"): + assert not is_actionable_content(body, MemoryEntryType.DECISION) + + +def test_decision_too_short_not_actionable(): + assert not is_actionable_content("src/a.py: ok", MemoryEntryType.DECISION) + + +def test_pattern_type_is_exempt(): + # a terse PATTERN label is legitimately short — never flagged + assert is_actionable_content("ok", MemoryEntryType.PATTERN) + assert is_actionable_content( + "recurring reverse_impact", MemoryEntryType.PHASE_SUMMARY + ) + + +def test_repair_recipe_scrutinised(): + assert not is_actionable_content("x: n/a", MemoryEntryType.REPAIR_RECIPE) + assert is_actionable_content( + "dup_symbol in pkg/x: resolved by dedup, verified by judge PASS", + MemoryEntryType.REPAIR_RECIPE, + ) + + +# --- enforce_actionable ----------------------------------------------------- + + +def test_actionable_entry_returned_unchanged(): + e = _entry("src/a.py: semantic_merge — merged both auth handlers cleanly") + assert enforce_actionable(e) is e + + +def test_vacuous_entry_is_deranked_not_dropped(): + e = _entry("src/a.py: decision made", confidence=0.85) + out = enforce_actionable(e) + assert out is not e + assert out.confidence_level == ConfidenceLevel.HEURISTIC + assert out.confidence == 0.425 # 0.85 * 0.5 + assert out.content == e.content # content preserved, only rank lowered + assert out.content_hash == e.content_hash # identity stable + + +def test_derank_is_idempotent_at_floor(): + e = _entry("src/a.py: n/a", confidence=0.1, level=ConfidenceLevel.HEURISTIC) + assert enforce_actionable(e) is e # already at floor → no new object + + +def test_exempt_type_never_deranked(): + e = _entry("ok", entry_type=MemoryEntryType.PATTERN, 
confidence=0.8) + assert enforce_actionable(e) is e From 2af4890819441456f9fc265cf3c95fb335f5c1fd Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 07:37:58 -0400 Subject: [PATCH 14/22] =?UTF-8?q?feat(memory):=20P2-B=20=E5=85=B3=E9=94=AE?= =?UTF-8?q?=E4=B8=8D=E5=8F=98=E9=87=8F=E9=94=9A=E5=AE=9A=EF=BC=8C=E9=98=B2?= =?UTF-8?q?=E6=91=98=E8=A6=81=E6=BC=82=E7=A7=BB=EF=BC=88F1=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit consolidation 把同组条目摘要成一个 blob,会漂移关键条目内容(F1)。本提交给 验证过的解法与人工决策打 pinned,consolidation 对其 passthrough 不再摘要。 - MemoryEntry.pinned: bool;SQLite 加 pinned 列 + ALTER TABLE 迁移旧库(镜像 P1-A suppressed 的迁移/行映射) - _consolidate_entries 跳过 pinned(与 suppressed 同 passthrough 分支) - summarizer 标 pinned=True:REPAIR_RECIPE(验证过的解法)+ judge_review 中 decision_source∈{HUMAN,BATCH_HUMAN} 的文件 DECISION 条目 - security-sensitive 锚定延后(summarizer 无 config patterns,plumbing 已就位) 8 新单测(含 F1 多轮 consolidation 零损失断言 + 迁移);test_p1c 桩补 file_decision_records;mypy/ruff 干净 --- src/memory/models.py | 1 + src/memory/sqlite_store.py | 13 +- src/memory/store.py | 4 +- src/memory/summarizer.py | 12 +- tests/unit/test_p1c_repair_recipe.py | 1 + tests/unit/test_p2b_pinned_anchor.py | 175 +++++++++++++++++++++++++++ 6 files changed, 201 insertions(+), 5 deletions(-) create mode 100644 tests/unit/test_p2b_pinned_anchor.py diff --git a/src/memory/models.py b/src/memory/models.py index 4f2c7ec..9a7b8a4 100644 --- a/src/memory/models.py +++ b/src/memory/models.py @@ -37,6 +37,7 @@ class MemoryEntry(BaseModel, frozen=True): created_at: datetime = Field(default_factory=datetime.now) suppressed: bool = Field(default=False) suppressed_reason: str | None = Field(default=None) + pinned: bool = Field(default=False) def model_post_init(self, __context: Any) -> None: if not self.content_hash: diff --git a/src/memory/sqlite_store.py b/src/memory/sqlite_store.py index 202a208..c68e411 100644 --- a/src/memory/sqlite_store.py +++ b/src/memory/sqlite_store.py 
@@ -37,7 +37,8 @@ content_hash TEXT NOT NULL, created_at TEXT NOT NULL, suppressed INTEGER NOT NULL DEFAULT 0, - suppressed_reason TEXT + suppressed_reason TEXT, + pinned INTEGER NOT NULL DEFAULT 0 ); CREATE UNIQUE INDEX IF NOT EXISTS idx_content_hash ON memory_entries (content_hash); @@ -55,8 +56,8 @@ INSERT OR IGNORE INTO memory_entries (entry_id, entry_type, phase, content, file_paths, tags, confidence, confidence_level, content_hash, created_at, - suppressed, suppressed_reason) -VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + suppressed, suppressed_reason, pinned) +VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """ # P1-A: columns added after the original schema shipped; older memory.db files @@ -71,6 +72,10 @@ "suppressed_reason", "ALTER TABLE memory_entries ADD COLUMN suppressed_reason TEXT", ), + ( + "pinned", + "ALTER TABLE memory_entries ADD COLUMN pinned INTEGER NOT NULL DEFAULT 0", + ), ) _PHASE_ORDER = { @@ -95,6 +100,7 @@ def _entry_to_row(entry: MemoryEntry) -> tuple[str | int | None, ...]: entry.created_at.isoformat(), 1 if entry.suppressed else 0, entry.suppressed_reason, + 1 if entry.pinned else 0, ) @@ -115,6 +121,7 @@ def _row_to_entry(row: sqlite3.Row) -> MemoryEntry: suppressed_reason=( row["suppressed_reason"] if "suppressed_reason" in keys else None ), + pinned=bool(row["pinned"]) if "pinned" in keys else False, ) diff --git a/src/memory/store.py b/src/memory/store.py index 16199d4..b4c43fb 100644 --- a/src/memory/store.py +++ b/src/memory/store.py @@ -354,7 +354,9 @@ class (``c_class`` / ``conflict_decision``) shared across directories, so # P1-A: suppressed entries pass through untouched — they must not be # merged into a live blob (that would resurrect harmful content) nor # silently dropped (audit trail must survive consolidation). - if entry.suppressed: + # P2-B: pinned entries (verified REPAIR_RECIPE / human decisions) also + # pass through verbatim so consolidation cannot drift their content (F1). 
+ if entry.suppressed or entry.pinned: ungroupable.append(entry) continue primary_tag = entry.tags[0] if entry.tags else "" diff --git a/src/memory/summarizer.py b/src/memory/summarizer.py index 0b99ad5..1292c58 100644 --- a/src/memory/summarizer.py +++ b/src/memory/summarizer.py @@ -11,7 +11,7 @@ MemoryEntryType, PhaseSummary, ) -from src.models.decision import FileDecisionRecord +from src.models.decision import DecisionSource, FileDecisionRecord from src.models.state import MergeState logger = logging.getLogger(__name__) @@ -335,6 +335,14 @@ def summarize_judge_review( # future runs see which files needed repair and why. if state.judge_verdict is not None: ref_tag = f"upstream_ref:{self._upstream_ref}" if self._upstream_ref else "" + # P2-B: pin human-decided files so consolidation cannot drift the + # record of an explicit operator decision (F1). + human_files = { + fp + for fp, rec in state.file_decision_records.items() + if rec.decision_source + in (DecisionSource.HUMAN, DecisionSource.BATCH_HUMAN) + } issues_by_file: dict[str, list[str]] = {} for issue in state.judge_verdict.issues: issues_by_file.setdefault(issue.file_path, []).append(issue.issue_type) @@ -355,6 +363,7 @@ def summarize_judge_review( tags=tags, confidence=0.85, confidence_level=ConfidenceLevel.EXTRACTED, + pinned=fp in human_files, ) ) @@ -416,6 +425,7 @@ def _build_repair_recipes(self, state: MergeState) -> list[MemoryEntry]: tags=tags, confidence=0.9, confidence_level=ConfidenceLevel.EXTRACTED, + pinned=True, ) ) if len(recipes) >= _MAX_REPAIR_RECIPES: diff --git a/tests/unit/test_p1c_repair_recipe.py b/tests/unit/test_p1c_repair_recipe.py index 0e99cdd..20eee97 100644 --- a/tests/unit/test_p1c_repair_recipe.py +++ b/tests/unit/test_p1c_repair_recipe.py @@ -44,6 +44,7 @@ def __init__(self, verdict, applied_repairs): self.applied_repairs = applied_repairs self.judge_verdicts_log = [] self.judge_repair_rounds = 0 + self.file_decision_records = {} # --- executor recording 
----------------------------------------------------- diff --git a/tests/unit/test_p2b_pinned_anchor.py b/tests/unit/test_p2b_pinned_anchor.py new file mode 100644 index 0000000..f678e4a --- /dev/null +++ b/tests/unit/test_p2b_pinned_anchor.py @@ -0,0 +1,175 @@ +"""P2-B: pin key invariants so consolidation cannot drift them (F1 guard).""" + +from __future__ import annotations + +from datetime import datetime + +from src.memory.models import ConfidenceLevel, MemoryEntry, MemoryEntryType +from src.memory.sqlite_store import SQLiteMemoryStore +from src.memory.store import _consolidate_entries +from src.memory.summarizer import PhaseSummarizer +from src.models.decision import ( + DecisionSource, + FileDecisionRecord, + MergeDecision, +) +from src.models.diff import FileStatus +from src.models.judge import IssueSeverity, JudgeIssue, JudgeVerdict, VerdictType + + +def _entry(content: str, *, pinned: bool = False, tag: str = "t") -> MemoryEntry: + return MemoryEntry( + entry_type=MemoryEntryType.DECISION, + phase="conflict_analysis", + content=content, + file_paths=["pkg/x/a.py"], + tags=[tag], + confidence=0.8, + confidence_level=ConfidenceLevel.EXTRACTED, + pinned=pinned, + ) + + +# --- model ------------------------------------------------------------------ + + +def test_pinned_defaults_false(): + assert _entry("x").pinned is False + + +# --- consolidation F1 guard ------------------------------------------------- + + +def test_pinned_entry_survives_consolidation_verbatim(): + # 3 same-group live entries would merge into one lossy blob; a pinned + # sibling in the same group must pass through with content intact. 
+ live = [_entry(f"c{i}") for i in range(3)] + pinned = _entry("CRITICAL: take_current on auth — never drift", pinned=True) + out = _consolidate_entries([*live, pinned]) + survivors = [e for e in out if e.pinned] + assert len(survivors) == 1 + assert survivors[0].content == "CRITICAL: take_current on auth — never drift" + # the 3 live ones still collapsed + assert sum(1 for e in out if not e.pinned) == 1 + + +def test_sqlite_pinned_persists_and_survives_consolidation(tmp_path): + store = SQLiteMemoryStore.open(tmp_path / "m.db") + pinned = _entry("pinned recipe", pinned=True) + store.add_entry(pinned) + reopened = SQLiteMemoryStore.open(tmp_path / "m.db") + row = next(e for e in reopened.to_memory().entries if e.entry_id == pinned.entry_id) + assert row.pinned is True + + +def test_sqlite_legacy_db_migrates_pinned_column(tmp_path): + import sqlite3 + + db = tmp_path / "legacy.db" + conn = sqlite3.connect(str(db)) + conn.executescript( + """ + CREATE TABLE memory_entries ( + entry_id TEXT PRIMARY KEY, entry_type TEXT NOT NULL, phase TEXT NOT NULL, + content TEXT NOT NULL, file_paths TEXT NOT NULL, tags TEXT NOT NULL, + confidence REAL NOT NULL, confidence_level TEXT NOT NULL, + content_hash TEXT NOT NULL, created_at TEXT NOT NULL + ); + INSERT INTO memory_entries VALUES + ('id1','decision','planning','legacy','["a.py"]','[]',0.8, + 'inferred','hash1','2026-01-01T00:00:00'); + """ + ) + conn.commit() + conn.close() + store = SQLiteMemoryStore.open(db) # migration runs here + entries = store.to_memory().entries + assert len(entries) == 1 + assert entries[0].pinned is False + + +# --- summarizer pinning ----------------------------------------------------- + + +class _State: + def __init__(self, verdict, records): + self.judge_verdict = verdict + self.applied_repairs = [] + self.judge_verdicts_log = [] + self.judge_repair_rounds = 0 + self.file_decision_records = records + + +def _verdict(passed, failed): + issues = [ + JudgeIssue( + file_path=f, + 
issue_level=IssueSeverity.HIGH, + issue_type="reverse_impact_unhandled", + description="x", + ) + for f in failed + ] + return JudgeVerdict( + verdict=VerdictType.FAIL if failed else VerdictType.PASS, + reviewed_files_count=len(passed) + len(failed), + passed_files=list(passed), + failed_files=list(failed), + conditional_files=[], + issues=issues, + critical_issues_count=0, + high_issues_count=len(failed), + overall_confidence=0.9, + summary="x", + blocking_issues=[], + timestamp=datetime(2026, 1, 1), + judge_model="m", + ) + + +def _human_record(fp: str) -> FileDecisionRecord: + return FileDecisionRecord( + file_path=fp, + file_status=FileStatus.MODIFIED, + decision=MergeDecision.TAKE_CURRENT, + decision_source=DecisionSource.HUMAN, + rationale="operator kept fork auth on this security-sensitive file", + ) + + +def test_repair_recipe_entries_are_pinned(): + state = _State(_verdict(["pkg/x/a.go"], []), {}) + state.applied_repairs = [ + { + "file_path": "pkg/x/a.go", + "operator": "dedup_top_level_symbols", + "error_class": "duplicate_top_level_symbol", + } + ] + _, entries = PhaseSummarizer().summarize_judge_review(state) # type: ignore[arg-type] + recipes = [e for e in entries if e.entry_type == MemoryEntryType.REPAIR_RECIPE] + assert recipes and all(r.pinned for r in recipes) + + +def test_human_decided_judge_fail_entry_is_pinned(): + fp = "pkg/x/secret.py" + state = _State(_verdict([], [fp]), {fp: _human_record(fp)}) + _, entries = PhaseSummarizer().summarize_judge_review(state) # type: ignore[arg-type] + decisions = [ + e + for e in entries + if e.entry_type == MemoryEntryType.DECISION and fp in e.file_paths + ] + assert decisions and all(d.pinned for d in decisions) + + +def test_non_human_judge_fail_entry_not_pinned(): + fp = "pkg/x/auto.py" + state = _State(_verdict([], [fp]), {}) # no human record + _, entries = PhaseSummarizer().summarize_judge_review(state) # type: ignore[arg-type] + decisions = [ + e + for e in entries + if e.entry_type == 
MemoryEntryType.DECISION and fp in e.file_paths + ] + assert decisions and not any(d.pinned for d in decisions) From 08e3f9eb5b85a4bd97e69ef17e4ac764beaf2099 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 07:38:37 -0400 Subject: [PATCH 15/22] =?UTF-8?q?docs(plan):=20=E6=A0=87=E6=B3=A8=20Phase?= =?UTF-8?q?=202=20A/B=20=E8=90=BD=E5=9C=B0=E7=8A=B6=E6=80=81=E4=B8=8E=20se?= =?UTF-8?q?curity-sensitive=20=E9=94=9A=E5=AE=9A=E5=BB=B6=E5=90=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/plan/self-learning-system.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/plan/self-learning-system.md b/doc/plan/self-learning-system.md index 5df7079..5995507 100644 --- a/doc/plan/self-learning-system.md +++ b/doc/plan/self-learning-system.md @@ -211,6 +211,13 @@ ### Phase 2 —— 记忆质量加固(中等 ROI,便宜) +> **落地状态(2026-05-31,feat/web)**:A/B 全部实装(`4525008`/`2af4890`)。 +> A(`content_quality.is_actionable_content`/`enforce_actionable`)保守降级而非删, +> 默认随入库即生效;B(`MemoryEntry.pinned`)锚定 REPAIR_RECIPE + 人工决策条目, +> consolidation 对其 passthrough。**B 偏差**:security-sensitive 锚定延后—— +> summarizer 无 config 的 `security_sensitive.patterns`;`pinned` 字段已就位, +> 需后续在有 config 的入库点补标。 + **P2-A 高信息条目强制**(范式2,GPT-4 0.79→0.93 的直接杠杆) - 扩展 `_is_epistemically_empty` 的对偶:`_has_actionable_content()`——DECISION/REPAIR_RECIPE 类条目若缺"具体动作/修复"则降级或拒写。 - 接入 `summarizer.py` 各 `summarize_*` 与 `memory_extractor` 出口。 From f5406130086fd15e5036316abba0a60f3c6dce92 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 09:44:16 -0400 Subject: [PATCH 16/22] =?UTF-8?q?feat(memory):=20Phase=203=20=E7=A6=BB?= =?UTF-8?q?=E7=BA=BF=E6=8F=90=E7=A4=BA=E4=BC=98=E5=8C=96=20harness=20+=20o?= =?UTF-8?q?pt-in=20CLI=EF=BC=88=E4=B8=8D=E8=87=AA=E5=8A=A8=E6=94=B9?= =?UTF-8?q?=E7=94=9F=E4=BA=A7=E6=8F=90=E7=A4=BA=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GEPA/MIPROv2 式提示进化的确定性可测核心:对某 gate 的系统提示生成具名候选变体、 按 golden 
集决策准确率排名、产人工评审报告。gate 是代码 builder,故采纳赢家=人工 按报告编辑提示源,**永不自动写回 gate_registry**(方案"人工评审后才生效")。 - src/tools/prompt_optimizer.py:PromptCandidate/GoldenCase/CandidateScore/ CostLedger/OptimizationReport;MUTATION_STRATEGIES(GEPA 确定性子集=反思指令注入 stepwise/selfcheck/output_format/evidence_first);propose_variants(基线+去重)、 score_candidates、select_winner(须超基线 margin 且已评分)、build/render report - 昂贵的 LLM rollout 有意外移为注入的 rollouts 映射(candidate_id→{case_id:decision}), harness 纯离线可单测;产 rollout 是操作者按文档自担成本的步骤(~$60/万次) - CLI `merge optimize-prompts --gate --golden --rollouts --strategies --margin --out`: 仅支持 no-arg/*-SYSTEM gate(其余报错);无 golden 时只产变体并诚实标注 unscored - 安全:read-only w.r.t 生产提示;未评分=surfaced 非伪造;margin 内不夺基线 11 harness + 3 CLI 单测(对真实 J-SYSTEM gate);全量 3268 unit 绿(1 pre-existing 无关 docs 测试除外),mypy 182 文件干净、ruff 过 --- CLAUDE.md | 9 + src/cli/main.py | 97 +++++++++ src/tools/prompt_optimizer.py | 270 ++++++++++++++++++++++++ tests/unit/test_cli_optimize_prompts.py | 67 ++++++ tests/unit/test_prompt_optimizer.py | 125 +++++++++++ 5 files changed, 568 insertions(+) create mode 100644 src/tools/prompt_optimizer.py create mode 100644 tests/unit/test_cli_optimize_prompts.py create mode 100644 tests/unit/test_prompt_optimizer.py diff --git a/CLAUDE.md b/CLAUDE.md index d69c086..1f7cb1a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -38,8 +38,17 @@ merge init [--repo-path .] # generate per-target CLAUDE.md for merge decis merge plan-suggest [--target ... --candidates ...] # enumerate baseline commit-windows merge forks-profile init # scaffold .merge/forks-profile.yaml (recommended ≥30 fork-deleted files) merge eval-memory --on --off [--out ] # P0 memory ablation: compare memory=on vs memory=off effectiveness reports +merge optimize-prompts --gate [--golden --rollouts --strategies a,b --margin 0.02 --out ] # Phase 3 (opt-in, offline): generate + rank prompt variants for a *-SYSTEM gate; emits a HUMAN-REVIEW report, never auto-applies ``` +`merge optimize-prompts` is offline and read-only w.r.t. 
production prompts: +gates are code builders, so a winning candidate is applied by a human editing +the gate's prompt source after reviewing the report. Scoring needs `--golden` +(JSON `[{case_id, expected_decision}]`) plus `--rollouts` (JSON +`{candidate_id: {case_id: decision}}` produced by running each candidate — the +cost-bearing step you run yourself). See `doc/plan/self-learning-system.md` +Phase 3 for the cost model. + To produce a `memory=off` run for the ablation, set `memory.inject_enabled: false` in `.merge/config.yaml` and re-run on the same dataset; each run persists a `memory_effectiveness.json` under its run dir that `merge eval-memory` consumes. diff --git a/src/cli/main.py b/src/cli/main.py index 0dd75ca..c9a3d19 100644 --- a/src/cli/main.py +++ b/src/cli/main.py @@ -520,6 +520,103 @@ def eval_memory_command(on_path: str, off_path: str, out_path: str | None) -> No sys.exit(1) +@cli.command("optimize-prompts") +@click.option("--gate", "gate_id", required=True, help="gate ID, e.g. 
J-SYSTEM") +@click.option( + "--golden", + "golden_path", + required=False, + default=None, + type=click.Path(exists=True), + help="JSON list of {case_id, expected_decision}; omit to only generate variants", +) +@click.option( + "--rollouts", + "rollouts_path", + required=False, + default=None, + type=click.Path(exists=True), + help="JSON {candidate_id: {case_id: decision}} from running each candidate " + "(the cost-bearing step you run offline); omit to leave candidates unscored", +) +@click.option( + "--strategies", + "strategies", + required=False, + default=None, + help="comma list of mutation strategies (default: all)", +) +@click.option("--margin", "margin", default=0.02, type=float, help="win margin") +@click.option("--out", "out_path", required=False, default=None, type=click.Path()) +def optimize_prompts_command( + gate_id: str, + golden_path: str | None, + rollouts_path: str | None, + strategies: str | None, + margin: float, + out_path: str | None, +) -> None: + """Phase 3 (opt-in, offline): generate + rank prompt variants for a gate. + + Read-only w.r.t. production: candidates are emitted for HUMAN REVIEW and are + never written back to the gate registry. Supply --golden + --rollouts to + rank by decision accuracy; otherwise it just generates variants. See + doc/plan/self-learning-system.md Phase 3 for the cost model. + """ + import json + + from src.llm.prompts.gate_registry import get_gate, registered_gate_ids + from src.tools.prompt_optimizer import ( + GoldenCase, + build_report, + propose_variants, + render_report_markdown, + ) + + try: + gate = get_gate(gate_id) + except KeyError: + console.print( + f"[red]Unknown gate {gate_id!r}.[/red] Registered: " + f"{', '.join(registered_gate_ids())}" + ) + sys.exit(1) + + try: + base_prompt = gate.render() + except TypeError: + console.print( + f"[red]Gate {gate_id!r} needs render arguments[/red] — only " + "no-arg / *-SYSTEM gates are supported for offline optimization." 
+ ) + sys.exit(1) + + strat_list = [s.strip() for s in strategies.split(",")] if strategies else None + candidates = propose_variants(gate_id, base_prompt, strat_list) + + golden: list[GoldenCase] = [] + if golden_path: + raw = json.loads(Path(golden_path).read_text(encoding="utf-8")) + golden = [GoldenCase.model_validate(item) for item in raw] + + rollouts: dict[str, dict[str, str]] = {} + if rollouts_path: + rollouts = json.loads(Path(rollouts_path).read_text(encoding="utf-8")) + + report = build_report(gate_id, candidates, golden, rollouts, margin=margin) + console.print(render_report_markdown(report)) + + if out_path: + try: + Path(out_path).write_text( + report.model_dump_json(indent=2), encoding="utf-8" + ) + console.print(f"[green]Wrote optimization report to {out_path}[/green]") + except OSError as e: + console.print(f"[red]Failed to write {out_path}: {e}[/red]") + sys.exit(1) + + cli.add_command(_forks_profile_group) diff --git a/src/tools/prompt_optimizer.py b/src/tools/prompt_optimizer.py new file mode 100644 index 0000000..abdeaf1 --- /dev/null +++ b/src/tools/prompt_optimizer.py @@ -0,0 +1,270 @@ +"""Phase 3: offline, opt-in prompt/strategy optimization harness. + +GEPA / MIPROv2-style prompt evolution, scoped to the **deterministic, testable** +core: generate named candidate variants of a gate's system prompt, score each +against a labelled golden set given precomputed rollouts, rank, and emit a +**human-review** report. It NEVER mutates `gate_registry` — gates are code +builders, so applying a winning candidate is a manual edit a human makes after +reviewing the report (the plan's "人工评审后才生效"). + +The expensive part — running the model with each candidate to produce decisions +— is intentionally OUT of this module. 
It is injected as a ``rollouts`` mapping +(candidate_id -> {case_id: produced_decision}) so the pure harness stays +unit-testable and offline; producing rollouts is the operator's documented, +cost-bearing step (PromptBreeder ~$60/10k calls — see +doc/plan/self-learning-system.md Phase 3). +""" + +from __future__ import annotations + +from collections.abc import Callable + +from pydantic import BaseModel, Field + +# --- mutation operators (deterministic, reflective-instruction injection) ---- + +# GEPA's reflective mutations need an LLM; these are the safe deterministic +# subset — each appends one well-known prompting directive. An LLM-reflective +# generator can be layered on later behind the same PromptCandidate interface. +_DIRECTIVES: dict[str, str] = { + "stepwise": ( + "Before deciding, reason step by step through the specific evidence; " + "do not pattern-match to prior cases." + ), + "selfcheck": ( + "After drafting your decision, re-read the inputs and verify each claim " + "is grounded in the provided content; revise if any is unsupported." + ), + "output_format": ( + "Return ONLY the required structured output — no preamble, no commentary " + "outside the specified fields." + ), + "evidence_first": ( + "Cite the exact lines or symbols you relied on before stating any " + "conclusion; an unsupported conclusion is a failure." 
+ ), +} + + +def _append_directive(directive: str) -> Callable[[str], str]: + def _mutate(base: str) -> str: + return f"{base.rstrip()}\n\n{directive}" + + return _mutate + + +MUTATION_STRATEGIES: dict[str, Callable[[str], str]] = { + name: _append_directive(text) for name, text in _DIRECTIVES.items() +} + +BASELINE_ID = "baseline" + + +# --- models ----------------------------------------------------------------- + + +class PromptCandidate(BaseModel, frozen=True): + candidate_id: str + gate_id: str + strategy: str + prompt_text: str + + +class GoldenCase(BaseModel, frozen=True): + case_id: str + expected_decision: str + + +class CandidateScore(BaseModel, frozen=True): + candidate_id: str + cases_scored: int = Field(ge=0) + correct: int = Field(ge=0) + accuracy: float = Field(ge=0.0, le=1.0) + + +class CostLedger(BaseModel): + llm_calls: int = Field(default=0, ge=0) + est_usd: float = Field(default=0.0, ge=0.0) + + def record(self, *, llm_calls: int, est_usd: float) -> "CostLedger": + return CostLedger( + llm_calls=self.llm_calls + llm_calls, + est_usd=round(self.est_usd + est_usd, 4), + ) + + +class OptimizationReport(BaseModel, frozen=True): + gate_id: str + baseline_id: str + candidates: list[PromptCandidate] + scores: list[CandidateScore] + winner_id: str | None + margin: float + cost: CostLedger + notes: list[str] = Field(default_factory=list) + + +# --- harness ---------------------------------------------------------------- + + +def propose_variants( + gate_id: str, + base_prompt: str, + strategies: list[str] | None = None, +) -> list[PromptCandidate]: + """Baseline + one candidate per requested strategy. 
+ + Strategies that produce text identical to the baseline (or to an earlier + candidate) are dropped — a no-op mutation is not a distinct candidate.""" + names = strategies if strategies is not None else list(MUTATION_STRATEGIES) + seen: set[str] = {base_prompt} + candidates = [ + PromptCandidate( + candidate_id=BASELINE_ID, + gate_id=gate_id, + strategy=BASELINE_ID, + prompt_text=base_prompt, + ) + ] + for name in names: + mutate = MUTATION_STRATEGIES.get(name) + if mutate is None: + continue + text = mutate(base_prompt) + if text in seen: + continue + seen.add(text) + candidates.append( + PromptCandidate( + candidate_id=name, + gate_id=gate_id, + strategy=name, + prompt_text=text, + ) + ) + return candidates + + +def score_candidates( + candidates: list[PromptCandidate], + rollouts: dict[str, dict[str, str]], + golden: list[GoldenCase], +) -> list[CandidateScore]: + """Decision accuracy per candidate against ``golden``. + + ``rollouts[candidate_id][case_id]`` is the decision that candidate's prompt + produced for that case (operator-supplied). 
A candidate with no rollout + scores ``cases_scored=0`` rather than a fabricated number — unscored is + surfaced, never silently treated as zero-correct.""" + expected = {c.case_id: c.expected_decision for c in golden} + scores: list[CandidateScore] = [] + for cand in candidates: + produced = rollouts.get(cand.candidate_id, {}) + scored = 0 + correct = 0 + for case_id, exp in expected.items(): + if case_id not in produced: + continue + scored += 1 + if produced[case_id] == exp: + correct += 1 + accuracy = round(correct / scored, 4) if scored else 0.0 + scores.append( + CandidateScore( + candidate_id=cand.candidate_id, + cases_scored=scored, + correct=correct, + accuracy=accuracy, + ) + ) + return scores + + +def select_winner( + scores: list[CandidateScore], + baseline_id: str = BASELINE_ID, + margin: float = 0.02, +) -> str | None: + """The highest-accuracy candidate, but only if it beats the baseline by at + least ``margin`` AND was actually scored. Ties and within-margin gains keep + the baseline — never churn a production prompt for noise.""" + by_id = {s.candidate_id: s for s in scores} + base = by_id.get(baseline_id) + if base is None or base.cases_scored == 0: + return None + best: CandidateScore | None = None + for s in scores: + if s.candidate_id == baseline_id or s.cases_scored == 0: + continue + if best is None or s.accuracy > best.accuracy: + best = s + if best is None: + return None + if best.accuracy - base.accuracy >= margin: + return best.candidate_id + return None + + +def build_report( + gate_id: str, + candidates: list[PromptCandidate], + golden: list[GoldenCase], + rollouts: dict[str, dict[str, str]], + margin: float = 0.02, + cost: CostLedger | None = None, +) -> OptimizationReport: + scores = score_candidates(candidates, rollouts, golden) + winner = select_winner(scores, margin=margin) + notes: list[str] = [] + if not golden: + notes.append("No golden set supplied — candidates generated but unscored.") + unscored = [s.candidate_id for s in 
scores if s.cases_scored == 0] + if golden and unscored: + notes.append( + "Unscored candidates (no rollout supplied): " + ", ".join(unscored) + ) + if golden and winner is None and any(s.cases_scored for s in scores): + notes.append( + f"No candidate beat baseline by the {margin:.0%} margin — keep current." + ) + return OptimizationReport( + gate_id=gate_id, + baseline_id=BASELINE_ID, + candidates=candidates, + scores=scores, + winner_id=winner, + margin=margin, + cost=cost or CostLedger(), + notes=notes, + ) + + +def render_report_markdown(report: OptimizationReport) -> str: + lines = [ + f"# Prompt optimization report — gate `{report.gate_id}`", + "", + "> Candidates are NOT auto-applied. Gates are code builders; to adopt a " + "winner, a human reviews its `prompt_text` below and edits the gate's " + "prompt source manually.", + "", + f"- baseline: `{report.baseline_id}`", + "- winner: " + + (f"`{report.winner_id}`" if report.winner_id else "_none (kept baseline)_"), + f"- margin: {report.margin:.0%}", + f"- cost: {report.cost.llm_calls} LLM calls, ~${report.cost.est_usd:.2f}", + "", + "## Scores", + "", + "| candidate | strategy | scored | correct | accuracy |", + "|---|---|---|---|---|", + ] + strategy_by_id = {c.candidate_id: c.strategy for c in report.candidates} + for s in report.scores: + lines.append( + f"| `{s.candidate_id}` | {strategy_by_id.get(s.candidate_id, '?')} " + f"| {s.cases_scored} | {s.correct} | {s.accuracy:.2%} |" + ) + if report.notes: + lines += ["", "## Notes", ""] + lines += [f"- {n}" for n in report.notes] + return "\n".join(lines) diff --git a/tests/unit/test_cli_optimize_prompts.py b/tests/unit/test_cli_optimize_prompts.py new file mode 100644 index 0000000..39fdd26 --- /dev/null +++ b/tests/unit/test_cli_optimize_prompts.py @@ -0,0 +1,67 @@ +"""Phase 3: `merge optimize-prompts` CLI wiring (real gate, no LLM).""" + +from __future__ import annotations + +import json + +from click.testing import CliRunner + +from src.cli.main import 
cli + + +def test_generate_variants_for_real_system_gate(): + res = CliRunner().invoke( + cli, ["optimize-prompts", "--gate", "J-SYSTEM", "--strategies", "stepwise"] + ) + assert res.exit_code == 0 + assert "NOT auto-applied" in res.output + assert "`stepwise`" in res.output + + +def test_unknown_gate_exits_nonzero(): + res = CliRunner().invoke(cli, ["optimize-prompts", "--gate", "NOPE"]) + assert res.exit_code == 1 + assert "Unknown gate" in res.output + + +def test_scores_with_golden_and_rollouts(tmp_path): + golden = tmp_path / "golden.json" + golden.write_text( + json.dumps( + [ + {"case_id": "c1", "expected_decision": "take_target"}, + {"case_id": "c2", "expected_decision": "HUMAN_REQUIRED"}, + ] + ), + encoding="utf-8", + ) + rollouts = tmp_path / "rollouts.json" + rollouts.write_text( + json.dumps( + { + "baseline": {"c1": "take_target", "c2": "take_target"}, + "stepwise": {"c1": "take_target", "c2": "HUMAN_REQUIRED"}, + } + ), + encoding="utf-8", + ) + out = tmp_path / "report.json" + res = CliRunner().invoke( + cli, + [ + "optimize-prompts", + "--gate", + "J-SYSTEM", + "--strategies", + "stepwise", + "--golden", + str(golden), + "--rollouts", + str(rollouts), + "--out", + str(out), + ], + ) + assert res.exit_code == 0 + report = json.loads(out.read_text(encoding="utf-8")) + assert report["winner_id"] == "stepwise" # 1.0 vs baseline 0.5 diff --git a/tests/unit/test_prompt_optimizer.py b/tests/unit/test_prompt_optimizer.py new file mode 100644 index 0000000..32f0aea --- /dev/null +++ b/tests/unit/test_prompt_optimizer.py @@ -0,0 +1,125 @@ +"""Phase 3: offline prompt optimization harness (pure, no LLM).""" + +from __future__ import annotations + +from src.tools.prompt_optimizer import ( + BASELINE_ID, + CostLedger, + GoldenCase, + build_report, + propose_variants, + render_report_markdown, + score_candidates, + select_winner, +) + +_BASE = "You are the Judge. Decide per file." 
+ + +# --- variant generation ----------------------------------------------------- + + +def test_baseline_always_first_and_unmodified(): + cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"]) + assert cands[0].candidate_id == BASELINE_ID + assert cands[0].prompt_text == _BASE + assert cands[0].strategy == BASELINE_ID + + +def test_each_strategy_appends_distinct_directive(): + cands = propose_variants("J-SYSTEM", _BASE, ["stepwise", "selfcheck"]) + ids = [c.candidate_id for c in cands] + assert ids == [BASELINE_ID, "stepwise", "selfcheck"] + for c in cands[1:]: + assert c.prompt_text.startswith(_BASE) + assert len(c.prompt_text) > len(_BASE) + # distinct mutations + assert cands[1].prompt_text != cands[2].prompt_text + + +def test_unknown_strategy_ignored(): + cands = propose_variants("J-SYSTEM", _BASE, ["nope", "stepwise"]) + assert [c.candidate_id for c in cands] == [BASELINE_ID, "stepwise"] + + +def test_default_strategies_is_all(): + cands = propose_variants("J-SYSTEM", _BASE) + assert len(cands) >= 4 # baseline + 4 directives + + +# --- scoring ---------------------------------------------------------------- + + +def _golden() -> list[GoldenCase]: + return [ + GoldenCase(case_id="c1", expected_decision="take_target"), + GoldenCase(case_id="c2", expected_decision="HUMAN_REQUIRED"), + ] + + +def test_score_accuracy_per_candidate(): + cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"]) + rollouts = { + "baseline": {"c1": "take_target", "c2": "take_target"}, # 1/2 + "stepwise": {"c1": "take_target", "c2": "HUMAN_REQUIRED"}, # 2/2 + } + scores = {s.candidate_id: s for s in score_candidates(cands, rollouts, _golden())} + assert scores["baseline"].accuracy == 0.5 + assert scores["stepwise"].accuracy == 1.0 + assert scores["stepwise"].cases_scored == 2 + + +def test_missing_rollout_is_unscored_not_zero(): + cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"]) + rollouts = {"baseline": {"c1": "take_target", "c2": "HUMAN_REQUIRED"}} + scores = 
{s.candidate_id: s for s in score_candidates(cands, rollouts, _golden())} + assert scores["stepwise"].cases_scored == 0 # surfaced, not fabricated + assert scores["stepwise"].accuracy == 0.0 + + +# --- winner selection ------------------------------------------------------- + + +def test_winner_requires_margin_over_baseline(): + cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"]) + rollouts = { + "baseline": {"c1": "take_target", "c2": "take_target"}, # 0.5 + "stepwise": {"c1": "take_target", "c2": "HUMAN_REQUIRED"}, # 1.0 + } + scores = score_candidates(cands, rollouts, _golden()) + assert select_winner(scores, margin=0.02) == "stepwise" + assert select_winner(scores, margin=0.9) is None # gain below required margin + + +def test_no_winner_when_baseline_unscored(): + cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"]) + scores = score_candidates(cands, {"stepwise": {"c1": "take_target"}}, _golden()) + assert select_winner(scores) is None + + +# --- report ----------------------------------------------------------------- + + +def test_report_flags_unscored_and_never_auto_applies(): + cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"]) + rollouts = {"baseline": {"c1": "take_target", "c2": "HUMAN_REQUIRED"}} + report = build_report("J-SYSTEM", cands, _golden(), rollouts) + md = render_report_markdown(report) + assert "NOT auto-applied" in md + assert any("Unscored" in n for n in report.notes) + assert report.winner_id is None + + +def test_report_without_golden_notes_unscored(): + cands = propose_variants("J-SYSTEM", _BASE, ["stepwise"]) + report = build_report("J-SYSTEM", cands, [], {}) + assert any("No golden set" in n for n in report.notes) + assert report.winner_id is None + + +def test_cost_ledger_accumulates_immutably(): + led = CostLedger() + out = led.record(llm_calls=3, est_usd=0.12).record(llm_calls=2, est_usd=0.08) + assert led.llm_calls == 0 # original untouched + assert out.llm_calls == 5 + assert out.est_usd == 0.2 From 
a12953d21425d51f123855b1e8c3a61c2915540d Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 09:44:53 -0400 Subject: [PATCH 17/22] =?UTF-8?q?docs(plan):=20=E6=A0=87=E6=B3=A8=20Phase?= =?UTF-8?q?=203=20=E8=90=BD=E5=9C=B0=E7=8A=B6=E6=80=81=E4=B8=8E=E5=A4=96?= =?UTF-8?q?=E7=A7=BB=E8=BE=B9=E7=95=8C=EF=BC=88rollout/no-arg=20gate/LLM-?= =?UTF-8?q?=E5=8F=8D=E6=80=9D=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/plan/self-learning-system.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/plan/self-learning-system.md b/doc/plan/self-learning-system.md index 5995507..bccc0b5 100644 --- a/doc/plan/self-learning-system.md +++ b/doc/plan/self-learning-system.md @@ -232,6 +232,15 @@ ### Phase 3 —— 离线提示/策略自动优化(opt-in,后期,成本透明) +> **落地状态(2026-05-31,feat/web)**:确定性可测核心已实装(`f540613`)—— +> `src/tools/prompt_optimizer.py` + `merge optimize-prompts` CLI。生成具名候选变体 +> (GEPA 确定性子集=反思指令注入)、按 golden 决策准确率排名、产**人工评审报告**, +> **永不自动写回 gate_registry**。**有意外移的部分**:① 昂贵的 LLM rollout 抽象为注入的 +> `rollouts` 映射(操作者自担成本产出),harness 保持纯离线可单测;② 仅支持 +> no-arg/`*-SYSTEM` gate(参数化 gate 无静态基线文本);③ LLM-反思式变体生成(GEPA +> 完整形态)留待后续,当前为确定性指令注入。这是 opt-in 子命令、默认不跑,符合 +> 「上界增益、不应早于 0–1」定位。 + **目标**:用 Phase 0 的评估器当 metric,离线对 gate 提示(`gate_registry` P-*/J-*/CA-*…)做 GEPA/MIPROv2 式进化。 **强约束(来自调研成本警示)** From 918c19452f0231e49fa86dc1a061ab7e4ae421a3 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 10:44:18 -0400 Subject: [PATCH 18/22] =?UTF-8?q?fix(memory):=20P1-A=E5=9B=BA=E5=8C=96=20p?= =?UTF-8?q?ersist-suppress=20=E5=88=A4=E6=8D=AE=E5=8D=87=E7=BA=A7=EF=BC=88?= =?UTF-8?q?=E6=B6=88=E9=99=A4=20PR-0d=20=E5=81=87=E9=98=B3=E6=80=A7?= =?UTF-8?q?=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit eval-memory 分析在真实 forgejo 累积 sidecar 上发现:P1-A 的持久软删沿用读取期 单臂 harmful_entry_ids(score<=-0.5),会把 8 条薄样本(<=4 fail)误判有害并不可逆 软删——正是 PR-0d 为度量层修过的同一相关性假阳性,现落到跨 run 持久副作用上。 持久软删远比读取期 O-M6 
过滤(瞬时、可逆)更该严格: - hit_tracker.harmful_entry_ids 增 min_fail_count(默认 0,读取期行为不变) - config: suppress_harmful_threshold=-0.8(严于读取期 -0.5)+ suppress_min_fail_count=5 - _apply_suppress_harmful_entries 用严格阈值 + fail 下限;并加确定性混淆守卫—— 条目若仅关联本 run veto(确定性)失败文件、且不沾任何 passed 文件,则其"有害" 是相关性(确定性门不读 memory),跳过 suppress 真实 forgejo 实证:旧判据选 8 条软删,新判据选 0 条(全部薄样本被 fail 下限拦住)。 5 新单测(fail下限/严阈值/混淆守卫/守卫不误伤沾 passed 的条目) + 3273 unit 绿 (1 pre-existing 无关 docs 测试除外),mypy/ruff 干净 --- src/core/orchestrator.py | 38 ++++++++-- src/memory/hit_tracker.py | 9 ++- src/models/config.py | 17 +++++ tests/unit/test_memory_pruning.py | 12 ++++ tests/unit/test_memory_suppress.py | 112 +++++++++++++++++++++++++++-- 5 files changed, 177 insertions(+), 11 deletions(-) diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py index 5c5ce0b..6cd90b7 100644 --- a/src/core/orchestrator.py +++ b/src/core/orchestrator.py @@ -610,17 +610,22 @@ def _apply_outcome_confidence_writeback(self, state: MergeState) -> None: def _apply_suppress_harmful_entries(self, state: MergeState) -> None: """P1-A: persistently soft-delete stably-harmful memory entries. - Default OFF. When ``persist_suppress`` is on, entries whose accumulated - outcome score crosses the harmful threshold with at least - ``suppress_min_observations`` observations are marked ``suppressed`` so - the prune survives tracker loss across runs (the O-M6 read-time filter - recomputes from sidecar observations and resurrects on loss). Human and + Default OFF. Persistent suppress is durable and cross-run, so its bar + is deliberately stricter than the transient read-time O-M6 filter: + ``suppress_harmful_threshold`` (≈ near-universal failure, not a slim + majority) AND ``suppress_min_fail_count`` absolute fails. 
A + deterministic-confound guard further skips entries whose only judged + file association is with files that failed via a *deterministic* veto + this run — a deterministic gate ignores memory, so blaming the injected + entry is the PR-0d single-arm false positive (metrics §9.7). Human and bootstrap entries are exempt, mirroring OPP-5.""" cfg = getattr(self.config, "memory", None) if cfg is None or not getattr(cfg, "persist_suppress", False): return harmful_ids = self._memory_hit_tracker.harmful_entry_ids( - min_observations=cfg.suppress_min_observations + threshold=getattr(cfg, "suppress_harmful_threshold", -0.8), + min_observations=cfg.suppress_min_observations, + min_fail_count=getattr(cfg, "suppress_min_fail_count", 5), ) if not harmful_ids: return @@ -630,6 +635,20 @@ def _apply_suppress_harmful_entries(self, state: MergeState) -> None: if record.decision_source in (DecisionSource.HUMAN, DecisionSource.BATCH_HUMAN) } + verdict = getattr(state, "judge_verdict", None) + passed_files = set(verdict.passed_files) if verdict else set() + # Files that failed via a deterministic veto this run — their failure is + # independent of injected memory, so an entry tied only to them is a + # correlational (not causal) "harm". 
+ det_fail_files = ( + { + issue.file_path + for issue in verdict.issues + if issue.veto_condition and issue.file_path in set(verdict.failed_files) + } + if verdict + else set() + ) suppressed = 0 for entry in self._memory_store.to_memory().entries: if entry.entry_id not in harmful_ids or entry.suppressed: @@ -638,6 +657,13 @@ def _apply_suppress_harmful_entries(self, state: MergeState) -> None: continue if human_files and any(fp in human_files for fp in entry.file_paths): continue + entry_files = set(entry.file_paths) + if ( + det_fail_files + and entry_files & det_fail_files + and not (entry_files & passed_files) + ): + continue self._memory_store = self._memory_store.suppress_entry( entry.entry_id, reason="P1-A: stably-harmful judge outcomes" ) diff --git a/src/memory/hit_tracker.py b/src/memory/hit_tracker.py index 3574866..d2acbbc 100644 --- a/src/memory/hit_tracker.py +++ b/src/memory/hit_tracker.py @@ -235,6 +235,7 @@ def harmful_entry_ids( self, threshold: float = -0.5, min_observations: int = 2, + min_fail_count: int = 0, ) -> frozenset[str]: """O-M6: entry_ids whose outcome score is at/below ``threshold``. @@ -242,6 +243,12 @@ def harmful_entry_ids( an entry is consistently associated with judge failures. Requires at least ``min_observations`` total observations to avoid pruning entries on a single bad run. + + ``min_fail_count`` (P1-A固化) additionally requires that absolute + failure count — a 0-pass/3-fail entry has score -1.0 but only three + fails, too thin to justify a *persistent* prune. The transient + read-time filter leaves this at 0 (loose is fine — it is recomputed + and reversible); the durable suppress path raises it. 
""" with self._lock: harmful: set[str] = set() @@ -249,7 +256,7 @@ def harmful_entry_ids( p = counters.get("pass", 0) f = counters.get("fail", 0) total = p + f - if total < min_observations: + if total < min_observations or f < min_fail_count: continue score = (p - f) / total if score <= threshold: diff --git a/src/models/config.py b/src/models/config.py index cb80848..7dec24f 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -1008,6 +1008,23 @@ class MemoryExtractionConfig(BaseModel): description="P1-A: minimum pass+fail observations before a harmful " "entry is persistently suppressed, so a single run cannot prune it.", ) + suppress_harmful_threshold: float = Field( + default=-0.8, + ge=-1.0, + le=0.0, + description="P1-A固化: outcome-score ceiling for *persistent* suppress " + "(score=(pass-fail)/total). Stricter than the transient read-time " + "filter's -0.5 because suppression is durable and cross-run — require " + "near-universal failure, not a slim majority.", + ) + suppress_min_fail_count: int = Field( + default=5, + ge=1, + description="P1-A固化: minimum absolute fail count before persistent " + "suppress — a 0-pass/3-fail entry is too thin to durably prune. 
Guards " + "the PR-0d false-positive where a few deterministic-file failures look " + "harmful by ratio alone.", + ) class RenameDetectionConfig(BaseModel): diff --git a/tests/unit/test_memory_pruning.py b/tests/unit/test_memory_pruning.py index 3c71800..405e399 100644 --- a/tests/unit/test_memory_pruning.py +++ b/tests/unit/test_memory_pruning.py @@ -56,6 +56,18 @@ def test_harmful_threshold_is_inclusive() -> None: assert "edge" in tracker.harmful_entry_ids(threshold=-0.5) +def test_harmful_min_fail_count_floor() -> None: + """P1-A固化: min_fail_count gates by absolute fails, not just ratio.""" + tracker = MemoryHitTracker() + for i in range(3): # 0 pass / 3 fail → score -1.0 + f = f"f{i}" + tracker.record_injection([f], ["thin"]) + tracker.record_outcome(f, success=False) + assert "thin" in tracker.harmful_entry_ids() # default floor 0 + assert "thin" not in tracker.harmful_entry_ids(min_fail_count=5) + assert "thin" in tracker.harmful_entry_ids(min_fail_count=3) + + def test_harmful_custom_threshold() -> None: tracker = MemoryHitTracker() tracker.record_injection(["a.py"], ["mild-bad"]) diff --git a/tests/unit/test_memory_suppress.py b/tests/unit/test_memory_suppress.py index c3988d2..bc426fa 100644 --- a/tests/unit/test_memory_suppress.py +++ b/tests/unit/test_memory_suppress.py @@ -209,13 +209,14 @@ def test_persist_suppress_marks_stable_harmful_skips_human_and_bootstrap(): store = orch._memory_store for e in (harmful, human, boot): store = store.add_entry(e) - _track_fails(orch._memory_hit_tracker, e.entry_id, 3) + _track_fails(orch._memory_hit_tracker, e.entry_id, 6) # >= min_fail_count orch._memory_store = store state = SimpleNamespace( + judge_verdict=None, file_decision_records={ "src/secret.py": SimpleNamespace(decision_source=DecisionSource.HUMAN) - } + }, ) orch._apply_suppress_harmful_entries(state) @@ -231,6 +232,109 @@ def test_persist_suppress_respects_min_observations(): orch = _orch(persist=True, min_obs=3) e = _entry("harm", ["src/a.py"]) 
orch._memory_store = orch._memory_store.add_entry(e) - _track_fails(orch._memory_hit_tracker, e.entry_id, 2) # below threshold - orch._apply_suppress_harmful_entries(SimpleNamespace(file_decision_records={})) + _track_fails(orch._memory_hit_tracker, e.entry_id, 2) # below min_observations + orch._apply_suppress_harmful_entries( + SimpleNamespace(judge_verdict=None, file_decision_records={}) + ) assert orch._memory_store.to_memory().entries[0].suppressed is False + + +# --- P1-A固化: stricter persistent-suppress criterion (PR-0d false-positive) - + + +def _track_mixed(tracker, entry_id: str, *, passes: int, fails: int) -> None: + for i in range(passes): + f = f"{entry_id}-p{i}" + tracker.record_injection([f], [entry_id]) + tracker.record_outcome(f, success=True) + for i in range(fails): + f = f"{entry_id}-f{i}" + tracker.record_injection([f], [entry_id]) + tracker.record_outcome(f, success=False) + + +def test_suppress_needs_min_fail_count(): + # score -1.0 but only 4 fails (< default 5) → too thin for a durable prune. + from types import SimpleNamespace + + orch = _orch(persist=True, min_obs=3) + e = _entry("harm", ["src/a.py"]) + orch._memory_store = orch._memory_store.add_entry(e) + _track_fails(orch._memory_hit_tracker, e.entry_id, 4) + orch._apply_suppress_harmful_entries( + SimpleNamespace(judge_verdict=None, file_decision_records={}) + ) + assert orch._memory_store.to_memory().entries[0].suppressed is False + + +def test_suppress_needs_strict_threshold(): + # 2 pass / 8 fail → score -0.6: flagged by the loose read-time -0.5 filter, + # but still above the -0.8 persistent bar, so it must not be durably suppressed.
+ from types import SimpleNamespace + + orch = _orch(persist=True, min_obs=3) + e = _entry("harm", ["src/a.py"]) + orch._memory_store = orch._memory_store.add_entry(e) + _track_mixed(orch._memory_hit_tracker, e.entry_id, passes=2, fails=8) + orch._apply_suppress_harmful_entries( + SimpleNamespace(judge_verdict=None, file_decision_records={}) + ) + assert orch._memory_store.to_memory().entries[0].suppressed is False + + +def test_deterministic_confound_guard_skips_veto_only_entry(): + # The PR-0d case: entry tied ONLY to a file that failed via a deterministic + # veto → its "harm" is correlational; persistent suppress must skip it. + from types import SimpleNamespace + + from src.models.judge import IssueSeverity, JudgeIssue + + orch = _orch(persist=True, min_obs=3) + e = _entry("harm", ["auth/oauth.go", "auth"]) + orch._memory_store = orch._memory_store.add_entry(e) + _track_fails(orch._memory_hit_tracker, e.entry_id, 6) # strongly harmful by ratio + + veto_issue = JudgeIssue( + file_path="auth/oauth.go", + issue_level=IssueSeverity.CRITICAL, + issue_type="reverse_impact_unhandled", + description="reverse impact", + veto_condition="reverse impact unhandled", + ) + verdict = SimpleNamespace( + passed_files=[], failed_files=["auth/oauth.go"], issues=[veto_issue] + ) + orch._apply_suppress_harmful_entries( + SimpleNamespace(judge_verdict=verdict, file_decision_records={}) + ) assert orch._memory_store.to_memory().entries[0].suppressed is False + + +def test_confound_guard_does_not_shield_entry_touching_passed_file(): + # An entry that also touches a PASSED file is not purely confounded — a + # strongly-harmful ratio still suppresses it.
+ from types import SimpleNamespace + + from src.models.judge import IssueSeverity, JudgeIssue + + orch = _orch(persist=True, min_obs=3) + e = _entry("harm", ["auth/oauth.go", "auth/ok.go"]) + orch._memory_store = orch._memory_store.add_entry(e) + _track_fails(orch._memory_hit_tracker, e.entry_id, 6) + + veto_issue = JudgeIssue( + file_path="auth/oauth.go", + issue_level=IssueSeverity.CRITICAL, + issue_type="reverse_impact_unhandled", + description="reverse impact", + veto_condition="reverse impact unhandled", + ) + verdict = SimpleNamespace( + passed_files=["auth/ok.go"], + failed_files=["auth/oauth.go"], + issues=[veto_issue], + ) + orch._apply_suppress_harmful_entries( + SimpleNamespace(judge_verdict=verdict, file_decision_records={}) + ) + assert orch._memory_store.to_memory().entries[0].suppressed is True From 126302cc66178dfc14247c8d45f964646cf7e0a7 Mon Sep 17 00:00:00 2001 From: Angel Date: Sun, 31 May 2026 10:44:48 -0400 Subject: [PATCH 19/22] =?UTF-8?q?docs(plan):=20=E8=AE=B0=E5=BD=95=20P1-A?= =?UTF-8?q?=E5=9B=BA=E5=8C=96=E5=88=A4=E6=8D=AE=E5=8A=A0=E5=9B=BA=EF=BC=88?= =?UTF-8?q?eval-memory=20=E5=88=86=E6=9E=90=E9=A9=B1=E5=8A=A8=EF=BC=8C?= =?UTF-8?q?=E6=B6=88=E9=99=A4=E5=81=87=E9=98=B3=E6=80=A7=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/plan/self-learning-system.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/plan/self-learning-system.md b/doc/plan/self-learning-system.md index bccc0b5..9257690 100644 --- a/doc/plan/self-learning-system.md +++ b/doc/plan/self-learning-system.md @@ -178,6 +178,13 @@ - 触发**默认 opt-in**,因 Phase 0 已能度量净收益,可在基线为正后转默认开启。 **防护**:只固化满足 `min_observations` 且 effectiveness≤阈值的条目;豁免 HUMAN/bootstrap(同 OPP-5 现有豁免);软删可经 CLI 复活。 + +> **固化判据加固(`918c194`,eval-memory 分析驱动)**:持久软删不可逆、跨 run, +> 判据须远严于读取期 O-M6 过滤。真实 forgejo 累积 sidecar 上,旧的单臂 +> `harmful_entry_ids(-0.5)` 会误删 8 条薄样本(≤4 fail)——正是 PR-0d 修过的相关性 +> 假阳性。已升级:`suppress_harmful_threshold=-0.8` + 
`suppress_min_fail_count=5` +> + 确定性混淆守卫(条目仅关联本 run veto 失败文件且不沾 passed → 跳过)。实证旧 +> 判据选 8 条、新判据选 0 条。 **验收**:Phase 0 harness 显示 `harmful_influence_rate` 在"tracker 重置"场景下仍不回升(=证明持久化的增量价值),且总决策质量不降。 #### P1-B 激活并加固 OPP-5 写回,融合 compile/CI 信号(原则 P1) From 31008d63b5aac93a24ed94af038caa3ebb9dde3e Mon Sep 17 00:00:00 2001 From: Angel Date: Mon, 1 Jun 2026 00:03:25 -0400 Subject: [PATCH 20/22] =?UTF-8?q?feat(eval):=20=E8=A1=A5=E9=BD=90=20LLM-?= =?UTF-8?q?=E5=88=A4=E6=96=AD=E5=AF=86=E9=9B=86=20golden=20=E9=9B=86?= =?UTF-8?q?=E5=86=B3=E7=AD=96=E9=9D=A2=EF=BC=88B-class=20=E5=B9=B2?= =?UTF-8?q?=E5=87=80=E5=90=88=E5=B9=B6=20+=20J-SYSTEM=20pass/fail=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 此前 golden 集仅 5 个 C-class 升级样本,三个 gate 都只有单一升级标签 (escalate_human / human_required),优化器只能惩罚"误判升级"、无法奖励 "敢自动合并",决策面残缺。 新增 4 个 forgejo 真实样本(models/auth/auth_token.go,双侧不相交纯增量、 git 3-way 干净合并、go build 通过、golden 保留双方): - t1-0034/0036:auto_safe + semantic_merge + J-SYSTEM pass - t1-0035:auto_risky(auth 结构改动,弱 risk-hint 非 security_sensitive) - t1-0037:J-SYSTEM fail fixture——golden 正确,rollout 喂丢掉 upstream 所有权校验的 fork-only 树(prepare 的 working_tree),judge 应判 fail 逐 gate 现状:J-SYSTEM 0→4(3 pass + 1 fail)、P-RISK 5+3、CA 5+3。 配套: - golden.md §4/§5:更新 seed 状态,将 J-SYSTEM rollout 输入契约固化为 golden_tree(pass)/ working_tree(fail)的具体约定 - _schemas.py:SampleMeta 增 judgment_intensive / golden_decisions 字段 - 新增真实数据集守卫单测(决策面覆盖 + golden JSON 与 meta 同步) - doc/evaluation/README.md 索引补 golden.md --- doc/evaluation/README.md | 1 + doc/evaluation/dataset.md | 5 + doc/evaluation/golden.md | 165 ++++++++++++++ scripts/eval/_golden.py | 115 ++++++++++ scripts/eval/_schemas.py | 25 +++ scripts/eval/build_golden.py | 86 ++++++++ .../datasets/tier1/samples/t1-0005/meta.yaml | 6 + .../datasets/tier1/samples/t1-0006/meta.yaml | 6 + .../datasets/tier1/samples/t1-0031/meta.yaml | 7 + .../datasets/tier1/samples/t1-0032/meta.yaml | 7 + .../datasets/tier1/samples/t1-0033/meta.yaml 
| 7 + .../datasets/tier1/samples/t1-0034/base.tar | Bin 0 -> 10240 bytes .../datasets/tier1/samples/t1-0034/fork.patch | 14 ++ .../datasets/tier1/samples/t1-0034/golden.tar | Bin 0 -> 10240 bytes .../datasets/tier1/samples/t1-0034/meta.yaml | 30 +++ .../tier1/samples/t1-0034/provenance.yaml | 15 ++ .../tier1/samples/t1-0034/upstream.patch | 13 ++ .../datasets/tier1/samples/t1-0035/base.tar | Bin 0 -> 10240 bytes .../datasets/tier1/samples/t1-0035/fork.patch | 16 ++ .../datasets/tier1/samples/t1-0035/golden.tar | Bin 0 -> 10240 bytes .../datasets/tier1/samples/t1-0035/meta.yaml | 29 +++ .../tier1/samples/t1-0035/provenance.yaml | 15 ++ .../tier1/samples/t1-0035/upstream.patch | 12 ++ .../datasets/tier1/samples/t1-0036/base.tar | Bin 0 -> 10240 bytes .../datasets/tier1/samples/t1-0036/fork.patch | 14 ++ .../datasets/tier1/samples/t1-0036/golden.tar | Bin 0 -> 10240 bytes .../datasets/tier1/samples/t1-0036/meta.yaml | 30 +++ .../tier1/samples/t1-0036/provenance.yaml | 15 ++ .../tier1/samples/t1-0036/upstream.patch | 13 ++ .../datasets/tier1/samples/t1-0037/base.tar | Bin 0 -> 10240 bytes .../datasets/tier1/samples/t1-0037/fork.patch | 14 ++ .../datasets/tier1/samples/t1-0037/golden.tar | Bin 0 -> 10240 bytes .../datasets/tier1/samples/t1-0037/meta.yaml | 31 +++ .../tier1/samples/t1-0037/provenance.yaml | 15 ++ .../tier1/samples/t1-0037/upstream.patch | 14 ++ tests/eval/golden/CA-SYSTEM.golden.json | 34 +++ tests/eval/golden/J-SYSTEM.golden.json | 18 ++ .../golden/P-RISK-SCORE-SYSTEM.golden.json | 34 +++ tests/eval/manifests/tier1.lock.json | 36 +++- tests/eval/unit/test_build_golden.py | 202 ++++++++++++++++++ 40 files changed, 1038 insertions(+), 6 deletions(-) create mode 100644 doc/evaluation/golden.md create mode 100644 scripts/eval/_golden.py create mode 100644 scripts/eval/build_golden.py create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/base.tar create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/fork.patch create mode 100644 
tests/eval/datasets/tier1/samples/t1-0034/golden.tar create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/meta.yaml create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/provenance.yaml create mode 100644 tests/eval/datasets/tier1/samples/t1-0034/upstream.patch create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/base.tar create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/fork.patch create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/golden.tar create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/meta.yaml create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/provenance.yaml create mode 100644 tests/eval/datasets/tier1/samples/t1-0035/upstream.patch create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/base.tar create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/fork.patch create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/golden.tar create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/meta.yaml create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/provenance.yaml create mode 100644 tests/eval/datasets/tier1/samples/t1-0036/upstream.patch create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/base.tar create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/fork.patch create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/golden.tar create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/meta.yaml create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/provenance.yaml create mode 100644 tests/eval/datasets/tier1/samples/t1-0037/upstream.patch create mode 100644 tests/eval/golden/CA-SYSTEM.golden.json create mode 100644 tests/eval/golden/J-SYSTEM.golden.json create mode 100644 tests/eval/golden/P-RISK-SCORE-SYSTEM.golden.json create mode 100644 tests/eval/unit/test_build_golden.py diff --git a/doc/evaluation/README.md b/doc/evaluation/README.md index c4923e8..2d0934f 100644 --- a/doc/evaluation/README.md +++ b/doc/evaluation/README.md @@ -86,6 
+86,7 @@ Tier-1 跑得快、可天天跑;Tier-2 接近真实,但贵;Tier-3 用来 | [dataset.md](dataset.md) | 三层评估集如何构造、如何维护、如何防止过拟合 | | [procedure.md](procedure.md) | 端到端评估流程、复现命令、报告产物规范 | | [acceptance.md](acceptance.md) | 每个指标的合格 / 不合格阈值与版本基线管理 | +| [golden.md](golden.md) | `merge optimize-prompts --golden` 的逐 gate 决策 oracle(LLM-判断密集 golden 集)如何从样本 `meta.yaml` 派生 | --- diff --git a/doc/evaluation/dataset.md b/doc/evaluation/dataset.md index 88910d0..b3e6d30 100644 --- a/doc/evaluation/dataset.md +++ b/doc/evaluation/dataset.md @@ -2,6 +2,11 @@ > 评估集**质量决定结论可信度**。本章定义三层数据集的构造原则、维护机制、防过拟合规则。 > 数据集所在路径:`tests/eval/datasets/`(待建)。 +> +> 逐 gate 的**决策 oracle**(`merge optimize-prompts --golden` 用的 +> LLM-判断密集 golden 集,与下文的 `golden.tar` 黄金树不是一回事)见 +> [golden.md](golden.md):它从样本 `meta.yaml` 的 `judgment_intensive` / +> `golden_decisions` 字段派生,单一真相源。 --- diff --git a/doc/evaluation/golden.md b/doc/evaluation/golden.md new file mode 100644 index 0000000..edff13b --- /dev/null +++ b/doc/evaluation/golden.md @@ -0,0 +1,165 @@ +# LLM-判断密集 Golden 集 + +> 这是 `merge optimize-prompts --golden` 的决策验证集,**与 Tier-1/2/3 评估数据集 +> 的 `golden.tar`(人工黄金合并树)是两回事**:这里的 golden 是**逐 gate 的决策 +> oracle**,形如 `[{case_id, expected_decision}]`,用来给候选提示按决策准确率排名。 + +--- + +## 1. 
为什么要"判断密集" + +`optimize-prompts` 的打分信号是:候选提示的 rollout 是否复现 `expected_decision` +(`src/tools/prompt_optimizer.py` `score_candidates`)。这个信号**只有在决策真正由 +LLM 推理决定时才有区分度**。如果一个 case 的最终决策被确定性规则短路,那么所有候选 +提示都会产出同一个决策 → 准确率全相同 → 优化器无法分辨提示好坏,这个 case 是噪声。 + +因此 golden 集必须**集中在 LLM 判断区**,排除被确定性路径决定的 case。 + +### 纳入(judgment-intensive) + +- 风险分落在 `risk_score_low`/`risk_score_high` 之间的边界样本(`auto_risky`, + 是否升级取决于 LLM 复核)。 +- C-class 真实文本冲突:双侧对同一区域做**独立**改动,ConflictAnalyst 需判断 + 能否安全语义合并、还是升级。 +- 接口契约 / 调用点漂移(M1/M3 形态),auto-merge 与升级之间的灰带。 +- 命中 `risk_hint_patterns`(弱信号,如 `**/auth/**`)但**未**命中严格 + `security_sensitive.patterns` 的文件——`config.py` 明确"LLM 的混合分有最终发言权"。 + +### 排除(deterministic short-circuit) + +- 命中 `security_sensitive.patterns`(`**/.env`、`**/credentials.go`、 + `**/auth/credentials/**` …)→ 强制 `human_required`,与提示无关。 +- 确定性 veto / `deterministic_issues` 强制 judge `fail`。 +- `--no-llm` heuristic 封顶(如封顶 `high` 却判 `critical` 之类的不可达门)。 +- 二进制 / `deleted_only` / `excluded` 等 sentinel 风险级。 + +判据口诀:**换一版提示词,这个 case 的决策会变吗?** 会 → 纳入;不会 → 排除。 + +--- + +## 2. 各 gate 的决策词表 + +`expected_decision` 必须取自该 gate 真实的决策枚举(`_golden.py` 的 +`GATE_DECISION_VOCAB` 从生产枚举派生,写错值会让 `build_golden` 直接报错): + +| gate ID | 来源枚举 | 取值 | +|---|---|---| +| `J-SYSTEM` | `VerdictType` (`src/models/judge.py`) | `pass` / `conditional` / `fail` | +| `P-RISK-SCORE-SYSTEM` | `RiskLevel` (`src/models/diff.py`) | `auto_safe` / `auto_risky` / `human_required`(+ sentinel `deleted_only`/`binary`/`excluded`) | +| `CA-SYSTEM` | `MergeDecision` (`src/models/decision.py`) | `take_current` / `take_target` / `semantic_merge` / `manual_patch` / `escalate_human` / `skip` | + +一个样本可同时为多个 gate 贡献 golden case(同一个 C-class 冲突既是 +`human_required` 的风险 case,也是 `escalate_human` 的 analyst case)。 + +--- + +## 3. 
单一真相源:meta.yaml → 生成器 + +golden 集**不手写**,而是从样本 `meta.yaml` 派生,避免与数据集标注双源漂移。 + +在样本 `meta.yaml` 里声明: + +```yaml +judgment_intensive: true # 决策由 LLM 驱动,非确定性短路 +golden_decisions: + - gate_id: P-RISK-SCORE-SYSTEM + expected_decision: human_required + - gate_id: CA-SYSTEM + expected_decision: escalate_human +``` + +两个字段都是可选的(`SampleMeta` 默认 `judgment_intensive=false`、 +`golden_decisions=()`),绝大多数样本不带。`judgment_intensive: true` 但不写 +`golden_decisions` 是合法的 no-op(先入选、后标注)。 + +生成(确定性、按 case_id 排序,输出到 `tests/eval/golden/<gate_id>.golden.json`): + +```bash +python -m scripts.eval.build_golden # 扫 tier 1/2/3 +python -m scripts.eval.build_golden --tier 1 # 只扫 tier-1 +``` + +编辑样本 `meta.yaml` 后会改变其内容哈希,需重建 lock: + +```bash +python -m scripts.eval.lock --update +python -m scripts.eval.lock --verify +``` + +消费(rollout 是你自担成本的离线步骤,见 `self-learning-system.md` Phase 3): + +```bash +merge optimize-prompts --gate CA-SYSTEM \ + --golden tests/eval/golden/CA-SYSTEM.golden.json \ + --rollouts <你跑各候选产出的 {candidate_id:{case_id:decision}}.json> +``` + +--- + +## 4. 
当前 seed 状态(2026-05-31) + +数据集现有 **9 个判断密集真实样本**,覆盖完整决策面(其余 t1-0001..0030 为 TBD +占位 / 单边平凡改动,不入选): + +- **5 个 C-class 升级样本**(t1-0005/0006/0031/0032/0033):双侧改**同一区域** → + 应升级。 +- **3 个 B-class 干净自动合并样本**(t1-0034/0035/0036):双侧对 + `models/auth/auth_token.go` 的**不相交区域**做纯增量改动,git 3-way 干净合并、 + `go build ./models/auth/` 通过,golden 保留双方(确定性 `git merge`,非系统自身 + 产物,dataset.md §1.4)。提供 auto-merge 正例。 +- **1 个 J-SYSTEM=fail fixture 样本**(t1-0037):同形态干净合并,但 upstream 侧加的是 + 一个**安全相关**的 `BelongsTo` 所有权校验;golden.tar 仍保留双方(正确),fail 负例 + 喂的是丢掉该校验的 fork-only 树(§5)。 + +逐 gate 现状: + +| gate | case 数 | 分布 | +|---|---|---| +| `CA-SYSTEM` | 8 | 5 `escalate_human` + 3 `semantic_merge`(t1-0034/0035/0036) | +| `P-RISK-SCORE-SYSTEM` | 8 | 5 `human_required` + 2 `auto_safe`(t1-0034/0036) + 1 `auto_risky`(t1-0035) | +| `J-SYSTEM` | 4 | 3 `pass`(t1-0034/0035/0036,喂 golden 树) + 1 `fail`(t1-0037,喂 fork-only 树) — 见 §5 契约 | + +> 决策面现已**全向**:优化器既能惩罚"把升级误判成自动合并"的提示(升级类负例), +> 也能奖励"敢对干净合并自动放行 / 保留双方而非 take 一侧"的提示(auto-merge 正例; +> cf. IMPLEMENTATION_REPORT t1-0003 executor take_target 丢一侧的失败模式), +> 还能惩罚"对丢了一侧的合并仍判 pass"的提示(`J-SYSTEM=fail` 负例 t1-0037)。 + +--- + +## 5. J-SYSTEM rollout 输入契约(已固定) + +`J-SYSTEM` 的 golden 语义依赖"judge 在 rollout 时看到的是哪棵树"。`optimize-prompts` +消费的 rollout JSON(`{candidate_id: {case_id: decision}}`)是你自担成本离线跑出来的, +因此**这棵树由你在跑 rollout 时按本契约喂**,否则 `expected_decision` 无法对齐产出: + +输入树**全部是 `scripts.eval.prepare` 的现成产物**,rollout 无需临时构造(`prepare` +为每个样本写出 `golden_tree/`(= `golden.tar`)和 `working_tree/`(= base + `fork.patch`, +即只取 fork 侧、完全丢掉 upstream 增量的方向性 take_current 结果)): + +| `expected_decision` | rollout 必须喂给 judge 的合并树 | 何以判得动 | +|---|---|---| +| `pass` | `prepare` 的 **`golden_tree/`**(= `golden.tar`,双侧都在) | 正确合并,calibrated 提示应认可 | +| `fail` | `prepare` 的 **`working_tree/`**(= base + `fork.patch`,丢掉 upstream 侧) | 与 `upstream.patch` 对照可见 upstream 的增量整段缺失 → 语义丢失 | + +固定项(rollout 脚本必须遵守,否则该 case 的标签无意义): + +1. **输入树**严格取自上表,不得用系统自己跑出来的合并结果(那会把 judge 的输入与 + 被测提示耦合,丧失 oracle 地位)。 +2. 
judge 的其余上下文(base / upstream.patch / fork.patch、diff 摘要)按生产口径 + 从样本五件套派生,跨 candidate 保持一致——唯一变量是 judge **提示词本身**。 + `fail` 案尤其依赖 `upstream.patch` 在场:judge 正是靠它发现 `working_tree` 缺了 + upstream 的增量。 +3. 当前 3 个 `pass` case(t1-0034/0035/0036)均喂各自 `golden_tree/`。这三棵树都是 + 纯增量双侧合并,正确提示应稳定 `pass`;过度保守的提示会误报 `conditional` / `fail`, + 这正是优化器要惩罚的——所以即便是 `pass`,case 仍有区分度(§1 判据:换提示会变吗?会)。 +4. `fail` case(t1-0037)喂其 `working_tree/`:fork 侧加了 `const DefaultTokenTTLHours`, + 但 upstream 侧加的 `BelongsTo` 所有权校验在该树中完全缺失。calibrated 提示应判 + `fail`(丢了安全校验);对 fork-only 树仍 `pass` 的提示就是负例要惩罚的对象。 + 注意 t1-0037 仅声明 `J-SYSTEM`(其 golden.tar 本身是正确合并,可日后另补 + `P-RISK`/`CA` 正例标签,但不与本 fail fixture 混用)。 + +--- + +相关:数据集构造见 [dataset.md](dataset.md),评估流程见 +[procedure.md](procedure.md),Phase 3 成本模型见 +[../plan/self-learning-system.md](../plan/self-learning-system.md)。 diff --git a/scripts/eval/_golden.py b/scripts/eval/_golden.py new file mode 100644 index 0000000..1c5d9e8 --- /dev/null +++ b/scripts/eval/_golden.py @@ -0,0 +1,115 @@ +"""Build the LLM-judgment golden set consumed by ``merge optimize-prompts``. + +A *golden case* is a ``(case_id, expected_decision)`` pair for one +``*-SYSTEM`` gate. ``optimize-prompts`` ranks prompt variants by how often +their rollout reproduces ``expected_decision``; that signal only discriminates +between variants on cases whose decision is genuinely driven by the LLM, not +short-circuited by a deterministic rule (security_sensitive force, deterministic +veto, heuristic cap). This module turns the ``judgment_intensive`` / +``golden_decisions`` fields authored in each sample's ``meta.yaml`` into the +per-gate ``[{case_id, expected_decision}]`` JSON the CLI consumes — meta.yaml is +the single source of truth, so the golden set never drifts from the dataset. + +The gate decision vocabularies are derived from the production enums +(``VerdictType`` / ``RiskLevel`` / ``MergeDecision``) so a renamed decision +value fails the build instead of silently mislabelling a case. 
``GoldenCase`` is +imported from the production harness so the emitted objects are exactly what +``optimize-prompts --golden`` validates. +""" + +from __future__ import annotations + +from pathlib import Path + +from src.models.decision import MergeDecision +from src.models.diff import RiskLevel +from src.models.judge import VerdictType +from src.tools.prompt_optimizer import GoldenCase + +from scripts.eval._ground_truth import GroundTruthMissing, load_meta +from scripts.eval._schemas import SampleMeta + +# Tier -> sample container, mirroring scripts.eval.lock.TIER_LAYOUT. Only tiers +# whose entries carry a SampleMeta (tier-1 micro-bench, tier-3 adversarial) can +# contribute golden cases; tier-2 replays have no meta.yaml and are skipped. +_TIER_LAYOUT: dict[int, str] = { + 1: "tier1/samples", + 2: "tier2/replays", + 3: "tier3/adversarial", +} + +# expected_decision must be one of the gate's real decision values. Keyed by the +# gate IDs registered in src/llm/prompts/gate_registry.py. +GATE_DECISION_VOCAB: dict[str, frozenset[str]] = { + "J-SYSTEM": frozenset(v.value for v in VerdictType), + "P-RISK-SCORE-SYSTEM": frozenset(v.value for v in RiskLevel), + "CA-SYSTEM": frozenset(v.value for v in MergeDecision), +} + + +class GoldenBuildError(ValueError): + """A sample declared a golden case the build rejects (typo / unknown gate).""" + + +def _validate_decision(sample_id: str, gate_id: str, decision: str) -> None: + vocab = GATE_DECISION_VOCAB.get(gate_id) + if vocab is None: + raise GoldenBuildError( + f"{sample_id}: unknown golden gate '{gate_id}' " + f"(known: {sorted(GATE_DECISION_VOCAB)})" + ) + if decision not in vocab: + raise GoldenBuildError( + f"{sample_id}: '{decision}' is not a valid {gate_id} decision " + f"(allowed: {sorted(vocab)})" + ) + + +def _iter_sample_metas(datasets_root: Path, tiers: tuple[int, ...]) -> list[SampleMeta]: + metas: list[SampleMeta] = [] + for tier in tiers: + layout = _TIER_LAYOUT.get(tier) + if layout is None: + continue + 
container = datasets_root / layout + if not container.is_dir(): + continue + for sample_dir in sorted(p for p in container.iterdir() if p.is_dir()): + try: + metas.append(load_meta(sample_dir)) + except GroundTruthMissing: + # Not every dir is a meta-bearing sample (e.g. tier-2 replays). + continue + return metas + + +def build_golden_sets( + datasets_root: Path, + tiers: tuple[int, ...] = (1, 2, 3), +) -> dict[str, list[GoldenCase]]: + """Collect judgment-intensive golden cases grouped by ``*-SYSTEM`` gate. + + Only samples with ``judgment_intensive: true`` contribute; each declared + ``golden_decisions`` entry is validated against its gate's vocabulary + (``GoldenBuildError`` on mismatch) and grouped under that gate. Cases within + a gate are sorted by ``case_id`` for deterministic output. A sample marked + judgment-intensive with no ``golden_decisions`` is a no-op (it contributes + no case) rather than an error — staging a sample before labelling it is + allowed. + """ + grouped: dict[str, list[GoldenCase]] = {} + for meta in _iter_sample_metas(datasets_root, tiers): + if not meta.judgment_intensive: + continue + for entry in meta.golden_decisions: + _validate_decision(meta.sample_id, entry.gate_id, entry.expected_decision) + grouped.setdefault(entry.gate_id, []).append( + GoldenCase( + case_id=meta.sample_id, + expected_decision=entry.expected_decision, + ) + ) + return { + gate_id: sorted(cases, key=lambda c: c.case_id) + for gate_id, cases in sorted(grouped.items()) + } diff --git a/scripts/eval/_schemas.py b/scripts/eval/_schemas.py index 889a34d..62d61f9 100644 --- a/scripts/eval/_schemas.py +++ b/scripts/eval/_schemas.py @@ -380,12 +380,34 @@ class AcceptanceThresholds(BaseModel): # --------------------------------------------------------------------------- +class GoldenDecision(BaseModel): + """One ``(gate, expected_decision)`` pair declared by a sample. 
+ + Lets a single judgment-intensive sample contribute a golden case to + several ``*-SYSTEM`` gates at once (e.g. the same C-class conflict is a + ``human_required`` risk case and an ``escalate_human`` conflict-analyst + case). ``expected_decision`` is validated against the gate's real + decision vocabulary in ``_golden.py`` — never here, so this schema stays + free of a src import. + """ + + model_config = _FROZEN + + gate_id: str + expected_decision: str + + class SampleMeta(BaseModel): """Parsed contents of one sample's ``meta.yaml``. Mirrors the keys produced by the Phase 1 reference samples (``tests/eval/datasets/.../meta.yaml``). Tier-3 entries additionally set ``loss_class`` to one of M1..M6; Tier-1/2 leave it ``None``. + + ``judgment_intensive`` + ``golden_decisions`` opt a sample into the + LLM-judgment golden set consumed by ``merge optimize-prompts --golden`` + (see ``doc/evaluation/golden.md``). They are absent on most samples and + default to "not a golden case". """ model_config = _FROZEN @@ -396,6 +418,8 @@ class SampleMeta(BaseModel): loss_class: str | None = None expected_human: bool = False description: str | None = None + judgment_intensive: bool = False + golden_decisions: tuple[GoldenDecision, ...] = () class GoldenFileEntry(BaseModel): @@ -431,6 +455,7 @@ class GroundTruthBundle(BaseModel): "GateOperator", "GateResult", "GateVerdict", + "GoldenDecision", "GoldenFileEntry", "GroundTruthBundle", "ManifestEntry", diff --git a/scripts/eval/build_golden.py b/scripts/eval/build_golden.py new file mode 100644 index 0000000..5155b74 --- /dev/null +++ b/scripts/eval/build_golden.py @@ -0,0 +1,86 @@ +"""Emit per-gate LLM-judgment golden JSON from dataset meta.yaml. + +For every ``*-SYSTEM`` gate that any judgment-intensive sample labels, writes +``<out-dir>/<gate_id>.golden.json`` in the ``[{case_id, expected_decision}]`` +shape consumed by ``merge optimize-prompts --golden``.
meta.yaml is the single +source of truth (see ``doc/evaluation/golden.md``); re-run this whenever a +sample's ``judgment_intensive`` / ``golden_decisions`` fields change. + +Usage: + python -m scripts.eval.build_golden # defaults below + python -m scripts.eval.build_golden --tier 1 --tier 3 + python -m scripts.eval.build_golden \ + --datasets tests/eval/datasets --out-dir tests/eval/golden +""" + +from __future__ import annotations + +import argparse +from pathlib import Path + +from scripts.eval._common import write_json +from scripts.eval._golden import build_golden_sets + +REPO_ROOT = Path(__file__).resolve().parents[2] +DEFAULT_DATASETS_DIR = REPO_ROOT / "tests" / "eval" / "datasets" +DEFAULT_OUT_DIR = REPO_ROOT / "tests" / "eval" / "golden" + + +def _eprint(message: str) -> None: + print(message) + + +def cmd_build(datasets_dir: Path, out_dir: Path, tiers: tuple[int, ...]) -> int: + golden = build_golden_sets(datasets_dir, tiers=tiers) + if not golden: + _eprint( + "No judgment-intensive golden cases found " + f"(datasets={datasets_dir}, tiers={list(tiers)})." 
+ ) + return 0 + for gate_id, cases in golden.items(): + out_path = out_dir / f"{gate_id}.golden.json" + payload = [case.model_dump(mode="json") for case in cases] + write_json(out_path, payload, sort_keys=False) + _eprint(f"{gate_id}: {len(cases)} case(s) -> {out_path}") + return 0 + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="scripts.eval.build_golden", + description="Emit per-gate LLM-judgment golden JSON from dataset meta.yaml.", + ) + parser.add_argument( + "--datasets", + default=str(DEFAULT_DATASETS_DIR), + help=f"Datasets root (default: {DEFAULT_DATASETS_DIR}).", + ) + parser.add_argument( + "--out-dir", + default=str(DEFAULT_OUT_DIR), + help=f"Directory to write .golden.json into (default: {DEFAULT_OUT_DIR}).", + ) + parser.add_argument( + "--tier", + type=int, + choices=(1, 2, 3), + action="append", + dest="tiers", + help="Tier to scan (repeatable). Default: all of 1, 2, 3.", + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + args = _build_arg_parser().parse_args(argv) + tiers = tuple(args.tiers) if args.tiers else (1, 2, 3) + return cmd_build( + datasets_dir=Path(args.datasets).resolve(), + out_dir=Path(args.out_dir).resolve(), + tiers=tiers, + ) + + +if __name__ == "__main__": # pragma: no cover - direct CLI entry + raise SystemExit(main()) diff --git a/tests/eval/datasets/tier1/samples/t1-0005/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0005/meta.yaml index 0d7482f..644b1de 100644 --- a/tests/eval/datasets/tier1/samples/t1-0005/meta.yaml +++ b/tests/eval/datasets/tier1/samples/t1-0005/meta.yaml @@ -9,3 +9,9 @@ description: | merge` produces conflict markers, so the system is correct to escalate (expected_human=true). Backfilled 2026-05-16 from real 3-way merge dry-run. 
+judgment_intensive: true # LLM-driven: no security_sensitive force / deterministic veto +golden_decisions: + - gate_id: P-RISK-SCORE-SYSTEM + expected_decision: human_required + - gate_id: CA-SYSTEM + expected_decision: escalate_human diff --git a/tests/eval/datasets/tier1/samples/t1-0006/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0006/meta.yaml index a956150..07dde15 100644 --- a/tests/eval/datasets/tier1/samples/t1-0006/meta.yaml +++ b/tests/eval/datasets/tier1/samples/t1-0006/meta.yaml @@ -8,3 +8,9 @@ description: | requirements.txt (overlapping hunk). `git merge` conflicts on that file, so the system is correct to escalate (expected_human=true). Backfilled 2026-05-16 from real 3-way merge dry-run. +judgment_intensive: true # LLM-driven: no security_sensitive force / deterministic veto +golden_decisions: + - gate_id: P-RISK-SCORE-SYSTEM + expected_decision: human_required + - gate_id: CA-SYSTEM + expected_decision: escalate_human diff --git a/tests/eval/datasets/tier1/samples/t1-0031/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0031/meta.yaml index 0809f39..0862216 100644 --- a/tests/eval/datasets/tier1/samples/t1-0031/meta.yaml +++ b/tests/eval/datasets/tier1/samples/t1-0031/meta.yaml @@ -17,3 +17,10 @@ description: | Under single-shot Tier-1 eval the expected outcome is escalation (no human present), so this exercises C-class escalation coverage rather than auto-merge-to-golden. 
+judgment_intensive: true # auth/** only hits the weak risk-hint nudge, not the + # strict security_sensitive floor — decision stays LLM-driven +golden_decisions: + - gate_id: P-RISK-SCORE-SYSTEM + expected_decision: human_required + - gate_id: CA-SYSTEM + expected_decision: escalate_human diff --git a/tests/eval/datasets/tier1/samples/t1-0032/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0032/meta.yaml index 2f413f1..b6b69e4 100644 --- a/tests/eval/datasets/tier1/samples/t1-0032/meta.yaml +++ b/tests/eval/datasets/tier1/samples/t1-0032/meta.yaml @@ -20,3 +20,10 @@ description: | while this sample provides the C-class coverage + correct golden reference. Under single-shot Tier-1 eval the expected outcome is escalation (no human present). +judgment_intensive: true # auth/** only hits the weak risk-hint nudge, not the + # strict security_sensitive floor — decision stays LLM-driven +golden_decisions: + - gate_id: P-RISK-SCORE-SYSTEM + expected_decision: human_required + - gate_id: CA-SYSTEM + expected_decision: escalate_human diff --git a/tests/eval/datasets/tier1/samples/t1-0033/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0033/meta.yaml index be779b7..0272e03 100644 --- a/tests/eval/datasets/tier1/samples/t1-0033/meta.yaml +++ b/tests/eval/datasets/tier1/samples/t1-0033/meta.yaml @@ -16,3 +16,10 @@ description: | additions are load-bearing; a directional take_current/take_target would silently drop one feature. Under single-shot Tier-1 eval the expected outcome is escalation (no human present). 
+judgment_intensive: true # auth/** only hits the weak risk-hint nudge, not the + # strict security_sensitive floor — decision stays LLM-driven +golden_decisions: + - gate_id: P-RISK-SCORE-SYSTEM + expected_decision: human_required + - gate_id: CA-SYSTEM + expected_decision: escalate_human diff --git a/tests/eval/datasets/tier1/samples/t1-0034/base.tar b/tests/eval/datasets/tier1/samples/t1-0034/base.tar new file mode 100644 index 0000000000000000000000000000000000000000..b38e5ebd0856682bb4ec8a97500c3f68d85fda12 GIT binary patch literal 10240 zcmeHKQE%Ev5Y8+96;q_P_8m0|Ayh(&PDvm=bd>8Qq^(p%)Uh|P7vrVtg@ilhzu(O6 z+Tg^fFMaA6DIxak&dz-E%{QC5h*+F9gQc43UlS$nSmIB`-jDn2k!5> z2IoRf+1~~B9?X_Iy>6_0n>jm>i$wO%lrF8?P`TqGZsav@R%sgf9t zkajyRz(z`ar#8d|W3(GEGjri7JL?$gGqC6!$hIUP?7hLzmJ=Iu&LMAMN zG`$xRt6lqG|Gdz~UGZldCfY~cL2S$7Cg16Tg+!MK*gW7d1W|;WQost^ynN?&nS{_M z^Cl_REH{Pm%2WK;sY#}mvK#@ou3c;a(5=I-CJfJc)FBrV98)oo`U}b`Q7Oh(D37T& zk-`+jFCmoifl)6*1sLC;2Zaw$RHqH_^+Cvax4yWe69i8}S^tVSPA8c)pGr(mBnY~w|!CzN>TK#*F&RFy@hV;EfxhtTVvpIi;c@OlTWU(<>GhXt2eaOgg3T7Qh)d#Qr?!j@zl z+{A1Y%#oH-ta!vy%~8pM7!=tcL8B#&TL@5t??LXt^=x8P3ComBT6E``L0G4gH3xaZ z6>h@S(T#t>raZ+aeNX!1ChWfKu#8B9R40fuH|M0lZ*|;d4B{y;f`jx`pX<^pL!DpB zGRnPUa5qB4TlwTWeS@%nk`6@}#00DB`KJsUjk$!jKN9z@mj(QeB}}5Y&~fJEseMDd z$80tCN$W$cl$QL|w$#>BzQSH12I>(pPt6U#V) z8>2r)ZE-Ux3HgN;$NtmG`$j8oh)#J{_-Em}p?Uy2a{MzhdK%ghHkEFqh5>zDLcIno z>>wzhZwc>#YZAilYHTHAS`6JvNk2kr=3w|2>>FmO06Se`{$dU{YsGNFjuD$%F41(@ zGVkD?YS#?XT$&_nhX5Ewz ztxvSC3-sb+Mp2vYm%~=~+9PL{1eq(ysDidf1jWkA=7bGFbkbJU&Uq4TT1^-I7GLcc zF+}|h52%l*B$}?*I^{T zFEFrYYw)J&_o-QLvzW9@z5LdNm-odmbl2e8kx=~y90hp(V_xu0SPhEBmbu4bwymkU z&^SKu<*sR9EBj<;d2MqjprmM?5h2Rh_M54P@qD^IWkEu28=G=&_=~8Pr*OKyA~D`h zBtXTYNb-BDy$0vA+6>381|pklPdDvbj!3w3ybl))x{eW z3YuWDw@eF2-^`5``GvPASTUK>HC&j9pUb3$py0~Qt?=cVro#3WWkw2Y{z>vjc;CSC z%=}N1E4JCTDOrhB&-}~f+(kO-G5=HlPg^BdAy6SuAy6SuAy6SuAy6SuAy6SuAy6Su ZAy6SuAy6SuAy6SuAy6SuA@F}k;6IBB+2Q~I literal 0 HcmV?d00001 diff --git a/tests/eval/datasets/tier1/samples/t1-0034/fork.patch 
b/tests/eval/datasets/tier1/samples/t1-0034/fork.patch new file mode 100644 index 0000000..56b1590 --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0034/fork.patch @@ -0,0 +1,14 @@ +diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go +index d01ddbca1e..2a7ac465fd 100644 +--- a/models/auth/auth_token.go ++++ b/models/auth/auth_token.go +@@ -17,6 +17,9 @@ import ( + + type AuthorizationPurpose string + ++// MaxActiveTokensPerUser caps how many concurrent long-term tokens a user may hold. ++const MaxActiveTokensPerUser = 10 ++ + var ( + // Used to store long term authorization tokens. + LongTermAuthorization AuthorizationPurpose = "long_term_authorization" diff --git a/tests/eval/datasets/tier1/samples/t1-0034/golden.tar b/tests/eval/datasets/tier1/samples/t1-0034/golden.tar new file mode 100644 index 0000000000000000000000000000000000000000..af3c803ad2de47b781626050d7cada97d0deec5a GIT binary patch literal 10240 zcmeHLZBN@s5auiX71K!9I(Ml9Ax))7(FsWCMMrZSgtk(J=wdI456A1S*MZ!L|9&&O zYbORreCemkk%(i@?z}wn%*>jF2zZpXS*GUt>spCB9(yyf^W#2y_~~`KJ9Nd*)9d?g zud}oNZol*H;Go+j`F`hpr?&%pKhA~inzB?ei7stl@l#9MZ8#RmN`|w!g3ey&0H$*e zCqmBnKLQSs7DA>T97Yl7H7Q7*a=GMz=iuIpv*DMwr=ic|l)oJZJXYat$Ymcsj;D^3 zF#nFtI1sxWCtM^#DsY|MhA(2pA5?>W%2lF7JDs!6yBa3CXo`(8>v3A+6iil;`Z8qV*{nH%ZLJrMo=F z0W`?hYf^VzQb&+)m}V3{QWNzl6f!Vj zX?iatLbQ8kf1YdMHvjVt*IGxz!JJn4O}f*B`$W?K_=1HIupq!qDIl|KT%L2A#XgKk zybj7W%MBr#)gk`p>`JaO8CQzkhE}lwK)(UM88@2FqjtFzpe7oYTtZRBiWpQ%kI6QX z!X(72AhOYcUT3}n^lvXqTue;rx zqq)x|d~%_6e)888CYPtDFdUs6UY<_j%@$g}A&aAjB$T;xs6KBN-W0pWg<^}urew-) zB0gaYq@@(gFyN`?sN^gHMKXv{Xi4Li9Ms@9q0rzqZD5oM%apvO>B=($xIvaR2g5j2 zxCvJWH{Lm)g((*4N9ucS%8bEpM6kL|?}%feahK5cCgR?0<_>@0F_+j}$egh}vrmZk z7_H_$%^2S_8AWYamo1g`3>$N-+maqlV^zdW(nymL?vRZv4q#dE41Z-hp-e$K7g-d* zjp;w8+u~;GB*bi3hFE_aZQVzuVV?MB?z_Hvpm9K>&O0{OmR5vCr39qXj!`PWN|zYFSisFnah&W^#O5}XC_3nv zXK)|lv_}jujJj8yLH67ZUkScTLP`*|#(y1vynv_K2X^wz9UcCWsE&cD0i*4%VfnlYWbjwu~5} zK4si!r8=^WqNJvp@`We1_r1vNijJc_w|u4}5i6R+^E4rYSZeZ%{CS?>&q!MC;*p2u 
z(>1k(0sIb!lVJ`d-8VKURXE%vuDGuYv_Yv?Ni&Oi3 zg^oR2gC|^XM8$fW`=r~{FW>v{%YE4mT{XCNBGl*sM*-S@EE2wUt3|%pG!J>iH+!m1 zG>#8^xNREP%s$y!o!bHms7PAVG$D%E)|<(P(R{Qz;w&b$t#vxr{6$o&LpWMpQZwER zgki@bPo5rwi}poDS{;_rRvnxvZPTkaV!=dO$zT9`Px46_c`@9o-l7|e(YsN_YZOX7 z!9;JF#;CuU6D{%!Pf@VsGR5OH2s81EHfbR!sd5V`e7&Nnuz5t8QwKILXWj(Q8k&eZfKc_QrjPF!*uEjS(bC?P;9R1`nzD{5rQL8cI-5g7> zoF$KN#gd*wEbeV6FP&2gYj{r4M~-1`2uW%2)6Xx-?pj;r*@aFjKlQ%@>fmYwY6NNo qY6NNoY6NNoY6NNoY6NNoY6NNoY6NNoY6NNoY6NNoY6N}~1pWtie@b)! literal 0 HcmV?d00001 diff --git a/tests/eval/datasets/tier1/samples/t1-0034/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0034/meta.yaml new file mode 100644 index 0000000..30042f7 --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0034/meta.yaml @@ -0,0 +1,30 @@ +sample_id: t1-0034 +tier: 1 +category: B +loss_class: null # organic clean-merge sample (not M-injected) +expected_human: false # disjoint additive changes 3-way merge cleanly → auto-merge, no escalation +description: | + B-class clean auto-merge: models/auth/auth_token.go — fork and upstream + make INDEPENDENT, NON-overlapping additions that git 3-way merges with no + conflict: + * fork adds `const MaxActiveTokensPerUser` directly below the + AuthorizationPurpose type declaration (top of file). + * upstream appends a `Remaining()` method after HashValidator (end of + file). + Golden keeps BOTH (deterministic `git merge`, `go build ./models/auth/` + verified); it is the obvious correct clean merge, NOT the merge system's + own output (dataset.md §1.4). Purely additive — no signature or behavior + change — so risk is low. Seeds the auto_safe / semantic_merge / judge-pass + positive decision face the C-class escalation samples (t1-0031..0033) + cannot exercise. +judgment_intensive: true # risk is the LLM hybrid score (no deterministic short- + # circuit: additive, not security_sensitive); a weaker + # prompt could over-escalate this clean merge or drop one + # side (cf. 
t1-0003 executor take_target, IMPL_REPORT) +golden_decisions: + - gate_id: P-RISK-SCORE-SYSTEM + expected_decision: auto_safe + - gate_id: CA-SYSTEM + expected_decision: semantic_merge # keep both disjoint additions, don't take one side + - gate_id: J-SYSTEM + expected_decision: pass # judge fed the golden (both-sides) tree → pass (golden.md §5) diff --git a/tests/eval/datasets/tier1/samples/t1-0034/provenance.yaml b/tests/eval/datasets/tier1/samples/t1-0034/provenance.yaml new file mode 100644 index 0000000..0bbd157 --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0034/provenance.yaml @@ -0,0 +1,15 @@ +# Capture provenance — sibling of meta.yaml, not consumed by prepare. +# Kept for audit trail (dataset.md §2.4 maintenance rules) so reviewers +# can re-derive the sample from the original git history. +repo: /Users/angel/AI/merge-test/forgejo +base_ref: 160377405c53145e56dd0aab6ee05fce9764c184 +upstream_ref: a3635aa2f3cb1a2578f4f0b6bf8f477be562e8c1 +fork_ref: ba2beff0d03cd7f9434063df63d894ec11c3e51e +golden_ref: 682270de0c402a0fd73be751648269f271065574 +paths: ['models/auth/auth_token.go'] +# F8: files touched in base→golden but in neither base→upstream nor +# base→fork — content the human added directly during the merge commit. +# Reviewers should weigh whether to keep the sample (the merge system +# cannot reconstruct these files from the two patches alone, so they +# surface as MISS_UPSTREAM / MISS_FORK in eval diffs). 
+noisy_paths: [] diff --git a/tests/eval/datasets/tier1/samples/t1-0034/upstream.patch b/tests/eval/datasets/tier1/samples/t1-0034/upstream.patch new file mode 100644 index 0000000..56bd093 --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0034/upstream.patch @@ -0,0 +1,13 @@ +diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go +index d01ddbca1e..76acc10e5b 100644 +--- a/models/auth/auth_token.go ++++ b/models/auth/auth_token.go +@@ -111,3 +111,8 @@ func HashValidator(validator []byte) string { + h.Write(validator) + return hex.EncodeToString(h.Sum(nil)) + } ++ ++// Remaining reports the duration until the token expires, relative to now. ++func (authToken *AuthorizationToken) Remaining() time.Duration { ++ return time.Until(authToken.Expiry.AsLocalTime()) ++} diff --git a/tests/eval/datasets/tier1/samples/t1-0035/base.tar b/tests/eval/datasets/tier1/samples/t1-0035/base.tar new file mode 100644 index 0000000000000000000000000000000000000000..b38e5ebd0856682bb4ec8a97500c3f68d85fda12 GIT binary patch literal 10240 zcmeHKQE%Ev5Y8+96;q_P_8m0|Ayh(&PDvm=bd>8Qq^(p%)Uh|P7vrVtg@ilhzu(O6 z+Tg^fFMaA6DIxak&dz-E%{QC5h*+F9gQc43UlS$nSmIB`-jDn2k!5> z2IoRf+1~~B9?X_Iy>6_0n>jm>i$wO%lrF8?P`TqGZsav@R%sgf9t zkajyRz(z`ar#8d|W3(GEGjri7JL?$gGqC6!$hIUP?7hLzmJ=Iu&LMAMN zG`$xRt6lqG|Gdz~UGZldCfY~cL2S$7Cg16Tg+!MK*gW7d1W|;WQost^ynN?&nS{_M z^Cl_REH{Pm%2WK;sY#}mvK#@ou3c;a(5=I-CJfJc)FBrV98)oo`U}b`Q7Oh(D37T& zk-`+jFCmoifl)6*1sLC;2Zaw$RHqH_^+Cvax4yWe69i8}S^tVSPA8c)pGr(mBnY~w|!CzN>TK#*F&RFy@hV;EfxhtTVvpIi;c@OlTWU(<>GhXt2eaOgg3T7Qh)d#Qr?!j@zl z+{A1Y%#oH-ta!vy%~8pM7!=tcL8B#&TL@5t??LXt^=x8P3ComBT6E``L0G4gH3xaZ z6>h@S(T#t>raZ+aeNX!1ChWfKu#8B9R40fuH|M0lZ*|;d4B{y;f`jx`pX<^pL!DpB zGRnPUa5qB4TlwTWeS@%nk`6@}#00DB`KJsUjk$!jKN9z@mj(QeB}}5Y&~fJEseMDd z$80tCN$W$cl$QL|w$#>BzQSH12I>(pPt6U#V) z8>2r)ZE-Ux3HgN;$NtmG`$j8oh)#J{_-Em}p?Uy2a{MzhdK%ghHkEFqh5>zDLcIno z>>wzhZwc>#YZAilYHTHAS`6JvNk2kr=3w|2>>FmO06Se`{$dU{YsGNFjuD$%F41(@ zGVkD?YS#?XT$&_nhX5Ewz 
ztxvSC3-sb+Mp2vYm%~=~+9PL{1eq(ysDidf1jWkA=7bGFbkbJU&Uq4TT1^-I7GLcc zF+}|h52%l*B$}?*I^{T zFEFrYYw)J&_o-QLvzW9@z5LdNm-odmbl2e8kx=~y90hp(V_xu0SPhEBmbu4bwymkU z&^SKu<*sR9EBj<;d2MqjprmM?5h2Rh_M54P@qD^IWkEu28=G=&_=~8Pr*OKyA~D`h zBtXTYNb-BDy$0vA+6>381|pklPdDvbj!3w3ybl))x{eW z3YuWDw@eF2-^`5``GvPASTUK>HC&j9pUb3$py0~Qt?=cVro#3WWkw2Y{z>vjc;CSC z%=}N1E4JCTDOrhB&-}~f+(kO-G5=HlPg^BdAy6SuAy6SuAy6SuAy6SuAy6SuAy6Su ZAy6SuAy6SuAy6SuAy6SuA@F}k;6IBB+2Q~I literal 0 HcmV?d00001 diff --git a/tests/eval/datasets/tier1/samples/t1-0035/fork.patch b/tests/eval/datasets/tier1/samples/t1-0035/fork.patch new file mode 100644 index 0000000..b73724a --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0035/fork.patch @@ -0,0 +1,16 @@ +diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go +index d01ddbca1e..9c86827a3a 100644 +--- a/models/auth/auth_token.go ++++ b/models/auth/auth_token.go +@@ -57,6 +57,11 @@ func (authToken *AuthorizationToken) IsExpired() bool { + return authToken.Expiry.AsLocalTime().Before(time.Now()) + } + ++// IsLongTerm reports whether the token is a long-term authorization token. ++func (authToken *AuthorizationToken) IsLongTerm() bool { ++ return authToken.Purpose == LongTermAuthorization ++} ++ + // GenerateAuthToken generates a new authentication token for the given user. + // It returns the lookup key and validator values that should be passed to the + // user via a long-term cookie. 
diff --git a/tests/eval/datasets/tier1/samples/t1-0035/golden.tar b/tests/eval/datasets/tier1/samples/t1-0035/golden.tar new file mode 100644 index 0000000000000000000000000000000000000000..466e0865f6a7b61fea14496cbb728100c982f1da GIT binary patch literal 10240 zcmeHKQE%He5YDUhE3O9C%9|^vj*|igf+2~W&RFVoiQN_`0+*2(n~N;E6z#+t^51ty zQnHmO%O3hv$Uy5zcRb#G_uWT&E+ZbNEwRo?b7wD_8u@w2;Y%rCP|8L#h>K z<9RElD0eJ+obm{?M2l1$Vwp@qb2T?*O68WL0}fIec(pTh7@^&QnVAbu*d>bZr47zKhKTvsr<7A6BDEEA+}|CQ|xrXL!wIre9lA+EQ)YZ3fN&+SKwVQlMwo3 z-W26qciWgod5QmcHO2K(l_TKSO^7W3x^?))rqS^3P0ED=N7PKD{(`DXbc*>E+GD#- zq_7PMN(g0RVAjh}1LimALE$43eb52;dMDMqTVLGJ4pJncs(;0{KJYF&4~D(|XE^Vl z_Rssnlm3NI?QMdH8npII%A4ilcfNMpS>Upb?Aq~g@aMQs-oLSQ#-ooc77;^&XZ){a z9L;hr;FB3$^20wyFdUzqL9c&$G(H=_t0!pvigrBbD}*;1C*sZo&xSS_;q+c;HeYzP z{@p^TtN`>X@v&_iv8$L5*&L~>o$ zXZFN`bO@*j@3`(KQ;~4$SR)wVok?l`WdV3YNE8qf{B9<-=9`!?=rz$~Cw0qe)L>1f zTZfmA`02KohM5}l9ZKE5!&40Rzt0=K zNvlb@xHk8A%(pewG#XDnd}$h5(rnk+2P?~an?nI5O>0;YqKe(T*?!o_$LnLx5(?Yg zv~$B>M76wx!;)RAY) zEz=ewEH>`e6mL){Xo9WYF^!SF*$X@J3onVV;wq(I8PZ->a+!1x6hisA6~0{3sjz*! znvnwA-+;jouS(dSS@1~-%{R+7Eh~}gS#YtO`$$J~L;0b9vRB1b2vi7E2vi7E2vi7E k2vi7E2vi7E2vi7E2vi7E2vi7E2vi7E2vi7E2t13xzx6XIyZ`_I literal 0 HcmV?d00001 diff --git a/tests/eval/datasets/tier1/samples/t1-0035/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0035/meta.yaml new file mode 100644 index 0000000..7a6404a --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0035/meta.yaml @@ -0,0 +1,29 @@ +sample_id: t1-0035 +tier: 1 +category: B +loss_class: null # organic clean-merge sample (not M-injected) +expected_human: false # disjoint additive changes 3-way merge cleanly → auto-merge with caution +description: | + B-class clean auto-merge: models/auth/auth_token.go — fork and upstream + make INDEPENDENT, NON-overlapping additions to the token type that git + 3-way merges with no conflict: + * fork adds an `IsLongTerm()` method right after `IsExpired()` (mid-file). 
+ * upstream adds a `RevokedUnix timeutil.TimeStamp` field to the + AuthorizationToken struct (before `Expiry`). + Golden keeps BOTH (deterministic `git merge`, `go build ./models/auth/` + verified). Unlike t1-0034/0036 this touches the token struct shape and + lifecycle logic under models/auth/** — it only hits the weak risk-hint + nudge (NOT the strict security_sensitive floor), so the hybrid LLM score + has the final say (config.py). The correct outcome is auto_risky: merge + both, but flag the auth-struct change for attention — exercises the + auto_risky positive that distinguishes a calibrated prompt from one that + treats every auth touch as auto_safe. +judgment_intensive: true # auth/** weak hint only; risk decision is LLM-driven, in + # the risk_score_low..high band rather than short-circuited +golden_decisions: + - gate_id: P-RISK-SCORE-SYSTEM + expected_decision: auto_risky + - gate_id: CA-SYSTEM + expected_decision: semantic_merge # keep both disjoint additions, don't take one side + - gate_id: J-SYSTEM + expected_decision: pass # judge fed the golden (both-sides) tree → pass (golden.md §5) diff --git a/tests/eval/datasets/tier1/samples/t1-0035/provenance.yaml b/tests/eval/datasets/tier1/samples/t1-0035/provenance.yaml new file mode 100644 index 0000000..1e96229 --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0035/provenance.yaml @@ -0,0 +1,15 @@ +# Capture provenance — sibling of meta.yaml, not consumed by prepare. +# Kept for audit trail (dataset.md §2.4 maintenance rules) so reviewers +# can re-derive the sample from the original git history. 
+repo: /Users/angel/AI/merge-test/forgejo +base_ref: 160377405c53145e56dd0aab6ee05fce9764c184 +upstream_ref: b176f1ce49adab083d987e3e0fa034b03b0994c1 +fork_ref: d0523cc96b39b5ff27d7abaa53e51a257cc9e465 +golden_ref: 47b9b9619040f22617135330bfd586deddb07afd +paths: ['models/auth/auth_token.go'] +# F8: files touched in base→golden but in neither base→upstream nor +# base→fork — content the human added directly during the merge commit. +# Reviewers should weigh whether to keep the sample (the merge system +# cannot reconstruct these files from the two patches alone, so they +# surface as MISS_UPSTREAM / MISS_FORK in eval diffs). +noisy_paths: [] diff --git a/tests/eval/datasets/tier1/samples/t1-0035/upstream.patch b/tests/eval/datasets/tier1/samples/t1-0035/upstream.patch new file mode 100644 index 0000000..62a21a5 --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0035/upstream.patch @@ -0,0 +1,12 @@ +diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go +index d01ddbca1e..0f7d96086d 100644 +--- a/models/auth/auth_token.go ++++ b/models/auth/auth_token.go +@@ -40,6 +40,7 @@ type AuthorizationToken struct { + LookupKey string `xorm:"INDEX UNIQUE"` + HashedValidator string + Purpose AuthorizationPurpose `xorm:"NOT NULL DEFAULT 'long_term_authorization'"` ++ RevokedUnix timeutil.TimeStamp + Expiry timeutil.TimeStamp + } + diff --git a/tests/eval/datasets/tier1/samples/t1-0036/base.tar b/tests/eval/datasets/tier1/samples/t1-0036/base.tar new file mode 100644 index 0000000000000000000000000000000000000000..b38e5ebd0856682bb4ec8a97500c3f68d85fda12 GIT binary patch literal 10240 zcmeHKQE%Ev5Y8+96;q_P_8m0|Ayh(&PDvm=bd>8Qq^(p%)Uh|P7vrVtg@ilhzu(O6 z+Tg^fFMaA6DIxak&dz-E%{QC5h*+F9gQc43UlS$nSmIB`-jDn2k!5> z2IoRf+1~~B9?X_Iy>6_0n>jm>i$wO%lrF8?P`TqGZsav@R%sgf9t zkajyRz(z`ar#8d|W3(GEGjri7JL?$gGqC6!$hIUP?7hLzmJ=Iu&LMAMN zG`$xRt6lqG|Gdz~UGZldCfY~cL2S$7Cg16Tg+!MK*gW7d1W|;WQost^ynN?&nS{_M z^Cl_REH{Pm%2WK;sY#}mvK#@ou3c;a(5=I-CJfJc)FBrV98)oo`U}b`Q7Oh(D37T& 
zk-`+jFCmoifl)6*1sLC;2Zaw$RHqH_^+Cvax4yWe69i8}S^tVSPA8c)pGr(mBnY~w|!CzN>TK#*F&RFy@hV;EfxhtTVvpIi;c@OlTWU(<>GhXt2eaOgg3T7Qh)d#Qr?!j@zl z+{A1Y%#oH-ta!vy%~8pM7!=tcL8B#&TL@5t??LXt^=x8P3ComBT6E``L0G4gH3xaZ z6>h@S(T#t>raZ+aeNX!1ChWfKu#8B9R40fuH|M0lZ*|;d4B{y;f`jx`pX<^pL!DpB zGRnPUa5qB4TlwTWeS@%nk`6@}#00DB`KJsUjk$!jKN9z@mj(QeB}}5Y&~fJEseMDd z$80tCN$W$cl$QL|w$#>BzQSH12I>(pPt6U#V) z8>2r)ZE-Ux3HgN;$NtmG`$j8oh)#J{_-Em}p?Uy2a{MzhdK%ghHkEFqh5>zDLcIno z>>wzhZwc>#YZAilYHTHAS`6JvNk2kr=3w|2>>FmO06Se`{$dU{YsGNFjuD$%F41(@ zGVkD?YS#?XT$&_nhX5Ewz ztxvSC3-sb+Mp2vYm%~=~+9PL{1eq(ysDidf1jWkA=7bGFbkbJU&Uq4TT1^-I7GLcc zF+}|h52%l*B$}?*I^{T zFEFrYYw)J&_o-QLvzW9@z5LdNm-odmbl2e8kx=~y90hp(V_xu0SPhEBmbu4bwymkU z&^SKu<*sR9EBj<;d2MqjprmM?5h2Rh_M54P@qD^IWkEu28=G=&_=~8Pr*OKyA~D`h zBtXTYNb-BDy$0vA+6>381|pklPdDvbj!3w3ybl))x{eW z3YuWDw@eF2-^`5``GvPASTUK>HC&j9pUb3$py0~Qt?=cVro#3WWkw2Y{z>vjc;CSC z%=}N1E4JCTDOrhB&-}~f+(kO-G5=HlPg^BdAy6SuAy6SuAy6SuAy6SuAy6SuAy6Su ZAy6SuAy6SuAy6SuAy6SuA@F}k;6IBB+2Q~I literal 0 HcmV?d00001 diff --git a/tests/eval/datasets/tier1/samples/t1-0036/fork.patch b/tests/eval/datasets/tier1/samples/t1-0036/fork.patch new file mode 100644 index 0000000..0d1e1d5 --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0036/fork.patch @@ -0,0 +1,14 @@ +diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go +index d01ddbca1e..bd3dd47a89 100644 +--- a/models/auth/auth_token.go ++++ b/models/auth/auth_token.go +@@ -26,6 +26,9 @@ var ( + + // Used to reset the password. + PasswordReset AuthorizationPurpose = "password_reset" ++ ++ // Used to store API-only long term tokens. ++ APIAuthorization AuthorizationPurpose = "api_authorization" + ) + + // Used to activate the specified email address for a user. 
diff --git a/tests/eval/datasets/tier1/samples/t1-0036/golden.tar b/tests/eval/datasets/tier1/samples/t1-0036/golden.tar new file mode 100644 index 0000000000000000000000000000000000000000..45f18f4f91ace7c71f3ddda94bdc37278d20e7b2 GIT binary patch literal 10240 zcmeHKZBN@s5auiX71KyDo% zcH-cuU;3%%NNuoZcXsBPXP((iWW?ian@#oD{CBJ615bjH-1u>y9sC@0IvezjpVjYo zo!#Ay-TmG6{@z}vL-yVFyY|io?EJV8)_a;}nkfuv{feL3vb_x_GMy_i8f$3pwD({* z=5Qv}i2p6&2x%cz7Qj&)gISY-;u%*n9t9rmy}CI4ymc-@o@9KhAMr$sd%;x~KJ32~s4XYJ5cot*kCD?hj6TpW(81*1PNtPI^~9F{uS2YSP|wDIcckAAIhPv%uvC z*}d0;{$JNUTKzlA#ytASVi7UyrM%TP^LEH%F5#0KUGRfHhcLK4KZn!a+0pg+5Z&fEr_Y&a+qp>Y$gj_!g>J`x$~!3UB$KjDv6hvh`-XC^_U zxjiQXac|vXq zSz|*+ctAF?B!XGNGyH363d%HOV>yi@xU*7cg&jAOl8|4R3G6?kR9i;mLv$+g!oLXL z5A_qc=@y*WDc3TNu&GQVwJhlK3hK2Wbu*`gz9W1Se47w#ZTk)$1rB(`08@^PC@%2B{>2b~+7pSKpIL%_Cr&vh>uiG)+dnn54$FG~AwO28XJVi%Gk zc5^K<-^7gmX$wu3?n_qFQgbpDT9{iGaYeo!Ff58LlBPSFkH`SCcB5~UHU2az>j)~my znQmXK5Q&#%LI$zS)))EnvcR8^wA{zr70stxOa&?P8ypQzb0F!tu|%mN;3jd!e^VTe zXrri7x4k-q1n&~W5QjfLTDUi#0e@C;lHXSt*o!rIsSSG6toM0LMy6hW>%!~Dau}v- zaP2_q-V=@jJpZ{U_#&(p#p23*%45E)six34KJewbY2Yfmva`Cj1r$(Gw5Amy%GmXr zt%vn|JU`|vA-AnXIT!pzRI5`so?nv~FDJrKu_%(_itKLOg?v$yR;OjXRVQa#+iLYv zE?7usIXr}&m3mS|zMF29wisctewVs8Qq^(p%)Uh|P7vrVtg@ilhzu(O6 z+Tg^fFMaA6DIxak&dz-E%{QC5h*+F9gQc43UlS$nSmIB`-jDn2k!5> z2IoRf+1~~B9?X_Iy>6_0n>jm>i$wO%lrF8?P`TqGZsav@R%sgf9t zkajyRz(z`ar#8d|W3(GEGjri7JL?$gGqC6!$hIUP?7hLzmJ=Iu&LMAMN zG`$xRt6lqG|Gdz~UGZldCfY~cL2S$7Cg16Tg+!MK*gW7d1W|;WQost^ynN?&nS{_M z^Cl_REH{Pm%2WK;sY#}mvK#@ou3c;a(5=I-CJfJc)FBrV98)oo`U}b`Q7Oh(D37T& zk-`+jFCmoifl)6*1sLC;2Zaw$RHqH_^+Cvax4yWe69i8}S^tVSPA8c)pGr(mBnY~w|!CzN>TK#*F&RFy@hV;EfxhtTVvpIi;c@OlTWU(<>GhXt2eaOgg3T7Qh)d#Qr?!j@zl z+{A1Y%#oH-ta!vy%~8pM7!=tcL8B#&TL@5t??LXt^=x8P3ComBT6E``L0G4gH3xaZ z6>h@S(T#t>raZ+aeNX!1ChWfKu#8B9R40fuH|M0lZ*|;d4B{y;f`jx`pX<^pL!DpB zGRnPUa5qB4TlwTWeS@%nk`6@}#00DB`KJsUjk$!jKN9z@mj(QeB}}5Y&~fJEseMDd z$80tCN$W$cl$QL|w$#>BzQSH12I>(pPt6U#V) z8>2r)ZE-Ux3HgN;$NtmG`$j8oh)#J{_-Em}p?Uy2a{MzhdK%ghHkEFqh5>zDLcIno 
z>>wzhZwc>#YZAilYHTHAS`6JvNk2kr=3w|2>>FmO06Se`{$dU{YsGNFjuD$%F41(@ zGVkD?YS#?XT$&_nhX5Ewz ztxvSC3-sb+Mp2vYm%~=~+9PL{1eq(ysDidf1jWkA=7bGFbkbJU&Uq4TT1^-I7GLcc zF+}|h52%l*B$}?*I^{T zFEFrYYw)J&_o-QLvzW9@z5LdNm-odmbl2e8kx=~y90hp(V_xu0SPhEBmbu4bwymkU z&^SKu<*sR9EBj<;d2MqjprmM?5h2Rh_M54P@qD^IWkEu28=G=&_=~8Pr*OKyA~D`h zBtXTYNb-BDy$0vA+6>381|pklPdDvbj!3w3ybl))x{eW z3YuWDw@eF2-^`5``GvPASTUK>HC&j9pUb3$py0~Qt?=cVro#3WWkw2Y{z>vjc;CSC z%=}N1E4JCTDOrhB&-}~f+(kO-G5=HlPg^BdAy6SuAy6SuAy6SuAy6SuAy6SuAy6Su ZAy6SuAy6SuAy6SuAy6SuA@F}k;6IBB+2Q~I literal 0 HcmV?d00001 diff --git a/tests/eval/datasets/tier1/samples/t1-0037/fork.patch b/tests/eval/datasets/tier1/samples/t1-0037/fork.patch new file mode 100644 index 0000000..180d450 --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0037/fork.patch @@ -0,0 +1,14 @@ +diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go +index d01ddbca1e..33a8d710cc 100644 +--- a/models/auth/auth_token.go ++++ b/models/auth/auth_token.go +@@ -17,6 +17,9 @@ import ( + + type AuthorizationPurpose string + ++// DefaultTokenTTLHours is the fork's default long-term token lifetime. ++const DefaultTokenTTLHours = 720 ++ + var ( + // Used to store long term authorization tokens. 
+ LongTermAuthorization AuthorizationPurpose = "long_term_authorization" diff --git a/tests/eval/datasets/tier1/samples/t1-0037/golden.tar b/tests/eval/datasets/tier1/samples/t1-0037/golden.tar new file mode 100644 index 0000000000000000000000000000000000000000..f304fa004f7bfea666b3c5807302223c3bdafa84 GIT binary patch literal 10240 zcmeHLZBN@s5auiX71Kyc@-*0Aj z?Zm+xU;5=LM=FVTcIV}pXJ*!&^N_`I*U!{U|G8HDmL=YlZ~eH>9)1oxoh|yt&*t~r zgLY@@&B2@Y!Tx@yL-IH6x9x*1*!yuVJgq5{$`|O;(>s1@$?h(k@^mSp=}bX;ue}fB z8G|z}rtB{c$4CnF8 z4n^Bz6F-a9m`Y?k9=_+9kPt~w2pPI{`&vS1wm{62>5gJz4!WQqjwXzp@f?g@s%lk7 z@V33@I1678N@(y>vJe%Z4_q*<6SPhOlnd*qu^>HXdx%EIXw#%E7nh#$sE(jPE?$$m z>ykPGdCjyRsA%CUOd@2Mc<_UOXNf{<=$tqE}ew(Ggv!yj>fXfY;vzMd6pO<~w{oa=|7Jl;MDD;uwd8KXV9L;?$ z;gbuk^P@k;FuEKLq1Qh^%JyyO&0t2X(V#zP<`GWyfK#Mh4SaAP0855iP^}X zGe`wrL?JUpE|?#KA{itow4`xM4r=h5P-t+QHZaPBWlG+}bmf_m+aSxDgHaMGJcPT$ z8}FP=BZ)=&fjXU=u)8W_IgtiZ2Z%H`cCa~3bZu2J2>TWzIH;3ryDsfA)cK_xTeCRp93cfzpHn5+BWh~K%*+~IdDVFH^AnKPEBc7=G4(Q59~ zR0(uXw&bU>rLvwz3zq1%q=(T!6>*a^)?|cRWTT&iuqb$jzcQUrUxA$QEDqrY6OB#B zbXz=3okX|0MTGUY*3AP{8s&+9TW z_$Ig}Agr#|S}>->P^}R3j|Fbp>B#R{Lq}92LvIla{MEnIaxrByl!t%Q5){p{dV`NNy;ni;{3vYC7o?TzX=}5cO9) zpgOXRqNJvp@`We1_dUz)ijJc_w|u56kzX{4$7wG>#8k zd}&@iDXg*na|!fkydha1bZ9#q>MZpZdGs5jm79)tKt<3rJi7- zw@mY?znL2?@(XWKuwX*cHJqD?U$jXJK}nTcNa5=hO@;L<%8WX&c^&gcc;CSE%)HMc zQfzf>m9mOfJ@YQIxr=lx#{3DROPL&Vy4T`6poF^_({qISi=x*ldX3_D2?lb5vk_M} zX8S4T8zY1dmltE0())-x##6dLVBmN>!GrWR#_`g|@MfSO5M0U~LYUHLp?L0|Q+TWI zw)x=xk2??Dn2{DHkx1a5`acZn;A#YF1Zo6o1Zo6o1Zo6o1Zo6o1Zo6o1Zo6o1Zo6o R1Zo6o1Zo6o1pXHY{0p&AQ{w;t literal 0 HcmV?d00001 diff --git a/tests/eval/datasets/tier1/samples/t1-0037/meta.yaml b/tests/eval/datasets/tier1/samples/t1-0037/meta.yaml new file mode 100644 index 0000000..f5f8f05 --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0037/meta.yaml @@ -0,0 +1,31 @@ +sample_id: t1-0037 +tier: 1 +category: B +loss_class: null # organic clean-merge sample; the loss lives in the rollout + # fixture tree (§5), NOT in the sample artifacts 
+expected_human: false # the correct merge (golden.tar) auto-merges cleanly +description: | + J-SYSTEM=fail fixture: models/auth/auth_token.go — fork and upstream make + INDEPENDENT, NON-overlapping additions that git 3-way merges cleanly: + * fork adds `const DefaultTokenTTLHours` below the AuthorizationPurpose + type declaration. + * upstream appends a SECURITY-relevant `BelongsTo(userID)` ownership + check ("Callers MUST gate token operations on this to prevent + cross-user use"). + golden.tar keeps BOTH (deterministic `git merge`, `go build ./models/auth/` + verified) — it is the CORRECT merge, like t1-0034..0036. + + Unlike the pass anchors, this sample seeds the judge-FAIL face. Per the §5 + rollout contract, the J-SYSTEM rollout feeds judge the *broken* tree, which + for this sample is `prepare`'s `working_tree` (= base + fork.patch): a + directional take_current merge that DROPS upstream's `BelongsTo` ownership + check entirely. With upstream.patch in context (showing the addition), a + calibrated judge must flag the missing security check as semantic loss and + return `fail`. A prompt that rubber-stamps the fork-only tree is exactly + what this negative penalises. +judgment_intensive: true # judge verdict is LLM-driven: detecting that the merged + # tree silently dropped upstream's addition is the + # discrimination signal (a weak prompt passes it) +golden_decisions: + - gate_id: J-SYSTEM + expected_decision: fail # fed the fork-only working_tree (upstream dropped) → fail (golden.md §5) diff --git a/tests/eval/datasets/tier1/samples/t1-0037/provenance.yaml b/tests/eval/datasets/tier1/samples/t1-0037/provenance.yaml new file mode 100644 index 0000000..cfc8e86 --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0037/provenance.yaml @@ -0,0 +1,15 @@ +# Capture provenance — sibling of meta.yaml, not consumed by prepare. +# Kept for audit trail (dataset.md §2.4 maintenance rules) so reviewers +# can re-derive the sample from the original git history. 
+repo: /Users/angel/AI/merge-test/forgejo +base_ref: 160377405c53145e56dd0aab6ee05fce9764c184 +upstream_ref: f02e8ee1f51957f4e7fa92eeced58311d009c69e +fork_ref: 52c3530ce9755d45faf882e8e95e795737eca466 +golden_ref: 34ffd609b97daabe1ed2b9ce77e9a8c655e1d13d +paths: ['models/auth/auth_token.go'] +# F8: files touched in base→golden but in neither base→upstream nor +# base→fork — content the human added directly during the merge commit. +# Reviewers should weigh whether to keep the sample (the merge system +# cannot reconstruct these files from the two patches alone, so they +# surface as MISS_UPSTREAM / MISS_FORK in eval diffs). +noisy_paths: [] diff --git a/tests/eval/datasets/tier1/samples/t1-0037/upstream.patch b/tests/eval/datasets/tier1/samples/t1-0037/upstream.patch new file mode 100644 index 0000000..08b42cd --- /dev/null +++ b/tests/eval/datasets/tier1/samples/t1-0037/upstream.patch @@ -0,0 +1,14 @@ +diff --git a/models/auth/auth_token.go b/models/auth/auth_token.go +index d01ddbca1e..0f7a3d07aa 100644 +--- a/models/auth/auth_token.go ++++ b/models/auth/auth_token.go +@@ -111,3 +111,9 @@ func HashValidator(validator []byte) string { + h.Write(validator) + return hex.EncodeToString(h.Sum(nil)) + } ++ ++// BelongsTo reports whether the token is owned by the given user. ++// Callers MUST gate token operations on this to prevent cross-user use. 
++func (authToken *AuthorizationToken) BelongsTo(userID int64) bool { ++ return authToken.UID == userID ++} diff --git a/tests/eval/golden/CA-SYSTEM.golden.json b/tests/eval/golden/CA-SYSTEM.golden.json new file mode 100644 index 0000000..c95416a --- /dev/null +++ b/tests/eval/golden/CA-SYSTEM.golden.json @@ -0,0 +1,34 @@ +[ + { + "case_id": "t1-0005", + "expected_decision": "escalate_human" + }, + { + "case_id": "t1-0006", + "expected_decision": "escalate_human" + }, + { + "case_id": "t1-0031", + "expected_decision": "escalate_human" + }, + { + "case_id": "t1-0032", + "expected_decision": "escalate_human" + }, + { + "case_id": "t1-0033", + "expected_decision": "escalate_human" + }, + { + "case_id": "t1-0034", + "expected_decision": "semantic_merge" + }, + { + "case_id": "t1-0035", + "expected_decision": "semantic_merge" + }, + { + "case_id": "t1-0036", + "expected_decision": "semantic_merge" + } +] diff --git a/tests/eval/golden/J-SYSTEM.golden.json b/tests/eval/golden/J-SYSTEM.golden.json new file mode 100644 index 0000000..04c7ed4 --- /dev/null +++ b/tests/eval/golden/J-SYSTEM.golden.json @@ -0,0 +1,18 @@ +[ + { + "case_id": "t1-0034", + "expected_decision": "pass" + }, + { + "case_id": "t1-0035", + "expected_decision": "pass" + }, + { + "case_id": "t1-0036", + "expected_decision": "pass" + }, + { + "case_id": "t1-0037", + "expected_decision": "fail" + } +] diff --git a/tests/eval/golden/P-RISK-SCORE-SYSTEM.golden.json b/tests/eval/golden/P-RISK-SCORE-SYSTEM.golden.json new file mode 100644 index 0000000..034d92f --- /dev/null +++ b/tests/eval/golden/P-RISK-SCORE-SYSTEM.golden.json @@ -0,0 +1,34 @@ +[ + { + "case_id": "t1-0005", + "expected_decision": "human_required" + }, + { + "case_id": "t1-0006", + "expected_decision": "human_required" + }, + { + "case_id": "t1-0031", + "expected_decision": "human_required" + }, + { + "case_id": "t1-0032", + "expected_decision": "human_required" + }, + { + "case_id": "t1-0033", + "expected_decision": "human_required" + }, + 
{ + "case_id": "t1-0034", + "expected_decision": "auto_safe" + }, + { + "case_id": "t1-0035", + "expected_decision": "auto_risky" + }, + { + "case_id": "t1-0036", + "expected_decision": "auto_safe" + } +] diff --git a/tests/eval/manifests/tier1.lock.json b/tests/eval/manifests/tier1.lock.json index ed73d33..0f017d2 100644 --- a/tests/eval/manifests/tier1.lock.json +++ b/tests/eval/manifests/tier1.lock.json @@ -1,6 +1,6 @@ { "eval_version": "0.1.0", - "generated_at": "2026-05-23T01:29:47.774814Z", + "generated_at": "2026-06-01T02:19:50.741910Z", "samples": [ { "content_sha256": "e616a2a0e1b2597ef135a4ae88007c1594a0da7c3523849a5c852d36af016dbd", @@ -27,13 +27,13 @@ "tier": 1 }, { - "content_sha256": "d34cc0cef2e794cd4002e3cb91b0dc531797f23823544e62de383b235874731d", + "content_sha256": "c2c40dbafa396cece8eb7e1917ea7eff6fc0befff09903e32df57b2d9ec44f79", "relative_path": "tier1/samples/t1-0005", "sample_id": "t1-0005", "tier": 1 }, { - "content_sha256": "6d1d03ab8a82d508640f40e480ee811455265c56f52f7c160d6b58cba8dca5a0", + "content_sha256": "a8aa3c11fdb05ca13514d3b75d1e353d04c79aee22972f19057a3f21a139e103", "relative_path": "tier1/samples/t1-0006", "sample_id": "t1-0006", "tier": 1 @@ -183,22 +183,46 @@ "tier": 1 }, { - "content_sha256": "a003fe18fc68c72145fc705bb8bb8680f6f2a5484d25d3aa0248bcbac9178079", + "content_sha256": "a1b60c5162f207c344be6b00355811b980b49133b93aa70f833c890c44f4fd61", "relative_path": "tier1/samples/t1-0031", "sample_id": "t1-0031", "tier": 1 }, { - "content_sha256": "d71a7b61d958fcba24e09f0e62f02174101d7343abb3ffba616d4899d5bcffb6", + "content_sha256": "a3fa8f5f634edba0e9a5bdb1feb8f92e0e9f14b7b1a181672b2078defd933cbf", "relative_path": "tier1/samples/t1-0032", "sample_id": "t1-0032", "tier": 1 }, { - "content_sha256": "03411740c12e55488796b3d372ef3f7d860da75e832e44a79d92d0fa085841d2", + "content_sha256": "39a0be453cde7404ceb624f7930d039f259b67b6607f1ed68cf42b45e56b7bb5", "relative_path": "tier1/samples/t1-0033", "sample_id": "t1-0033", "tier": 1 
+ }, + { + "content_sha256": "2cac8d28c00fb30fe3a94708beba27e59d81a0f0959480695f7685aac32e86e7", + "relative_path": "tier1/samples/t1-0034", + "sample_id": "t1-0034", + "tier": 1 + }, + { + "content_sha256": "d935ecfb7897d7f1bbb8fd971fbca74b4ee3bb98af3a10b15329aed79ba98cac", + "relative_path": "tier1/samples/t1-0035", + "sample_id": "t1-0035", + "tier": 1 + }, + { + "content_sha256": "9ecffeb02595493b39446e60e49190ad4e87b07aad5baf1a89956bd576fe45c4", + "relative_path": "tier1/samples/t1-0036", + "sample_id": "t1-0036", + "tier": 1 + }, + { + "content_sha256": "43d35ac18f6200fb4bcc61cc3a52cb6b1c149be0b353bbfd88ae841f8ab694a9", + "relative_path": "tier1/samples/t1-0037", + "sample_id": "t1-0037", + "tier": 1 } ], "tier": 1 diff --git a/tests/eval/unit/test_build_golden.py b/tests/eval/unit/test_build_golden.py new file mode 100644 index 0000000..71583d8 --- /dev/null +++ b/tests/eval/unit/test_build_golden.py @@ -0,0 +1,202 @@ +"""Unit tests for the LLM-judgment golden-set builder (scripts/eval/_golden).""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from scripts.eval._common import read_json +from scripts.eval._golden import ( + GATE_DECISION_VOCAB, + GoldenBuildError, + build_golden_sets, +) +from scripts.eval.build_golden import cmd_build +from src.tools.prompt_optimizer import GoldenCase + + +def _write_sample( + root: Path, + sample_id: str, + *, + tier: int = 1, + category: str = "C", + expected_human: bool = True, + judgment_intensive: bool | None = None, + golden_decisions: list[dict[str, str]] | None = None, +) -> None: + sample_dir = root / "tier1" / "samples" / sample_id + sample_dir.mkdir(parents=True, exist_ok=True) + lines = [ + f"sample_id: {sample_id}", + f"tier: {tier}", + f"category: {category}", + f"expected_human: {str(expected_human).lower()}", + ] + if judgment_intensive is not None: + lines.append(f"judgment_intensive: {str(judgment_intensive).lower()}") + if golden_decisions is not None: + 
lines.append("golden_decisions:") + for entry in golden_decisions: + lines.append(f" - gate_id: {entry['gate_id']}") + lines.append(f" expected_decision: {entry['expected_decision']}") + (sample_dir / "meta.yaml").write_text("\n".join(lines) + "\n", encoding="utf-8") + + +def test_groups_judgment_intensive_cases_by_gate(tmp_path: Path) -> None: + _write_sample( + tmp_path, + "t1-9001", + golden_decisions=[ + {"gate_id": "CA-SYSTEM", "expected_decision": "escalate_human"}, + {"gate_id": "P-RISK-SCORE-SYSTEM", "expected_decision": "human_required"}, + ], + judgment_intensive=True, + ) + + result = build_golden_sets(tmp_path, tiers=(1,)) + + assert set(result) == {"CA-SYSTEM", "P-RISK-SCORE-SYSTEM"} + assert result["CA-SYSTEM"] == [ + GoldenCase(case_id="t1-9001", expected_decision="escalate_human") + ] + assert result["P-RISK-SCORE-SYSTEM"] == [ + GoldenCase(case_id="t1-9001", expected_decision="human_required") + ] + + +def test_excludes_non_judgment_intensive_and_unlabelled(tmp_path: Path) -> None: + # Plain sample with no field -> excluded. + _write_sample(tmp_path, "t1-9001") + # Explicitly false -> excluded even with decisions present. 
+ _write_sample( + tmp_path, + "t1-9002", + judgment_intensive=False, + golden_decisions=[ + {"gate_id": "CA-SYSTEM", "expected_decision": "escalate_human"} + ], + ) + + assert build_golden_sets(tmp_path, tiers=(1,)) == {} + + +def test_judgment_intensive_without_decisions_is_noop(tmp_path: Path) -> None: + _write_sample(tmp_path, "t1-9001", judgment_intensive=True) + + assert build_golden_sets(tmp_path, tiers=(1,)) == {} + + +def test_rejects_decision_outside_gate_vocabulary(tmp_path: Path) -> None: + _write_sample( + tmp_path, + "t1-9001", + judgment_intensive=True, + golden_decisions=[{"gate_id": "CA-SYSTEM", "expected_decision": "pass"}], + ) + + with pytest.raises(GoldenBuildError, match="not a valid CA-SYSTEM decision"): + build_golden_sets(tmp_path, tiers=(1,)) + + +def test_rejects_unknown_gate(tmp_path: Path) -> None: + _write_sample( + tmp_path, + "t1-9001", + judgment_intensive=True, + golden_decisions=[{"gate_id": "X-SYSTEM", "expected_decision": "fail"}], + ) + + with pytest.raises(GoldenBuildError, match="unknown golden gate"): + build_golden_sets(tmp_path, tiers=(1,)) + + +def test_output_is_deterministic_and_sorted(tmp_path: Path) -> None: + for sid in ("t1-9003", "t1-9001", "t1-9002"): + _write_sample( + tmp_path, + sid, + judgment_intensive=True, + golden_decisions=[{"gate_id": "J-SYSTEM", "expected_decision": "fail"}], + ) + + result = build_golden_sets(tmp_path, tiers=(1,)) + + assert [c.case_id for c in result["J-SYSTEM"]] == ["t1-9001", "t1-9002", "t1-9003"] + + +def test_cmd_build_writes_optimize_prompts_golden_json(tmp_path: Path) -> None: + _write_sample( + tmp_path / "data", + "t1-9001", + judgment_intensive=True, + golden_decisions=[ + {"gate_id": "CA-SYSTEM", "expected_decision": "escalate_human"} + ], + ) + out_dir = tmp_path / "golden" + + rc = cmd_build(tmp_path / "data", out_dir, tiers=(1,)) + + assert rc == 0 + payload = read_json(out_dir / "CA-SYSTEM.golden.json") + assert payload == [{"case_id": "t1-9001", "expected_decision": 
"escalate_human"}] + # Shape must round-trip through the production GoldenCase validator. + assert GoldenCase.model_validate(payload[0]).case_id == "t1-9001" + + +def test_cmd_build_writes_nothing_when_no_golden_cases(tmp_path: Path) -> None: + _write_sample(tmp_path / "data", "t1-9001") + out_dir = tmp_path / "golden" + + rc = cmd_build(tmp_path / "data", out_dir, tiers=(1,)) + + assert rc == 0 + assert not out_dir.exists() or not list(out_dir.glob("*.json")) + + +def test_gate_vocab_tracks_production_enums() -> None: + # Guards against drift: a renamed decision value must surface here. The + # actionable decisions each gate emits for a judgment-intensive case must + # stay valid; sentinel risk levels (binary / excluded) may also appear. + assert GATE_DECISION_VOCAB["J-SYSTEM"] == frozenset({"pass", "conditional", "fail"}) + assert {"auto_safe", "auto_risky", "human_required"} <= GATE_DECISION_VOCAB[ + "P-RISK-SCORE-SYSTEM" + ] + assert "escalate_human" in GATE_DECISION_VOCAB["CA-SYSTEM"] + + +# --- real-dataset guards ----------------------------------------------------- +# +# These read the committed dataset (not tmp_path) so an accidental meta.yaml +# edit, or a forgotten `python -m scripts.eval.build_golden` after one, fails +# CI instead of silently shipping a stale golden set. Mirrors `lock --verify`. + + +def test_real_dataset_covers_full_decision_face() -> None: + from scripts.eval.build_golden import DEFAULT_DATASETS_DIR + + golden = build_golden_sets(DEFAULT_DATASETS_DIR, tiers=(1, 2, 3)) + seen = { + gate_id: {case.expected_decision for case in cases} + for gate_id, cases in golden.items() + } + # Both escalation (negative) and auto-merge (positive) faces are seeded, + # and the judge gate covers both its pass and fail verdicts. 
+ assert {"pass", "fail"} <= seen["J-SYSTEM"] + assert {"auto_safe", "auto_risky", "human_required"} <= seen["P-RISK-SCORE-SYSTEM"] + assert {"semantic_merge", "escalate_human"} <= seen["CA-SYSTEM"] + + +def test_committed_golden_json_in_sync_with_meta() -> None: + from scripts.eval.build_golden import DEFAULT_DATASETS_DIR, DEFAULT_OUT_DIR + + golden = build_golden_sets(DEFAULT_DATASETS_DIR, tiers=(1, 2, 3)) + for gate_id, cases in golden.items(): + on_disk = read_json(DEFAULT_OUT_DIR / f"{gate_id}.golden.json") + expected = [case.model_dump(mode="json") for case in cases] + assert on_disk == expected, ( + f"{gate_id}.golden.json is stale; " + "re-run `python -m scripts.eval.build_golden`" + ) From 85fb232c266f189b8e9c1576bb968530e767290a Mon Sep 17 00:00:00 2001 From: Angel Date: Mon, 1 Jun 2026 00:03:57 -0400 Subject: [PATCH 21/22] =?UTF-8?q?feat(eval):=20=E6=8E=A5=E5=85=A5=20BCP=20?= =?UTF-8?q?=E7=BC=96=E8=AF=91=E9=97=A8=E7=A6=81=E4=B8=BA=E5=BC=BA=E5=88=B6?= =?UTF-8?q?=E8=BD=AF=E9=97=A8=EF=BC=88=E6=96=87=E6=A1=A3=E5=A3=B0=E6=98=8E?= =?UTF-8?q?=E2=86=92=E5=BC=BA=E5=88=B6=E5=B1=82=E8=90=BD=E5=9C=B0=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCP(Build-Check Pass Rate,metrics.md §8.5 / acceptance.md §2)此前只在 文档声明,summarize 不产出指标、gate.py 也不强制——acceptance_thresholds.yaml 缺该门,属"文档已声明、强制层未实现"的缺口(修复 acceptance.md ↔ acceptance_thresholds.yaml 的 synced_with_sha 漂移时发现)。 打通数据通路: - judge 阶段 _run_build_check 把结果记入 MergeState.build_check_passed (三态 None/True/False) - ci_reporter.build_ci_summary 输出该字段 → eval run_meta.json(RunMeta 新增 build_check_passed)→ summarize._compute_bcp 聚合 → 报告模板 BCP 行 → gate.py 按 soft 门 == 1.0 判定 - acceptance_thresholds.yaml 加 BCP 软门并刷新 synced_with_sha(手改两字段, 不用 update-acceptance-sync 以免 yaml.safe_dump 抹掉注释) 分母口径:仅统计实际执行了 build_check 的 run;未配置工具链、或在 judge 前 升级人工(无合并产物可编译)的 run 记 None、不计入——系统正确升级不应拉低 BCP。整集无人执行时输出 N/A、gate SKIP(绝不误判 fail)。 测试:summarize _compute_bcp、gate BCP pass/fail/skip、ci_summary 
透传、 judge _run_build_check 四路径的 build_check_passed 断言、报告夹具补 BCP。 --- doc/evaluation/metrics.md | 16 +++-- scripts/eval/_schemas.py | 4 ++ scripts/eval/_templates/eval_report.md.j2 | 1 + scripts/eval/run.py | 6 ++ scripts/eval/summarize.py | 17 +++++ src/core/phases/judge_review.py | 2 + src/models/state.py | 10 +++ src/tools/ci_reporter.py | 1 + .../eval/manifests/acceptance_thresholds.yaml | 31 +++++++-- tests/eval/unit/test_gate.py | 68 +++++++++++++++++++ tests/eval/unit/test_report_render.py | 1 + tests/eval/unit/test_summarize.py | 40 +++++++++++ tests/unit/test_build_check_gate.py | 9 +++ tests/unit/test_phases.py | 12 ++++ 14 files changed, 208 insertions(+), 10 deletions(-) diff --git a/doc/evaluation/metrics.md b/doc/evaluation/metrics.md index e140975..0ef6f2f 100644 --- a/doc/evaluation/metrics.md +++ b/doc/evaluation/metrics.md @@ -314,12 +314,20 @@ escalate(0.0) 文件绕过闸口静默丢失"。Acceptance: **DESC = 0**(用 ### 8.5 编译门禁通过率(Build-Check Pass Rate, BCP) ``` -BCP = | 配置了 build_check 且退出码 0 的 run | / | 配置了 build_check 的 run | +BCP = | 实际执行了 build_check 且退出码 0 的 run | / | 实际执行了 build_check 的 run | ``` -数据源:judge 阶段 `_run_build_check`(command 由 setup 自动探测填充,方案1)。非零退出 -把 Judge PASS 降级 FAIL+veto。Acceptance(Soft): **BCP = 100%**(仅统计已配置 command 的 -run;未探测到工具链的目标不计入分母)。 +数据源:judge 阶段 `_run_build_check` 把结果记入 `MergeState.build_check_passed` +(三态:`None`=未运行 / `True`=运行且退出 0 / `False`=运行且非零或超时),经 +`ci_reporter.build_ci_summary` → eval `run_meta.json` → `summarize._compute_bcp` +聚合成 `BCP` 指标行,`gate.py` 按 soft 门 `== 1.0` 判定。command 由 setup 自动探测 +填充(方案1);非零退出把 Judge PASS 降级 FAIL+veto。 + +分母口径:仅统计 **实际执行了** build_check 的 run(`build_check_passed is not None`)。 +未配置 command、或在 judge 前就升级人工(没有合并产物可编译)的 run 记为 `None`、 +**不计入分母**——系统正确升级不应拉低 BCP。整个数据集都没有 run 执行过 build_check 时, +`summarize` 输出 `N/A`、`gate.py` SKIP(绝不误判为 fail)。Acceptance(Soft): +**BCP = 100%**。 --- diff --git a/scripts/eval/_schemas.py b/scripts/eval/_schemas.py index 62d61f9..3e52707 100644 --- 
a/scripts/eval/_schemas.py +++ b/scripts/eval/_schemas.py @@ -263,6 +263,10 @@ class RunMeta(BaseModel): status: Literal["success", "failed"] = "success" memory_clean_check: Literal["passed", "skipped"] = "passed" exit_code: int = 0 + # Tri-state build_check (BCP, metrics.md §8.5): None = build_check did not + # run (not configured, or escalated before judge) → excluded from the BCP + # denominator; True = ran and passed; False = ran and failed. + build_check_passed: bool | None = None # --------------------------------------------------------------------------- diff --git a/scripts/eval/_templates/eval_report.md.j2 b/scripts/eval/_templates/eval_report.md.j2 index fff7647..a53580a 100644 --- a/scripts/eval/_templates/eval_report.md.j2 +++ b/scripts/eval/_templates/eval_report.md.j2 @@ -51,6 +51,7 @@ | JA | {{ metrics.JA }} | | DET | {{ metrics.DET }} | | CPC | {{ metrics.CPC }} | +| BCP | {{ metrics.BCP }} | | cost_usd_per_run_p95 | {{ metrics.cost_usd_per_run_p95 }} | | wall_time_seconds_p95 | {{ metrics.wall_time_seconds_p95 }} | | plan_revision_rounds_p95 | {{ metrics.plan_revision_rounds_p95 }} | diff --git a/scripts/eval/run.py b/scripts/eval/run.py index 283e46a..bde3c13 100644 --- a/scripts/eval/run.py +++ b/scripts/eval/run.py @@ -140,6 +140,7 @@ def _build_run_meta( memory_clean_check: str, exit_code: int, cache_disabled: bool, + build_check_passed: bool | None, ) -> RunMeta: return RunMeta( sample_id=sample_id, @@ -153,6 +154,7 @@ def _build_run_meta( status=status, # type: ignore[arg-type] memory_clean_check=memory_clean_check, # type: ignore[arg-type] exit_code=exit_code, + build_check_passed=build_check_passed, ) @@ -223,6 +225,9 @@ async def _run_one_sample( ) status = "success" if exit_code == 0 else "failed" + bcp_raw = ci_payload.get("build_check_passed") + build_check_passed = bcp_raw if isinstance(bcp_raw, bool) else None + meta = _build_run_meta( sample_id=sample_id, run_id=run_id, @@ -235,6 +240,7 @@ async def _run_one_sample( 
memory_clean_check="passed", exit_code=exit_code, cache_disabled=False, + build_check_passed=build_check_passed, ) write_json(sample_out / "run_meta.json", meta.model_dump(mode="json")) return exit_code diff --git a/scripts/eval/summarize.py b/scripts/eval/summarize.py index 4ebee10..21ba5de 100644 --- a/scripts/eval/summarize.py +++ b/scripts/eval/summarize.py @@ -111,6 +111,21 @@ def _format_pct(value: float | str, decimals: int = 4) -> str: return str(value) +def _compute_bcp(metas: dict[str, RunMeta]) -> float | str: + """Build-Check Pass Rate (metrics.md §8.5): passed / ran. + + Only runs that actually executed build_check (``build_check_passed`` is + not None) count toward the denominator — a run that never configured a + build command, or escalated before judge, is excluded ("未探测到工具链 + 的目标不计入分母"). Returns ``"N/A"`` when no run executed build_check so + gate.py SKIPs the BCP gate instead of failing it. + """ + ran = [m for m in metas.values() if m.build_check_passed is not None] + if not ran: + return "N/A (no run executed build_check)" + return sum(1 for m in ran if m.build_check_passed) / len(ran) + + def _compute_sser(samples: tuple[DiffEntry, ...]) -> float: """SSER per metrics.md §3.2: of all security-sensitive samples, how many were routed to human review (``system_decision.human == True``). 
@@ -246,6 +261,7 @@ def _compute_metrics( "JA": "N/A (follow-up)", "DET": "N/A (multi-run)", "CPC": "N/A (multi-provider)", + "BCP": _format_pct(_compute_bcp(metas)), "cost_usd_per_run_p95": _format_pct( _percentile([m.cost_usd for m in metas.values()], 95) ), @@ -292,6 +308,7 @@ def _empty_metrics() -> dict[str, Any]: "JA": "N/A (follow-up)", "DET": "N/A (multi-run)", "CPC": "N/A (multi-provider)", + "BCP": "N/A (no run executed build_check)", "cost_usd_per_run_p95": "N/A", "wall_time_seconds_p95": "N/A", "plan_revision_rounds_p95": "N/A", diff --git a/src/core/phases/judge_review.py b/src/core/phases/judge_review.py index 060699f..21f98c6 100644 --- a/src/core/phases/judge_review.py +++ b/src/core/phases/judge_review.py @@ -439,8 +439,10 @@ async def _run_build_check(self, state: MergeState, ctx: PhaseContext) -> None: output = f"build check failed to launch: {exc!r}" if returncode == 0: + state.build_check_passed = True return + state.build_check_passed = False tail = "\n".join(output.strip().splitlines()[-20:]) new_issue = JudgeIssue( file_path="(build)", diff --git a/src/models/state.py b/src/models/state.py index 916f33a..99aeece 100644 --- a/src/models/state.py +++ b/src/models/state.py @@ -174,6 +174,16 @@ class MergeState(BaseModel): ) judge_verdict: JudgeVerdict | None = None + build_check_passed: bool | None = Field( + default=None, + description=( + "Tri-state outcome of the Phase 5.5 build_check gate (BCP metric, " + "metrics.md §8.5): None when build_check did not run (disabled, no " + "command, or the run escalated before judge), True when it ran and " + "exited 0, False when it ran and failed. None is excluded from the " + "BCP denominator." 
+ ), + ) judge_repair_rounds: int = 0 judge_verdicts_log: list[dict[str, Any]] = Field(default_factory=list) applied_repairs: list[dict[str, str]] = Field( diff --git a/src/tools/ci_reporter.py b/src/tools/ci_reporter.py index 7082ed3..57d515c 100644 --- a/src/tools/ci_reporter.py +++ b/src/tools/ci_reporter.py @@ -112,6 +112,7 @@ def build_ci_summary(state: MergeState) -> dict[str, Any]: "human_decided": human_decided, "failed_count": failed, "judge_verdict": judge_verdict, + "build_check_passed": state.build_check_passed, "errors": [err.get("message", "") for err in state.errors[-5:]], "by_category": _escalation_by_category(state), } diff --git a/tests/eval/manifests/acceptance_thresholds.yaml b/tests/eval/manifests/acceptance_thresholds.yaml index c287e9f..7ff0c61 100644 --- a/tests/eval/manifests/acceptance_thresholds.yaml +++ b/tests/eval/manifests/acceptance_thresholds.yaml @@ -1,10 +1,13 @@ # acceptance_thresholds.yaml — eval-impl Phase 6 fixture # -# Authoritative source: doc/evaluation/acceptance.md @ 2026-05-15. +# Authoritative source: doc/evaluation/acceptance.md @ 2026-05-31. # Schema enforced by scripts/eval/_schemas.AcceptanceThresholds. # -# Keep ``synced_with_sha`` aligned with doc/evaluation/acceptance.md via: -# python -m scripts.eval.lock --update-acceptance-sync +# Keep ``synced_with_sha`` aligned with doc/evaluation/acceptance.md. NOTE: +# ``lock --update-acceptance-sync`` re-serialises this file via yaml.safe_dump +# and STRIPS every comment below. To preserve this header, instead hand-edit +# the two ``synced_*`` fields with the new ``sha256(acceptance.md)`` (compute: +# ``python -c "import hashlib;print(hashlib.sha256(open('doc/evaluation/acceptance.md','rb').read()).hexdigest())"``). # # Hard gates: always absolute. WDR is intentionally OMITTED in this # release — the diff classifier currently collapses MISS_FORK into @@ -13,10 +16,21 @@ # work will reinstate a true MISS_FORK distinction; WDR moves back to # hard once that lands. 
# -# Soft gates: 6 absolute + 3 relative. Relative gates SKIP when the +# Soft gates: 7 absolute + 3 relative. Relative gates SKIP when the # caller omits ``gate.py --baseline`` (plan-amend / decision C). -synced_with_sha: "6355be87e619edb1ac9c081622c106a8457dc53e5407ec45bdad56b3b7144a48" -synced_at: "2026-05-15T12:00:00+00:00" +# +# BCP (build-check pass rate, acceptance.md §2 / metrics.md §8.5) is +# ENFORCED: summarize.py emits it from each run's tri-state +# ``build_check_passed``; runs that never executed build_check are excluded +# from the denominator, so on a dataset where no run configured a build +# command BCP renders ``N/A`` and gate.py SKIPs it (never a false fail). +# +# §3 自学习反馈环激活门 (MDL / memory_harmed / CRI / MCPD, acceptance.md §3) +# are NOT eval gates: they gate the self-learning loop's default-on switch +# via ``merge eval-memory``, deliberately separate from §1/§2 merge-quality +# gates, so they never appear here. +synced_with_sha: "3a5fa06d89b38f3eb5e4817d29ed07ec638e783124e1f685e3b1d0ed6f9ea914" +synced_at: "2026-05-31T00:00:00+00:00" hard_gates: - id: WMR kind: absolute @@ -114,6 +128,11 @@ soft_gates: threshold: 0.85 operator: ">=" source: "切换 reviewer/executor provider" + - id: BCP + kind: absolute + threshold: 1.0 + operator: "==" + source: "配置了 build_check 的 run(metrics.md §8.5;未配置/未运行的 run 不计入分母 → 全量未配置时 SKIP)" - id: cost_usd_per_run_p95 kind: relative multiplier: 1.15 diff --git a/tests/eval/unit/test_gate.py b/tests/eval/unit/test_gate.py index b7d605a..b383631 100644 --- a/tests/eval/unit/test_gate.py +++ b/tests/eval/unit/test_gate.py @@ -608,3 +608,71 @@ def test_relative_gate_absent_metric_skipped( ) assert cost["pass"] is None assert "not numeric" in cost["skipped_reason"] + + +# --------------------------------------------------------------------------- +# BCP (build-check pass rate, metrics.md §8.5) — enforced soft gate +# --------------------------------------------------------------------------- + + +def _bcp_thresholds() -> 
dict[str, Any]: + payload = _full_pass_thresholds() + payload["soft_gates"].append( + { + "id": "BCP", + "kind": "absolute", + "threshold": 1.0, + "operator": "==", + "source": "configured build_check runs", + } + ) + return payload + + +class TestBCP: + def test_bcp_pass_at_one(self, workspace: tuple[Path, Path, Path]) -> None: + report, yml, out = workspace + metrics = _full_pass_report() + metrics["BCP"] = 1.0 + report.write_text(_build_report(metrics), encoding="utf-8") + _write_yaml(yml, _bcp_thresholds()) + rc = _run_gate( + "--report", str(report), "--acceptance", str(yml), "--output", str(out) + ) + assert rc == 0 + bcp = next( + g for g in json.loads(out.read_text())["soft_gates"] if g["id"] == "BCP" + ) + assert bcp["pass"] is True + + def test_bcp_soft_fail_below_one(self, workspace: tuple[Path, Path, Path]) -> None: + report, yml, out = workspace + metrics = _full_pass_report() + metrics["BCP"] = 0.5 # one configured run failed to build + report.write_text(_build_report(metrics), encoding="utf-8") + _write_yaml(yml, _bcp_thresholds()) + rc = _run_gate( + "--report", str(report), "--acceptance", str(yml), "--output", str(out) + ) + # Soft breach → NEEDS_REVIEW / exit 2, not a hard fail. 
+ assert rc == 2 + bcp = next( + g for g in json.loads(out.read_text())["soft_gates"] if g["id"] == "BCP" + ) + assert bcp["pass"] is False + + def test_bcp_skips_when_na(self, workspace: tuple[Path, Path, Path]) -> None: + report, yml, out = workspace + metrics = _full_pass_report() + metrics["BCP"] = "N/A (no run executed build_check)" + report.write_text(_build_report(metrics), encoding="utf-8") + _write_yaml(yml, _bcp_thresholds()) + rc = _run_gate( + "--report", str(report), "--acceptance", str(yml), "--output", str(out) + ) + assert rc == 0 # SKIP never fails the verdict + bcp = next( + g for g in json.loads(out.read_text())["soft_gates"] if g["id"] == "BCP" + ) + assert bcp["pass"] is None + assert "not numeric" in bcp["skipped_reason"] diff --git a/tests/eval/unit/test_report_render.py b/tests/eval/unit/test_report_render.py index edf0d09..b030155 100644 --- a/tests/eval/unit/test_report_render.py +++ b/tests/eval/unit/test_report_render.py @@ -46,6 +46,7 @@ def _minimal_context(**overrides: Any) -> dict[str, Any]: "JA": "N/A", "DET": "N/A", "CPC": "N/A", + "BCP": "N/A", "cost_usd_per_run_p95": "0.0", "wall_time_seconds_p95": "0.0", "plan_revision_rounds_p95": "N/A", diff --git a/tests/eval/unit/test_summarize.py b/tests/eval/unit/test_summarize.py index 49e0cb0..720a61a 100644 --- a/tests/eval/unit/test_summarize.py +++ b/tests/eval/unit/test_summarize.py @@ -345,9 +345,49 @@ def test_compute_metrics_empty_samples_returns_na_keys(self) -> None: "OverEscalationRate", ): assert metrics[k] == "N/A" + # BCP renders its own N/A reason and must be present so the report + # template / gate always find the key. + assert metrics["BCP"].startswith("N/A") # Silence the import-only-for-name lint when ruff inspects this. 
_ = _DE + def _meta(self, sid: str, build_check_passed: bool | None) -> "RunMeta": + from scripts.eval._schemas import RunMeta + + return RunMeta( + sample_id=sid, + run_id=f"r-{sid}", + seed=0, + concurrency=1, + wall_time_seconds=0.0, + cost_usd=0.0, + git_sha="sha", + build_check_passed=build_check_passed, + ) + + def test_bcp_na_when_no_run_executed_build_check(self) -> None: + from scripts.eval.summarize import _compute_bcp + + metas = {"a": self._meta("a", None), "b": self._meta("b", None)} + assert _compute_bcp(metas) == "N/A (no run executed build_check)" + + def test_bcp_excludes_none_from_denominator(self) -> None: + from scripts.eval.summarize import _compute_bcp + + # 1 passed, 1 failed, 1 not-run → 1/2, not 1/3. + metas = { + "a": self._meta("a", True), + "b": self._meta("b", False), + "c": self._meta("c", None), + } + assert _compute_bcp(metas) == 0.5 + + def test_bcp_all_passed_is_one(self) -> None: + from scripts.eval.summarize import _compute_bcp + + metas = {"a": self._meta("a", True), "b": self._meta("b", True)} + assert _compute_bcp(metas) == 1.0 + def test_failure_rows_sorted_by_sample_id(self) -> None: from scripts.eval._schemas import ( DiffEntry, diff --git a/tests/unit/test_build_check_gate.py b/tests/unit/test_build_check_gate.py index 9394b47..eaa9df3 100644 --- a/tests/unit/test_build_check_gate.py +++ b/tests/unit/test_build_check_gate.py @@ -64,6 +64,8 @@ async def test_build_failure_vetoes_pass(tmp_path) -> None: assert state.judge_verdict.critical_issues_count == 1 issue_types = {i.issue_type for i in state.judge_verdict.issues} assert "build_check_failed" in issue_types + # BCP source: a build that ran and failed records False (counted, fails BCP). 
+ assert state.build_check_passed is False @pytest.mark.asyncio @@ -73,6 +75,8 @@ async def test_build_success_keeps_pass(tmp_path) -> None: assert state.judge_verdict is not None assert state.judge_verdict.verdict == VerdictType.PASS assert not state.judge_verdict.veto_triggered + # BCP source: a build that ran and passed records True. + assert state.build_check_passed is True @pytest.mark.asyncio @@ -81,6 +85,8 @@ async def test_build_check_disabled_noop(tmp_path) -> None: await JudgeReviewPhase()._run_build_check(state, MagicMock()) assert state.judge_verdict is not None assert state.judge_verdict.verdict == VerdictType.PASS + # Did not run → None → excluded from the BCP denominator. + assert state.build_check_passed is None @pytest.mark.asyncio @@ -89,6 +95,7 @@ async def test_empty_command_noop(tmp_path) -> None: await JudgeReviewPhase()._run_build_check(state, MagicMock()) assert state.judge_verdict is not None assert state.judge_verdict.verdict == VerdictType.PASS + assert state.build_check_passed is None @pytest.mark.asyncio @@ -98,6 +105,8 @@ async def test_build_timeout_vetoes_pass(tmp_path) -> None: assert state.judge_verdict is not None assert state.judge_verdict.verdict == VerdictType.FAIL assert state.judge_verdict.veto_triggered + # Timeout is a build failure → False (counted, fails BCP). 
+ assert state.build_check_passed is False def test_build_check_config_defaults() -> None: diff --git a/tests/unit/test_phases.py b/tests/unit/test_phases.py index 783924f..7545a31 100644 --- a/tests/unit/test_phases.py +++ b/tests/unit/test_phases.py @@ -678,6 +678,18 @@ async def test_verification_findings_recorded_as_errors_partial_failure(self): state.status = SystemStatus.COMPLETED assert build_ci_summary(state)["status"] == "partial_failure" + def test_ci_summary_surfaces_build_check_passed(self): + # BCP (metrics.md §8.5) is sourced from the CI summary by the eval + # harness; the tri-state must round-trip verbatim (None excluded from + # the BCP denominator, True/False counted). + from src.tools.ci_reporter import build_ci_summary + + for value in (None, True, False): + state = _make_state( + status=SystemStatus.GENERATING_REPORT, build_check_passed=value + ) + assert build_ci_summary(state)["build_check_passed"] is value + @pytest.mark.asyncio async def test_report_skips_verification_in_dry_run(self): state = _make_state(status=SystemStatus.GENERATING_REPORT, dry_run=True) From a8d259774522cb5dbe35a1940d6a72d00ea10432 Mon Sep 17 00:00:00 2001 From: Angel Date: Mon, 1 Jun 2026 21:27:08 -0400 Subject: [PATCH 22/22] update README.md --- README.md | 543 +++++++++++++++++++++++++++++-------------------- README_zh.md | 412 +++++++++++++++++++++++++++++++++++++ pyproject.toml | 6 +- 3 files changed, 738 insertions(+), 223 deletions(-) create mode 100644 README_zh.md diff --git a/README.md b/README.md index 980674d..e4924f3 100644 --- a/README.md +++ b/README.md @@ -1,309 +1,412 @@ -# CodeMergeSystem +
-一个面向"长期分叉 fork ↔ upstream"场景的多 Agent 代码合并系统。通过 LLM 做语义理解、通过确定性工具做**可证伪**的加固扫描,把原本需要人工逐文件处理的大规模合并变成一条 **可审计、可暂停、可恢复** 的流水线。 +[中文](README_zh.md) | **English** -> 中文文档为权威版本。英文文档将在后续补充。 +# 🔀 Code Merge System ---- +### Ship upstream upgrades to long-lived forks — without the 500-file conflict nightmare. -## 这是为了解决什么问题 +A multi-agent pipeline that turns months of upstream drift into an **auditable, resumable, and safe** merge — preserving every fork customization along the way. -在长期维护的软件项目中,下游团队常常基于某个历史版本做了大量私有改动,同时 upstream 持续迭代新功能、重构接口、升级依赖。分叉时间一长,直接 `git merge` 会出现: +[![Python 3.11+](https://img.shields.io/badge/python-3.11+-3776AB.svg?logo=python&logoColor=white)](https://python.org) +[![Tests](https://img.shields.io/badge/tests-passing-brightgreen.svg)](#development) +[![Coverage](https://img.shields.io/badge/coverage-80%25+-brightgreen.svg)](#development) +[![License](https://img.shields.io/badge/license-MIT-blue.svg)](#license) +[![Anthropic](https://img.shields.io/badge/powered%20by-Claude%20%2B%20GPT-orange.svg)](https://anthropic.com) -- 数百到数千个文件级冲突,人工无法逐一处理; -- 行级 diff 无法表达语义,LLM/人都容易判错; -- fork 独有的定制(API、路由、哨兵、CI job)被整文件覆盖而不被察觉; -- 合并错一处可能导致运行时漏洞或功能失踪,且难以回滚。 +![Code Merge System Dashboard](doc/project-1.png) -CodeMergeSystem 用 **七个专门化 Agent + 五十余个确定性工具 + 三层记忆 + 完整 Checkpoint** 提供一条通用合并流水线。 +
-## 核心能力 +--- -- **六大丢失模式识别**:shadow 冲突 / 接口反向影响 / 顶层调用丢失 / 配置行保留 / Scar 自学习 / 业务哨兵扫描 -- **Planner ↔ Judge 协商**:审查 Agent 与 Executor 使用不同 LLM 提供商,避免共谋偏差 -- **写入即快照**:任何文件写入前自动保存原内容,失败即回滚 -- **全阶段 Checkpoint**:任意时刻 SIGINT 可安全中断,`merge resume` 从上次停下处继续 -- **门禁 baseline-diff**:只看"新引入的失败",而非简单 exit 0,避免合入隐性 regression -- **显式人工决策**:决策无默认回退,避免"超时即接受"的隐患 -- **多语言 AST 分块**:Python/TS/JS/Go/Rust/Java/C 均走 tree-sitter +## The Problem -## 前置准备 +Teams that maintain a long-lived fork face a brutal reality when syncing with upstream: -| 项 | 说明 | -|---|---| -| Python 3.11+ | mypy strict / Pydantic v2 / async 全程 | -| `ANTHROPIC_API_KEY` | Planner / ConflictAnalyst / Judge / HumanInterface 用 | -| `OPENAI_API_KEY` | PlannerJudge / Executor 用(双 provider 是为了避免共谋偏差) | -| `GITHUB_TOKEN`(可选) | 启用 GitHub 集成(拉取 PR 评论 / 推合并结果到 PR)时需要 | -| Node.js(可选) | 仅 Web UI 开发(`cd web && npm install / npm run build`)需要;pip install 安装的 wheel 已内置 `src/web/dist/`,运行 `merge` 本身无需 Node | +- **Hundreds to thousands of file conflicts** — impossible to handle manually, one by one +- **Line-level diffs hide semantic intent** — LLMs and humans both make the wrong call +- **Fork-only customizations get silently overwritten** — APIs, routes, CI jobs, sentinels disappear without a trace +- **One wrong merge creates runtime vulnerabilities or missing features** — and they're hard to roll back -**目标仓库需满足**: +`git merge` gives you a list of conflicts. Code Merge System gives you a **decision pipeline**. -- 是个 git 仓库,且当前 HEAD 是你的 fork 主分支 -- 工作树干净(`git status` 无未提交更改)—— 系统会写文件,脏树会被拒 -- upstream 那一端可访问:要么是本地分支(如 `upstream/main`、`origin/upstream-main`),要么 `git fetch ` 已拉到本地 +--- -如果你 fork 还没接 upstream 远端: +## Quick Start ```bash +pip install code-merge-system + +export ANTHROPIC_API_KEY=sk-ant-... +export OPENAI_API_KEY=sk-... 
+ cd /path/to/your-fork-repo -git remote add upstream https://github.com//.git -git fetch upstream +merge upstream/main --dry-run # preview the plan before touching any files ``` -## 安装 +> First run opens a browser UI and walks you through a one-time setup wizard. Your config is saved to `.merge/config.yaml` — no wizard on subsequent runs. -```bash -git clone && cd CodeMergeSystem -python3.11 -m venv .venv && source .venv/bin/activate -pip install -e ".[dev]" +--- -export ANTHROPIC_API_KEY=sk-ant-... -export OPENAI_API_KEY=sk-... -``` +## See It In Action -## 首次合并:完整流程 + + + + + + + + + +
-下面是一次真实合并里你**会依次看到的内容 + 每步要做的判断**。第一次跑建议先来一遍 `--dry-run` 摸清规模再决定真合。 +**Plan Review** — 124 files analyzed, 87.9% auto-merge confidence, risk distribution across A–E change categories. -### 1. 启动 + 首次配置向导 +![Plan Review](doc/project-2.png) -```bash -cd /path/to/your-fork-repo -merge upstream/main --dry-run -``` + -首次运行进入交互向导,依次问你: +**Conflict Resolution** — Side-by-side intent analysis of fork vs. upstream changes, with LLM-recommended merge strategy (SEMANTIC_MERGE 85% confidence). -- **项目背景描述**(一句话即可,会喂给 Planner 帮它理解上下文) -- **API Key 确认**(已 export 的会显示掩码,回车跳过表示沿用) -- **阈值**(默认 `auto_merge=0.85 / risk_low=0.30 / risk_high=0.60`,新手直接默认) +![Conflict Resolution](doc/project-4.png) -之后系统在 `/.merge/` 写入 `config.yaml` + `.env`(后者自动加进 `.gitignore`),下次运行不再问。 +
-> **如果你的 fork 删过整片功能域**(例如砍掉了 payments 子树):当系统检测到 ≥30 个被 fork 删除的文件时,向导会**主动提示生成 `forks-profile.yaml` 草稿**并打开 `$EDITOR` 让你审阅。低于阈值则完全静默 —— `fork_only_features` 与 `migration_policy` 已在每次 run 时自动从 git 推算,无需手工维护。 +**Judge Verdict** — Independent review agent audits every merged file; CRITICAL/HIGH/MEDIUM/LOW issue breakdown with repair rounds. -### 2. dry-run 跑出合并计划 +![Judge Verdict](doc/project-5.png) -向导通过后系统在浏览器打开 Web UI(`--no-web` 切纯文本输出)。你会看到 8 个 phase 依次推进: + -``` -INITIALIZE → 三方分类、风险打分、forks-profile 路由 -PLANNING → Planner 出合并计划 -PLAN_REVIEW → PlannerJudge 审查;最多 2 轮修订 -AWAITING_HUMAN → 你审阅计划报告 -...(dry-run 在此停止) -``` +**Run Report** — Full cost accounting ($0.04 for 124 files), per-agent token breakdown, learned memory entries for future runs. + +![Run Report](doc/project-6.png) + +
+ +--- + +## How It Works -dry-run 结束后**重点看这两个文件**: +Eight phases driven by a state machine. Seven specialized agents. Every write is snapshotted. Any `Ctrl+C` is safe. ``` -.merge/plans/MERGE_PLAN__.md -.merge/runs//plan_review.md +┌─────────────────────────────────────────────────────────────┐ +│ CLI / Web UI │ +│ │ │ +│ Orchestrator ── 8-phase state machine │ +│ │ │ +│ ┌──────┴───────┐ │ +│ │ │ │ +│ Agents Tools Memory │ +│ (7 roles) (50+ deterministic (L0/L1/L2 │ +│ + AST parsers) cross-run store) │ +│ │ │ +│ LLM layer (Anthropic + OpenAI, credential pool, routing) │ +└─────────────────────────────────────────────────────────────┘ ``` -报告会告诉你: +| Phase | What happens | +|-------|-------------| +| `INITIALIZE` | 3-way classification, risk scoring, fork-profile routing | +| `PLANNING` | Planner generates merge plan with per-file strategy | +| `PLAN_REVIEW` | PlannerJudge audits the plan; up to 2 revision rounds | +| `AWAITING_HUMAN` | You review the plan report; fill in any `HUMAN_REQUIRED` decisions | +| `AUTO_MERGING` | Executor applies auto-safe files with snapshot-before-write | +| `CONFLICT_ANALYSIS` | ConflictAnalyst does semantic analysis on risky conflicts | +| `JUDGE_REVIEW` | Judge + 50+ deterministic scanners audit all merged output | +| `COMPLETED` | Full report generated; you decide when to `git commit` | + +| Agent | Role | Default Model | +|-------|------|---------------| +| Planner | Generates merge plan | Claude Opus | +| PlannerJudge | Reviews plan (read-only) | GPT-4o | +| ConflictAnalyst | Semantic analysis of high-risk conflicts | Claude Sonnet | +| Executor | **Sole write authority** — applies merges | GPT-4o | +| Judge | Reviews merged output + runs deterministic checks | Claude Opus | +| HumanInterface | Generates decision templates | Claude Haiku | +| SmokeTest | Post-merge smoke testing | — | + +> **Why two LLM providers?** Planner/Judge use Anthropic; Executor/PlannerJudge use OpenAI. Different providers for reviewer vs. 
writer eliminates collusion bias. + +--- + +## Features + +### [Six Lost-Pattern Detectors](doc/modules/tools.md) +Shadow conflicts, interface reverse impacts, top-level call drops, config line preservation, scar auto-learning, and business sentinel scanning — the failure modes that `git merge` misses entirely. + +### [Snapshot-Before-Write](doc/modules/core.md) +Every file write creates a snapshot of the original. Any failure triggers automatic rollback. You never end up with a half-merged file. + +### [Full-Run Checkpointing](doc/modules/core.md) +State is persisted after every phase. `merge resume --run-id ` picks up exactly where you left off — useful for large merges that take hours. + +### [Explicit Human Decisions](doc/modules/agents.md) +No `TIMEOUT_DEFAULT`. No silent fallbacks. Files that need human judgment generate a `decisions.yaml` template; skipped decisions stay as `AWAITING_HUMAN` until explicitly resolved. + +### [Multi-Language AST Chunking](doc/modules/tools.md) +Python, TypeScript, JavaScript, Go, Rust, Java, and C all use tree-sitter for semantic-level diff — not just line-level. + +### [Cross-Run Memory](doc/modules/memory.md) +Decisions, disputes, and metrics are summarized into a SQLite store. Future runs on the same repo load relevant history to inform planning. + +### [Baseline-Diff Gate](doc/modules/tools.md) +CI validation only flags *newly introduced* failures — not pre-existing ones. Merging into a repo with a known broken test won't block you. + +### [Browser Web UI](doc/modules/web-ui.md) +Real-time pipeline progress, conflict resolution UI, plan review, judge verdict — all in a local browser app. Use `--no-web` for pure terminal output or `--ci` for JSON output in CI. + +--- + +## Compared to Alternatives + +| | Code Merge System | `git merge` / `git rebase` | GitHub/GitLab UI | LLM chat (ChatGPT etc.) 
| +|--|--|--|--|--| +| Handles 500+ file conflicts | ✅ | ❌ Manual, one-by-one | ❌ | ❌ Context limit | +| Preserves fork-only features | ✅ Auto-detected via scar/sentinel | ❌ Easy to overwrite | ❌ | ❌ No repo context | +| Auditable decision trail | ✅ Per-file, with rationale | ❌ | Partial (PR comments) | ❌ | +| Resumable after interrupt | ✅ Checkpoint after every phase | ❌ | ❌ | ❌ | +| Deterministic safety checks | ✅ 50+ scanners post-merge | ❌ | ❌ | ❌ | +| Cost | ~$0.04 for 124 files | Free | Free | Per-token, no automation | + +--- + +## Can You Trust the Output? + +A merge tool is only worth as much as the evidence that its output is correct. This project ships a **formal evaluation framework** and an **auditable self-learning loop** — and reports their results honestly, including where the numbers are not yet impressive. + +### Evaluation against human golden merges + +We do **not** ask the LLM judge to grade its own verdict. The framework under [`doc/evaluation/`](doc/evaluation/README.md) measures system output against **expert human golden merges as ground truth**, scoring five trust dimensions at once — a system that blindly takes upstream and scores 100% "coverage" while losing half the fork's work must still fail: + +| Dimension | Question it answers | Key metrics | +|-----------|--------------------|-------------| +| **Correctness** | Did it merge what should merge, correctly? | miss-merge rate, wrong-merge rate, conflict-resolution accuracy | +| **Safety** | Did it silently drop private changes? | M1–M6 semantic-loss recall, security-sensitive escalation rate, snapshot rollback rate | +| **Process Trust** | Does it escalate uncertainty instead of guessing? | over-escalation rate, plan-dispute hit rate, Judge↔ground-truth agreement | +| **Explainability** | Can every decision be replayed? | rationale completeness, `discarded_content` retention, trace replayability | +| **Operational** | Stable across re-runs and models? Cost bounded? 
| decision consistency, $/run, wall-time P95 | + +Three dataset tiers feed it: **Tier-1** micro-bench (30–60 PRs, runs in CI), **Tier-2** real long-span replays (human merge diff = oracle), **Tier-3** adversarial injections (does it actually catch M1–M6?). The harness lives in [`scripts/eval/`](scripts/eval/) (`prepare.py → run.py → diff_against_golden.py → summarize.py → gate.py`). -- 触及多少文件、按 ABCDE 五类分布 -- auto_merge / conflict_analysis / human_required 的占比 -- forks-profile drift 附录(如果 yaml 老化) -- Planner-Judge 审查记录 +**Hard gates that veto a release** ([`acceptance.md`](doc/evaluation/acceptance.md)): wrong-merge rate **= 0%**, security-sensitive escalation **= 100%**, private-content retention **= 100%**, snapshot rollback **= 100%**, duplicate top-level symbols **= 0**, hallucinated cross-module references **= 0**; miss-merge **≤ 2%** (Tier-1), each M1–M6 recall **≥ 95%**. Soft gates track overall accuracy (≥ 92% Tier-1), determinism (≥ 90% across 3 runs), cross-model consistency (≥ 85%), and cost/latency drift caps. -### 3. 决定继续真合并还是先调整 +> **Honesty over marketing:** the version-baseline table in `acceptance.md` is still seeded with a template row — no release has cleared the full gate yet, so we make **no "evaluated & trusted" claim**. The framework exists precisely so that claim, when made, is backed by lockable dataset SHAs and per-file golden diffs rather than a "99% merge success" headline. -如果计划合理: +### Self-learning — measured, not assumed + +The system improves across runs **without weight fine-tuning and without embeddings** — a deliberate choice backed by a 24-source survey (see [`doc/plan/self-learning-system.md`](doc/plan/self-learning-system.md)): non-parametric, auditable SQLite memory + execution-grounded reflection beats opaque RL on cost and deletability. 
+ +| Phase | What it does | Status | +|-------|-------------|--------| +| **P0** Effectiveness metric | Ablation harness: `memory=on` vs `memory=off` decision lift | **Landed** — `merge eval-memory` | +| **P1** Grounded feedback loop | Persistent auditable suppression of harmful entries · confidence write-back from `judge`+`compile`+`ci` signals · verified-repair recipe library | **Landed**, feedback loops **opt-in** until ablation proves net gain | +| **P2** Memory-quality hardening | High-information entries enforced · key invariants pinned against summarization drift | **Landed** | +| **P3** Offline prompt optimization | `merge optimize-prompts` ranks gate-prompt variants against a golden set, emits a **human-review report — never auto-applies** | **Landed**, opt-in | + +The governing rule is **measure before you activate**: a feedback loop only flips to on-by-default after `merge eval-memory` shows lift **> 0** *and* causally-attributed harm **= 0** on a fixed dataset. First baseline (forgejo, 124 files): lift measured at **0.0000** — so the loops stay opt-in. That run was dominated by deterministic mechanisms (take-target + veto), leaving memory no room to act; it does **not** prove memory worthless, and an LLM-judgment-dense dataset is needed to measure real lift. We report the zero rather than hide it — that *is* the trust signal. 
+ +--- + +## Prerequisites + +| | | +|--|--| +| Python 3.11+ | mypy strict / Pydantic v2 / async throughout | +| `ANTHROPIC_API_KEY` | Planner, ConflictAnalyst, Judge, HumanInterface | +| `OPENAI_API_KEY` | PlannerJudge, Executor (dual-provider anti-collusion) | +| `GITHUB_TOKEN` *(optional)* | GitHub integration — pull PR comments, push merge results | +| Node.js *(optional)* | Web UI development only; the installed wheel bundles `web/dist/` | + +**Target repo must:** +- Be a git repo with a clean working tree (`git status` shows no uncommitted changes) +- Have upstream accessible locally — either as a branch or via `git fetch ` ```bash -merge upstream/main # 不带 --dry-run,正式跑 +# If you haven't added upstream yet: +git remote add upstream https://github.com//.git +git fetch upstream ``` -系统会从 `INITIALIZE` 开始重新走一遍直到 `AUTO_MERGING` / `CONFLICT_ANALYSIS`,写入文件、做快照、跑门禁。 +--- + +## Full Workflow + +### 1. Plan (dry-run) -> **任意时刻 Ctrl+C 都安全** —— 已经写盘的 checkpoint 让你下次用 `merge resume --run-id ` 续跑。 +```bash +cd /path/to/your-fork-repo +merge upstream/main --dry-run +``` + +The browser UI opens and runs through `INITIALIZE → PLANNING → PLAN_REVIEW → AWAITING_HUMAN` then stops. Check the output reports: + +``` +.merge/plans/MERGE_PLAN_.md # file-by-file merge strategy +.merge/runs//plan_review.md # PlannerJudge audit record +``` + +### 2. Merge + +```bash +merge upstream/main # remove --dry-run to run for real +``` -### 4. 处理人工决策(AWAITING_HUMAN) +Any `Ctrl+C` is safe — resume with `merge resume --run-id `. -当系统遇到 risk_score 高于 `human_escalation` 的文件、或 Judge 判定不通过时,会暂停在 `AWAITING_HUMAN`,并在 `.merge/runs//` 下生成一个待填的 `decisions.yaml` 模板: +### 3. 
Handle Human Decisions + +When the system pauses at `AWAITING_HUMAN`, fill in `.merge/runs//decisions.yaml`: ```yaml -# decisions.yaml — 系统生成模板,你填决定 - file_path: "backend/services/auth/auth.service.ts" - decision: take_current # 可选:take_target / take_current / semantic_merge / escalate_human - rationale: "fork 用 SSO,必须保留" + decision: take_current # take_target / take_current / semantic_merge / escalate_human + rationale: "Fork uses SSO — must preserve" ``` -填完续跑: +Then resume: ```bash merge resume --run-id --decisions .merge/runs//decisions.yaml ``` -### 5. 最终产出 - -合并跑完后看: +### 4. Review and commit -| 路径 | 说明 | -|---|---| -| `.merge/runs//merge_report.md` | 最终合并报告(变更摘要、Judge verdict、未解决项) | -| `.merge/runs//checkpoint.json` | 完整状态,可继续 resume | -| `.merge/runs//logs/run_.log` | 全量执行日志 | -| 工作树本身 | 合并产物已落到当前分支;`git status` 看具体改了什么;自己决定是否 `git commit` | +``` +.merge/runs//merge_report.md # final report +.merge/runs//checkpoint.json # full state +.merge/runs//logs/run_.log # complete execution log +``` -> **系统不自动 commit / push** —— 写到工作树就停手,让你 review 完再提交。 +The system stops at the working tree. **It never auto-commits or auto-pushes** — you review, then decide. 
-## 常用命令 +--- -按使用场景分组: +## All Commands ```bash -# === 首次接入 / 日常合并 === -merge # 一站式(默认浏览器 Web UI) -merge --dry-run # 只跑到 plan,不动文件 -merge --no-web # 纯文本输出 -merge -r # 强制重新跑配置向导 - -# === 续跑 / 决策 === -merge resume --run-id # 从 checkpoint 续跑 -merge resume --run-id --decisions decisions.yaml # 带人工决策续跑 -merge resume --run-id --web # 在浏览器 Web UI 中续跑 / 查看历史 run 状态 - -# === 校验 === -merge validate --config # 校验 config.yaml + 所有 api_key_env - -# === forks-profile(仅在做 fork 整域裁剪时用)=== -merge forks-profile init -o .merge/forks-profile.yaml # 起草草稿 -merge forks-profile diff # 检查 yaml 是否过时 -merge forks-profile validate # 校验 yaml 语法 - -# === CI === -merge --ci # 无交互,JSON 摘要到 stdout +# Daily use +merge # default: browser Web UI +merge --dry-run # plan only, no file writes +merge --no-web # terminal output +merge -r # re-run setup wizard + +# Resume / decisions +merge resume --run-id +merge resume --run-id --decisions decisions.yaml +merge resume --run-id --web # view history in browser + +# Validate +merge validate --config # check config + all API keys + +# Fork profile (only needed when fork deleted ≥30 files) +merge forks-profile init -o .merge/forks-profile.yaml +merge forks-profile diff +merge forks-profile validate + +# CI +merge --ci # non-interactive, JSON summary to stdout +merge --ci --auto-decisions ``` -## 卡住了? 
- -| 现象 | 排查 | -|---|---| -| 向导报 "API Key not set" | 检查 `merge validate --config .merge/config.yaml`;shell env > `.merge/.env` > `~/.config/code-merge-system/.env` | -| 启动报 "working tree dirty" | `git status` 看到未提交改动;`git stash` 或 `git commit` 后再跑 | -| 启动报 "upstream ref not found" | 没 `git fetch upstream`,或者 `target-branch` 拼错(要写 `upstream/main` 不是 `main`) | -| dry-run 卡在 PLAN_REVIEW 多轮 | Planner 与 PlannerJudge 在博弈;正常 1-2 轮,`max_plan_revision_rounds=2` 后会转 `AWAITING_HUMAN`,去看 `plan_review.md` | -| 跑了一半中断 | 重新跑 `merge resume --run-id `(`run_id` 在 `.merge/runs/` 下能看到) | -| 想丢弃这次 run 重来 | `rm -rf .merge/runs//`,再 `merge ` | +--- -## 架构一览 +## Troubleshooting -``` -CLI / Web UI - │ - Orchestrator ── 状态机驱动 8 个 Phase - │ - ┌────┴─────┐ - │ │ -Agents Tools Memory -(7 角色) (50+ 工具 + (L0/L1/L2 - baseline parsers) 三层记忆) - │ -LLM 层(anthropic / openai,凭据池、智能路由、压缩) -``` +| Symptom | Fix | +|---------|-----| +| `API Key not set` | Run `merge validate --config .merge/config.yaml`; check shell env → `.merge/.env` → `~/.config/code-merge-system/.env` | +| `working tree dirty` | `git stash` or `git commit`, then re-run | +| `upstream ref not found` | Run `git fetch upstream`; use `upstream/main` not `main` | +| Plan review stuck in multiple rounds | Normal — Planner and PlannerJudge are negotiating; after `max_plan_revision_rounds=2` it transitions to `AWAITING_HUMAN`. Check `plan_review.md`. 
| +| Run interrupted mid-way | `merge resume --run-id ` (find `run_id` under `.merge/runs/`) | +| Want to start over | `rm -rf .merge/runs//`, then re-run | -| Agent | 角色 | 默认模型 | -|-------|------|----------| -| Planner | 生成合并计划 | Claude Opus | -| PlannerJudge | 审查计划 | GPT-4o | -| ConflictAnalyst | 高风险冲突语义分析 | Claude Sonnet | -| Executor | **唯一写权限**,应用合并 | GPT-4o | -| Judge | 审查合并结果 + 确定性复检 | Claude Opus | -| HumanInterface | 决策模板生成 | Claude Haiku | -| SmokeTest | 合并后冒烟测试 | — | +--- -每个 Agent 的模型、API Key、降档策略均可在 `config.yaml` 中独立配置。 +## Development -## `.merge/` 生产目录布局 +```bash +git clone && cd code-merge-system +python3.11 -m venv .venv && source .venv/bin/activate +pip install -e ".[dev]" -pip 安装后在目标仓库运行时,所有产物写入 `/.merge/`: +pytest tests/unit/ -q # unit tests (no LLM calls) +pytest tests/integration/ -v # integration tests (real API, local only) +mypy src # type check (strict) +ruff check src/ && ruff format src/ # lint + format -``` -.merge/ - config.yaml # 首次运行自动生成 - .env # API Keys,自动 gitignore - .gitignore # 自动生成 - plans/ # MERGE_PLAN_.md 报告 - runs// - checkpoint.json - merge_report.md - plan_review.md - logs/run_.log +# Web UI (only needed for frontend changes) +cd web && npm install +cd web && npm run dev # Vite dev server at localhost:5173 +cd web && npm run build # tsc + build → web/dist/ +cd web && npm test # vitest ``` -API Key 解析顺序:**Shell env → `.merge/.env` → `~/.config/code-merge-system/.env`** +**Architecture constraints enforced by unit tests — do not violate:** -## 文档 +- No `TIMEOUT_DEFAULT` on `DecisionSource` — human decisions must be explicit +- `Judge` / `PlannerJudge` receive `ReadOnlyStateView` — no state writes from reviewer agents +- `Executor` uses `apply_with_snapshot()` — no direct file writes +- `plan_revision_rounds >= max` → `AWAITING_HUMAN`, not `FAILED` +- `HumanInterface` never fills in default decisions -完整中文文档索引见 [`doc/README.md`](doc/README.md)。关键入口: +--- -- [**新人上手指南**](doc/modules/onboarding.md) — 第一次接触本项目必读 -- 
[系统架构](doc/architecture.md) — 分层 / 数据流 / 持久化 / 扩展点 -- [执行流程与状态机](doc/flow.md) — 13 个状态、8 个 Phase -- [六大丢失模式 + P0/P1/P2 加固项](doc/multi-agent-optimization-from-merge-experience.md) -- [迁移感知合并](doc/migration-aware-merge.md) — bulk-copy 场景 -- [风险等级](doc/risk-levels.md) +## Contributing -模块技术文档(`doc/modules/`): +Contributions are welcome — whether it's a bug report, a feature idea, or a pull request. -| 模块 | 文档 | -|---|---| -| 数据模型(Pydantic v2) | [data-models.md](doc/modules/data-models.md) | -| Agents | [agents.md](doc/modules/agents.md) | -| Core(Orchestrator / Phases / Checkpoint) | [core.md](doc/modules/core.md) | -| Tools(扫描器 / 门禁 / Git) | [tools.md](doc/modules/tools.md) | -| LLM 层(路由 / 压缩 / 凭据池) | [llm.md](doc/modules/llm.md) | -| 记忆系统(L0/L1/L2) | [memory.md](doc/modules/memory.md) | -| CLI / Web UI | [cli.md](doc/modules/cli.md) | -| Web UI 浏览器端用户旅程 | [web-ui.md](doc/web-ui.md) | +**Good places to start:** -## 参考开源项目 +- 🐛 **[Report a bug](../../issues/new?template=bug_report.md)** — include your Python version, the command you ran, and the relevant log from `.merge/runs//logs/` +- 💡 **[Request a feature](../../issues/new?template=feature_request.md)** — describe your fork/upstream scenario and what the system currently gets wrong +- 🔧 **[Browse open issues](../../issues)** — look for `good first issue` labels if you want a guided starting point -本项目在设计过程中参考了多个开源实现,相关分析文档位于 [`doc/references/`](doc/references/): +**Before submitting a PR:** -| 项目 | 类型 | 借鉴点 | -|---|---|---| -| [Weave](https://github.com/ataraxy-labs/weave) | 语义合并引擎 | tree-sitter entity-level merge;函数/类粒度三方合并 | -| [merge-engine](https://docs.rs/merge-engine/) | Rust 合并库 | 4 层合并策略(Pattern DSL → CST → VSA → Genetic) | -| [Mergiraf](https://mergiraf.org/) | AST 结构化合并 | AST 级语法感知合并 | -| [git-machete](https://github.com/VirtusLab/git-machete) | 分支工作流 | Fork-point 推断 + `--override-to` 手动校正 | -| [mergefix](https://pypi.org/project/mergefix/) | AI 冲突修复 | LLM 后处理冲突标记 | -| 
[reconcile-ai](https://github.com/kailashchanel/reconcile-ai) | 批量冲突修复 | 批量提示节省成本 | -| [clash](https://github.com/clash-sh/clash) | 并行 Agent | Worktree 级冲突检测 | -| [NousResearch/hermes-agent](https://github.com/nousresearch/hermes-agent) | Agent 架构 | 工具抽象与 Agent 协作模式 | -| Graphify | 代码知识图谱 | 用图谱压缩代码上下文 | -| MemPalace | 记忆系统 | 语义索引 + 分层记忆 | +1. Run `pytest tests/unit/` — all tests must pass +2. Run `mypy src` — no new type errors +3. Run `ruff check src/` — no lint errors +4. Keep new files under 800 lines; organize by feature layer (`models → tools → llm → agents → core → cli`) +5. New agents require a contract yaml under `src/agents/contracts/` — see [`src/agents/contracts/_schema.md`](src/agents/contracts/_schema.md) -详细对照见 [`doc/references/opensource-comparison.md`](doc/references/opensource-comparison.md) 与各 `*-analysis.md`。 +**Key docs for contributors:** -## 开发 +- [System Architecture](doc/architecture.md) — layers, data flow, persistence, extension points +- [State Machine & Phases](doc/flow.md) — all 13 states and 8 phases +- [Agent Contracts](src/agents/contracts/_schema.md) — how to add a new agent correctly +- [Adding a New Agent](doc/modules/agents.md) — step-by-step recipe -```bash -pytest tests/unit/ -q # 单元测试(不打 LLM API) -pytest tests/integration/ -v # 集成测试(打真 API,本地跑,不进 CI) -mypy src # 类型检查(strict 模式) -ruff check src/ # Lint -ruff format src/ # 格式化 - -# Web UI 开发(pip 安装的 wheel 已内置 src/web/dist;下面命令仅开发时需要) -cd web && npm install # 装依赖 -cd web && npm run start # 启动 Vite dev server -cd web && npm run build # tsc + vite build → web/dist/ -cd web && npm test # vitest -``` +--- -关键约束(PR review 会检查): +## Documentation -- 不要给 `DecisionSource` 加 `TIMEOUT_DEFAULT` -- Judge / PlannerJudge 只接收 `ReadOnlyStateView` -- Executor 写文件必须走 `apply_with_snapshot()` -- `plan_revision_rounds >= max` 时转 `AWAITING_HUMAN`,不是 `FAILED` -- HumanInterface 不填默认值 +Full index: [`doc/README.md`](doc/README.md) -## 许可证 +| | | +|--|--| +| [Onboarding Guide](doc/modules/onboarding.md) | 
Start here if you're new to the project | +| [Architecture](doc/architecture.md) | Layers, data flow, persistence, extension points | +| [Flow & State Machine](doc/flow.md) | 13 states, 8 phases | +| [Six Lost Patterns + P0/P1/P2 Hardening](doc/multi-agent-optimization-from-merge-experience.md) | How we catch what `git merge` misses | +| [Evaluation Framework](doc/evaluation/README.md) | Golden-merge ground truth, 5 trust dimensions, 3 dataset tiers, acceptance gates | +| [Self-Learning System](doc/plan/self-learning-system.md) | Non-parametric memory + grounded feedback loop, phased rollout | +| [Migration-Aware Merge](doc/migration-aware-merge.md) | Handling bulk-copy scenarios | +| [Risk Levels](doc/risk-levels.md) | How files are classified A–E | +| [Web UI User Journey](doc/web-ui.md) | Browser-side walkthrough | + +--- + +## License + +MIT + +--- -TBD +
+ Built for teams that maintain long-lived forks and need more than git merge. +
diff --git a/README_zh.md b/README_zh.md new file mode 100644 index 0000000..26c0823 --- /dev/null +++ b/README_zh.md @@ -0,0 +1,412 @@ +
+ +**中文** | [English](README.md) + +# 🔀 Code Merge System + +### 把长期分叉仓库的 upstream 升级变成一条流水线——而不是 500 个文件冲突。 + +一个多 Agent 合并管道,把数月的 upstream 积压变成**可审计、可恢复、安全**的合并——同时保留 fork 里的每一处定制。 + +[![Python 3.11+](https://img.shields.io/badge/python-3.11+-3776AB.svg?logo=python&logoColor=white)](https://python.org) +[![测试](https://img.shields.io/badge/tests-passing-brightgreen.svg)](#开发) +[![覆盖率](https://img.shields.io/badge/coverage-80%25+-brightgreen.svg)](#开发) +[![许可证](https://img.shields.io/badge/license-MIT-blue.svg)](#许可证) +[![Powered by](https://img.shields.io/badge/powered%20by-Claude%20%2B%20GPT-orange.svg)](https://anthropic.com) + +![Code Merge System 控制台](doc/project-1.png) + +
+ +--- + +## 问题在哪里 + +长期维护 fork 的团队在同步 upstream 时面临的现实是残酷的: + +- **数百到数千个文件冲突**——根本无法逐一人工处理 +- **行级 diff 掩盖了语义意图**——LLM 和人都容易判断出错 +- **fork 独有的定制被静默覆盖**——API、路由、CI job、哨兵逻辑消失了,无人察觉 +- **一处合并错误就可能导致运行时漏洞或功能缺失**——而且难以回滚 + +`git merge` 给你一份冲突列表。Code Merge System 给你一条**决策流水线**。 + +--- + +## 快速开始 + +```bash +pip install code-merge-system + +export ANTHROPIC_API_KEY=sk-ant-... +export OPENAI_API_KEY=sk-... + +cd /path/to/your-fork-repo +merge upstream/main --dry-run # 先预览合并计划,不动任何文件 +``` + +> 首次运行会打开浏览器并引导完成一次性配置向导。配置保存至 `.merge/config.yaml`,之后运行无需再配置。 + +--- + +## 界面一览 + + + + + + + + + + +
+ +**计划审查** — 124 个文件,87.9% 自动合并置信度,A–E 五类变更分布。 + +![计划审查](doc/project-2.png) + + + +**冲突解决** — 并排展示 fork 与 upstream 的变更意图,LLM 给出合并策略推荐(SEMANTIC_MERGE 85% 置信度)。 + +![冲突解决](doc/project-4.png) + +
+ +**Judge 裁决** — 独立 Review Agent 审查每个已合并文件;按 CRITICAL/HIGH/MEDIUM/LOW 分级列出问题,支持多轮修复。 + +![Judge 裁决](doc/project-5.png) + + + +**运行报告** — 完整费用明细(124 个文件 $0.04),每个 Agent 的 token 用量,以及本次 run 写入的记忆条目。 + +![运行报告](doc/project-6.png) + +
+ +--- + +## 工作原理 + +八个阶段由状态机驱动。七个专门化 Agent。每次文件写入前自动快照。任意时刻 `Ctrl+C` 都安全。 + +``` +┌─────────────────────────────────────────────────────────────┐ +│ CLI / Web UI │ +│ │ │ +│ Orchestrator ── 8 阶段状态机 │ +│ │ │ +│ ┌──────┴───────┐ │ +│ │ │ │ +│ Agents Tools Memory │ +│ (7 个角色) (50+ 确定性工具 (L0/L1/L2 │ +│ + AST 解析器) 跨 run 记忆存储) │ +│ │ │ +│ LLM 层(Anthropic + OpenAI,凭据池,智能路由) │ +└─────────────────────────────────────────────────────────────┘ +``` + +| 阶段 | 发生了什么 | +|------|-----------| +| `INITIALIZE` | 三方分类、风险打分、forks-profile 路由 | +| `PLANNING` | Planner 生成每文件合并策略 | +| `PLAN_REVIEW` | PlannerJudge 审查计划;最多 2 轮修订 | +| `AWAITING_HUMAN` | 你审阅计划报告;填入 `HUMAN_REQUIRED` 决策 | +| `AUTO_MERGING` | Executor 应用自动安全文件,写前快照 | +| `CONFLICT_ANALYSIS` | ConflictAnalyst 对高风险冲突做语义分析 | +| `JUDGE_REVIEW` | Judge + 50+ 确定性扫描器审查所有合并产物 | +| `COMPLETED` | 生成完整报告;你决定何时 `git commit` | + +| Agent | 角色 | 默认模型 | +|-------|------|----------| +| Planner | 生成合并计划 | Claude Opus | +| PlannerJudge | 审查计划(只读) | GPT-4o | +| ConflictAnalyst | 高风险冲突语义分析 | Claude Sonnet | +| Executor | **唯一写权限**——应用合并 | GPT-4o | +| Judge | 审查合并结果 + 确定性复检 | Claude Opus | +| HumanInterface | 生成决策模板 | Claude Haiku | +| SmokeTest | 合并后冒烟测试 | — | + +> **为什么用两个 LLM 提供商?** Planner/Judge 使用 Anthropic;Executor/PlannerJudge 使用 OpenAI。审查者与执行者用不同供应商,消除共谋偏差。 + +--- + +## 功能特性 + +### [六大丢失模式检测](doc/modules/tools.md) +Shadow 冲突、接口反向影响、顶层调用丢失、配置行保留、Scar 自学习、业务哨兵扫描——这些是 `git merge` 完全检测不到的失败模式。 + +### [写前快照](doc/modules/core.md) +每次文件写入前自动保存原始内容。任何失败触发自动回滚。你不会遇到"合并到一半的文件"。 + +### [全程 Checkpoint](doc/modules/core.md) +每个阶段结束后持久化状态。`merge resume --run-id ` 从上次停止处精确继续——对耗时数小时的大型合并尤其重要。 + +### [显式人工决策](doc/modules/agents.md) +没有 `TIMEOUT_DEFAULT`,没有静默回退。需要人工判断的文件会生成 `decisions.yaml` 模板;跳过的决策保持 `AWAITING_HUMAN` 状态,直到明确填写为止。 + +### [多语言 AST 分块](doc/modules/tools.md) +Python、TypeScript、JavaScript、Go、Rust、Java 和 C 均走 tree-sitter,做语义级 diff——而不只是行级。 + +### [跨 Run 记忆](doc/modules/memory.md) +决策、争议和指标被汇总写入 SQLite 存储。后续在同一仓库上的 run 会加载相关历史记录来辅助规划。 + +### 
[Baseline-Diff 门禁](doc/modules/tools.md) +CI 验证只标记*新引入*的失败,而非已有的。合并到测试本来就挂的仓库时,不会被已有失败阻塞。 + +### [浏览器 Web UI](doc/modules/web-ui.md) +实时流水线进度、冲突解决界面、计划审查、Judge 裁决——全部在本地浏览器 App 中。用 `--no-web` 切换纯终端输出,或 `--ci` 输出 JSON 供 CI 使用。 + +--- + +## 与同类工具对比 + +| | Code Merge System | `git merge` / `git rebase` | GitHub/GitLab UI | LLM 对话(ChatGPT 等) | +|--|--|--|--|--| +| 处理 500+ 文件冲突 | ✅ | ❌ 手动逐一处理 | ❌ | ❌ 上下文限制 | +| 保留 fork 独有功能 | ✅ 通过 scar/sentinel 自动检测 | ❌ 容易被覆盖 | ❌ | ❌ 无仓库上下文 | +| 可审计的决策记录 | ✅ 每文件附理由 | ❌ | 部分(PR 评论) | ❌ | +| 中断后可恢复 | ✅ 每阶段 Checkpoint | ❌ | ❌ | ❌ | +| 确定性安全检查 | ✅ 50+ 扫描器合并后复检 | ❌ | ❌ | ❌ | +| 费用 | ~$0.04 / 124 文件 | 免费 | 免费 | 按 token 计费,无自动化 | + +--- + +## 能不能信任合并产物? + +一个合并工具的价值,取决于它能拿出多少"产物正确"的证据。本项目配套了一套**正式测评系统**和一条**可审计的自学习闭环**——并且如实汇报结果,包括那些目前还不漂亮的数字。 + +### 以人工黄金合并为 Ground Truth 的测评 + +我们**不**让 LLM Judge 给自己的 verdict 打分。[`doc/evaluation/`](doc/evaluation/README.md) 下的测评系统以**专家人工黄金合并(Human Golden Merge)作为 Ground Truth**,按统一差分协议度量系统产物与黄金合并的偏差,并同时考核五个信任维度——一个"全部直接 take_target、覆盖率 100% 却丢了一半 fork 改动"的系统必须在这里被判不通过: + +| 维度 | 回答的问题 | 主要指标 | +|------|-----------|---------| +| **正确性** | 该合的合了没?合得对不对? | 漏合率、错合率、冲突解决正确率 | +| **安全性** | 有没有偷偷丢掉私有改动? | M1–M6 语义丢失召回、安全敏感文件人工率、快照回滚率 | +| **过程可信** | 不确定的事会上报还是硬猜? | 升级率、Plan Dispute 命中率、Judge↔Ground Truth 一致率 | +| **可解释性** | 每个决策都能复盘吗? | rationale 完整率、`discarded_content` 留存率、Trace 可回放率 | +| **运行稳健** | 重复跑、换模型结果稳吗?成本可控吗? 
| 决策一致性、$/run、wall-time P95 | + +三层评估集支撑它:**Tier-1** 微基准(30–60 PR,可进 CI 天天跑)、**Tier-2** 真实长跨度回放(人工合并 diff 即 oracle)、**Tier-3** 对抗注入集(系统真能识别 M1–M6 吗)。评估 harness 位于 [`scripts/eval/`](scripts/eval/)(`prepare.py → run.py → diff_against_golden.py → summarize.py → gate.py`)。 + +**一票否决的硬门**([`acceptance.md`](doc/evaluation/acceptance.md)):错合率 **= 0%**、安全敏感升级率 **= 100%**、私有内容留存率 **= 100%**、快照回滚成功率 **= 100%**、重复顶层符号数 **= 0**、幻觉跨模块引用数 **= 0**;漏合率 **≤ 2%**(Tier-1),M1–M6 各类召回 **≥ 95%**。软门跟踪总正确率(≥ 92% Tier-1)、决策一致性(3 次 run ≥ 90%)、跨模型一致性(≥ 85%)以及成本/时延漂移上限。 + +> **诚实优先于营销:** `acceptance.md` 的版本基线表目前仍是模板行——尚无任何版本跑通完整 gate,因此我们**不对外宣称"已通过评估、可信"**。这套系统存在的意义,正是为了让这句承诺一旦做出,背后是可锁定的数据集 SHA 与逐文件黄金差分,而不是一句"合并成功率 99%"。 + +### 自学习——靠度量,而非假设 + +系统跨 run 自我改进,**不微调权重、不引入 embedding**——这是经 24 源调研支撑的刻意选择(见 [`doc/plan/self-learning-system.md`](doc/plan/self-learning-system.md)):非参数化、可审计的 SQLite 记忆 + 执行接地的反思,在成本与"可删除性"上胜过不透明的权重 RL。 + +| 阶段 | 做什么 | 状态 | +|------|-------|------| +| **P0** 有效性度量 | 消融 harness:`memory=on` vs `memory=off` 决策增益 | **已落地** — `merge eval-memory` | +| **P1** 执行接地反馈环 | 有害条目持久可审计软删 · 由 `judge`+`compile`+`ci` 信号写回 confidence · verified-repair 修复配方库 | **已落地**,反馈环在消融证明净收益前默认 **opt-in** | +| **P2** 记忆质量加固 | 强制高信息条目 · 关键不变量锚定防摘要漂移 | **已落地** | +| **P3** 离线提示优化 | `merge optimize-prompts` 按黄金集排名 gate 提示变体,产**人工评审报告——绝不自动写回** | **已落地**,opt-in | + +核心准则是**先度量再激活**:任一反馈环只有在 `merge eval-memory` 于固定数据集上显示增益 **> 0** **且**因果归因的有害数 **= 0** 后,才翻为默认开启。首组基线(forgejo,124 文件)实测增益 **0.0000**——所以反馈环维持 opt-in。那次 run 由确定性机制(take-target + veto)主导,记忆没有用武之地;这**不**证明记忆无价值,需要 LLM 判断密集的数据集才能测出真实增益。我们如实报告这个零,而非藏起来——这本身就是信任信号。 + +--- + +## 前置要求 + +| | | +|--|--| +| Python 3.11+ | mypy strict / Pydantic v2 / async 全程 | +| `ANTHROPIC_API_KEY` | Planner、ConflictAnalyst、Judge、HumanInterface | +| `OPENAI_API_KEY` | PlannerJudge、Executor(双供应商防共谋) | +| `GITHUB_TOKEN` *(可选)* | GitHub 集成——拉取 PR 评论、推送合并结果 | +| Node.js *(可选)* | 仅 Web UI 开发;安装包已内置 `web/dist/` | + +**目标仓库需满足:** +- 是一个 git 仓库,且工作树干净(`git status` 无未提交改动) +- 
upstream 可在本地访问——作为分支或通过 `git fetch <remote>` 已拉取 + +```bash +# 如果还没添加 upstream 远端: +git remote add upstream https://github.com/<org>/<repo>.git +git fetch upstream +``` + +--- + +## 完整流程 + +### 1. 预跑(dry-run) + +```bash +cd /path/to/your-fork-repo +merge upstream/main --dry-run +``` + +浏览器打开并依次推进 `INITIALIZE → PLANNING → PLAN_REVIEW → AWAITING_HUMAN`,然后停止。重点查看: + +``` +.merge/plans/MERGE_PLAN_<run_id>.md # 每文件合并策略 +.merge/runs/<run_id>/plan_review.md # PlannerJudge 审查记录 +``` + +### 2. 正式合并 + +```bash +merge upstream/main # 去掉 --dry-run 正式运行 +``` + +任意时刻 `Ctrl+C` 都安全——用 `merge resume --run-id <run_id>` 续跑。 + +### 3. 处理人工决策 + +当系统在 `AWAITING_HUMAN` 暂停时,填写 `.merge/runs/<run_id>/decisions.yaml`: + +```yaml +- file_path: "backend/services/auth/auth.service.ts" + decision: take_current # take_target / take_current / semantic_merge / escalate_human + rationale: "fork 用 SSO,必须保留" +``` + +然后续跑: + +```bash +merge resume --run-id <run_id> --decisions .merge/runs/<run_id>/decisions.yaml +``` + +### 4. 审阅并提交 + +``` +.merge/runs/<run_id>/merge_report.md # 最终合并报告 +.merge/runs/<run_id>/checkpoint.json # 完整状态 +.merge/runs/<run_id>/logs/run_<run_id>.log # 全量执行日志 +``` + +系统在写入工作树后停手。**它不会自动 commit 或 push**——你审阅完再决定。 + +--- + +## 常用命令 + +```bash +# 日常使用 +merge # 默认:浏览器 Web UI +merge --dry-run # 仅跑到计划,不动文件 +merge --no-web # 纯终端输出 +merge -r # 重新运行配置向导 + +# 续跑 / 决策 +merge resume --run-id <run_id> +merge resume --run-id <run_id> --decisions decisions.yaml +merge resume --run-id <run_id> --web # 在浏览器中查看历史 run + +# 校验 +merge validate --config <path> # 检查 config + 所有 API Key + +# Fork Profile(仅在 fork 删除了 ≥30 个文件时需要) +merge forks-profile init -o .merge/forks-profile.yaml +merge forks-profile diff +merge forks-profile validate + +# CI +merge --ci # 无交互,JSON 摘要输出到 stdout +merge --ci --auto-decisions +``` + +--- + +## 排查问题 + +| 现象 | 解决方法 | +|------|---------| +| `API Key not set` | 运行 `merge validate --config .merge/config.yaml`;检查 shell env → `.merge/.env` → `~/.config/code-merge-system/.env` | +| `working tree dirty` | `git stash` 或 `git commit`,再重跑 | +| `upstream ref not found` | 执行 `git fetch upstream`;用 `upstream/main` 
而非 `main` | +| Plan review 卡在多轮协商 | 正常现象——Planner 与 PlannerJudge 在博弈;`max_plan_revision_rounds=2` 后自动转 `AWAITING_HUMAN`,去看 `plan_review.md` | +| 中途中断 | `merge resume --run-id <run_id>`(`run_id` 在 `.merge/runs/` 下可以看到) | +| 想从头来过 | `rm -rf .merge/runs/<run_id>/`,再重新运行 | + +--- + +## 开发 + +```bash +git clone <repo-url> && cd code-merge-system +python3.11 -m venv .venv && source .venv/bin/activate +pip install -e ".[dev]" + +pytest tests/unit/ -q # 单元测试(不打 LLM API) +pytest tests/integration/ -v # 集成测试(真实 API,本地运行,不进 CI) +mypy src # 类型检查(strict 模式) +ruff check src/ && ruff format src/ # Lint + 格式化 + +# Web UI(仅前端改动时需要) +cd web && npm install +cd web && npm run dev # Vite dev server,localhost:5173 +cd web && npm run build # tsc + build → web/dist/ +cd web && npm test # vitest +``` + +**由单测强制保证的架构约束——不得违反:** + +- `DecisionSource` 不加 `TIMEOUT_DEFAULT`——人工决策必须显式 +- `Judge` / `PlannerJudge` 接收 `ReadOnlyStateView`——审查 Agent 不得写 state +- `Executor` 使用 `apply_with_snapshot()`——不得直接写文件 +- `plan_revision_rounds >= max` 时转 `AWAITING_HUMAN`,不是 `FAILED` +- `HumanInterface` 不填入默认决策 + +--- + +## 参与贡献 + +欢迎任何形式的贡献——无论是 Bug 报告、功能建议还是 PR。 + +**适合入手的地方:** + +- 🐛 **[报告 Bug](../../issues/new?template=bug_report.md)** — 请附上 Python 版本、运行的命令,以及 `.merge/runs/<run_id>/logs/` 中的相关日志 +- 💡 **[提功能建议](../../issues/new?template=feature_request.md)** — 描述你的 fork/upstream 场景,以及系统目前哪里处理得不对 +- 🔧 **[浏览 Issues](../../issues)** — 找标有 `good first issue` 标签的任务,适合初次贡献 + +**提交 PR 前请确认:** + +1. `pytest tests/unit/` 全部通过 +2. `mypy src` 无新类型错误 +3. `ruff check src/` 无 lint 错误 +4. 新文件不超过 800 行;按功能层组织(`models → tools → llm → agents → core → cli`) +5. 
新 Agent 需在 `src/agents/contracts/` 下创建 contract yaml,参见 [`src/agents/contracts/_schema.md`](src/agents/contracts/_schema.md) + +**贡献者必读文档:** + +- [系统架构](doc/architecture.md) — 分层、数据流、持久化、扩展点 +- [状态机与阶段](doc/flow.md) — 全部 13 个状态和 8 个阶段 +- [Agent Contract](src/agents/contracts/_schema.md) — 如何正确添加新 Agent +- [新增 Agent 指南](doc/modules/agents.md) — 分步操作手册 + +--- + +## 文档 + +完整索引见 [`doc/README.md`](doc/README.md) + +| | | +|--|--| +| [新人上手指南](doc/modules/onboarding.md) | 第一次接触本项目必读 | +| [系统架构](doc/architecture.md) | 分层、数据流、持久化、扩展点 | +| [执行流程与状态机](doc/flow.md) | 13 个状态、8 个阶段 | +| [六大丢失模式 + P0/P1/P2 加固](doc/multi-agent-optimization-from-merge-experience.md) | 我们如何捕获 `git merge` 遗漏的失败 | +| [测评系统](doc/evaluation/README.md) | 黄金合并 Ground Truth、五大信任维度、三层评估集、验收门 | +| [自学习系统](doc/plan/self-learning-system.md) | 非参数化记忆 + 执行接地反馈环,分阶段落地 | +| [迁移感知合并](doc/migration-aware-merge.md) | 批量复制场景的处理 | +| [风险等级](doc/risk-levels.md) | 文件如何被分为 A–E 类 | +| [Web UI 用户旅程](doc/web-ui.md) | 浏览器端完整操作流程 | + +--- + +## 许可证 + +MIT + +--- + +
+ 为长期维护 fork、不满足于 git merge 的团队而生。 +
diff --git a/pyproject.toml b/pyproject.toml index 76fa56a..3ace067 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,9 +37,9 @@ dependencies = [ ] [project.urls] -Homepage = "https://github.com/angel/code-merge-system" -Repository = "https://github.com/angel/code-merge-system" -Issues = "https://github.com/angel/code-merge-system/issues" +Homepage = "https://github.com/GOSICK-Angel/code-merge-system" +Repository = "https://github.com/GOSICK-Angel/code-merge-system" +Issues = "https://github.com/GOSICK-Angel/code-merge-system/issues" [project.scripts] merge = "src.cli.main:cli"