Skip to content
Merged
366 changes: 366 additions & 0 deletions scripts/tests/test_agent_comparison_optimize_loop.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import contextlib
import importlib.util
import json
import os
import subprocess
import sys
from pathlib import Path
Expand Down Expand Up @@ -673,6 +674,47 @@ def fake_run_blind_compare_eval(
assert scores["task_results"][0]["winner"] == "candidate"


def test_assess_target_trigger_eval_uses_multiple_runs(tmp_path, monkeypatch):
    """assess_target must forward behavioral_runs_per_task to the trigger-rate eval."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop_trigger_runs",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    skill_path = tmp_path / "SKILL.md"
    skill_path.write_text("---\ndescription: trigger eval test\n---\n")
    benchmark_tasks = [{"query": "inspect this repo", "should_trigger": True}]
    captured = {}

    def fake_run_trigger_rate(
        target_path,
        description,
        tasks,
        candidate_content=None,
        eval_mode="auto",
        num_workers=1,
        timeout=30,
        runs_per_query=3,
        verbose=False,
    ):
        # Record what assess_target forwarded so the test can inspect it below.
        captured["runs_per_query"] = runs_per_query
        passing_row = {
            "query": "inspect this repo",
            "should_trigger": True,
            "trigger_rate": 1.0,
            "pass": True,
        }
        return {
            "results": [passing_row],
            "summary": {"total": 1, "passed": 1, "failed": 0},
        }

    monkeypatch.setattr(optimize_loop, "_run_trigger_rate", fake_run_trigger_rate)

    scores = optimize_loop.assess_target(
        skill_path,
        benchmark_tasks,
        "improve routing precision",
        behavioral_runs_per_task=4,
    )

    assert captured["runs_per_query"] == 4
    assert scores["tests_pass"] is True


def test_socratic_question_only_heuristic_penalizes_preamble():
optimize_loop = load_module(
"agent_comparison_optimize_loop_socratic_heuristic",
Expand Down Expand Up @@ -755,6 +797,329 @@ def fake_capture(query, cwd, accepted_skill_ids, timeout=180):
assert results[0]["candidate_reasons"][0] == "mentioned blocked skill tool"


def test_behavioral_eval_sequential_path_uses_isolated_worktrees(tmp_path, monkeypatch):
    """Sequential behavioral eval (parallel_workers=0) must route every task
    through the per-task worktree helper rather than the live checkout.

    Uses ``monkeypatch.chdir`` instead of a manual ``os.chdir``/``try``/``finally``
    dance: pytest restores the working directory automatically even when an
    assertion fails mid-test, and the ``cwd_before`` bookkeeping disappears.
    """
    optimize_loop = load_module(
        "agent_comparison_optimize_loop_behavioral_isolation",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    project_root = tmp_path / "repo"
    (project_root / ".claude").mkdir(parents=True)
    monkeypatch.chdir(project_root)

    calls = []

    def fake_single_task_in_worktree(task, project_root, env, timeout, verbose, runs_per_task, trigger_threshold):
        # Record each invocation; every task should arrive via this helper.
        calls.append((task["query"], project_root))
        return {
            "query": task["query"],
            "triggered": task["should_trigger"],
            "should_trigger": task["should_trigger"],
            "pass": True,
            "new_artifacts": [],
        }

    monkeypatch.setattr(optimize_loop, "_run_single_behavioral_task_in_worktree", fake_single_task_in_worktree)

    results = optimize_loop._run_behavioral_eval(
        tmp_path / "skills" / "example" / "SKILL.md",
        "desc",
        [{"query": "make a skill", "should_trigger": True}],
        parallel_workers=0,
    )

    assert len(calls) == 1
    assert results[0]["pass"] is True


def test_holdout_score_attaches_to_best_iteration_not_last(tmp_path, monkeypatch):
    """Holdout (test-split) scores must be recorded against the iteration whose
    candidate earned them, not blindly against the most recent iteration."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop_holdout_attachment",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    target = tmp_path / "SKILL.md"
    target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n")

    benchmark_payload = {
        "tasks": [
            {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"},
            {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"},
        ]
    }
    tasks_file = tmp_path / "tasks.json"
    tasks_file.write_text(json.dumps(benchmark_payload))

    generated_markers = []  # one marker per generated variant, in call order

    def fake_generate_variant_output(*args, **kwargs):
        marker = f"candidate-{len(generated_markers) + 1}"
        generated_markers.append(marker)
        return {
            "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n",
            "summary": marker,
            "reasoning": marker,
            "tokens_used": 1,
            "deletions": [],
            "deletion_justification": "",
        }

    monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output)

    def fake_assess_target(
        path,
        tasks,
        goal,
        verbose=False,
        dry_run=False,
        behavioral_runs_per_task=1,
        behavioral_trigger_threshold=0.5,
        parallel_eval_workers=0,
        candidate_content=None,
        baseline_content=None,
        eval_mode="auto",
    ):
        content = candidate_content or path.read_text()
        on_test_split = bool(tasks and tasks[0].get("split") == "test")
        if on_test_split:
            # candidate-1 dominates the holdout split; everything else tanks.
            score = 9.0 if "candidate-1" in content else 3.0
        elif "candidate-1" in content:
            score = 8.0
        elif "candidate-2" in content:
            score = 7.0
        else:
            score = 5.0
        return {
            "parses": True,
            "compiles": True,
            "tests_pass": True,
            "protected_intact": True,
            "correctness": score,
            "error_handling": 0.0,
            "language_idioms": 0.0,
            "testing": 0.0,
            "efficiency": 0.0,
            "task_results": [{"name": "task", "passed": True}],
        }

    monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target)

    result = optimize_loop.run_optimization_loop(
        target_path=target,
        goal="improve routing precision",
        benchmark_tasks_path=tasks_file,
        max_iterations=1,
        min_gain=0.0,
        output_dir=tmp_path / "out",
        report_path=tmp_path / "out" / "report.html",
        beam_width=2,
        candidates_per_parent=2,
        holdout_check_cadence=1,
        verbose=False,
        dry_run=False,
    )

    iterations_by_number = {it["number"]: it for it in result["iterations"]}
    assert iterations_by_number[1]["score"]["test"] == 3.6
    assert iterations_by_number[2]["score"]["test"] == 1.2


def test_best_by_test_can_switch_from_later_train_best_to_earlier_candidate(tmp_path, monkeypatch):
    """An earlier candidate with a stronger holdout score must win best-iteration
    selection even when a later candidate scores higher on the train split."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop_best_by_test",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    target = tmp_path / "SKILL.md"
    target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n")

    benchmark_payload = {
        "tasks": [
            {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"},
            {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"},
        ]
    }
    tasks_file = tmp_path / "tasks.json"
    tasks_file.write_text(json.dumps(benchmark_payload))

    generated_markers = []  # one marker per generated variant, in call order

    def fake_generate_variant_output(*args, **kwargs):
        marker = f"candidate-{len(generated_markers) + 1}"
        generated_markers.append(marker)
        return {
            "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n",
            "summary": marker,
            "reasoning": marker,
            "tokens_used": 1,
            "deletions": [],
            "deletion_justification": "",
        }

    monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output)

    def fake_assess_target(
        path,
        tasks,
        goal,
        verbose=False,
        dry_run=False,
        behavioral_runs_per_task=1,
        behavioral_trigger_threshold=0.5,
        parallel_eval_workers=0,
        candidate_content=None,
        baseline_content=None,
        eval_mode="auto",
    ):
        content = candidate_content or path.read_text()
        on_test_split = bool(tasks and tasks[0].get("split") == "test")
        if on_test_split:
            # candidate-1 wins on holdout even though candidate-2 wins on train.
            score = 9.0 if "candidate-1" in content else 3.0
        elif "candidate-1" in content:
            score = 8.0
        elif "candidate-2" in content:
            score = 9.0
        else:
            score = 5.0
        return {
            "parses": True,
            "compiles": True,
            "tests_pass": True,
            "protected_intact": True,
            "correctness": score,
            "error_handling": 0.0,
            "language_idioms": 0.0,
            "testing": 0.0,
            "efficiency": 0.0,
            "task_results": [{"name": "task", "passed": True}],
        }

    monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target)

    result = optimize_loop.run_optimization_loop(
        target_path=target,
        goal="improve routing precision",
        benchmark_tasks_path=tasks_file,
        max_iterations=2,
        min_gain=0.0,
        output_dir=tmp_path / "out",
        report_path=tmp_path / "out" / "report.html",
        beam_width=1,
        candidates_per_parent=1,
        holdout_check_cadence=1,
        verbose=False,
        dry_run=False,
    )

    assert result["best_iteration"] == 1
    assert result["best_test_score"] == 3.6


def test_final_report_uses_post_selection_test_scores(tmp_path, monkeypatch):
    """The rendered report must carry the post-selection holdout scores, not
    whatever interim values existed before best-candidate selection."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop_final_report",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    target = tmp_path / "SKILL.md"
    target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n")

    benchmark_payload = {
        "tasks": [
            {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"},
            {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"},
        ]
    }
    tasks_file = tmp_path / "tasks.json"
    tasks_file.write_text(json.dumps(benchmark_payload))

    generated_markers = []  # one marker per generated variant, in call order

    def fake_generate_variant_output(*args, **kwargs):
        marker = f"candidate-{len(generated_markers) + 1}"
        generated_markers.append(marker)
        return {
            "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n",
            "summary": marker,
            "reasoning": marker,
            "tokens_used": 1,
            "deletions": [],
            "deletion_justification": "",
        }

    monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output)

    def fake_assess_target(
        path,
        tasks,
        goal,
        verbose=False,
        dry_run=False,
        behavioral_runs_per_task=1,
        behavioral_trigger_threshold=0.5,
        parallel_eval_workers=0,
        candidate_content=None,
        baseline_content=None,
        eval_mode="auto",
    ):
        content = candidate_content or path.read_text()
        on_test_split = bool(tasks and tasks[0].get("split") == "test")
        if on_test_split:
            score = 9.0 if "candidate-1" in content else 3.0
        else:
            score = 8.0 if "candidate-1" in content else 5.0
        return {
            "parses": True,
            "compiles": True,
            "tests_pass": True,
            "protected_intact": True,
            "correctness": score,
            "error_handling": 0.0,
            "language_idioms": 0.0,
            "testing": 0.0,
            "efficiency": 0.0,
            "task_results": [{"name": "task", "passed": True}],
        }

    def fake_generate_optimization_report(data, auto_refresh=False):
        # Serialize the raw report payload so the test can parse it back out.
        _ = auto_refresh
        return json.dumps(data)

    monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target)
    monkeypatch.setattr(optimize_loop, "generate_optimization_report", fake_generate_optimization_report)

    report_path = tmp_path / "out" / "report.html"
    optimize_loop.run_optimization_loop(
        target_path=target,
        goal="improve routing precision",
        benchmark_tasks_path=tasks_file,
        max_iterations=1,
        min_gain=0.0,
        output_dir=tmp_path / "out",
        report_path=report_path,
        beam_width=1,
        candidates_per_parent=1,
        holdout_check_cadence=1,
        verbose=False,
        dry_run=False,
    )

    report = json.loads(report_path.read_text())
    assert report["iterations"][0]["score"]["test"] == 3.6


def test_run_optimization_loop_forwards_parallel_eval_to_assessments(tmp_path, monkeypatch):
optimize_loop = load_module(
"agent_comparison_optimize_loop_parallel_forwarding",
Expand Down Expand Up @@ -903,6 +1268,7 @@ def fake_run_trigger_rate(
eval_mode="auto",
num_workers=5,
timeout=30,
runs_per_query=3,
verbose=False,
):
passed = trigger_query in description
Expand Down
Loading
Loading