Skip to content
Merged
366 changes: 366 additions & 0 deletions scripts/tests/test_agent_comparison_optimize_loop.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import contextlib
import importlib.util
import json
import os
import subprocess
import sys
from pathlib import Path
Expand Down Expand Up @@ -673,6 +674,47 @@ def fake_run_blind_compare_eval(
assert scores["task_results"][0]["winner"] == "candidate"


def test_assess_target_trigger_eval_uses_multiple_runs(tmp_path, monkeypatch):
    """assess_target must forward behavioral_runs_per_task to the trigger-rate eval."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop_trigger_runs",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    skill_path = tmp_path / "SKILL.md"
    skill_path.write_text("---\ndescription: trigger eval test\n---\n")
    benchmark_tasks = [{"query": "inspect this repo", "should_trigger": True}]
    captured = {}

    def fake_run_trigger_rate(
        target_path,
        description,
        tasks,
        candidate_content=None,
        eval_mode="auto",
        num_workers=1,
        timeout=30,
        runs_per_query=3,
        verbose=False,
    ):
        # Record what assess_target forwarded so the test can inspect it below.
        captured["runs_per_query"] = runs_per_query
        passing_row = {
            "query": "inspect this repo",
            "should_trigger": True,
            "trigger_rate": 1.0,
            "pass": True,
        }
        return {
            "results": [passing_row],
            "summary": {"total": 1, "passed": 1, "failed": 0},
        }

    monkeypatch.setattr(optimize_loop, "_run_trigger_rate", fake_run_trigger_rate)

    scores = optimize_loop.assess_target(
        skill_path,
        benchmark_tasks,
        "improve routing precision",
        behavioral_runs_per_task=4,
    )

    assert captured["runs_per_query"] == 4
    assert scores["tests_pass"] is True


def test_socratic_question_only_heuristic_penalizes_preamble():
optimize_loop = load_module(
"agent_comparison_optimize_loop_socratic_heuristic",
Expand Down Expand Up @@ -755,6 +797,329 @@ def fake_capture(query, cwd, accepted_skill_ids, timeout=180):
assert results[0]["candidate_reasons"][0] == "mentioned blocked skill tool"


def test_behavioral_eval_sequential_path_uses_isolated_worktrees(tmp_path, monkeypatch):
    """Sequential behavioral eval (parallel_workers=0) must route every task
    through the per-task worktree helper rather than the live checkout.

    Uses ``monkeypatch.chdir`` instead of a manual ``os.chdir``/``try``/``finally``
    dance: pytest restores the working directory automatically even when an
    assertion fails mid-test, and the ``cwd_before`` bookkeeping disappears.
    """
    optimize_loop = load_module(
        "agent_comparison_optimize_loop_behavioral_isolation",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    project_root = tmp_path / "repo"
    (project_root / ".claude").mkdir(parents=True)
    monkeypatch.chdir(project_root)

    calls = []

    def fake_single_task_in_worktree(task, project_root, env, timeout, verbose, runs_per_task, trigger_threshold):
        # Record each invocation; every task should arrive via this helper.
        calls.append((task["query"], project_root))
        return {
            "query": task["query"],
            "triggered": task["should_trigger"],
            "should_trigger": task["should_trigger"],
            "pass": True,
            "new_artifacts": [],
        }

    monkeypatch.setattr(optimize_loop, "_run_single_behavioral_task_in_worktree", fake_single_task_in_worktree)

    results = optimize_loop._run_behavioral_eval(
        tmp_path / "skills" / "example" / "SKILL.md",
        "desc",
        [{"query": "make a skill", "should_trigger": True}],
        parallel_workers=0,
    )

    assert len(calls) == 1
    assert results[0]["pass"] is True


def test_holdout_score_attaches_to_best_iteration_not_last(tmp_path, monkeypatch):
    """Holdout (test-split) scores must be recorded against the iteration whose
    candidate earned them, not blindly against the most recent iteration."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop_holdout_attachment",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    target = tmp_path / "SKILL.md"
    target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n")

    benchmark_payload = {
        "tasks": [
            {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"},
            {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"},
        ]
    }
    tasks_file = tmp_path / "tasks.json"
    tasks_file.write_text(json.dumps(benchmark_payload))

    generated_markers = []  # one marker per generated variant, in call order

    def fake_generate_variant_output(*args, **kwargs):
        marker = f"candidate-{len(generated_markers) + 1}"
        generated_markers.append(marker)
        return {
            "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n",
            "summary": marker,
            "reasoning": marker,
            "tokens_used": 1,
            "deletions": [],
            "deletion_justification": "",
        }

    monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output)

    def fake_assess_target(
        path,
        tasks,
        goal,
        verbose=False,
        dry_run=False,
        behavioral_runs_per_task=1,
        behavioral_trigger_threshold=0.5,
        parallel_eval_workers=0,
        candidate_content=None,
        baseline_content=None,
        eval_mode="auto",
    ):
        content = candidate_content or path.read_text()
        on_test_split = bool(tasks and tasks[0].get("split") == "test")
        if on_test_split:
            # candidate-1 dominates the holdout split; everything else tanks.
            score = 9.0 if "candidate-1" in content else 3.0
        elif "candidate-1" in content:
            score = 8.0
        elif "candidate-2" in content:
            score = 7.0
        else:
            score = 5.0
        return {
            "parses": True,
            "compiles": True,
            "tests_pass": True,
            "protected_intact": True,
            "correctness": score,
            "error_handling": 0.0,
            "language_idioms": 0.0,
            "testing": 0.0,
            "efficiency": 0.0,
            "task_results": [{"name": "task", "passed": True}],
        }

    monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target)

    result = optimize_loop.run_optimization_loop(
        target_path=target,
        goal="improve routing precision",
        benchmark_tasks_path=tasks_file,
        max_iterations=1,
        min_gain=0.0,
        output_dir=tmp_path / "out",
        report_path=tmp_path / "out" / "report.html",
        beam_width=2,
        candidates_per_parent=2,
        holdout_check_cadence=1,
        verbose=False,
        dry_run=False,
    )

    iterations_by_number = {it["number"]: it for it in result["iterations"]}
    assert iterations_by_number[1]["score"]["test"] == 3.6
    assert iterations_by_number[2]["score"]["test"] == 1.2


def test_best_by_test_can_switch_from_later_train_best_to_earlier_candidate(tmp_path, monkeypatch):
    """An earlier candidate with a stronger holdout score must win best-iteration
    selection even when a later candidate scores higher on the train split."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop_best_by_test",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    target = tmp_path / "SKILL.md"
    target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n")

    benchmark_payload = {
        "tasks": [
            {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"},
            {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"},
        ]
    }
    tasks_file = tmp_path / "tasks.json"
    tasks_file.write_text(json.dumps(benchmark_payload))

    generated_markers = []  # one marker per generated variant, in call order

    def fake_generate_variant_output(*args, **kwargs):
        marker = f"candidate-{len(generated_markers) + 1}"
        generated_markers.append(marker)
        return {
            "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n",
            "summary": marker,
            "reasoning": marker,
            "tokens_used": 1,
            "deletions": [],
            "deletion_justification": "",
        }

    monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output)

    def fake_assess_target(
        path,
        tasks,
        goal,
        verbose=False,
        dry_run=False,
        behavioral_runs_per_task=1,
        behavioral_trigger_threshold=0.5,
        parallel_eval_workers=0,
        candidate_content=None,
        baseline_content=None,
        eval_mode="auto",
    ):
        content = candidate_content or path.read_text()
        on_test_split = bool(tasks and tasks[0].get("split") == "test")
        if on_test_split:
            # candidate-1 wins on holdout even though candidate-2 wins on train.
            score = 9.0 if "candidate-1" in content else 3.0
        elif "candidate-1" in content:
            score = 8.0
        elif "candidate-2" in content:
            score = 9.0
        else:
            score = 5.0
        return {
            "parses": True,
            "compiles": True,
            "tests_pass": True,
            "protected_intact": True,
            "correctness": score,
            "error_handling": 0.0,
            "language_idioms": 0.0,
            "testing": 0.0,
            "efficiency": 0.0,
            "task_results": [{"name": "task", "passed": True}],
        }

    monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target)

    result = optimize_loop.run_optimization_loop(
        target_path=target,
        goal="improve routing precision",
        benchmark_tasks_path=tasks_file,
        max_iterations=2,
        min_gain=0.0,
        output_dir=tmp_path / "out",
        report_path=tmp_path / "out" / "report.html",
        beam_width=1,
        candidates_per_parent=1,
        holdout_check_cadence=1,
        verbose=False,
        dry_run=False,
    )

    assert result["best_iteration"] == 1
    assert result["best_test_score"] == 3.6


def test_final_report_uses_post_selection_test_scores(tmp_path, monkeypatch):
    """The rendered report must carry the post-selection holdout scores, not
    whatever interim values existed before best-candidate selection."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop_final_report",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    target = tmp_path / "SKILL.md"
    target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n")

    benchmark_payload = {
        "tasks": [
            {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"},
            {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"},
        ]
    }
    tasks_file = tmp_path / "tasks.json"
    tasks_file.write_text(json.dumps(benchmark_payload))

    generated_markers = []  # one marker per generated variant, in call order

    def fake_generate_variant_output(*args, **kwargs):
        marker = f"candidate-{len(generated_markers) + 1}"
        generated_markers.append(marker)
        return {
            "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n",
            "summary": marker,
            "reasoning": marker,
            "tokens_used": 1,
            "deletions": [],
            "deletion_justification": "",
        }

    monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output)

    def fake_assess_target(
        path,
        tasks,
        goal,
        verbose=False,
        dry_run=False,
        behavioral_runs_per_task=1,
        behavioral_trigger_threshold=0.5,
        parallel_eval_workers=0,
        candidate_content=None,
        baseline_content=None,
        eval_mode="auto",
    ):
        content = candidate_content or path.read_text()
        on_test_split = bool(tasks and tasks[0].get("split") == "test")
        if on_test_split:
            score = 9.0 if "candidate-1" in content else 3.0
        else:
            score = 8.0 if "candidate-1" in content else 5.0
        return {
            "parses": True,
            "compiles": True,
            "tests_pass": True,
            "protected_intact": True,
            "correctness": score,
            "error_handling": 0.0,
            "language_idioms": 0.0,
            "testing": 0.0,
            "efficiency": 0.0,
            "task_results": [{"name": "task", "passed": True}],
        }

    def fake_generate_optimization_report(data, auto_refresh=False):
        # Serialize the raw report payload so the test can parse it back out.
        _ = auto_refresh
        return json.dumps(data)

    monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target)
    monkeypatch.setattr(optimize_loop, "generate_optimization_report", fake_generate_optimization_report)

    report_path = tmp_path / "out" / "report.html"
    optimize_loop.run_optimization_loop(
        target_path=target,
        goal="improve routing precision",
        benchmark_tasks_path=tasks_file,
        max_iterations=1,
        min_gain=0.0,
        output_dir=tmp_path / "out",
        report_path=report_path,
        beam_width=1,
        candidates_per_parent=1,
        holdout_check_cadence=1,
        verbose=False,
        dry_run=False,
    )

    report = json.loads(report_path.read_text())
    assert report["iterations"][0]["score"]["test"] == 3.6


def test_run_optimization_loop_forwards_parallel_eval_to_assessments(tmp_path, monkeypatch):
optimize_loop = load_module(
"agent_comparison_optimize_loop_parallel_forwarding",
Expand Down Expand Up @@ -903,6 +1268,7 @@ def fake_run_trigger_rate(
eval_mode="auto",
num_workers=5,
timeout=30,
runs_per_query=3,
verbose=False,
):
passed = trigger_query in description
Expand Down
Loading
Loading