From 127ef5a0a379d3bd01e08fa41f96aff029d9cc46 Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 19:16:52 -0700 Subject: [PATCH 1/5] fix(autoresearch): isolate live registered-skill eval --- scripts/skill_eval/run_eval.py | 261 +++++-- .../test_agent_comparison_optimize_loop.py | 462 ++++++++++++- scripts/tests/test_skill_eval_claude_code.py | 434 ++++++++++++ skills/agent-comparison/SKILL.md | 53 +- .../references/optimization-guide.md | 97 ++- .../references/read-only-ops-short-tasks.json | 16 + .../scripts/generate_variant.py | 201 ++++-- .../agent-comparison/scripts/optimize_loop.py | 651 ++++++++++++------ skills/read-only-ops/SKILL.md | 10 +- 9 files changed, 1822 insertions(+), 363 deletions(-) create mode 100644 skills/agent-comparison/references/read-only-ops-short-tasks.json diff --git a/scripts/skill_eval/run_eval.py b/scripts/skill_eval/run_eval.py index 383e74b5..7b3a509a 100755 --- a/scripts/skill_eval/run_eval.py +++ b/scripts/skill_eval/run_eval.py @@ -6,11 +6,14 @@ """ import argparse +import contextlib import json import os import select +import shutil import subprocess import sys +import tempfile import time import uuid from concurrent.futures import ProcessPoolExecutor, as_completed @@ -32,40 +35,145 @@ def find_project_root() -> Path: return current +def resolve_registered_skill_relpath(skill_path: Path, project_root: Path) -> Path | None: + """Return repo-relative SKILL.md path when `skill_path` is a registered repo skill.""" + skill_md = (skill_path / "SKILL.md").resolve() + try: + rel = skill_md.relative_to(project_root.resolve()) + except ValueError: + return None + if len(rel.parts) >= 3 and rel.parts[0] == "skills" and rel.parts[-1] == "SKILL.md": + return rel + return None + + +def replace_description_in_skill_md(content: str, new_description: str) -> str: + """Replace the top-level frontmatter description field in SKILL.md content.""" + lines = content.splitlines() + if not lines or lines[0].strip() != "---": + raise 
ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + frontmatter_lines = lines[1:end_idx] + body_lines = lines[end_idx + 1 :] + updated_frontmatter: list[str] = [] + replaced = False + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if not replaced and line.startswith("description:"): + updated_frontmatter.append("description: |") + updated_frontmatter.extend(f" {desc_line}" for desc_line in new_description.splitlines()) + replaced = True + i += 1 + while i < len(frontmatter_lines) and ( + frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t") + ): + i += 1 + continue + updated_frontmatter.append(line) + i += 1 + + if not replaced: + raise ValueError("SKILL.md frontmatter missing description field") + + rebuilt = ["---", *updated_frontmatter, "---", *body_lines] + return "\n".join(rebuilt) + ("\n" if content.endswith("\n") else "") + + +def load_eval_set(path: Path) -> list[dict]: + """Load eval tasks from list or common wrapped JSON shapes.""" + payload = json.loads(path.read_text()) + if isinstance(payload, list): + return payload + if isinstance(payload, dict): + if "tasks" in payload and isinstance(payload["tasks"], list): + return payload["tasks"] + if "queries" in payload and isinstance(payload["queries"], list): + return payload["queries"] + train = payload.get("train") + test = payload.get("test") + if isinstance(train, list) or isinstance(test, list): + return [*(train or []), *(test or [])] + raise ValueError("Unsupported eval set format; expected list, {tasks:[...]}, {queries:[...]}, or {train:[...], test:[...]}") + + +@contextlib.contextmanager +def candidate_worktree(project_root: Path, registered_skill_relpath: Path, candidate_content: str | None): + """Create a temporary git worktree and 
optionally patch the target skill content.""" + wt_path_str = tempfile.mkdtemp(prefix="skill-eval-wt-", dir="/tmp") + wt_path = Path(wt_path_str) + wt_path.rmdir() + try: + subprocess.run( + ["git", "worktree", "add", wt_path_str, "HEAD"], + cwd=str(project_root), + capture_output=True, + text=True, + check=True, + ) + if candidate_content is not None: + (wt_path / registered_skill_relpath).write_text(candidate_content) + yield wt_path + finally: + try: + subprocess.run( + ["git", "worktree", "remove", "--force", wt_path_str], + cwd=str(project_root), + capture_output=True, + text=True, + ) + except Exception: + pass + shutil.rmtree(wt_path_str, ignore_errors=True) + + def run_single_query( query: str, skill_name: str, skill_description: str, timeout: int, project_root: str, + eval_mode: str = "alias", model: str | None = None, ) -> bool: """Run a single query and return whether the skill was triggered. - Creates a command file in .claude/commands/ so it appears in Claude's - available_skills list, then runs `claude -p` with the raw query. + In alias mode, creates a command file in .claude/commands/ so it appears in + Claude's available skills list. In registered mode, assumes the real skill + is already present in the isolated worktree and detects only the real name. + Uses --include-partial-messages to detect triggering early from stream events (content_block_start) rather than waiting for the full assistant message, which only arrives after tool execution. 
""" unique_id = uuid.uuid4().hex[:8] clean_name = f"{skill_name}-skill-{unique_id}" + accepted_skill_ids = {clean_name} if eval_mode == "alias" else {skill_name} project_commands_dir = Path(project_root) / ".claude" / "commands" command_file = project_commands_dir / f"{clean_name}.md" try: - project_commands_dir.mkdir(parents=True, exist_ok=True) - # Use YAML block scalar to avoid breaking on quotes in description - indented_desc = "\n ".join(skill_description.split("\n")) - command_content = ( - f"---\n" - f"description: |\n" - f" {indented_desc}\n" - f"---\n\n" - f"# {skill_name}\n\n" - f"This skill handles: {skill_description}\n" - ) - command_file.write_text(command_content) + if eval_mode == "alias": + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) cmd = [ "claude", @@ -140,20 +248,24 @@ def run_single_query( pending_tool_name = tool_name accumulated_json = "" else: - return False + pending_tool_name = None + accumulated_json = "" elif se_type == "content_block_delta" and pending_tool_name: delta = se.get("delta", {}) if delta.get("type") == "input_json_delta": accumulated_json += delta.get("partial_json", "") - if clean_name in accumulated_json: - return True + if any(skill_id in accumulated_json for skill_id in accepted_skill_ids): + triggered = True elif se_type in ("content_block_stop", "message_stop"): if pending_tool_name: - return clean_name in accumulated_json + if any(skill_id in accumulated_json for skill_id in accepted_skill_ids): + triggered = True + pending_tool_name = None + accumulated_json = "" if se_type == "message_stop": - return False + return triggered # Fallback: full assistant message elif 
event.get("type") == "assistant": @@ -163,11 +275,15 @@ def run_single_query( continue tool_name = content_item.get("name", "") tool_input = content_item.get("input", {}) - if (tool_name == "Skill" and clean_name in tool_input.get("skill", "")) or ( - tool_name == "Read" and clean_name in tool_input.get("file_path", "") + if (tool_name == "Skill" and any( + skill_id in tool_input.get("skill", "") for skill_id in accepted_skill_ids + )) or ( + tool_name == "Read" + and any(skill_id in tool_input.get("file_path", "") for skill_id in accepted_skill_ids) ): triggered = True - return triggered + if triggered: + return True elif event.get("type") == "result": return triggered @@ -179,7 +295,7 @@ def run_single_query( return triggered finally: - if command_file.exists(): + if eval_mode == "alias" and command_file.exists(): command_file.unlink() @@ -192,39 +308,69 @@ def run_eval( project_root: Path, runs_per_query: int = 1, trigger_threshold: float = 0.5, + eval_mode: str = "auto", + skill_path: Path | None = None, + candidate_content: str | None = None, model: str | None = None, ) -> dict: """Run the full eval set and return results.""" results = [] - with ProcessPoolExecutor(max_workers=num_workers) as executor: - future_to_info = {} - for item in eval_set: - for run_idx in range(runs_per_query): - future = executor.submit( - run_single_query, - item["query"], - skill_name, - description, - timeout, - str(project_root), - model, - ) - future_to_info[future] = (item, run_idx) - - query_triggers: dict[str, list[bool]] = {} - query_items: dict[str, dict] = {} - for future in as_completed(future_to_info): - item, _ = future_to_info[future] - query = item["query"] - query_items[query] = item - if query not in query_triggers: - query_triggers[query] = [] - try: - query_triggers[query].append(future.result()) - except Exception as e: - print(f"Warning: query failed: {e}", file=sys.stderr) - query_triggers[query].append(False) + effective_mode = eval_mode + 
effective_project_root = project_root + worktree_cm = contextlib.nullcontext(project_root) + + if effective_mode == "auto": + if skill_path is not None and resolve_registered_skill_relpath(skill_path, project_root) is not None: + effective_mode = "registered" + else: + effective_mode = "alias" + + if effective_mode == "registered": + if skill_path is None: + raise ValueError("registered eval mode requires skill_path") + relpath = resolve_registered_skill_relpath(skill_path, project_root) + if relpath is None: + raise ValueError("registered eval mode requires skill_path under project_root/skills/*/SKILL.md") + _name, original_description, original_content = parse_skill_md(skill_path) + if candidate_content is None: + if description != original_description: + candidate_content = replace_description_in_skill_md(original_content, description) + else: + candidate_content = original_content + worktree_cm = candidate_worktree(project_root, relpath, candidate_content) + + with worktree_cm as active_project_root: + effective_project_root = active_project_root + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(effective_project_root), + effective_mode, + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + query_triggers[query].append(future.result()) + except Exception as e: + print(f"Warning: query failed: {e}", file=sys.stderr) + query_triggers[query].append(False) for query, triggers in query_triggers.items(): item = query_items[query] @@ -266,15 +412,17 @@ def main(): 
parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") parser.add_argument("--skill-path", required=True, help="Path to skill directory") parser.add_argument("--description", default=None, help="Override description to test") - parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--candidate-content-file", default=None, help="Optional full SKILL.md content to evaluate") + parser.add_argument("--eval-mode", choices=["auto", "registered", "alias"], default="auto", help="Evaluator mode") + parser.add_argument("--num-workers", type=int, default=1, help="Number of parallel workers") parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") - parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--runs-per-query", type=int, default=1, help="Number of runs per query") parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") args = parser.parse_args() - eval_set = json.loads(Path(args.eval_set).read_text()) + eval_set = load_eval_set(Path(args.eval_set)) skill_path = Path(args.skill_path) if not (skill_path / "SKILL.md").exists(): @@ -284,9 +432,11 @@ def main(): name, original_description, _content = parse_skill_md(skill_path) description = args.description or original_description project_root = find_project_root() + candidate_content = Path(args.candidate_content_file).read_text() if args.candidate_content_file else None if args.verbose: print(f"Evaluating: {description}", file=sys.stderr) + print(f"Eval mode: {args.eval_mode}", file=sys.stderr) output = run_eval( eval_set=eval_set, @@ -297,6 +447,9 @@ def main(): project_root=project_root, 
runs_per_query=args.runs_per_query, trigger_threshold=args.trigger_threshold, + eval_mode=args.eval_mode, + skill_path=skill_path, + candidate_content=candidate_content, model=args.model, ) diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py index 662b63f8..388a9e13 100644 --- a/scripts/tests/test_agent_comparison_optimize_loop.py +++ b/scripts/tests/test_agent_comparison_optimize_loop.py @@ -110,11 +110,161 @@ def fake_run(cmd, capture_output, text, cwd, env, timeout): generate_variant.main() output = json.loads(capsys.readouterr().out) - assert output["variant"] == "---\ndescription: updated\n---" + assert generate_variant.extract_description(output["variant"]) == "updated" assert output["tokens_used"] == 3 assert output["reasoning"] == "raw result" +def test_generate_variant_only_changes_description_field(monkeypatch): + generate_variant = load_module( + "agent_comparison_generate_variant_description_only", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + current_content = """--- +name: example-skill +description: | + old description +routing: + triggers: + - "keep-this-trigger" +--- + +# Skill + +Body stays the same. +""" + + def fake_run_claude_code(prompt, model): + return ( + "new description line 1\nnew description line 2" + "improved description", + "raw result", + 9, + ) + + monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code) + + result = generate_variant.generate_variant( + target_path="skills/example/SKILL.md", + goal="improve routing precision", + current_content=current_content, + failures=[], + model=None, + ) + + assert generate_variant.extract_description(result["variant"]) == "new description line 1\nnew description line 2" + assert ' - "keep-this-trigger"' in result["variant"] + assert "# Skill" in result["variant"] + assert "Body stays the same." 
in result["variant"] + assert result["deletions"] == [] + + +def test_generate_variant_legacy_full_file_output_is_reduced_to_description_only(monkeypatch): + generate_variant = load_module( + "agent_comparison_generate_variant_legacy_variant", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + current_content = """--- +name: example-skill +description: old description +routing: + triggers: + - "keep-this-trigger" +--- + +# Skill + +Original body. +""" + + legacy_variant = """--- +name: example-skill +description: updated description +routing: + triggers: + - "changed-trigger" +--- + +# Skill + +Changed body. +""" + + def fake_run_claude_code(prompt, model): + return ( + f"{legacy_variant}legacy response" + "", + "raw result", + 5, + ) + + monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code) + + result = generate_variant.generate_variant( + target_path="skills/example/SKILL.md", + goal="improve routing precision", + current_content=current_content, + failures=[], + model=None, + ) + + assert generate_variant.extract_description(result["variant"]) == "updated description" + assert ' - "keep-this-trigger"' in result["variant"] + assert ' - "changed-trigger"' not in result["variant"] + assert "Original body." in result["variant"] + assert "Changed body." 
not in result["variant"] + + +def test_generate_variant_prompt_includes_full_failed_query_and_expectation(monkeypatch): + generate_variant = load_module( + "agent_comparison_generate_variant_failure_context", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + current_content = """--- +name: example-skill +description: old description +--- + +# Skill +""" + + captured = {} + + def fake_run_claude_code(prompt, model): + captured["prompt"] = prompt + return ( + "updated description" + "improved description", + "raw result", + 4, + ) + + monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code) + + generate_variant.generate_variant( + target_path="skills/example/SKILL.md", + goal="improve routing precision", + current_content=current_content, + failures=[ + { + "name": "rubber duck this bug with me, don't solv", + "query": "rubber duck this bug with me, don't solve it yet", + "should_trigger": True, + "details": "trigger_rate=0.00", + "trigger_rate": 0.0, + } + ], + model=None, + ) + + assert "rubber duck this bug with me, don't solve it yet" in captured["prompt"] + assert "expected: SHOULD trigger" in captured["prompt"] + assert "raw_trigger_rate=0.00" in captured["prompt"] + + def test_optimize_loop_omits_model_flag_when_not_provided(tmp_path, monkeypatch): optimize_loop = load_module( "agent_comparison_optimize_loop_nomodel", @@ -231,7 +381,7 @@ def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None): ) assert result["status"] == "CONVERGED" - assert "2 rounds without KEEP" in result["exit_reason"] + assert "2 rounds without ACCEPT" in result["exit_reason"] def test_optimize_loop_beam_search_retains_top_k_candidates(tmp_path, monkeypatch): @@ -268,7 +418,9 @@ def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None): return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") def fake_assess_target(path, *args, **kwargs): - content = Path(path).read_text() + content = 
kwargs.get("candidate_content") + if content is None: + content = Path(path).read_text() score = 0.0 if "" in content: score = 1.2 @@ -313,3 +465,307 @@ def fake_assess_target(path, *args, **kwargs): selected = [it for it in result["iterations"] if it.get("selected_for_frontier")] assert len(selected) == 2 assert selected[0]["frontier_rank"] == 1 or selected[1]["frontier_rank"] == 1 + + +def test_composite_score_uses_weighted_dimensions_only_when_hard_gates_pass(): + optimize_loop = load_module( + "agent_comparison_optimize_loop_scoring", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + scores = { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": 7.5, + "error_handling": 6.0, + "language_idioms": 5.0, + "testing": 8.0, + "efficiency": 4.0, + } + + assert optimize_loop.composite_score(scores) == 6.55 + + +def test_composite_score_returns_zero_when_hard_gate_fails(): + optimize_loop = load_module( + "agent_comparison_optimize_loop_hard_gate", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + scores = { + "parses": False, + "compiles": True, + "tests_pass": False, + "protected_intact": True, + "correctness": 10.0, + "error_handling": 10.0, + "language_idioms": 10.0, + "testing": 10.0, + "efficiency": 10.0, + } + + assert optimize_loop.composite_score(scores) == 0.0 + + +def test_assess_target_scores_trigger_rate_results(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_trigger_score", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\ndescription: trigger scoring test\n---\n") + tasks = [ + {"query": "good query", "should_trigger": True}, + {"query": "bad query", "should_trigger": False}, + ] + + def fake_run_trigger_rate(*args, **kwargs): + return { + "summary": {"total": 2, "passed": 1, "failed": 1}, + "results": [ + {"query": "good query", "pass": True, "trigger_rate": 1.0}, + 
{"query": "bad query", "pass": False, "trigger_rate": 0.0}, + ], + } + + monkeypatch.setattr(optimize_loop, "_run_trigger_rate", fake_run_trigger_rate) + + scores = optimize_loop.assess_target( + target, + tasks, + "improve routing precision", + dry_run=False, + ) + + assert scores["correctness"] == 5.0 + assert scores["error_handling"] == 4.0 + assert scores["language_idioms"] == 3.5 + assert scores["testing"] == 4.0 + assert scores["efficiency"] == 3.6 + assert scores["tests_pass"] is False + assert [item["passed"] for item in scores["task_results"]] == [True, False] + assert scores["task_results"][0]["query"] == "good query" + assert scores["task_results"][0]["should_trigger"] is True + assert scores["task_results"][1]["query"] == "bad query" + assert scores["task_results"][1]["should_trigger"] is False + assert optimize_loop.composite_score(scores) == 4.285 + + +def test_assess_target_forwards_parallel_workers_for_behavioral_eval(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_behavioral_parallel", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\ndescription: behavioral scoring test\n---\n") + tasks = [ + {"query": "make a skill", "should_trigger": True, "eval_mode": "behavioral"}, + ] + seen = {} + + def fake_run_behavioral_eval(*args, **kwargs): + seen["parallel_workers"] = kwargs["parallel_workers"] + return [{"query": "make a skill", "pass": True, "triggered": True, "new_artifacts": ["skills/x/SKILL.md"]}] + + monkeypatch.setattr(optimize_loop, "_run_behavioral_eval", fake_run_behavioral_eval) + + scores = optimize_loop.assess_target( + target, + tasks, + "improve routing precision", + parallel_eval_workers=3, + ) + + assert seen["parallel_workers"] == 3 + assert scores["tests_pass"] is True + assert scores["correctness"] == 10.0 + assert scores["task_results"][0]["query"] == "make a skill" + assert scores["task_results"][0]["should_trigger"] is 
True + assert optimize_loop.composite_score(scores) == 8.45 + + +def test_run_optimization_loop_forwards_parallel_eval_to_assessments(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_parallel_forwarding", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n") + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "make a skill", "should_trigger": True, "eval_mode": "behavioral", "split": "train"}, + {"name": "test-negative", "query": "debug kubernetes", "should_trigger": False, "eval_mode": "behavioral", "split": "test"}, + ] + } + ) + ) + + calls = [] + + def fake_assess_target( + path, + tasks, + goal, + verbose=False, + dry_run=False, + behavioral_runs_per_task=1, + behavioral_trigger_threshold=0.5, + parallel_eval_workers=0, + candidate_content=None, + eval_mode="auto", + ): + calls.append( + { + "path": str(path), + "task_count": len(tasks), + "parallel_eval_workers": parallel_eval_workers, + "candidate_content": candidate_content, + "eval_mode": eval_mode, + } + ) + return { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": 10.0, + "error_handling": 8.0, + "language_idioms": 7.0, + "testing": 8.0, + "efficiency": 6.0, + "task_results": [{"name": "task", "passed": True}], + } + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.0, + train_split=0.6, + model=None, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + verbose=False, + dry_run=True, + parallel_eval=2, + ) + + assert result["status"] in {"COMPLETE", "CONVERGED"} + assert 
calls + assert all(call["parallel_eval_workers"] == 2 for call in calls) + assert all(call["candidate_content"] is not None for call in calls) + assert all(call["eval_mode"] == "auto" for call in calls) + + +def test_tiny_end_to_end_autoresearch_improves_real_weak_skill_copy(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_e2e", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + generate_variant = load_module( + "agent_comparison_generate_variant_e2e", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + source_skill = REPO_ROOT / "skills" / "socratic-debugging" / "SKILL.md" + target = tmp_path / "SKILL.md" + target.write_text(source_skill.read_text()) + + trigger_query = "help me think through this bug step by step" + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text(json.dumps({"tasks": [{"name": "positive", "query": trigger_query, "should_trigger": True, "split": "train"}]})) + + def fake_generate_variant_output( + current_content, + target_path, + goal, + last_failures, + history, + model, + dry_run, + iteration_number, + diversification_note=None, + ): + improved_description = ( + "Question-only debugging mode that guides users to find root causes through structured questions. " + f'Use when: "{trigger_query}", "rubber duck debug with me", "help me think through this bug".' 
+ ) + return { + "variant": generate_variant.replace_description(current_content, improved_description), + "summary": "Added exact positive trigger phrase to the description.", + "reasoning": "Deterministic test variant", + "tokens_used": 0, + "deletions": [], + "deletion_justification": "", + } + + def fake_run_trigger_rate( + target_path, + description, + tasks, + candidate_content=None, + eval_mode="auto", + num_workers=5, + timeout=30, + verbose=False, + ): + passed = trigger_query in description + return { + "results": [ + { + "query": trigger_query, + "pass": passed, + "trigger_rate": 1.0 if passed else 0.0, + } + ], + "summary": { + "total": 1, + "passed": 1 if passed else 0, + "failed": 0 if passed else 1, + }, + } + + monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output) + monkeypatch.setattr(optimize_loop, "_run_trigger_rate", fake_run_trigger_rate) + + out_dir = tmp_path / "out" + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.0, + train_split=0.6, + model=None, + output_dir=out_dir, + report_path=out_dir / "report.html", + verbose=False, + dry_run=False, + ) + + assert result["best_iteration"] == 1 + assert result["improvements_found"] == 1 + assert result["baseline_train_score"] == 0.06 + assert result["best_score"] == 8.45 + + results_json = json.loads((out_dir / "results.json").read_text()) + assert results_json["best_iteration"] == 1 + assert results_json["iterations"][0]["verdict"] == "ACCEPT" + + best_variant = (out_dir / "best_variant.md").read_text() + assert trigger_query in generate_variant.extract_description(best_variant) + + verdict_json = json.loads((out_dir / "001" / "verdict.json").read_text()) + assert verdict_json["verdict"] == "ACCEPT" + assert verdict_json["composite_score"] == 8.45 diff --git a/scripts/tests/test_skill_eval_claude_code.py 
b/scripts/tests/test_skill_eval_claude_code.py index a0c9e05c..b740b3fe 100644 --- a/scripts/tests/test_skill_eval_claude_code.py +++ b/scripts/tests/test_skill_eval_claude_code.py @@ -1,7 +1,9 @@ from __future__ import annotations import json +import os import subprocess +from contextlib import contextmanager from pathlib import Path @@ -46,3 +48,435 @@ def fake_run(cmd, capture_output, text, cwd, env, timeout): transcript = json.loads((tmp_path / "improve_iter_1.json").read_text()) assert transcript["raw_result_text"] == "raw result" assert transcript["rewrite_raw_result_text"] == "raw result" + + +class _FakeUUID: + hex = "deadbeefcafebabe" + + +class _FakePopen: + def __init__(self, stdout_bytes: bytes): + read_fd, write_fd = os.pipe() + os.write(write_fd, stdout_bytes) + os.close(write_fd) + self.stdout = os.fdopen(read_fd, "rb", buffering=0) + self._returncode = None + + def poll(self): + return self._returncode + + def kill(self): + self._returncode = -9 + + def wait(self): + return self._returncode + + +def test_run_single_query_ignores_unrelated_stream_tool_use_before_matching_read(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + clean_name = "demo-skill-skill-deadbeef" + stream_lines = [ + { + "type": "stream_event", + "event": {"type": "content_block_start", "content_block": {"type": "tool_use", "name": "Bash"}}, + }, + { + "type": "stream_event", + "event": {"type": "content_block_start", "content_block": {"type": "tool_use", "name": "Read"}}, + }, + { + "type": "stream_event", + "event": { + "type": "content_block_delta", + "delta": { + "type": "input_json_delta", + "partial_json": f'{{"file_path":"/tmp/project/.claude/commands/{clean_name}.md"}}', + }, + }, + }, + {"type": "stream_event", "event": {"type": "content_block_stop"}}, + {"type": "result"}, + ] + payload = ("\n".join(json.dumps(line) for line in stream_lines) + "\n").encode() + + monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID()) + 
monkeypatch.setattr(mod.subprocess, "Popen", lambda *args, **kwargs: _FakePopen(payload)) + monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], [])) + + triggered = mod.run_single_query( + query="help me debug this", + skill_name="demo-skill", + skill_description="demo description", + timeout=5, + project_root=str(tmp_path), + eval_mode="registered", + ) + + assert triggered is True + + +def test_run_single_query_scans_all_assistant_tool_uses_before_returning(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + clean_name = "demo-skill-skill-deadbeef" + assistant_lines = [ + { + "type": "assistant", + "message": { + "content": [ + {"type": "tool_use", "name": "Bash", "input": {"command": "echo hi"}}, + { + "type": "tool_use", + "name": "Read", + "input": {"file_path": f"/tmp/project/.claude/commands/{clean_name}.md"}, + }, + ] + }, + }, + {"type": "result"}, + ] + payload = ("\n".join(json.dumps(line) for line in assistant_lines) + "\n").encode() + + monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID()) + monkeypatch.setattr(mod.subprocess, "Popen", lambda *args, **kwargs: _FakePopen(payload)) + monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], [])) + + triggered = mod.run_single_query( + query="help me debug this", + skill_name="demo-skill", + skill_description="demo description", + timeout=5, + project_root=str(tmp_path), + eval_mode="registered", + ) + + assert triggered is True + + +def test_run_single_query_accepts_real_skill_name_not_just_temporary_alias(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + assistant_lines = [ + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "name": "Skill", + "input": {"skill": "demo-skill"}, + } + ] + }, + }, + {"type": "result"}, + ] + payload = ("\n".join(json.dumps(line) for line in assistant_lines) + "\n").encode() + + monkeypatch.setattr(mod.uuid, "uuid4", lambda: 
_FakeUUID()) + monkeypatch.setattr(mod.subprocess, "Popen", lambda *args, **kwargs: _FakePopen(payload)) + monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], [])) + + triggered = mod.run_single_query( + query="help me debug this", + skill_name="demo-skill", + skill_description="demo description", + timeout=5, + project_root=str(tmp_path), + eval_mode="registered", + ) + + assert triggered is True + + +def test_resolve_registered_skill_relpath_accepts_repo_skill(tmp_path): + from scripts.skill_eval import run_eval as mod + + project_root = tmp_path + skill_dir = project_root / "skills" / "demo-skill" + skill_dir.mkdir(parents=True) + (skill_dir / "SKILL.md").write_text("---\nname: demo-skill\ndescription: demo\n---\n") + + relpath = mod.resolve_registered_skill_relpath(skill_dir, project_root) + + assert relpath == Path("skills/demo-skill/SKILL.md") + + +def test_replace_description_in_skill_md_rewrites_frontmatter_block_scalar(): + from scripts.skill_eval import run_eval as mod + + original = """--- +name: demo-skill +description: | + old description +version: 1.0.0 +--- + +# Skill +""" + + updated = mod.replace_description_in_skill_md(original, "new description line 1\nnew description line 2") + + assert "description: |\n new description line 1\n new description line 2\nversion: 1.0.0" in updated + assert "# Skill" in updated + + +def test_load_eval_set_accepts_common_wrapped_formats(tmp_path): + from scripts.skill_eval import run_eval as mod + + tasks_path = tmp_path / "tasks.json" + tasks_path.write_text(json.dumps({"tasks": [{"query": "q1", "should_trigger": True}]})) + queries_path = tmp_path / "queries.json" + queries_path.write_text(json.dumps({"queries": [{"query": "q2", "should_trigger": False}]})) + split_path = tmp_path / "split.json" + split_path.write_text( + json.dumps( + { + "train": [{"query": "q3", "should_trigger": True}], + "test": [{"query": "q4", "should_trigger": False}], + } + ) + ) + + assert 
mod.load_eval_set(tasks_path) == [{"query": "q1", "should_trigger": True}] + assert mod.load_eval_set(queries_path) == [{"query": "q2", "should_trigger": False}] + assert mod.load_eval_set(split_path) == [ + {"query": "q3", "should_trigger": True}, + {"query": "q4", "should_trigger": False}, + ] + + +def test_run_eval_auto_uses_registered_worktree_for_repo_skill(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + skill_dir = tmp_path / "skills" / "demo-skill" + skill_dir.mkdir(parents=True) + (skill_dir / "SKILL.md").write_text("---\nname: demo-skill\ndescription: demo\n---\n") + worktree_root = tmp_path / "worktree" + worktree_root.mkdir() + + seen = {"candidate_content": None, "submitted": []} + + @contextmanager + def fake_candidate_worktree(project_root, registered_skill_relpath, candidate_content): + seen["candidate_content"] = candidate_content + seen["registered_skill_relpath"] = registered_skill_relpath + yield worktree_root + + class _FakeFuture: + def __init__(self, value): + self._value = value + + def result(self): + return self._value + + class _FakeExecutor: + def __init__(self, max_workers): + self.max_workers = max_workers + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def submit(self, fn, *args): + seen["submitted"].append(args) + return _FakeFuture(True) + + monkeypatch.setattr(mod, "candidate_worktree", fake_candidate_worktree) + monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor) + monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures)) + + result = mod.run_eval( + eval_set=[{"query": "help me debug this", "should_trigger": True}], + skill_name="demo-skill", + description="demo description", + num_workers=1, + timeout=5, + project_root=tmp_path, + eval_mode="auto", + skill_path=skill_dir, + candidate_content="candidate body", + ) + + assert seen["candidate_content"] == "candidate body" + assert seen["registered_skill_relpath"] == 
Path("skills/demo-skill/SKILL.md") + assert seen["submitted"] + _, _, _, _, submitted_project_root, submitted_eval_mode, _ = seen["submitted"][0] + assert submitted_project_root == str(worktree_root) + assert submitted_eval_mode == "registered" + assert result["summary"]["passed"] == 1 + + +def test_run_eval_registered_mode_patches_candidate_from_description_override(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + skill_dir = tmp_path / "skills" / "demo-skill" + skill_dir.mkdir(parents=True) + original_content = """--- +name: demo-skill +description: old description +version: 1.0.0 +--- + +# Skill +""" + (skill_dir / "SKILL.md").write_text(original_content) + seen = {"candidate_content": None} + + @contextmanager + def fake_candidate_worktree(project_root, registered_skill_relpath, candidate_content): + seen["candidate_content"] = candidate_content + yield tmp_path / "worktree" + + class _FakeFuture: + def __init__(self, value): + self._value = value + + def result(self): + return self._value + + class _FakeExecutor: + def __init__(self, max_workers): + self.max_workers = max_workers + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def submit(self, fn, *args): + return _FakeFuture(True) + + monkeypatch.setattr(mod, "candidate_worktree", fake_candidate_worktree) + monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor) + monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures)) + + mod.run_eval( + eval_set=[{"query": "help me debug this", "should_trigger": True}], + skill_name="demo-skill", + description="new description", + num_workers=1, + timeout=5, + project_root=tmp_path, + eval_mode="registered", + skill_path=skill_dir, + candidate_content=None, + ) + + assert seen["candidate_content"] is not None + assert "description: |\n new description\nversion: 1.0.0" in seen["candidate_content"] + + +def 
test_run_eval_registered_mode_patches_current_working_copy_when_no_override(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + skill_dir = tmp_path / "skills" / "demo-skill" + skill_dir.mkdir(parents=True) + original_content = """--- +name: demo-skill +description: current working copy description +version: 1.0.0 +--- + +# Skill +""" + (skill_dir / "SKILL.md").write_text(original_content) + seen = {"candidate_content": None} + + @contextmanager + def fake_candidate_worktree(project_root, registered_skill_relpath, candidate_content): + seen["candidate_content"] = candidate_content + yield tmp_path / "worktree" + + class _FakeFuture: + def __init__(self, value): + self._value = value + + def result(self): + return self._value + + class _FakeExecutor: + def __init__(self, max_workers): + self.max_workers = max_workers + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def submit(self, fn, *args): + return _FakeFuture(True) + + monkeypatch.setattr(mod, "candidate_worktree", fake_candidate_worktree) + monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor) + monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures)) + + mod.run_eval( + eval_set=[{"query": "help me debug this", "should_trigger": True}], + skill_name="demo-skill", + description="current working copy description", + num_workers=1, + timeout=5, + project_root=tmp_path, + eval_mode="registered", + skill_path=skill_dir, + candidate_content=None, + ) + + assert seen["candidate_content"] == original_content + + +def test_run_eval_auto_falls_back_to_alias_for_non_registered_skill(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + skill_dir = tmp_path / "scratch" / "demo-skill" + skill_dir.mkdir(parents=True) + (skill_dir / "SKILL.md").write_text("---\nname: demo-skill\ndescription: demo\n---\n") + + seen_submissions = [] + + class _FakeFuture: + def __init__(self, value): + self._value = 
value + + def result(self): + return self._value + + class _FakeExecutor: + def __init__(self, max_workers): + self.max_workers = max_workers + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def submit(self, fn, *args): + seen_submissions.append(args) + return _FakeFuture(False) + + monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor) + monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures)) + + result = mod.run_eval( + eval_set=[{"query": "help me debug this", "should_trigger": True}], + skill_name="demo-skill", + description="demo description", + num_workers=1, + timeout=5, + project_root=tmp_path, + eval_mode="auto", + skill_path=skill_dir, + ) + + assert seen_submissions + _, _, _, _, submitted_project_root, submitted_eval_mode, _ = seen_submissions[0] + assert submitted_project_root == str(tmp_path) + assert submitted_eval_mode == "alias" + assert result["summary"]["passed"] == 0 diff --git a/skills/agent-comparison/SKILL.md b/skills/agent-comparison/SKILL.md index 21e8c150..6d934786 100644 --- a/skills/agent-comparison/SKILL.md +++ b/skills/agent-comparison/SKILL.md @@ -308,15 +308,15 @@ The loop automatically evaluates the unmodified target against the train set bef **Step 4: Enter optimization loop** The `optimize_loop.py` script handles the full loop: -- Calls `generate_variant.py` to propose changes through `claude -p` +- Calls `generate_variant.py` to propose a new frontmatter `description` through `claude -p` - Evaluates each variant against train tasks - Runs either: - single-path hill climbing: `--beam-width 1 --candidates-per-parent 1` - beam search with top-K retention: keep the best `K` improving candidates each round -- Keeps variants that beat their parent by more than `--min-gain` (default 0.02) -- Reverts variants that don't improve, break hard gates, or delete sections without justification +- Accepts variants that beat their parent by more than `--min-gain` (default 
0.02) +- Rejects variants that fail to improve or that break hard gates - Checks held-out test set every `--holdout-check-cadence` rounds for Goodhart divergence -- Stops on convergence (`--revert-streak-limit` rounds without any KEEP), Goodhart alarm, or max iterations +- Stops on convergence (`--revert-streak-limit` rounds without any ACCEPT), Goodhart alarm, or max iterations ```bash python3 skills/agent-comparison/scripts/optimize_loop.py \ @@ -340,23 +340,33 @@ Omit `--model` to use Claude Code's configured default model, or pass it explici The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and review/export controls. Recommended modes: -- Fast single-path optimization: `--beam-width 1 --candidates-per-parent 1` +- Short default optimization: default flags only +- Fast single-path optimization: `--beam-width 1 --candidates-per-parent 1 --max-iterations 3 --revert-streak-limit 3` - True autoresearch sweep: `--max-iterations 20 --beam-width 3 --candidates-per-parent 2 --revert-streak-limit 20` - Conservative search with strict keeps: raise `--min-gain` above `0.02` - Exploratory search that accepts small wins: use `--min-gain 0.0` +Live eval defaults are intentionally short: +- one optimization round +- one trigger-eval run per query +- one trigger-eval worker +- no holdout cadence unless explicitly requested + +For real repo skills at `skills/<name>/SKILL.md`, the live evaluator now prefers an isolated git worktree so the candidate content is scored at the real skill path. This is the default `--eval-mode auto` behavior and avoids scoring the installed skill instead of the candidate. +The registered-skill path also evaluates the current working copy, not just `HEAD`, so local uncommitted edits are measured correctly. + **Step 5: Present results in UI** Open the generated `optimization-report.html` in a browser. 
The report shows: -- Progress dashboard (status, baseline vs best, kept/reverted counts) +- Progress dashboard (status, baseline vs best, accepted/rejected counts) - Convergence chart (train solid line, held-out dashed line, baseline dotted) - Iteration table with verdict, composite score, delta, and change summary - Expandable inline diffs per iteration (click any row) -**Step 6: Review kept snapshots** +**Step 6: Review accepted snapshots** -Not all KEEP iterations are real improvements — some may be harness artifacts. The user reviews the kept iterations as candidate snapshots from the original target: -- Inspect each kept iteration's diff in the report +Not all ACCEPT iterations are real improvements — some may be harness artifacts. The user reviews the accepted iterations as candidate snapshots from the original target: +- Inspect each accepted iteration's diff in the report - Use "Preview Selected Snapshot" only as a comparison aid in the UI - Use "Export Selected" to download a review JSON describing the selected snapshot diff - In beam mode, review the retained frontier candidates first; they are the strongest candidates from the latest round @@ -365,15 +375,16 @@ Not all KEEP iterations are real improvements — some may be harness artifacts. Apply one reviewed improvement to the original target file. -- If you want the best single kept variant, use `evals/iterations/best_variant.md`. -- Beam search still writes a single `best_variant.md`: the highest-scoring kept candidate seen anywhere in the run. -- If you exported selected diffs, treat that JSON as review material only. It is not auto-applied by the current tooling, and the current workflow does not support merging multiple kept diffs into a generated patch. +- If you want the best single accepted variant, use `evals/iterations/best_variant.md`. +- Beam search still writes a single `best_variant.md`: the highest-scoring accepted candidate seen anywhere in the run. 
+- The current optimizer is description-only, so diffs should stay confined to the frontmatter `description`. +- If you exported selected diffs, treat that JSON as review material only. It is not auto-applied by the current tooling, and the current workflow does not support merging multiple accepted diffs into a generated patch. ```bash -# Review the best kept variant before applying +# Review the best accepted variant before applying cat evals/iterations/best_variant.md | head -20 -# Replace the target with the best kept variant +# Replace the target with the best accepted variant cp evals/iterations/best_variant.md skills/{target}/SKILL.md ``` @@ -397,11 +408,23 @@ Compare final scores to the baseline to confirm net improvement. In beam mode, t python3 scripts/learning-db.py learn \ --skill agent-comparison \ "autoresearch: {target} improved {baseline}→{best} over {iterations} iterations. \ - Kept: {kept}/{total}. Stop: {reason}. Changes: {summaries}" + Accepted: {accepted}/{total}. Stop: {reason}. Changes: {summaries}" ``` **Gate**: Optimization complete. Results reviewed. Cherry-picked improvements applied and verified against full task set. Results recorded. +### Current Reality Check + +The current optimizer is in a solid state for: +- deterministic proof runs +- isolated live evaluation of existing registered skills +- short live optimization of `read-only-ops`, with the accepted description change now applied and validated against `references/read-only-ops-short-tasks.json` + +One live-harness caveat remains: +- temporary renamed skill copies do not yet show reliable live trigger improvements through the dynamic command alias path + +That caveat does not affect deterministic proof runs or live checks against existing registered skills, but it does mean the current system is stronger for optimizing real in-repo skills than arbitrary renamed temp clones. + ### Optional Extensions These are off by default. 
Enable explicitly when needed: diff --git a/skills/agent-comparison/references/optimization-guide.md b/skills/agent-comparison/references/optimization-guide.md index 3aa0f6a8..6101ecd7 100644 --- a/skills/agent-comparison/references/optimization-guide.md +++ b/skills/agent-comparison/references/optimization-guide.md @@ -80,8 +80,29 @@ Explicit train/test sets: If no split markers are present, the loop performs a reproducible random split using `--train-split` and seed `42`. +`run_eval.py` now accepts the same common task-file wrappers: + +- raw list: `[{"query": "...", "should_trigger": true}]` +- task wrapper: `{"tasks": [...]}` +- query wrapper: `{"queries": [...]}` +- split wrapper: `{"train": [...], "test": [...]}` + ## Command +Short default run: + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/go-testing/SKILL.md \ + --goal "improve routing precision without losing recall" \ + --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \ + --report optimization-report.html \ + --output-dir evals/iterations \ + --verbose +``` + +Longer search: + ```bash python3 skills/agent-comparison/scripts/optimize_loop.py \ --target skills/go-testing/SKILL.md \ @@ -106,20 +127,45 @@ Useful flags: - `--dry-run`: exercise the loop mechanics without calling Claude Code - `--report`: write a live HTML report - `--output-dir`: persist iteration snapshots and `results.json` +- `--eval-mode auto|registered|alias`: choose how live trigger eval is isolated - `--beam-width`: retain the best K improving candidates per round - `--candidates-per-parent`: generate multiple sibling variants from each frontier candidate -- `--revert-streak-limit`: stop after N rounds without any KEEP candidates +- `--revert-streak-limit`: stop after N rounds without any ACCEPT candidates - `--holdout-check-cadence`: evaluate the global best on held-out tasks every N rounds +- `--parallel-eval N`: run behavioral eval tasks in parallel 
isolated worktrees + +Short defaults: + +- `--max-iterations 1` +- `--revert-streak-limit 1` +- `--holdout-check-cadence 0` +- trigger eval `--num-workers 1` +- trigger eval `--runs-per-query 1` Recommended search presets: +- Short proof run: + - default flags only - Single-path local search: - - `--beam-width 1 --candidates-per-parent 1` + - `--beam-width 1 --candidates-per-parent 1 --max-iterations 3 --revert-streak-limit 3` - Balanced beam search: - `--beam-width 3 --candidates-per-parent 2` - Aggressive exploration: - `--beam-width 5 --candidates-per-parent 3 --min-gain 0.0` +## Live Eval Isolation Modes + +`run_eval.py` now has three modes: + +- `auto`: default. If the target is a real repo skill at `skills/<name>/SKILL.md`, live eval runs in an isolated git worktree with the candidate content patched into the real path. Otherwise it falls back to alias mode. +- `registered`: force isolated worktree evaluation of a real registered skill. +- `alias`: force legacy dynamic command-file evaluation. + +For real registered skills, `auto` is the preferred mode. It prevents the evaluator +from accidentally scoring the installed skill instead of the candidate under test. +It also patches the current working-copy skill content into the isolated worktree, +so local uncommitted edits are evaluated correctly. + ## Evaluation Model The loop follows the ADR-131 structure: @@ -131,11 +177,10 @@ The loop follows the ADR-131 structure: ### Layer 1: Hard Gates -An iteration is rejected immediately if any of these fail: +An iteration is rejected immediately if any of these mechanical validity gates fail: - `parses` - `compiles` -- `tests_pass` - `protected_intact` For description optimization, `parses` and `protected_intact` are the most @@ -144,9 +189,13 @@ preserved verbatim. ### Layer 2: Composite Score -The loop converts trigger-rate evaluation results into a weighted composite -score using the built-in weights in `optimize_loop.py`. 
A candidate is kept only -if it beats its parent by more than `--min-gain`. +The loop converts evaluation results into a weighted composite score using the +built-in weights in `optimize_loop.py`. Task accuracy affects the component +dimensions (`correctness`, `error_handling`, `language_idioms`, `testing`, +`efficiency`) without zeroing the entire score. This preserves optimization +signal for incremental improvements when a task set is not yet perfect. + +A candidate is accepted only if it beats its parent by more than `--min-gain`. ### Layer 3: Held-Out Regression Check @@ -161,21 +210,22 @@ When beam search is enabled: - each frontier candidate generates `--candidates-per-parent` siblings - every sibling is scored independently -- the top `--beam-width` KEEP candidates become the next frontier +- the top `--beam-width` ACCEPT candidates become the next frontier - `best_variant.md` still tracks the single best candidate seen anywhere in the run When `--beam-width 1 --candidates-per-parent 1`, the behavior collapses back to the original single-path optimizer. -## Deletion Safety Rule +## Description-Only Scope -Deleting sections is allowed only with explicit justification. +The current optimizer mutates only the YAML frontmatter `description`. -- `generate_variant.py` detects removed `##` headings -- the model must return a `deletion_justification` -- `optimize_loop.py` rejects deletions without one +- `generate_variant.py` asks the model for an improved description, not a full-file rewrite +- the variant file is reconstructed by replacing that one field in the original content +- eval results therefore measure the same surface area that generation changes -This enforces ADR-131's "no deletion without justification" rule. +This avoids false negatives where the model improves routing blocks or body text +that the evaluator does not read. 
## Iteration Artifacts @@ -193,10 +243,25 @@ When `--output-dir` is set, the loop writes: When `--report` is set, it also writes a live HTML dashboard showing: -- status, baseline, best score, kept/reverted counts +- status, baseline, best score, accepted/rejected counts - convergence chart - iteration table with diffs -- review/export controls for kept snapshot diffs from the original target +- review/export controls for accepted snapshot diffs from the original target + +## Current Validation Status + +What is currently demonstrated: +- deterministic end-to-end improvement runs with readable artifacts +- isolated live optimization for existing registered skills via temporary git worktrees +- score calculations and accept/reject decisions that match the weighted rubric +- short live proof on `skills/read-only-ops/SKILL.md` using + `references/read-only-ops-short-tasks.json`, improving from one failed positive + to `2/2` live passes after the accepted description update + +What remains imperfect: +- live optimization of temporary renamed skill copies still fails to show measured improvement through the dynamic command alias path + +So the current tooling is operational for real registered skills and deterministic proof runs, but not yet fully proven for arbitrary temporary renamed clones. 
## Choosing Good Eval Tasks diff --git a/skills/agent-comparison/references/read-only-ops-short-tasks.json b/skills/agent-comparison/references/read-only-ops-short-tasks.json new file mode 100644 index 00000000..dad16d94 --- /dev/null +++ b/skills/agent-comparison/references/read-only-ops-short-tasks.json @@ -0,0 +1,16 @@ +{ + "tasks": [ + { + "name": "positive-read-only-report", + "query": "inspect this repository and report back without changing anything", + "should_trigger": true, + "split": "train" + }, + { + "name": "negative-fix-tests", + "query": "fix the failing tests in this repository", + "should_trigger": false, + "split": "train" + } + ] +} diff --git a/skills/agent-comparison/scripts/generate_variant.py b/skills/agent-comparison/scripts/generate_variant.py index 31cb2446..f0707507 100644 --- a/skills/agent-comparison/scripts/generate_variant.py +++ b/skills/agent-comparison/scripts/generate_variant.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 -"""Generate a variant of an agent/skill file using Claude Code. +"""Generate a description-optimized variant of an agent/skill file using Claude Code. -Proposes modifications to improve the target file based on the optimization -goal and previous iteration failures. Preserves protected sections marked -with DO NOT OPTIMIZE markers. +Proposes a new frontmatter description based on the optimization goal and +previous iteration failures, then applies that description back onto the +original file. This keeps the optimization surface aligned with what the +evaluator actually measures. Pattern: uses `claude -p` so generation runs through Claude Code directly. 
@@ -17,8 +18,8 @@ Output (JSON to stdout): { - "variant": "full file content...", - "summary": "Added CRITICAL warning for error wrapping", + "variant": "full file content with updated description...", + "summary": "Added concrete trigger phrases to the description", "deletion_justification": "", "reasoning": "Extended thinking content...", "tokens_used": 12345 @@ -86,6 +87,92 @@ def detect_deletions(original: str, variant: str) -> list[str]: return sorted(orig_headings - var_headings) +# --------------------------------------------------------------------------- +# Description-only optimization helpers +# --------------------------------------------------------------------------- + + +def extract_description(content: str) -> str: + """Extract frontmatter description text from a markdown file.""" + lines = content.split("\n") + if not lines or lines[0].strip() != "---": + raise ValueError("Content missing frontmatter opening delimiter") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + raise ValueError("Content missing frontmatter closing delimiter") + + fm_lines = lines[1:end_idx] + idx = 0 + while idx < len(fm_lines): + line = fm_lines[idx] + if line.startswith("description:"): + value = line[len("description:") :].strip() + if value in (">", "|", ">-", "|-"): + parts: list[str] = [] + idx += 1 + while idx < len(fm_lines) and (fm_lines[idx].startswith(" ") or fm_lines[idx].startswith("\t")): + parts.append(fm_lines[idx].strip()) + idx += 1 + return "\n".join(parts).strip() + return value.strip('"').strip("'").strip() + idx += 1 + + raise ValueError("Content missing frontmatter description") + + +def replace_description(content: str, new_description: str) -> str: + """Replace the frontmatter description while preserving all other content verbatim.""" + lines = content.split("\n") + if not lines or lines[0].strip() != "---": + raise ValueError("Content missing frontmatter 
opening delimiter") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + raise ValueError("Content missing frontmatter closing delimiter") + + fm_lines = lines[1:end_idx] + start_idx = None + stop_idx = None + idx = 0 + while idx < len(fm_lines): + line = fm_lines[idx] + if line.startswith("description:"): + start_idx = idx + value = line[len("description:") :].strip() + stop_idx = idx + 1 + if value in (">", "|", ">-", "|-"): + stop_idx = idx + 1 + while stop_idx < len(fm_lines) and ( + fm_lines[stop_idx].startswith(" ") or fm_lines[stop_idx].startswith("\t") + ): + stop_idx += 1 + break + idx += 1 + + if start_idx is None or stop_idx is None: + raise ValueError("Content missing frontmatter description") + + normalized = new_description.strip() + replacement = ["description: |"] + if normalized: + replacement.extend(f" {line}" if line else " " for line in normalized.splitlines()) + else: + replacement.append(" ") + + new_fm_lines = fm_lines[:start_idx] + replacement + fm_lines[stop_idx:] + rebuilt_lines = ["---", *new_fm_lines, "---", *lines[end_idx + 1 :]] + return "\n".join(rebuilt_lines) + + # --------------------------------------------------------------------------- # Variant generation # --------------------------------------------------------------------------- @@ -162,7 +249,20 @@ def generate_variant( if failures: failure_section = "\n\nFailed tasks from the last iteration:\n" for f in failures: - failure_section += f" - {f.get('name', 'unnamed')}: {f.get('details', 'failed')}\n" + label = f.get("query") or f.get("name", "unnamed") + should_trigger = f.get("should_trigger") + expectation = "" + if should_trigger is True: + expectation = " (expected: SHOULD trigger)" + elif should_trigger is False: + expectation = " (expected: should NOT trigger)" + detail_bits = [] + if f.get("details"): + detail_bits.append(str(f["details"])) + if "trigger_rate" in f: + 
detail_bits.append(f"raw_trigger_rate={f['trigger_rate']:.2f}") + details = "; ".join(detail_bits) if detail_bits else "failed" + failure_section += f" - {label}{expectation}: {details}\n" history_section = "" if history: @@ -188,7 +288,9 @@ def generate_variant( This is non-negotiable: protected sections contain safety gates that must not be removed even if removing them would improve test scores.""" - prompt = f"""You are optimizing an agent/skill file to improve its performance. + current_description = extract_description(current_content) + + prompt = f"""You are optimizing an agent/skill file to improve its trigger performance. Target file: {target_path} Optimization goal: {goal} @@ -197,36 +299,45 @@ def generate_variant( {current_content} +Current description: + +{current_description} + {failure_section}{history_section}{diversification_section}{protected_notice} SAFETY RULES: -1. Do NOT delete sections without replacing them with equivalent or better content. - If you remove a section heading that exists in the original, you must explain what - replaces the removed functionality. Pure deletion degrades unmeasured capabilities. +1. Optimize ONLY the YAML frontmatter `description` field. + Do not modify any other part of the file. The optimizer evaluates description-trigger + quality only, so changing routing blocks, body text, or headings is out of scope. -2. Do NOT change the tools, SDKs, or interfaces the agent uses. The variant must work - in the same environment as the original (no switching from SDK to curl, etc.). +2. Keep the description faithful to the file's actual purpose. Improve routing precision + by making the description clearer and more triggerable, not by changing the behavior + or scope of the skill. -3. Keep YAML frontmatter structure intact (name, description, routing, etc.). +3. Keep the skill name, routing, tools, instructions, and all protected sections unchanged. -4. Focus on making the agent/skill better at achieving the stated goal. 
Common +4. Focus on making the description better at achieving the stated goal. Common improvements include: - - Moving critical information to more prominent positions (CRITICAL banners) - - Adding explicit planning steps before code generation - - Improving error handling instructions with specific patterns - - Adding concrete examples for ambiguous instructions - - Restructuring for clarity when sections are dense - -Please respond with the complete modified file content inside tags, -and a brief summary of what you changed and why inside tags. - -If you removed any existing `##` section heading, include a brief justification -inside tags. If you did not remove a section, return -empty tags. - - -[complete file content here] - + - Including natural user phrasings that should trigger this skill + - Making the first sentence more concrete and specific + - Removing vague wording that overlaps with unrelated skills + - Adding concise usage examples when they help routing + +5. Treat failed eval tasks as primary routing evidence: + - If a task SHOULD have triggered but did not, strongly prefer copying the exact + user phrasing or a very close paraphrase into the description. + - If a task should NOT have triggered, add clarifying language that separates this + skill from that request without expanding scope. + - Optimize for the smallest description change that would make the failed tasks + more likely to score correctly on the next run. + +Please respond with ONLY the improved description text inside tags, +without YAML quoting or frontmatter delimiters, and a brief summary inside tags. +Do not return the full file. 
+ + +[improved description only] + [1-2 sentence description of the change] @@ -238,13 +349,20 @@ def generate_variant( text, raw_result_text, tokens_used = _run_claude_code(prompt, model) - # Parse variant content - variant_match = re.search(r"(.*?)", text, re.DOTALL) - if not variant_match: - print("Error: No tags in response", file=sys.stderr) - sys.exit(1) - - variant = variant_match.group(1).strip() + # Parse improved description. Accept legacy output as a fallback + # so older prompt responses and tests remain parseable. + description_match = re.search(r"(.*?)", text, re.DOTALL) + if description_match: + new_description = description_match.group(1).strip() + else: + variant_match = re.search(r"(.*?)", text, re.DOTALL) + if not variant_match: + print("Error: No or tags in response", file=sys.stderr) + sys.exit(1) + legacy_variant = variant_match.group(1).strip() + new_description = extract_description(legacy_variant) + + variant = replace_description(current_content, new_description) # Parse summary summary_match = re.search(r"(.*?)", text, re.DOTALL) @@ -253,13 +371,12 @@ def generate_variant( deletion_match = re.search(r"(.*?)", text, re.DOTALL) deletion_justification = deletion_match.group(1).strip() if deletion_match else "" - # Restore protected sections (safety net) + # Restore protected sections (safety net); should be a no-op when only the + # description changes, but keep it as belt-and-suspenders protection. variant = restore_protected(current_content, variant) - # Check for unauthorized deletions + # Description-only optimization should never delete sections. 
deletions = detect_deletions(current_content, variant) - if deletions: - print(f"Warning: Deleted sections: {deletions}", file=sys.stderr) return { "variant": variant, diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index 15c11182..b8987c34 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -20,11 +20,13 @@ from __future__ import annotations import argparse +import concurrent.futures import glob import json import os import random import re +import shutil import subprocess import sys import tempfile @@ -43,7 +45,10 @@ "efficiency": 0.10, } -HARD_GATE_KEYS = ["parses", "compiles", "tests_pass", "protected_intact"] +# Hard gates should capture mechanical invalidity, not evaluation quality. +# Routing/task accuracy is already reflected in the weighted dimensions below; +# zeroing the whole composite on any failed task destroys the optimization signal. +HARD_GATE_KEYS = ["parses", "compiles", "protected_intact"] def passes_hard_gates(scores: dict) -> bool: @@ -273,7 +278,7 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: rows = "" for it in iterations: v = it["verdict"] - vcls = {"KEEP": "keep", "REVERT": "revert", "STOP": "stop"}.get(v, "") + vcls = {"ACCEPT": "accept", "REJECT": "reject", "STOP": "stop"}.get(v, "") sc = it["score"] train_score = sc.get("train") test_score = sc.get("test") @@ -284,7 +289,7 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: dcls = "d-pos" if delta.startswith("+") and delta != "+0" else "d-neg" if delta.startswith("-") else "d-zero" summary = html_mod.escape(str(it.get("change_summary", ""))[:80]) diff_esc = html_mod.escape(str(it.get("diff", ""))) - is_keep = v == "KEEP" + is_keep = v == "ACCEPT" n = it["number"] rows += f""" @@ -310,8 +315,8 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: bt = 
baseline.get("train", 0.0) best = max((it["score"].get("train", bt) for it in iterations), default=bt) - kept = sum(1 for it in iterations if it["verdict"] == "KEEP") - reverted = sum(1 for it in iterations if it["verdict"] == "REVERT") + accepted = sum(1 for it in iterations if it["verdict"] == "ACCEPT") + rejected = sum(1 for it in iterations if it["verdict"] == "REJECT") cur = len(iterations) mx = data.get("max_iterations", 20) scls = "running" if status == "RUNNING" else "done" if status in ("CONVERGED", "COMPLETE") else "alarm" @@ -345,8 +350,8 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: .iter-row:hover {{ background:var(--surface-2); }} .diff-row td {{ padding:0; }} .diff-block {{ background:#080b0f;padding:12px;font-family:var(--font-mono);font-size:11px;max-height:400px;overflow:auto;white-space:pre;line-height:1.5;color:var(--muted); }} -.verdict-keep {{ color:var(--green);font-weight:600; }} -.verdict-revert {{ color:var(--red);font-weight:600; }} +.verdict-accept {{ color:var(--green);font-weight:600; }} +.verdict-reject {{ color:var(--red);font-weight:600; }} .verdict-stop {{ color:var(--yellow);font-weight:600; }} .d-pos {{ color:var(--green);font-weight:600; }} .d-neg {{ color:var(--red);font-weight:600; }} @@ -367,8 +372,8 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str:
Progress{cur}/{mx}
Baseline{bt:.2f}
Best{best:.2f} ({best - bt:+.2f})
-
Kept{kept}
-
Reverted{reverted}
+
Accepted{accepted}
+
Rejected{rejected}

{score_label}

@@ -636,7 +641,9 @@ def _run_trigger_rate( target_path: Path, description: str, tasks: list[dict], - num_workers: int = 5, + candidate_content: str | None = None, + eval_mode: str = "auto", + num_workers: int = 1, timeout: int = 30, verbose: bool = False, ) -> dict: @@ -651,39 +658,47 @@ def _run_trigger_rate( task_file = f.name json.dump(tasks, f) - with tempfile.TemporaryDirectory() as skill_dir: - skill_md = Path(skill_dir) / "SKILL.md" - skill_md.write_text(target_path.read_text()) - - project_root = Path.cwd() - for parent in [project_root, *project_root.parents]: - if (parent / ".claude").is_dir(): - project_root = parent - break - - cmd = [ - sys.executable, - "-m", - "scripts.skill_eval.run_eval", - "--eval-set", - task_file, - "--skill-path", - skill_dir, - "--description", - description, - "--num-workers", - str(num_workers), - "--timeout", - str(timeout), - "--runs-per-query", - "1", - ] - if verbose: - cmd.append("--verbose") - print(f"Running trigger assessment: {len(tasks)} queries", file=sys.stderr) + project_root = Path.cwd() + for parent in [project_root, *project_root.parents]: + if (parent / ".claude").is_dir(): + project_root = parent + break + + cmd = [ + sys.executable, + "-m", + "scripts.skill_eval.run_eval", + "--eval-set", + task_file, + "--skill-path", + str(target_path.parent), + "--description", + description, + "--eval-mode", + eval_mode, + "--num-workers", + str(num_workers), + "--timeout", + str(timeout), + "--runs-per-query", + "1", + ] + if candidate_content is not None: + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as candidate_file: + candidate_file.write(candidate_content) + candidate_file.flush() + cmd.extend(["--candidate-content-file", candidate_file.name]) + candidate_file_path = Path(candidate_file.name) + else: + candidate_file_path = None + + if verbose: + cmd.append("--verbose") + print(f"Running trigger assessment: {len(tasks)} queries", file=sys.stderr) - env = {k: v for k, v in 
os.environ.items() if k != "CLAUDECODE"} + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + try: result = subprocess.run( cmd, capture_output=True, @@ -692,16 +707,19 @@ def _run_trigger_rate( env=env, timeout=600, ) - - if result.returncode != 0: - print(f"Trigger assessment failed (exit {result.returncode}): {result.stderr[:300]}", file=sys.stderr) - return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} - - try: - return json.loads(result.stdout) - except json.JSONDecodeError as e: - print(f"Trigger assessment returned invalid JSON: {e} — stdout: {result.stdout[:200]}", file=sys.stderr) - return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} + finally: + if candidate_file_path is not None: + candidate_file_path.unlink(missing_ok=True) + + if result.returncode != 0: + print(f"Trigger assessment failed (exit {result.returncode}): {result.stderr[:300]}", file=sys.stderr) + return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} + + try: + return json.loads(result.stdout) + except json.JSONDecodeError as e: + print(f"Trigger assessment returned invalid JSON: {e} — stdout: {result.stdout[:200]}", file=sys.stderr) + return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} finally: if task_file: Path(task_file).unlink(missing_ok=True) @@ -726,6 +744,179 @@ def _snapshot_extra_dirs(project_root: Path) -> set[str]: return snapshot +def _run_single_behavioral_task( + task: dict, + project_root: Path, + worktree_path: Path, + env: dict[str, str], + timeout: int, + verbose: bool, + runs_per_task: int, + trigger_threshold: float, +) -> dict: + """Run a single behavioral task and return its result dict. + + Args: + task: Task dict with 'query', 'should_trigger', optional 'artifact_glob' and 'query_prefix'. + project_root: Canonical project root (used only for worktree creation context). + worktree_path: Directory in which claude -p runs and artifact globs are resolved. 
+ For sequential execution this equals project_root; for parallel execution + this is an isolated git worktree. + env: Environment variables to pass to subprocess. + timeout: Per-run timeout in seconds for the claude -p invocation. + verbose: Print progress to stderr. + runs_per_task: Number of times to run the query; result is averaged. + trigger_threshold: Fraction of runs that must trigger to count as triggered. + + Returns: + Per-task result dict with keys: query, triggered, should_trigger, pass, new_artifacts. + """ + query: str = task["query"] + should_trigger: bool = task["should_trigger"] + artifact_glob: str = task.get("artifact_glob", "adr/*.md") + query_prefix: str = task.get("query_prefix", "/do ") + + full_query = f"{query_prefix}{query}" + + run_results: list[bool] = [] + all_new_artifacts: list[str] = [] + + for run_index in range(runs_per_task): + if verbose and runs_per_task > 1: + print(f"[behavioral] Run {run_index + 1}/{runs_per_task}: {full_query!r}", file=sys.stderr) + elif verbose: + print(f"[behavioral] Running: claude -p {full_query!r}", file=sys.stderr) + + # Snapshot existing artifacts before the run (primary glob + extra dirs) + before: set[str] = set(glob.glob(str(worktree_path / artifact_glob))) + before_extra: set[str] = _snapshot_extra_dirs(worktree_path) + + run_triggered = False + run_new_artifacts: list[str] = [] + + try: + result = subprocess.run( + ["claude", "-p", full_query], + capture_output=True, + text=True, + cwd=str(worktree_path), + env=env, + timeout=timeout, + ) + if result.returncode != 0: + print( + f"[behavioral] claude exited {result.returncode}: {result.stderr[:300]}", + file=sys.stderr, + ) + + # Check for new files matching the artifact glob + after: set[str] = set(glob.glob(str(worktree_path / artifact_glob))) + run_new_artifacts = sorted(after - before) + run_triggered = len(run_new_artifacts) > 0 + + if verbose and run_new_artifacts: + print(f"[behavioral] New artifacts: {run_new_artifacts}", file=sys.stderr) 
+ + except subprocess.TimeoutExpired: + if verbose: + print(f"[behavioral] Timed out after {timeout}s for query: {full_query!r}", file=sys.stderr) + # Still check artifacts — the process may have written them before timing out + after_timeout: set[str] = set(glob.glob(str(worktree_path / artifact_glob))) + run_new_artifacts = sorted(after_timeout - before) + run_triggered = len(run_new_artifacts) > 0 + if verbose and run_triggered: + print(f"[behavioral] Artifacts found despite timeout: {run_new_artifacts}", file=sys.stderr) + + # Clean up primary-glob artifacts + for artifact_path in run_new_artifacts: + try: + Path(artifact_path).unlink(missing_ok=True) + except OSError: + pass + + # Clean up extra-dir artifacts (agents/, skills/, pipelines/, scripts/) + after_extra: set[str] = _snapshot_extra_dirs(worktree_path) + new_extra = sorted(after_extra - before_extra) + for path in new_extra: + try: + Path(path).unlink(missing_ok=True) + except OSError: + pass + if verbose and new_extra: + print( + f"[behavioral] Cleaned up {len(new_extra)} extra artifacts: {new_extra}", + file=sys.stderr, + ) + + run_results.append(run_triggered) + all_new_artifacts.extend(run_new_artifacts) + + # Aggregate across runs + if runs_per_task > 1: + triggered = (sum(run_results) / len(run_results)) >= trigger_threshold + else: + triggered = run_results[0] if run_results else False + + passed = triggered == should_trigger + return { + "query": query, + "triggered": triggered, + "should_trigger": should_trigger, + "pass": passed, + "new_artifacts": all_new_artifacts, + } + + +def _run_single_behavioral_task_in_worktree( + task: dict, + project_root: Path, + env: dict[str, str], + timeout: int, + verbose: bool, + runs_per_task: int, + trigger_threshold: float, +) -> dict: + """Create a temporary git worktree, run a behavioral task inside it, then remove it. + + Used by the parallel execution path in _run_behavioral_eval. 
Each thread + gets its own isolated worktree so concurrent claude -p invocations do not + share working-directory state. + + The worktree is always removed in a finally block regardless of success or failure. + """ + wt_path_str = tempfile.mkdtemp(prefix="eval-wt-", dir="/tmp") + wt_path = Path(wt_path_str) + # Remove the empty dir so git worktree add can create it + wt_path.rmdir() + try: + subprocess.run( + ["git", "worktree", "add", wt_path_str, "HEAD"], + cwd=str(project_root), + capture_output=True, + check=True, + ) + return _run_single_behavioral_task( + task=task, + project_root=project_root, + worktree_path=wt_path, + env=env, + timeout=timeout, + verbose=verbose, + runs_per_task=runs_per_task, + trigger_threshold=trigger_threshold, + ) + finally: + try: + subprocess.run( + ["git", "worktree", "remove", "--force", wt_path_str], + cwd=str(project_root), + capture_output=True, + ) + except Exception: + pass + shutil.rmtree(wt_path_str, ignore_errors=True) + + def _run_behavioral_eval( target_path: Path, description: str, @@ -734,12 +925,16 @@ def _run_behavioral_eval( verbose: bool = False, runs_per_task: int = 1, trigger_threshold: float = 0.5, + parallel_workers: int = 0, ) -> list[dict]: """Run behavioral assessment by invoking claude -p and checking artifact output. Each task must have 'query', 'should_trigger', 'artifact_glob', and optionally - 'query_prefix' fields. Tasks are run sequentially since each claude -p invocation - is resource-intensive. + 'query_prefix' fields. + + When parallel_workers > 1, tasks are dispatched concurrently via ThreadPoolExecutor. + Each concurrent task runs in an isolated git worktree created from HEAD so that + file-system mutations do not interfere across tasks. When runs_per_task > 1, each task query is run that many times. The final triggered value is True iff (sum(results) / runs_per_task) >= trigger_threshold. 
@@ -755,106 +950,56 @@ def _run_behavioral_eval( env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} - results = [] - for task in tasks: - query: str = task["query"] - should_trigger: bool = task["should_trigger"] - artifact_glob: str = task.get("artifact_glob", "adr/*.md") - query_prefix: str = task.get("query_prefix", "/do ") - - full_query = f"{query_prefix}{query}" - - run_results: list[bool] = [] - all_new_artifacts: list[str] = [] - - for run_index in range(runs_per_task): - if verbose and runs_per_task > 1: - print(f"[behavioral] Run {run_index + 1}/{runs_per_task}: {full_query!r}", file=sys.stderr) - elif verbose: - print(f"[behavioral] Running: claude -p {full_query!r}", file=sys.stderr) - - # Snapshot existing artifacts before the run (primary glob + extra dirs) - before: set[str] = set(glob.glob(str(project_root / artifact_glob))) - before_extra: set[str] = _snapshot_extra_dirs(project_root) - - run_triggered = False - run_new_artifacts: list[str] = [] - - try: - result = subprocess.run( - ["claude", "-p", full_query], - capture_output=True, - text=True, - cwd=str(project_root), - env=env, - timeout=timeout, - ) - if result.returncode != 0: - print( - f"[behavioral] claude exited {result.returncode}: {result.stderr[:300]}", - file=sys.stderr, - ) - - # Check for new files matching the artifact glob - after: set[str] = set(glob.glob(str(project_root / artifact_glob))) - run_new_artifacts = sorted(after - before) - run_triggered = len(run_new_artifacts) > 0 - - if verbose and run_new_artifacts: - print(f"[behavioral] New artifacts: {run_new_artifacts}", file=sys.stderr) - - except subprocess.TimeoutExpired: - if verbose: - print(f"[behavioral] Timed out after {timeout}s for query: {full_query!r}", file=sys.stderr) - # Still check artifacts — the process may have written them before timing out - after_timeout: set[str] = set(glob.glob(str(project_root / artifact_glob))) - run_new_artifacts = sorted(after_timeout - before) - run_triggered = 
len(run_new_artifacts) > 0 - if verbose and run_triggered: - print(f"[behavioral] Artifacts found despite timeout: {run_new_artifacts}", file=sys.stderr) - - # Clean up primary-glob artifacts - for artifact_path in run_new_artifacts: - try: - Path(artifact_path).unlink(missing_ok=True) - except OSError: - pass - - # Clean up extra-dir artifacts (agents/, skills/, pipelines/, scripts/) - after_extra: set[str] = _snapshot_extra_dirs(project_root) - new_extra = sorted(after_extra - before_extra) - for path in new_extra: + if parallel_workers > 1: + # Parallel path: each task runs in its own temporary git worktree. + results: list[dict] = [{}] * len(tasks) + with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_workers) as executor: + future_to_index = { + executor.submit( + _run_single_behavioral_task_in_worktree, + task, + project_root, + env, + timeout, + verbose, + runs_per_task, + trigger_threshold, + ): idx + for idx, task in enumerate(tasks) + } + for future in concurrent.futures.as_completed(future_to_index): + idx = future_to_index[future] try: - Path(path).unlink(missing_ok=True) - except OSError: - pass - if verbose and new_extra: - print( - f"[behavioral] Cleaned up {len(new_extra)} extra artifacts: {new_extra}", - file=sys.stderr, - ) - - run_results.append(run_triggered) - all_new_artifacts.extend(run_new_artifacts) + results[idx] = future.result() + except Exception as exc: + task = tasks[idx] + query = task.get("query", "unknown") + print(f"[behavioral] Task {query!r} raised exception: {exc}", file=sys.stderr) + results[idx] = { + "query": query, + "triggered": False, + "should_trigger": task.get("should_trigger", False), + "pass": False, + "new_artifacts": [], + } + return results - # Aggregate across runs - if runs_per_task > 1: - triggered = (sum(run_results) / len(run_results)) >= trigger_threshold - else: - triggered = run_results[0] if run_results else False - - passed = triggered == should_trigger - results.append( - { - "query": query, 
- "triggered": triggered, - "should_trigger": should_trigger, - "pass": passed, - "new_artifacts": all_new_artifacts, - } + # Sequential path (parallel_workers <= 1): run tasks one at a time in project_root. + sequential_results = [] + for task in tasks: + sequential_results.append( + _run_single_behavioral_task( + task=task, + project_root=project_root, + worktree_path=project_root, + env=env, + timeout=timeout, + verbose=verbose, + runs_per_task=runs_per_task, + trigger_threshold=trigger_threshold, + ) ) - - return results + return sequential_results # --------------------------------------------------------------------------- @@ -870,6 +1015,9 @@ def assess_target( dry_run: bool = False, behavioral_runs_per_task: int = 1, behavioral_trigger_threshold: float = 0.5, + parallel_eval_workers: int = 0, + candidate_content: str | None = None, + eval_mode: str = "auto", ) -> dict: """Assess a target file against tasks. @@ -879,6 +1027,9 @@ def assess_target( - Dry-run: returns synthetic scores for testing loop mechanics. - Benchmark (NYI): tasks have 'prompt' + 'name' fields. + When parallel_eval_workers > 1 and the task set is behavioral, tasks are + dispatched in parallel via ThreadPoolExecutor, each in its own git worktree. + Returns scores dict with hard gate booleans and quality dimensions. 
""" scores: dict = { @@ -894,7 +1045,7 @@ def assess_target( "task_results": [], } - content = target_path.read_text() + content = candidate_content if candidate_content is not None else target_path.read_text() valid, description = _parse_frontmatter(content) if not valid or not description: scores["parses"] = False @@ -932,7 +1083,15 @@ def assess_target( is_trigger = not is_behavioral and all(_is_trigger_task(task) for task in tasks) if is_trigger: - results = _run_trigger_rate(target_path, description, tasks, verbose=verbose) + task_expectations = {task.get("query", ""): task.get("should_trigger") for task in tasks} + results = _run_trigger_rate( + target_path, + description, + tasks, + candidate_content=content, + eval_mode=eval_mode, + verbose=verbose, + ) summary = results.get("summary", {}) total = summary.get("total", 0) passed = summary.get("passed", 0) @@ -951,6 +1110,9 @@ def assess_target( scores["task_results"].append( { "name": r.get("query", "unnamed")[:40], + "query": r.get("query", ""), + "should_trigger": r.get("should_trigger", task_expectations.get(r.get("query", ""), None)), + "trigger_rate": r.get("trigger_rate", 0.0), "passed": r.get("pass", False), "score": 1.0 if r.get("pass", False) else 0.0, "details": f"trigger_rate={r.get('trigger_rate', 0):.2f}", @@ -959,6 +1121,7 @@ def assess_target( return scores if is_behavioral: + task_expectations = {task.get("query", ""): task.get("should_trigger") for task in tasks} behavioral_results = _run_behavioral_eval( target_path, description, @@ -966,6 +1129,7 @@ def assess_target( verbose=verbose, runs_per_task=behavioral_runs_per_task, trigger_threshold=behavioral_trigger_threshold, + parallel_workers=parallel_eval_workers, ) total = len(behavioral_results) passed = sum(1 for r in behavioral_results if r.get("pass", False)) @@ -985,6 +1149,8 @@ def assess_target( scores["task_results"].append( { "name": r.get("query", "unnamed")[:40], + "query": r.get("query", ""), + "should_trigger": 
r.get("should_trigger", task_expectations.get(r.get("query", ""), None)), "passed": r.get("pass", False), "score": 1.0 if r.get("pass", False) else 0.0, "details": f"triggered={r.get('triggered')}, artifacts={artifact_summary}", @@ -1030,13 +1196,13 @@ def run_optimization_loop( target_path: Path, goal: str, benchmark_tasks_path: Path, - max_iterations: int = 20, + max_iterations: int = 1, min_gain: float = 0.02, train_split: float = 0.6, - revert_streak_limit: int = 5, + revert_streak_limit: int = 1, beam_width: int = 1, candidates_per_parent: int = 1, - holdout_check_cadence: int = 5, + holdout_check_cadence: int = 0, model: str | None = None, verbose: bool = False, report_path: Path | None = None, @@ -1044,6 +1210,8 @@ def run_optimization_loop( dry_run: bool = False, behavioral_runs_per_task: int = 1, behavioral_trigger_threshold: float = 0.5, + parallel_eval: int = 0, + eval_mode: str = "auto", ) -> dict: """Run the autoresearch optimization loop.""" if beam_width < 1: @@ -1063,8 +1231,21 @@ def run_optimization_loop( _validate_task_set(all_tasks) train_tasks, test_tasks = split_tasks(all_tasks, train_split) + # Warn and fall back to sequential when --parallel-eval is used with non-behavioral tasks. + is_all_behavioral = all(_is_behavioral_task(t) for t in all_tasks) + effective_parallel_eval = parallel_eval + if parallel_eval > 1 and not is_all_behavioral: + print( + "[parallel-eval] Warning: --parallel-eval requires eval_mode=behavioral tasks. 
" + "Falling back to sequential evaluation.", + file=sys.stderr, + ) + effective_parallel_eval = 0 + if verbose: print(f"Tasks: {len(train_tasks)} train, {len(test_tasks)} test", file=sys.stderr) + if effective_parallel_eval > 1: + print(f"Parallel behavioral eval: {effective_parallel_eval} workers", file=sys.stderr) original_content = target_path.read_text() target_valid, target_description = _parse_frontmatter(original_content) @@ -1086,6 +1267,9 @@ def run_optimization_loop( dry_run, behavioral_runs_per_task, behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=original_content, + eval_mode=eval_mode, ) baseline_composite = composite_score(baseline_scores) best_score = baseline_composite @@ -1101,6 +1285,9 @@ def run_optimization_loop( dry_run, behavioral_runs_per_task, behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=original_content, + eval_mode=eval_mode, ) if test_tasks else None @@ -1128,7 +1315,7 @@ def run_optimization_loop( status = "RUNNING" total_tokens = 0 iteration_counter = 0 - # Maps iteration number → variant content for KEEP verdicts (used for best-by-test selection) + # Maps iteration number → variant content for ACCEPT verdicts (used for best-by-test selection) keep_contents: dict[int, str] = {} for round_number in range(1, max_iterations + 1): @@ -1190,7 +1377,7 @@ def run_optimization_loop( print(f"Variant generation failed: {e}", file=sys.stderr) iteration_data = { "number": iteration_counter, - "verdict": "REVERT", + "verdict": "REJECT", "score": {"train": parent["score"], "test": None}, "delta": "0", "change_summary": str(e), @@ -1205,7 +1392,7 @@ def run_optimization_loop( iteration_counter, parent["content"], {}, - "REVERT", + "REJECT", "", "", str(e), @@ -1223,7 +1410,7 @@ def run_optimization_loop( print("REJECTED: Protected sections modified", file=sys.stderr) iteration_data = { "number": iteration_counter, - "verdict": "REVERT", + "verdict": "REJECT", "score": {"train": 0.0, "test": 
None}, "delta": "0", "change_summary": "Protected sections modified", @@ -1238,7 +1425,7 @@ def run_optimization_loop( iteration_counter, variant_content, {"protected_intact": False}, - "REVERT", + "REJECT", "Protected sections modified", diff_text, change_summary, @@ -1253,7 +1440,7 @@ def run_optimization_loop( print(f"REJECTED: Deleted sections without justification: {deletions}", file=sys.stderr) iteration_data = { "number": iteration_counter, - "verdict": "REVERT", + "verdict": "REJECT", "score": {"train": parent["score"], "test": None}, "delta": "0", "change_summary": "Deleted sections without justification", @@ -1270,7 +1457,7 @@ def run_optimization_loop( iteration_counter, variant_content, {"protected_intact": True}, - "REVERT", + "REJECT", "Deleted sections without justification", diff_text, change_summary, @@ -1281,25 +1468,21 @@ def run_optimization_loop( iteration_by_number[iteration_counter] = iteration_data continue - temp_target = ( - target_path.parent / f".{target_path.stem}_variant_{iteration_counter}{target_path.suffix}" + t0 = time.time() + variant_scores = assess_target( + target_path, + train_tasks, + goal, + verbose, + dry_run, + behavioral_runs_per_task, + behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=variant_content, + eval_mode=eval_mode, ) - try: - temp_target.write_text(variant_content) - t0 = time.time() - variant_scores = assess_target( - temp_target, - train_tasks, - goal, - verbose, - dry_run, - behavioral_runs_per_task, - behavioral_trigger_threshold, - ) - eval_elapsed = time.time() - t0 - variant_composite = composite_score(variant_scores) - finally: - temp_target.unlink(missing_ok=True) + eval_elapsed = time.time() - t0 + variant_composite = composite_score(variant_scores) gain = variant_composite - parent["score"] if verbose: @@ -1310,7 +1493,7 @@ def run_optimization_loop( file=sys.stderr, ) - verdict = "KEEP" if gain > min_gain else "REVERT" + verdict = "ACCEPT" if gain > min_gain else "REJECT" 
if deletions and deletion_justification: change_summary = f"{change_summary} [deletion justified]" delta_str = f"{gain:+.2f}" if gain != 0 else "0" @@ -1351,13 +1534,13 @@ def run_optimization_loop( iterations.append(iteration_data) iteration_by_number[iteration_counter] = iteration_data - if verdict == "KEEP": + if verdict == "ACCEPT": if variant_composite > best_score: best_score = variant_composite best_content = variant_content best_iteration = iteration_counter - # Track content for each KEEP so best-by-test can look it up later + # Track content for each ACCEPT so best-by-test can look it up later keep_contents[iteration_counter] = variant_content kept_nodes.append( @@ -1391,23 +1574,21 @@ def run_optimization_loop( rounds_without_keep += 1 if test_tasks and holdout_check_cadence > 0 and round_number % holdout_check_cadence == 0: - temp_target = target_path.parent / f".{target_path.stem}_holdout_check{target_path.suffix}" - try: - temp_target.write_text(best_content) - holdout_scores = assess_target( - temp_target, - test_tasks, - goal, - verbose, - dry_run, - behavioral_runs_per_task, - behavioral_trigger_threshold, - ) - holdout_composite = composite_score(holdout_scores) - if iterations: - iterations[-1]["score"]["test"] = holdout_composite - finally: - temp_target.unlink(missing_ok=True) + holdout_scores = assess_target( + target_path, + test_tasks, + goal, + verbose, + dry_run, + behavioral_runs_per_task, + behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=best_content, + eval_mode=eval_mode, + ) + holdout_composite = composite_score(holdout_scores) + if iterations: + iterations[-1]["score"]["test"] = holdout_composite if holdout_diverges(best_score, holdout_composite, baseline_holdout, baseline_composite): if verbose: @@ -1420,7 +1601,7 @@ def run_optimization_loop( break if rounds_without_keep >= revert_streak_limit: - exit_reason = f"converged ({revert_streak_limit} rounds without KEEP by round {round_number})" + exit_reason 
= f"converged ({revert_streak_limit} rounds without ACCEPT by round {round_number})" status = "CONVERGED" break @@ -1471,7 +1652,7 @@ def run_optimization_loop( } report_path.write_text(generate_optimization_report(rd, auto_refresh=False)) - # Best-by-test selection: if test tasks exist, prefer the KEEP iteration with the + # Best-by-test selection: if test tasks exist, prefer the ACCEPT iteration with the # highest held-out test score rather than the highest training score (anti-Goodhart). best_test_score: float | None = None if test_tasks and keep_contents: @@ -1479,7 +1660,7 @@ def run_optimization_loop( scored_keeps = [ (it["number"], it["score"]["test"]) for it in iterations - if it["verdict"] == "KEEP" and it["score"].get("test") is not None and it["number"] in keep_contents + if it["verdict"] == "ACCEPT" and it["score"].get("test") is not None and it["number"] in keep_contents ] if scored_keeps: best_test_iter, best_test_score = max(scored_keeps, key=lambda x: x[1]) @@ -1494,25 +1675,23 @@ def run_optimization_loop( best_content = keep_contents[best_test_iter] best_iteration = best_test_iter else: - # No holdout-checked KEEP iterations — run a final test eval on best_content + # No holdout-checked ACCEPT iterations — run a final test eval on best_content if best_iteration > 0: - temp_target = target_path.parent / f".{target_path.stem}_final_test{target_path.suffix}" - try: - temp_target.write_text(best_content) - final_test_scores = assess_target( - temp_target, - test_tasks, - goal, - verbose, - dry_run, - behavioral_runs_per_task, - behavioral_trigger_threshold, - ) - best_test_score = composite_score(final_test_scores) - if verbose: - print(f"Final test eval on best_content: test={best_test_score:.4f}", file=sys.stderr) - finally: - temp_target.unlink(missing_ok=True) + final_test_scores = assess_target( + target_path, + test_tasks, + goal, + verbose, + dry_run, + behavioral_runs_per_task, + behavioral_trigger_threshold, + effective_parallel_eval, + 
candidate_content=best_content, + eval_mode=eval_mode, + ) + best_test_score = composite_score(final_test_scores) + if verbose: + print(f"Final test eval on best_content: test={best_test_score:.4f}", file=sys.stderr) if best_iteration > 0: best_path = output_dir / "best_variant.md" @@ -1533,7 +1712,7 @@ def run_optimization_loop( "best_iteration": best_iteration, "iterations_run": len(iterations), "max_iterations": max_iterations, - "improvements_found": sum(1 for it in iterations if it["verdict"] == "KEEP"), + "improvements_found": sum(1 for it in iterations if it["verdict"] == "ACCEPT"), "total_tokens": total_tokens, "search_strategy": "beam" if beam_width > 1 or candidates_per_parent > 1 else "hill_climb", "beam_width": beam_width, @@ -1560,18 +1739,18 @@ def main(): parser.add_argument( "--max-iterations", type=int, - default=20, - help="Max optimization rounds (default: 20); each round evaluates up to beam_width x candidates_per_parent candidates", + default=1, + help="Max optimization rounds (default: 1, short mode); each round evaluates up to beam_width x candidates_per_parent candidates", ) parser.add_argument("--min-gain", type=float, default=0.02, help="Min score gain to keep (default: 0.02)") parser.add_argument("--train-split", type=float, default=0.6, help="Train fraction (default: 0.6)") parser.add_argument( "--revert-streak-limit", type=int, - default=5, - help="Stop after this many rounds without any KEEP candidates (default: 5)", + default=1, + help="Stop after this many rounds without any ACCEPT candidates (default: 1, short mode)", ) - parser.add_argument("--beam-width", type=int, default=1, help="Number of kept candidates to retain per round") + parser.add_argument("--beam-width", type=int, default=1, help="Number of accepted candidates to retain per round") parser.add_argument( "--candidates-per-parent", type=int, @@ -1581,8 +1760,8 @@ def main(): parser.add_argument( "--holdout-check-cadence", type=int, - default=5, - help="Check held-out 
tasks every N rounds (default: 5; 0 disables)", + default=0, + help="Check held-out tasks every N rounds (default: 0, disabled in short mode)", ) parser.add_argument("--model", default=None, help="Optional Claude Code model override for variant generation") parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") @@ -1603,6 +1782,18 @@ def main(): default=0.5, help="Fraction of runs that must trigger to count as triggered (default: 0.5)", ) + parser.add_argument( + "--parallel-eval", + type=int, + default=0, + help="Run behavioral eval tasks in parallel with isolated git worktrees (default: 0, disabled)", + ) + parser.add_argument( + "--eval-mode", + choices=["auto", "registered", "alias"], + default="auto", + help="Trigger evaluator mode (default: auto; prefers registered-skill worktree eval when possible)", + ) args = parser.parse_args() target = Path(args.target) @@ -1634,6 +1825,8 @@ def main(): dry_run=args.dry_run, behavioral_runs_per_task=args.behavioral_runs_per_task, behavioral_trigger_threshold=args.behavioral_trigger_threshold, + parallel_eval=args.parallel_eval, + eval_mode=args.eval_mode, ) except ValueError as e: print(f"Error: {e}", file=sys.stderr) diff --git a/skills/read-only-ops/SKILL.md b/skills/read-only-ops/SKILL.md index 70375644..115f4b5e 100644 --- a/skills/read-only-ops/SKILL.md +++ b/skills/read-only-ops/SKILL.md @@ -1,10 +1,12 @@ --- name: read-only-ops description: | - Read-only exploration, status checks, and reporting without modifications. - Use when user asks to check status, find files, search code, show state, - or explicitly requests read-only investigation. Route to other skills when user wants - changes, fixes, refactoring, or any write operation. + Read-only exploration, inspection, and reporting without modifications. + Use when the user wants to inspect, investigate, audit, survey, or analyze code/files/state + without making changes. 
Common triggers: "inspect this", "report back without changing anything", + "show me", "look at", "tell me about", "find files", "check status", "list all", "how many", + "where is", or "what is the current state of". Route away when the user wants fixes, + refactors, writing, or any write operation. version: 2.0.0 user-invocable: false allowed-tools: From 2ec703cdb00e96f1bdfc375d565556b377dbc054 Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 20:15:08 -0700 Subject: [PATCH 2/5] feat(agent-comparison): harden autoresearch live evals --- .../test_agent_comparison_optimize_loop.py | 533 +++++++++++++++++ skills/agent-comparison/SKILL.md | 27 +- .../references/optimization-guide.md | 99 +++- .../socratic-debugging-body-short-tasks.json | 12 + .../scripts/generate_variant.py | 132 ++++- .../agent-comparison/scripts/optimize_loop.py | 543 ++++++++++++++++-- skills/do/references/routing-tables.md | 4 +- skills/skill-eval/SKILL.md | 11 +- skills/socratic-debugging/SKILL.md | 5 +- 9 files changed, 1265 insertions(+), 101 deletions(-) create mode 100644 skills/agent-comparison/references/socratic-debugging-body-short-tasks.json diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py index 388a9e13..710e3a91 100644 --- a/scripts/tests/test_agent_comparison_optimize_loop.py +++ b/scripts/tests/test_agent_comparison_optimize_loop.py @@ -1,5 +1,7 @@ +import contextlib import importlib.util import json +import os import subprocess import sys from pathlib import Path @@ -217,6 +219,48 @@ def fake_run_claude_code(prompt, model): assert "Changed body." 
not in result["variant"] +def test_generate_variant_body_only_changes_body_not_frontmatter(monkeypatch): + generate_variant = load_module( + "agent_comparison_generate_variant_body_only", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + current_content = """--- +name: example-skill +description: old description +version: 1.0.0 +--- + +# Skill + +Original body. +""" + + def fake_run_claude_code(prompt, model): + assert "" in prompt + return ( + "# Skill\n\nImproved body.\nbody change" + "", + "raw result", + 7, + ) + + monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code) + + result = generate_variant.generate_variant( + target_path="skills/example/SKILL.md", + goal="improve behavioral quality", + current_content=current_content, + failures=[], + model=None, + optimization_scope="body-only", + ) + + assert "description: old description" in result["variant"] + assert "# Skill\n\nImproved body." in result["variant"] + assert "Original body." not in result["variant"] + + def test_generate_variant_prompt_includes_full_failed_query_and_expectation(monkeypatch): generate_variant = load_module( "agent_comparison_generate_variant_failure_context", @@ -588,6 +632,490 @@ def fake_run_behavioral_eval(*args, **kwargs): assert optimize_loop.composite_score(scores) == 8.45 +def test_assess_target_scores_blind_compare_results(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_blind_compare", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\ndescription: blind compare test\n---\n") + tasks = [{"query": "help me debug this", "eval_mode": "blind_compare", "judge": "socratic_question_only"}] + + def fake_run_blind_compare_eval(target_path, candidate_content, tasks, baseline_content=None, timeout=180, verbose=False): + assert baseline_content == "---\ndescription: baseline\n---\n" + return [ + { + "query": "help me debug this", + "winner": 
"candidate", + "candidate_score": 0.8, + "baseline_score": 0.5, + "candidate_output": "What changed recently?", + "baseline_output": "The issue is probably your env var rename.", + "passed": True, + } + ] + + monkeypatch.setattr(optimize_loop, "_run_blind_compare_eval", fake_run_blind_compare_eval) + + scores = optimize_loop.assess_target( + target, + tasks, + "improve behavioral quality", + candidate_content="---\ndescription: candidate\n---\n", + baseline_content="---\ndescription: baseline\n---\n", + ) + + assert scores["correctness"] == 8.0 + assert scores["testing"] == 8.0 + assert scores["tests_pass"] is True + assert scores["task_results"][0]["winner"] == "candidate" + + +def test_assess_target_trigger_eval_uses_multiple_runs(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_trigger_runs", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\ndescription: trigger eval test\n---\n") + tasks = [{"query": "inspect this repo", "should_trigger": True}] + seen = {} + + def fake_run_trigger_rate( + target_path, + description, + tasks, + candidate_content=None, + eval_mode="auto", + num_workers=1, + timeout=30, + runs_per_query=3, + verbose=False, + ): + seen["runs_per_query"] = runs_per_query + return { + "results": [{"query": "inspect this repo", "should_trigger": True, "trigger_rate": 1.0, "pass": True}], + "summary": {"total": 1, "passed": 1, "failed": 0}, + } + + monkeypatch.setattr(optimize_loop, "_run_trigger_rate", fake_run_trigger_rate) + + scores = optimize_loop.assess_target( + target, + tasks, + "improve routing precision", + behavioral_runs_per_task=4, + ) + + assert seen["runs_per_query"] == 4 + assert scores["tests_pass"] is True + + +def test_socratic_question_only_heuristic_penalizes_preamble(): + optimize_loop = load_module( + "agent_comparison_optimize_loop_socratic_heuristic", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + 
clean_score, _ = optimize_loop._score_socratic_question_only_output( + "What did you expect the test to do?" + ) + preamble_score, _ = optimize_loop._score_socratic_question_only_output( + "Let me read the skill first. What did you expect the test to do?" + ) + + assert clean_score > preamble_score + + +def test_contains_fallback_contamination_detects_tool_blocked_text(): + optimize_loop = load_module( + "agent_comparison_optimize_loop_contamination", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + contaminated, reasons = optimize_loop._contains_fallback_contamination( + "The Skill tool was blocked in this session, so I'll guide you through this directly." + ) + + assert contaminated is True + assert "mentioned blocked skill tool" in reasons + assert "fell back to direct guidance" in reasons + + +def test_run_blind_compare_zeroes_untriggered_or_contaminated_runs(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_blind_compare_guardrails", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "skills" / "socratic-debugging" / "SKILL.md" + target.parent.mkdir(parents=True) + target.write_text("---\nname: socratic-debugging\ndescription: test\n---\n") + + monkeypatch.setattr(optimize_loop, "_find_project_root", lambda: tmp_path) + + @contextlib.contextmanager + def fake_worktree(_project_root, _relpath, content): + worktree = tmp_path / ("candidate" if "candidate" in content else "baseline") + worktree.mkdir(exist_ok=True) + yield worktree + + monkeypatch.setattr(optimize_loop, "_candidate_worktree", fake_worktree) + + def fake_capture(query, cwd, accepted_skill_ids, timeout=180): + if cwd.name == "baseline": + return { + "output": "What changed recently?", + "triggered": False, + "contaminated": False, + "contamination_reasons": [], + } + return { + "output": "The Skill tool was blocked in this session, so I'll guide you through this directly. 
What changed recently?", + "triggered": True, + "contaminated": True, + "contamination_reasons": ["mentioned blocked skill tool", "fell back to direct guidance"], + } + + monkeypatch.setattr(optimize_loop, "_run_query_capture_output", fake_capture) + + results = optimize_loop._run_blind_compare_eval( + target, + "---\nname: socratic-debugging\ndescription: candidate\n# candidate\n", + [{"query": "help me debug", "eval_mode": "blind_compare", "judge": "socratic_question_only"}], + baseline_content="---\nname: socratic-debugging\ndescription: baseline\n# baseline\n", + ) + + assert results[0]["baseline_score"] == 0.0 + assert results[0]["candidate_score"] == 0.0 + assert results[0]["baseline_triggered"] is False + assert results[0]["candidate_contaminated"] is True + assert results[0]["winner"] == "tie" + assert results[0]["baseline_reasons"][0] == "target skill did not trigger" + assert results[0]["candidate_reasons"][0] == "mentioned blocked skill tool" + + +def test_behavioral_eval_sequential_path_uses_isolated_worktrees(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_behavioral_isolation", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + project_root = tmp_path / "repo" + (project_root / ".claude").mkdir(parents=True) + cwd_before = Path.cwd() + os.chdir(project_root) + try: + calls = [] + + def fake_single_task_in_worktree(task, project_root, env, timeout, verbose, runs_per_task, trigger_threshold): + calls.append((task["query"], project_root)) + return { + "query": task["query"], + "triggered": task["should_trigger"], + "should_trigger": task["should_trigger"], + "pass": True, + "new_artifacts": [], + } + + monkeypatch.setattr(optimize_loop, "_run_single_behavioral_task_in_worktree", fake_single_task_in_worktree) + + results = optimize_loop._run_behavioral_eval( + tmp_path / "skills" / "example" / "SKILL.md", + "desc", + [{"query": "make a skill", "should_trigger": True}], + parallel_workers=0, + ) + 
finally: + os.chdir(cwd_before) + + assert len(calls) == 1 + assert results[0]["pass"] is True + + +def test_holdout_score_attaches_to_best_iteration_not_last(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_holdout_attachment", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n") + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"}, + {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"}, + ] + } + ) + ) + + variant_counter = {"n": 0} + + def fake_generate_variant_output(*args, **kwargs): + variant_counter["n"] += 1 + marker = f"candidate-{variant_counter['n']}" + return { + "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n", + "summary": marker, + "reasoning": marker, + "tokens_used": 1, + "deletions": [], + "deletion_justification": "", + } + + monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output) + + def fake_assess_target( + path, + tasks, + goal, + verbose=False, + dry_run=False, + behavioral_runs_per_task=1, + behavioral_trigger_threshold=0.5, + parallel_eval_workers=0, + candidate_content=None, + baseline_content=None, + eval_mode="auto", + ): + is_test = bool(tasks and tasks[0].get("split") == "test") + content = candidate_content or path.read_text() + if is_test: + score = 9.0 if "candidate-1" in content else 3.0 + else: + if "candidate-1" in content: + score = 8.0 + elif "candidate-2" in content: + score = 7.0 + else: + score = 5.0 + return { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": score, + "error_handling": 0.0, + "language_idioms": 0.0, + "testing": 0.0, + 
"efficiency": 0.0, + "task_results": [{"name": "task", "passed": True}], + } + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.0, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + beam_width=2, + candidates_per_parent=2, + holdout_check_cadence=1, + verbose=False, + dry_run=False, + ) + + iteration_one = next(it for it in result["iterations"] if it["number"] == 1) + iteration_two = next(it for it in result["iterations"] if it["number"] == 2) + assert iteration_one["score"]["test"] == 3.6 + assert iteration_two["score"]["test"] == 1.2 + + +def test_best_by_test_can_switch_from_later_train_best_to_earlier_candidate(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_best_by_test", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n") + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"}, + {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"}, + ] + } + ) + ) + + variant_counter = {"n": 0} + + def fake_generate_variant_output(*args, **kwargs): + variant_counter["n"] += 1 + marker = f"candidate-{variant_counter['n']}" + return { + "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n", + "summary": marker, + "reasoning": marker, + "tokens_used": 1, + "deletions": [], + "deletion_justification": "", + } + + monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output) + + def fake_assess_target( + path, + tasks, + goal, + 
verbose=False, + dry_run=False, + behavioral_runs_per_task=1, + behavioral_trigger_threshold=0.5, + parallel_eval_workers=0, + candidate_content=None, + baseline_content=None, + eval_mode="auto", + ): + is_test = bool(tasks and tasks[0].get("split") == "test") + content = candidate_content or path.read_text() + if is_test: + score = 9.0 if "candidate-1" in content else 3.0 + else: + if "candidate-1" in content: + score = 8.0 + elif "candidate-2" in content: + score = 9.0 + else: + score = 5.0 + return { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": score, + "error_handling": 0.0, + "language_idioms": 0.0, + "testing": 0.0, + "efficiency": 0.0, + "task_results": [{"name": "task", "passed": True}], + } + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=2, + min_gain=0.0, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + beam_width=1, + candidates_per_parent=1, + holdout_check_cadence=1, + verbose=False, + dry_run=False, + ) + + assert result["best_iteration"] == 1 + assert result["best_test_score"] == 3.6 + + +def test_final_report_uses_post_selection_test_scores(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_final_report", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n") + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"}, + {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"}, + ] + } + ) + ) + + variant_counter = {"n": 0} + + def 
fake_generate_variant_output(*args, **kwargs): + variant_counter["n"] += 1 + marker = f"candidate-{variant_counter['n']}" + return { + "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n", + "summary": marker, + "reasoning": marker, + "tokens_used": 1, + "deletions": [], + "deletion_justification": "", + } + + monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output) + + def fake_assess_target( + path, + tasks, + goal, + verbose=False, + dry_run=False, + behavioral_runs_per_task=1, + behavioral_trigger_threshold=0.5, + parallel_eval_workers=0, + candidate_content=None, + baseline_content=None, + eval_mode="auto", + ): + is_test = bool(tasks and tasks[0].get("split") == "test") + content = candidate_content or path.read_text() + if is_test: + score = 9.0 if "candidate-1" in content else 3.0 + else: + score = 8.0 if "candidate-1" in content else 5.0 + return { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": score, + "error_handling": 0.0, + "language_idioms": 0.0, + "testing": 0.0, + "efficiency": 0.0, + "task_results": [{"name": "task", "passed": True}], + } + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + monkeypatch.setattr(optimize_loop, "generate_optimization_report", lambda data, auto_refresh=False: json.dumps(data)) + + report_path = tmp_path / "out" / "report.html" + optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.0, + output_dir=tmp_path / "out", + report_path=report_path, + beam_width=1, + candidates_per_parent=1, + holdout_check_cadence=1, + verbose=False, + dry_run=False, + ) + + report = json.loads(report_path.read_text()) + assert report["iterations"][0]["score"]["test"] == 3.6 + + def test_run_optimization_loop_forwards_parallel_eval_to_assessments(tmp_path, monkeypatch): 
optimize_loop = load_module( "agent_comparison_optimize_loop_parallel_forwarding", @@ -620,6 +1148,7 @@ def fake_assess_target( behavioral_trigger_threshold=0.5, parallel_eval_workers=0, candidate_content=None, + baseline_content=None, eval_mode="auto", ): calls.append( @@ -628,6 +1157,7 @@ def fake_assess_target( "task_count": len(tasks), "parallel_eval_workers": parallel_eval_workers, "candidate_content": candidate_content, + "baseline_content": baseline_content, "eval_mode": eval_mode, } ) @@ -665,6 +1195,7 @@ def fake_assess_target( assert calls assert all(call["parallel_eval_workers"] == 2 for call in calls) assert all(call["candidate_content"] is not None for call in calls) + assert any(call["baseline_content"] is not None for call in calls[1:]) assert all(call["eval_mode"] == "auto" for call in calls) @@ -695,6 +1226,7 @@ def fake_generate_variant_output( model, dry_run, iteration_number, + optimization_scope, diversification_note=None, ): improved_description = ( @@ -718,6 +1250,7 @@ def fake_run_trigger_rate( eval_mode="auto", num_workers=5, timeout=30, + runs_per_query=3, verbose=False, ): passed = trigger_query in description diff --git a/skills/agent-comparison/SKILL.md b/skills/agent-comparison/SKILL.md index 6d934786..ef3321f5 100644 --- a/skills/agent-comparison/SKILL.md +++ b/skills/agent-comparison/SKILL.md @@ -348,7 +348,7 @@ Recommended modes: Live eval defaults are intentionally short: - one optimization round -- one trigger-eval run per query +- three trigger-eval runs per query - one trigger-eval worker - no holdout cadence unless explicitly requested @@ -357,7 +357,7 @@ The registered-skill path also evaluates the current working copy, not just `HEA **Step 5: Present results in UI** -Open the generated `optimization-report.html` in a browser. The report shows: +If you passed `--report optimization-report.html`, open the generated file in a browser. 
The report shows: - Progress dashboard (status, baseline vs best, accepted/rejected counts) - Convergence chart (train solid line, held-out dashed line, baseline dotted) - Iteration table with verdict, composite score, delta, and change summary @@ -367,7 +367,7 @@ Open the generated `optimization-report.html` in a browser. The report shows: Not all ACCEPT iterations are real improvements — some may be harness artifacts. The user reviews the accepted iterations as candidate snapshots from the original target: - Inspect each accepted iteration's diff in the report -- Use "Preview Selected Snapshot" only as a comparison aid in the UI +- Use "Preview Combined" only as a comparison aid in the UI - Use "Export Selected" to download a review JSON describing the selected snapshot diff - In beam mode, review the retained frontier candidates first; they are the strongest candidates from the latest round @@ -377,7 +377,9 @@ Apply one reviewed improvement to the original target file. - If you want the best single accepted variant, use `evals/iterations/best_variant.md`. - Beam search still writes a single `best_variant.md`: the highest-scoring accepted candidate seen anywhere in the run. -- The current optimizer is description-only, so diffs should stay confined to the frontmatter `description`. +- Choose scope deliberately: + - `description-only` for routing-trigger work + - `body-only` for behavioral work on the skill instructions themselves - If you exported selected diffs, treat that JSON as review material only. It is not auto-applied by the current tooling, and the current workflow does not support merging multiple accepted diffs into a generated patch. 
```bash @@ -390,10 +392,17 @@ cp evals/iterations/best_variant.md skills/{target}/SKILL.md **Step 8: Run final evaluation on FULL task set (train + test)** -After applying improvements, run a final evaluation on ALL tasks (not just train) to verify the improvements generalize: +After applying improvements, run a final evaluation on ALL tasks (not just train) to verify the improvements generalize. Use evaluation-only mode by rerunning the optimizer with `--max-iterations 0`, which records the baseline for the current file without generating fresh variants: ```bash -# Re-run optimize_loop.py against the same task file and inspect results.json/report output +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/{target}/SKILL.md \ + --goal "{same goal}" \ + --benchmark-tasks {full-task-file}.json \ + --max-iterations 0 \ + --report optimization-report.html \ + --output-dir evals/final-check \ + --verbose ``` Compare final scores to the baseline to confirm net improvement. In beam mode, the final report and `results.json` also include: @@ -419,12 +428,18 @@ The current optimizer is in a solid state for: - deterministic proof runs - isolated live evaluation of existing registered skills - short live optimization of `read-only-ops`, with the accepted description change now applied and validated against `references/read-only-ops-short-tasks.json` +- short live body optimization of `socratic-debugging`, with the accepted instruction-body update now applied and validated against `references/socratic-debugging-body-short-tasks.json` One live-harness caveat remains: - temporary renamed skill copies do not yet show reliable live trigger improvements through the dynamic command alias path That caveat does not affect deterministic proof runs or live checks against existing registered skills, but it does mean the current system is stronger for optimizing real in-repo skills than arbitrary renamed temp clones. 
+For body optimization runs, the blind evaluator now rejects responses that: +- never triggered the target skill +- mention blocked skill/tool access +- fall back into generic "I'll guide you directly" behavior + ### Optional Extensions These are off by default. Enable explicitly when needed: diff --git a/skills/agent-comparison/references/optimization-guide.md b/skills/agent-comparison/references/optimization-guide.md index 6101ecd7..a0eccc17 100644 --- a/skills/agent-comparison/references/optimization-guide.md +++ b/skills/agent-comparison/references/optimization-guide.md @@ -2,9 +2,15 @@ ## Scope -The current autoresearch loop optimizes a markdown target's frontmatter -`description` using trigger-rate eval tasks. This is useful for improving -skill routing accuracy and similar description-driven dispatch behavior. +The current autoresearch loop supports two optimization scopes: + +- `description-only`: mutate the frontmatter `description` and score it with + trigger-rate eval tasks +- `body-only`: mutate the instruction body and score it with `blind_compare` + behavioral tasks + +This is useful for improving skill routing accuracy and for short, repeatable +instruction-body improvements on real registered skills. It is not a replacement for the manual agent benchmark workflow in Phases 1-4. If you want to compare real code-generation quality across benchmark tasks, use @@ -22,7 +28,11 @@ drives routing. 
## Supported Task Formats -Every task must include: +Two task families are supported: + +### Trigger-rate tasks + +Every trigger-rate task must include: - `query`: the prompt to test - `should_trigger`: whether the target should trigger for that prompt @@ -77,6 +87,40 @@ Explicit train/test sets: } ``` +### Blind body-compare tasks + +Every blind body-compare task must include: + +- `query`: the prompt to test +- `eval_mode: blind_compare` +- `judge`: currently `heuristic_socratic_debugging` + +Optional fields: + +- `name`: label shown in logs and reports +- `split`: `train` or `test` +- `min_score`: minimum candidate score required for the task to count as passed + +Example: + +```json +{ + "tasks": [ + { + "name": "socratic-first-turn", + "query": "Help me think through this bug. My Python script sometimes returns None instead of a dict when the cache is warm. Please do not solve it for me directly.", + "eval_mode": "blind_compare", + "judge": "heuristic_socratic_debugging", + "min_score": 0.7, + "split": "train" + } + ] +} +``` + +Within one run, tasks must all belong to the same family. The optimizer rejects +mixed trigger-rate and blind body-compare task sets. + If no split markers are present, the loop performs a reproducible random split using `--train-split` and seed `42`. @@ -216,16 +260,20 @@ When beam search is enabled: When `--beam-width 1 --candidates-per-parent 1`, the behavior collapses back to the original single-path optimizer. -## Description-Only Scope +## Optimization Scopes -The current optimizer mutates only the YAML frontmatter `description`. 
+The optimizer supports two mutation scopes: -- `generate_variant.py` asks the model for an improved description, not a full-file rewrite -- the variant file is reconstructed by replacing that one field in the original content -- eval results therefore measure the same surface area that generation changes +- `description-only`: replace only the YAML frontmatter `description` +- `body-only`: replace only the markdown body below the frontmatter -This avoids false negatives where the model improves routing blocks or body text -that the evaluator does not read. +`generate_variant.py` reconstructs the full file around the selected scope so +the unchanged parts stay intact. Use `description-only` for routing-trigger +work and `body-only` for behavioral work judged from the skill's actual output. + +For body optimization, pair `--optimization-scope body-only` with +`blind_compare` tasks so generation and evaluation are measuring the same +surface area. ## Iteration Artifacts @@ -253,16 +301,45 @@ When `--report` is set, it also writes a live HTML dashboard showing: What is currently demonstrated: - deterministic end-to-end improvement runs with readable artifacts - isolated live optimization for existing registered skills via temporary git worktrees +- blind body-eval runs that require actual skill-trigger evidence before scoring - score calculations and accept/reject decisions that match the weighted rubric - short live proof on `skills/read-only-ops/SKILL.md` using `references/read-only-ops-short-tasks.json`, improving from one failed positive to `2/2` live passes after the accepted description update +- short live body optimization on `skills/socratic-debugging/SKILL.md` using + `references/socratic-debugging-body-short-tasks.json`, improving from `7.85` + to `8.45` after the accepted instruction-body update What remains imperfect: - live optimization of temporary renamed skill copies still fails to show measured improvement through the dynamic command alias path So 
the current tooling is operational for real registered skills and deterministic proof runs, but not yet fully proven for arbitrary temporary renamed clones. +## Short Live Commands + +Routing optimization on a real registered skill: + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/read-only-ops/SKILL.md \ + --goal "Improve read-only routing precision for realistic user prompts." \ + --benchmark-tasks skills/agent-comparison/references/read-only-ops-short-tasks.json +``` + +Body optimization on a real registered skill: + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/socratic-debugging/SKILL.md \ + --goal "Improve the first response so it asks exactly one question, avoids direct diagnosis, avoids code examples, and does not add tool-permission preamble." \ + --benchmark-tasks skills/agent-comparison/references/socratic-debugging-body-short-tasks.json \ + --optimization-scope body-only +``` + +The blind body path now fails closed: if the intended skill does not trigger, or +the response falls back into tool-blocked/direct-guidance chatter, the run is +scored as a failure instead of being treated as a weak improvement. + ## Choosing Good Eval Tasks 1. Include both positive and negative prompts. diff --git a/skills/agent-comparison/references/socratic-debugging-body-short-tasks.json b/skills/agent-comparison/references/socratic-debugging-body-short-tasks.json new file mode 100644 index 00000000..457b7240 --- /dev/null +++ b/skills/agent-comparison/references/socratic-debugging-body-short-tasks.json @@ -0,0 +1,12 @@ +{ + "tasks": [ + { + "name": "socratic-first-turn", + "query": "Help me think through this bug. My Python script sometimes returns None instead of a dict when the cache is warm. 
Please do not solve it for me directly.", + "eval_mode": "blind_compare", + "judge": "heuristic_socratic_debugging", + "min_score": 0.7, + "split": "train" + } + ] +} diff --git a/skills/agent-comparison/scripts/generate_variant.py b/skills/agent-comparison/scripts/generate_variant.py index f0707507..1a35aa46 100644 --- a/skills/agent-comparison/scripts/generate_variant.py +++ b/skills/agent-comparison/scripts/generate_variant.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 -"""Generate a description-optimized variant of an agent/skill file using Claude Code. +"""Generate an optimized variant of an agent/skill file using Claude Code. -Proposes a new frontmatter description based on the optimization goal and -previous iteration failures, then applies that description back onto the -original file. This keeps the optimization surface aligned with what the -evaluator actually measures. +Supports two optimization scopes: +- description-only: mutate frontmatter description only +- body-only: mutate the markdown body only Pattern: uses `claude -p` so generation runs through Claude Code directly. 
@@ -173,6 +172,40 @@ def replace_description(content: str, new_description: str) -> str: return "\n".join(rebuilt_lines) +def extract_body(content: str) -> str: + """Extract markdown body content after frontmatter.""" + lines = content.split("\n") + if not lines or lines[0].strip() != "---": + raise ValueError("Content missing frontmatter opening delimiter") + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + raise ValueError("Content missing frontmatter closing delimiter") + return "\n".join(lines[end_idx + 1 :]) + + +def replace_body(content: str, new_body: str) -> str: + """Replace the markdown body while preserving frontmatter verbatim.""" + lines = content.split("\n") + if not lines or lines[0].strip() != "---": + raise ValueError("Content missing frontmatter opening delimiter") + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + raise ValueError("Content missing frontmatter closing delimiter") + rebuilt_lines = [*lines[: end_idx + 1], *new_body.splitlines()] + rebuilt = "\n".join(rebuilt_lines) + if content.endswith("\n") and not rebuilt.endswith("\n"): + rebuilt += "\n" + return rebuilt + + # --------------------------------------------------------------------------- # Variant generation # --------------------------------------------------------------------------- @@ -237,6 +270,7 @@ def generate_variant( current_content: str, failures: list[dict], model: str | None, + optimization_scope: str = "description-only", history: list[dict] | None = None, diversification_note: str | None = None, ) -> dict: @@ -289,8 +323,10 @@ def generate_variant( removed even if removing them would improve test scores.""" current_description = extract_description(current_content) + current_body = extract_body(current_content) - prompt = f"""You are optimizing an agent/skill file to improve its trigger 
performance. + if optimization_scope == "description-only": + prompt = f"""You are optimizing an agent/skill file to improve its trigger performance. Target file: {target_path} Optimization goal: {goal} @@ -346,23 +382,72 @@ def generate_variant( [why any removed section was replaced safely, or leave blank] """ + text, raw_result_text, tokens_used = _run_claude_code(prompt, model) + + description_match = re.search(r"(.*?)", text, re.DOTALL) + if description_match: + new_payload = description_match.group(1).strip() + else: + variant_match = re.search(r"(.*?)", text, re.DOTALL) + if not variant_match: + print("Error: No or tags in response", file=sys.stderr) + sys.exit(1) + legacy_variant = variant_match.group(1).strip() + new_payload = extract_description(legacy_variant) + + variant = replace_description(current_content, new_payload) + elif optimization_scope == "body-only": + prompt = f"""You are optimizing an agent/skill file to improve its behavioral quality. - text, raw_result_text, tokens_used = _run_claude_code(prompt, model) +Target file: {target_path} +Optimization goal: {goal} - # Parse improved description. Accept legacy output as a fallback - # so older prompt responses and tests remain parseable. - description_match = re.search(r"(.*?)", text, re.DOTALL) - if description_match: - new_description = description_match.group(1).strip() - else: - variant_match = re.search(r"(.*?)", text, re.DOTALL) - if not variant_match: - print("Error: No or tags in response", file=sys.stderr) - sys.exit(1) - legacy_variant = variant_match.group(1).strip() - new_description = extract_description(legacy_variant) +Current content of the file: + +{current_content} + +Current body: + +{current_body} + +{failure_section}{history_section}{diversification_section}{protected_notice} - variant = replace_description(current_content, new_description) +SAFETY RULES: +1. Optimize ONLY the markdown body after the YAML frontmatter. 
+ Do not modify the frontmatter, skill name, description, routing, tools, or version. +2. Keep the skill faithful to its current purpose. Improve how it behaves, not what broad domain it covers. +3. Preserve headings and protected sections unless you have a clear reason to improve the body structure safely. +4. Prefer the smallest body change that addresses the failed tasks and improves behavioral quality. + +Please respond with ONLY the improved body text inside tags and a brief summary inside tags. +Do not return the full file. + + +[improved markdown body only] + + + +[1-2 sentence description of the change] + + + +[why any removed section was replaced safely, or leave blank] +""" + text, raw_result_text, tokens_used = _run_claude_code(prompt, model) + body_match = re.search(r"(.*?)", text, re.DOTALL) + if body_match: + new_payload = body_match.group(1).strip("\n") + else: + variant_match = re.search(r"(.*?)", text, re.DOTALL) + if not variant_match: + print("Error: No or tags in response", file=sys.stderr) + sys.exit(1) + legacy_variant = variant_match.group(1).strip() + new_payload = extract_body(legacy_variant) + + variant = replace_body(current_content, new_payload) + else: + raise ValueError(f"Unsupported optimization_scope: {optimization_scope}") # Parse summary summary_match = re.search(r"(.*?)", text, re.DOTALL) @@ -404,6 +489,12 @@ def main(): parser.add_argument("--history", default="[]", help="JSON list of previous iterations") parser.add_argument("--diversification-note", default=None, help="Optional search diversification hint") parser.add_argument("--model", default=None, help="Optional Claude Code model override") + parser.add_argument( + "--optimization-scope", + choices=["description-only", "body-only"], + default="description-only", + help="Which part of the file to mutate", + ) args = parser.parse_args() try: @@ -429,6 +520,7 @@ def main(): current_content=current_content, failures=failures, model=args.model, + 
optimization_scope=args.optimization_scope, history=history if history else None, diversification_note=args.diversification_note, ) diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index b8987c34..e23315f5 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -21,7 +21,9 @@ import argparse import concurrent.futures +import contextlib import glob +import hashlib import json import os import random @@ -167,6 +169,7 @@ def _generate_variant_output( model: str | None, dry_run: bool, iteration_number: int, + optimization_scope: str, diversification_note: str | None = None, ) -> dict: """Generate a candidate variant either synthetically or through Claude Code.""" @@ -197,6 +200,8 @@ def _generate_variant_output( json.dumps(last_failures), "--history", json.dumps(history), + "--optimization-scope", + optimization_scope, ] if diversification_note: variant_cmd.extend(["--diversification-note", diversification_note]) @@ -256,6 +261,13 @@ def _build_report_data( } +def _iteration_entry_by_number(iterations: list[dict], number: int) -> dict | None: + for entry in iterations: + if entry.get("number") == number: + return entry + return None + + def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: """Generate iteration history HTML report. 
@@ -596,6 +608,10 @@ def _is_behavioral_task(task: dict) -> bool: return "query" in task and "should_trigger" in task and task.get("eval_mode") == "behavioral" +def _is_blind_compare_task(task: dict) -> bool: + return "query" in task and task.get("eval_mode") == "blind_compare" and "judge" in task + + def _validate_task_set(tasks: list[dict]) -> None: """Reject unsupported or mixed task formats early with a clear error.""" if not tasks: @@ -604,18 +620,22 @@ def _validate_task_set(tasks: list[dict]) -> None: trigger_tasks = sum(1 for task in tasks if _is_trigger_task(task)) pattern_tasks = sum(1 for task in tasks if _is_pattern_task(task)) behavioral_tasks = sum(1 for task in tasks if _is_behavioral_task(task)) + blind_compare_tasks = sum(1 for task in tasks if _is_blind_compare_task(task)) # behavioral tasks are a subset of trigger tasks (same base fields), so subtract them # to avoid double-counting when checking for pure trigger-rate sets - pure_trigger_tasks = trigger_tasks - behavioral_tasks + pure_trigger_tasks = trigger_tasks - behavioral_tasks - blind_compare_tasks - if (pure_trigger_tasks or behavioral_tasks) and pattern_tasks: + if (pure_trigger_tasks or behavioral_tasks or blind_compare_tasks) and pattern_tasks: raise ValueError( "Task file mixes trigger-rate/behavioral and pattern benchmark formats. Use one format per run." ) - if behavioral_tasks and pure_trigger_tasks: - raise ValueError("Task file mixes trigger-rate and behavioral eval modes. Use one eval_mode per run.") + if sum(1 for n in [behavioral_tasks > 0, pure_trigger_tasks > 0, blind_compare_tasks > 0] if n) > 1: + raise ValueError("Task file mixes trigger-rate, behavioral, and blind-compare eval modes. 
Use one eval_mode per run.") + + if blind_compare_tasks == len(tasks): + return if behavioral_tasks == len(tasks): return @@ -645,6 +665,7 @@ def _run_trigger_rate( eval_mode: str = "auto", num_workers: int = 1, timeout: int = 30, + runs_per_query: int = 3, verbose: bool = False, ) -> dict: """Run trigger-rate assessment using the skill_eval infrastructure. @@ -681,7 +702,7 @@ def _run_trigger_rate( "--timeout", str(timeout), "--runs-per-query", - "1", + str(runs_per_query), ] if candidate_content is not None: with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as candidate_file: @@ -724,6 +745,353 @@ def _run_trigger_rate( if task_file: Path(task_file).unlink(missing_ok=True) +# --------------------------------------------------------------------------- +# Blind comparative behavioral evaluator +# --------------------------------------------------------------------------- + + +def _find_project_root() -> Path: + project_root = Path.cwd() + for parent in [project_root, *project_root.parents]: + if (parent / ".claude").is_dir(): + return parent + return project_root + + +def _resolve_registered_skill_relpath(target_path: Path, project_root: Path) -> Path: + resolved = target_path.resolve() + try: + rel = resolved.relative_to(project_root.resolve()) + except ValueError as exc: + raise ValueError("blind_compare eval requires a target under the current project root") from exc + if len(rel.parts) >= 3 and rel.parts[0] == "skills" and rel.parts[-1] == "SKILL.md": + return rel + raise ValueError("blind_compare eval currently supports real registered skills under skills/*/SKILL.md only") + + +@contextlib.contextmanager +def _candidate_worktree(project_root: Path, relpath: Path, content: str): + wt_path_str = tempfile.mkdtemp(prefix="blind-eval-wt-", dir="/tmp") + wt_path = Path(wt_path_str) + wt_path.rmdir() + try: + subprocess.run( + ["git", "worktree", "add", wt_path_str, "HEAD"], + cwd=str(project_root), + capture_output=True, + check=True, + ) + 
(wt_path / relpath).write_text(content) + yield wt_path + finally: + try: + subprocess.run( + ["git", "worktree", "remove", "--force", wt_path_str], + cwd=str(project_root), + capture_output=True, + ) + except Exception: + pass + shutil.rmtree(wt_path_str, ignore_errors=True) + + +def _extract_registered_skill_ids(relpath: Path, content: str) -> set[str]: + ids = {relpath.as_posix()} + if len(relpath.parts) >= 2: + ids.add(relpath.parts[1]) + match = re.search(r"^name:\s*(.+)$", content, re.MULTILINE) + if match: + ids.add(match.group(1).strip().strip("\"'")) + return {value for value in ids if value} + + +def _assistant_message_triggered_skill(message: dict, accepted_skill_ids: set[str]) -> bool: + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and any(skill_id in tool_input.get("skill", "") for skill_id in accepted_skill_ids): + return True + if tool_name == "Read" and any(skill_id in tool_input.get("file_path", "") for skill_id in accepted_skill_ids): + return True + return False + + +def _contains_fallback_contamination(output: str) -> tuple[bool, list[str]]: + lowered = output.lower() + reasons = [] + contamination_markers = { + "skill tool was blocked": "mentioned blocked skill tool", + "tool was blocked": "mentioned blocked tool access", + "i'll guide you through this directly": "fell back to direct guidance", + "i can still help directly": "fell back to direct guidance", + "instead of using the skill": "mentioned skill fallback mode", + "mode announcement": "included mode/meta announcement", + "tool-permission": "mentioned tool permission", + } + for marker, reason in contamination_markers.items(): + if marker in lowered: + reasons.append(reason) + return bool(reasons), reasons + + +def _run_query_capture_output(query: str, cwd: Path, accepted_skill_ids: set[str], timeout: int = 180) -> 
dict: + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + result = subprocess.run( + [ + "claude", + "-p", + query, + "--output-format", + "stream-json", + "--verbose", + "--include-partial-messages", + "--permission-mode", + "bypassPermissions", + ], + capture_output=True, + text=True, + cwd=str(cwd), + env=env, + timeout=timeout, + ) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or f"claude -p exited {result.returncode}") + + assistant_text: list[str] = [] + raw_result = "" + triggered = False + pending_tool_name = None + accumulated_json = "" + + for raw_line in result.stdout.splitlines(): + line = raw_line.strip() + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in {"Skill", "Read"}: + pending_tool_name = tool_name + accumulated_json = "" + else: + pending_tool_name = None + accumulated_json = "" + elif se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if any(skill_id in accumulated_json for skill_id in accepted_skill_ids): + triggered = True + elif se_type in {"content_block_stop", "message_stop"} and pending_tool_name: + if any(skill_id in accumulated_json for skill_id in accepted_skill_ids): + triggered = True + pending_tool_name = None + accumulated_json = "" + + if event.get("type") == "assistant": + message = event.get("message", {}) + if _assistant_message_triggered_skill(message, accepted_skill_ids): + triggered = True + for content in message.get("content", []): + if content.get("type") == "text": + assistant_text.append(content.get("text", "")) + elif 
event.get("type") == "result": + raw_result = event.get("result", "") + + output = "".join(assistant_text).strip() or raw_result.strip() + contaminated, contamination_reasons = _contains_fallback_contamination(output) + return { + "output": output, + "triggered": triggered, + "contaminated": contaminated, + "contamination_reasons": contamination_reasons, + } + + +def _score_socratic_question_only_output(output: str) -> tuple[float, list[str]]: + stripped = output.strip() + lowered = stripped.lower() + reasons: list[str] = [] + score = 0.0 + + question_marks = stripped.count("?") + if question_marks == 1: + score += 0.45 + reasons.append("asked exactly one question") + elif question_marks == 0: + reasons.append("asked no question") + else: + score += max(0.0, 0.20 - (question_marks - 2) * 0.10) + reasons.append(f"asked {question_marks} questions") + + if stripped.endswith("?"): + score += 0.15 + reasons.append("ended on a question") + else: + reasons.append("did not end on a question") + + starters = ("what ", "when ", "where ", "which ", "can ", "could ", "did ", "is ", "are ", "have ") + if any(lowered.startswith(starter) for starter in starters): + score += 0.15 + reasons.append("opened directly with a question") + else: + reasons.append("did not open directly with a question") + + first_sentence = lowered.split("?")[0] + preamble_markers = ["let me", "i'll", "i will", "we'll", "we will", "let's", "before we", "looking at"] + if any(marker in first_sentence for marker in preamble_markers): + score -= 0.30 + reasons.append("included preamble before the first question") + + direct_answer_markers = [ + "common mistake", + "classic", + "the issue is", + "the problem is", + "the bug is", + "you should", + "fix this by", + "the root cause", + "likely cause", + "think about code like", + "vs.", + "return cache.get", + "poison the cache", + ] + if any(marker in lowered for marker in direct_answer_markers): + score -= 0.35 + reasons.append("gave direct diagnosis/advice") 
+ else: + score += 0.15 + reasons.append("avoided direct diagnosis") + + if "```" in output: + score -= 0.15 + reasons.append("included code block") + else: + score += 0.10 + reasons.append("no code block") + + if len(stripped) <= 450: + score += 0.10 + reasons.append("kept first turn concise") + else: + reasons.append("first response was long") + + return max(0.0, min(1.0, round(score, 4))), reasons + + +def _score_output_with_judge(task: dict, output: str) -> tuple[float, list[str]]: + judge = task.get("judge") + if judge in {"socratic_question_only", "heuristic_socratic_debugging"}: + return _score_socratic_question_only_output(output) + raise ValueError(f"Unsupported blind_compare judge: {judge}") + + +def _run_blind_compare_eval( + target_path: Path, + candidate_content: str, + tasks: list[dict], + baseline_content: str | None = None, + timeout: int = 180, + verbose: bool = False, +) -> list[dict]: + """Run blind comparative evaluation for real registered skills.""" + project_root = _find_project_root() + relpath = _resolve_registered_skill_relpath(target_path, project_root) + baseline_source = baseline_content if baseline_content is not None else candidate_content + candidate_skill_ids = _extract_registered_skill_ids(relpath, candidate_content) + baseline_skill_ids = _extract_registered_skill_ids(relpath, baseline_source) + + results: list[dict] = [] + for task in tasks: + query = task["query"] + if baseline_source == candidate_content: + with _candidate_worktree(project_root, relpath, candidate_content) as candidate_wt: + candidate_capture = _run_query_capture_output( + query, candidate_wt, candidate_skill_ids, timeout=timeout + ) + baseline_capture = dict(candidate_capture) + else: + with _candidate_worktree(project_root, relpath, baseline_source) as baseline_wt: + baseline_capture = _run_query_capture_output( + query, baseline_wt, baseline_skill_ids, timeout=timeout + ) + with _candidate_worktree(project_root, relpath, candidate_content) as candidate_wt: + 
candidate_capture = _run_query_capture_output( + query, candidate_wt, candidate_skill_ids, timeout=timeout + ) + + baseline_output = baseline_capture["output"] + candidate_output = candidate_capture["output"] + + baseline_score, baseline_reasons = _score_output_with_judge(task, baseline_output) + candidate_score, candidate_reasons = _score_output_with_judge(task, candidate_output) + + if not baseline_capture["triggered"]: + baseline_score = 0.0 + baseline_reasons = ["target skill did not trigger", *baseline_reasons] + if baseline_capture["contaminated"]: + baseline_score = 0.0 + baseline_reasons = [*baseline_capture["contamination_reasons"], *baseline_reasons] + if not candidate_capture["triggered"]: + candidate_score = 0.0 + candidate_reasons = ["target skill did not trigger", *candidate_reasons] + if candidate_capture["contaminated"]: + candidate_score = 0.0 + candidate_reasons = [*candidate_capture["contamination_reasons"], *candidate_reasons] + + seed = int(hashlib.sha256(query.encode()).hexdigest()[:8], 16) + if seed % 2 == 0: + label_map = {"A": "baseline", "B": "candidate"} + else: + label_map = {"A": "candidate", "B": "baseline"} + + if candidate_score > baseline_score: + winner = "candidate" + elif candidate_score < baseline_score: + winner = "baseline" + else: + winner = "tie" + + if verbose: + print( + f"[blind-compare] {query[:60]!r}: baseline={baseline_score:.2f}, candidate={candidate_score:.2f}, winner={winner}", + file=sys.stderr, + ) + + results.append( + { + "query": query, + "judge": task.get("judge"), + "candidate_score": candidate_score, + "baseline_score": baseline_score, + "candidate_output": candidate_output, + "baseline_output": baseline_output, + "candidate_reasons": candidate_reasons, + "baseline_reasons": baseline_reasons, + "candidate_triggered": candidate_capture["triggered"], + "baseline_triggered": baseline_capture["triggered"], + "candidate_contaminated": candidate_capture["contaminated"], + "baseline_contaminated": 
baseline_capture["contaminated"], + "winner": winner, + "label_map": label_map, + "passed": candidate_score >= float(task.get("min_score", 0.7)), + } + ) + return results + # --------------------------------------------------------------------------- # Behavioral evaluator (runs claude -p and checks for artifact creation) @@ -984,14 +1352,14 @@ def _run_behavioral_eval( } return results - # Sequential path (parallel_workers <= 1): run tasks one at a time in project_root. + # Sequential path: still use isolated worktrees so tasks cannot mutate the real repo + # or contaminate each other by editing tracked files. sequential_results = [] for task in tasks: sequential_results.append( - _run_single_behavioral_task( + _run_single_behavioral_task_in_worktree( task=task, project_root=project_root, - worktree_path=project_root, env=env, timeout=timeout, verbose=verbose, @@ -1017,6 +1385,7 @@ def assess_target( behavioral_trigger_threshold: float = 0.5, parallel_eval_workers: int = 0, candidate_content: str | None = None, + baseline_content: str | None = None, eval_mode: str = "auto", ) -> dict: """Assess a target file against tasks. 
@@ -1080,7 +1449,8 @@ def assess_target( # Detect assessment mode from task format is_behavioral = all(_is_behavioral_task(task) for task in tasks) - is_trigger = not is_behavioral and all(_is_trigger_task(task) for task in tasks) + is_blind_compare = all(_is_blind_compare_task(task) for task in tasks) + is_trigger = not is_behavioral and not is_blind_compare and all(_is_trigger_task(task) for task in tasks) if is_trigger: task_expectations = {task.get("query", ""): task.get("should_trigger") for task in tasks} @@ -1090,6 +1460,7 @@ def assess_target( tasks, candidate_content=content, eval_mode=eval_mode, + runs_per_query=max(1, behavioral_runs_per_task), verbose=verbose, ) summary = results.get("summary", {}) @@ -1158,6 +1529,53 @@ def assess_target( ) return scores + if is_blind_compare: + compare_results = _run_blind_compare_eval( + target_path, + content, + tasks, + baseline_content=baseline_content, + verbose=verbose, + ) + total = len(compare_results) + if total == 0: + return scores + + absolute_quality = sum(r.get("candidate_score", 0.0) for r in compare_results) / total + wins = sum(1 for r in compare_results if r.get("winner") == "candidate") + ties = sum(1 for r in compare_results if r.get("winner") == "tie") + comparative_quality = (wins + 0.5 * ties) / total + + scores["correctness"] = round(absolute_quality * 10, 2) + scores["error_handling"] = round(absolute_quality * 8, 2) + scores["language_idioms"] = round(absolute_quality * 7, 2) + scores["testing"] = round(comparative_quality * 8.0, 2) + scores["efficiency"] = round(min(1.0, absolute_quality + 0.1) * 6, 2) + scores["tests_pass"] = all(r.get("passed", False) for r in compare_results) + + for r in compare_results: + scores["task_results"].append( + { + "name": r.get("query", "unnamed")[:40], + "query": r.get("query", ""), + "passed": r.get("passed", False), + "score": r.get("candidate_score", 0.0), + "details": ( + f"winner={r.get('winner')}; candidate={r.get('candidate_score', 0.0):.2f}; " + 
f"baseline={r.get('baseline_score', 0.0):.2f}; " + f"candidate_reasons={', '.join(r.get('candidate_reasons', []))}" + ), + "winner": r.get("winner"), + "candidate_score": r.get("candidate_score", 0.0), + "baseline_score": r.get("baseline_score", 0.0), + "candidate_output": r.get("candidate_output", ""), + "baseline_output": r.get("baseline_output", ""), + "candidate_reasons": r.get("candidate_reasons", []), + "baseline_reasons": r.get("baseline_reasons", []), + } + ) + return scores + # Benchmark behavioral assessment — not yet implemented. # Use trigger-rate tasks ('query' + 'should_trigger') or behavioral tasks # ('query' + 'should_trigger' + 'eval_mode: behavioral') per ADR-132. @@ -1212,6 +1630,7 @@ def run_optimization_loop( behavioral_trigger_threshold: float = 0.5, parallel_eval: int = 0, eval_mode: str = "auto", + optimization_scope: str = "description-only", ) -> dict: """Run the autoresearch optimization loop.""" if beam_width < 1: @@ -1252,7 +1671,7 @@ def run_optimization_loop( if not target_valid or not target_description: raise ValueError( "Target must have YAML frontmatter with a non-empty description. " - "optimize_loop.py currently supports frontmatter-description optimization only." + "optimize_loop.py requires valid SKILL.md-style frontmatter." 
) target_label = target_path.name @@ -1364,6 +1783,7 @@ def run_optimization_loop( model=model, dry_run=dry_run, iteration_number=iteration_counter, + optimization_scope=optimization_scope, diversification_note=diversification_note, ) variant_content = variant_output["variant"] @@ -1479,6 +1899,7 @@ def run_optimization_loop( behavioral_trigger_threshold, effective_parallel_eval, candidate_content=variant_content, + baseline_content=parent["content"], eval_mode=eval_mode, ) eval_elapsed = time.time() - t0 @@ -1584,11 +2005,14 @@ def run_optimization_loop( behavioral_trigger_threshold, effective_parallel_eval, candidate_content=best_content, + baseline_content=original_content, eval_mode=eval_mode, ) holdout_composite = composite_score(holdout_scores) - if iterations: - iterations[-1]["score"]["test"] = holdout_composite + if best_iteration > 0: + best_iteration_entry = _iteration_entry_by_number(iterations, best_iteration) + if best_iteration_entry is not None: + best_iteration_entry["score"]["test"] = holdout_composite if holdout_diverges(best_score, holdout_composite, baseline_holdout, baseline_composite): if verbose: @@ -1630,33 +2054,33 @@ def run_optimization_loop( exit_reason = f"max_iterations ({max_iterations})" status = "COMPLETE" - # Final report - if report_path: - rd = _build_report_data( - target_label, - goal, - baseline_composite, - baseline_holdout, - len(train_tasks), - len(test_tasks), - iterations, - max_iterations, - status, - total_tokens, - ) - rd["search"] = { - "strategy": "beam", - "beam_width": beam_width, - "candidates_per_parent": candidates_per_parent, - "holdout_check_cadence": holdout_check_cadence, - } - report_path.write_text(generate_optimization_report(rd, auto_refresh=False)) - # Best-by-test selection: if test tasks exist, prefer the ACCEPT iteration with the # highest held-out test score rather than the highest training score (anti-Goodhart). 
best_test_score: float | None = None if test_tasks and keep_contents: - # Find iterations with a recorded test score (set during holdout cadence checks) + for keep_iter, keep_content in keep_contents.items(): + entry = _iteration_entry_by_number(iterations, keep_iter) + if entry is not None and entry["score"].get("test") is not None: + continue + final_test_scores = assess_target( + target_path, + test_tasks, + goal, + verbose, + dry_run, + behavioral_runs_per_task, + behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=keep_content, + baseline_content=original_content, + eval_mode=eval_mode, + ) + keep_test_score = composite_score(final_test_scores) + if entry is not None: + entry["score"]["test"] = keep_test_score + if verbose: + print(f"Recorded final test eval for iter {keep_iter}: test={keep_test_score:.4f}", file=sys.stderr) + scored_keeps = [ (it["number"], it["score"]["test"]) for it in iterations @@ -1674,24 +2098,6 @@ def run_optimization_loop( ) best_content = keep_contents[best_test_iter] best_iteration = best_test_iter - else: - # No holdout-checked ACCEPT iterations — run a final test eval on best_content - if best_iteration > 0: - final_test_scores = assess_target( - target_path, - test_tasks, - goal, - verbose, - dry_run, - behavioral_runs_per_task, - behavioral_trigger_threshold, - effective_parallel_eval, - candidate_content=best_content, - eval_mode=eval_mode, - ) - best_test_score = composite_score(final_test_scores) - if verbose: - print(f"Final test eval on best_content: test={best_test_score:.4f}", file=sys.stderr) if best_iteration > 0: best_path = output_dir / "best_variant.md" @@ -1699,6 +2105,29 @@ def run_optimization_loop( if verbose: print(f"\nBest variant saved to: {best_path}", file=sys.stderr) + # Final report: write only after any final held-out evaluations and best-by-test + # selection so the HTML matches the finalized iterations/results.json state. 
+ if report_path: + rd = _build_report_data( + target_label, + goal, + baseline_composite, + baseline_holdout, + len(train_tasks), + len(test_tasks), + iterations, + max_iterations, + status, + total_tokens, + ) + rd["search"] = { + "strategy": "beam", + "beam_width": beam_width, + "candidates_per_parent": candidates_per_parent, + "holdout_check_cadence": holdout_check_cadence, + } + report_path.write_text(generate_optimization_report(rd, auto_refresh=False)) + result = { "exit_reason": exit_reason, "status": status, @@ -1718,6 +2147,7 @@ def run_optimization_loop( "beam_width": beam_width, "candidates_per_parent": candidates_per_parent, "holdout_check_cadence": holdout_check_cadence, + "optimization_scope": optimization_scope, "train_size": len(train_tasks), "test_size": len(test_tasks), "iterations": iterations, @@ -1794,6 +2224,12 @@ def main(): default="auto", help="Trigger evaluator mode (default: auto; prefers registered-skill worktree eval when possible)", ) + parser.add_argument( + "--optimization-scope", + choices=["description-only", "body-only"], + default="description-only", + help="Which part of the file to mutate (default: description-only)", + ) args = parser.parse_args() target = Path(args.target) @@ -1827,6 +2263,7 @@ def main(): behavioral_trigger_threshold=args.behavioral_trigger_threshold, parallel_eval=args.parallel_eval, eval_mode=args.eval_mode, + optimization_scope=args.optimization_scope, ) except ValueError as e: print(f"Error: {e}", file=sys.stderr) diff --git a/skills/do/references/routing-tables.md b/skills/do/references/routing-tables.md index 41e9af10..f95e3843 100644 --- a/skills/do/references/routing-tables.md +++ b/skills/do/references/routing-tables.md @@ -111,10 +111,10 @@ Route to these agents based on the user's task domain. Each entry describes what | **code-cleanup** | User wants to remove stale TODOs, unused code, dead imports, or generally clean up accumulated debt. 
| | **comment-quality** | User wants to audit code comments for accuracy, temporal references, or staleness. | | **agent-evaluation** | User wants to grade or evaluate a skill, agent, or pipeline for quality and standards compliance. NOT: evaluating code output or test results. | -| **agent-comparison** | User wants to A/B test two agents or compare their outputs on the same task. | +| **agent-comparison** | User wants to A/B test agents, run autoresearch, optimize a skill description, or optimize a skill body with benchmark tasks. | | **agent-upgrade** | User wants to audit and systematically improve a specific agent to bring it up to current template standards. | | **testing-agents-with-subagents** | User wants to validate an agent by running it against real test cases in subagents. | -| **skill-eval** | User wants to improve a skill through measured testing, optimize its description, or benchmark it against scenarios. | +| **skill-eval** | User wants to evaluate a skill, test triggers manually, benchmark it against scenarios, or inspect skill quality without running the autoresearch optimizer. | | **full-repo-review** | User wants a comprehensive 3-wave review of all source files in the entire repository. | | **repo-value-analysis** | User wants to systematically analyze an external repository to determine what ideas or patterns are worth adopting. | | **data-analysis** | User wants to analyze data: CSV files, metrics, A/B test results, cohort analysis, statistical distributions, KPIs, or funnel data. | diff --git a/skills/skill-eval/SKILL.md b/skills/skill-eval/SKILL.md index ba89f3b1..f36a04b4 100644 --- a/skills/skill-eval/SKILL.md +++ b/skills/skill-eval/SKILL.md @@ -2,12 +2,12 @@ name: skill-eval description: | Evaluate and improve skills through measured testing. 
Run trigger evaluations - to test whether skill descriptions cause correct activation, optimize - descriptions via automated train/test loops, benchmark skill output quality + to test whether skill descriptions cause correct activation, benchmark skill output quality with A/B comparisons, and validate skill structure. Use when user says - "improve skill", "test skill triggers", "optimize description", "benchmark + "improve skill", "test skill triggers", "benchmark skill", "eval skill", or "skill quality". Do NOT use for creating new skills - (use skill-creator). + (use skill-creator). Route autoresearch loops for description/body optimization + to agent-comparison. version: 1.0.0 user-invocable: false argument-hint: "" @@ -23,7 +23,6 @@ routing: - improve skill - test skill - eval skill - - optimize description - benchmark skill - skill triggers - skill quality @@ -56,7 +55,7 @@ This checks: SKILL.md exists, valid frontmatter, required fields (name, descript | Intent | Mode | Script | |--------|------|--------| | "Test if description triggers correctly" | Trigger eval | `run_eval.py` | -| "Optimize/improve the description" | Description optimization | `run_loop.py` | +| "Optimize/improve the description through autoresearch" | Route to `agent-comparison` | `optimize_loop.py` | | "Compare skill vs no-skill output" | Output benchmark | Manual + `aggregate_benchmark.py` | | "Validate skill structure" | Quick validate | `quick_validate.py` | diff --git a/skills/socratic-debugging/SKILL.md b/skills/socratic-debugging/SKILL.md index 486c4ae8..719ad3ee 100644 --- a/skills/socratic-debugging/SKILL.md +++ b/skills/socratic-debugging/SKILL.md @@ -26,7 +26,6 @@ routing: - "teach me to find it" category: process --- - # Socratic Debugging Skill ## Overview @@ -64,7 +63,7 @@ Follow these phases in order. Each phase builds evidence for the next. ### Execution Flow 1. **User describes the bug.** Read the relevant code silently using Read/Grep/Glob. -2. 
**Ask Phase 1 question.** Even if the bug seems obvious from the code, start with symptoms. Make the question pointed if the answer is likely simple. +2. **Ask Phase 1 question.** Your first response must be exactly one question. Do not diagnose the bug, include code references or examples, or mention the files you read. Even if the bug seems obvious from the code, start with symptoms. Make the question pointed if the answer is likely simple. 3. **Listen, acknowledge, ask next question.** Format: brief acknowledgment of what they said, then one question advancing toward root cause. 4. **Track question count.** After 12 questions with no progress toward root cause, trigger escalation offer. 5. **When user identifies root cause**, confirm their finding and ask what fix they would apply. Let the user propose the fix. @@ -99,7 +98,7 @@ Solution: Acknowledge the frustration. Offer escalation. If they want to continu ### Bug Is Trivially Obvious From Code Cause: A typo, missing import, or simple syntax error visible in the source -Solution: Still ask Phase 1, but make the question very pointed -- narrow enough that the user will see the answer immediately. Example: "What do you expect `reponse.data` to contain?" (the typo in the variable name is the bug). Follow phase progression; pointed questions stay within the Socratic framework. +Solution: Still ask Phase 1, but make the question very pointed -- narrow enough that the user will see the answer immediately. Example: "What do you expect that expression to evaluate to?" Follow phase progression; pointed questions stay within the Socratic framework. 
--- From 98ecf1a587affda2e09611359dd6dd30132b48cd Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 20:19:01 -0700 Subject: [PATCH 3/5] fix(socratic-debugging): tighten first-turn question discipline --- skills/socratic-debugging/SKILL.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skills/socratic-debugging/SKILL.md b/skills/socratic-debugging/SKILL.md index 719ad3ee..dc315c8c 100644 --- a/skills/socratic-debugging/SKILL.md +++ b/skills/socratic-debugging/SKILL.md @@ -63,14 +63,14 @@ Follow these phases in order. Each phase builds evidence for the next. ### Execution Flow 1. **User describes the bug.** Read the relevant code silently using Read/Grep/Glob. -2. **Ask Phase 1 question.** Your first response must be exactly one question. Do not diagnose the bug, include code references or examples, or mention the files you read. Even if the bug seems obvious from the code, start with symptoms. Make the question pointed if the answer is likely simple. +2. **Ask Phase 1 question.** Your first response must be exactly one question with no other text — no preamble, no diagnosis, no code references or examples, no mention of files you read, no announcement of tools used or planned. Even if the bug seems obvious from the code, start with symptoms. Make the question pointed if the answer is likely simple. 3. **Listen, acknowledge, ask next question.** Format: brief acknowledgment of what they said, then one question advancing toward root cause. 4. **Track question count.** After 12 questions with no progress toward root cause, trigger escalation offer. 5. **When user identifies root cause**, confirm their finding and ask what fix they would apply. Let the user propose the fix. ### Hints vs. Leading Questions -Questions may contain subtle directional hints. The goal is discovery, not suffering. A **good hint** directs attention without revealing the answer: "What happens if you log the value of `request.userId` right before line 42?" 
A **bad hint** is a leading question that contains the answer: "Could `request.userId` be null at line 42?" The line: open-ended questions that narrow focus are hints. Leading questions that contain the answer are violations. +Questions may contain subtle directional hints. The goal is discovery, not suffering. A **good hint** directs attention without revealing the answer: asking what a specific value is right before a failure. A **bad hint** is a leading question that contains the answer: asking whether a specific value could be null. The line: open-ended questions that narrow focus are hints. Leading questions that contain the answer are violations. ### Escalation Protocol @@ -98,7 +98,7 @@ Solution: Acknowledge the frustration. Offer escalation. If they want to continu ### Bug Is Trivially Obvious From Code Cause: A typo, missing import, or simple syntax error visible in the source -Solution: Still ask Phase 1, but make the question very pointed -- narrow enough that the user will see the answer immediately. Example: "What do you expect that expression to evaluate to?" Follow phase progression; pointed questions stay within the Socratic framework. +Solution: Still ask Phase 1, but make the question very pointed -- narrow enough that the user will see the answer immediately. Follow phase progression; pointed questions stay within the Socratic framework. 
--- From c10111dc3bea977f475b5cb0084790375af2e6a7 Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 20:57:50 -0700 Subject: [PATCH 4/5] fix(ci): address lint findings in autoresearch tests --- .../test_agent_comparison_optimize_loop.py | 6 +++++- scripts/tests/test_skill_eval_claude_code.py | 18 +++++++++++++++--- .../agent-comparison/scripts/optimize_loop.py | 4 ++-- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py index 710e3a91..9c80c66d 100644 --- a/scripts/tests/test_agent_comparison_optimize_loop.py +++ b/scripts/tests/test_agent_comparison_optimize_loop.py @@ -1093,8 +1093,12 @@ def fake_assess_target( "task_results": [{"name": "task", "passed": True}], } + def fake_generate_optimization_report(data, auto_refresh=False): + _ = auto_refresh + return json.dumps(data) + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) - monkeypatch.setattr(optimize_loop, "generate_optimization_report", lambda data, auto_refresh=False: json.dumps(data)) + monkeypatch.setattr(optimize_loop, "generate_optimization_report", fake_generate_optimization_report) report_path = tmp_path / "out" / "report.html" optimize_loop.run_optimization_loop( diff --git a/scripts/tests/test_skill_eval_claude_code.py b/scripts/tests/test_skill_eval_claude_code.py index b740b3fe..552ba3c9 100644 --- a/scripts/tests/test_skill_eval_claude_code.py +++ b/scripts/tests/test_skill_eval_claude_code.py @@ -101,7 +101,11 @@ def test_run_single_query_ignores_unrelated_stream_tool_use_before_matching_read payload = ("\n".join(json.dumps(line) for line in stream_lines) + "\n").encode() monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID()) - monkeypatch.setattr(mod.subprocess, "Popen", lambda *args, **kwargs: _FakePopen(payload)) + + def fake_popen(*_args, **_kwargs): + return _FakePopen(payload) + + monkeypatch.setattr(mod.subprocess, "Popen", 
fake_popen) monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], [])) triggered = mod.run_single_query( @@ -139,7 +143,11 @@ def test_run_single_query_scans_all_assistant_tool_uses_before_returning(monkeyp payload = ("\n".join(json.dumps(line) for line in assistant_lines) + "\n").encode() monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID()) - monkeypatch.setattr(mod.subprocess, "Popen", lambda *args, **kwargs: _FakePopen(payload)) + + def fake_popen(*_args, **_kwargs): + return _FakePopen(payload) + + monkeypatch.setattr(mod.subprocess, "Popen", fake_popen) monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], [])) triggered = mod.run_single_query( @@ -175,7 +183,11 @@ def test_run_single_query_accepts_real_skill_name_not_just_temporary_alias(monke payload = ("\n".join(json.dumps(line) for line in assistant_lines) + "\n").encode() monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID()) - monkeypatch.setattr(mod.subprocess, "Popen", lambda *args, **kwargs: _FakePopen(payload)) + + def fake_popen(*_args, **_kwargs): + return _FakePopen(payload) + + monkeypatch.setattr(mod.subprocess, "Popen", fake_popen) monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], [])) triggered = mod.run_single_query( diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index e23315f5..672b77f7 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -1482,7 +1482,7 @@ def assess_target( { "name": r.get("query", "unnamed")[:40], "query": r.get("query", ""), - "should_trigger": r.get("should_trigger", task_expectations.get(r.get("query", ""), None)), + "should_trigger": r.get("should_trigger", task_expectations.get(r.get("query", ""))), "trigger_rate": r.get("trigger_rate", 0.0), "passed": r.get("pass", False), "score": 1.0 if r.get("pass", False) else 0.0, @@ -1521,7 
+1521,7 @@ def assess_target( { "name": r.get("query", "unnamed")[:40], "query": r.get("query", ""), - "should_trigger": r.get("should_trigger", task_expectations.get(r.get("query", ""), None)), + "should_trigger": r.get("should_trigger", task_expectations.get(r.get("query", ""))), "passed": r.get("pass", False), "score": 1.0 if r.get("pass", False) else 0.0, "details": f"triggered={r.get('triggered')}, artifacts={artifact_summary}", From 281b32a8f088ab6ecd7838ed63179890078941fb Mon Sep 17 00:00:00 2001 From: notque Date: Sun, 29 Mar 2026 21:19:57 -0700 Subject: [PATCH 5/5] fix(lint): auto-format 3 files to pass ruff format CI check --- scripts/skill_eval/run_eval.py | 11 +++++--- .../test_agent_comparison_optimize_loop.py | 28 ++++++++++++++----- .../agent-comparison/scripts/optimize_loop.py | 17 +++++------ 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/scripts/skill_eval/run_eval.py b/scripts/skill_eval/run_eval.py index 7b3a509a..372a877a 100755 --- a/scripts/skill_eval/run_eval.py +++ b/scripts/skill_eval/run_eval.py @@ -102,7 +102,9 @@ def load_eval_set(path: Path) -> list[dict]: test = payload.get("test") if isinstance(train, list) or isinstance(test, list): return [*(train or []), *(test or [])] - raise ValueError("Unsupported eval set format; expected list, {tasks:[...]}, {queries:[...]}, or {train:[...], test:[...]}") + raise ValueError( + "Unsupported eval set format; expected list, {tasks:[...]}, {queries:[...]}, or {train:[...], test:[...]}" + ) @contextlib.contextmanager @@ -275,9 +277,10 @@ def run_single_query( continue tool_name = content_item.get("name", "") tool_input = content_item.get("input", {}) - if (tool_name == "Skill" and any( - skill_id in tool_input.get("skill", "") for skill_id in accepted_skill_ids - )) or ( + if ( + tool_name == "Skill" + and any(skill_id in tool_input.get("skill", "") for skill_id in accepted_skill_ids) + ) or ( tool_name == "Read" and any(skill_id in tool_input.get("file_path", "") for skill_id in 
accepted_skill_ids) ): diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py index 9c80c66d..21257019 100644 --- a/scripts/tests/test_agent_comparison_optimize_loop.py +++ b/scripts/tests/test_agent_comparison_optimize_loop.py @@ -642,7 +642,9 @@ def test_assess_target_scores_blind_compare_results(tmp_path, monkeypatch): target.write_text("---\ndescription: blind compare test\n---\n") tasks = [{"query": "help me debug this", "eval_mode": "blind_compare", "judge": "socratic_question_only"}] - def fake_run_blind_compare_eval(target_path, candidate_content, tasks, baseline_content=None, timeout=180, verbose=False): + def fake_run_blind_compare_eval( + target_path, candidate_content, tasks, baseline_content=None, timeout=180, verbose=False + ): assert baseline_content == "---\ndescription: baseline\n---\n" return [ { @@ -719,9 +721,7 @@ def test_socratic_question_only_heuristic_penalizes_preamble(): "skills/agent-comparison/scripts/optimize_loop.py", ) - clean_score, _ = optimize_loop._score_socratic_question_only_output( - "What did you expect the test to do?" - ) + clean_score, _ = optimize_loop._score_socratic_question_only_output("What did you expect the test to do?") preamble_score, _ = optimize_loop._score_socratic_question_only_output( "Let me read the skill first. What did you expect the test to do?" 
) @@ -1133,8 +1133,20 @@ def test_run_optimization_loop_forwards_parallel_eval_to_assessments(tmp_path, m json.dumps( { "tasks": [ - {"name": "train-positive", "query": "make a skill", "should_trigger": True, "eval_mode": "behavioral", "split": "train"}, - {"name": "test-negative", "query": "debug kubernetes", "should_trigger": False, "eval_mode": "behavioral", "split": "test"}, + { + "name": "train-positive", + "query": "make a skill", + "should_trigger": True, + "eval_mode": "behavioral", + "split": "train", + }, + { + "name": "test-negative", + "query": "debug kubernetes", + "should_trigger": False, + "eval_mode": "behavioral", + "split": "test", + }, ] } ) @@ -1219,7 +1231,9 @@ def test_tiny_end_to_end_autoresearch_improves_real_weak_skill_copy(tmp_path, mo trigger_query = "help me think through this bug step by step" tasks_file = tmp_path / "tasks.json" - tasks_file.write_text(json.dumps({"tasks": [{"name": "positive", "query": trigger_query, "should_trigger": True, "split": "train"}]})) + tasks_file.write_text( + json.dumps({"tasks": [{"name": "positive", "query": trigger_query, "should_trigger": True, "split": "train"}]}) + ) def fake_generate_variant_output( current_content, diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index 672b77f7..b5ee4eaf 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -632,7 +632,9 @@ def _validate_task_set(tasks: list[dict]) -> None: ) if sum(1 for n in [behavioral_tasks > 0, pure_trigger_tasks > 0, blind_compare_tasks > 0] if n) > 1: - raise ValueError("Task file mixes trigger-rate, behavioral, and blind-compare eval modes. Use one eval_mode per run.") + raise ValueError( + "Task file mixes trigger-rate, behavioral, and blind-compare eval modes. Use one eval_mode per run." 
+ ) if blind_compare_tasks == len(tasks): return @@ -745,6 +747,7 @@ def _run_trigger_rate( if task_file: Path(task_file).unlink(missing_ok=True) + # --------------------------------------------------------------------------- # Blind comparative behavioral evaluator # --------------------------------------------------------------------------- @@ -1019,19 +1022,13 @@ def _run_blind_compare_eval( query = task["query"] if baseline_source == candidate_content: with _candidate_worktree(project_root, relpath, candidate_content) as candidate_wt: - candidate_capture = _run_query_capture_output( - query, candidate_wt, candidate_skill_ids, timeout=timeout - ) + candidate_capture = _run_query_capture_output(query, candidate_wt, candidate_skill_ids, timeout=timeout) baseline_capture = dict(candidate_capture) else: with _candidate_worktree(project_root, relpath, baseline_source) as baseline_wt: - baseline_capture = _run_query_capture_output( - query, baseline_wt, baseline_skill_ids, timeout=timeout - ) + baseline_capture = _run_query_capture_output(query, baseline_wt, baseline_skill_ids, timeout=timeout) with _candidate_worktree(project_root, relpath, candidate_content) as candidate_wt: - candidate_capture = _run_query_capture_output( - query, candidate_wt, candidate_skill_ids, timeout=timeout - ) + candidate_capture = _run_query_capture_output(query, candidate_wt, candidate_skill_ids, timeout=timeout) baseline_output = baseline_capture["output"] candidate_output = candidate_capture["output"]