diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py index 4253aef8..21257019 100644 --- a/scripts/tests/test_agent_comparison_optimize_loop.py +++ b/scripts/tests/test_agent_comparison_optimize_loop.py @@ -1,6 +1,7 @@ import contextlib import importlib.util import json +import os import subprocess import sys from pathlib import Path @@ -673,6 +674,47 @@ def fake_run_blind_compare_eval( assert scores["task_results"][0]["winner"] == "candidate" +def test_assess_target_trigger_eval_uses_multiple_runs(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_trigger_runs", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\ndescription: trigger eval test\n---\n") + tasks = [{"query": "inspect this repo", "should_trigger": True}] + seen = {} + + def fake_run_trigger_rate( + target_path, + description, + tasks, + candidate_content=None, + eval_mode="auto", + num_workers=1, + timeout=30, + runs_per_query=3, + verbose=False, + ): + seen["runs_per_query"] = runs_per_query + return { + "results": [{"query": "inspect this repo", "should_trigger": True, "trigger_rate": 1.0, "pass": True}], + "summary": {"total": 1, "passed": 1, "failed": 0}, + } + + monkeypatch.setattr(optimize_loop, "_run_trigger_rate", fake_run_trigger_rate) + + scores = optimize_loop.assess_target( + target, + tasks, + "improve routing precision", + behavioral_runs_per_task=4, + ) + + assert seen["runs_per_query"] == 4 + assert scores["tests_pass"] is True + + def test_socratic_question_only_heuristic_penalizes_preamble(): optimize_loop = load_module( "agent_comparison_optimize_loop_socratic_heuristic", @@ -755,6 +797,329 @@ def fake_capture(query, cwd, accepted_skill_ids, timeout=180): assert results[0]["candidate_reasons"][0] == "mentioned blocked skill tool" +def test_behavioral_eval_sequential_path_uses_isolated_worktrees(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_behavioral_isolation", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + project_root = tmp_path / "repo" + (project_root / ".claude").mkdir(parents=True) + cwd_before = Path.cwd() + os.chdir(project_root) + try: + calls = [] + + def fake_single_task_in_worktree(task, project_root, env, timeout, verbose, runs_per_task, trigger_threshold): + calls.append((task["query"], project_root)) + return { + "query": task["query"], + "triggered": task["should_trigger"], + "should_trigger": task["should_trigger"], + "pass": True, + "new_artifacts": [], + } + + monkeypatch.setattr(optimize_loop, "_run_single_behavioral_task_in_worktree", fake_single_task_in_worktree) + + results = optimize_loop._run_behavioral_eval( + tmp_path / "skills" / "example" / "SKILL.md", + "desc", + [{"query": "make a skill", "should_trigger": True}], + parallel_workers=0, + ) + finally: + os.chdir(cwd_before) + + assert len(calls) == 1 + assert results[0]["pass"] is True + + +def test_holdout_score_attaches_to_best_iteration_not_last(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_holdout_attachment", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n") + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"}, + {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"}, + ] + } + ) + ) + + variant_counter = {"n": 0} + + def fake_generate_variant_output(*args, **kwargs): + variant_counter["n"] += 1 + marker = f"candidate-{variant_counter['n']}" + return { + "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n", + "summary": marker, + "reasoning": marker, + "tokens_used": 1, + "deletions": [], + "deletion_justification": "", + } + + monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output) + + def fake_assess_target( + path, + tasks, + goal, + verbose=False, + dry_run=False, + behavioral_runs_per_task=1, + behavioral_trigger_threshold=0.5, + parallel_eval_workers=0, + candidate_content=None, + baseline_content=None, + eval_mode="auto", + ): + is_test = bool(tasks and tasks[0].get("split") == "test") + content = candidate_content or path.read_text() + if is_test: + score = 9.0 if "candidate-1" in content else 3.0 + else: + if "candidate-1" in content: + score = 8.0 + elif "candidate-2" in content: + score = 7.0 + else: + score = 5.0 + return { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": score, + "error_handling": 0.0, + "language_idioms": 0.0, + "testing": 0.0, + "efficiency": 0.0, + "task_results": [{"name": "task", "passed": True}], + } + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.0, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + beam_width=2, + candidates_per_parent=2, + holdout_check_cadence=1, + verbose=False, + dry_run=False, + ) + + iteration_one = next(it for it in result["iterations"] if it["number"] == 1) + iteration_two = next(it for it in result["iterations"] if it["number"] == 2) + assert iteration_one["score"]["test"] == 3.6 + assert iteration_two["score"]["test"] == 1.2 + + +def test_best_by_test_can_switch_from_later_train_best_to_earlier_candidate(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_best_by_test", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n") + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"}, + {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"}, + ] + } + ) + ) + + variant_counter = {"n": 0} + + def fake_generate_variant_output(*args, **kwargs): + variant_counter["n"] += 1 + marker = f"candidate-{variant_counter['n']}" + return { + "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n", + "summary": marker, + "reasoning": marker, + "tokens_used": 1, + "deletions": [], + "deletion_justification": "", + } + + monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output) + + def fake_assess_target( + path, + tasks, + goal, + verbose=False, + dry_run=False, + behavioral_runs_per_task=1, + behavioral_trigger_threshold=0.5, + parallel_eval_workers=0, + candidate_content=None, + baseline_content=None, + eval_mode="auto", + ): + is_test = bool(tasks and tasks[0].get("split") == "test") + content = candidate_content or path.read_text() + if is_test: + score = 9.0 if "candidate-1" in content else 3.0 + else: + if "candidate-1" in content: + score = 8.0 + elif "candidate-2" in content: + score = 9.0 + else: + score = 5.0 + return { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": score, + "error_handling": 0.0, + "language_idioms": 0.0, + "testing": 0.0, + "efficiency": 0.0, + "task_results": [{"name": "task", "passed": True}], + } + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=2, + min_gain=0.0, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + beam_width=1, + candidates_per_parent=1, + holdout_check_cadence=1, + verbose=False, + dry_run=False, + ) + + assert result["best_iteration"] == 1 + assert result["best_test_score"] == 3.6 + + +def test_final_report_uses_post_selection_test_scores(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_final_report", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: base\nversion: 1.0.0\n---\n") + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + {"name": "train-positive", "query": "inspect this repo", "should_trigger": True, "split": "train"}, + {"name": "test-positive", "query": "inspect this repo", "should_trigger": True, "split": "test"}, + ] + } + ) + ) + + variant_counter = {"n": 0} + + def fake_generate_variant_output(*args, **kwargs): + variant_counter["n"] += 1 + marker = f"candidate-{variant_counter['n']}" + return { + "variant": f"---\nname: test-skill\ndescription: {marker}\nversion: 1.0.0\n---\n{marker}\n", + "summary": marker, + "reasoning": marker, + "tokens_used": 1, + "deletions": [], + "deletion_justification": "", + } + + monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output) + + def fake_assess_target( + path, + tasks, + goal, + verbose=False, + dry_run=False, + behavioral_runs_per_task=1, + behavioral_trigger_threshold=0.5, + parallel_eval_workers=0, + candidate_content=None, + baseline_content=None, + eval_mode="auto", + ): + is_test = bool(tasks and tasks[0].get("split") == "test") + content = candidate_content or path.read_text() + if is_test: + score = 9.0 if "candidate-1" in content else 3.0 + else: + score = 8.0 if "candidate-1" in content else 5.0 + return { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": score, + "error_handling": 0.0, + "language_idioms": 0.0, + "testing": 0.0, + "efficiency": 0.0, + "task_results": [{"name": "task", "passed": True}], + } + + def fake_generate_optimization_report(data, auto_refresh=False): + _ = auto_refresh + return json.dumps(data) + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + monkeypatch.setattr(optimize_loop, "generate_optimization_report", fake_generate_optimization_report) + + report_path = tmp_path / "out" / "report.html" + optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.0, + output_dir=tmp_path / "out", + report_path=report_path, + beam_width=1, + candidates_per_parent=1, + holdout_check_cadence=1, + verbose=False, + dry_run=False, + ) + + report = json.loads(report_path.read_text()) + assert report["iterations"][0]["score"]["test"] == 3.6 + + def test_run_optimization_loop_forwards_parallel_eval_to_assessments(tmp_path, monkeypatch): optimize_loop = load_module( "agent_comparison_optimize_loop_parallel_forwarding", @@ -903,6 +1268,7 @@ def fake_run_trigger_rate( eval_mode="auto", num_workers=5, timeout=30, + runs_per_query=3, verbose=False, ): passed = trigger_query in description diff --git a/skills/agent-comparison/SKILL.md b/skills/agent-comparison/SKILL.md index 0c83c132..98e7a199 100644 --- a/skills/agent-comparison/SKILL.md +++ b/skills/agent-comparison/SKILL.md @@ -348,7 +348,7 @@ Recommended modes: Live eval defaults are intentionally short: - one optimization round -- one trigger-eval run per query +- three trigger-eval runs per query - one trigger-eval worker - no holdout cadence unless explicitly requested @@ -357,7 +357,7 @@ The registered-skill path also evaluates the current working copy, not just `HEA **Step 5: Present results in UI** -Open the generated `optimization-report.html` in a browser. The report shows: +If you passed `--report optimization-report.html`, open the generated file in a browser. The report shows: - Progress dashboard (status, baseline vs best, accepted/rejected counts) - Convergence chart (train solid line, held-out dashed line, baseline dotted) - Iteration table with verdict, composite score, delta, and change summary @@ -367,7 +367,7 @@ Open the generated `optimization-report.html` in a browser. The report shows: Not all ACCEPT iterations are real improvements — some may be harness artifacts. The user reviews the accepted iterations as candidate snapshots from the original target: - Inspect each accepted iteration's diff in the report -- Use "Preview Selected Snapshot" only as a comparison aid in the UI +- Use "Preview Combined" only as a comparison aid in the UI - Use "Export Selected" to download a review JSON describing the selected snapshot diff - In beam mode, review the retained frontier candidates first; they are the strongest candidates from the latest round @@ -392,10 +392,17 @@ cp evals/iterations/best_variant.md skills/{target}/SKILL.md **Step 8: Run final evaluation on FULL task set (train + test)** -After applying improvements, run a final evaluation on ALL tasks (not just train) to verify the improvements generalize: +After applying improvements, run a final evaluation on ALL tasks (not just train) to verify the improvements generalize. Use evaluation-only mode by rerunning the optimizer with `--max-iterations 0`, which records the baseline for the current file without generating fresh variants: ```bash -# Re-run optimize_loop.py against the same task file and inspect results.json/report output +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/{target}/SKILL.md \ + --goal "{same goal}" \ + --benchmark-tasks {full-task-file}.json \ + --max-iterations 0 \ + --report optimization-report.html \ + --output-dir evals/final-check \ + --verbose ``` Compare final scores to the baseline to confirm net improvement. In beam mode, the final report and `results.json` also include: @@ -421,8 +428,7 @@ The current optimizer is in a solid state for: - deterministic proof runs - isolated live evaluation of existing registered skills - short live optimization of `read-only-ops`, with the accepted description change now applied and validated against `references/read-only-ops-short-tasks.json` -- short live body evaluation of `socratic-debugging`, with `references/socratic-debugging-body-short-tasks.json` - now producing clean skill-triggered first-turn outputs instead of fallback chatter +- short live body optimization of `socratic-debugging`, with the accepted instruction-body update now applied and validated against `references/socratic-debugging-body-short-tasks.json`, now producing clean skill-triggered first-turn outputs instead of fallback chatter One live-harness caveat remains: - temporary renamed skill copies do not yet show reliable live trigger improvements through the dynamic command alias path diff --git a/skills/agent-comparison/references/optimization-guide.md b/skills/agent-comparison/references/optimization-guide.md index 7d689e2c..38f1c12b 100644 --- a/skills/agent-comparison/references/optimization-guide.md +++ b/skills/agent-comparison/references/optimization-guide.md @@ -2,9 +2,15 @@ ## Scope -The current autoresearch loop optimizes a markdown target's frontmatter -`description` using trigger-rate eval tasks. This is useful for improving -skill routing accuracy and similar description-driven dispatch behavior. +The current autoresearch loop supports two optimization scopes: + +- `description-only`: mutate the frontmatter `description` and score it with + trigger-rate eval tasks +- `body-only`: mutate the instruction body and score it with `blind_compare` + behavioral tasks + +This is useful for improving skill routing accuracy and for short, repeatable +instruction-body improvements on real registered skills. It is not a replacement for the manual agent benchmark workflow in Phases 1-4. If you want to compare real code-generation quality across benchmark tasks, use @@ -22,7 +28,11 @@ drives routing. ## Supported Task Formats -Every task must include: +Two task families are supported: + +### Trigger-rate tasks + +Every trigger-rate task must include: - `query`: the prompt to test - `should_trigger`: whether the target should trigger for that prompt @@ -77,6 +87,40 @@ Explicit train/test sets: } ``` +### Blind body-compare tasks + +Every blind body-compare task must include: + +- `query`: the prompt to test +- `eval_mode: blind_compare` +- `judge`: currently `heuristic_socratic_debugging` + +Optional fields: + +- `name`: label shown in logs and reports +- `split`: `train` or `test` +- `min_score`: minimum candidate score required for the task to count as passed + +Example: + +```json +{ + "tasks": [ + { + "name": "socratic-first-turn", + "query": "Help me think through this bug. My Python script sometimes returns None instead of a dict when the cache is warm. Please do not solve it for me directly.", + "eval_mode": "blind_compare", + "judge": "heuristic_socratic_debugging", + "min_score": 0.7, + "split": "train" + } + ] +} +``` + +Within one run, tasks must all belong to the same family. The optimizer rejects +mixed trigger-rate and blind body-compare task sets. + If no split markers are present, the loop performs a reproducible random split using `--train-split` and seed `42`. @@ -262,9 +306,10 @@ What is currently demonstrated: - short live proof on `skills/read-only-ops/SKILL.md` using `references/read-only-ops-short-tasks.json`, improving from one failed positive to `2/2` live passes after the accepted description update -- short live body benchmark on `skills/socratic-debugging/SKILL.md` using - `references/socratic-debugging-body-short-tasks.json`, where the current - baseline now evaluates cleanly and non-improving body variants are rejected +- short live body optimization on `skills/socratic-debugging/SKILL.md` using + `references/socratic-debugging-body-short-tasks.json`, improving from `7.85` + to `8.45` after the accepted instruction-body update; the current baseline now + evaluates cleanly and non-improving body variants are rejected What remains imperfect: - live optimization of temporary renamed skill copies still fails to show measured improvement through the dynamic command alias path diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index f4463b1f..b5ee4eaf 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -261,6 +261,13 @@ def _build_report_data( } +def _iteration_entry_by_number(iterations: list[dict], number: int) -> dict | None: + for entry in iterations: + if entry.get("number") == number: + return entry + return None + + def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: """Generate iteration history HTML report. @@ -660,6 +667,7 @@ def _run_trigger_rate( eval_mode: str = "auto", num_workers: int = 1, timeout: int = 30, + runs_per_query: int = 3, verbose: bool = False, ) -> dict: """Run trigger-rate assessment using the skill_eval infrastructure. @@ -696,7 +704,7 @@ def _run_trigger_rate( "--timeout", str(timeout), "--runs-per-query", - "1", + str(runs_per_query), ] if candidate_content is not None: with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as candidate_file: @@ -1341,14 +1349,14 @@ def _run_behavioral_eval( } return results - # Sequential path (parallel_workers <= 1): run tasks one at a time in project_root. + # Sequential path: still use isolated worktrees so tasks cannot mutate the real repo + # or contaminate each other by editing tracked files. sequential_results = [] for task in tasks: sequential_results.append( - _run_single_behavioral_task( + _run_single_behavioral_task_in_worktree( task=task, project_root=project_root, - worktree_path=project_root, env=env, timeout=timeout, verbose=verbose, @@ -1449,6 +1457,7 @@ def assess_target( tasks, candidate_content=content, eval_mode=eval_mode, + runs_per_query=max(1, behavioral_runs_per_task), verbose=verbose, ) summary = results.get("summary", {}) @@ -1997,8 +2006,10 @@ def run_optimization_loop( eval_mode=eval_mode, ) holdout_composite = composite_score(holdout_scores) - if iterations: - iterations[-1]["score"]["test"] = holdout_composite + if best_iteration > 0: + best_iteration_entry = _iteration_entry_by_number(iterations, best_iteration) + if best_iteration_entry is not None: + best_iteration_entry["score"]["test"] = holdout_composite if holdout_diverges(best_score, holdout_composite, baseline_holdout, baseline_composite): if verbose: @@ -2040,33 +2051,33 @@ def run_optimization_loop( exit_reason = f"max_iterations ({max_iterations})" status = "COMPLETE" - # Final report - if report_path: - rd = _build_report_data( - target_label, - goal, - baseline_composite, - baseline_holdout, - len(train_tasks), - len(test_tasks), - iterations, - max_iterations, - status, - total_tokens, - ) - rd["search"] = { - "strategy": "beam", - "beam_width": beam_width, - "candidates_per_parent": candidates_per_parent, - "holdout_check_cadence": holdout_check_cadence, - } - report_path.write_text(generate_optimization_report(rd, auto_refresh=False)) - # Best-by-test selection: if test tasks exist, prefer the ACCEPT iteration with the # highest held-out test score rather than the highest training score (anti-Goodhart). best_test_score: float | None = None if test_tasks and keep_contents: - # Find iterations with a recorded test score (set during holdout cadence checks) + for keep_iter, keep_content in keep_contents.items(): + entry = _iteration_entry_by_number(iterations, keep_iter) + if entry is not None and entry["score"].get("test") is not None: + continue + final_test_scores = assess_target( + target_path, + test_tasks, + goal, + verbose, + dry_run, + behavioral_runs_per_task, + behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=keep_content, + baseline_content=original_content, + eval_mode=eval_mode, + ) + keep_test_score = composite_score(final_test_scores) + if entry is not None: + entry["score"]["test"] = keep_test_score + if verbose: + print(f"Recorded final test eval for iter {keep_iter}: test={keep_test_score:.4f}", file=sys.stderr) + scored_keeps = [ (it["number"], it["score"]["test"]) for it in iterations @@ -2084,25 +2095,6 @@ def run_optimization_loop( ) best_content = keep_contents[best_test_iter] best_iteration = best_test_iter - else: - # No holdout-checked ACCEPT iterations — run a final test eval on best_content - if best_iteration > 0: - final_test_scores = assess_target( - target_path, - test_tasks, - goal, - verbose, - dry_run, - behavioral_runs_per_task, - behavioral_trigger_threshold, - effective_parallel_eval, - candidate_content=best_content, - baseline_content=original_content, - eval_mode=eval_mode, - ) - best_test_score = composite_score(final_test_scores) - if verbose: - print(f"Final test eval on best_content: test={best_test_score:.4f}", file=sys.stderr) if best_iteration > 0: best_path = output_dir / "best_variant.md" @@ -2110,6 +2102,29 @@ def run_optimization_loop( if verbose: print(f"\nBest variant saved to: {best_path}", file=sys.stderr) + # Final report: write only after any final held-out evaluations and best-by-test + # selection so the HTML matches the finalized iterations/results.json state. + if report_path: + rd = _build_report_data( + target_label, + goal, + baseline_composite, + baseline_holdout, + len(train_tasks), + len(test_tasks), + iterations, + max_iterations, + status, + total_tokens, + ) + rd["search"] = { + "strategy": "beam", + "beam_width": beam_width, + "candidates_per_parent": candidates_per_parent, + "holdout_check_cadence": holdout_check_cadence, + } + report_path.write_text(generate_optimization_report(rd, auto_refresh=False)) + result = { "exit_reason": exit_reason, "status": status, diff --git a/skills/do/references/routing-tables.md b/skills/do/references/routing-tables.md index 41e9af10..f95e3843 100644 --- a/skills/do/references/routing-tables.md +++ b/skills/do/references/routing-tables.md @@ -111,10 +111,10 @@ Route to these agents based on the user's task domain. Each entry describes what | **code-cleanup** | User wants to remove stale TODOs, unused code, dead imports, or generally clean up accumulated debt. | | **comment-quality** | User wants to audit code comments for accuracy, temporal references, or staleness. | | **agent-evaluation** | User wants to grade or evaluate a skill, agent, or pipeline for quality and standards compliance. NOT: evaluating code output or test results. | -| **agent-comparison** | User wants to A/B test two agents or compare their outputs on the same task. | +| **agent-comparison** | User wants to A/B test agents, run autoresearch, optimize a skill description, or optimize a skill body with benchmark tasks. | | **agent-upgrade** | User wants to audit and systematically improve a specific agent to bring it up to current template standards. | | **testing-agents-with-subagents** | User wants to validate an agent by running it against real test cases in subagents. | -| **skill-eval** | User wants to improve a skill through measured testing, optimize its description, or benchmark it against scenarios. | +| **skill-eval** | User wants to evaluate a skill, test triggers manually, benchmark it against scenarios, or inspect skill quality without running the autoresearch optimizer. | | **full-repo-review** | User wants a comprehensive 3-wave review of all source files in the entire repository. | | **repo-value-analysis** | User wants to systematically analyze an external repository to determine what ideas or patterns are worth adopting. | | **data-analysis** | User wants to analyze data: CSV files, metrics, A/B test results, cohort analysis, statistical distributions, KPIs, or funnel data. | diff --git a/skills/skill-eval/SKILL.md b/skills/skill-eval/SKILL.md index ba89f3b1..f36a04b4 100644 --- a/skills/skill-eval/SKILL.md +++ b/skills/skill-eval/SKILL.md @@ -2,12 +2,12 @@ name: skill-eval description: | Evaluate and improve skills through measured testing. Run trigger evaluations - to test whether skill descriptions cause correct activation, optimize - descriptions via automated train/test loops, benchmark skill output quality + to test whether skill descriptions cause correct activation, benchmark skill output quality with A/B comparisons, and validate skill structure. Use when user says - "improve skill", "test skill triggers", "optimize description", "benchmark + "improve skill", "test skill triggers", "benchmark skill", "eval skill", or "skill quality". Do NOT use for creating new skills - (use skill-creator). + (use skill-creator). Route autoresearch loops for description/body optimization + to agent-comparison. version: 1.0.0 user-invocable: false argument-hint: "" @@ -23,7 +23,6 @@ routing: - improve skill - test skill - eval skill - - optimize description - benchmark skill - skill triggers - skill quality @@ -56,7 +55,7 @@ This checks: SKILL.md exists, valid frontmatter, required fields (name, descript | Intent | Mode | Script | |--------|------|--------| | "Test if description triggers correctly" | Trigger eval | `run_eval.py` | -| "Optimize/improve the description" | Description optimization | `run_loop.py` | +| "Optimize/improve the description through autoresearch" | Route to `agent-comparison` | `optimize_loop.py` | | "Compare skill vs no-skill output" | Output benchmark | Manual + `aggregate_benchmark.py` | | "Validate skill structure" | Quick validate | `quick_validate.py` | diff --git a/skills/socratic-debugging/SKILL.md b/skills/socratic-debugging/SKILL.md index 486c4ae8..dc315c8c 100644 --- a/skills/socratic-debugging/SKILL.md +++ b/skills/socratic-debugging/SKILL.md @@ -26,7 +26,6 @@ routing: - "teach me to find it" category: process --- - # Socratic Debugging Skill ## Overview @@ -64,14 +63,14 @@ Follow these phases in order. Each phase builds evidence for the next. ### Execution Flow 1. **User describes the bug.** Read the relevant code silently using Read/Grep/Glob. -2. **Ask Phase 1 question.** Even if the bug seems obvious from the code, start with symptoms. Make the question pointed if the answer is likely simple. +2. **Ask Phase 1 question.** Your first response must be exactly one question with no other text — no preamble, no diagnosis, no code references or examples, no mention of files you read, no announcement of tools used or planned. Even if the bug seems obvious from the code, start with symptoms. Make the question pointed if the answer is likely simple. 3. **Listen, acknowledge, ask next question.** Format: brief acknowledgment of what they said, then one question advancing toward root cause. 4. **Track question count.** After 12 questions with no progress toward root cause, trigger escalation offer. 5. **When user identifies root cause**, confirm their finding and ask what fix they would apply. Let the user propose the fix. ### Hints vs. Leading Questions -Questions may contain subtle directional hints. The goal is discovery, not suffering. A **good hint** directs attention without revealing the answer: "What happens if you log the value of `request.userId` right before line 42?" A **bad hint** is a leading question that contains the answer: "Could `request.userId` be null at line 42?" The line: open-ended questions that narrow focus are hints. Leading questions that contain the answer are violations. +Questions may contain subtle directional hints. The goal is discovery, not suffering. A **good hint** directs attention without revealing the answer: asking what a specific value is right before a failure. A **bad hint** is a leading question that contains the answer: asking whether a specific value could be null. The line: open-ended questions that narrow focus are hints. Leading questions that contain the answer are violations. ### Escalation Protocol @@ -99,7 +98,7 @@ Solution: Acknowledge the frustration. Offer escalation. If they want to continu ### Bug Is Trivially Obvious From Code Cause: A typo, missing import, or simple syntax error visible in the source -Solution: Still ask Phase 1, but make the question very pointed -- narrow enough that the user will see the answer immediately. Example: "What do you expect `reponse.data` to contain?" (the typo in the variable name is the bug). Follow phase progression; pointed questions stay within the Socratic framework. +Solution: Still ask Phase 1, but make the question very pointed -- narrow enough that the user will see the answer immediately. Follow phase progression; pointed questions stay within the Socratic framework. ---