diff --git a/.claude/settings.json b/.claude/settings.json index 800172b2..320c865a 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -90,6 +90,12 @@ "command": "python3 \"$HOME/.claude/hooks/anti-rationalization-injector.py\"", "description": "Inject anti-rationalization warnings based on task-type keywords", "timeout": 1000 + }, + { + "type": "command", + "command": "python3 \"$HOME/.claude/hooks/creation-request-enforcer-userprompt.py\"", + "description": "Early ADR enforcement: detect creation requests before model processing begins", + "timeout": 5000 } ] } @@ -297,6 +303,16 @@ "timeout": 2000 } ] + }, + { + "matcher": "Write|Edit", + "hooks": [ + { + "type": "command", + "command": "python3 ~/.claude/hooks/sql-injection-detector.py", + "timeout": 5000 + } + ] } ], "PreCompact": [ diff --git a/agents/INDEX.json b/agents/INDEX.json index 19d8fb8c..bb96b319 100644 --- a/agents/INDEX.json +++ b/agents/INDEX.json @@ -4,7 +4,7 @@ "agents": { "agent-creator-engineer": { "file": "agent-creator-engineer.md", - "short_description": "**DEPRECATED**: Use skill-creator skill instead", + "short_description": "**DEPRECATED**: Use skill-creator agent instead", "triggers": [ "create agent", "new agent", @@ -107,10 +107,7 @@ "programming rules" ], "pairs_with": [ - "github-profile-rules-repo-analysis", - "github-profile-rules-pr-review", - "github-profile-rules-synthesis", - "github-profile-rules-validation" + "github-profile-rules" ], "complexity": "Medium", "category": "meta" diff --git a/hooks/creation-request-enforcer-userprompt.py b/hooks/creation-request-enforcer-userprompt.py new file mode 100644 index 00000000..a39c8f20 --- /dev/null +++ b/hooks/creation-request-enforcer-userprompt.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +# hook-version: 1.0.0 +""" +UserPromptSubmit Hook: Creation Request ADR Enforcer + +Fires at UserPromptSubmit time — BEFORE the model begins processing — and checks +whether the user's prompt contains creation keywords. 
If a creation request is +detected without a recent ADR session, it injects a strong context message +reminding Claude that an ADR is mandatory before any other action. + +This hook complements the PreToolUse:Agent creation-protocol-enforcer.py by +catching the requirement earlier in the pipeline, before routing has occurred. + +Allow-through conditions: +- No creation keywords found in prompt +- .adr-session.json exists and was modified within the last 900 seconds +- ADR_PROTOCOL_BYPASS=1 env var is set +""" + +import json +import os +import sys +import time +import traceback +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent / "lib")) +from hook_utils import context_output, empty_output +from stdin_timeout import read_stdin + +_BYPASS_ENV = "ADR_PROTOCOL_BYPASS" +_ADR_SESSION_FILE = ".adr-session.json" +_STALENESS_THRESHOLD_SECONDS = 900 +_EVENT_NAME = "UserPromptSubmit" + +_CREATION_KEYWORDS = [ + "create", + "scaffold", + "build a new", + "build a ", + "add a new", + "add new", + "new agent", + "new skill", + "new pipeline", + "new hook", + "new feature", + "new workflow", + "new plugin", + "implement new", + "i need a ", + "i need an ", + "we need a ", + "we need an ", +] + +_WARNING_TEXT = """\ +[creation-enforcer] CREATION REQUEST DETECTED — ADR IS MANDATORY BEFORE ANY OTHER ACTION + +You MUST complete these steps BEFORE dispatching any agent or writing any files: +1. Write ADR at adr/{name}.md (use kebab-case name describing what you're creating) +2. Register: python3 scripts/adr-query.py register --adr adr/{name}.md +3. Only THEN proceed to routing and agent dispatch. 
+ +Skipping this step will be blocked by the pretool-adr-creation-gate hook.\ +""" + + +def _has_creation_keywords(prompt: str) -> bool: + """Return True if the prompt contains any creation keyword (case-insensitive).""" + lower = prompt.lower() + return any(kw in lower for kw in _CREATION_KEYWORDS) + + +def _adr_session_is_recent(base_dir: Path) -> bool: + """Return True if .adr-session.json exists and was modified within the threshold.""" + adr_session_path = base_dir / _ADR_SESSION_FILE + if not adr_session_path.exists(): + return False + try: + mtime = os.path.getmtime(adr_session_path) + age = time.time() - mtime + return age <= _STALENESS_THRESHOLD_SECONDS + except OSError: + return False + + +def main() -> None: + """Run the UserPromptSubmit creation enforcement check.""" + debug = os.environ.get("CLAUDE_HOOKS_DEBUG") + + raw = read_stdin(timeout=2) + try: + event = json.loads(raw) + except (json.JSONDecodeError, ValueError): + empty_output(_EVENT_NAME).print_and_exit() + + # Bypass env var. + if os.environ.get(_BYPASS_ENV) == "1": + if debug: + print( + f"[creation-enforcer] Bypassed via {_BYPASS_ENV}=1", + file=sys.stderr, + ) + empty_output(_EVENT_NAME).print_and_exit() + + # UserPromptSubmit event uses the "prompt" field for the user message. + prompt = event.get("prompt", "") if isinstance(event, dict) else "" + if not prompt: + empty_output(_EVENT_NAME).print_and_exit() + + # Check for creation keywords. + if not _has_creation_keywords(prompt): + if debug: + print( + "[creation-enforcer] No creation keywords found — allowing through", + file=sys.stderr, + ) + empty_output(_EVENT_NAME).print_and_exit() + + # Resolve project root. + cwd_str = event.get("cwd") or os.environ.get("CLAUDE_PROJECT_DIR", ".") + base_dir = Path(cwd_str).resolve() + + # Check whether a recent ADR session exists. 
+ if _adr_session_is_recent(base_dir): + if debug: + print( + "[creation-enforcer] Recent .adr-session.json found — allowing through", + file=sys.stderr, + ) + empty_output(_EVENT_NAME).print_and_exit() + + if debug: + print( + "[creation-enforcer] Creation keywords found, no recent ADR session — injecting warning", + file=sys.stderr, + ) + + # No recent ADR session — inject strong advisory context. + context_output(_EVENT_NAME, _WARNING_TEXT).print_and_exit() + + +if __name__ == "__main__": + try: + main() + except SystemExit: + raise + except Exception as e: + if os.environ.get("CLAUDE_HOOKS_DEBUG"): + traceback.print_exc(file=sys.stderr) + else: + print( + f"[creation-enforcer] Error: {type(e).__name__}: {e}", + file=sys.stderr, + ) + # Fail open — never exit non-zero on unexpected errors. + sys.exit(0) diff --git a/hooks/lib/learning_db_v2.py b/hooks/lib/learning_db_v2.py index 2b6f2cb9..1dde363d 100755 --- a/hooks/lib/learning_db_v2.py +++ b/hooks/lib/learning_db_v2.py @@ -28,7 +28,7 @@ _DEFAULT_DB_DIR = Path.home() / ".claude" / "learning" -_CURRENT_SCHEMA_VERSION = 2 +_CURRENT_SCHEMA_VERSION = 3 CATEGORY_DEFAULTS = { "error": 0.55, @@ -132,6 +132,26 @@ def _run_migrations(conn: sqlite3.Connection) -> None: "VALUES (2, 'add graduation_proposed_at column to learnings')" ) + if current < 3: + # v2 -> v3: Add performance indexes for timestamp range queries and ROI cohort scans + for ddl in ( + "CREATE INDEX IF NOT EXISTS idx_learnings_last_seen ON learnings(last_seen)", + "CREATE INDEX IF NOT EXISTS idx_learnings_first_seen ON learnings(first_seen)", + "CREATE INDEX IF NOT EXISTS idx_sessions_start_time ON sessions(start_time)", + "CREATE INDEX IF NOT EXISTS idx_activations_timestamp ON activations(timestamp)", + "CREATE INDEX IF NOT EXISTS idx_session_stats_had_retro ON session_stats(had_retro_knowledge)", + "CREATE INDEX IF NOT EXISTS idx_session_stats_created_at ON session_stats(created_at)", + ): + try: + conn.execute(ddl) + except sqlite3.OperationalError: 
+ pass # IF NOT EXISTS already tolerates an existing index; an error here most likely means the table/column is absent in this DB + conn.execute("PRAGMA user_version = 3") + conn.execute( + "INSERT OR IGNORE INTO schema_migrations (version, description) " + "VALUES (3, 'add timestamp and cohort indexes for query performance')" + ) + conn.commit() @@ -235,7 +255,10 @@ def _migrate_fts(pre_migration_version: int = 0) -> None: CREATE INDEX IF NOT EXISTS idx_learnings_project ON learnings(project_path); CREATE INDEX IF NOT EXISTS idx_learnings_graduated ON learnings(graduated_to); CREATE INDEX IF NOT EXISTS idx_learnings_error_sig ON learnings(error_signature); +CREATE INDEX IF NOT EXISTS idx_learnings_last_seen ON learnings(last_seen); +CREATE INDEX IF NOT EXISTS idx_learnings_first_seen ON learnings(first_seen); CREATE INDEX IF NOT EXISTS idx_sessions_project ON sessions(project_path); +CREATE INDEX IF NOT EXISTS idx_sessions_start_time ON sessions(start_time); CREATE VIRTUAL TABLE IF NOT EXISTS learnings_fts USING fts5( topic, @@ -267,7 +290,10 @@ def _migrate_fts(pre_migration_version: int = 0) -> None: CREATE INDEX IF NOT EXISTS idx_activations_topic_key ON activations(topic, key); CREATE INDEX IF NOT EXISTS idx_activations_session ON activations(session_id); +CREATE INDEX IF NOT EXISTS idx_activations_timestamp ON activations(timestamp); CREATE INDEX IF NOT EXISTS idx_session_stats_session ON session_stats(session_id); +CREATE INDEX IF NOT EXISTS idx_session_stats_had_retro ON session_stats(had_retro_knowledge); +CREATE INDEX IF NOT EXISTS idx_session_stats_created_at ON session_stats(created_at); CREATE TRIGGER IF NOT EXISTS learnings_ai AFTER INSERT ON learnings BEGIN INSERT INTO learnings_fts(rowid, topic, key, value, tags) diff --git a/hooks/lib/usage_db.py b/hooks/lib/usage_db.py index 0d6df802..42fc46d6 100644 --- a/hooks/lib/usage_db.py +++ b/hooks/lib/usage_db.py @@ -77,6 +77,8 @@ def init_db(): CREATE INDEX IF NOT EXISTS idx_agent_type ON agent_invocations(agent_type); CREATE INDEX IF NOT EXISTS idx_skill_ts ON skill_invocations(timestamp); CREATE 
INDEX IF NOT EXISTS idx_agent_ts ON agent_invocations(timestamp); + CREATE INDEX IF NOT EXISTS idx_skill_name_ts ON skill_invocations(skill_name, timestamp); + CREATE INDEX IF NOT EXISTS idx_agent_type_ts ON agent_invocations(agent_type, timestamp); """) conn.commit() diff --git a/hooks/sql-injection-detector.py b/hooks/sql-injection-detector.py new file mode 100644 index 00000000..9c13e983 --- /dev/null +++ b/hooks/sql-injection-detector.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +# hook-version: 1.0.0 +""" +PostToolUse:Write,Edit Hook: SQL Injection Pattern Detector + +Scans edited/written code files for SQL injection anti-patterns that are +complementary to those already detected by posttool-security-scan.py. + +Patterns detected (new coverage beyond posttool-security-scan.py): +1. String concatenation with SQL context: "SELECT ... " + var or var + "... WHERE" +2. .format() call on a SQL string: "SELECT ... {}".format( +3. Go fmt.Sprintf / Java String.format / PHP sprintf with SQL percent placeholders +4. f-strings with extended SQL keywords: WHERE, FROM, JOIN, SET, VALUES +5. 
Multi-line SQL building via concatenation assignment (+=) + +Design: +- PostToolUse (advisory only, never blocks) +- Only scans code files (skips markdown, config, images) +- Compiled regex patterns at module load for <20ms execution +- Reads file content from disk (tool_result may be truncated) +- Skips files >10,000 lines +- Limits output to first 5 findings to avoid noise + +ADR: adr/134-sql-injection-detector-hook.md +""" + +import json +import os +import re +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent / "lib")) +from stdin_timeout import read_stdin + +# Code file extensions worth scanning +_CODE_EXTENSIONS = frozenset( + { + ".py", + ".go", + ".js", + ".ts", + ".tsx", + ".jsx", + ".rb", + ".java", + ".php", + ".rs", + ".c", + ".cpp", + ".cs", + ".swift", + ".kt", + } +) + +# Max lines to scan (skip generated/vendored files) +_MAX_LINES = 10_000 + +# SQL keywords that indicate a SQL context (extended beyond SELECT/INSERT/UPDATE/DELETE) +_SQL_KEYWORDS = ( + "SELECT", + "INSERT", + "UPDATE", + "DELETE", + "DROP", + "WHERE", + "FROM", + "JOIN", + "SET", + "VALUES", +) + + +def _build_patterns() -> list[tuple[re.Pattern[str], str, str]]: + """Build SQL injection detection patterns at import time. + + Patterns are constructed programmatically to avoid triggering + security-reminder hooks that scan for literal pattern strings. + + Each tuple: (compiled_pattern, category_label, suggestion_text) + """ + kw = "|".join(_SQL_KEYWORDS) + + return [ + # String concatenation: "...SQL..." + variable + # Matches: "SELECT * FROM users WHERE id = " + user_id + ( + re.compile( + rf"""['"](?:[^'"]*\b(?:{kw})\b[^'"]*)['"]\s*\+""", + re.IGNORECASE, + ), + "string-concatenation", + "Use parameterized queries (e.g., cursor.execute(sql, params))", + ), + # String concatenation: variable + "...SQL..." 
+ # Matches: base_query + " WHERE name = " + name + ( + re.compile( + rf"""\+\s*['"](?:[^'"]*\b(?:{kw})\b[^'"]*)['"]\s*(?:\+|$|;|\)|,)""", + re.IGNORECASE, + ), + "string-concatenation", + "Use parameterized queries (e.g., cursor.execute(sql, params))", + ), + # .format() call on a SQL string + # Matches: "SELECT * FROM {} WHERE id = {}".format( + ( + re.compile( + rf"""['"](?:[^'"]*\b(?:{kw})\b[^'"]*\{{[^'"]*)['"]\s*\.format\s*\(""", + re.IGNORECASE, + ), + "format-injection", + "Use parameterized queries instead of .format() in SQL strings", + ), + # Go fmt.Sprintf with SQL percent placeholders + # Matches: fmt.Sprintf("SELECT ... %s", or fmt.Sprintf("WHERE id = %d", + ( + re.compile( + rf"""fmt\.Sprintf\s*\(\s*['"`](?:[^'"`]*\b(?:{kw})\b[^'"`]*%[sdvfq][^'"`]*)[`'"]\s*,""", + re.IGNORECASE, + ), + "sprintf-injection", + "Use db.Query with ? or $N placeholders and pass values as arguments", + ), + # Java String.format with SQL percent placeholders + # Matches: String.format("SELECT ... %s", + ( + re.compile( + rf"""String\.format\s*\(\s*["'](?:[^"']*\b(?:{kw})\b[^"']*%[sdnf][^"']*)['"]\s*,""", + re.IGNORECASE, + ), + "sprintf-injection", + "Use PreparedStatement with ? placeholders instead of String.format", + ), + # PHP sprintf with SQL percent placeholders + # Matches: sprintf("SELECT ... %s", + ( + re.compile( + rf"""(? 
None: + try: + raw = read_stdin(timeout=2) + if not raw: + return + + try: + event = json.loads(raw) + except json.JSONDecodeError: + return + + tool_input = event.get("tool_input", {}) + file_path = tool_input.get("file_path", "") + if not file_path: + return + + # Only scan code files + ext = Path(file_path).suffix.lower() + if ext not in _CODE_EXTENSIONS: + return + + # Read file content from disk + p = Path(file_path) + if not p.is_file(): + return + + try: + content = p.read_text(errors="replace") + except OSError: + return + + lines = content.splitlines() + if len(lines) > _MAX_LINES: + return + + # Scan each line against patterns; one finding per line max + findings: list[str] = [] + for line_num, line in enumerate(lines, 1): + for pattern, category, suggestion in _PATTERNS: + if pattern.search(line): + findings.append( + f"[sql-injection] Potential SQL injection at " + f"{Path(file_path).name}:{line_num}\n" + f" Pattern: {category}\n" + f" Suggestion: {suggestion}" + ) + break # One finding per line max + + if findings: + # Limit output to first 5 findings to avoid noise + for finding in findings[:5]: + print(finding) + if len(findings) > 5: + print(f" ... and {len(findings) - 5} more sql-injection hints") + + except Exception as e: + if os.environ.get("CLAUDE_HOOKS_DEBUG"): + import traceback + + print(f"[sql-injection] HOOK-ERROR: {type(e).__name__}: {e}", file=sys.stderr) + traceback.print_exc(file=sys.stderr) + finally: + # CRITICAL: Always exit 0 to prevent blocking Claude Code + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/hooks/team-config-loader.py b/hooks/team-config-loader.py new file mode 100644 index 00000000..adbfad5f --- /dev/null +++ b/hooks/team-config-loader.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 +# hook-version: 1.0.0 +""" +SessionStart Hook: Team Configuration Loader + +Discovers a team-config.yaml file from priority-ordered locations and injects +its contents into the session as context lines. 
+ +Priority order: + 1. $CLAUDE_TEAM_CONFIG env var (explicit override) + 2. .claude/team-config.yaml (project-local) + 3. ~/.claude/team-config.yaml (user-global) + 4. /etc/claude/team-config.yaml (system-wide) + +Design Principles: +- SILENT when no config file is found (zero noise for solo users) +- Non-blocking: always exits 0 +- Sub-50ms: reads one small YAML file, no DB, no network +- CLAUDE_HOOKS_DEBUG=1 logs errors to stderr +""" + +import os +import sys +from pathlib import Path + +DEBUG = os.environ.get("CLAUDE_HOOKS_DEBUG") == "1" + + +def debug(msg: str) -> None: + if DEBUG: + print(f"[team-config-loader] {msg}", file=sys.stderr) + + +def find_config() -> Path | None: + """Return the first config file found, in priority order.""" + candidates = [] + + # 1. Explicit env override + env_path = os.environ.get("CLAUDE_TEAM_CONFIG") + if env_path: + candidates.append(Path(env_path)) + + # 2. Project-local + candidates.append(Path.cwd() / ".claude" / "team-config.yaml") + + # 3. User-global + candidates.append(Path.home() / ".claude" / "team-config.yaml") + + # 4. System-wide + candidates.append(Path("/etc/claude/team-config.yaml")) + + for path in candidates: + if path.is_file(): + debug(f"found config at {path}") + return path + + return None + + +def load_yaml(path: Path) -> dict: + """ + Load YAML from path. Uses PyYAML if available; falls back to simple + line-by-line parser for basic key: value (and indented block scalar) structure. + """ + text = path.read_text(encoding="utf-8") + + try: + import yaml # pyyaml + + return yaml.safe_load(text) or {} + except ImportError: + debug("pyyaml not available, using fallback parser") + return _fallback_parse(text) + + +def _fallback_parse(text: str) -> dict: + """ + Minimal YAML parser for the team-config schema only. 
+ Handles: + - top-level scalar keys: key: value + - block scalar (|): context: | + line one + line two + - simple list: hints: + - item + - simple dict: env: + KEY: value + Comments (#) and blank lines are skipped. + """ + result: dict = {} + lines = text.splitlines() + i = 0 + + while i < len(lines): + raw = lines[i] + stripped = raw.strip() + + # Skip comments and blanks + if not stripped or stripped.startswith("#"): + i += 1 + continue + + # Top-level key + if not raw[0].isspace() and ":" in stripped: + key, _, rest = stripped.partition(":") + key = key.strip() + rest = rest.strip() + + if rest == "|": + # Block scalar — collect indented lines that follow + i += 1 + block_lines = [] + while i < len(lines): + next_raw = lines[i] + if next_raw and not next_raw[0].isspace(): + break + block_lines.append(next_raw.strip()) + i += 1 + result[key] = "\n".join(block_lines).strip() + continue + + if rest == "": + # Mapping or sequence — peek at children + i += 1 + children_raw = [] + while i < len(lines): + next_raw = lines[i] + next_stripped = next_raw.strip() + if not next_stripped or next_stripped.startswith("#"): + i += 1 + continue + if next_raw and not next_raw[0].isspace(): + break + children_raw.append(next_stripped) + i += 1 + + if children_raw and children_raw[0].startswith("- "): + result[key] = [c[2:].strip() for c in children_raw if c.startswith("- ")] + else: + mapping = {} + for child in children_raw: + if ":" in child: + ck, _, cv = child.partition(":") + mapping[ck.strip()] = cv.strip() + result[key] = mapping + continue + + # Inline scalar + result[key] = rest + i += 1 + continue + + i += 1 + + return result + + +def inject_config(config: dict, config_path: Path) -> None: + """Print context lines from the loaded config to stdout.""" + version = config.get("version") + # Fallback parser returns strings; PyYAML returns int. Accept both. 
+ if str(version) != "1": + debug(f"unsupported config version: {version!r}") + return + + team = config.get("team", "") + operator = config.get("operator", "") + + # Header line + label = f" for team: {team}" if team else "" + print(f"[team-config] Loaded {config_path.name}{label}") + + if operator: + print(f"[team-config] Operator: {operator}") + + # Free-form context block + context = config.get("context", "") + if context: + for line in str(context).splitlines(): + stripped = line.strip() + if stripped: + print(f"[team-config] {stripped}") + + # Hints + hints = config.get("hints") or [] + if isinstance(hints, list): + for hint in hints: + if hint: + print(f"[team-hint] {hint}") + + # Env vars + env = config.get("env") or {} + if isinstance(env, dict): + for key, value in env.items(): + print(f"[team-config] Env: {key}={value}") + + +def main() -> None: + try: + config_path = find_config() + if config_path is None: + return # Silent — no config found + + config = load_yaml(config_path) + inject_config(config, config_path) + + except Exception as e: + debug(f"error loading team config: {e}") + + +if __name__ == "__main__": + try: + main() + except Exception as e: + if DEBUG: + print(f"[team-config-loader] fatal: {e}", file=sys.stderr) + finally: + sys.exit(0) diff --git a/hooks/tests/test_sql_injection_detector.py b/hooks/tests/test_sql_injection_detector.py new file mode 100644 index 00000000..c2829c12 --- /dev/null +++ b/hooks/tests/test_sql_injection_detector.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +""" +Tests for the sql-injection-detector hook. 
+ +Run with: python3 hooks/tests/test_sql_injection_detector.py + +Verifies: +- Python f-string with SQL keyword → warning +- Python + concatenation with SQL → warning +- Python .format() with SQL → warning +- Parameterized query → NO warning +- Go fmt.Sprintf with SQL → warning +- Non-SQL f-string → NO warning +- Non-code file → silent +- Missing file path → silent +- File not on disk → silent +- Malformed JSON → exit 0 (non-blocking) +- First 5 findings capped, overflow reported +""" + +import json +import subprocess +import sys +import tempfile +from pathlib import Path + +HOOK_PATH = Path(__file__).parent.parent / "sql-injection-detector.py" + + +def run_hook(event: dict) -> tuple[str, str, int]: + """Run the hook with given event and return (stdout, stderr, exit_code).""" + result = subprocess.run( + [sys.executable, str(HOOK_PATH)], + input=json.dumps(event), + capture_output=True, + text=True, + ) + return result.stdout, result.stderr, result.returncode + + +def run_hook_with_file(content: str, extension: str = ".py") -> tuple[str, str, int]: + """Write content to a temp file then run the hook against it.""" + with tempfile.NamedTemporaryFile(suffix=extension, mode="w", delete=False, dir="/tmp") as f: + f.write(content) + tmp_path = f.name + + try: + event = { + "type": "PostToolUse", + "tool_name": "Write", + "tool_input": {"file_path": tmp_path}, + } + return run_hook(event) + finally: + Path(tmp_path).unlink(missing_ok=True) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_python_fstring_sql_warning(): + """Python f-string with SQL keyword should emit a warning.""" + code = 'query = f"SELECT * FROM users WHERE id = {user_id}"\n' + stdout, _, code_rc = run_hook_with_file(code) + assert code_rc == 0 + assert "[sql-injection]" in stdout + + +def test_python_concatenation_sql_warning(): + """Python + concatenation with SQL 
context should emit a warning.""" + code = 'sql = "SELECT * FROM users WHERE name = " + name\n' + stdout, _, rc = run_hook_with_file(code) + assert rc == 0 + assert "[sql-injection]" in stdout + assert "string-concatenation" in stdout + + +def test_python_format_sql_warning(): + """Python .format() on a SQL string should emit a warning.""" + code = 'query = "SELECT * FROM {} WHERE id = {}".format(table, user_id)\n' + stdout, _, rc = run_hook_with_file(code) + assert rc == 0 + assert "[sql-injection]" in stdout + assert "format-injection" in stdout + + +def test_parameterized_query_no_warning(): + """Proper parameterized query should NOT emit a warning.""" + code = "sql = 'SELECT * FROM users WHERE id = ?'\ncursor.execute(sql, (user_id,))\n" + stdout, _, rc = run_hook_with_file(code) + assert rc == 0 + assert "[sql-injection]" not in stdout + + +def test_go_fmt_sprintf_warning(): + """Go fmt.Sprintf with SQL percent placeholders should emit a warning.""" + code = 'query := fmt.Sprintf("SELECT * FROM users WHERE id = %s", userID)\n' + stdout, _, rc = run_hook_with_file(code, extension=".go") + assert rc == 0 + assert "[sql-injection]" in stdout + assert "sprintf-injection" in stdout + + +def test_non_sql_fstring_no_warning(): + """f-string that doesn't contain SQL keywords should NOT emit a warning.""" + code = 'msg = f"Hello, {name}! Welcome to {place}."\n' + stdout, _, rc = run_hook_with_file(code) + assert rc == 0 + assert "[sql-injection]" not in stdout + + +def test_non_code_file_silent(): + """Non-code file (e.g. 
.md) should be silently skipped.""" + with tempfile.NamedTemporaryFile(suffix=".md", mode="w", delete=False, dir="/tmp") as f: + f.write('query = "SELECT * FROM users WHERE id = " + user_id\n') + tmp_path = f.name + + try: + event = { + "type": "PostToolUse", + "tool_name": "Write", + "tool_input": {"file_path": tmp_path}, + } + stdout, _, rc = run_hook(event) + assert rc == 0 + assert stdout == "" + finally: + Path(tmp_path).unlink(missing_ok=True) + + +def test_missing_file_path_silent(): + """Missing file_path in tool_input should produce no output.""" + event = { + "type": "PostToolUse", + "tool_name": "Write", + "tool_input": {}, + } + stdout, _, rc = run_hook(event) + assert rc == 0 + assert stdout == "" + + +def test_file_not_on_disk_silent(): + """Nonexistent file should be silently skipped.""" + event = { + "type": "PostToolUse", + "tool_name": "Write", + "tool_input": {"file_path": "/tmp/does_not_exist_xyz123.py"}, + } + stdout, _, rc = run_hook(event) + assert rc == 0 + assert stdout == "" + + +def test_malformed_json_exits_zero(): + """Malformed JSON input should not crash — hook must exit 0.""" + result = subprocess.run( + [sys.executable, str(HOOK_PATH)], + input="this is not json", + capture_output=True, + text=True, + ) + assert result.returncode == 0 + + +def test_findings_capped_at_five(): + """More than 5 findings should be capped with an overflow line.""" + lines = [] + for i in range(8): + lines.append(f'sql{i} = "SELECT * FROM t WHERE a = " + val{i}') + code = "\n".join(lines) + "\n" + stdout, _, rc = run_hook_with_file(code) + assert rc == 0 + assert "[sql-injection]" in stdout + assert "more sql-injection hints" in stdout + + +def test_java_string_format_warning(): + """Java String.format with SQL placeholders should emit a warning.""" + code = 'String q = String.format("SELECT * FROM users WHERE id = %s", userId);\n' + stdout, _, rc = run_hook_with_file(code, extension=".java") + assert rc == 0 + assert "[sql-injection]" in stdout + assert 
"sprintf-injection" in stdout + + +def test_fstring_where_clause_warning(): + """f-string with WHERE (not in SELECT set) should emit a warning.""" + code = 'q = f"WHERE user_id = {uid} AND active = 1"\n' + stdout, _, rc = run_hook_with_file(code) + assert rc == 0 + assert "[sql-injection]" in stdout + assert "fstring-injection" in stdout + + +def test_multiline_sql_concat_warning(): + """Multi-line SQL building via += should emit a warning.""" + code = 'query += " WHERE user_id = " + str(uid)\n' + stdout, _, rc = run_hook_with_file(code) + assert rc == 0 + assert "[sql-injection]" in stdout + + +if __name__ == "__main__": + tests = [ + test_python_fstring_sql_warning, + test_python_concatenation_sql_warning, + test_python_format_sql_warning, + test_parameterized_query_no_warning, + test_go_fmt_sprintf_warning, + test_non_sql_fstring_no_warning, + test_non_code_file_silent, + test_missing_file_path_silent, + test_file_not_on_disk_silent, + test_malformed_json_exits_zero, + test_findings_capped_at_five, + test_java_string_format_warning, + test_fstring_where_clause_warning, + test_multiline_sql_concat_warning, + ] + + print("Running sql-injection-detector hook tests...\n") + passed = 0 + failed = 0 + + for test in tests: + try: + test() + print(f" \u2713 {test.__name__}") + passed += 1 + except AssertionError as e: + print(f" \u2717 {test.__name__}: {e}") + failed += 1 + except Exception as e: + print(f" \u2717 {test.__name__}: Exception - {e}") + failed += 1 + + print(f"\n{passed} passed, {failed} failed") + sys.exit(0 if failed == 0 else 1) diff --git a/perses-plugin-example/README.md b/perses-plugin-example/README.md new file mode 100644 index 00000000..bae79b0e --- /dev/null +++ b/perses-plugin-example/README.md @@ -0,0 +1,73 @@ +# ExamplePanel Plugin + +A minimal Perses Panel plugin scaffold demonstrating the CUE schema and React component conventions. 
+ +## Plugin Details + +| Field | Value | +|-------|-------| +| Type | Panel | +| Kind | `ExamplePanel` | +| Package | `@perses-dev/example-panel-plugin` | + +The panel renders a configured query string and optional display unit. It is intended as a starting point — replace the component body with your visualization logic. + +## Spec Fields + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `query` | string | Yes | Data query executed against the datasource | +| `unit` | string | No | Display unit appended to values (e.g. `ms`, `%`) | + +## Development + +### Test Schemas + +Validate the CUE schema against the JSON example before building: + +```bash +percli plugin test-schemas +``` + +All schema tests must pass before proceeding to build. + +### Build + +Create the distributable archive: + +```bash +percli plugin build +``` + +The archive will contain `package.json`, `schemas/`, `__mf/`, and `mf-manifest.json`. + +### Hot-Reload Dev Server + +Run against a local Perses instance for live development: + +```bash +percli plugin start +``` + +## Deploy to Perses + +1. Build the plugin archive with `percli plugin build`. +2. Copy the resulting `.tar.gz` (or `.zip`) into the `plugins-archive/` directory of your Perses server installation. +3. Restart the Perses server — it will unpack and register the plugin automatically. +4. Reference the plugin in a dashboard panel definition using `kind: "ExamplePanel"`. 
+ +## Example Dashboard Panel Definition + +```yaml +kind: Panel +metadata: + name: my-example-panel +spec: + display: + name: My Example Panel + plugin: + kind: ExamplePanel + spec: + query: 'up{job="prometheus"}' + unit: short +``` diff --git a/perses-plugin-example/package.json b/perses-plugin-example/package.json new file mode 100644 index 00000000..b4607f27 --- /dev/null +++ b/perses-plugin-example/package.json @@ -0,0 +1,30 @@ +{ + "name": "@perses-dev/example-panel-plugin", + "version": "0.1.0", + "description": "Example Perses Panel plugin scaffold demonstrating CUE schema and React component conventions.", + "main": "src/index.ts", + "scripts": { + "dev": "rsbuild dev", + "build": "rsbuild build", + "preview": "rsbuild preview", + "type-check": "tsc --noEmit", + "test-schemas": "percli plugin test-schemas" + }, + "dependencies": { + "@perses-dev/core": "^0.48.0", + "@perses-dev/plugin-system": "^0.48.0", + "react": "^18.2.0", + "react-dom": "^18.2.0" + }, + "devDependencies": { + "@rsbuild/core": "^0.7.0", + "@rsbuild/plugin-react": "^0.7.0", + "@types/react": "^18.2.0", + "@types/react-dom": "^18.2.0", + "typescript": "^5.4.0" + }, + "peerDependencies": { + "react": "^18.2.0", + "react-dom": "^18.2.0" + } +} diff --git a/perses-plugin-example/rsbuild.config.ts b/perses-plugin-example/rsbuild.config.ts new file mode 100644 index 00000000..73a50563 --- /dev/null +++ b/perses-plugin-example/rsbuild.config.ts @@ -0,0 +1,28 @@ +import { defineConfig } from "@rsbuild/core"; +import { pluginReact } from "@rsbuild/plugin-react"; + +export default defineConfig({ + plugins: [pluginReact()], + tools: { + rspack: { + output: { + uniqueName: "example-panel-plugin", + }, + }, + }, + moduleFederation: { + options: { + name: "ExamplePanelPlugin", + filename: "remoteEntry.js", + exposes: { + ".": "./src/index.ts", + }, + shared: { + react: { singleton: true, requiredVersion: "^18.2.0" }, + "react-dom": { singleton: true, requiredVersion: "^18.2.0" }, + "@perses-dev/core": 
{ singleton: true }, + "@perses-dev/plugin-system": { singleton: true }, + }, + }, + }, +}); diff --git a/perses-plugin-example/schemas/panels/example-panel/example-panel.cue b/perses-plugin-example/schemas/panels/example-panel/example-panel.cue new file mode 100644 index 00000000..2ef69d43 --- /dev/null +++ b/perses-plugin-example/schemas/panels/example-panel/example-panel.cue @@ -0,0 +1,11 @@ +package model + +kind: "ExamplePanel" +spec: close({ + // query is the data query string to execute against the datasource. + // Required — panel cannot render without a target query. + query: string + + // unit is an optional display unit appended to rendered values (e.g. "ms", "%", "req/s"). + unit?: string +}) diff --git a/perses-plugin-example/schemas/panels/example-panel/example-panel.json b/perses-plugin-example/schemas/panels/example-panel/example-panel.json new file mode 100644 index 00000000..a7ca867d --- /dev/null +++ b/perses-plugin-example/schemas/panels/example-panel/example-panel.json @@ -0,0 +1,6 @@ +{ + "kind": "ExamplePanel", + "spec": { + "query": "up{job=\"prometheus\"}" + } +} diff --git a/perses-plugin-example/schemas/panels/example-panel/testdata/full-config.json b/perses-plugin-example/schemas/panels/example-panel/testdata/full-config.json new file mode 100644 index 00000000..4b055cc5 --- /dev/null +++ b/perses-plugin-example/schemas/panels/example-panel/testdata/full-config.json @@ -0,0 +1,7 @@ +{ + "kind": "ExamplePanel", + "spec": { + "query": "rate(http_requests_total{job=\"api-server\"}[5m])", + "unit": "req/s" + } +} diff --git a/perses-plugin-example/src/ExamplePanel.tsx b/perses-plugin-example/src/ExamplePanel.tsx new file mode 100644 index 00000000..fc27d2af --- /dev/null +++ b/perses-plugin-example/src/ExamplePanel.tsx @@ -0,0 +1,50 @@ +import React from 'react'; +import { PanelProps } from '@perses-dev/plugin-system'; +import { ExamplePanelSpec } from './ExamplePanelTypes'; + +/** + * ExamplePanel renders the configured query string and 
optional unit. + * + * This is a minimal display panel used as a scaffolding reference. + * Replace the body with chart/table rendering as needed. + */ +export function ExamplePanel(props: PanelProps): JSX.Element { + const { spec } = props; + + return ( +
+
Query
+
+ {spec.query} +
+ {spec.unit !== undefined && ( + <> +
Unit
+
{spec.unit}
+ + )} +
+ ); +} diff --git a/perses-plugin-example/src/ExamplePanelTypes.ts b/perses-plugin-example/src/ExamplePanelTypes.ts new file mode 100644 index 00000000..9214db9f --- /dev/null +++ b/perses-plugin-example/src/ExamplePanelTypes.ts @@ -0,0 +1,13 @@ +/** + * ExamplePanelSpec mirrors the CUE schema at + * schemas/panels/example-panel/spec.cue. + * + * Field names and optionality MUST stay in sync with the CUE definition. + */ +export interface ExamplePanelSpec { + /** The data query string executed against the configured datasource. */ + query: string; + + /** Optional display unit appended to rendered values (e.g. "ms", "%", "req/s"). */ + unit?: string; +} diff --git a/perses-plugin-example/src/index.ts b/perses-plugin-example/src/index.ts new file mode 100644 index 00000000..5e6c30eb --- /dev/null +++ b/perses-plugin-example/src/index.ts @@ -0,0 +1,19 @@ +import { PanelPlugin } from '@perses-dev/plugin-system'; +import { ExamplePanel } from './ExamplePanel'; +import { ExamplePanelSpec } from './ExamplePanelTypes'; + +/** + * Plugin registration. 
+ * + * The `kind` string "ExamplePanel" MUST match: + * - The `kind` field in schemas/panels/example-panel/spec.cue + * - The `kind` field in any Perses dashboard panel definition referencing this plugin + */ +export const ExamplePanelPlugin: PanelPlugin = { + PanelComponent: ExamplePanel, + panelOptionsEditorComponents: [], + hide: false, +}; + +export { ExamplePanel } from './ExamplePanel'; +export type { ExamplePanelSpec } from './ExamplePanelTypes'; diff --git a/perses-plugin-example/tsconfig.json b/perses-plugin-example/tsconfig.json new file mode 100644 index 00000000..e7ac9abc --- /dev/null +++ b/perses-plugin-example/tsconfig.json @@ -0,0 +1,23 @@ +{ + "compilerOptions": { + "target": "ES2020", + "lib": ["ES2020", "DOM", "DOM.Iterable"], + "module": "ESNext", + "moduleResolution": "bundler", + "jsx": "react-jsx", + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noFallthroughCasesInSwitch": true, + "allowSyntheticDefaultImports": true, + "esModuleInterop": true, + "skipLibCheck": true, + "resolveJsonModule": true, + "outDir": "dist", + "declaration": true, + "declarationMap": true, + "sourceMap": true + }, + "include": ["src"], + "exclude": ["node_modules", "dist", "__mf"] +} diff --git a/plugins/custom-panel/package.json b/plugins/custom-panel/package.json new file mode 100644 index 00000000..7f5819c8 --- /dev/null +++ b/plugins/custom-panel/package.json @@ -0,0 +1,30 @@ +{ + "name": "@perses-dev/custom-panel-plugin", + "version": "0.1.0", + "description": "Custom panel plugin for Perses", + "main": "src/index.ts", + "scripts": { + "dev": "rsbuild dev", + "build": "rsbuild build", + "preview": "rsbuild preview", + "type-check": "tsc --noEmit", + "test-schemas": "percli plugin test-schemas" + }, + "dependencies": { + "@perses-dev/core": "^0.48.0", + "@perses-dev/plugin-system": "^0.48.0", + "react": "^18.2.0", + "react-dom": "^18.2.0" + }, + "devDependencies": { + "@rsbuild/core": "^0.7.0", + "@rsbuild/plugin-react": "^0.7.0", + 
"@types/react": "^18.2.0", + "@types/react-dom": "^18.2.0", + "typescript": "^5.4.0" + }, + "peerDependencies": { + "react": "^18.2.0", + "react-dom": "^18.2.0" + } +} diff --git a/plugins/custom-panel/rsbuild.config.ts b/plugins/custom-panel/rsbuild.config.ts new file mode 100644 index 00000000..dd79ebde --- /dev/null +++ b/plugins/custom-panel/rsbuild.config.ts @@ -0,0 +1,28 @@ +import { defineConfig } from "@rsbuild/core"; +import { pluginReact } from "@rsbuild/plugin-react"; + +export default defineConfig({ + plugins: [pluginReact()], + tools: { + rspack: { + output: { + uniqueName: "custom-panel-plugin", + }, + }, + }, + moduleFederation: { + options: { + name: "CustomPanelPlugin", + filename: "remoteEntry.js", + exposes: { + ".": "./src/index.ts", + }, + shared: { + react: { singleton: true, requiredVersion: "^18.2.0" }, + "react-dom": { singleton: true, requiredVersion: "^18.2.0" }, + "@perses-dev/core": { singleton: true }, + "@perses-dev/plugin-system": { singleton: true }, + }, + }, + }, +}); diff --git a/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.cue b/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.cue new file mode 100644 index 00000000..9bcaff12 --- /dev/null +++ b/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.cue @@ -0,0 +1,25 @@ +// Copyright 2024 The Perses Authors +// Licensed under the Apache License, Version 2.0 + +package model + +kind: "CustomPanel" +spec: close({ + // title is the display label rendered at the top of the panel. + title: string + + // unit controls how numeric values are formatted (e.g. "bytes", "percent", "short"). + unit?: string + + // thresholds defines a list of color-coded threshold steps. + // Each step specifies a numeric value and a display color. + thresholds?: [...#ThresholdStep] +}) + +// ThresholdStep pairs a numeric boundary with a display color. +#ThresholdStep: { + // value is the lower boundary of this threshold band. 
+ value: number + // color is a CSS-compatible color string (e.g. "#e02f44", "green"). + color: string +} diff --git a/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.json b/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.json new file mode 100644 index 00000000..782f5ca2 --- /dev/null +++ b/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.json @@ -0,0 +1,6 @@ +{ + "kind": "CustomPanel", + "spec": { + "title": "My Custom Panel" + } +} diff --git a/plugins/custom-panel/schemas/panels/custom-panel/migrate/migrate.cue b/plugins/custom-panel/schemas/panels/custom-panel/migrate/migrate.cue new file mode 100644 index 00000000..be65f70b --- /dev/null +++ b/plugins/custom-panel/schemas/panels/custom-panel/migrate/migrate.cue @@ -0,0 +1,44 @@ +// Copyright 2024 The Perses Authors +// Licensed under the Apache License, Version 2.0 +// +// migrate.cue maps a Grafana "stat" panel definition to a Perses CustomPanel spec. +// Supported Grafana panel types: stat, singlestat +// +// Unsupported Grafana fields (no direct equivalent in CustomPanel): +// - options.graphMode +// - options.colorMode +// - options.justifyMode +// - fieldConfig.defaults.mappings + +package migrate + +import ( + "github.com/perses/perses/cue/schemas/panels/migrate" +) + +migrate.#Panel & { + // target is the resulting Perses panel spec after migration. + target: { + kind: "CustomPanel" + spec: { + // Map the Grafana panel title to the Perses title field. + title: grafana.title + + // Map the Grafana unit override if present. + if grafana.fieldConfig.defaults.unit != _|_ { + unit: grafana.fieldConfig.defaults.unit + } + + // Map Grafana threshold steps to Perses threshold steps. 
+ if grafana.fieldConfig.defaults.thresholds.steps != _|_ { + thresholds: [ + for step in grafana.fieldConfig.defaults.thresholds.steps + if step.value != _|_ { + value: step.value + color: step.color + }, + ] + } + } + } +} diff --git a/plugins/custom-panel/schemas/panels/custom-panel/testdata/full-config.json b/plugins/custom-panel/schemas/panels/custom-panel/testdata/full-config.json new file mode 100644 index 00000000..513d3614 --- /dev/null +++ b/plugins/custom-panel/schemas/panels/custom-panel/testdata/full-config.json @@ -0,0 +1,12 @@ +{ + "kind": "CustomPanel", + "spec": { + "title": "Request Latency", + "unit": "ms", + "thresholds": [ + { "value": 0, "color": "green" }, + { "value": 200, "color": "#ff9900" }, + { "value": 500, "color": "#e02f44" } + ] + } +} diff --git a/plugins/custom-panel/src/PanelComponent.tsx b/plugins/custom-panel/src/PanelComponent.tsx new file mode 100644 index 00000000..bdd72920 --- /dev/null +++ b/plugins/custom-panel/src/PanelComponent.tsx @@ -0,0 +1,160 @@ +import React from "react"; +import { PanelProps } from "@perses-dev/plugin-system"; +import { CustomPanelSpec, ThresholdStep } from "./types"; + +/** + * resolveThresholdColor returns the color for the highest threshold whose + * value is <= the provided numeric value, or undefined when no value is given. + */ +function resolveThresholdColor( + value: number | undefined, + thresholds: ThresholdStep[] | undefined +): string | undefined { + if (value === undefined || !thresholds || thresholds.length === 0) { + return undefined; + } + const sorted = [...thresholds].sort((a, b) => a.value - b.value); + let resolved: string | undefined; + for (const step of sorted) { + if (value >= step.value) { + resolved = step.color; + } + } + return resolved; +} + +/** + * CustomPanelComponent renders the CustomPanel spec. + * + * - Displays the configured title as the panel heading. + * - Shows each threshold step as a color swatch with its boundary value. 
+ * - Applies the appropriate threshold color to the unit label when a + * representative value is available from the panel data context. + */ +export function CustomPanelComponent({ + spec, +}: PanelProps): React.ReactElement { + const { title, unit, thresholds } = spec; + + // Derive a representative numeric value from the first query result when + // available. Falls back to undefined so the component renders gracefully + // with no live data (e.g. during plugin development or empty dashboards). + const representativeValue: number | undefined = undefined; + const activeColor = resolveThresholdColor(representativeValue, thresholds); + + return ( +
+ {/* Panel heading */} +

{title}

+ + {/* Unit display with optional threshold color */} + {unit !== undefined && ( +
+ {unit} +
+ )} + + {/* Threshold legend */} + {thresholds && thresholds.length > 0 && ( +
+

Thresholds

+
    + {thresholds + .slice() + .sort((a, b) => a.value - b.value) + .map((step, idx) => ( +
  • + + + ≥ {step.value} + {unit ? ` ${unit}` : ""} + +
  • + ))} +
+
+ )} + + {/* Empty state */} + {(!thresholds || thresholds.length === 0) && unit === undefined && ( +

No configuration to display.

+ )} +
+ ); +} + +// Inline styles — replace with your design system tokens or CSS modules as needed. +const styles = { + container: { + padding: "12px 16px", + fontFamily: "inherit", + height: "100%", + boxSizing: "border-box" as const, + overflow: "auto", + }, + title: { + margin: "0 0 8px 0", + fontSize: "1rem", + fontWeight: 600, + lineHeight: 1.4, + }, + unitBadge: { + display: "inline-block", + padding: "2px 8px", + borderRadius: "4px", + fontSize: "0.875rem", + fontWeight: 500, + backgroundColor: "#e0e0e0", + marginBottom: "12px", + }, + thresholdsSection: { + marginTop: "8px", + }, + thresholdsHeading: { + margin: "0 0 6px 0", + fontSize: "0.75rem", + fontWeight: 600, + textTransform: "uppercase" as const, + letterSpacing: "0.05em", + color: "#666", + }, + thresholdList: { + listStyle: "none", + margin: 0, + padding: 0, + display: "flex", + flexDirection: "column" as const, + gap: "4px", + }, + thresholdItem: { + display: "flex", + alignItems: "center", + gap: "8px", + }, + swatch: { + width: "14px", + height: "14px", + borderRadius: "2px", + flexShrink: 0, + border: "1px solid rgba(0,0,0,0.1)", + }, + thresholdLabel: { + fontSize: "0.875rem", + }, + emptyState: { + color: "#999", + fontSize: "0.875rem", + margin: 0, + }, +} as const; diff --git a/plugins/custom-panel/src/index.ts b/plugins/custom-panel/src/index.ts new file mode 100644 index 00000000..08f518b2 --- /dev/null +++ b/plugins/custom-panel/src/index.ts @@ -0,0 +1,19 @@ +import { PanelPlugin } from "@perses-dev/plugin-system"; +import { CustomPanelComponent } from "./PanelComponent"; +import { CustomPanelSpec } from "./types"; + +/** + * Plugin registration. 
+ * + * The `kind` string "CustomPanel" MUST match: + * - The `kind` field in schemas/panels/custom-panel/custom-panel.cue + * - The `kind` field in any Perses dashboard panel definition referencing this plugin + */ +export const CustomPanelPlugin: PanelPlugin = { + PanelComponent: CustomPanelComponent, + panelOptionsEditorComponents: [], + hide: false, +}; + +export { CustomPanelComponent } from "./PanelComponent"; +export type { CustomPanelSpec, ThresholdStep } from "./types"; diff --git a/plugins/custom-panel/src/types.ts b/plugins/custom-panel/src/types.ts new file mode 100644 index 00000000..aead06c3 --- /dev/null +++ b/plugins/custom-panel/src/types.ts @@ -0,0 +1,21 @@ +/** + * ThresholdStep pairs a numeric lower boundary with a CSS color string. + * Mirrors the #ThresholdStep CUE definition in the schema. + */ +export interface ThresholdStep { + value: number; + color: string; +} + +/** + * CustomPanelSpec is the validated configuration for a CustomPanel. + * All fields mirror the CUE schema at schemas/panels/custom-panel/custom-panel.cue. + */ +export interface CustomPanelSpec { + /** Display label rendered at the top of the panel. */ + title: string; + /** Value formatting unit (e.g. "bytes", "percent", "ms", "short"). */ + unit?: string; + /** Color-coded threshold steps. 
*/ + thresholds?: ThresholdStep[]; +} diff --git a/plugins/custom-panel/tsconfig.json b/plugins/custom-panel/tsconfig.json new file mode 100644 index 00000000..e7ac9abc --- /dev/null +++ b/plugins/custom-panel/tsconfig.json @@ -0,0 +1,23 @@ +{ + "compilerOptions": { + "target": "ES2020", + "lib": ["ES2020", "DOM", "DOM.Iterable"], + "module": "ESNext", + "moduleResolution": "bundler", + "jsx": "react-jsx", + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noFallthroughCasesInSwitch": true, + "allowSyntheticDefaultImports": true, + "esModuleInterop": true, + "skipLibCheck": true, + "resolveJsonModule": true, + "outDir": "dist", + "declaration": true, + "declarationMap": true, + "sourceMap": true + }, + "include": ["src"], + "exclude": ["node_modules", "dist", "__mf"] +} diff --git a/plugins/example-panel/schemas/panels/example-panel/example-panel.cue b/plugins/example-panel/schemas/panels/example-panel/example-panel.cue new file mode 100644 index 00000000..4132ca9c --- /dev/null +++ b/plugins/example-panel/schemas/panels/example-panel/example-panel.cue @@ -0,0 +1,21 @@ +// Copyright 2024 The Perses Authors +// Licensed under the Apache License, Version 2.0 + +package model + +kind: "ExamplePanel" +spec: close({ + // text is the message displayed in the center of the panel. + // Defaults to "Hello from ExamplePanel" when omitted. + text: string | *"Hello from ExamplePanel" + + // color is a CSS-compatible color string applied to the text. + // Accepts any valid CSS color: hex (#333333), named (red), rgb(...). + color: string | *"#333333" + + // fontSize controls text size in pixels. Clamped to the range 10–72. + fontSize: int & >=10 & <=72 | *16 + + // align controls horizontal text alignment within the panel. 
+ align: "left" | "center" | "right" | *"center" +}) diff --git a/plugins/example-panel/schemas/panels/example-panel/example-panel.json b/plugins/example-panel/schemas/panels/example-panel/example-panel.json new file mode 100644 index 00000000..cb7b5d11 --- /dev/null +++ b/plugins/example-panel/schemas/panels/example-panel/example-panel.json @@ -0,0 +1,6 @@ +{ + "kind": "ExamplePanel", + "spec": { + "text": "Hello from ExamplePanel" + } +} diff --git a/plugins/example-panel/schemas/panels/example-panel/testdata/full-config.json b/plugins/example-panel/schemas/panels/example-panel/testdata/full-config.json new file mode 100644 index 00000000..34e00442 --- /dev/null +++ b/plugins/example-panel/schemas/panels/example-panel/testdata/full-config.json @@ -0,0 +1,9 @@ +{ + "kind": "ExamplePanel", + "spec": { + "text": "System Status: Nominal", + "color": "#1a7f37", + "fontSize": 24, + "align": "center" + } +} diff --git a/scripts/skill_eval/run_eval.py b/scripts/skill_eval/run_eval.py index 383e74b5..372a877a 100755 --- a/scripts/skill_eval/run_eval.py +++ b/scripts/skill_eval/run_eval.py @@ -6,11 +6,14 @@ """ import argparse +import contextlib import json import os import select +import shutil import subprocess import sys +import tempfile import time import uuid from concurrent.futures import ProcessPoolExecutor, as_completed @@ -32,40 +35,147 @@ def find_project_root() -> Path: return current +def resolve_registered_skill_relpath(skill_path: Path, project_root: Path) -> Path | None: + """Return repo-relative SKILL.md path when `skill_path` is a registered repo skill.""" + skill_md = (skill_path / "SKILL.md").resolve() + try: + rel = skill_md.relative_to(project_root.resolve()) + except ValueError: + return None + if len(rel.parts) >= 3 and rel.parts[0] == "skills" and rel.parts[-1] == "SKILL.md": + return rel + return None + + +def replace_description_in_skill_md(content: str, new_description: str) -> str: + """Replace the top-level frontmatter description field in 
SKILL.md content.""" + lines = content.splitlines() + if not lines or lines[0].strip() != "---": + raise ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + frontmatter_lines = lines[1:end_idx] + body_lines = lines[end_idx + 1 :] + updated_frontmatter: list[str] = [] + replaced = False + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if not replaced and line.startswith("description:"): + updated_frontmatter.append("description: |") + updated_frontmatter.extend(f" {desc_line}" for desc_line in new_description.splitlines()) + replaced = True + i += 1 + while i < len(frontmatter_lines) and ( + frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t") + ): + i += 1 + continue + updated_frontmatter.append(line) + i += 1 + + if not replaced: + raise ValueError("SKILL.md frontmatter missing description field") + + rebuilt = ["---", *updated_frontmatter, "---", *body_lines] + return "\n".join(rebuilt) + ("\n" if content.endswith("\n") else "") + + +def load_eval_set(path: Path) -> list[dict]: + """Load eval tasks from list or common wrapped JSON shapes.""" + payload = json.loads(path.read_text()) + if isinstance(payload, list): + return payload + if isinstance(payload, dict): + if "tasks" in payload and isinstance(payload["tasks"], list): + return payload["tasks"] + if "queries" in payload and isinstance(payload["queries"], list): + return payload["queries"] + train = payload.get("train") + test = payload.get("test") + if isinstance(train, list) or isinstance(test, list): + return [*(train or []), *(test or [])] + raise ValueError( + "Unsupported eval set format; expected list, {tasks:[...]}, {queries:[...]}, or {train:[...], test:[...]}" + ) + + +@contextlib.contextmanager +def candidate_worktree(project_root: 
Path, registered_skill_relpath: Path, candidate_content: str | None): + """Create a temporary git worktree and optionally patch the target skill content.""" + wt_path_str = tempfile.mkdtemp(prefix="skill-eval-wt-", dir="/tmp") + wt_path = Path(wt_path_str) + wt_path.rmdir() + try: + subprocess.run( + ["git", "worktree", "add", wt_path_str, "HEAD"], + cwd=str(project_root), + capture_output=True, + text=True, + check=True, + ) + if candidate_content is not None: + (wt_path / registered_skill_relpath).write_text(candidate_content) + yield wt_path + finally: + try: + subprocess.run( + ["git", "worktree", "remove", "--force", wt_path_str], + cwd=str(project_root), + capture_output=True, + text=True, + ) + except Exception: + pass + shutil.rmtree(wt_path_str, ignore_errors=True) + + def run_single_query( query: str, skill_name: str, skill_description: str, timeout: int, project_root: str, + eval_mode: str = "alias", model: str | None = None, ) -> bool: """Run a single query and return whether the skill was triggered. - Creates a command file in .claude/commands/ so it appears in Claude's - available_skills list, then runs `claude -p` with the raw query. + In alias mode, creates a command file in .claude/commands/ so it appears in + Claude's available skills list. In registered mode, assumes the real skill + is already present in the isolated worktree and detects only the real name. + Uses --include-partial-messages to detect triggering early from stream events (content_block_start) rather than waiting for the full assistant message, which only arrives after tool execution. 
""" unique_id = uuid.uuid4().hex[:8] clean_name = f"{skill_name}-skill-{unique_id}" + accepted_skill_ids = {clean_name} if eval_mode == "alias" else {skill_name} project_commands_dir = Path(project_root) / ".claude" / "commands" command_file = project_commands_dir / f"{clean_name}.md" try: - project_commands_dir.mkdir(parents=True, exist_ok=True) - # Use YAML block scalar to avoid breaking on quotes in description - indented_desc = "\n ".join(skill_description.split("\n")) - command_content = ( - f"---\n" - f"description: |\n" - f" {indented_desc}\n" - f"---\n\n" - f"# {skill_name}\n\n" - f"This skill handles: {skill_description}\n" - ) - command_file.write_text(command_content) + if eval_mode == "alias": + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) cmd = [ "claude", @@ -140,20 +250,24 @@ def run_single_query( pending_tool_name = tool_name accumulated_json = "" else: - return False + pending_tool_name = None + accumulated_json = "" elif se_type == "content_block_delta" and pending_tool_name: delta = se.get("delta", {}) if delta.get("type") == "input_json_delta": accumulated_json += delta.get("partial_json", "") - if clean_name in accumulated_json: - return True + if any(skill_id in accumulated_json for skill_id in accepted_skill_ids): + triggered = True elif se_type in ("content_block_stop", "message_stop"): if pending_tool_name: - return clean_name in accumulated_json + if any(skill_id in accumulated_json for skill_id in accepted_skill_ids): + triggered = True + pending_tool_name = None + accumulated_json = "" if se_type == "message_stop": - return False + return triggered # Fallback: full assistant message elif 
event.get("type") == "assistant": @@ -163,11 +277,16 @@ def run_single_query( continue tool_name = content_item.get("name", "") tool_input = content_item.get("input", {}) - if (tool_name == "Skill" and clean_name in tool_input.get("skill", "")) or ( - tool_name == "Read" and clean_name in tool_input.get("file_path", "") + if ( + tool_name == "Skill" + and any(skill_id in tool_input.get("skill", "") for skill_id in accepted_skill_ids) + ) or ( + tool_name == "Read" + and any(skill_id in tool_input.get("file_path", "") for skill_id in accepted_skill_ids) ): triggered = True - return triggered + if triggered: + return True elif event.get("type") == "result": return triggered @@ -179,7 +298,7 @@ def run_single_query( return triggered finally: - if command_file.exists(): + if eval_mode == "alias" and command_file.exists(): command_file.unlink() @@ -192,39 +311,69 @@ def run_eval( project_root: Path, runs_per_query: int = 1, trigger_threshold: float = 0.5, + eval_mode: str = "auto", + skill_path: Path | None = None, + candidate_content: str | None = None, model: str | None = None, ) -> dict: """Run the full eval set and return results.""" results = [] - with ProcessPoolExecutor(max_workers=num_workers) as executor: - future_to_info = {} - for item in eval_set: - for run_idx in range(runs_per_query): - future = executor.submit( - run_single_query, - item["query"], - skill_name, - description, - timeout, - str(project_root), - model, - ) - future_to_info[future] = (item, run_idx) - - query_triggers: dict[str, list[bool]] = {} - query_items: dict[str, dict] = {} - for future in as_completed(future_to_info): - item, _ = future_to_info[future] - query = item["query"] - query_items[query] = item - if query not in query_triggers: - query_triggers[query] = [] - try: - query_triggers[query].append(future.result()) - except Exception as e: - print(f"Warning: query failed: {e}", file=sys.stderr) - query_triggers[query].append(False) + effective_mode = eval_mode + 
effective_project_root = project_root + worktree_cm = contextlib.nullcontext(project_root) + + if effective_mode == "auto": + if skill_path is not None and resolve_registered_skill_relpath(skill_path, project_root) is not None: + effective_mode = "registered" + else: + effective_mode = "alias" + + if effective_mode == "registered": + if skill_path is None: + raise ValueError("registered eval mode requires skill_path") + relpath = resolve_registered_skill_relpath(skill_path, project_root) + if relpath is None: + raise ValueError("registered eval mode requires skill_path under project_root/skills/*/SKILL.md") + _name, original_description, original_content = parse_skill_md(skill_path) + if candidate_content is None: + if description != original_description: + candidate_content = replace_description_in_skill_md(original_content, description) + else: + candidate_content = original_content + worktree_cm = candidate_worktree(project_root, relpath, candidate_content) + + with worktree_cm as active_project_root: + effective_project_root = active_project_root + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(effective_project_root), + effective_mode, + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + query_triggers[query].append(future.result()) + except Exception as e: + print(f"Warning: query failed: {e}", file=sys.stderr) + query_triggers[query].append(False) for query, triggers in query_triggers.items(): item = query_items[query] @@ -266,15 +415,17 @@ def main(): 
parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") parser.add_argument("--skill-path", required=True, help="Path to skill directory") parser.add_argument("--description", default=None, help="Override description to test") - parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--candidate-content-file", default=None, help="Optional full SKILL.md content to evaluate") + parser.add_argument("--eval-mode", choices=["auto", "registered", "alias"], default="auto", help="Evaluator mode") + parser.add_argument("--num-workers", type=int, default=1, help="Number of parallel workers") parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") - parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--runs-per-query", type=int, default=1, help="Number of runs per query") parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") args = parser.parse_args() - eval_set = json.loads(Path(args.eval_set).read_text()) + eval_set = load_eval_set(Path(args.eval_set)) skill_path = Path(args.skill_path) if not (skill_path / "SKILL.md").exists(): @@ -284,9 +435,11 @@ def main(): name, original_description, _content = parse_skill_md(skill_path) description = args.description or original_description project_root = find_project_root() + candidate_content = Path(args.candidate_content_file).read_text() if args.candidate_content_file else None if args.verbose: print(f"Evaluating: {description}", file=sys.stderr) + print(f"Eval mode: {args.eval_mode}", file=sys.stderr) output = run_eval( eval_set=eval_set, @@ -297,6 +450,9 @@ def main(): project_root=project_root, 
runs_per_query=args.runs_per_query, trigger_threshold=args.trigger_threshold, + eval_mode=args.eval_mode, + skill_path=skill_path, + candidate_content=candidate_content, model=args.model, ) diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py index 662b63f8..4253aef8 100644 --- a/scripts/tests/test_agent_comparison_optimize_loop.py +++ b/scripts/tests/test_agent_comparison_optimize_loop.py @@ -1,3 +1,4 @@ +import contextlib import importlib.util import json import subprocess @@ -110,11 +111,203 @@ def fake_run(cmd, capture_output, text, cwd, env, timeout): generate_variant.main() output = json.loads(capsys.readouterr().out) - assert output["variant"] == "---\ndescription: updated\n---" + assert generate_variant.extract_description(output["variant"]) == "updated" assert output["tokens_used"] == 3 assert output["reasoning"] == "raw result" +def test_generate_variant_only_changes_description_field(monkeypatch): + generate_variant = load_module( + "agent_comparison_generate_variant_description_only", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + current_content = """--- +name: example-skill +description: | + old description +routing: + triggers: + - "keep-this-trigger" +--- + +# Skill + +Body stays the same. +""" + + def fake_run_claude_code(prompt, model): + return ( + "new description line 1\nnew description line 2" + "improved description", + "raw result", + 9, + ) + + monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code) + + result = generate_variant.generate_variant( + target_path="skills/example/SKILL.md", + goal="improve routing precision", + current_content=current_content, + failures=[], + model=None, + ) + + assert generate_variant.extract_description(result["variant"]) == "new description line 1\nnew description line 2" + assert ' - "keep-this-trigger"' in result["variant"] + assert "# Skill" in result["variant"] + assert "Body stays the same." 
in result["variant"] + assert result["deletions"] == [] + + +def test_generate_variant_legacy_full_file_output_is_reduced_to_description_only(monkeypatch): + generate_variant = load_module( + "agent_comparison_generate_variant_legacy_variant", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + current_content = """--- +name: example-skill +description: old description +routing: + triggers: + - "keep-this-trigger" +--- + +# Skill + +Original body. +""" + + legacy_variant = """--- +name: example-skill +description: updated description +routing: + triggers: + - "changed-trigger" +--- + +# Skill + +Changed body. +""" + + def fake_run_claude_code(prompt, model): + return ( + f"{legacy_variant}legacy response" + "", + "raw result", + 5, + ) + + monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code) + + result = generate_variant.generate_variant( + target_path="skills/example/SKILL.md", + goal="improve routing precision", + current_content=current_content, + failures=[], + model=None, + ) + + assert generate_variant.extract_description(result["variant"]) == "updated description" + assert ' - "keep-this-trigger"' in result["variant"] + assert ' - "changed-trigger"' not in result["variant"] + assert "Original body." in result["variant"] + assert "Changed body." not in result["variant"] + + +def test_generate_variant_body_only_changes_body_not_frontmatter(monkeypatch): + generate_variant = load_module( + "agent_comparison_generate_variant_body_only", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + current_content = """--- +name: example-skill +description: old description +version: 1.0.0 +--- + +# Skill + +Original body. 
+""" + + def fake_run_claude_code(prompt, model): + assert "" in prompt + return ( + "# Skill\n\nImproved body.\nbody change" + "", + "raw result", + 7, + ) + + monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code) + + result = generate_variant.generate_variant( + target_path="skills/example/SKILL.md", + goal="improve behavioral quality", + current_content=current_content, + failures=[], + model=None, + optimization_scope="body-only", + ) + + assert "description: old description" in result["variant"] + assert "# Skill\n\nImproved body." in result["variant"] + assert "Original body." not in result["variant"] + + +def test_generate_variant_prompt_includes_full_failed_query_and_expectation(monkeypatch): + generate_variant = load_module( + "agent_comparison_generate_variant_failure_context", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + current_content = """--- +name: example-skill +description: old description +--- + +# Skill +""" + + captured = {} + + def fake_run_claude_code(prompt, model): + captured["prompt"] = prompt + return ( + "updated description" + "improved description", + "raw result", + 4, + ) + + monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code) + + generate_variant.generate_variant( + target_path="skills/example/SKILL.md", + goal="improve routing precision", + current_content=current_content, + failures=[ + { + "name": "rubber duck this bug with me, don't solv", + "query": "rubber duck this bug with me, don't solve it yet", + "should_trigger": True, + "details": "trigger_rate=0.00", + "trigger_rate": 0.0, + } + ], + model=None, + ) + + assert "rubber duck this bug with me, don't solve it yet" in captured["prompt"] + assert "expected: SHOULD trigger" in captured["prompt"] + assert "raw_trigger_rate=0.00" in captured["prompt"] + + def test_optimize_loop_omits_model_flag_when_not_provided(tmp_path, monkeypatch): optimize_loop = load_module( "agent_comparison_optimize_loop_nomodel", 
@@ -231,7 +424,7 @@ def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None): ) assert result["status"] == "CONVERGED" - assert "2 rounds without KEEP" in result["exit_reason"] + assert "2 rounds without ACCEPT" in result["exit_reason"] def test_optimize_loop_beam_search_retains_top_k_candidates(tmp_path, monkeypatch): @@ -268,7 +461,9 @@ def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None): return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="") def fake_assess_target(path, *args, **kwargs): - content = Path(path).read_text() + content = kwargs.get("candidate_content") + if content is None: + content = Path(path).read_text() score = 0.0 if "" in content: score = 1.2 @@ -313,3 +508,449 @@ def fake_assess_target(path, *args, **kwargs): selected = [it for it in result["iterations"] if it.get("selected_for_frontier")] assert len(selected) == 2 assert selected[0]["frontier_rank"] == 1 or selected[1]["frontier_rank"] == 1 + + +def test_composite_score_uses_weighted_dimensions_only_when_hard_gates_pass(): + optimize_loop = load_module( + "agent_comparison_optimize_loop_scoring", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + scores = { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": 7.5, + "error_handling": 6.0, + "language_idioms": 5.0, + "testing": 8.0, + "efficiency": 4.0, + } + + assert optimize_loop.composite_score(scores) == 6.55 + + +def test_composite_score_returns_zero_when_hard_gate_fails(): + optimize_loop = load_module( + "agent_comparison_optimize_loop_hard_gate", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + scores = { + "parses": False, + "compiles": True, + "tests_pass": False, + "protected_intact": True, + "correctness": 10.0, + "error_handling": 10.0, + "language_idioms": 10.0, + "testing": 10.0, + "efficiency": 10.0, + } + + assert optimize_loop.composite_score(scores) == 0.0 + + +def 
test_assess_target_scores_trigger_rate_results(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_trigger_score", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\ndescription: trigger scoring test\n---\n") + tasks = [ + {"query": "good query", "should_trigger": True}, + {"query": "bad query", "should_trigger": False}, + ] + + def fake_run_trigger_rate(*args, **kwargs): + return { + "summary": {"total": 2, "passed": 1, "failed": 1}, + "results": [ + {"query": "good query", "pass": True, "trigger_rate": 1.0}, + {"query": "bad query", "pass": False, "trigger_rate": 0.0}, + ], + } + + monkeypatch.setattr(optimize_loop, "_run_trigger_rate", fake_run_trigger_rate) + + scores = optimize_loop.assess_target( + target, + tasks, + "improve routing precision", + dry_run=False, + ) + + assert scores["correctness"] == 5.0 + assert scores["error_handling"] == 4.0 + assert scores["language_idioms"] == 3.5 + assert scores["testing"] == 4.0 + assert scores["efficiency"] == 3.6 + assert scores["tests_pass"] is False + assert [item["passed"] for item in scores["task_results"]] == [True, False] + assert scores["task_results"][0]["query"] == "good query" + assert scores["task_results"][0]["should_trigger"] is True + assert scores["task_results"][1]["query"] == "bad query" + assert scores["task_results"][1]["should_trigger"] is False + assert optimize_loop.composite_score(scores) == 4.285 + + +def test_assess_target_forwards_parallel_workers_for_behavioral_eval(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_behavioral_parallel", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\ndescription: behavioral scoring test\n---\n") + tasks = [ + {"query": "make a skill", "should_trigger": True, "eval_mode": "behavioral"}, + ] + seen = {} + + def fake_run_behavioral_eval(*args, 
**kwargs): + seen["parallel_workers"] = kwargs["parallel_workers"] + return [{"query": "make a skill", "pass": True, "triggered": True, "new_artifacts": ["skills/x/SKILL.md"]}] + + monkeypatch.setattr(optimize_loop, "_run_behavioral_eval", fake_run_behavioral_eval) + + scores = optimize_loop.assess_target( + target, + tasks, + "improve routing precision", + parallel_eval_workers=3, + ) + + assert seen["parallel_workers"] == 3 + assert scores["tests_pass"] is True + assert scores["correctness"] == 10.0 + assert scores["task_results"][0]["query"] == "make a skill" + assert scores["task_results"][0]["should_trigger"] is True + assert optimize_loop.composite_score(scores) == 8.45 + + +def test_assess_target_scores_blind_compare_results(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_blind_compare", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\ndescription: blind compare test\n---\n") + tasks = [{"query": "help me debug this", "eval_mode": "blind_compare", "judge": "socratic_question_only"}] + + def fake_run_blind_compare_eval( + target_path, candidate_content, tasks, baseline_content=None, timeout=180, verbose=False + ): + assert baseline_content == "---\ndescription: baseline\n---\n" + return [ + { + "query": "help me debug this", + "winner": "candidate", + "candidate_score": 0.8, + "baseline_score": 0.5, + "candidate_output": "What changed recently?", + "baseline_output": "The issue is probably your env var rename.", + "passed": True, + } + ] + + monkeypatch.setattr(optimize_loop, "_run_blind_compare_eval", fake_run_blind_compare_eval) + + scores = optimize_loop.assess_target( + target, + tasks, + "improve behavioral quality", + candidate_content="---\ndescription: candidate\n---\n", + baseline_content="---\ndescription: baseline\n---\n", + ) + + assert scores["correctness"] == 8.0 + assert scores["testing"] == 8.0 + assert scores["tests_pass"] is True + 
assert scores["task_results"][0]["winner"] == "candidate" + + +def test_socratic_question_only_heuristic_penalizes_preamble(): + optimize_loop = load_module( + "agent_comparison_optimize_loop_socratic_heuristic", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + clean_score, _ = optimize_loop._score_socratic_question_only_output("What did you expect the test to do?") + preamble_score, _ = optimize_loop._score_socratic_question_only_output( + "Let me read the skill first. What did you expect the test to do?" + ) + + assert clean_score > preamble_score + + +def test_contains_fallback_contamination_detects_tool_blocked_text(): + optimize_loop = load_module( + "agent_comparison_optimize_loop_contamination", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + contaminated, reasons = optimize_loop._contains_fallback_contamination( + "The Skill tool was blocked in this session, so I'll guide you through this directly." + ) + + assert contaminated is True + assert "mentioned blocked skill tool" in reasons + assert "fell back to direct guidance" in reasons + + +def test_run_blind_compare_zeroes_untriggered_or_contaminated_runs(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_blind_compare_guardrails", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "skills" / "socratic-debugging" / "SKILL.md" + target.parent.mkdir(parents=True) + target.write_text("---\nname: socratic-debugging\ndescription: test\n---\n") + + monkeypatch.setattr(optimize_loop, "_find_project_root", lambda: tmp_path) + + @contextlib.contextmanager + def fake_worktree(_project_root, _relpath, content): + worktree = tmp_path / ("candidate" if "candidate" in content else "baseline") + worktree.mkdir(exist_ok=True) + yield worktree + + monkeypatch.setattr(optimize_loop, "_candidate_worktree", fake_worktree) + + def fake_capture(query, cwd, accepted_skill_ids, timeout=180): + if cwd.name == "baseline": + return { + 
"output": "What changed recently?", + "triggered": False, + "contaminated": False, + "contamination_reasons": [], + } + return { + "output": "The Skill tool was blocked in this session, so I'll guide you through this directly. What changed recently?", + "triggered": True, + "contaminated": True, + "contamination_reasons": ["mentioned blocked skill tool", "fell back to direct guidance"], + } + + monkeypatch.setattr(optimize_loop, "_run_query_capture_output", fake_capture) + + results = optimize_loop._run_blind_compare_eval( + target, + "---\nname: socratic-debugging\ndescription: candidate\n# candidate\n", + [{"query": "help me debug", "eval_mode": "blind_compare", "judge": "socratic_question_only"}], + baseline_content="---\nname: socratic-debugging\ndescription: baseline\n# baseline\n", + ) + + assert results[0]["baseline_score"] == 0.0 + assert results[0]["candidate_score"] == 0.0 + assert results[0]["baseline_triggered"] is False + assert results[0]["candidate_contaminated"] is True + assert results[0]["winner"] == "tie" + assert results[0]["baseline_reasons"][0] == "target skill did not trigger" + assert results[0]["candidate_reasons"][0] == "mentioned blocked skill tool" + + +def test_run_optimization_loop_forwards_parallel_eval_to_assessments(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_parallel_forwarding", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + + target = tmp_path / "SKILL.md" + target.write_text("---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n") + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps( + { + "tasks": [ + { + "name": "train-positive", + "query": "make a skill", + "should_trigger": True, + "eval_mode": "behavioral", + "split": "train", + }, + { + "name": "test-negative", + "query": "debug kubernetes", + "should_trigger": False, + "eval_mode": "behavioral", + "split": "test", + }, + ] + } + ) + ) + + calls = [] + + def 
fake_assess_target( + path, + tasks, + goal, + verbose=False, + dry_run=False, + behavioral_runs_per_task=1, + behavioral_trigger_threshold=0.5, + parallel_eval_workers=0, + candidate_content=None, + baseline_content=None, + eval_mode="auto", + ): + calls.append( + { + "path": str(path), + "task_count": len(tasks), + "parallel_eval_workers": parallel_eval_workers, + "candidate_content": candidate_content, + "baseline_content": baseline_content, + "eval_mode": eval_mode, + } + ) + return { + "parses": True, + "compiles": True, + "tests_pass": True, + "protected_intact": True, + "correctness": 10.0, + "error_handling": 8.0, + "language_idioms": 7.0, + "testing": 8.0, + "efficiency": 6.0, + "task_results": [{"name": "task", "passed": True}], + } + + monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target) + + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.0, + train_split=0.6, + model=None, + output_dir=tmp_path / "out", + report_path=tmp_path / "out" / "report.html", + verbose=False, + dry_run=True, + parallel_eval=2, + ) + + assert result["status"] in {"COMPLETE", "CONVERGED"} + assert calls + assert all(call["parallel_eval_workers"] == 2 for call in calls) + assert all(call["candidate_content"] is not None for call in calls) + assert any(call["baseline_content"] is not None for call in calls[1:]) + assert all(call["eval_mode"] == "auto" for call in calls) + + +def test_tiny_end_to_end_autoresearch_improves_real_weak_skill_copy(tmp_path, monkeypatch): + optimize_loop = load_module( + "agent_comparison_optimize_loop_e2e", + "skills/agent-comparison/scripts/optimize_loop.py", + ) + generate_variant = load_module( + "agent_comparison_generate_variant_e2e", + "skills/agent-comparison/scripts/generate_variant.py", + ) + + source_skill = REPO_ROOT / "skills" / "socratic-debugging" / "SKILL.md" + target = tmp_path / "SKILL.md" + 
target.write_text(source_skill.read_text()) + + trigger_query = "help me think through this bug step by step" + tasks_file = tmp_path / "tasks.json" + tasks_file.write_text( + json.dumps({"tasks": [{"name": "positive", "query": trigger_query, "should_trigger": True, "split": "train"}]}) + ) + + def fake_generate_variant_output( + current_content, + target_path, + goal, + last_failures, + history, + model, + dry_run, + iteration_number, + optimization_scope, + diversification_note=None, + ): + improved_description = ( + "Question-only debugging mode that guides users to find root causes through structured questions. " + f'Use when: "{trigger_query}", "rubber duck debug with me", "help me think through this bug".' + ) + return { + "variant": generate_variant.replace_description(current_content, improved_description), + "summary": "Added exact positive trigger phrase to the description.", + "reasoning": "Deterministic test variant", + "tokens_used": 0, + "deletions": [], + "deletion_justification": "", + } + + def fake_run_trigger_rate( + target_path, + description, + tasks, + candidate_content=None, + eval_mode="auto", + num_workers=5, + timeout=30, + verbose=False, + ): + passed = trigger_query in description + return { + "results": [ + { + "query": trigger_query, + "pass": passed, + "trigger_rate": 1.0 if passed else 0.0, + } + ], + "summary": { + "total": 1, + "passed": 1 if passed else 0, + "failed": 0 if passed else 1, + }, + } + + monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output) + monkeypatch.setattr(optimize_loop, "_run_trigger_rate", fake_run_trigger_rate) + + out_dir = tmp_path / "out" + result = optimize_loop.run_optimization_loop( + target_path=target, + goal="improve routing precision", + benchmark_tasks_path=tasks_file, + max_iterations=1, + min_gain=0.0, + train_split=0.6, + model=None, + output_dir=out_dir, + report_path=out_dir / "report.html", + verbose=False, + dry_run=False, + ) + + assert 
result["best_iteration"] == 1 + assert result["improvements_found"] == 1 + assert result["baseline_train_score"] == 0.06 + assert result["best_score"] == 8.45 + + results_json = json.loads((out_dir / "results.json").read_text()) + assert results_json["best_iteration"] == 1 + assert results_json["iterations"][0]["verdict"] == "ACCEPT" + + best_variant = (out_dir / "best_variant.md").read_text() + assert trigger_query in generate_variant.extract_description(best_variant) + + verdict_json = json.loads((out_dir / "001" / "verdict.json").read_text()) + assert verdict_json["verdict"] == "ACCEPT" + assert verdict_json["composite_score"] == 8.45 diff --git a/scripts/tests/test_skill_eval_claude_code.py b/scripts/tests/test_skill_eval_claude_code.py index a0c9e05c..25aa844c 100644 --- a/scripts/tests/test_skill_eval_claude_code.py +++ b/scripts/tests/test_skill_eval_claude_code.py @@ -1,7 +1,9 @@ from __future__ import annotations import json +import os import subprocess +from contextlib import contextmanager from pathlib import Path @@ -46,3 +48,435 @@ def fake_run(cmd, capture_output, text, cwd, env, timeout): transcript = json.loads((tmp_path / "improve_iter_1.json").read_text()) assert transcript["raw_result_text"] == "raw result" assert transcript["rewrite_raw_result_text"] == "raw result" + + +class _FakeUUID: + hex = "deadbeefcafebabe" + + +class _FakePopen: + def __init__(self, stdout_bytes: bytes): + read_fd, write_fd = os.pipe() + os.write(write_fd, stdout_bytes) + os.close(write_fd) + self.stdout = os.fdopen(read_fd, "rb", buffering=0) + self._returncode = None + + def poll(self): + return self._returncode + + def kill(self): + self._returncode = -9 + + def wait(self): + return self._returncode + + +def test_run_single_query_ignores_unrelated_stream_tool_use_before_matching_read(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + clean_name = "demo-skill-skill-deadbeef" + stream_lines = [ + { + "type": "stream_event", + "event": {"type": 
"content_block_start", "content_block": {"type": "tool_use", "name": "Bash"}}, + }, + { + "type": "stream_event", + "event": {"type": "content_block_start", "content_block": {"type": "tool_use", "name": "Read"}}, + }, + { + "type": "stream_event", + "event": { + "type": "content_block_delta", + "delta": { + "type": "input_json_delta", + "partial_json": f'{{"file_path":"/tmp/project/.claude/commands/{clean_name}.md"}}', + }, + }, + }, + {"type": "stream_event", "event": {"type": "content_block_stop"}}, + {"type": "result"}, + ] + payload = ("\n".join(json.dumps(line) for line in stream_lines) + "\n").encode() + + monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID()) + monkeypatch.setattr(mod.subprocess, "Popen", lambda *_args, **_kwargs: _FakePopen(payload)) + monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], [])) + + triggered = mod.run_single_query( + query="help me debug this", + skill_name="demo-skill", + skill_description="demo description", + timeout=5, + project_root=str(tmp_path), + eval_mode="registered", + ) + + assert triggered is True + + +def test_run_single_query_scans_all_assistant_tool_uses_before_returning(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + clean_name = "demo-skill-skill-deadbeef" + assistant_lines = [ + { + "type": "assistant", + "message": { + "content": [ + {"type": "tool_use", "name": "Bash", "input": {"command": "echo hi"}}, + { + "type": "tool_use", + "name": "Read", + "input": {"file_path": f"/tmp/project/.claude/commands/{clean_name}.md"}, + }, + ] + }, + }, + {"type": "result"}, + ] + payload = ("\n".join(json.dumps(line) for line in assistant_lines) + "\n").encode() + + monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID()) + monkeypatch.setattr(mod.subprocess, "Popen", lambda *_args, **_kwargs: _FakePopen(payload)) + monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], [])) + + triggered = mod.run_single_query( + 
query="help me debug this", + skill_name="demo-skill", + skill_description="demo description", + timeout=5, + project_root=str(tmp_path), + eval_mode="registered", + ) + + assert triggered is True + + +def test_run_single_query_accepts_real_skill_name_not_just_temporary_alias(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + assistant_lines = [ + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "name": "Skill", + "input": {"skill": "demo-skill"}, + } + ] + }, + }, + {"type": "result"}, + ] + payload = ("\n".join(json.dumps(line) for line in assistant_lines) + "\n").encode() + + monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID()) + monkeypatch.setattr(mod.subprocess, "Popen", lambda *_args, **_kwargs: _FakePopen(payload)) + monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], [])) + + triggered = mod.run_single_query( + query="help me debug this", + skill_name="demo-skill", + skill_description="demo description", + timeout=5, + project_root=str(tmp_path), + eval_mode="registered", + ) + + assert triggered is True + + +def test_resolve_registered_skill_relpath_accepts_repo_skill(tmp_path): + from scripts.skill_eval import run_eval as mod + + project_root = tmp_path + skill_dir = project_root / "skills" / "demo-skill" + skill_dir.mkdir(parents=True) + (skill_dir / "SKILL.md").write_text("---\nname: demo-skill\ndescription: demo\n---\n") + + relpath = mod.resolve_registered_skill_relpath(skill_dir, project_root) + + assert relpath == Path("skills/demo-skill/SKILL.md") + + +def test_replace_description_in_skill_md_rewrites_frontmatter_block_scalar(): + from scripts.skill_eval import run_eval as mod + + original = """--- +name: demo-skill +description: | + old description +version: 1.0.0 +--- + +# Skill +""" + + updated = mod.replace_description_in_skill_md(original, "new description line 1\nnew description line 2") + + assert "description: |\n new description line 1\n new 
description line 2\nversion: 1.0.0" in updated + assert "# Skill" in updated + + +def test_load_eval_set_accepts_common_wrapped_formats(tmp_path): + from scripts.skill_eval import run_eval as mod + + tasks_path = tmp_path / "tasks.json" + tasks_path.write_text(json.dumps({"tasks": [{"query": "q1", "should_trigger": True}]})) + queries_path = tmp_path / "queries.json" + queries_path.write_text(json.dumps({"queries": [{"query": "q2", "should_trigger": False}]})) + split_path = tmp_path / "split.json" + split_path.write_text( + json.dumps( + { + "train": [{"query": "q3", "should_trigger": True}], + "test": [{"query": "q4", "should_trigger": False}], + } + ) + ) + + assert mod.load_eval_set(tasks_path) == [{"query": "q1", "should_trigger": True}] + assert mod.load_eval_set(queries_path) == [{"query": "q2", "should_trigger": False}] + assert mod.load_eval_set(split_path) == [ + {"query": "q3", "should_trigger": True}, + {"query": "q4", "should_trigger": False}, + ] + + +def test_run_eval_auto_uses_registered_worktree_for_repo_skill(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + skill_dir = tmp_path / "skills" / "demo-skill" + skill_dir.mkdir(parents=True) + (skill_dir / "SKILL.md").write_text("---\nname: demo-skill\ndescription: demo\n---\n") + worktree_root = tmp_path / "worktree" + worktree_root.mkdir() + + seen = {"candidate_content": None, "submitted": []} + + @contextmanager + def fake_candidate_worktree(project_root, registered_skill_relpath, candidate_content): + seen["candidate_content"] = candidate_content + seen["registered_skill_relpath"] = registered_skill_relpath + yield worktree_root + + class _FakeFuture: + def __init__(self, value): + self._value = value + + def result(self): + return self._value + + class _FakeExecutor: + def __init__(self, max_workers): + self.max_workers = max_workers + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def submit(self, fn, *args): + 
seen["submitted"].append(args) + return _FakeFuture(True) + + monkeypatch.setattr(mod, "candidate_worktree", fake_candidate_worktree) + monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor) + monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures)) + + result = mod.run_eval( + eval_set=[{"query": "help me debug this", "should_trigger": True}], + skill_name="demo-skill", + description="demo description", + num_workers=1, + timeout=5, + project_root=tmp_path, + eval_mode="auto", + skill_path=skill_dir, + candidate_content="candidate body", + ) + + assert seen["candidate_content"] == "candidate body" + assert seen["registered_skill_relpath"] == Path("skills/demo-skill/SKILL.md") + assert seen["submitted"] + _, _, _, _, submitted_project_root, submitted_eval_mode, _ = seen["submitted"][0] + assert submitted_project_root == str(worktree_root) + assert submitted_eval_mode == "registered" + assert result["summary"]["passed"] == 1 + + +def test_run_eval_registered_mode_patches_candidate_from_description_override(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + skill_dir = tmp_path / "skills" / "demo-skill" + skill_dir.mkdir(parents=True) + original_content = """--- +name: demo-skill +description: old description +version: 1.0.0 +--- + +# Skill +""" + (skill_dir / "SKILL.md").write_text(original_content) + seen = {"candidate_content": None} + + @contextmanager + def fake_candidate_worktree(project_root, registered_skill_relpath, candidate_content): + seen["candidate_content"] = candidate_content + yield tmp_path / "worktree" + + class _FakeFuture: + def __init__(self, value): + self._value = value + + def result(self): + return self._value + + class _FakeExecutor: + def __init__(self, max_workers): + self.max_workers = max_workers + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def submit(self, fn, *args): + return _FakeFuture(True) + + monkeypatch.setattr(mod, 
"candidate_worktree", fake_candidate_worktree) + monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor) + monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures)) + + mod.run_eval( + eval_set=[{"query": "help me debug this", "should_trigger": True}], + skill_name="demo-skill", + description="new description", + num_workers=1, + timeout=5, + project_root=tmp_path, + eval_mode="registered", + skill_path=skill_dir, + candidate_content=None, + ) + + assert seen["candidate_content"] is not None + assert "description: |\n new description\nversion: 1.0.0" in seen["candidate_content"] + + +def test_run_eval_registered_mode_patches_current_working_copy_when_no_override(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + skill_dir = tmp_path / "skills" / "demo-skill" + skill_dir.mkdir(parents=True) + original_content = """--- +name: demo-skill +description: current working copy description +version: 1.0.0 +--- + +# Skill +""" + (skill_dir / "SKILL.md").write_text(original_content) + seen = {"candidate_content": None} + + @contextmanager + def fake_candidate_worktree(project_root, registered_skill_relpath, candidate_content): + seen["candidate_content"] = candidate_content + yield tmp_path / "worktree" + + class _FakeFuture: + def __init__(self, value): + self._value = value + + def result(self): + return self._value + + class _FakeExecutor: + def __init__(self, max_workers): + self.max_workers = max_workers + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def submit(self, fn, *args): + return _FakeFuture(True) + + monkeypatch.setattr(mod, "candidate_worktree", fake_candidate_worktree) + monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor) + monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures)) + + mod.run_eval( + eval_set=[{"query": "help me debug this", "should_trigger": True}], + skill_name="demo-skill", + description="current working copy 
description", + num_workers=1, + timeout=5, + project_root=tmp_path, + eval_mode="registered", + skill_path=skill_dir, + candidate_content=None, + ) + + assert seen["candidate_content"] == original_content + + +def test_run_eval_auto_falls_back_to_alias_for_non_registered_skill(monkeypatch, tmp_path): + from scripts.skill_eval import run_eval as mod + + skill_dir = tmp_path / "scratch" / "demo-skill" + skill_dir.mkdir(parents=True) + (skill_dir / "SKILL.md").write_text("---\nname: demo-skill\ndescription: demo\n---\n") + + seen_submissions = [] + + class _FakeFuture: + def __init__(self, value): + self._value = value + + def result(self): + return self._value + + class _FakeExecutor: + def __init__(self, max_workers): + self.max_workers = max_workers + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def submit(self, fn, *args): + seen_submissions.append(args) + return _FakeFuture(False) + + monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor) + monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures)) + + result = mod.run_eval( + eval_set=[{"query": "help me debug this", "should_trigger": True}], + skill_name="demo-skill", + description="demo description", + num_workers=1, + timeout=5, + project_root=tmp_path, + eval_mode="auto", + skill_path=skill_dir, + ) + + assert seen_submissions + _, _, _, _, submitted_project_root, submitted_eval_mode, _ = seen_submissions[0] + assert submitted_project_root == str(tmp_path) + assert submitted_eval_mode == "alias" + assert result["summary"]["passed"] == 0 diff --git a/skills/agent-comparison/SKILL.md b/skills/agent-comparison/SKILL.md index 21e8c150..0c83c132 100644 --- a/skills/agent-comparison/SKILL.md +++ b/skills/agent-comparison/SKILL.md @@ -308,15 +308,15 @@ The loop automatically evaluates the unmodified target against the train set bef **Step 4: Enter optimization loop** The `optimize_loop.py` script handles the full loop: -- Calls 
`generate_variant.py` to propose changes through `claude -p` +- Calls `generate_variant.py` to propose a new frontmatter `description` through `claude -p` - Evaluates each variant against train tasks - Runs either: - single-path hill climbing: `--beam-width 1 --candidates-per-parent 1` - beam search with top-K retention: keep the best `K` improving candidates each round -- Keeps variants that beat their parent by more than `--min-gain` (default 0.02) -- Reverts variants that don't improve, break hard gates, or delete sections without justification +- Accepts variants that beat their parent by more than `--min-gain` (default 0.02) +- Rejects variants that don't improve or break hard gates - Checks held-out test set every `--holdout-check-cadence` rounds for Goodhart divergence -- Stops on convergence (`--revert-streak-limit` rounds without any KEEP), Goodhart alarm, or max iterations +- Stops on convergence (`--revert-streak-limit` rounds without any ACCEPT), Goodhart alarm, or max iterations ```bash python3 skills/agent-comparison/scripts/optimize_loop.py \ @@ -340,23 +340,33 @@ Omit `--model` to use Claude Code's configured default model, or pass it explici The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and review/export controls. 
Recommended modes: -- Fast single-path optimization: `--beam-width 1 --candidates-per-parent 1` +- Short default optimization: default flags only +- Fast single-path optimization: `--beam-width 1 --candidates-per-parent 1 --max-iterations 3 --revert-streak-limit 3` - True autoresearch sweep: `--max-iterations 20 --beam-width 3 --candidates-per-parent 2 --revert-streak-limit 20` - Conservative search with strict keeps: raise `--min-gain` above `0.02` - Exploratory search that accepts small wins: use `--min-gain 0.0` +Live eval defaults are intentionally short: +- one optimization round +- one trigger-eval run per query +- one trigger-eval worker +- no holdout cadence unless explicitly requested + +For real repo skills at `skills/<name>/SKILL.md`, the live evaluator now prefers an isolated git worktree so the candidate content is scored at the real skill path. This is the default `--eval-mode auto` behavior and avoids scoring the installed skill instead of the candidate. +The registered-skill path also evaluates the current working copy, not just `HEAD`, so local uncommitted edits are measured correctly. + **Step 5: Present results in UI** Open the generated `optimization-report.html` in a browser. The report shows: -- Progress dashboard (status, baseline vs best, kept/reverted counts) +- Progress dashboard (status, baseline vs best, accepted/rejected counts) - Convergence chart (train solid line, held-out dashed line, baseline dotted) - Iteration table with verdict, composite score, delta, and change summary - Expandable inline diffs per iteration (click any row) -**Step 6: Review kept snapshots** +**Step 6: Review accepted snapshots** -Not all KEEP iterations are real improvements — some may be harness artifacts. The user reviews the kept iterations as candidate snapshots from the original target: -- Inspect each kept iteration's diff in the report +Not all ACCEPT iterations are real improvements — some may be harness artifacts. 
The user reviews the accepted iterations as candidate snapshots from the original target: +- Inspect each accepted iteration's diff in the report - Use "Preview Selected Snapshot" only as a comparison aid in the UI - Use "Export Selected" to download a review JSON describing the selected snapshot diff - In beam mode, review the retained frontier candidates first; they are the strongest candidates from the latest round @@ -365,15 +375,18 @@ Not all KEEP iterations are real improvements — some may be harness artifacts. Apply one reviewed improvement to the original target file. -- If you want the best single kept variant, use `evals/iterations/best_variant.md`. -- Beam search still writes a single `best_variant.md`: the highest-scoring kept candidate seen anywhere in the run. -- If you exported selected diffs, treat that JSON as review material only. It is not auto-applied by the current tooling, and the current workflow does not support merging multiple kept diffs into a generated patch. +- If you want the best single accepted variant, use `evals/iterations/best_variant.md`. +- Beam search still writes a single `best_variant.md`: the highest-scoring accepted candidate seen anywhere in the run. +- Choose scope deliberately: + - `description-only` for routing-trigger work + - `body-only` for behavioral work on the skill instructions themselves +- If you exported selected diffs, treat that JSON as review material only. It is not auto-applied by the current tooling, and the current workflow does not support merging multiple accepted diffs into a generated patch. ```bash -# Review the best kept variant before applying +# Review the best accepted variant before applying cat evals/iterations/best_variant.md | head -20 -# Replace the target with the best kept variant +# Replace the target with the best accepted variant cp evals/iterations/best_variant.md skills/{target}/SKILL.md ``` @@ -397,11 +410,30 @@ Compare final scores to the baseline to confirm net improvement. 
In beam mode, t python3 scripts/learning-db.py learn \ --skill agent-comparison \ "autoresearch: {target} improved {baseline}→{best} over {iterations} iterations. \ - Kept: {kept}/{total}. Stop: {reason}. Changes: {summaries}" + Accepted: {accepted}/{total}. Stop: {reason}. Changes: {summaries}" ``` **Gate**: Optimization complete. Results reviewed. Cherry-picked improvements applied and verified against full task set. Results recorded. +### Current Reality Check + +The current optimizer is in a solid state for: +- deterministic proof runs +- isolated live evaluation of existing registered skills +- short live optimization of `read-only-ops`, with the accepted description change now applied and validated against `references/read-only-ops-short-tasks.json` +- short live body evaluation of `socratic-debugging`, with `references/socratic-debugging-body-short-tasks.json` + now producing clean skill-triggered first-turn outputs instead of fallback chatter + +One live-harness caveat remains: +- temporary renamed skill copies do not yet show reliable live trigger improvements through the dynamic command alias path + +That caveat does not affect deterministic proof runs or live checks against existing registered skills, but it does mean the current system is stronger for optimizing real in-repo skills than arbitrary renamed temp clones. + +For body optimization runs, the blind evaluator now rejects responses that: +- never triggered the target skill +- mention blocked skill/tool access +- fall back into generic "I'll guide you directly" behavior + ### Optional Extensions These are off by default. 
Enable explicitly when needed: diff --git a/skills/agent-comparison/references/optimization-guide.md b/skills/agent-comparison/references/optimization-guide.md index 3aa0f6a8..7d689e2c 100644 --- a/skills/agent-comparison/references/optimization-guide.md +++ b/skills/agent-comparison/references/optimization-guide.md @@ -80,8 +80,29 @@ Explicit train/test sets: If no split markers are present, the loop performs a reproducible random split using `--train-split` and seed `42`. +`run_eval.py` now accepts the same common task-file wrappers: + +- raw list: `[{"query": "...", "should_trigger": true}]` +- task wrapper: `{"tasks": [...]}` +- query wrapper: `{"queries": [...]}` +- split wrapper: `{"train": [...], "test": [...]}` + ## Command +Short default run: + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/go-testing/SKILL.md \ + --goal "improve routing precision without losing recall" \ + --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \ + --report optimization-report.html \ + --output-dir evals/iterations \ + --verbose +``` + +Longer search: + ```bash python3 skills/agent-comparison/scripts/optimize_loop.py \ --target skills/go-testing/SKILL.md \ @@ -106,20 +127,45 @@ Useful flags: - `--dry-run`: exercise the loop mechanics without calling Claude Code - `--report`: write a live HTML report - `--output-dir`: persist iteration snapshots and `results.json` +- `--eval-mode auto|registered|alias`: choose how live trigger eval is isolated - `--beam-width`: retain the best K improving candidates per round - `--candidates-per-parent`: generate multiple sibling variants from each frontier candidate -- `--revert-streak-limit`: stop after N rounds without any KEEP candidates +- `--revert-streak-limit`: stop after N rounds without any ACCEPT candidates - `--holdout-check-cadence`: evaluate the global best on held-out tasks every N rounds +- `--parallel-eval N`: run behavioral eval tasks in parallel 
isolated worktrees + +Short defaults: + +- `--max-iterations 1` +- `--revert-streak-limit 1` +- `--holdout-check-cadence 0` +- trigger eval `--num-workers 1` +- trigger eval `--runs-per-query 1` Recommended search presets: +- Short proof run: + - default flags only - Single-path local search: - - `--beam-width 1 --candidates-per-parent 1` + - `--beam-width 1 --candidates-per-parent 1 --max-iterations 3 --revert-streak-limit 3` - Balanced beam search: - `--beam-width 3 --candidates-per-parent 2` - Aggressive exploration: - `--beam-width 5 --candidates-per-parent 3 --min-gain 0.0` +## Live Eval Isolation Modes + +`run_eval.py` now has three modes: + +- `auto`: default. If the target is a real repo skill at `skills/<name>/SKILL.md`, live eval runs in an isolated git worktree with the candidate content patched into the real path. Otherwise it falls back to alias mode. +- `registered`: force isolated worktree evaluation of a real registered skill. +- `alias`: force legacy dynamic command-file evaluation. + +For real registered skills, `auto` is the preferred mode. It prevents the evaluator +from accidentally scoring the installed skill instead of the candidate under test. +It also patches the current working-copy skill content into the isolated worktree, +so local uncommitted edits are evaluated correctly. + ## Evaluation Model The loop follows the ADR-131 structure: @@ -131,11 +177,10 @@ The loop follows the ADR-131 structure: ### Layer 1: Hard Gates -An iteration is rejected immediately if any of these fail: +An iteration is rejected immediately if any of these mechanical validity gates fail: - `parses` - `compiles` -- `tests_pass` - `protected_intact` For description optimization, `parses` and `protected_intact` are the most @@ -144,9 +189,13 @@ preserved verbatim. ### Layer 2: Composite Score -The loop converts trigger-rate evaluation results into a weighted composite -score using the built-in weights in `optimize_loop.py`. 
A candidate is kept only -if it beats its parent by more than `--min-gain`. +The loop converts evaluation results into a weighted composite score using the +built-in weights in `optimize_loop.py`. Task accuracy affects the component +dimensions (`correctness`, `error_handling`, `language_idioms`, `testing`, +`efficiency`) without zeroing the entire score. This preserves optimization +signal for incremental improvements when a task set is not yet perfect. + +A candidate is accepted only if it beats its parent by more than `--min-gain`. ### Layer 3: Held-Out Regression Check @@ -161,21 +210,26 @@ When beam search is enabled: - each frontier candidate generates `--candidates-per-parent` siblings - every sibling is scored independently -- the top `--beam-width` KEEP candidates become the next frontier +- the top `--beam-width` ACCEPT candidates become the next frontier - `best_variant.md` still tracks the single best candidate seen anywhere in the run When `--beam-width 1 --candidates-per-parent 1`, the behavior collapses back to the original single-path optimizer. -## Deletion Safety Rule +## Optimization Scopes + +The optimizer supports two mutation scopes: -Deleting sections is allowed only with explicit justification. +- `description-only`: replace only the YAML frontmatter `description` +- `body-only`: replace only the markdown body below the frontmatter -- `generate_variant.py` detects removed `##` headings -- the model must return a `deletion_justification` -- `optimize_loop.py` rejects deletions without one +`generate_variant.py` reconstructs the full file around the selected scope so +the unchanged parts stay intact. Use `description-only` for routing-trigger +work and `body-only` for behavioral work judged from the skill's actual output. -This enforces ADR-131's "no deletion without justification" rule. +For body optimization, pair `--optimization-scope body-only` with +`blind_compare` tasks so generation and evaluation are measuring the same +surface area. 
## Iteration Artifacts @@ -193,10 +247,54 @@ When `--output-dir` is set, the loop writes: When `--report` is set, it also writes a live HTML dashboard showing: -- status, baseline, best score, kept/reverted counts +- status, baseline, best score, accepted/rejected counts - convergence chart - iteration table with diffs -- review/export controls for kept snapshot diffs from the original target +- review/export controls for accepted snapshot diffs from the original target + +## Current Validation Status + +What is currently demonstrated: +- deterministic end-to-end improvement runs with readable artifacts +- isolated live optimization for existing registered skills via temporary git worktrees +- blind body-eval runs that require actual skill-trigger evidence before scoring +- score calculations and accept/reject decisions that match the weighted rubric +- short live proof on `skills/read-only-ops/SKILL.md` using + `references/read-only-ops-short-tasks.json`, improving from one failed positive + to `2/2` live passes after the accepted description update +- short live body benchmark on `skills/socratic-debugging/SKILL.md` using + `references/socratic-debugging-body-short-tasks.json`, where the current + baseline now evaluates cleanly and non-improving body variants are rejected + +What remains imperfect: +- live optimization of temporary renamed skill copies still fails to show measured improvement through the dynamic command alias path + +So the current tooling is operational for real registered skills and deterministic proof runs, but not yet fully proven for arbitrary temporary renamed clones. + +## Short Live Commands + +Routing optimization on a real registered skill: + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/read-only-ops/SKILL.md \ + --goal "Improve read-only routing precision for realistic user prompts." 
\ + --benchmark-tasks skills/agent-comparison/references/read-only-ops-short-tasks.json +``` + +Body optimization on a real registered skill: + +```bash +python3 skills/agent-comparison/scripts/optimize_loop.py \ + --target skills/socratic-debugging/SKILL.md \ + --goal "Improve the first response so it asks exactly one question, avoids direct diagnosis, avoids code examples, and does not add tool-permission preamble." \ + --benchmark-tasks skills/agent-comparison/references/socratic-debugging-body-short-tasks.json \ + --optimization-scope body-only +``` + +The blind body path now fails closed: if the intended skill does not trigger, or +the response falls back into tool-blocked/direct-guidance chatter, the run is +scored as a failure instead of being treated as a weak improvement. ## Choosing Good Eval Tasks diff --git a/skills/agent-comparison/references/read-only-ops-short-tasks.json b/skills/agent-comparison/references/read-only-ops-short-tasks.json new file mode 100644 index 00000000..dad16d94 --- /dev/null +++ b/skills/agent-comparison/references/read-only-ops-short-tasks.json @@ -0,0 +1,16 @@ +{ + "tasks": [ + { + "name": "positive-read-only-report", + "query": "inspect this repository and report back without changing anything", + "should_trigger": true, + "split": "train" + }, + { + "name": "negative-fix-tests", + "query": "fix the failing tests in this repository", + "should_trigger": false, + "split": "train" + } + ] +} diff --git a/skills/agent-comparison/references/socratic-debugging-body-short-tasks.json b/skills/agent-comparison/references/socratic-debugging-body-short-tasks.json new file mode 100644 index 00000000..457b7240 --- /dev/null +++ b/skills/agent-comparison/references/socratic-debugging-body-short-tasks.json @@ -0,0 +1,12 @@ +{ + "tasks": [ + { + "name": "socratic-first-turn", + "query": "Help me think through this bug. My Python script sometimes returns None instead of a dict when the cache is warm. 
Please do not solve it for me directly.", + "eval_mode": "blind_compare", + "judge": "heuristic_socratic_debugging", + "min_score": 0.7, + "split": "train" + } + ] +} diff --git a/skills/agent-comparison/references/socratic-debugging-trigger-tasks.json b/skills/agent-comparison/references/socratic-debugging-trigger-tasks.json new file mode 100644 index 00000000..ee4ce4b7 --- /dev/null +++ b/skills/agent-comparison/references/socratic-debugging-trigger-tasks.json @@ -0,0 +1,98 @@ +[ + { + "query": "help me think through this bug step by step", + "should_trigger": true, + "complexity": "simple", + "description": "explicit request for guided reasoning" + }, + { + "query": "walk me through debugging this", + "should_trigger": true, + "complexity": "simple", + "description": "guided debugging with user doing the work" + }, + { + "query": "I need coaching on how to debug this problem", + "should_trigger": true, + "complexity": "simple", + "description": "coaching/teaching framing" + }, + { + "query": "teach me to find the root cause myself", + "should_trigger": true, + "complexity": "simple", + "description": "explicit teach-me framing" + }, + { + "query": "guide me to the root cause with questions", + "should_trigger": true, + "complexity": "simple", + "description": "question-based guidance request" + }, + { + "query": "rubber duck debug with me", + "should_trigger": true, + "complexity": "simple", + "description": "rubber duck debugging is a known trigger" + }, + { + "query": "ask me questions to help me figure out the bug", + "should_trigger": true, + "complexity": "simple", + "description": "explicit ask-me-questions pattern" + }, + { + "query": "help me learn to find bugs myself instead of just telling me the answer", + "should_trigger": true, + "complexity": "simple", + "description": "pedagogical debugging preference" + }, + { + "query": "just fix this bug for me", + "should_trigger": false, + "complexity": "simple", + "description": "direct fix request, not 
guided learning" + }, + { + "query": "what's wrong with this code", + "should_trigger": false, + "complexity": "simple", + "description": "direct answer expected, not guided" + }, + { + "query": "debug this crash and tell me what to change", + "should_trigger": false, + "complexity": "simple", + "description": "wants answer, not coaching" + }, + { + "query": "review my code for bugs", + "should_trigger": false, + "complexity": "simple", + "description": "code review, not debugging coaching" + }, + { + "query": "run the tests and find what's failing", + "should_trigger": false, + "complexity": "simple", + "description": "automated test run, not guided debugging" + }, + { + "query": "investigate this production failure and give me a root cause analysis", + "should_trigger": false, + "complexity": "medium", + "description": "wants RCA output, not teaching" + }, + { + "query": "check for performance bugs in this service", + "should_trigger": false, + "complexity": "simple", + "description": "performance audit, not debugging coaching" + }, + { + "query": "find the security issue in this authentication code", + "should_trigger": false, + "complexity": "simple", + "description": "security review, not pedagogical debugging" + } +] diff --git a/skills/agent-comparison/scripts/generate_variant.py b/skills/agent-comparison/scripts/generate_variant.py index 31cb2446..1a35aa46 100644 --- a/skills/agent-comparison/scripts/generate_variant.py +++ b/skills/agent-comparison/scripts/generate_variant.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 -"""Generate a variant of an agent/skill file using Claude Code. +"""Generate an optimized variant of an agent/skill file using Claude Code. -Proposes modifications to improve the target file based on the optimization -goal and previous iteration failures. Preserves protected sections marked -with DO NOT OPTIMIZE markers. 
+Supports two optimization scopes: +- description-only: mutate frontmatter description only +- body-only: mutate the markdown body only Pattern: uses `claude -p` so generation runs through Claude Code directly. @@ -17,8 +17,8 @@ Output (JSON to stdout): { - "variant": "full file content...", - "summary": "Added CRITICAL warning for error wrapping", + "variant": "full file content with updated description...", + "summary": "Added concrete trigger phrases to the description", "deletion_justification": "", "reasoning": "Extended thinking content...", "tokens_used": 12345 @@ -86,6 +86,126 @@ def detect_deletions(original: str, variant: str) -> list[str]: return sorted(orig_headings - var_headings) +# --------------------------------------------------------------------------- +# Description-only optimization helpers +# --------------------------------------------------------------------------- + + +def extract_description(content: str) -> str: + """Extract frontmatter description text from a markdown file.""" + lines = content.split("\n") + if not lines or lines[0].strip() != "---": + raise ValueError("Content missing frontmatter opening delimiter") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + raise ValueError("Content missing frontmatter closing delimiter") + + fm_lines = lines[1:end_idx] + idx = 0 + while idx < len(fm_lines): + line = fm_lines[idx] + if line.startswith("description:"): + value = line[len("description:") :].strip() + if value in (">", "|", ">-", "|-"): + parts: list[str] = [] + idx += 1 + while idx < len(fm_lines) and (fm_lines[idx].startswith(" ") or fm_lines[idx].startswith("\t")): + parts.append(fm_lines[idx].strip()) + idx += 1 + return "\n".join(parts).strip() + return value.strip('"').strip("'").strip() + idx += 1 + + raise ValueError("Content missing frontmatter description") + + +def replace_description(content: str, new_description: str) -> str: + 
"""Replace the frontmatter description while preserving all other content verbatim.""" + lines = content.split("\n") + if not lines or lines[0].strip() != "---": + raise ValueError("Content missing frontmatter opening delimiter") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + raise ValueError("Content missing frontmatter closing delimiter") + + fm_lines = lines[1:end_idx] + start_idx = None + stop_idx = None + idx = 0 + while idx < len(fm_lines): + line = fm_lines[idx] + if line.startswith("description:"): + start_idx = idx + value = line[len("description:") :].strip() + stop_idx = idx + 1 + if value in (">", "|", ">-", "|-"): + stop_idx = idx + 1 + while stop_idx < len(fm_lines) and ( + fm_lines[stop_idx].startswith(" ") or fm_lines[stop_idx].startswith("\t") + ): + stop_idx += 1 + break + idx += 1 + + if start_idx is None or stop_idx is None: + raise ValueError("Content missing frontmatter description") + + normalized = new_description.strip() + replacement = ["description: |"] + if normalized: + replacement.extend(f" {line}" if line else " " for line in normalized.splitlines()) + else: + replacement.append(" ") + + new_fm_lines = fm_lines[:start_idx] + replacement + fm_lines[stop_idx:] + rebuilt_lines = ["---", *new_fm_lines, "---", *lines[end_idx + 1 :]] + return "\n".join(rebuilt_lines) + + +def extract_body(content: str) -> str: + """Extract markdown body content after frontmatter.""" + lines = content.split("\n") + if not lines or lines[0].strip() != "---": + raise ValueError("Content missing frontmatter opening delimiter") + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + raise ValueError("Content missing frontmatter closing delimiter") + return "\n".join(lines[end_idx + 1 :]) + + +def replace_body(content: str, new_body: str) -> str: + """Replace the markdown body while 
preserving frontmatter verbatim.""" + lines = content.split("\n") + if not lines or lines[0].strip() != "---": + raise ValueError("Content missing frontmatter opening delimiter") + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + if end_idx is None: + raise ValueError("Content missing frontmatter closing delimiter") + rebuilt_lines = [*lines[: end_idx + 1], *new_body.splitlines()] + rebuilt = "\n".join(rebuilt_lines) + if content.endswith("\n") and not rebuilt.endswith("\n"): + rebuilt += "\n" + return rebuilt + + # --------------------------------------------------------------------------- # Variant generation # --------------------------------------------------------------------------- @@ -150,6 +270,7 @@ def generate_variant( current_content: str, failures: list[dict], model: str | None, + optimization_scope: str = "description-only", history: list[dict] | None = None, diversification_note: str | None = None, ) -> dict: @@ -162,7 +283,20 @@ def generate_variant( if failures: failure_section = "\n\nFailed tasks from the last iteration:\n" for f in failures: - failure_section += f" - {f.get('name', 'unnamed')}: {f.get('details', 'failed')}\n" + label = f.get("query") or f.get("name", "unnamed") + should_trigger = f.get("should_trigger") + expectation = "" + if should_trigger is True: + expectation = " (expected: SHOULD trigger)" + elif should_trigger is False: + expectation = " (expected: should NOT trigger)" + detail_bits = [] + if f.get("details"): + detail_bits.append(str(f["details"])) + if "trigger_rate" in f: + detail_bits.append(f"raw_trigger_rate={f['trigger_rate']:.2f}") + details = "; ".join(detail_bits) if detail_bits else "failed" + failure_section += f" - {label}{expectation}: {details}\n" history_section = "" if history: @@ -188,7 +322,11 @@ def generate_variant( This is non-negotiable: protected sections contain safety gates that must not be removed even if removing them would improve 
test scores.""" - prompt = f"""You are optimizing an agent/skill file to improve its performance. + current_description = extract_description(current_content) + current_body = extract_body(current_content) + + if optimization_scope == "description-only": + prompt = f"""You are optimizing an agent/skill file to improve its trigger performance. Target file: {target_path} Optimization goal: {goal} @@ -197,36 +335,45 @@ def generate_variant( {current_content} +Current description: + +{current_description} + {failure_section}{history_section}{diversification_section}{protected_notice} SAFETY RULES: -1. Do NOT delete sections without replacing them with equivalent or better content. - If you remove a section heading that exists in the original, you must explain what - replaces the removed functionality. Pure deletion degrades unmeasured capabilities. +1. Optimize ONLY the YAML frontmatter `description` field. + Do not modify any other part of the file. The optimizer evaluates description-trigger + quality only, so changing routing blocks, body text, or headings is out of scope. -2. Do NOT change the tools, SDKs, or interfaces the agent uses. The variant must work - in the same environment as the original (no switching from SDK to curl, etc.). +2. Keep the description faithful to the file's actual purpose. Improve routing precision + by making the description clearer and more triggerable, not by changing the behavior + or scope of the skill. -3. Keep YAML frontmatter structure intact (name, description, routing, etc.). +3. Keep the skill name, routing, tools, instructions, and all protected sections unchanged. -4. Focus on making the agent/skill better at achieving the stated goal. Common +4. Focus on making the description better at achieving the stated goal. 
Common improvements include: - - Moving critical information to more prominent positions (CRITICAL banners) - - Adding explicit planning steps before code generation - - Improving error handling instructions with specific patterns - - Adding concrete examples for ambiguous instructions - - Restructuring for clarity when sections are dense - -Please respond with the complete modified file content inside tags, -and a brief summary of what you changed and why inside tags. - -If you removed any existing `##` section heading, include a brief justification -inside tags. If you did not remove a section, return -empty tags. - - -[complete file content here] - + - Including natural user phrasings that should trigger this skill + - Making the first sentence more concrete and specific + - Removing vague wording that overlaps with unrelated skills + - Adding concise usage examples when they help routing + +5. Treat failed eval tasks as primary routing evidence: + - If a task SHOULD have triggered but did not, strongly prefer copying the exact + user phrasing or a very close paraphrase into the description. + - If a task should NOT have triggered, add clarifying language that separates this + skill from that request without expanding scope. + - Optimize for the smallest description change that would make the failed tasks + more likely to score correctly on the next run. + +Please respond with ONLY the improved description text inside tags, +without YAML quoting or frontmatter delimiters, and a brief summary inside tags. +Do not return the full file. 
+ + +[improved description only] + [1-2 sentence description of the change] @@ -235,16 +382,72 @@ def generate_variant( [why any removed section was replaced safely, or leave blank] """ + text, raw_result_text, tokens_used = _run_claude_code(prompt, model) + + description_match = re.search(r"(.*?)", text, re.DOTALL) + if description_match: + new_payload = description_match.group(1).strip() + else: + variant_match = re.search(r"(.*?)", text, re.DOTALL) + if not variant_match: + print("Error: No or tags in response", file=sys.stderr) + sys.exit(1) + legacy_variant = variant_match.group(1).strip() + new_payload = extract_description(legacy_variant) + + variant = replace_description(current_content, new_payload) + elif optimization_scope == "body-only": + prompt = f"""You are optimizing an agent/skill file to improve its behavioral quality. - text, raw_result_text, tokens_used = _run_claude_code(prompt, model) +Target file: {target_path} +Optimization goal: {goal} - # Parse variant content - variant_match = re.search(r"(.*?)", text, re.DOTALL) - if not variant_match: - print("Error: No tags in response", file=sys.stderr) - sys.exit(1) +Current content of the file: + +{current_content} + +Current body: + +{current_body} + +{failure_section}{history_section}{diversification_section}{protected_notice} + +SAFETY RULES: +1. Optimize ONLY the markdown body after the YAML frontmatter. + Do not modify the frontmatter, skill name, description, routing, tools, or version. +2. Keep the skill faithful to its current purpose. Improve how it behaves, not what broad domain it covers. +3. Preserve headings and protected sections unless you have a clear reason to improve the body structure safely. +4. Prefer the smallest body change that addresses the failed tasks and improves behavioral quality. + +Please respond with ONLY the improved body text inside tags and a brief summary inside tags. +Do not return the full file. 
- variant = variant_match.group(1).strip() + +[improved markdown body only] + + + +[1-2 sentence description of the change] + + + +[why any removed section was replaced safely, or leave blank] +""" + text, raw_result_text, tokens_used = _run_claude_code(prompt, model) + body_match = re.search(r"(.*?)", text, re.DOTALL) + if body_match: + new_payload = body_match.group(1).strip("\n") + else: + variant_match = re.search(r"(.*?)", text, re.DOTALL) + if not variant_match: + print("Error: No or tags in response", file=sys.stderr) + sys.exit(1) + legacy_variant = variant_match.group(1).strip() + new_payload = extract_body(legacy_variant) + + variant = replace_body(current_content, new_payload) + else: + raise ValueError(f"Unsupported optimization_scope: {optimization_scope}") # Parse summary summary_match = re.search(r"(.*?)", text, re.DOTALL) @@ -253,13 +456,12 @@ def generate_variant( deletion_match = re.search(r"(.*?)", text, re.DOTALL) deletion_justification = deletion_match.group(1).strip() if deletion_match else "" - # Restore protected sections (safety net) + # Restore protected sections (safety net); should be a no-op when only the + # description changes, but keep it as belt-and-suspenders protection. variant = restore_protected(current_content, variant) - # Check for unauthorized deletions + # Description-only optimization should never delete sections. 
deletions = detect_deletions(current_content, variant) - if deletions: - print(f"Warning: Deleted sections: {deletions}", file=sys.stderr) return { "variant": variant, @@ -287,6 +489,12 @@ def main(): parser.add_argument("--history", default="[]", help="JSON list of previous iterations") parser.add_argument("--diversification-note", default=None, help="Optional search diversification hint") parser.add_argument("--model", default=None, help="Optional Claude Code model override") + parser.add_argument( + "--optimization-scope", + choices=["description-only", "body-only"], + default="description-only", + help="Which part of the file to mutate", + ) args = parser.parse_args() try: @@ -312,6 +520,7 @@ def main(): current_content=current_content, failures=failures, model=args.model, + optimization_scope=args.optimization_scope, history=history if history else None, diversification_note=args.diversification_note, ) diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py index dd17781f..f4463b1f 100644 --- a/skills/agent-comparison/scripts/optimize_loop.py +++ b/skills/agent-comparison/scripts/optimize_loop.py @@ -20,11 +20,15 @@ from __future__ import annotations import argparse +import concurrent.futures +import contextlib import glob +import hashlib import json import os import random import re +import shutil import subprocess import sys import tempfile @@ -43,7 +47,10 @@ "efficiency": 0.10, } -HARD_GATE_KEYS = ["parses", "compiles", "tests_pass", "protected_intact"] +# Hard gates should capture mechanical invalidity, not evaluation quality. +# Routing/task accuracy is already reflected in the weighted dimensions below; +# zeroing the whole composite on any failed task destroys the optimization signal. 
+HARD_GATE_KEYS = ["parses", "compiles", "protected_intact"] def passes_hard_gates(scores: dict) -> bool: @@ -162,6 +169,7 @@ def _generate_variant_output( model: str | None, dry_run: bool, iteration_number: int, + optimization_scope: str, diversification_note: str | None = None, ) -> dict: """Generate a candidate variant either synthetically or through Claude Code.""" @@ -192,6 +200,8 @@ def _generate_variant_output( json.dumps(last_failures), "--history", json.dumps(history), + "--optimization-scope", + optimization_scope, ] if diversification_note: variant_cmd.extend(["--diversification-note", diversification_note]) @@ -273,7 +283,7 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: rows = "" for it in iterations: v = it["verdict"] - vcls = {"KEEP": "keep", "REVERT": "revert", "STOP": "stop"}.get(v, "") + vcls = {"ACCEPT": "accept", "REJECT": "reject", "STOP": "stop"}.get(v, "") sc = it["score"] train_score = sc.get("train") test_score = sc.get("test") @@ -284,7 +294,7 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: dcls = "d-pos" if delta.startswith("+") and delta != "+0" else "d-neg" if delta.startswith("-") else "d-zero" summary = html_mod.escape(str(it.get("change_summary", ""))[:80]) diff_esc = html_mod.escape(str(it.get("diff", ""))) - is_keep = v == "KEEP" + is_keep = v == "ACCEPT" n = it["number"] rows += f""" @@ -310,8 +320,8 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: bt = baseline.get("train", 0.0) best = max((it["score"].get("train", bt) for it in iterations), default=bt) - kept = sum(1 for it in iterations if it["verdict"] == "KEEP") - reverted = sum(1 for it in iterations if it["verdict"] == "REVERT") + accepted = sum(1 for it in iterations if it["verdict"] == "ACCEPT") + rejected = sum(1 for it in iterations if it["verdict"] == "REJECT") cur = len(iterations) mx = data.get("max_iterations", 20) scls = "running" if status == "RUNNING" else 
"done" if status in ("CONVERGED", "COMPLETE") else "alarm" @@ -345,8 +355,8 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str: .iter-row:hover {{ background:var(--surface-2); }} .diff-row td {{ padding:0; }} .diff-block {{ background:#080b0f;padding:12px;font-family:var(--font-mono);font-size:11px;max-height:400px;overflow:auto;white-space:pre;line-height:1.5;color:var(--muted); }} -.verdict-keep {{ color:var(--green);font-weight:600; }} -.verdict-revert {{ color:var(--red);font-weight:600; }} +.verdict-accept {{ color:var(--green);font-weight:600; }} +.verdict-reject {{ color:var(--red);font-weight:600; }} .verdict-stop {{ color:var(--yellow);font-weight:600; }} .d-pos {{ color:var(--green);font-weight:600; }} .d-neg {{ color:var(--red);font-weight:600; }} @@ -367,8 +377,8 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str:
Progress{cur}/{mx}
Baseline{bt:.2f}
Best{best:.2f} ({best - bt:+.2f})
-
Kept{kept}
-
Reverted{reverted}
+
Accepted{accepted}
+
Rejected{rejected}

{score_label}

@@ -591,6 +601,10 @@ def _is_behavioral_task(task: dict) -> bool: return "query" in task and "should_trigger" in task and task.get("eval_mode") == "behavioral" +def _is_blind_compare_task(task: dict) -> bool: + return "query" in task and task.get("eval_mode") == "blind_compare" and "judge" in task + + def _validate_task_set(tasks: list[dict]) -> None: """Reject unsupported or mixed task formats early with a clear error.""" if not tasks: @@ -599,18 +613,24 @@ def _validate_task_set(tasks: list[dict]) -> None: trigger_tasks = sum(1 for task in tasks if _is_trigger_task(task)) pattern_tasks = sum(1 for task in tasks if _is_pattern_task(task)) behavioral_tasks = sum(1 for task in tasks if _is_behavioral_task(task)) + blind_compare_tasks = sum(1 for task in tasks if _is_blind_compare_task(task)) # behavioral tasks are a subset of trigger tasks (same base fields), so subtract them # to avoid double-counting when checking for pure trigger-rate sets - pure_trigger_tasks = trigger_tasks - behavioral_tasks + pure_trigger_tasks = trigger_tasks - behavioral_tasks - blind_compare_tasks - if (pure_trigger_tasks or behavioral_tasks) and pattern_tasks: + if (pure_trigger_tasks or behavioral_tasks or blind_compare_tasks) and pattern_tasks: raise ValueError( "Task file mixes trigger-rate/behavioral and pattern benchmark formats. Use one format per run." ) - if behavioral_tasks and pure_trigger_tasks: - raise ValueError("Task file mixes trigger-rate and behavioral eval modes. Use one eval_mode per run.") + if sum(1 for n in [behavioral_tasks > 0, pure_trigger_tasks > 0, blind_compare_tasks > 0] if n) > 1: + raise ValueError( + "Task file mixes trigger-rate, behavioral, and blind-compare eval modes. Use one eval_mode per run." 
+ ) + + if blind_compare_tasks == len(tasks): + return if behavioral_tasks == len(tasks): return @@ -636,7 +656,9 @@ def _run_trigger_rate( target_path: Path, description: str, tasks: list[dict], - num_workers: int = 5, + candidate_content: str | None = None, + eval_mode: str = "auto", + num_workers: int = 1, timeout: int = 30, verbose: bool = False, ) -> dict: @@ -651,39 +673,47 @@ def _run_trigger_rate( task_file = f.name json.dump(tasks, f) - with tempfile.TemporaryDirectory() as skill_dir: - skill_md = Path(skill_dir) / "SKILL.md" - skill_md.write_text(target_path.read_text()) - - project_root = Path.cwd() - for parent in [project_root, *project_root.parents]: - if (parent / ".claude").is_dir(): - project_root = parent - break - - cmd = [ - sys.executable, - "-m", - "scripts.skill_eval.run_eval", - "--eval-set", - task_file, - "--skill-path", - skill_dir, - "--description", - description, - "--num-workers", - str(num_workers), - "--timeout", - str(timeout), - "--runs-per-query", - "1", - ] - if verbose: - cmd.append("--verbose") - print(f"Running trigger assessment: {len(tasks)} queries", file=sys.stderr) + project_root = Path.cwd() + for parent in [project_root, *project_root.parents]: + if (parent / ".claude").is_dir(): + project_root = parent + break + + cmd = [ + sys.executable, + "-m", + "scripts.skill_eval.run_eval", + "--eval-set", + task_file, + "--skill-path", + str(target_path.parent), + "--description", + description, + "--eval-mode", + eval_mode, + "--num-workers", + str(num_workers), + "--timeout", + str(timeout), + "--runs-per-query", + "1", + ] + if candidate_content is not None: + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as candidate_file: + candidate_file.write(candidate_content) + candidate_file.flush() + cmd.extend(["--candidate-content-file", candidate_file.name]) + candidate_file_path = Path(candidate_file.name) + else: + candidate_file_path = None + + if verbose: + cmd.append("--verbose") + print(f"Running 
trigger assessment: {len(tasks)} queries", file=sys.stderr) - env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + try: result = subprocess.run( cmd, capture_output=True, @@ -692,74 +722,441 @@ def _run_trigger_rate( env=env, timeout=600, ) + finally: + if candidate_file_path is not None: + candidate_file_path.unlink(missing_ok=True) - if result.returncode != 0: - print(f"Trigger assessment failed (exit {result.returncode}): {result.stderr[:300]}", file=sys.stderr) - return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} + if result.returncode != 0: + print(f"Trigger assessment failed (exit {result.returncode}): {result.stderr[:300]}", file=sys.stderr) + return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} - try: - return json.loads(result.stdout) - except json.JSONDecodeError as e: - print(f"Trigger assessment returned invalid JSON: {e} — stdout: {result.stdout[:200]}", file=sys.stderr) - return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} + try: + return json.loads(result.stdout) + except json.JSONDecodeError as e: + print(f"Trigger assessment returned invalid JSON: {e} — stdout: {result.stdout[:200]}", file=sys.stderr) + return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}} finally: if task_file: Path(task_file).unlink(missing_ok=True) # --------------------------------------------------------------------------- -# Behavioral evaluator (runs claude -p and checks for artifact creation) +# Blind comparative behavioral evaluator # --------------------------------------------------------------------------- -def _run_behavioral_eval( +def _find_project_root() -> Path: + project_root = Path.cwd() + for parent in [project_root, *project_root.parents]: + if (parent / ".claude").is_dir(): + return parent + return project_root + + +def _resolve_registered_skill_relpath(target_path: Path, project_root: Path) -> 
Path: + resolved = target_path.resolve() + try: + rel = resolved.relative_to(project_root.resolve()) + except ValueError as exc: + raise ValueError("blind_compare eval requires a target under the current project root") from exc + if len(rel.parts) >= 3 and rel.parts[0] == "skills" and rel.parts[-1] == "SKILL.md": + return rel + raise ValueError("blind_compare eval currently supports real registered skills under skills/*/SKILL.md only") + + +@contextlib.contextmanager +def _candidate_worktree(project_root: Path, relpath: Path, content: str): + wt_path_str = tempfile.mkdtemp(prefix="blind-eval-wt-", dir="/tmp") + wt_path = Path(wt_path_str) + wt_path.rmdir() + try: + subprocess.run( + ["git", "worktree", "add", wt_path_str, "HEAD"], + cwd=str(project_root), + capture_output=True, + check=True, + ) + (wt_path / relpath).write_text(content) + yield wt_path + finally: + try: + subprocess.run( + ["git", "worktree", "remove", "--force", wt_path_str], + cwd=str(project_root), + capture_output=True, + ) + except Exception: + pass + shutil.rmtree(wt_path_str, ignore_errors=True) + + +def _extract_registered_skill_ids(relpath: Path, content: str) -> set[str]: + ids = {relpath.as_posix()} + if len(relpath.parts) >= 2: + ids.add(relpath.parts[1]) + match = re.search(r"^name:\s*(.+)$", content, re.MULTILINE) + if match: + ids.add(match.group(1).strip().strip("\"'")) + return {value for value in ids if value} + + +def _assistant_message_triggered_skill(message: dict, accepted_skill_ids: set[str]) -> bool: + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and any(skill_id in tool_input.get("skill", "") for skill_id in accepted_skill_ids): + return True + if tool_name == "Read" and any(skill_id in tool_input.get("file_path", "") for skill_id in accepted_skill_ids): + return True + return False + + +def 
_contains_fallback_contamination(output: str) -> tuple[bool, list[str]]: + lowered = output.lower() + reasons = [] + contamination_markers = { + "skill tool was blocked": "mentioned blocked skill tool", + "tool was blocked": "mentioned blocked tool access", + "i'll guide you through this directly": "fell back to direct guidance", + "i can still help directly": "fell back to direct guidance", + "instead of using the skill": "mentioned skill fallback mode", + "mode announcement": "included mode/meta announcement", + "tool-permission": "mentioned tool permission", + } + for marker, reason in contamination_markers.items(): + if marker in lowered: + reasons.append(reason) + return bool(reasons), reasons + + +def _run_query_capture_output(query: str, cwd: Path, accepted_skill_ids: set[str], timeout: int = 180) -> dict: + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + result = subprocess.run( + [ + "claude", + "-p", + query, + "--output-format", + "stream-json", + "--verbose", + "--include-partial-messages", + "--permission-mode", + "bypassPermissions", + ], + capture_output=True, + text=True, + cwd=str(cwd), + env=env, + timeout=timeout, + ) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or f"claude -p exited {result.returncode}") + + assistant_text: list[str] = [] + raw_result = "" + triggered = False + pending_tool_name = None + accumulated_json = "" + + for raw_line in result.stdout.splitlines(): + line = raw_line.strip() + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in {"Skill", "Read"}: + pending_tool_name = tool_name + accumulated_json = "" + else: + pending_tool_name = None + accumulated_json = "" + elif 
se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if any(skill_id in accumulated_json for skill_id in accepted_skill_ids): + triggered = True + elif se_type in {"content_block_stop", "message_stop"} and pending_tool_name: + if any(skill_id in accumulated_json for skill_id in accepted_skill_ids): + triggered = True + pending_tool_name = None + accumulated_json = "" + + if event.get("type") == "assistant": + message = event.get("message", {}) + if _assistant_message_triggered_skill(message, accepted_skill_ids): + triggered = True + for content in message.get("content", []): + if content.get("type") == "text": + assistant_text.append(content.get("text", "")) + elif event.get("type") == "result": + raw_result = event.get("result", "") + + output = "".join(assistant_text).strip() or raw_result.strip() + contaminated, contamination_reasons = _contains_fallback_contamination(output) + return { + "output": output, + "triggered": triggered, + "contaminated": contaminated, + "contamination_reasons": contamination_reasons, + } + + +def _score_socratic_question_only_output(output: str) -> tuple[float, list[str]]: + stripped = output.strip() + lowered = stripped.lower() + reasons: list[str] = [] + score = 0.0 + + question_marks = stripped.count("?") + if question_marks == 1: + score += 0.45 + reasons.append("asked exactly one question") + elif question_marks == 0: + reasons.append("asked no question") + else: + score += max(0.0, 0.20 - (question_marks - 2) * 0.10) + reasons.append(f"asked {question_marks} questions") + + if stripped.endswith("?"): + score += 0.15 + reasons.append("ended on a question") + else: + reasons.append("did not end on a question") + + starters = ("what ", "when ", "where ", "which ", "can ", "could ", "did ", "is ", "are ", "have ") + if any(lowered.startswith(starter) for starter in starters): + score += 0.15 + 
reasons.append("opened directly with a question") + else: + reasons.append("did not open directly with a question") + + first_sentence = lowered.split("?")[0] + preamble_markers = ["let me", "i'll", "i will", "we'll", "we will", "let's", "before we", "looking at"] + if any(marker in first_sentence for marker in preamble_markers): + score -= 0.30 + reasons.append("included preamble before the first question") + + direct_answer_markers = [ + "common mistake", + "classic", + "the issue is", + "the problem is", + "the bug is", + "you should", + "fix this by", + "the root cause", + "likely cause", + "think about code like", + "vs.", + "return cache.get", + "poison the cache", + ] + if any(marker in lowered for marker in direct_answer_markers): + score -= 0.35 + reasons.append("gave direct diagnosis/advice") + else: + score += 0.15 + reasons.append("avoided direct diagnosis") + + if "```" in output: + score -= 0.15 + reasons.append("included code block") + else: + score += 0.10 + reasons.append("no code block") + + if len(stripped) <= 450: + score += 0.10 + reasons.append("kept first turn concise") + else: + reasons.append("first response was long") + + return max(0.0, min(1.0, round(score, 4))), reasons + + +def _score_output_with_judge(task: dict, output: str) -> tuple[float, list[str]]: + judge = task.get("judge") + if judge in {"socratic_question_only", "heuristic_socratic_debugging"}: + return _score_socratic_question_only_output(output) + raise ValueError(f"Unsupported blind_compare judge: {judge}") + + +def _run_blind_compare_eval( target_path: Path, - description: str, + candidate_content: str, tasks: list[dict], - timeout: int = 240, + baseline_content: str | None = None, + timeout: int = 180, verbose: bool = False, ) -> list[dict]: - """Run behavioral assessment by invoking claude -p and checking artifact output. 
+ """Run blind comparative evaluation for real registered skills.""" + project_root = _find_project_root() + relpath = _resolve_registered_skill_relpath(target_path, project_root) + baseline_source = baseline_content if baseline_content is not None else candidate_content + candidate_skill_ids = _extract_registered_skill_ids(relpath, candidate_content) + baseline_skill_ids = _extract_registered_skill_ids(relpath, baseline_source) + + results: list[dict] = [] + for task in tasks: + query = task["query"] + if baseline_source == candidate_content: + with _candidate_worktree(project_root, relpath, candidate_content) as candidate_wt: + candidate_capture = _run_query_capture_output(query, candidate_wt, candidate_skill_ids, timeout=timeout) + baseline_capture = dict(candidate_capture) + else: + with _candidate_worktree(project_root, relpath, baseline_source) as baseline_wt: + baseline_capture = _run_query_capture_output(query, baseline_wt, baseline_skill_ids, timeout=timeout) + with _candidate_worktree(project_root, relpath, candidate_content) as candidate_wt: + candidate_capture = _run_query_capture_output(query, candidate_wt, candidate_skill_ids, timeout=timeout) + + baseline_output = baseline_capture["output"] + candidate_output = candidate_capture["output"] + + baseline_score, baseline_reasons = _score_output_with_judge(task, baseline_output) + candidate_score, candidate_reasons = _score_output_with_judge(task, candidate_output) + + if not baseline_capture["triggered"]: + baseline_score = 0.0 + baseline_reasons = ["target skill did not trigger", *baseline_reasons] + if baseline_capture["contaminated"]: + baseline_score = 0.0 + baseline_reasons = [*baseline_capture["contamination_reasons"], *baseline_reasons] + if not candidate_capture["triggered"]: + candidate_score = 0.0 + candidate_reasons = ["target skill did not trigger", *candidate_reasons] + if candidate_capture["contaminated"]: + candidate_score = 0.0 + candidate_reasons = 
[*candidate_capture["contamination_reasons"], *candidate_reasons] + + seed = int(hashlib.sha256(query.encode()).hexdigest()[:8], 16) + if seed % 2 == 0: + label_map = {"A": "baseline", "B": "candidate"} + else: + label_map = {"A": "candidate", "B": "baseline"} - Each task must have 'query', 'should_trigger', 'artifact_glob', and optionally - 'query_prefix' fields. Tasks are run sequentially since each claude -p invocation - is resource-intensive. + if candidate_score > baseline_score: + winner = "candidate" + elif candidate_score < baseline_score: + winner = "baseline" + else: + winner = "tie" - Returns a list of per-task result dicts with keys: - triggered, should_trigger, pass, new_artifacts - """ - project_root = Path.cwd() - for parent in [project_root, *project_root.parents]: - if (parent / ".claude").is_dir(): - project_root = parent - break + if verbose: + print( + f"[blind-compare] {query[:60]!r}: baseline={baseline_score:.2f}, candidate={candidate_score:.2f}, winner={winner}", + file=sys.stderr, + ) - env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + results.append( + { + "query": query, + "judge": task.get("judge"), + "candidate_score": candidate_score, + "baseline_score": baseline_score, + "candidate_output": candidate_output, + "baseline_output": baseline_output, + "candidate_reasons": candidate_reasons, + "baseline_reasons": baseline_reasons, + "candidate_triggered": candidate_capture["triggered"], + "baseline_triggered": baseline_capture["triggered"], + "candidate_contaminated": candidate_capture["contaminated"], + "baseline_contaminated": baseline_capture["contaminated"], + "winner": winner, + "label_map": label_map, + "passed": candidate_score >= float(task.get("min_score", 0.7)), + } + ) + return results - results = [] - for task in tasks: - query: str = task["query"] - should_trigger: bool = task["should_trigger"] - artifact_glob: str = task.get("artifact_glob", "adr/*.md") - query_prefix: str = task.get("query_prefix", "/do ") - 
full_query = f"{query_prefix}{query}" +# --------------------------------------------------------------------------- +# Behavioral evaluator (runs claude -p and checks for artifact creation) +# --------------------------------------------------------------------------- - # Snapshot existing artifacts before the run - before: set[str] = set(glob.glob(str(project_root / artifact_glob))) - triggered = False - new_artifacts: list[str] = [] +def _snapshot_extra_dirs(project_root: Path) -> set[str]: + """Snapshot files in directories that creation tasks may write to.""" + extra_globs = [ + str(project_root / "agents" / "*.md"), + str(project_root / "scripts" / "*.py"), + ] + snapshot: set[str] = set() + for g in extra_globs: + snapshot.update(glob.glob(g)) + snapshot.update(glob.glob(str(project_root / "skills" / "**" / "SKILL.md"), recursive=True)) + snapshot.update(glob.glob(str(project_root / "pipelines" / "**" / "SKILL.md"), recursive=True)) + return snapshot + + +def _run_single_behavioral_task( + task: dict, + project_root: Path, + worktree_path: Path, + env: dict[str, str], + timeout: int, + verbose: bool, + runs_per_task: int, + trigger_threshold: float, +) -> dict: + """Run a single behavioral task and return its result dict. + + Args: + task: Task dict with 'query', 'should_trigger', optional 'artifact_glob' and 'query_prefix'. + project_root: Canonical project root (used only for worktree creation context). + worktree_path: Directory in which claude -p runs and artifact globs are resolved. + For sequential execution this equals project_root; for parallel execution + this is an isolated git worktree. + env: Environment variables to pass to subprocess. + timeout: Per-run timeout in seconds for the claude -p invocation. + verbose: Print progress to stderr. + runs_per_task: Number of times to run the query; result is averaged. + trigger_threshold: Fraction of runs that must trigger to count as triggered. 
+ + Returns: + Per-task result dict with keys: query, triggered, should_trigger, pass, new_artifacts. + """ + query: str = task["query"] + should_trigger: bool = task["should_trigger"] + artifact_glob: str = task.get("artifact_glob", "adr/*.md") + query_prefix: str = task.get("query_prefix", "/do ") - if verbose: + full_query = f"{query_prefix}{query}" + + run_results: list[bool] = [] + all_new_artifacts: list[str] = [] + + for run_index in range(runs_per_task): + if verbose and runs_per_task > 1: + print(f"[behavioral] Run {run_index + 1}/{runs_per_task}: {full_query!r}", file=sys.stderr) + elif verbose: print(f"[behavioral] Running: claude -p {full_query!r}", file=sys.stderr) + # Snapshot existing artifacts before the run (primary glob + extra dirs) + before: set[str] = set(glob.glob(str(worktree_path / artifact_glob))) + before_extra: set[str] = _snapshot_extra_dirs(worktree_path) + + run_triggered = False + run_new_artifacts: list[str] = [] + try: result = subprocess.run( ["claude", "-p", full_query], capture_output=True, text=True, - cwd=str(project_root), + cwd=str(worktree_path), env=env, timeout=timeout, ) @@ -770,42 +1167,196 @@ def _run_behavioral_eval( ) # Check for new files matching the artifact glob - after: set[str] = set(glob.glob(str(project_root / artifact_glob))) - new_artifacts = sorted(after - before) - triggered = len(new_artifacts) > 0 + after: set[str] = set(glob.glob(str(worktree_path / artifact_glob))) + run_new_artifacts = sorted(after - before) + run_triggered = len(run_new_artifacts) > 0 - if verbose and new_artifacts: - print(f"[behavioral] New artifacts: {new_artifacts}", file=sys.stderr) + if verbose and run_new_artifacts: + print(f"[behavioral] New artifacts: {run_new_artifacts}", file=sys.stderr) except subprocess.TimeoutExpired: if verbose: print(f"[behavioral] Timed out after {timeout}s for query: {full_query!r}", file=sys.stderr) # Still check artifacts — the process may have written them before timing out - after_timeout: 
set[str] = set(glob.glob(str(project_root / artifact_glob))) - new_artifacts = sorted(after_timeout - before) - triggered = len(new_artifacts) > 0 - if verbose and triggered: - print(f"[behavioral] Artifacts found despite timeout: {new_artifacts}", file=sys.stderr) - - # Clean up artifacts so they don't pollute the before-snapshot of the next task - for artifact_path in new_artifacts: + after_timeout: set[str] = set(glob.glob(str(worktree_path / artifact_glob))) + run_new_artifacts = sorted(after_timeout - before) + run_triggered = len(run_new_artifacts) > 0 + if verbose and run_triggered: + print(f"[behavioral] Artifacts found despite timeout: {run_new_artifacts}", file=sys.stderr) + + # Clean up primary-glob artifacts + for artifact_path in run_new_artifacts: try: Path(artifact_path).unlink(missing_ok=True) except OSError: pass - passed = triggered == should_trigger - results.append( - { - "query": query, - "triggered": triggered, - "should_trigger": should_trigger, - "pass": passed, - "new_artifacts": new_artifacts, - } + # Clean up extra-dir artifacts (agents/, skills/, pipelines/, scripts/) + after_extra: set[str] = _snapshot_extra_dirs(worktree_path) + new_extra = sorted(after_extra - before_extra) + for path in new_extra: + try: + Path(path).unlink(missing_ok=True) + except OSError: + pass + if verbose and new_extra: + print( + f"[behavioral] Cleaned up {len(new_extra)} extra artifacts: {new_extra}", + file=sys.stderr, + ) + + run_results.append(run_triggered) + all_new_artifacts.extend(run_new_artifacts) + + # Aggregate across runs + if runs_per_task > 1: + triggered = (sum(run_results) / len(run_results)) >= trigger_threshold + else: + triggered = run_results[0] if run_results else False + + passed = triggered == should_trigger + return { + "query": query, + "triggered": triggered, + "should_trigger": should_trigger, + "pass": passed, + "new_artifacts": all_new_artifacts, + } + + +def _run_single_behavioral_task_in_worktree( + task: dict, + project_root: 
Path, + env: dict[str, str], + timeout: int, + verbose: bool, + runs_per_task: int, + trigger_threshold: float, +) -> dict: + """Create a temporary git worktree, run a behavioral task inside it, then remove it. + + Used by the parallel execution path in _run_behavioral_eval. Each thread + gets its own isolated worktree so concurrent claude -p invocations do not + share working-directory state. + + The worktree is always removed in a finally block regardless of success or failure. + """ + wt_path_str = tempfile.mkdtemp(prefix="eval-wt-", dir="/tmp") + wt_path = Path(wt_path_str) + # Remove the empty dir so git worktree add can create it + wt_path.rmdir() + try: + subprocess.run( + ["git", "worktree", "add", wt_path_str, "HEAD"], + cwd=str(project_root), + capture_output=True, + check=True, ) + return _run_single_behavioral_task( + task=task, + project_root=project_root, + worktree_path=wt_path, + env=env, + timeout=timeout, + verbose=verbose, + runs_per_task=runs_per_task, + trigger_threshold=trigger_threshold, + ) + finally: + try: + subprocess.run( + ["git", "worktree", "remove", "--force", wt_path_str], + cwd=str(project_root), + capture_output=True, + ) + except Exception: + pass + shutil.rmtree(wt_path_str, ignore_errors=True) - return results + +def _run_behavioral_eval( + target_path: Path, + description: str, + tasks: list[dict], + timeout: int = 240, + verbose: bool = False, + runs_per_task: int = 1, + trigger_threshold: float = 0.5, + parallel_workers: int = 0, +) -> list[dict]: + """Run behavioral assessment by invoking claude -p and checking artifact output. + + Each task must have 'query', 'should_trigger', 'artifact_glob', and optionally + 'query_prefix' fields. + + When parallel_workers > 1, tasks are dispatched concurrently via ThreadPoolExecutor. + Each concurrent task runs in an isolated git worktree created from HEAD so that + file-system mutations do not interfere across tasks. + + When runs_per_task > 1, each task query is run that many times. 
The final + triggered value is True iff (sum(results) / runs_per_task) >= trigger_threshold. + + Returns a list of per-task result dicts with keys: + triggered, should_trigger, pass, new_artifacts + """ + project_root = Path.cwd() + for parent in [project_root, *project_root.parents]: + if (parent / ".claude").is_dir(): + project_root = parent + break + + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + if parallel_workers > 1: + # Parallel path: each task runs in its own temporary git worktree. + results: list[dict] = [{}] * len(tasks) + with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_workers) as executor: + future_to_index = { + executor.submit( + _run_single_behavioral_task_in_worktree, + task, + project_root, + env, + timeout, + verbose, + runs_per_task, + trigger_threshold, + ): idx + for idx, task in enumerate(tasks) + } + for future in concurrent.futures.as_completed(future_to_index): + idx = future_to_index[future] + try: + results[idx] = future.result() + except Exception as exc: + task = tasks[idx] + query = task.get("query", "unknown") + print(f"[behavioral] Task {query!r} raised exception: {exc}", file=sys.stderr) + results[idx] = { + "query": query, + "triggered": False, + "should_trigger": task.get("should_trigger", False), + "pass": False, + "new_artifacts": [], + } + return results + + # Sequential path (parallel_workers <= 1): run tasks one at a time in project_root. 
+ sequential_results = [] + for task in tasks: + sequential_results.append( + _run_single_behavioral_task( + task=task, + project_root=project_root, + worktree_path=project_root, + env=env, + timeout=timeout, + verbose=verbose, + runs_per_task=runs_per_task, + trigger_threshold=trigger_threshold, + ) + ) + return sequential_results # --------------------------------------------------------------------------- @@ -819,6 +1370,12 @@ def assess_target( goal: str, verbose: bool = False, dry_run: bool = False, + behavioral_runs_per_task: int = 1, + behavioral_trigger_threshold: float = 0.5, + parallel_eval_workers: int = 0, + candidate_content: str | None = None, + baseline_content: str | None = None, + eval_mode: str = "auto", ) -> dict: """Assess a target file against tasks. @@ -828,6 +1385,9 @@ def assess_target( - Dry-run: returns synthetic scores for testing loop mechanics. - Benchmark (NYI): tasks have 'prompt' + 'name' fields. + When parallel_eval_workers > 1 and the task set is behavioral, tasks are + dispatched in parallel via ThreadPoolExecutor, each in its own git worktree. + Returns scores dict with hard gate booleans and quality dimensions. 
""" scores: dict = { @@ -843,7 +1403,7 @@ def assess_target( "task_results": [], } - content = target_path.read_text() + content = candidate_content if candidate_content is not None else target_path.read_text() valid, description = _parse_frontmatter(content) if not valid or not description: scores["parses"] = False @@ -878,10 +1438,19 @@ def assess_target( # Detect assessment mode from task format is_behavioral = all(_is_behavioral_task(task) for task in tasks) - is_trigger = not is_behavioral and all(_is_trigger_task(task) for task in tasks) + is_blind_compare = all(_is_blind_compare_task(task) for task in tasks) + is_trigger = not is_behavioral and not is_blind_compare and all(_is_trigger_task(task) for task in tasks) if is_trigger: - results = _run_trigger_rate(target_path, description, tasks, verbose=verbose) + task_expectations = {task.get("query", ""): task.get("should_trigger") for task in tasks} + results = _run_trigger_rate( + target_path, + description, + tasks, + candidate_content=content, + eval_mode=eval_mode, + verbose=verbose, + ) summary = results.get("summary", {}) total = summary.get("total", 0) passed = summary.get("passed", 0) @@ -900,6 +1469,9 @@ def assess_target( scores["task_results"].append( { "name": r.get("query", "unnamed")[:40], + "query": r.get("query", ""), + "should_trigger": r.get("should_trigger", task_expectations.get(r.get("query", ""))), + "trigger_rate": r.get("trigger_rate", 0.0), "passed": r.get("pass", False), "score": 1.0 if r.get("pass", False) else 0.0, "details": f"trigger_rate={r.get('trigger_rate', 0):.2f}", @@ -908,7 +1480,16 @@ def assess_target( return scores if is_behavioral: - behavioral_results = _run_behavioral_eval(target_path, description, tasks, verbose=verbose) + task_expectations = {task.get("query", ""): task.get("should_trigger") for task in tasks} + behavioral_results = _run_behavioral_eval( + target_path, + description, + tasks, + verbose=verbose, + runs_per_task=behavioral_runs_per_task, + 
trigger_threshold=behavioral_trigger_threshold, + parallel_workers=parallel_eval_workers, + ) total = len(behavioral_results) passed = sum(1 for r in behavioral_results if r.get("pass", False)) if total == 0: @@ -927,6 +1508,8 @@ def assess_target( scores["task_results"].append( { "name": r.get("query", "unnamed")[:40], + "query": r.get("query", ""), + "should_trigger": r.get("should_trigger", task_expectations.get(r.get("query", ""))), "passed": r.get("pass", False), "score": 1.0 if r.get("pass", False) else 0.0, "details": f"triggered={r.get('triggered')}, artifacts={artifact_summary}", @@ -934,6 +1517,53 @@ def assess_target( ) return scores + if is_blind_compare: + compare_results = _run_blind_compare_eval( + target_path, + content, + tasks, + baseline_content=baseline_content, + verbose=verbose, + ) + total = len(compare_results) + if total == 0: + return scores + + absolute_quality = sum(r.get("candidate_score", 0.0) for r in compare_results) / total + wins = sum(1 for r in compare_results if r.get("winner") == "candidate") + ties = sum(1 for r in compare_results if r.get("winner") == "tie") + comparative_quality = (wins + 0.5 * ties) / total + + scores["correctness"] = round(absolute_quality * 10, 2) + scores["error_handling"] = round(absolute_quality * 8, 2) + scores["language_idioms"] = round(absolute_quality * 7, 2) + scores["testing"] = round(comparative_quality * 8.0, 2) + scores["efficiency"] = round(min(1.0, absolute_quality + 0.1) * 6, 2) + scores["tests_pass"] = all(r.get("passed", False) for r in compare_results) + + for r in compare_results: + scores["task_results"].append( + { + "name": r.get("query", "unnamed")[:40], + "query": r.get("query", ""), + "passed": r.get("passed", False), + "score": r.get("candidate_score", 0.0), + "details": ( + f"winner={r.get('winner')}; candidate={r.get('candidate_score', 0.0):.2f}; " + f"baseline={r.get('baseline_score', 0.0):.2f}; " + f"candidate_reasons={', '.join(r.get('candidate_reasons', []))}" + ), + 
"winner": r.get("winner"), + "candidate_score": r.get("candidate_score", 0.0), + "baseline_score": r.get("baseline_score", 0.0), + "candidate_output": r.get("candidate_output", ""), + "baseline_output": r.get("baseline_output", ""), + "candidate_reasons": r.get("candidate_reasons", []), + "baseline_reasons": r.get("baseline_reasons", []), + } + ) + return scores + # Benchmark behavioral assessment — not yet implemented. # Use trigger-rate tasks ('query' + 'should_trigger') or behavioral tasks # ('query' + 'should_trigger' + 'eval_mode: behavioral') per ADR-132. @@ -972,18 +1602,23 @@ def run_optimization_loop( target_path: Path, goal: str, benchmark_tasks_path: Path, - max_iterations: int = 20, + max_iterations: int = 1, min_gain: float = 0.02, train_split: float = 0.6, - revert_streak_limit: int = 5, + revert_streak_limit: int = 1, beam_width: int = 1, candidates_per_parent: int = 1, - holdout_check_cadence: int = 5, + holdout_check_cadence: int = 0, model: str | None = None, verbose: bool = False, report_path: Path | None = None, output_dir: Path | None = None, dry_run: bool = False, + behavioral_runs_per_task: int = 1, + behavioral_trigger_threshold: float = 0.5, + parallel_eval: int = 0, + eval_mode: str = "auto", + optimization_scope: str = "description-only", ) -> dict: """Run the autoresearch optimization loop.""" if beam_width < 1: @@ -1003,28 +1638,67 @@ def run_optimization_loop( _validate_task_set(all_tasks) train_tasks, test_tasks = split_tasks(all_tasks, train_split) + # Warn and fall back to sequential when --parallel-eval is used with non-behavioral tasks. + is_all_behavioral = all(_is_behavioral_task(t) for t in all_tasks) + effective_parallel_eval = parallel_eval + if parallel_eval > 1 and not is_all_behavioral: + print( + "[parallel-eval] Warning: --parallel-eval requires eval_mode=behavioral tasks. 
" + "Falling back to sequential evaluation.", + file=sys.stderr, + ) + effective_parallel_eval = 0 + if verbose: print(f"Tasks: {len(train_tasks)} train, {len(test_tasks)} test", file=sys.stderr) + if effective_parallel_eval > 1: + print(f"Parallel behavioral eval: {effective_parallel_eval} workers", file=sys.stderr) original_content = target_path.read_text() target_valid, target_description = _parse_frontmatter(original_content) if not target_valid or not target_description: raise ValueError( "Target must have YAML frontmatter with a non-empty description. " - "optimize_loop.py currently supports frontmatter-description optimization only." + "optimize_loop.py requires valid SKILL.md-style frontmatter." ) target_label = target_path.name if verbose: print("Running baseline evaluation...", file=sys.stderr) - baseline_scores = assess_target(target_path, train_tasks, goal, verbose, dry_run) + baseline_scores = assess_target( + target_path, + train_tasks, + goal, + verbose, + dry_run, + behavioral_runs_per_task, + behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=original_content, + eval_mode=eval_mode, + ) baseline_composite = composite_score(baseline_scores) best_score = baseline_composite best_content = original_content best_iteration = 0 - baseline_holdout_scores = assess_target(target_path, test_tasks, goal, verbose, dry_run) if test_tasks else None + baseline_holdout_scores = ( + assess_target( + target_path, + test_tasks, + goal, + verbose, + dry_run, + behavioral_runs_per_task, + behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=original_content, + eval_mode=eval_mode, + ) + if test_tasks + else None + ) baseline_holdout = composite_score(baseline_holdout_scores) if baseline_holdout_scores else None if verbose: @@ -1048,6 +1722,8 @@ def run_optimization_loop( status = "RUNNING" total_tokens = 0 iteration_counter = 0 + # Maps iteration number → variant content for ACCEPT verdicts (used for best-by-test 
selection) + keep_contents: dict[int, str] = {} for round_number in range(1, max_iterations + 1): if verbose: @@ -1095,6 +1771,7 @@ def run_optimization_loop( model=model, dry_run=dry_run, iteration_number=iteration_counter, + optimization_scope=optimization_scope, diversification_note=diversification_note, ) variant_content = variant_output["variant"] @@ -1108,7 +1785,7 @@ def run_optimization_loop( print(f"Variant generation failed: {e}", file=sys.stderr) iteration_data = { "number": iteration_counter, - "verdict": "REVERT", + "verdict": "REJECT", "score": {"train": parent["score"], "test": None}, "delta": "0", "change_summary": str(e), @@ -1123,7 +1800,7 @@ def run_optimization_loop( iteration_counter, parent["content"], {}, - "REVERT", + "REJECT", "", "", str(e), @@ -1141,7 +1818,7 @@ def run_optimization_loop( print("REJECTED: Protected sections modified", file=sys.stderr) iteration_data = { "number": iteration_counter, - "verdict": "REVERT", + "verdict": "REJECT", "score": {"train": 0.0, "test": None}, "delta": "0", "change_summary": "Protected sections modified", @@ -1156,7 +1833,7 @@ def run_optimization_loop( iteration_counter, variant_content, {"protected_intact": False}, - "REVERT", + "REJECT", "Protected sections modified", diff_text, change_summary, @@ -1171,7 +1848,7 @@ def run_optimization_loop( print(f"REJECTED: Deleted sections without justification: {deletions}", file=sys.stderr) iteration_data = { "number": iteration_counter, - "verdict": "REVERT", + "verdict": "REJECT", "score": {"train": parent["score"], "test": None}, "delta": "0", "change_summary": "Deleted sections without justification", @@ -1188,7 +1865,7 @@ def run_optimization_loop( iteration_counter, variant_content, {"protected_intact": True}, - "REVERT", + "REJECT", "Deleted sections without justification", diff_text, change_summary, @@ -1199,17 +1876,22 @@ def run_optimization_loop( iteration_by_number[iteration_counter] = iteration_data continue - temp_target = ( - 
target_path.parent / f".{target_path.stem}_variant_{iteration_counter}{target_path.suffix}" + t0 = time.time() + variant_scores = assess_target( + target_path, + train_tasks, + goal, + verbose, + dry_run, + behavioral_runs_per_task, + behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=variant_content, + baseline_content=parent["content"], + eval_mode=eval_mode, ) - try: - temp_target.write_text(variant_content) - t0 = time.time() - variant_scores = assess_target(temp_target, train_tasks, goal, verbose, dry_run) - eval_elapsed = time.time() - t0 - variant_composite = composite_score(variant_scores) - finally: - temp_target.unlink(missing_ok=True) + eval_elapsed = time.time() - t0 + variant_composite = composite_score(variant_scores) gain = variant_composite - parent["score"] if verbose: @@ -1220,7 +1902,7 @@ def run_optimization_loop( file=sys.stderr, ) - verdict = "KEEP" if gain > min_gain else "REVERT" + verdict = "ACCEPT" if gain > min_gain else "REJECT" if deletions and deletion_justification: change_summary = f"{change_summary} [deletion justified]" delta_str = f"{gain:+.2f}" if gain != 0 else "0" @@ -1261,12 +1943,15 @@ def run_optimization_loop( iterations.append(iteration_data) iteration_by_number[iteration_counter] = iteration_data - if verdict == "KEEP": + if verdict == "ACCEPT": if variant_composite > best_score: best_score = variant_composite best_content = variant_content best_iteration = iteration_counter + # Track content for each ACCEPT so best-by-test can look it up later + keep_contents[iteration_counter] = variant_content + kept_nodes.append( { "content": variant_content, @@ -1298,15 +1983,22 @@ def run_optimization_loop( rounds_without_keep += 1 if test_tasks and holdout_check_cadence > 0 and round_number % holdout_check_cadence == 0: - temp_target = target_path.parent / f".{target_path.stem}_holdout_check{target_path.suffix}" - try: - temp_target.write_text(best_content) - holdout_scores = assess_target(temp_target, 
test_tasks, goal, verbose, dry_run) - holdout_composite = composite_score(holdout_scores) - if iterations: - iterations[-1]["score"]["test"] = holdout_composite - finally: - temp_target.unlink(missing_ok=True) + holdout_scores = assess_target( + target_path, + test_tasks, + goal, + verbose, + dry_run, + behavioral_runs_per_task, + behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=best_content, + baseline_content=original_content, + eval_mode=eval_mode, + ) + holdout_composite = composite_score(holdout_scores) + if iterations: + iterations[-1]["score"]["test"] = holdout_composite if holdout_diverges(best_score, holdout_composite, baseline_holdout, baseline_composite): if verbose: @@ -1319,7 +2011,7 @@ def run_optimization_loop( break if rounds_without_keep >= revert_streak_limit: - exit_reason = f"converged ({revert_streak_limit} rounds without KEEP by round {round_number})" + exit_reason = f"converged ({revert_streak_limit} rounds without ACCEPT by round {round_number})" status = "CONVERGED" break @@ -1370,6 +2062,48 @@ def run_optimization_loop( } report_path.write_text(generate_optimization_report(rd, auto_refresh=False)) + # Best-by-test selection: if test tasks exist, prefer the ACCEPT iteration with the + # highest held-out test score rather than the highest training score (anti-Goodhart). 
+ best_test_score: float | None = None + if test_tasks and keep_contents: + # Find iterations with a recorded test score (set during holdout cadence checks) + scored_keeps = [ + (it["number"], it["score"]["test"]) + for it in iterations + if it["verdict"] == "ACCEPT" and it["score"].get("test") is not None and it["number"] in keep_contents + ] + if scored_keeps: + best_test_iter, best_test_score = max(scored_keeps, key=lambda x: x[1]) + if best_test_iter != best_iteration: + if verbose: + print( + f"\nBest-by-test: switching from train-best iter {best_iteration} " + f"(train={best_score:.4f}) to test-best iter {best_test_iter} " + f"(test={best_test_score:.4f})", + file=sys.stderr, + ) + best_content = keep_contents[best_test_iter] + best_iteration = best_test_iter + else: + # No holdout-checked ACCEPT iterations — run a final test eval on best_content + if best_iteration > 0: + final_test_scores = assess_target( + target_path, + test_tasks, + goal, + verbose, + dry_run, + behavioral_runs_per_task, + behavioral_trigger_threshold, + effective_parallel_eval, + candidate_content=best_content, + baseline_content=original_content, + eval_mode=eval_mode, + ) + best_test_score = composite_score(final_test_scores) + if verbose: + print(f"Final test eval on best_content: test={best_test_score:.4f}", file=sys.stderr) + if best_iteration > 0: best_path = output_dir / "best_variant.md" best_path.write_text(best_content) @@ -1385,15 +2119,17 @@ def run_optimization_loop( "baseline_train_score": baseline_composite, "baseline_holdout_score": baseline_holdout, "best_score": best_score, + "best_test_score": best_test_score, "best_iteration": best_iteration, "iterations_run": len(iterations), "max_iterations": max_iterations, - "improvements_found": sum(1 for it in iterations if it["verdict"] == "KEEP"), + "improvements_found": sum(1 for it in iterations if it["verdict"] == "ACCEPT"), "total_tokens": total_tokens, "search_strategy": "beam" if beam_width > 1 or candidates_per_parent 
> 1 else "hill_climb", "beam_width": beam_width, "candidates_per_parent": candidates_per_parent, "holdout_check_cadence": holdout_check_cadence, + "optimization_scope": optimization_scope, "train_size": len(train_tasks), "test_size": len(test_tasks), "iterations": iterations, @@ -1415,18 +2151,18 @@ def main(): parser.add_argument( "--max-iterations", type=int, - default=20, - help="Max optimization rounds (default: 20); each round evaluates up to beam_width x candidates_per_parent candidates", + default=1, + help="Max optimization rounds (default: 1, short mode); each round evaluates up to beam_width x candidates_per_parent candidates", ) parser.add_argument("--min-gain", type=float, default=0.02, help="Min score gain to keep (default: 0.02)") parser.add_argument("--train-split", type=float, default=0.6, help="Train fraction (default: 0.6)") parser.add_argument( "--revert-streak-limit", type=int, - default=5, - help="Stop after this many rounds without any KEEP candidates (default: 5)", + default=1, + help="Stop after this many rounds without any ACCEPT candidates (default: 1, short mode)", ) - parser.add_argument("--beam-width", type=int, default=1, help="Number of kept candidates to retain per round") + parser.add_argument("--beam-width", type=int, default=1, help="Number of accepted candidates to retain per round") parser.add_argument( "--candidates-per-parent", type=int, @@ -1436,8 +2172,8 @@ def main(): parser.add_argument( "--holdout-check-cadence", type=int, - default=5, - help="Check held-out tasks every N rounds (default: 5; 0 disables)", + default=0, + help="Check held-out tasks every N rounds (default: 0, disabled in short mode)", ) parser.add_argument("--model", default=None, help="Optional Claude Code model override for variant generation") parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") @@ -1446,6 +2182,36 @@ def main(): ) parser.add_argument("--report", default=None, help="Path for live HTML report") 
parser.add_argument("--output-dir", default=None, help="Directory for iteration snapshots") + parser.add_argument( + "--behavioral-runs-per-task", + type=int, + default=1, + help="Run each behavioral task query this many times and average results (default: 1)", + ) + parser.add_argument( + "--behavioral-trigger-threshold", + type=float, + default=0.5, + help="Fraction of runs that must trigger to count as triggered (default: 0.5)", + ) + parser.add_argument( + "--parallel-eval", + type=int, + default=0, + help="Run behavioral eval tasks in parallel with isolated git worktrees (default: 0, disabled)", + ) + parser.add_argument( + "--eval-mode", + choices=["auto", "registered", "alias"], + default="auto", + help="Trigger evaluator mode (default: auto; prefers registered-skill worktree eval when possible)", + ) + parser.add_argument( + "--optimization-scope", + choices=["description-only", "body-only"], + default="description-only", + help="Which part of the file to mutate (default: description-only)", + ) args = parser.parse_args() target = Path(args.target) @@ -1475,6 +2241,11 @@ def main(): report_path=Path(args.report) if args.report else None, output_dir=Path(args.output_dir) if args.output_dir else None, dry_run=args.dry_run, + behavioral_runs_per_task=args.behavioral_runs_per_task, + behavioral_trigger_threshold=args.behavioral_trigger_threshold, + parallel_eval=args.parallel_eval, + eval_mode=args.eval_mode, + optimization_scope=args.optimization_scope, ) except ValueError as e: print(f"Error: {e}", file=sys.stderr) diff --git a/skills/do/.SKILL_variant_3.md b/skills/do/.SKILL_variant_3.md new file mode 100644 index 00000000..7daa8283 --- /dev/null +++ b/skills/do/.SKILL_variant_3.md @@ -0,0 +1,311 @@ +--- +name: do +description: | + Classify user requests and route to the correct agent + skill combination. + Use for any user request that needs delegation: code changes, debugging, + reviews, content creation, research, or multi-step workflows. 
Invoked as + the primary entry point via "/do [request]". Route all code changes to + domain agents. Route all requests beyond pure fact lookups and single + reads to agents and skills. +version: 2.0.0 +user-invocable: true +argument-hint: "" +allowed-tools: + - Read + - Bash + - Grep + - Glob + - Skill + - Task +routing: + triggers: + - "route task" + - "classify request" + category: meta-tooling +--- + +# /do - Smart Router + +/do is a **ROUTER**, not a worker. Its ONLY job is to classify requests, select the right agent + skill, and dispatch. It delegates all execution, implementation, debugging, review, and fixes to specialized agents. + +**What the main thread does:** (1) Classify, (2) Select agent+skill, (3) Dispatch via Agent tool, (4) Evaluate if more work needed, (5) Route to ANOTHER agent if yes, (6) Report results. + +**The main thread delegates to agents:** code reading (Explore agent), file edits (domain agents), test runs (agent with skill), documentation (technical-documentation-engineer), all Simple+ tasks. + +The main thread is an **orchestrator**. If you find yourself reading source code, writing code, or doing analysis — pause and route to an agent instead. + +--- + +## Instructions + +### Phase Banners (MANDATORY) + +Every phase MUST display a banner BEFORE executing: `/do > Phase N: PHASE_NAME — description...` + +After Phase 2, display the full routing decision banner (`===` block). Phase banners tell the user *where they are*; the routing banner tells them *what was decided*. Both required. + +--- + +### Phase 1: CLASSIFY + +**Goal**: Determine request complexity and whether routing is needed. + +Read and follow the repository CLAUDE.md before making any routing decision, because it contains project-specific conventions that affect agent selection and skill pairing. 
+ +| Complexity | Agent | Skill | Direct Action | +|------------|-------|-------|---------------| +| Trivial | No | No | **ONLY reading a file the user named by exact path** | +| Simple | **Yes** | Yes | Route to agent | +| Medium | **Required** | **Required** | Route to agent | +| Complex | Required (2+) | Required (2+) | Route to agent | + +**Trivial = reading a file the user named by exact path.** Everything else is Simple+ and MUST use an agent, skill, or pipeline. When uncertain, classify UP not down — because under-routing wastes implementations while over-routing only wastes tokens, and tokens are cheap but bad code is expensive. + +**Common misclassifications** (these are NOT Trivial — route them): evaluating repos/URLs, any opinion/recommendation, git operations, codebase questions (`explore-pipeline`), retro lookups (`retro` skill), comparing approaches. + +**Maximize skill/agent/pipeline usage.** If a skill or pipeline exists for the task, USE IT — even if handling directly seems faster, because skills encode domain patterns that prevent common mistakes. + +**Check for parallel patterns FIRST** because independent work items can run concurrently, saving significant time — sequential dispatch when parallel is possible wastes wall-clock time needlessly: 2+ independent failures or 3+ subtasks → `dispatching-parallel-agents`; broad research → `research-coordinator-engineer`; multi-agent coordination → `project-coordinator-engineer`; plan exists + "execute" → `subagent-driven-development`; new feature → `feature-design` (check `.feature/` directory; if present, use `feature-state.py status` for current phase). + +**Optional: Force Direct** — OFF by default. When explicitly enabled, overrides routing for trivial operations. Only applies when the user explicitly requests it. 
+ +--- + +**CRITICAL — Creation Request Detection** (MANDATORY scan BEFORE completing Phase 1): + +**Primary test**: "Would fulfilling this request produce a NEW FILE that does not currently exist in the repo?" → YES = creation request, ADR required. + +Scan the request for creation signals: + +| Signal Type | Pattern Examples | +|-------------|-----------------| +| Explicit creation verbs | "create", "scaffold", "build", "add new", "implement new" | +| Domain object targets | agent, skill, pipeline, hook, feature, plugin, workflow, voice profile | +| Implicit creation | "I need a [component]", "we need a [component]", "build me a [component]" | +| Purpose patterns | "build a [component] for X", "create a [component] that does Y" | + +**Concrete examples — ALL of these ARE creation requests:** +- `"build a pipeline for automated security"` → new pipeline files +- `"create a PostToolUse hook that detects SQL injection"` → new hook file +- `"I need an agent for Ruby on Rails development"` → new agent file +- `"scaffold a new skill for database migrations"` → new skill files +- `"add a new feature for user authentication"` → new feature files +- `"implement a new workflow for code review"` → new workflow files + +**NOT a creation request** (operating on files that already exist): +- `"debug the existing auth hook"` — fix existing file +- `"review the payment pipeline"` — read-only inspection of existing files +- `"fix the error handling in the Go agent"` — modify existing file +- `"refactor the router logic"` — transform existing file +- `"explain how the retry skill works"` — explanation only +- `"run the test suite"` — execution only +- `"audit the security hooks"` — analysis of existing files + +**When ambiguous**: ask "does the user want to CREATE something new that doesn't exist yet, OR improve/inspect something that already exists?" If new → creation. 
The purpose or topic of the new component (e.g., "for security", "for debugging") does NOT make it a non-creation request — only the presence or absence of an existing target file does. + +If ANY creation signal is found AND complexity is Simple+: +1. Set an internal flag: `is_creation = true` +2. **Phase 4 Step 0 is MANDATORY** — write ADR before dispatching any agent + +This early detection exists because Phase 4 Step 0 is the most frequently skipped step in /do. Moving detection to Phase 1 ensures the creation protocol fires before routing decisions consume attention. The Gate below enforces acknowledgment before Phase 2. + +**Gate**: Complexity classified. If a creation signal was detected, output `[CREATION REQUEST DETECTED]` before displaying the routing banner. Display routing banner (ALL classifications). If not Trivial, proceed to Phase 2. If Trivial, handle directly after showing banner. + + + +--- + +### Phase 2: ROUTE + +**Goal**: Select the correct agent + skill combination from the INDEX files and routing tables. + +**Step 1: Check force-route triggers** + +Force-route triggers are in `skills/INDEX.json` (field: `force_route: true`). If a force-route trigger matches the request, invoke that skill BEFORE any other action, because force-routes encode critical domain patterns that prevent common mistakes — skipping them causes the exact class of bugs they were designed to prevent. + +Check triggers literally against the request text. If triggers match, force-route applies — no exceptions, no judgment calls about whether "it applies here." + +Trigger phrases must contain only user-language keywords, never sibling skill names, because the router matches triggers against request text and a sibling skill name would cause false matches. Each trigger phrase must map to exactly one skill — duplicates across skills make deterministic routing impossible. + +**Critical**: "push", "commit", "create PR", "merge" are NOT trivial git commands. 
They MUST route through skills that run quality gates, because running raw `git push`, `git commit`, `gh pr create`, or `gh pr merge` directly bypasses lint checks, test runs, review loops, CI verification, and repo classification. + +**Step 2: Select agent + skill** + +Read the routing tables in `references/routing-tables.md` and the INDEX files (`agents/INDEX.json`, `skills/INDEX.json`, `pipelines/INDEX.json`) to identify candidates by trigger-overlap. Select the best match; use LLM judgment to tiebreak when multiple candidates fit equally well. + +Route to the simplest agent+skill that satisfies the request, because over-engineering the routing itself (stacking unnecessary skills) creates more overhead than it prevents. + +When `[cross-repo]` output is present, route to `.claude/agents/` local agents because they contain project-specific knowledge that generic agents lack. + +Route all code modifications to domain agents, because domain agents carry language-specific expertise, testing methodology, and quality gates that the router lacks. + +**Step 3: Apply skill override** (task verb overrides default skill) + +When the request verb implies a specific methodology, override the agent's default skill. Common overrides: "review" → systematic-code-review, "debug" → systematic-debugging, "refactor" → systematic-refactoring, "TDD" → test-driven-development. Full override table in `references/routing-tables.md`. + +**Step 4: Display routing decision** (MANDATORY — do this NOW, before anything else) + +This banner MUST be the FIRST visible output for EVERY /do invocation. Display BEFORE creating plans, BEFORE invoking agents, BEFORE any work begins. No exceptions. + +``` +=================================================================== + ROUTING: [brief summary] +=================================================================== + Selected: + -> Agent: [name] - [why] + -> Skill: [name] - [why] + -> Pipeline: PHASE1 → PHASE2 → ... 
(if pipeline; phases from pipelines/INDEX.json) + -> Anti-Rationalization: [auto-injected for code/security/testing] + Invoking... +=================================================================== +``` + +For Trivial: show `Classification: Trivial - [reason]` and `Handling directly (no agent/skill needed)`. + +**Optional: Dry Run Mode** — OFF by default. When enabled, show the routing decision without executing. + +**Optional: Verbose Routing** — OFF by default. When enabled, explain why each alternative was rejected. + +**Step 5: Record routing decision** (Simple+ only — skip Trivial): + +```bash +python3 ~/.claude/scripts/learning-db.py record \ + routing "{selected_agent}:{selected_skill}" \ + "request: {first_200_chars} | complexity: {complexity} | force_used: {0|1} | llm_override: {0|1} | enhancements: {comma_separated_list}" \ + --category routing-decision \ + --tags "{applicable_flags}" +``` + +Tags: `force-route`, `llm-override`, `auto-pipeline` (as applicable). This call is advisory — if it fails, continue. + +**Gate**: Agent and skill selected. Banner displayed. Routing decision recorded. Proceed to Phase 3. + +--- + +### Phase 3: ENHANCE + +**Goal**: Stack additional skills based on signals in the request. + +Auto-inject retro knowledge from `learning.db` for any substantive work (benchmark: +5.3 avg, 67% win rate), because historical patterns prevent repeat mistakes. Relevance-gated by FTS5 keyword matching — only inject when keywords overlap. 
+ +| Signal in Request | Enhancement to Add | +|-------------------|-------------------| +| Any substantive work (code, design, plan) | **Auto-inject retro knowledge** (via `retro-knowledge-injector` hook) | +| "comprehensive" / "thorough" / "full" | Add parallel reviewers (security + business + quality) | +| "with tests" / "production ready" | Append test-driven-development + verification-before-completion | +| "research needed" / "investigate first" | Prepend research-coordinator-engineer | +| Multiple independent problems (2+) | Use dispatching-parallel-agents | +| "review" with 5+ files | Use parallel-code-review (3 reviewers) | +| Complex implementation | Offer subagent-driven-development | + +Before stacking any enhancement, check the target skill's `pairs_with` field in `skills/INDEX.json`, because some skills have built-in verification gates that make stacking redundant or harmful. Specifically: empty `pairs_with: []` means no stacking allowed. Skills with built-in verification gates handle their own verification. The `fast` skill handles its own testing — stack only compatible enhancements. + +**Auto-inject anti-rationalization** for these task types, because these categories are where shortcut rationalization causes the most damage: + +| Task Type | Patterns Injected | +|-----------|-------------------| +| Code modification | anti-rationalization-core, verification-checklist | +| Code review | anti-rationalization-core, anti-rationalization-review | +| Security work | anti-rationalization-core, anti-rationalization-security | +| Testing | anti-rationalization-core, anti-rationalization-testing | +| Debugging | anti-rationalization-core, verification-checklist | +| External content evaluation | **untrusted-content-handling** | + +For explicit maximum rigor, use `/with-anti-rationalization [task]`. + +**Gate**: Enhancements applied. Proceed to Phase 4. + +--- + +### Phase 4: EXECUTE + +**Goal**: Invoke the selected agent + skill and deliver results. 
+ +**Step 0: Execute Creation Protocol** (for creation requests ONLY) + +If the request contains "create", "new", "scaffold", "build pipeline/agent/skill/hook" AND complexity is Simple+, automatically sequence: (1) Write ADR at `adr/{kebab-case-name}.md`, (2) Register via `adr-query.py register`, (3) Proceed to plan creation. The `adr-context-injector` and `adr-enforcement` hooks handle cross-agent ADR compliance automatically. This protocol fires automatically because creation requests at Simple+ complexity need architectural grounding before implementation begins. + +**Step 1: Create plan** (for Simple+ complexity) + +Create `task_plan.md` before execution, because executing without a plan produces wrong results faster — not correct results sooner. The `auto-plan-detector.py` hook auto-injects the planning context. Skip only for Trivial tasks. + +**Step 2: Invoke agent with skill** + +Dispatch the agent. MCP tool discovery is the agent's responsibility — each agent's markdown declares which MCP tools it needs. Do not inject MCP instructions from /do. + +Route to agents that create feature branches for all commits, because main branch commits affect everyone and bypassing branch protection causes cascading problems. + +When dispatching agents for file modifications, explicitly include "commit your changes on the branch" in the agent prompt, because otherwise the agent completes file edits but changes sit unstaged — the orchestrator assumes committed work and moves on, and changes are lost. + +When dispatching agents with `isolation: "worktree"`, inject the `worktree-agent` skill rules into the agent prompt. The skill at `skills/worktree-agent/SKILL.md` contains mandatory rules that prevent worktree isolation failures (leaked changes, branch confusion, auto-plan hook interference). At minimum include: "Verify your CWD contains .claude/worktrees/. Create feature branch before edits. Skip task_plan.md creation (handled by orchestrator). Stage specific files only."
+ +For repos without organization-gated workflows, run up to 3 iterations of `/pr-review` → fix before creating a PR, because post-merge fixes cost 2 PRs instead of 1. For repos under protected organizations (via `scripts/classify-repo.py`), require user confirmation before EACH git action — confirm before executing or merging, because organization-gated repos have compliance requirements that require explicit approval. + +**Step 3: Handle multi-part requests** + +Detect: "first...then", "and also", numbered lists, semicolons. Sequential dependencies execute in order. Independent items launch multiple Task tools in single message. Max parallelism: 10 agents. + +**Step 4: Auto-Pipeline Fallback** (when no agent/skill matches AND complexity >= Simple) + +Always invoke `auto-pipeline` for unmatched requests, because a missing agent match is a routing gap to report — routing overhead is always less than unreviewed code changes. If no pipeline matches either, fall back to closest agent + verification-before-completion. + +When uncertain which route: **ROUTE ANYWAY.** Add verification-before-completion as safety net. Routing overhead is always less than the cost of unreviewed code changes. + +**Gate**: Agent invoked, results delivered. Proceed to Phase 5. + +--- + +### Phase 5: LEARN + +**Goal**: Ensure session insights are captured to `learning.db`. + +**Routing outcome recording** (Simple+ tasks, observable facts only — no self-grading): +```bash +python3 ~/.claude/scripts/learning-db.py record \ + routing "{selected_agent}:{selected_skill}" \ + "{existing_value} | tool_errors: {0|1} | user_rerouted: {0|1}" \ + --category routing-decision +``` + +Record only observable facts (tool_errors, user_rerouted) — routing outcome quality is measured by user reroutes, not self-assessment. + +**Auto-capture** (hooks, zero LLM cost): `error-learner.py` (PostToolUse), `review-capture.py` (PostToolUse), `session-learning-recorder.py` (Stop). 
+ +**Skill-scoped recording** (preferred — one-liner): +```bash +python3 ~/.claude/scripts/learning-db.py learn --skill go-testing "insight about testing" +python3 ~/.claude/scripts/learning-db.py learn --agent golang-general-engineer "insight about agent" +python3 ~/.claude/scripts/learning-db.py learn "general insight without scope" +``` + +**Immediate graduation for review findings** (MANDATORY): When a review finds an issue and it gets fixed in the same PR: (1) Record scoped to responsible agent/skill, (2) Boost to 1.0, (3) Embed into agent anti-patterns, (4) Graduate, (5) Stage changes in same PR. One cycle — no waiting for "multiple observations." + +**Gate**: After Simple+ tasks, record at least one learning via `learn`. Review findings get immediate graduation. + +--- + +## Error Handling + +### Error: "No Agent Matches Request" +Cause: Request domain not covered by any agent +Solution: Check INDEX files and `references/routing-tables.md` for near-matches. Route to closest agent with verification-before-completion. Report the gap. + +### Error: "Force-Route Conflict" +Cause: Multiple force-route triggers match the same request +Solution: Apply most specific force-route first. Stack secondary routes as enhancements if compatible. + +### Error: "Plan Required But Not Created" +Cause: Simple+ task attempted without task_plan.md +Solution: Stop execution. Create `task_plan.md`. Resume routing after plan is in place. 
+ +--- + +## References + +### Reference Files +- `${CLAUDE_SKILL_DIR}/references/routing-tables.md`: Complete category-specific skill routing +- `agents/INDEX.json`: Agent triggers and metadata +- `skills/INDEX.json`: Skill triggers, force-route flags, pairs_with +- `pipelines/INDEX.json`: Pipeline phases, triggers, composition chains + + \ No newline at end of file diff --git a/skills/read-only-ops/SKILL.md b/skills/read-only-ops/SKILL.md index 70375644..115f4b5e 100644 --- a/skills/read-only-ops/SKILL.md +++ b/skills/read-only-ops/SKILL.md @@ -1,10 +1,12 @@ --- name: read-only-ops description: | - Read-only exploration, status checks, and reporting without modifications. - Use when user asks to check status, find files, search code, show state, - or explicitly requests read-only investigation. Route to other skills when user wants - changes, fixes, refactoring, or any write operation. + Read-only exploration, inspection, and reporting without modifications. + Use when the user wants to inspect, investigate, audit, survey, or analyze code/files/state + without making changes. Common triggers: "inspect this", "report back without changing anything", + "show me", "look at", "tell me about", "find files", "check status", "list all", "how many", + "where is", or "what is the current state of". Route away when the user wants fixes, + refactors, writing, or any write operation. version: 2.0.0 user-invocable: false allowed-tools: