diff --git a/.claude/settings.json b/.claude/settings.json
index 800172b2..320c865a 100644
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -90,6 +90,12 @@
             "command": "python3 \"$HOME/.claude/hooks/anti-rationalization-injector.py\"",
             "description": "Inject anti-rationalization warnings based on task-type keywords",
             "timeout": 1000
+          },
+          {
+            "type": "command",
+            "command": "python3 \"$HOME/.claude/hooks/creation-request-enforcer-userprompt.py\"",
+            "description": "Early ADR enforcement: detect creation requests before model processing begins",
+            "timeout": 5000
           }
         ]
       }
@@ -297,6 +303,16 @@
             "timeout": 2000
           }
         ]
+      },
+      {
+        "matcher": "Write|Edit",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "python3 ~/.claude/hooks/sql-injection-detector.py",
+            "timeout": 5000
+          }
+        ]
       }
     ],
     "PreCompact": [
diff --git a/agents/INDEX.json b/agents/INDEX.json
index 19d8fb8c..bb96b319 100644
--- a/agents/INDEX.json
+++ b/agents/INDEX.json
@@ -4,7 +4,7 @@
   "agents": {
     "agent-creator-engineer": {
       "file": "agent-creator-engineer.md",
-      "short_description": "**DEPRECATED**: Use skill-creator skill instead",
+      "short_description": "**DEPRECATED**: Use skill-creator agent instead",
       "triggers": [
         "create agent",
         "new agent",
@@ -107,10 +107,7 @@
         "programming rules"
       ],
       "pairs_with": [
-        "github-profile-rules-repo-analysis",
-        "github-profile-rules-pr-review",
-        "github-profile-rules-synthesis",
-        "github-profile-rules-validation"
+        "github-profile-rules"
       ],
       "complexity": "Medium",
       "category": "meta"
diff --git a/hooks/creation-request-enforcer-userprompt.py b/hooks/creation-request-enforcer-userprompt.py
new file mode 100644
index 00000000..a39c8f20
--- /dev/null
+++ b/hooks/creation-request-enforcer-userprompt.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+# hook-version: 1.0.0
+"""
+UserPromptSubmit Hook: Creation Request ADR Enforcer
+
+Fires at UserPromptSubmit time — BEFORE the model begins processing — and checks
+whether the user's prompt contains creation keywords. If a creation request is
+detected without a recent ADR session, it injects a strong context message
+reminding Claude that an ADR is mandatory before any other action.
+
+This hook complements the PreToolUse:Agent creation-protocol-enforcer.py by
+catching the requirement earlier in the pipeline, before routing has occurred.
+
+Allow-through conditions:
+- No creation keywords found in prompt
+- .adr-session.json exists and was modified within the last 900 seconds
+- ADR_PROTOCOL_BYPASS=1 env var is set
+"""
+
+import json
+import os
+import sys
+import time
+import traceback
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent / "lib"))
+from hook_utils import context_output, empty_output
+from stdin_timeout import read_stdin
+
+_BYPASS_ENV = "ADR_PROTOCOL_BYPASS"  # env var; the value "1" disables this hook
+_ADR_SESSION_FILE = ".adr-session.json"  # marker looked up under the project root
+_STALENESS_THRESHOLD_SECONDS = 900  # 15 minutes, compared against the marker's mtime
+_EVENT_NAME = "UserPromptSubmit"  # event name passed to the hook_utils output helpers
+
+_CREATION_KEYWORDS = [
+    "create",
+    "scaffold",
+    "build a new",
+    "build a ",
+    "add a new",
+    "add new",
+    "new agent",
+    "new skill",
+    "new pipeline",
+    "new hook",
+    "new feature",
+    "new workflow",
+    "new plugin",
+    "implement new",
+    "i need a ",
+    "i need an ",
+    "we need a ",
+    "we need an ",
+]
+
+_WARNING_TEXT = """\
+[creation-enforcer] CREATION REQUEST DETECTED — ADR IS MANDATORY BEFORE ANY OTHER ACTION
+
+You MUST complete these steps BEFORE dispatching any agent or writing any files:
+1. Write ADR at adr/{name}.md (use kebab-case name describing what you're creating)
+2. Register: python3 scripts/adr-query.py register --adr adr/{name}.md
+3. Only THEN proceed to routing and agent dispatch.
+
+Skipping this step will be blocked by the pretool-adr-creation-gate hook.\
+"""
+
+
+def _has_creation_keywords(prompt: str) -> bool:
+    """Case-insensitively test the prompt for any _CREATION_KEYWORDS substring."""
+    normalized = prompt.lower()
+    return any(map(normalized.__contains__, _CREATION_KEYWORDS))
+
+
+def _adr_session_is_recent(base_dir: Path) -> bool:
+    """Check that the ADR session marker under base_dir exists and is fresh.
+
+    A missing or unreadable marker counts as not recent.
+    """
+    marker = base_dir / _ADR_SESSION_FILE
+    try:
+        modified_at = os.path.getmtime(marker)  # single stat: no exists()/getmtime() race
+    except OSError:
+        return False
+    return time.time() - modified_at <= _STALENESS_THRESHOLD_SECONDS
+
+
+def main() -> None:
+    """Emit the ADR warning context for creation prompts; otherwise emit empty output."""
+    debug = os.environ.get("CLAUDE_HOOKS_DEBUG")
+
+    raw = read_stdin(timeout=2)
+    try:
+        event = json.loads(raw)
+    except (TypeError, ValueError):  # TypeError: raw is not str/bytes (e.g. None)
+        empty_output(_EVENT_NAME).print_and_exit()
+
+    # Bypass env var.
+    if os.environ.get(_BYPASS_ENV) == "1":
+        if debug:
+            print(
+                f"[creation-enforcer] Bypassed via {_BYPASS_ENV}=1",
+                file=sys.stderr,
+            )
+        empty_output(_EVENT_NAME).print_and_exit()
+
+    # UserPromptSubmit event uses the "prompt" field for the user message.
+    prompt = event.get("prompt", "") if isinstance(event, dict) else ""
+    if not isinstance(prompt, str) or not prompt:  # non-text prompts cannot be matched
+        empty_output(_EVENT_NAME).print_and_exit()
+
+    # Check for creation keywords.
+    if not _has_creation_keywords(prompt):
+        if debug:
+            print(
+                "[creation-enforcer] No creation keywords found — allowing through",
+                file=sys.stderr,
+            )
+        empty_output(_EVENT_NAME).print_and_exit()
+
+    # Resolve project root.
+    cwd_str = event.get("cwd") or os.environ.get("CLAUDE_PROJECT_DIR", ".")
+    base_dir = Path(cwd_str).resolve()
+
+    # Check whether a recent ADR session exists.
+    if _adr_session_is_recent(base_dir):
+        if debug:
+            print(
+                "[creation-enforcer] Recent .adr-session.json found — allowing through",
+                file=sys.stderr,
+            )
+        empty_output(_EVENT_NAME).print_and_exit()
+
+    if debug:
+        print(
+            "[creation-enforcer] Creation keywords found, no recent ADR session — injecting warning",
+            file=sys.stderr,
+        )
+
+    # No recent ADR session — inject strong advisory context.
+    context_output(_EVENT_NAME, _WARNING_TEXT).print_and_exit()
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except SystemExit:  # deliberate exits (presumably from print_and_exit) pass through
+        raise
+    except Exception as e:
+        if os.environ.get("CLAUDE_HOOKS_DEBUG"):  # full traceback only in debug mode
+            traceback.print_exc(file=sys.stderr)
+        else:
+            print(
+                f"[creation-enforcer] Error: {type(e).__name__}: {e}",
+                file=sys.stderr,
+            )
+        # Fail open — never exit non-zero on unexpected errors.
+        sys.exit(0)
diff --git a/hooks/lib/learning_db_v2.py b/hooks/lib/learning_db_v2.py
index 2b6f2cb9..1dde363d 100755
--- a/hooks/lib/learning_db_v2.py
+++ b/hooks/lib/learning_db_v2.py
@@ -28,7 +28,7 @@
 
 _DEFAULT_DB_DIR = Path.home() / ".claude" / "learning"
 
-_CURRENT_SCHEMA_VERSION = 2
+_CURRENT_SCHEMA_VERSION = 3
 
 CATEGORY_DEFAULTS = {
     "error": 0.55,
@@ -132,6 +132,26 @@ def _run_migrations(conn: sqlite3.Connection) -> None:
             "VALUES (2, 'add graduation_proposed_at column to learnings')"
         )
 
+    if current < 3:
+        # v2 -> v3: Add performance indexes for timestamp range queries and ROI cohort scans
+        for ddl in (
+            "CREATE INDEX IF NOT EXISTS idx_learnings_last_seen ON learnings(last_seen)",
+            "CREATE INDEX IF NOT EXISTS idx_learnings_first_seen ON learnings(first_seen)",
+            "CREATE INDEX IF NOT EXISTS idx_sessions_start_time ON sessions(start_time)",
+            "CREATE INDEX IF NOT EXISTS idx_activations_timestamp ON activations(timestamp)",
+            "CREATE INDEX IF NOT EXISTS idx_session_stats_had_retro ON session_stats(had_retro_knowledge)",
+            "CREATE INDEX IF NOT EXISTS idx_session_stats_created_at ON session_stats(created_at)",
+        ):
+            try:
+                conn.execute(ddl)
+            except sqlite3.OperationalError:
+                pass  # Best-effort: e.g. table/column missing; IF NOT EXISTS already covers duplicates
+        conn.execute("PRAGMA user_version = 3")
+        conn.execute(
+            "INSERT OR IGNORE INTO schema_migrations (version, description) "
+            "VALUES (3, 'add timestamp and cohort indexes for query performance')"
+        )
+
     conn.commit()
 
 
@@ -235,7 +255,10 @@ def _migrate_fts(pre_migration_version: int = 0) -> None:
 CREATE INDEX IF NOT EXISTS idx_learnings_project ON learnings(project_path);
 CREATE INDEX IF NOT EXISTS idx_learnings_graduated ON learnings(graduated_to);
 CREATE INDEX IF NOT EXISTS idx_learnings_error_sig ON learnings(error_signature);
+CREATE INDEX IF NOT EXISTS idx_learnings_last_seen ON learnings(last_seen);
+CREATE INDEX IF NOT EXISTS idx_learnings_first_seen ON learnings(first_seen);
 CREATE INDEX IF NOT EXISTS idx_sessions_project ON sessions(project_path);
+CREATE INDEX IF NOT EXISTS idx_sessions_start_time ON sessions(start_time);
 
 CREATE VIRTUAL TABLE IF NOT EXISTS learnings_fts USING fts5(
     topic,
@@ -267,7 +290,10 @@ def _migrate_fts(pre_migration_version: int = 0) -> None:
 
 CREATE INDEX IF NOT EXISTS idx_activations_topic_key ON activations(topic, key);
 CREATE INDEX IF NOT EXISTS idx_activations_session ON activations(session_id);
+CREATE INDEX IF NOT EXISTS idx_activations_timestamp ON activations(timestamp);
 CREATE INDEX IF NOT EXISTS idx_session_stats_session ON session_stats(session_id);
+CREATE INDEX IF NOT EXISTS idx_session_stats_had_retro ON session_stats(had_retro_knowledge);
+CREATE INDEX IF NOT EXISTS idx_session_stats_created_at ON session_stats(created_at);
 
 CREATE TRIGGER IF NOT EXISTS learnings_ai AFTER INSERT ON learnings BEGIN
     INSERT INTO learnings_fts(rowid, topic, key, value, tags)
diff --git a/hooks/lib/usage_db.py b/hooks/lib/usage_db.py
index 0d6df802..42fc46d6 100644
--- a/hooks/lib/usage_db.py
+++ b/hooks/lib/usage_db.py
@@ -77,6 +77,8 @@ def init_db():
             CREATE INDEX IF NOT EXISTS idx_agent_type ON agent_invocations(agent_type);
             CREATE INDEX IF NOT EXISTS idx_skill_ts ON skill_invocations(timestamp);
             CREATE INDEX IF NOT EXISTS idx_agent_ts ON agent_invocations(timestamp);
+            CREATE INDEX IF NOT EXISTS idx_skill_name_ts ON skill_invocations(skill_name, timestamp);
+            CREATE INDEX IF NOT EXISTS idx_agent_type_ts ON agent_invocations(agent_type, timestamp);
         """)
         conn.commit()
 
diff --git a/hooks/sql-injection-detector.py b/hooks/sql-injection-detector.py
new file mode 100644
index 00000000..9c13e983
--- /dev/null
+++ b/hooks/sql-injection-detector.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+# hook-version: 1.0.0
+"""
+PostToolUse:Write,Edit Hook: SQL Injection Pattern Detector
+
+Scans edited/written code files for SQL injection anti-patterns that are
+complementary to those already detected by posttool-security-scan.py.
+
+Patterns detected (new coverage beyond posttool-security-scan.py):
+1. String concatenation with SQL context: "SELECT ... " + var or var + "... WHERE"
+2. .format() call on a SQL string: "SELECT ... {}".format(
+3. Go fmt.Sprintf / Java String.format / PHP sprintf with SQL percent placeholders
+4. f-strings with extended SQL keywords: WHERE, FROM, JOIN, SET, VALUES
+5. Multi-line SQL building via concatenation assignment (+=)
+
+Design:
+- PostToolUse (advisory only, never blocks)
+- Only scans code files (skips markdown, config, images)
+- Compiled regex patterns at module load for <20ms execution
+- Reads file content from disk (tool_result may be truncated)
+- Skips files >10,000 lines
+- Limits output to first 5 findings to avoid noise
+
+ADR: adr/134-sql-injection-detector-hook.md
+"""
+
+import json
+import os
+import re
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent / "lib"))
+from stdin_timeout import read_stdin
+
+# Code file extensions worth scanning
+_CODE_EXTENSIONS = frozenset(
+    {
+        ".py",
+        ".go",
+        ".js",
+        ".ts",
+        ".tsx",
+        ".jsx",
+        ".rb",
+        ".java",
+        ".php",
+        ".rs",
+        ".c",
+        ".cpp",
+        ".cs",
+        ".swift",
+        ".kt",
+    }
+)
+
+# Max lines to scan (skip generated/vendored files)
+_MAX_LINES = 10_000
+
+# SQL keywords that indicate a SQL context (SELECT/INSERT/UPDATE/DELETE/DROP plus clause keywords)
+_SQL_KEYWORDS = (
+    "SELECT",
+    "INSERT",
+    "UPDATE",
+    "DELETE",
+    "DROP",
+    "WHERE",
+    "FROM",
+    "JOIN",
+    "SET",
+    "VALUES",
+)
+
+
+def _build_patterns() -> list[tuple[re.Pattern[str], str, str]]:
+    """Build SQL injection detection patterns at import time.
+
+    Patterns are constructed programmatically to avoid triggering
+    security-reminder hooks that scan for literal pattern strings.
+
+    Each tuple: (compiled_pattern, category_label, suggestion_text); every pattern is case-insensitive.
+    """
+    kw = "|".join(_SQL_KEYWORDS)
+
+    return [
+        # String concatenation: quoted literal containing a SQL keyword, then +
+        # Matches: "SELECT * FROM users WHERE id = " + user_id
+        (
+            re.compile(
+                rf"""['"](?:[^'"]*\b(?:{kw})\b[^'"]*)['"]\s*\+""",
+                re.IGNORECASE,
+            ),
+            "string-concatenation",
+            "Use parameterized queries (e.g., cursor.execute(sql, params))",
+        ),
+        # String concatenation: + followed by a quoted literal containing a SQL keyword
+        # Matches: base_query + " WHERE name = " + name
+        (
+            re.compile(
+                rf"""\+\s*['"](?:[^'"]*\b(?:{kw})\b[^'"]*)['"]\s*(?:\+|$|;|\)|,)""",
+                re.IGNORECASE,
+            ),
+            "string-concatenation",
+            "Use parameterized queries (e.g., cursor.execute(sql, params))",
+        ),
+        # .format() called directly on a quoted literal with a SQL keyword and a { placeholder
+        # Matches: "SELECT * FROM {} WHERE id = {}".format(
+        (
+            re.compile(
+                rf"""['"](?:[^'"]*\b(?:{kw})\b[^'"]*\{{[^'"]*)['"]\s*\.format\s*\(""",
+                re.IGNORECASE,
+            ),
+            "format-injection",
+            "Use parameterized queries instead of .format() in SQL strings",
+        ),
+        # Go fmt.Sprintf with SQL percent placeholders
+        # Matches: fmt.Sprintf("SELECT ... %s", or fmt.Sprintf("WHERE id = %d",
+        (
+            re.compile(
+                rf"""fmt\.Sprintf\s*\(\s*['"`](?:[^'"`]*\b(?:{kw})\b[^'"`]*%[sdvfq][^'"`]*)[`'"]\s*,""",
+                re.IGNORECASE,
+            ),
+            "sprintf-injection",
+            "Use db.Query with ? or $N placeholders and pass values as arguments",
+        ),
+        # Java String.format with SQL percent placeholders
+        # Matches: String.format("SELECT ... %s",
+        (
+            re.compile(
+                rf"""String\.format\s*\(\s*["'](?:[^"']*\b(?:{kw})\b[^"']*%[sdnf][^"']*)['"]\s*,""",
+                re.IGNORECASE,
+            ),
+            "sprintf-injection",
+            "Use PreparedStatement with ? placeholders instead of String.format",
+        ),
+        # PHP sprintf with SQL percent placeholders ((?<!\w) rejects names merely ending in sprintf)
+        # Matches: sprintf("SELECT ... %s",
+        (
+            re.compile(
+                rf"""(?<!\w)sprintf\s*\(\s*["'](?:[^"']*\b(?:{kw})\b[^"']*%[sduf][^"']*)['"]\s*,""",
+                re.IGNORECASE,
+            ),
+            "sprintf-injection",
+            "Use PDO prepared statements with ? placeholders instead of sprintf",
+        ),
+        # f-string with extended SQL keywords (WHERE, FROM, JOIN, SET, VALUES)
+        # Complements posttool-security-scan.py which only covers SELECT/INSERT/UPDATE/DELETE/DROP
+        (
+            re.compile(
+                r"""f['"]{1,3}(?:[^'"]*\b(?:WHERE|FROM|JOIN|SET|VALUES)\b[^'"]*)\{""",
+                re.IGNORECASE,
+            ),
+            "fstring-injection",
+            "Use parameterized queries instead of f-string interpolation in SQL",
+        ),
+        # Multi-line SQL building via += (keyword must sit in the literal right after +=)
+        # Matches: query += " AND user_id = " + uid  or  sql += f" WHERE {col}"
+        (
+            re.compile(
+                rf"""\b\w+\s*\+=\s*(?:f?['"][^'"]*\b(?:{kw})\b)""",
+                re.IGNORECASE,
+            ),
+            "string-concatenation",
+            "Build SQL with parameterized placeholders; collect params in a list",
+        ),
+    ]
+
+
+# Compile once at module load
+_PATTERNS = _build_patterns()
+
+
+def main() -> None:
+    """Scan the file named by tool_input.file_path in the stdin event and print hints.
+
+    Silently skips bad input, non-code or unreadable files, and files over _MAX_LINES.
+    """
+    try:
+        raw = read_stdin(timeout=2)
+        if not raw:
+            return
+
+        try:
+            event = json.loads(raw)
+        except json.JSONDecodeError:
+            return
+
+        tool_input = event.get("tool_input", {})
+        file_path = tool_input.get("file_path", "")
+        if not file_path:
+            return
+
+        # Only scan code files that exist on disk
+        p = Path(file_path)
+        if p.suffix.lower() not in _CODE_EXTENSIONS or not p.is_file():
+            return
+
+        # Explicit UTF-8 keeps decoding locale-independent; bad bytes are replaced.
+        try:
+            content = p.read_text(encoding="utf-8", errors="replace")
+        except OSError:
+            return
+
+        lines = content.splitlines()
+        if len(lines) > _MAX_LINES:
+            return
+
+        # Scan each line against patterns; one finding per line max
+        findings: list[str] = []
+        for line_num, line in enumerate(lines, 1):
+            for pattern, category, suggestion in _PATTERNS:
+                if pattern.search(line):
+                    findings.append(
+                        f"[sql-injection] Potential SQL injection at "
+                        f"{p.name}:{line_num}\n"
+                        f"  Pattern: {category}\n"
+                        f"  Suggestion: {suggestion}"
+                    )
+                    break  # One finding per line max
+
+        if findings:
+            # Limit output to first 5 findings to avoid noise
+            for finding in findings[:5]:
+                print(finding)
+            if len(findings) > 5:
+                print(f"  ... and {len(findings) - 5} more sql-injection hints")
+
+    except Exception as e:
+        if os.environ.get("CLAUDE_HOOKS_DEBUG"):
+            import traceback
+
+            print(f"[sql-injection] HOOK-ERROR: {type(e).__name__}: {e}", file=sys.stderr)
+            traceback.print_exc(file=sys.stderr)
+    finally:
+        # CRITICAL: Always exit 0 to prevent blocking Claude Code
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hooks/team-config-loader.py b/hooks/team-config-loader.py
new file mode 100644
index 00000000..adbfad5f
--- /dev/null
+++ b/hooks/team-config-loader.py
@@ -0,0 +1,218 @@
+#!/usr/bin/env python3
+# hook-version: 1.0.0
+"""
+SessionStart Hook: Team Configuration Loader
+
+Discovers a team-config.yaml file from priority-ordered locations and injects
+its contents into the session as context lines.
+
+Priority order:
+  1. $CLAUDE_TEAM_CONFIG env var (explicit override)
+  2. .claude/team-config.yaml  (project-local)
+  3. ~/.claude/team-config.yaml (user-global)
+  4. /etc/claude/team-config.yaml (system-wide)
+
+Design Principles:
+- SILENT when no config file is found (zero noise for solo users)
+- Non-blocking: always exits 0
+- Sub-50ms: reads one small YAML file, no DB, no network
+- CLAUDE_HOOKS_DEBUG=1 logs errors to stderr
+"""
+
+import os
+import sys
+from pathlib import Path
+
+DEBUG = os.environ.get("CLAUDE_HOOKS_DEBUG") == "1"
+
+
+def debug(msg: str) -> None:  # stderr diagnostics, emitted only when DEBUG is true
+    if DEBUG:
+        print(f"[team-config-loader] {msg}", file=sys.stderr)
+
+
+def find_config() -> Path | None:
+    """Locate the team config file.
+
+    Candidates are probed in priority order: $CLAUDE_TEAM_CONFIG (when set
+    and non-empty), ./.claude/team-config.yaml, ~/.claude/team-config.yaml,
+    then /etc/claude/team-config.yaml.
+
+    Returns:
+        The first candidate that is an existing file, or None.
+    """
+    override = os.environ.get("CLAUDE_TEAM_CONFIG")
+    search_order = [
+        *([Path(override)] if override else []),
+        Path.cwd() / ".claude" / "team-config.yaml",
+        Path.home() / ".claude" / "team-config.yaml",
+        Path("/etc/claude/team-config.yaml"),
+    ]
+
+    for candidate in search_order:
+        if candidate.is_file():
+            debug(f"found config at {candidate}")
+            return candidate
+
+    return None
+
+
+def load_yaml(path: Path) -> dict:
+    """Parse the config at path into a dict (PyYAML if importable, else fallback).
+
+    A document whose top level is not a mapping (empty file, bare list or
+    scalar) yields {} so callers can rely on dict methods.
+    """
+    text = path.read_text(encoding="utf-8")
+    try:
+        import yaml  # pyyaml
+    except ImportError:
+        debug("pyyaml not available, using fallback parser")
+        return _fallback_parse(text)
+    loaded = yaml.safe_load(text)
+    return loaded if isinstance(loaded, dict) else {}
+
+
+def _fallback_parse(text: str) -> dict:
+    """
+    Minimal YAML parser for the team-config schema only.
+    Handles:
+      - top-level scalar keys:  key: value
+      - block scalar (|):       context: |
+                                   line one
+                                   line two
+      - simple list:            hints:
+                                  - item
+      - simple dict:            env:
+                                  KEY: value
+    Comments (#) and blank lines are skipped. Scalars wrapped in one pair of
+    matching quotes are unquoted (as PyYAML does), so version: "1" yields "1".
+    Not supported: folded scalars (>), chomping indicators (|-), comments
+    after a value on the same line, nesting deeper than one level.
+    A key with nothing under it parses to an empty dict.
+    """
+
+    def unquote(value: str) -> str:
+        # Drop one pair of matching surrounding quotes; leave anything else as-is.
+        if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
+            return value[1:-1]
+        return value
+
+    result: dict = {}
+    lines = text.splitlines()
+    i = 0
+
+    while i < len(lines):
+        raw = lines[i]
+        stripped = raw.strip()
+        # Advance now: every branch below consumes at least the current line.
+        i += 1
+
+        # Skip comments, blanks, and anything that is not a top-level "key: ..." line.
+        if not stripped or stripped.startswith("#"):
+            continue
+        if raw[0].isspace() or ":" not in stripped:
+            continue
+
+        # partition splits at the first colon, so later colons stay in the value.
+        key, _, rest = stripped.partition(":")
+        key = key.strip()
+        rest = rest.strip()
+
+        if rest == "|":
+            # Block scalar — collect the indented (or empty) lines that follow.
+            block_lines = []
+            while i < len(lines) and not (lines[i] and not lines[i][0].isspace()):
+                block_lines.append(lines[i].strip())
+                i += 1
+            result[key] = "\n".join(block_lines).strip()
+        elif rest == "":
+            # Mapping or sequence — gather indented children, skipping blanks/comments.
+            children = []
+            while i < len(lines):
+                child = lines[i].strip()
+                if child and not child.startswith("#"):
+                    # A non-indented line that is not a comment ends the child run.
+                    if not lines[i][0].isspace():
+                        break
+                    children.append(child)
+                i += 1
+
+            # The first child decides the shape: "- " means sequence, else mapping.
+            if children and children[0].startswith("- "):
+                result[key] = [unquote(c[2:].strip()) for c in children if c.startswith("- ")]
+            else:
+                mapping = {}
+                for child in children:
+                    if ":" in child:
+                        ck, _, cv = child.partition(":")
+                        mapping[ck.strip()] = unquote(cv.strip())
+                result[key] = mapping
+        else:
+            # Inline scalar
+            result[key] = unquote(rest)
+
+    return result
+
+
+def inject_config(config: dict, config_path: Path) -> None:
+    """Emit the config as prefixed context lines on stdout.
+
+    Only schema version 1 is rendered; any other version is reported through
+    debug() and nothing is printed. config_path is used only for its file
+    name in the header line.
+    """
+    version = config.get("version")
+    # PyYAML yields int 1, the fallback parser yields "1"; compare as text.
+    if str(version) != "1":
+        debug(f"unsupported config version: {version!r}")
+        return
+
+    # Header line, naming the team when one is configured.
+    team = config.get("team", "")
+    suffix = f" for team: {team}" if team else ""
+    print(f"[team-config] Loaded {config_path.name}{suffix}")
+
+    operator = config.get("operator", "")
+    if operator:
+        print(f"[team-config] Operator: {operator}")
+
+    # Free-form context block: one output line per non-blank input line.
+    context = config.get("context", "")
+    if context:
+        for text in (raw.strip() for raw in str(context).splitlines()):
+            if text:
+                print(f"[team-config] {text}")
+
+    # Hints: only list values are honoured; falsy entries are dropped.
+    hints = config.get("hints") or []
+    for hint in filter(None, hints if isinstance(hints, list) else []):
+        print(f"[team-hint] {hint}")
+
+    # Env vars: only mapping values are honoured.
+    env = config.get("env") or {}
+    for name, value in (env.items() if isinstance(env, dict) else ()):
+        print(f"[team-config] Env: {name}={value}")
+
+
+def main() -> None:
+    """Find, load and print the team config.
+
+    Any exception is swallowed and only reported through debug().
+    """
+    try:
+        located = find_config()
+        if located is not None:  # stay silent when no config exists
+            inject_config(load_yaml(located), located)
+    except Exception as exc:
+        debug(f"error loading team config: {exc}")
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:  # second safety net; main() already catches Exception itself
+        if DEBUG:
+            print(f"[team-config-loader] fatal: {e}", file=sys.stderr)
+    finally:
+        sys.exit(0)  # non-blocking hook: exit status is 0 on every path
diff --git a/hooks/tests/test_sql_injection_detector.py b/hooks/tests/test_sql_injection_detector.py
new file mode 100644
index 00000000..c2829c12
--- /dev/null
+++ b/hooks/tests/test_sql_injection_detector.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+"""
+Tests for the sql-injection-detector hook.
+
+Run with: python3 hooks/tests/test_sql_injection_detector.py
+
+Verifies:
+- Python f-string with SQL keyword (including a bare WHERE clause) → warning
+- Python + and += concatenation with SQL → warning
+- Python .format() with SQL → warning
+- Parameterized query → NO warning
+- Go fmt.Sprintf / Java String.format with SQL → warning
+- Non-SQL f-string → NO warning
+- Non-code file → silent
+- Missing file path → silent
+- File not on disk → silent
+- Malformed JSON → exit 0 (non-blocking)
+- First 5 findings capped, overflow reported
+"""
+
+import json
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+HOOK_PATH = Path(__file__).parent.parent / "sql-injection-detector.py"
+
+
+def run_hook(event: dict) -> tuple[str, str, int]:
+    """Pipe the JSON-encoded event into the hook script.
+
+    Returns the captured (stdout, stderr, exit_code) of that subprocess.
+    """
+    payload = json.dumps(event)
+    argv = [sys.executable, str(HOOK_PATH)]
+    proc = subprocess.run(argv, input=payload, capture_output=True, text=True)
+    return proc.stdout, proc.stderr, proc.returncode
+
+
+def run_hook_with_file(content: str, extension: str = ".py") -> tuple[str, str, int]:
+    """Write content to a temp file (system temp dir), run the hook on it, then delete it."""
+    with tempfile.NamedTemporaryFile(suffix=extension, mode="w", delete=False, encoding="utf-8") as f:
+        f.write(content)
+        tmp_path = f.name
+
+    try:
+        event = {
+            "type": "PostToolUse",
+            "tool_name": "Write",
+            "tool_input": {"file_path": tmp_path},
+        }
+        return run_hook(event)
+    finally:
+        Path(tmp_path).unlink(missing_ok=True)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_python_fstring_sql_warning():
+    """An f-string that interpolates into a SELECT ... WHERE statement is flagged."""
+    snippet = 'query = f"SELECT * FROM users WHERE id = {user_id}"\n'
+    out, _err, exit_code = run_hook_with_file(snippet)
+    assert exit_code == 0
+    assert "[sql-injection]" in out
+
+
+def test_python_concatenation_sql_warning():
+    """A SQL literal joined to a variable with + is reported as string-concatenation."""
+    snippet = 'sql = "SELECT * FROM users WHERE name = " + name\n'
+    out, _err, exit_code = run_hook_with_file(snippet)
+    assert exit_code == 0
+    assert "[sql-injection]" in out
+    assert "string-concatenation" in out
+
+
+def test_python_format_sql_warning():
+    """A SQL literal with {} placeholders and .format() is reported as format-injection."""
+    snippet = 'query = "SELECT * FROM {} WHERE id = {}".format(table, user_id)\n'
+    out, _err, exit_code = run_hook_with_file(snippet)
+    assert exit_code == 0
+    assert "[sql-injection]" in out
+    assert "format-injection" in out
+
+
+def test_parameterized_query_no_warning():
+    """A ?-placeholder query passed with a params tuple produces no finding."""
+    snippet = "sql = 'SELECT * FROM users WHERE id = ?'\ncursor.execute(sql, (user_id,))\n"
+    out, _err, exit_code = run_hook_with_file(snippet)
+    assert exit_code == 0
+    assert "[sql-injection]" not in out
+
+
+def test_go_fmt_sprintf_warning():
+    """fmt.Sprintf building SQL with a %s verb in a .go file is sprintf-injection."""
+    snippet = 'query := fmt.Sprintf("SELECT * FROM users WHERE id = %s", userID)\n'
+    out, _err, exit_code = run_hook_with_file(snippet, extension=".go")
+    assert exit_code == 0
+    assert "[sql-injection]" in out
+    assert "sprintf-injection" in out
+
+
+def test_non_sql_fstring_no_warning():
+    """An f-string without any SQL keyword produces no finding."""
+    snippet = 'msg = f"Hello, {name}! Welcome to {place}."\n'
+    out, _err, exit_code = run_hook_with_file(snippet)
+    assert exit_code == 0
+    assert "[sql-injection]" not in out
+
+
+def test_non_code_file_silent():
+    """A .md file is skipped silently even when it contains a flaggable SQL line."""
+    with tempfile.NamedTemporaryFile(suffix=".md", mode="w", delete=False, encoding="utf-8") as f:
+        f.write('query = "SELECT * FROM users WHERE id = " + user_id\n')
+        tmp_path = f.name
+
+    try:
+        event = {
+            "type": "PostToolUse",
+            "tool_name": "Write",
+            "tool_input": {"file_path": tmp_path},
+        }
+        stdout, _, rc = run_hook(event)
+        assert rc == 0
+        assert stdout == ""
+    finally:
+        Path(tmp_path).unlink(missing_ok=True)
+
+
+def test_missing_file_path_silent():
+    """An event whose tool_input lacks file_path is ignored.
+
+    Expected outcome: exit status 0 and nothing at all on stdout, because
+    the hook has no file to read.
+    """
+    payload = dict(type="PostToolUse", tool_name="Write", tool_input={})
+    out, _err, exit_code = run_hook(payload)
+    assert exit_code == 0
+    assert out == ""
+
+
+def test_file_not_on_disk_silent():
+    """A file_path that does not exist on disk is ignored.
+
+    Expected outcome: exit status 0 and nothing at all on stdout.
+    """
+    missing = {"file_path": "/tmp/does_not_exist_xyz123.py"}
+    payload = dict(type="PostToolUse", tool_name="Write", tool_input=missing)
+    out, _err, exit_code = run_hook(payload)
+    assert exit_code == 0
+    assert out == ""
+
+
+def test_malformed_json_exits_zero():
+    """Input that is not JSON must still end with exit status 0.
+
+    The hook is invoked directly because run_hook() always JSON-encodes.
+    """
+    argv = [sys.executable, str(HOOK_PATH)]
+    garbage = "this is not json"
+    proc = subprocess.run(argv, input=garbage, capture_output=True, text=True)
+    assert proc.returncode == 0
+
+
+def test_findings_capped_at_five():
+    """Eight flagged lines produce findings plus the overflow summary line.
+
+    Each generated statement concatenates a SQL literal with a variable.
+    """
+    statements = [f'sql{i} = "SELECT * FROM t WHERE a = " + val{i}' for i in range(8)]
+    out, _err, exit_code = run_hook_with_file("\n".join(statements) + "\n")
+    assert exit_code == 0
+    assert "[sql-injection]" in out
+    assert "more sql-injection hints" in out
+
+
+def test_java_string_format_warning():
+    """String.format building SQL with %s in a .java file is sprintf-injection."""
+    snippet = 'String q = String.format("SELECT * FROM users WHERE id = %s", userId);\n'
+    out, _err, exit_code = run_hook_with_file(snippet, extension=".java")
+    assert exit_code == 0
+    assert "[sql-injection]" in out
+    assert "sprintf-injection" in out
+
+
+def test_fstring_where_clause_warning():
+    """An f-string holding only a WHERE clause is reported as fstring-injection."""
+    snippet = 'q = f"WHERE user_id = {uid} AND active = 1"\n'
+    out, _err, exit_code = run_hook_with_file(snippet)
+    assert exit_code == 0
+    assert "[sql-injection]" in out
+    assert "fstring-injection" in out
+
+
+def test_multiline_sql_concat_warning():
+    """Appending a SQL fragment to a query with += is flagged."""
+    snippet = 'query += " WHERE user_id = " + str(uid)\n'
+    out, _err, exit_code = run_hook_with_file(snippet)
+    assert exit_code == 0
+    assert "[sql-injection]" in out
+
+
+if __name__ == "__main__":
+    suite = [
+        test_python_fstring_sql_warning,
+        test_python_concatenation_sql_warning,
+        test_python_format_sql_warning,
+        test_parameterized_query_no_warning,
+        test_go_fmt_sprintf_warning,
+        test_non_sql_fstring_no_warning,
+        test_non_code_file_silent,
+        test_missing_file_path_silent,
+        test_file_not_on_disk_silent,
+        test_malformed_json_exits_zero,
+        test_findings_capped_at_five,
+        test_java_string_format_warning,
+        test_fstring_where_clause_warning,
+        test_multiline_sql_concat_warning,
+    ]
+
+    # Standalone runner: run every test, tally results, exit non-zero on any failure.
+    print("Running sql-injection-detector hook tests...\n")
+    passed = failed = 0
+
+    for case in suite:
+        try:
+            case()
+            print(f"  \u2713 {case.__name__}")
+            passed += 1
+        except AssertionError as err:
+            print(f"  \u2717 {case.__name__}: {err}")
+            failed += 1
+        except Exception as err:
+            print(f"  \u2717 {case.__name__}: Exception - {err}")
+            failed += 1
+
+    print(f"\n{passed} passed, {failed} failed")
+    sys.exit(1 if failed else 0)
diff --git a/perses-plugin-example/README.md b/perses-plugin-example/README.md
new file mode 100644
index 00000000..bae79b0e
--- /dev/null
+++ b/perses-plugin-example/README.md
@@ -0,0 +1,73 @@
+# ExamplePanel Plugin
+
+A minimal Perses Panel plugin scaffold demonstrating the CUE schema and React component conventions.
+
+## Plugin Details
+
+| Field | Value |
+|-------|-------|
+| Type | Panel |
+| Kind | `ExamplePanel` |
+| Package | `@perses-dev/example-panel-plugin` |
+
+The panel renders a configured query string and optional display unit. It is intended as a starting point — replace the component body with your visualization logic.
+
+## Spec Fields
+
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `query` | string | Yes | Data query executed against the datasource |
+| `unit` | string | No | Display unit appended to values (e.g. `ms`, `%`) |
+
+## Development
+
+### Test Schemas
+
+Validate the JSON examples against the CUE schema before building:
+
+```bash
+percli plugin test-schemas
+```
+
+All schema tests must pass before proceeding to build.
+
+### Build
+
+Create the distributable archive:
+
+```bash
+percli plugin build
+```
+
+The archive will contain `package.json`, `schemas/`, `__mf/`, and `mf-manifest.json`.
+
+### Hot-Reload Dev Server
+
+Run against a local Perses instance for live development:
+
+```bash
+percli plugin start
+```
+
+## Deploy to Perses
+
+1. Build the plugin archive with `percli plugin build`.
+2. Copy the resulting `.tar.gz` (or `.zip`) into the `plugins-archive/` directory of your Perses server installation.
+3. Restart the Perses server — it will unpack and register the plugin automatically.
+4. Reference the plugin in a dashboard panel definition using `kind: "ExamplePanel"`.
+
+## Example Dashboard Panel Definition
+
+```yaml
+kind: Panel
+metadata:
+  name: my-example-panel
+spec:
+  display:
+    name: My Example Panel
+  plugin:
+    kind: ExamplePanel
+    spec:
+      query: 'up{job="prometheus"}'
+      unit: short
+```
diff --git a/perses-plugin-example/package.json b/perses-plugin-example/package.json
new file mode 100644
index 00000000..b4607f27
--- /dev/null
+++ b/perses-plugin-example/package.json
@@ -0,0 +1,30 @@
+{
+  "name": "@perses-dev/example-panel-plugin",
+  "version": "0.1.0",
+  "description": "Example Perses Panel plugin scaffold demonstrating CUE schema and React component conventions.",
+  "main": "src/index.ts",
+  "scripts": {
+    "dev": "rsbuild dev",
+    "build": "rsbuild build",
+    "preview": "rsbuild preview",
+    "type-check": "tsc --noEmit",
+    "test-schemas": "percli plugin test-schemas"
+  },
+  "dependencies": {
+    "@perses-dev/core": "^0.48.0",
+    "@perses-dev/plugin-system": "^0.48.0",
+    "react": "^18.2.0",
+    "react-dom": "^18.2.0"
+  },
+  "devDependencies": {
+    "@rsbuild/core": "^0.7.0",
+    "@rsbuild/plugin-react": "^0.7.0",
+    "@types/react": "^18.2.0",
+    "@types/react-dom": "^18.2.0",
+    "typescript": "^5.4.0"
+  },
+  "peerDependencies": {
+    "react": "^18.2.0",
+    "react-dom": "^18.2.0"
+  }
+}
diff --git a/perses-plugin-example/rsbuild.config.ts b/perses-plugin-example/rsbuild.config.ts
new file mode 100644
index 00000000..73a50563
--- /dev/null
+++ b/perses-plugin-example/rsbuild.config.ts
@@ -0,0 +1,28 @@
+import { defineConfig } from "@rsbuild/core";
+import { pluginReact } from "@rsbuild/plugin-react";
+
+export default defineConfig({
+  plugins: [pluginReact()],
+  tools: {
+    rspack: {
+      output: {
+        uniqueName: "example-panel-plugin",
+      },
+    },
+  },
+  moduleFederation: {
+    options: {
+      name: "ExamplePanelPlugin",
+      filename: "remoteEntry.js",
+      exposes: {
+        ".": "./src/index.ts",
+      },
+      shared: {
+        react: { singleton: true, requiredVersion: "^18.2.0" },
+        "react-dom": { singleton: true, requiredVersion: "^18.2.0" },
+        "@perses-dev/core": { singleton: true },
+        "@perses-dev/plugin-system": { singleton: true },
+      },
+    },
+  },
+});
diff --git a/perses-plugin-example/schemas/panels/example-panel/example-panel.cue b/perses-plugin-example/schemas/panels/example-panel/example-panel.cue
new file mode 100644
index 00000000..2ef69d43
--- /dev/null
+++ b/perses-plugin-example/schemas/panels/example-panel/example-panel.cue
@@ -0,0 +1,11 @@
+package model
+
+kind: "ExamplePanel"
+spec: close({
+	// query is the data query string to execute against the datasource.
+	// Required — panel cannot render without a target query.
+	query: string
+
+	// unit is an optional display unit appended to rendered values (e.g. "ms", "%", "req/s").
+	unit?: string
+})
diff --git a/perses-plugin-example/schemas/panels/example-panel/example-panel.json b/perses-plugin-example/schemas/panels/example-panel/example-panel.json
new file mode 100644
index 00000000..a7ca867d
--- /dev/null
+++ b/perses-plugin-example/schemas/panels/example-panel/example-panel.json
@@ -0,0 +1,6 @@
+{
+  "kind": "ExamplePanel",
+  "spec": {
+    "query": "up{job=\"prometheus\"}"
+  }
+}
diff --git a/perses-plugin-example/schemas/panels/example-panel/testdata/full-config.json b/perses-plugin-example/schemas/panels/example-panel/testdata/full-config.json
new file mode 100644
index 00000000..4b055cc5
--- /dev/null
+++ b/perses-plugin-example/schemas/panels/example-panel/testdata/full-config.json
@@ -0,0 +1,7 @@
+{
+  "kind": "ExamplePanel",
+  "spec": {
+    "query": "rate(http_requests_total{job=\"api-server\"}[5m])",
+    "unit": "req/s"
+  }
+}
diff --git a/perses-plugin-example/src/ExamplePanel.tsx b/perses-plugin-example/src/ExamplePanel.tsx
new file mode 100644
index 00000000..fc27d2af
--- /dev/null
+++ b/perses-plugin-example/src/ExamplePanel.tsx
@@ -0,0 +1,50 @@
+import React from 'react';
+import { PanelProps } from '@perses-dev/plugin-system';
+import { ExamplePanelSpec } from './ExamplePanelTypes';
+
+/**
+ * ExamplePanel renders the configured query string and optional unit.
+ *
+ * This is a minimal display panel used as a scaffolding reference.
+ * Replace the body with chart/table rendering as needed.
+ */
+export function ExamplePanel(props: PanelProps<ExamplePanelSpec>): JSX.Element {
+  const { spec } = props;
+
+  return (
+    <div
+      style={{
+        display: 'flex',
+        flexDirection: 'column',
+        alignItems: 'center',
+        justifyContent: 'center',
+        height: '100%',
+        padding: '16px',
+        fontFamily: 'monospace',
+        gap: '8px',
+      }}
+    >
+      <div style={{ fontSize: '14px', color: '#666' }}>Query</div>
+      <div
+        style={{
+          fontSize: '13px',
+          background: '#f5f5f5',
+          borderRadius: '4px',
+          padding: '8px 12px',
+          maxWidth: '100%',
+          overflowX: 'auto',
+          whiteSpace: 'pre-wrap',
+          wordBreak: 'break-all',
+        }}
+      >
+        {spec.query}
+      </div>
+      {spec.unit !== undefined && (
+        <>
+          <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>Unit</div>
+          <div style={{ fontSize: '13px' }}>{spec.unit}</div>
+        </>
+      )}
+    </div>
+  );
+}
diff --git a/perses-plugin-example/src/ExamplePanelTypes.ts b/perses-plugin-example/src/ExamplePanelTypes.ts
new file mode 100644
index 00000000..9214db9f
--- /dev/null
+++ b/perses-plugin-example/src/ExamplePanelTypes.ts
@@ -0,0 +1,13 @@
+/**
+ * ExamplePanelSpec mirrors the CUE schema at
+ * schemas/panels/example-panel/example-panel.cue.
+ *
+ * Field names and optionality MUST stay in sync with the CUE definition.
+ */
+export interface ExamplePanelSpec {
+  /** The data query string executed against the configured datasource. */
+  query: string;
+
+  /** Optional display unit appended to rendered values (e.g. "ms", "%", "req/s"). */
+  unit?: string;
+}
diff --git a/perses-plugin-example/src/index.ts b/perses-plugin-example/src/index.ts
new file mode 100644
index 00000000..5e6c30eb
--- /dev/null
+++ b/perses-plugin-example/src/index.ts
@@ -0,0 +1,19 @@
+import { PanelPlugin } from '@perses-dev/plugin-system';
+import { ExamplePanel } from './ExamplePanel';
+import { ExamplePanelSpec } from './ExamplePanelTypes';
+
+/**
+ * Plugin registration.
+ *
+ * The `kind` string "ExamplePanel" MUST match:
+ *   - The `kind` field in schemas/panels/example-panel/example-panel.cue
+ *   - The `kind` field in any Perses dashboard panel definition referencing this plugin
+ */
+export const ExamplePanelPlugin: PanelPlugin<ExamplePanelSpec> = {
+  PanelComponent: ExamplePanel,
+  panelOptionsEditorComponents: [],
+  hide: false,
+};
+
+export { ExamplePanel } from './ExamplePanel';
+export type { ExamplePanelSpec } from './ExamplePanelTypes';
diff --git a/perses-plugin-example/tsconfig.json b/perses-plugin-example/tsconfig.json
new file mode 100644
index 00000000..e7ac9abc
--- /dev/null
+++ b/perses-plugin-example/tsconfig.json
@@ -0,0 +1,23 @@
+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "lib": ["ES2020", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "jsx": "react-jsx",
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "allowSyntheticDefaultImports": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "resolveJsonModule": true,
+    "outDir": "dist",
+    "declaration": true,
+    "declarationMap": true,
+    "sourceMap": true
+  },
+  "include": ["src"],
+  "exclude": ["node_modules", "dist", "__mf"]
+}
diff --git a/plugins/custom-panel/package.json b/plugins/custom-panel/package.json
new file mode 100644
index 00000000..7f5819c8
--- /dev/null
+++ b/plugins/custom-panel/package.json
@@ -0,0 +1,30 @@
+{
+  "name": "@perses-dev/custom-panel-plugin",
+  "version": "0.1.0",
+  "description": "Custom panel plugin for Perses",
+  "main": "src/index.ts",
+  "scripts": {
+    "dev": "rsbuild dev",
+    "build": "rsbuild build",
+    "preview": "rsbuild preview",
+    "type-check": "tsc --noEmit",
+    "test-schemas": "percli plugin test-schemas"
+  },
+  "dependencies": {
+    "@perses-dev/core": "^0.48.0",
+    "@perses-dev/plugin-system": "^0.48.0",
+    "react": "^18.2.0",
+    "react-dom": "^18.2.0"
+  },
+  "devDependencies": {
+    "@rsbuild/core": "^0.7.0",
+    "@rsbuild/plugin-react": "^0.7.0",
+    "@types/react": "^18.2.0",
+    "@types/react-dom": "^18.2.0",
+    "typescript": "^5.4.0"
+  },
+  "peerDependencies": {
+    "react": "^18.2.0",
+    "react-dom": "^18.2.0"
+  }
+}
diff --git a/plugins/custom-panel/rsbuild.config.ts b/plugins/custom-panel/rsbuild.config.ts
new file mode 100644
index 00000000..dd79ebde
--- /dev/null
+++ b/plugins/custom-panel/rsbuild.config.ts
@@ -0,0 +1,28 @@
+import { defineConfig } from "@rsbuild/core";
+import { pluginReact } from "@rsbuild/plugin-react";
+
+export default defineConfig({
+  plugins: [pluginReact()],
+  tools: {
+    rspack: {
+      output: {
+        uniqueName: "custom-panel-plugin",
+      },
+    },
+  },
+  moduleFederation: {
+    options: {
+      name: "CustomPanelPlugin",
+      filename: "remoteEntry.js",
+      exposes: {
+        ".": "./src/index.ts",
+      },
+      shared: {
+        react: { singleton: true, requiredVersion: "^18.2.0" },
+        "react-dom": { singleton: true, requiredVersion: "^18.2.0" },
+        "@perses-dev/core": { singleton: true },
+        "@perses-dev/plugin-system": { singleton: true },
+      },
+    },
+  },
+});
diff --git a/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.cue b/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.cue
new file mode 100644
index 00000000..9bcaff12
--- /dev/null
+++ b/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.cue
@@ -0,0 +1,25 @@
+// Copyright 2024 The Perses Authors
+// Licensed under the Apache License, Version 2.0
+
+package model
+
+kind: "CustomPanel"
+spec: close({
+	// title is the display label rendered at the top of the panel.
+	title: string
+
+	// unit controls how numeric values are formatted (e.g. "bytes", "percent", "short").
+	unit?: string
+
+	// thresholds defines a list of color-coded threshold steps.
+	// Each step specifies a numeric value and a display color.
+	thresholds?: [...#ThresholdStep]
+})
+
+// ThresholdStep pairs a numeric boundary with a display color.
+#ThresholdStep: {
+	// value is the lower boundary of this threshold band.
+	value: number
+	// color is a CSS-compatible color string (e.g. "#e02f44", "green").
+	color: string
+}
diff --git a/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.json b/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.json
new file mode 100644
index 00000000..782f5ca2
--- /dev/null
+++ b/plugins/custom-panel/schemas/panels/custom-panel/custom-panel.json
@@ -0,0 +1,6 @@
+{
+  "kind": "CustomPanel",
+  "spec": {
+    "title": "My Custom Panel"
+  }
+}
diff --git a/plugins/custom-panel/schemas/panels/custom-panel/migrate/migrate.cue b/plugins/custom-panel/schemas/panels/custom-panel/migrate/migrate.cue
new file mode 100644
index 00000000..be65f70b
--- /dev/null
+++ b/plugins/custom-panel/schemas/panels/custom-panel/migrate/migrate.cue
@@ -0,0 +1,44 @@
+// Copyright 2024 The Perses Authors
+// Licensed under the Apache License, Version 2.0
+//
+// migrate.cue maps a Grafana "stat" panel definition to a Perses CustomPanel spec.
+// Supported Grafana panel types: stat, singlestat
+//
+// Unsupported Grafana fields (no direct equivalent in CustomPanel):
+//   - options.graphMode
+//   - options.colorMode
+//   - options.justifyMode
+//   - fieldConfig.defaults.mappings
+
+package migrate
+
+import (
+	"github.com/perses/perses/cue/schemas/panels/migrate"
+)
+
+migrate.#Panel & {
+	// target is the resulting Perses panel spec after migration.
+	target: {
+		kind: "CustomPanel"
+		spec: {
+			// Map the Grafana panel title to the Perses title field.
+			title: grafana.title
+
+			// Map the Grafana unit override if present.
+			if grafana.fieldConfig.defaults.unit != _|_ {
+				unit: grafana.fieldConfig.defaults.unit
+			}
+
+			// Map Grafana threshold steps to Perses threshold steps.
+			if grafana.fieldConfig.defaults.thresholds.steps != _|_ {
+				thresholds: [
+					for step in grafana.fieldConfig.defaults.thresholds.steps
+					if step.value != _|_ {
+						value: step.value
+						color: step.color
+					},
+				]
+			}
+		}
+	}
+}
diff --git a/plugins/custom-panel/schemas/panels/custom-panel/testdata/full-config.json b/plugins/custom-panel/schemas/panels/custom-panel/testdata/full-config.json
new file mode 100644
index 00000000..513d3614
--- /dev/null
+++ b/plugins/custom-panel/schemas/panels/custom-panel/testdata/full-config.json
@@ -0,0 +1,12 @@
+{
+  "kind": "CustomPanel",
+  "spec": {
+    "title": "Request Latency",
+    "unit": "ms",
+    "thresholds": [
+      { "value": 0, "color": "green" },
+      { "value": 200, "color": "#ff9900" },
+      { "value": 500, "color": "#e02f44" }
+    ]
+  }
+}
diff --git a/plugins/custom-panel/src/PanelComponent.tsx b/plugins/custom-panel/src/PanelComponent.tsx
new file mode 100644
index 00000000..bdd72920
--- /dev/null
+++ b/plugins/custom-panel/src/PanelComponent.tsx
@@ -0,0 +1,160 @@
+import React from "react";
+import { PanelProps } from "@perses-dev/plugin-system";
+import { CustomPanelSpec, ThresholdStep } from "./types";
+
+/**
+ * resolveThresholdColor returns the color of the highest threshold whose value
+ * is <= the given number, or undefined when there is no value or no match.
+ */
+function resolveThresholdColor(
+  value: number | undefined,
+  thresholds: ThresholdStep[] | undefined
+): string | undefined {
+  if (value === undefined || !thresholds || thresholds.length === 0) {
+    return undefined;
+  }
+  const sorted = [...thresholds].sort((a, b) => a.value - b.value);
+  let resolved: string | undefined;
+  for (const step of sorted) {
+    if (value >= step.value) {
+      resolved = step.color;
+    }
+  }
+  return resolved;
+}
+
+/**
+ * CustomPanelComponent renders the CustomPanel spec.
+ *
+ * - Displays the configured title as the panel heading.
+ * - Shows each threshold step as a color swatch with its boundary value.
+ * - Applies the appropriate threshold color to the unit label when a
+ *   representative value is available from the panel data context.
+ */
+export function CustomPanelComponent({
+  spec,
+}: PanelProps<CustomPanelSpec>): React.ReactElement {
+  const { title, unit, thresholds } = spec;
+
+  // Placeholder: no query data is wired into this scaffold yet, so the
+  // representative value is always undefined and the unit badge keeps its
+  // default background. Derive it from the panel's query results when available.
+  const representativeValue: number | undefined = undefined;
+  const activeColor = resolveThresholdColor(representativeValue, thresholds);
+
+  return (
+    <div style={styles.container}>
+      {/* Panel heading */}
+      <h2 style={styles.title}>{title}</h2>
+
+      {/* Unit display with optional threshold color */}
+      {unit !== undefined && (
+        <div
+          style={{
+            ...styles.unitBadge,
+            backgroundColor: activeColor ?? styles.unitBadge.backgroundColor,
+          }}
+        >
+          {unit}
+        </div>
+      )}
+
+      {/* Threshold legend */}
+      {thresholds && thresholds.length > 0 && (
+        <section style={styles.thresholdsSection}>
+          <h3 style={styles.thresholdsHeading}>Thresholds</h3>
+          <ul style={styles.thresholdList}>
+            {thresholds
+              .slice()
+              .sort((a, b) => a.value - b.value)
+              .map((step, idx) => (
+                <li key={idx} style={styles.thresholdItem}>
+                  <span
+                    style={{
+                      ...styles.swatch,
+                      backgroundColor: step.color,
+                    }}
+                    aria-label={`Threshold color: ${step.color}`}
+                  />
+                  <span style={styles.thresholdLabel}>
+                    &ge; {step.value}
+                    {unit ? ` ${unit}` : ""}
+                  </span>
+                </li>
+              ))}
+          </ul>
+        </section>
+      )}
+
+      {/* Empty state */}
+      {(!thresholds || thresholds.length === 0) && unit === undefined && (
+        <p style={styles.emptyState}>No configuration to display.</p>
+      )}
+    </div>
+  );
+}
+
+// Inline styles — replace with your design system tokens or CSS modules as needed.
+const styles = {
+  container: {
+    padding: "12px 16px",
+    fontFamily: "inherit",
+    height: "100%",
+    boxSizing: "border-box" as const,
+    overflow: "auto",
+  },
+  title: {
+    margin: "0 0 8px 0",
+    fontSize: "1rem",
+    fontWeight: 600,
+    lineHeight: 1.4,
+  },
+  unitBadge: {
+    display: "inline-block",
+    padding: "2px 8px",
+    borderRadius: "4px",
+    fontSize: "0.875rem",
+    fontWeight: 500,
+    backgroundColor: "#e0e0e0",
+    marginBottom: "12px",
+  },
+  thresholdsSection: {
+    marginTop: "8px",
+  },
+  thresholdsHeading: {
+    margin: "0 0 6px 0",
+    fontSize: "0.75rem",
+    fontWeight: 600,
+    textTransform: "uppercase" as const,
+    letterSpacing: "0.05em",
+    color: "#666",
+  },
+  thresholdList: {
+    listStyle: "none",
+    margin: 0,
+    padding: 0,
+    display: "flex",
+    flexDirection: "column" as const,
+    gap: "4px",
+  },
+  thresholdItem: {
+    display: "flex",
+    alignItems: "center",
+    gap: "8px",
+  },
+  swatch: {
+    width: "14px",
+    height: "14px",
+    borderRadius: "2px",
+    flexShrink: 0,
+    border: "1px solid rgba(0,0,0,0.1)",
+  },
+  thresholdLabel: {
+    fontSize: "0.875rem",
+  },
+  emptyState: {
+    color: "#999",
+    fontSize: "0.875rem",
+    margin: 0,
+  },
+} as const;
diff --git a/plugins/custom-panel/src/index.ts b/plugins/custom-panel/src/index.ts
new file mode 100644
index 00000000..08f518b2
--- /dev/null
+++ b/plugins/custom-panel/src/index.ts
@@ -0,0 +1,19 @@
+import { PanelPlugin } from "@perses-dev/plugin-system";
+import { CustomPanelComponent } from "./PanelComponent";
+import { CustomPanelSpec } from "./types";
+
+/**
+ * Plugin registration.
+ *
+ * The `kind` string "CustomPanel" MUST match:
+ *   - The `kind` field in schemas/panels/custom-panel/custom-panel.cue
+ *   - The `kind` field in any Perses dashboard panel definition referencing this plugin
+ */
+export const CustomPanelPlugin: PanelPlugin<CustomPanelSpec> = {
+  PanelComponent: CustomPanelComponent,
+  panelOptionsEditorComponents: [],
+  hide: false,
+};
+
+export { CustomPanelComponent } from "./PanelComponent";
+export type { CustomPanelSpec, ThresholdStep } from "./types";
diff --git a/plugins/custom-panel/src/types.ts b/plugins/custom-panel/src/types.ts
new file mode 100644
index 00000000..aead06c3
--- /dev/null
+++ b/plugins/custom-panel/src/types.ts
@@ -0,0 +1,21 @@
+/**
+ * ThresholdStep pairs a numeric lower boundary with a CSS color string.
+ * Mirrors the #ThresholdStep CUE definition in the schema.
+ */
+export interface ThresholdStep {
+  value: number;
+  color: string;
+}
+
+/**
+ * CustomPanelSpec is the validated configuration for a CustomPanel.
+ * All fields mirror the CUE schema at schemas/panels/custom-panel/custom-panel.cue.
+ */
+export interface CustomPanelSpec {
+  /** Display label rendered at the top of the panel. */
+  title: string;
+  /** Value formatting unit (e.g. "bytes", "percent", "ms", "short"). */
+  unit?: string;
+  /** Color-coded threshold steps. */
+  thresholds?: ThresholdStep[];
+}
diff --git a/plugins/custom-panel/tsconfig.json b/plugins/custom-panel/tsconfig.json
new file mode 100644
index 00000000..e7ac9abc
--- /dev/null
+++ b/plugins/custom-panel/tsconfig.json
@@ -0,0 +1,23 @@
+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "lib": ["ES2020", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "jsx": "react-jsx",
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "allowSyntheticDefaultImports": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "resolveJsonModule": true,
+    "outDir": "dist",
+    "declaration": true,
+    "declarationMap": true,
+    "sourceMap": true
+  },
+  "include": ["src"],
+  "exclude": ["node_modules", "dist", "__mf"]
+}
diff --git a/plugins/example-panel/schemas/panels/example-panel/example-panel.cue b/plugins/example-panel/schemas/panels/example-panel/example-panel.cue
new file mode 100644
index 00000000..4132ca9c
--- /dev/null
+++ b/plugins/example-panel/schemas/panels/example-panel/example-panel.cue
@@ -0,0 +1,21 @@
+// Copyright 2024 The Perses Authors
+// Licensed under the Apache License, Version 2.0
+
+package model
+
+kind: "ExamplePanel"
+spec: close({
+	// text is the message displayed in the center of the panel.
+	// Defaults to "Hello from ExamplePanel" when omitted.
+	text: string | *"Hello from ExamplePanel"
+
+	// color is a CSS-compatible color string applied to the text.
+	// Accepts any valid CSS color: hex (#333333), named (red), rgb(...).
+	color: string | *"#333333"
+
+	// fontSize controls text size in pixels. Values outside the range 10–72 are rejected.
+	fontSize: int & >=10 & <=72 | *16
+
+	// align controls horizontal text alignment within the panel.
+	align: "left" | "center" | "right" | *"center"
+})
diff --git a/plugins/example-panel/schemas/panels/example-panel/example-panel.json b/plugins/example-panel/schemas/panels/example-panel/example-panel.json
new file mode 100644
index 00000000..cb7b5d11
--- /dev/null
+++ b/plugins/example-panel/schemas/panels/example-panel/example-panel.json
@@ -0,0 +1,6 @@
+{
+  "kind": "ExamplePanel",
+  "spec": {
+    "text": "Hello from ExamplePanel"
+  }
+}
diff --git a/plugins/example-panel/schemas/panels/example-panel/testdata/full-config.json b/plugins/example-panel/schemas/panels/example-panel/testdata/full-config.json
new file mode 100644
index 00000000..34e00442
--- /dev/null
+++ b/plugins/example-panel/schemas/panels/example-panel/testdata/full-config.json
@@ -0,0 +1,9 @@
+{
+  "kind": "ExamplePanel",
+  "spec": {
+    "text": "System Status: Nominal",
+    "color": "#1a7f37",
+    "fontSize": 24,
+    "align": "center"
+  }
+}
diff --git a/scripts/skill_eval/run_eval.py b/scripts/skill_eval/run_eval.py
index 383e74b5..372a877a 100755
--- a/scripts/skill_eval/run_eval.py
+++ b/scripts/skill_eval/run_eval.py
@@ -6,11 +6,14 @@
 """
 
 import argparse
+import contextlib
 import json
 import os
 import select
+import shutil
 import subprocess
 import sys
+import tempfile
 import time
 import uuid
 from concurrent.futures import ProcessPoolExecutor, as_completed
@@ -32,40 +35,147 @@ def find_project_root() -> Path:
     return current
 
 
+def resolve_registered_skill_relpath(skill_path: Path, project_root: Path) -> Path | None:
+    """Return repo-relative SKILL.md path when `skill_path` is a registered repo skill."""
+    skill_md = (skill_path / "SKILL.md").resolve()
+    try:
+        rel = skill_md.relative_to(project_root.resolve())
+    except ValueError:
+        return None
+    if len(rel.parts) >= 3 and rel.parts[0] == "skills" and rel.parts[-1] == "SKILL.md":
+        return rel
+    return None
+
+
+def replace_description_in_skill_md(content: str, new_description: str) -> str:
+    """Replace the top-level frontmatter description field in SKILL.md content."""
+    lines = content.splitlines()
+    if not lines or lines[0].strip() != "---":
+        raise ValueError("SKILL.md missing frontmatter (no opening ---)")
+
+    end_idx = None
+    for i, line in enumerate(lines[1:], start=1):
+        if line.strip() == "---":
+            end_idx = i
+            break
+    if end_idx is None:
+        raise ValueError("SKILL.md missing frontmatter (no closing ---)")
+
+    frontmatter_lines = lines[1:end_idx]
+    body_lines = lines[end_idx + 1 :]
+    updated_frontmatter: list[str] = []
+    replaced = False
+    i = 0
+    while i < len(frontmatter_lines):
+        line = frontmatter_lines[i]
+        if not replaced and line.startswith("description:"):
+            updated_frontmatter.append("description: |")
+            updated_frontmatter.extend(f"  {desc_line}" for desc_line in new_description.splitlines())
+            replaced = True
+            i += 1
+            while i < len(frontmatter_lines) and (
+                frontmatter_lines[i].startswith("  ") or frontmatter_lines[i].startswith("\t")
+            ):
+                i += 1
+            continue
+        updated_frontmatter.append(line)
+        i += 1
+
+    if not replaced:
+        raise ValueError("SKILL.md frontmatter missing description field")
+
+    rebuilt = ["---", *updated_frontmatter, "---", *body_lines]
+    return "\n".join(rebuilt) + ("\n" if content.endswith("\n") else "")
+
+
+def load_eval_set(path: Path) -> list[dict]:
+    """Load eval tasks from list or common wrapped JSON shapes."""
+    payload = json.loads(path.read_text())
+    if isinstance(payload, list):
+        return payload
+    if isinstance(payload, dict):
+        if "tasks" in payload and isinstance(payload["tasks"], list):
+            return payload["tasks"]
+        if "queries" in payload and isinstance(payload["queries"], list):
+            return payload["queries"]
+        train = payload.get("train")
+        test = payload.get("test")
+        if isinstance(train, list) or isinstance(test, list):
+            return [*(train or []), *(test or [])]
+    raise ValueError(
+        "Unsupported eval set format; expected list, {tasks:[...]}, {queries:[...]}, or {train:[...], test:[...]}"
+    )
+
+
+@contextlib.contextmanager
+def candidate_worktree(project_root: Path, registered_skill_relpath: Path, candidate_content: str | None):
+    """Create a temporary git worktree and optionally patch the target skill content."""
+    wt_path_str = tempfile.mkdtemp(prefix="skill-eval-wt-", dir="/tmp")
+    wt_path = Path(wt_path_str)
+    wt_path.rmdir()
+    try:
+        subprocess.run(
+            ["git", "worktree", "add", wt_path_str, "HEAD"],
+            cwd=str(project_root),
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        if candidate_content is not None:
+            (wt_path / registered_skill_relpath).write_text(candidate_content)
+        yield wt_path
+    finally:
+        try:
+            subprocess.run(
+                ["git", "worktree", "remove", "--force", wt_path_str],
+                cwd=str(project_root),
+                capture_output=True,
+                text=True,
+            )
+        except Exception:
+            pass
+        shutil.rmtree(wt_path_str, ignore_errors=True)
+
+
 def run_single_query(
     query: str,
     skill_name: str,
     skill_description: str,
     timeout: int,
     project_root: str,
+    eval_mode: str = "alias",
     model: str | None = None,
 ) -> bool:
     """Run a single query and return whether the skill was triggered.
 
-    Creates a command file in .claude/commands/ so it appears in Claude's
-    available_skills list, then runs `claude -p` with the raw query.
+    In alias mode, creates a command file in .claude/commands/ so it appears in
+    Claude's available skills list. In registered mode, assumes the real skill
+    is already present in the isolated worktree and detects only the real name.
+
     Uses --include-partial-messages to detect triggering early from
     stream events (content_block_start) rather than waiting for the
     full assistant message, which only arrives after tool execution.
     """
     unique_id = uuid.uuid4().hex[:8]
     clean_name = f"{skill_name}-skill-{unique_id}"
+    accepted_skill_ids = {clean_name} if eval_mode == "alias" else {skill_name}
     project_commands_dir = Path(project_root) / ".claude" / "commands"
     command_file = project_commands_dir / f"{clean_name}.md"
 
     try:
-        project_commands_dir.mkdir(parents=True, exist_ok=True)
-        # Use YAML block scalar to avoid breaking on quotes in description
-        indented_desc = "\n  ".join(skill_description.split("\n"))
-        command_content = (
-            f"---\n"
-            f"description: |\n"
-            f"  {indented_desc}\n"
-            f"---\n\n"
-            f"# {skill_name}\n\n"
-            f"This skill handles: {skill_description}\n"
-        )
-        command_file.write_text(command_content)
+        if eval_mode == "alias":
+            project_commands_dir.mkdir(parents=True, exist_ok=True)
+            # Use YAML block scalar to avoid breaking on quotes in description
+            indented_desc = "\n  ".join(skill_description.split("\n"))
+            command_content = (
+                f"---\n"
+                f"description: |\n"
+                f"  {indented_desc}\n"
+                f"---\n\n"
+                f"# {skill_name}\n\n"
+                f"This skill handles: {skill_description}\n"
+            )
+            command_file.write_text(command_content)
 
         cmd = [
             "claude",
@@ -140,20 +250,24 @@ def run_single_query(
                                     pending_tool_name = tool_name
                                     accumulated_json = ""
                                 else:
-                                    return False
+                                    pending_tool_name = None
+                                    accumulated_json = ""
 
                         elif se_type == "content_block_delta" and pending_tool_name:
                             delta = se.get("delta", {})
                             if delta.get("type") == "input_json_delta":
                                 accumulated_json += delta.get("partial_json", "")
-                                if clean_name in accumulated_json:
-                                    return True
+                                if any(skill_id in accumulated_json for skill_id in accepted_skill_ids):
+                                    triggered = True
 
                         elif se_type in ("content_block_stop", "message_stop"):
                             if pending_tool_name:
-                                return clean_name in accumulated_json
+                                if any(skill_id in accumulated_json for skill_id in accepted_skill_ids):
+                                    triggered = True
+                                pending_tool_name = None
+                                accumulated_json = ""
                             if se_type == "message_stop":
-                                return False
+                                return triggered
 
                     # Fallback: full assistant message
                     elif event.get("type") == "assistant":
@@ -163,11 +277,16 @@ def run_single_query(
                                 continue
                             tool_name = content_item.get("name", "")
                             tool_input = content_item.get("input", {})
-                            if (tool_name == "Skill" and clean_name in tool_input.get("skill", "")) or (
-                                tool_name == "Read" and clean_name in tool_input.get("file_path", "")
+                            if (
+                                tool_name == "Skill"
+                                and any(skill_id in tool_input.get("skill", "") for skill_id in accepted_skill_ids)
+                            ) or (
+                                tool_name == "Read"
+                                and any(skill_id in tool_input.get("file_path", "") for skill_id in accepted_skill_ids)
                             ):
                                 triggered = True
-                            return triggered
+                        if triggered:
+                            return True
 
                     elif event.get("type") == "result":
                         return triggered
@@ -179,7 +298,7 @@ def run_single_query(
 
         return triggered
     finally:
-        if command_file.exists():
+        if eval_mode == "alias" and command_file.exists():
             command_file.unlink()
 
 
@@ -192,39 +311,69 @@ def run_eval(
     project_root: Path,
     runs_per_query: int = 1,
     trigger_threshold: float = 0.5,
+    eval_mode: str = "auto",
+    skill_path: Path | None = None,
+    candidate_content: str | None = None,
     model: str | None = None,
 ) -> dict:
     """Run the full eval set and return results."""
     results = []
 
-    with ProcessPoolExecutor(max_workers=num_workers) as executor:
-        future_to_info = {}
-        for item in eval_set:
-            for run_idx in range(runs_per_query):
-                future = executor.submit(
-                    run_single_query,
-                    item["query"],
-                    skill_name,
-                    description,
-                    timeout,
-                    str(project_root),
-                    model,
-                )
-                future_to_info[future] = (item, run_idx)
-
-        query_triggers: dict[str, list[bool]] = {}
-        query_items: dict[str, dict] = {}
-        for future in as_completed(future_to_info):
-            item, _ = future_to_info[future]
-            query = item["query"]
-            query_items[query] = item
-            if query not in query_triggers:
-                query_triggers[query] = []
-            try:
-                query_triggers[query].append(future.result())
-            except Exception as e:
-                print(f"Warning: query failed: {e}", file=sys.stderr)
-                query_triggers[query].append(False)
+    effective_mode = eval_mode
+    effective_project_root = project_root
+    worktree_cm = contextlib.nullcontext(project_root)
+
+    if effective_mode == "auto":
+        if skill_path is not None and resolve_registered_skill_relpath(skill_path, project_root) is not None:
+            effective_mode = "registered"
+        else:
+            effective_mode = "alias"
+
+    if effective_mode == "registered":
+        if skill_path is None:
+            raise ValueError("registered eval mode requires skill_path")
+        relpath = resolve_registered_skill_relpath(skill_path, project_root)
+        if relpath is None:
+            raise ValueError("registered eval mode requires skill_path under project_root/skills/*/SKILL.md")
+        _name, original_description, original_content = parse_skill_md(skill_path)
+        if candidate_content is None:
+            if description != original_description:
+                candidate_content = replace_description_in_skill_md(original_content, description)
+            else:
+                candidate_content = original_content
+        worktree_cm = candidate_worktree(project_root, relpath, candidate_content)
+
+    with worktree_cm as active_project_root:
+        effective_project_root = active_project_root
+        with ProcessPoolExecutor(max_workers=num_workers) as executor:
+            future_to_info = {}
+            for item in eval_set:
+                for run_idx in range(runs_per_query):
+                    future = executor.submit(
+                        run_single_query,
+                        item["query"],
+                        skill_name,
+                        description,
+                        timeout,
+                        str(effective_project_root),
+                        effective_mode,
+                        model,
+                    )
+                    future_to_info[future] = (item, run_idx)
+
+            query_triggers: dict[str, list[bool]] = {}
+            query_items: dict[str, dict] = {}
+            for future in as_completed(future_to_info):
+                item, _ = future_to_info[future]
+                query = item["query"]
+                query_items[query] = item
+                if query not in query_triggers:
+                    query_triggers[query] = []
+                try:
+                    query_triggers[query].append(future.result())
+                except Exception as e:
+                    print(f"Warning: query failed: {e}", file=sys.stderr)
+                    query_triggers[query].append(False)
 
     for query, triggers in query_triggers.items():
         item = query_items[query]
@@ -266,15 +415,17 @@ def main():
     parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
     parser.add_argument("--skill-path", required=True, help="Path to skill directory")
     parser.add_argument("--description", default=None, help="Override description to test")
-    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
+    parser.add_argument("--candidate-content-file", default=None, help="Optional full SKILL.md content to evaluate")
+    parser.add_argument("--eval-mode", choices=["auto", "registered", "alias"], default="auto", help="Evaluator mode")
+    parser.add_argument("--num-workers", type=int, default=1, help="Number of parallel workers")
     parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
-    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
+    parser.add_argument("--runs-per-query", type=int, default=1, help="Number of runs per query")
     parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
     parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)")
     parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
     args = parser.parse_args()
 
-    eval_set = json.loads(Path(args.eval_set).read_text())
+    eval_set = load_eval_set(Path(args.eval_set))
     skill_path = Path(args.skill_path)
 
     if not (skill_path / "SKILL.md").exists():
@@ -284,9 +435,11 @@ def main():
     name, original_description, _content = parse_skill_md(skill_path)
     description = args.description or original_description
     project_root = find_project_root()
+    candidate_content = Path(args.candidate_content_file).read_text() if args.candidate_content_file else None
 
     if args.verbose:
         print(f"Evaluating: {description}", file=sys.stderr)
+        print(f"Eval mode: {args.eval_mode}", file=sys.stderr)
 
     output = run_eval(
         eval_set=eval_set,
@@ -297,6 +450,9 @@ def main():
         project_root=project_root,
         runs_per_query=args.runs_per_query,
         trigger_threshold=args.trigger_threshold,
+        eval_mode=args.eval_mode,
+        skill_path=skill_path,
+        candidate_content=candidate_content,
         model=args.model,
     )
 
diff --git a/scripts/tests/test_agent_comparison_optimize_loop.py b/scripts/tests/test_agent_comparison_optimize_loop.py
index 662b63f8..4253aef8 100644
--- a/scripts/tests/test_agent_comparison_optimize_loop.py
+++ b/scripts/tests/test_agent_comparison_optimize_loop.py
@@ -1,3 +1,4 @@
+import contextlib
 import importlib.util
 import json
 import subprocess
@@ -110,11 +111,203 @@ def fake_run(cmd, capture_output, text, cwd, env, timeout):
     generate_variant.main()
     output = json.loads(capsys.readouterr().out)
 
-    assert output["variant"] == "---\ndescription: updated\n---"
+    assert generate_variant.extract_description(output["variant"]) == "updated"
     assert output["tokens_used"] == 3
     assert output["reasoning"] == "raw result"
 
 
+def test_generate_variant_only_changes_description_field(monkeypatch):
+    generate_variant = load_module(
+        "agent_comparison_generate_variant_description_only",
+        "skills/agent-comparison/scripts/generate_variant.py",
+    )
+
+    current_content = """---
+name: example-skill
+description: |
+  old description
+routing:
+  triggers:
+    - "keep-this-trigger"
+---
+
+# Skill
+
+Body stays the same.
+"""
+
+    def fake_run_claude_code(prompt, model):
+        return (
+            "<description>new description line 1\nnew description line 2</description>"
+            "<summary>improved description</summary><deletion_justification></deletion_justification>",
+            "raw result",
+            9,
+        )
+
+    monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code)
+
+    result = generate_variant.generate_variant(
+        target_path="skills/example/SKILL.md",
+        goal="improve routing precision",
+        current_content=current_content,
+        failures=[],
+        model=None,
+    )
+
+    assert generate_variant.extract_description(result["variant"]) == "new description line 1\nnew description line 2"
+    assert '    - "keep-this-trigger"' in result["variant"]
+    assert "# Skill" in result["variant"]
+    assert "Body stays the same." in result["variant"]
+    assert result["deletions"] == []
+
+
+def test_generate_variant_legacy_full_file_output_is_reduced_to_description_only(monkeypatch):
+    generate_variant = load_module(
+        "agent_comparison_generate_variant_legacy_variant",
+        "skills/agent-comparison/scripts/generate_variant.py",
+    )
+
+    current_content = """---
+name: example-skill
+description: old description
+routing:
+  triggers:
+    - "keep-this-trigger"
+---
+
+# Skill
+
+Original body.
+"""
+
+    legacy_variant = """---
+name: example-skill
+description: updated description
+routing:
+  triggers:
+    - "changed-trigger"
+---
+
+# Skill
+
+Changed body.
+"""
+
+    def fake_run_claude_code(prompt, model):
+        return (
+            f"<variant>{legacy_variant}</variant><summary>legacy response</summary>"
+            "<deletion_justification></deletion_justification>",
+            "raw result",
+            5,
+        )
+
+    monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code)
+
+    result = generate_variant.generate_variant(
+        target_path="skills/example/SKILL.md",
+        goal="improve routing precision",
+        current_content=current_content,
+        failures=[],
+        model=None,
+    )
+
+    assert generate_variant.extract_description(result["variant"]) == "updated description"
+    assert '    - "keep-this-trigger"' in result["variant"]
+    assert '    - "changed-trigger"' not in result["variant"]
+    assert "Original body." in result["variant"]
+    assert "Changed body." not in result["variant"]
+
+
+def test_generate_variant_body_only_changes_body_not_frontmatter(monkeypatch):
+    generate_variant = load_module(
+        "agent_comparison_generate_variant_body_only",
+        "skills/agent-comparison/scripts/generate_variant.py",
+    )
+
+    current_content = """---
+name: example-skill
+description: old description
+version: 1.0.0
+---
+
+# Skill
+
+Original body.
+"""
+
+    def fake_run_claude_code(prompt, model):
+        assert "<current_body>" in prompt
+        return (
+            "<body># Skill\n\nImproved body.\n</body><summary>body change</summary>"
+            "<deletion_justification></deletion_justification>",
+            "raw result",
+            7,
+        )
+
+    monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code)
+
+    result = generate_variant.generate_variant(
+        target_path="skills/example/SKILL.md",
+        goal="improve behavioral quality",
+        current_content=current_content,
+        failures=[],
+        model=None,
+        optimization_scope="body-only",
+    )
+
+    assert "description: old description" in result["variant"]
+    assert "# Skill\n\nImproved body." in result["variant"]
+    assert "Original body." not in result["variant"]
+
+
+def test_generate_variant_prompt_includes_full_failed_query_and_expectation(monkeypatch):
+    generate_variant = load_module(
+        "agent_comparison_generate_variant_failure_context",
+        "skills/agent-comparison/scripts/generate_variant.py",
+    )
+
+    current_content = """---
+name: example-skill
+description: old description
+---
+
+# Skill
+"""
+
+    captured = {}
+
+    def fake_run_claude_code(prompt, model):
+        captured["prompt"] = prompt
+        return (
+            "<description>updated description</description>"
+            "<summary>improved description</summary><deletion_justification></deletion_justification>",
+            "raw result",
+            4,
+        )
+
+    monkeypatch.setattr(generate_variant, "_run_claude_code", fake_run_claude_code)
+
+    generate_variant.generate_variant(
+        target_path="skills/example/SKILL.md",
+        goal="improve routing precision",
+        current_content=current_content,
+        failures=[
+            {
+                "name": "rubber duck this bug with me, don't solv",
+                "query": "rubber duck this bug with me, don't solve it yet",
+                "should_trigger": True,
+                "details": "trigger_rate=0.00",
+                "trigger_rate": 0.0,
+            }
+        ],
+        model=None,
+    )
+
+    assert "rubber duck this bug with me, don't solve it yet" in captured["prompt"]
+    assert "expected: SHOULD trigger" in captured["prompt"]
+    assert "raw_trigger_rate=0.00" in captured["prompt"]
+
+
 def test_optimize_loop_omits_model_flag_when_not_provided(tmp_path, monkeypatch):
     optimize_loop = load_module(
         "agent_comparison_optimize_loop_nomodel",
@@ -231,7 +424,7 @@ def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None):
     )
 
     assert result["status"] == "CONVERGED"
-    assert "2 rounds without KEEP" in result["exit_reason"]
+    assert "2 rounds without ACCEPT" in result["exit_reason"]
 
 
 def test_optimize_loop_beam_search_retains_top_k_candidates(tmp_path, monkeypatch):
@@ -268,7 +461,9 @@ def fake_run(cmd, capture_output, text, timeout, cwd=None, env=None):
         return subprocess.CompletedProcess(cmd, 0, stdout=json.dumps(payload), stderr="")
 
     def fake_assess_target(path, *args, **kwargs):
-        content = Path(path).read_text()
+        content = kwargs.get("candidate_content")
+        if content is None:
+            content = Path(path).read_text()
         score = 0.0
         if "<!-- alpha -->" in content:
             score = 1.2
@@ -313,3 +508,449 @@ def fake_assess_target(path, *args, **kwargs):
     selected = [it for it in result["iterations"] if it.get("selected_for_frontier")]
     assert len(selected) == 2
     assert selected[0]["frontier_rank"] == 1 or selected[1]["frontier_rank"] == 1
+
+
+def test_composite_score_uses_weighted_dimensions_only_when_hard_gates_pass():
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop_scoring",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+
+    scores = {
+        "parses": True,
+        "compiles": True,
+        "tests_pass": True,
+        "protected_intact": True,
+        "correctness": 7.5,
+        "error_handling": 6.0,
+        "language_idioms": 5.0,
+        "testing": 8.0,
+        "efficiency": 4.0,
+    }
+
+    assert optimize_loop.composite_score(scores) == 6.55
+
+
+def test_composite_score_returns_zero_when_hard_gate_fails():
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop_hard_gate",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+
+    scores = {
+        "parses": False,
+        "compiles": True,
+        "tests_pass": False,
+        "protected_intact": True,
+        "correctness": 10.0,
+        "error_handling": 10.0,
+        "language_idioms": 10.0,
+        "testing": 10.0,
+        "efficiency": 10.0,
+    }
+
+    assert optimize_loop.composite_score(scores) == 0.0
+
+
+def test_assess_target_scores_trigger_rate_results(tmp_path, monkeypatch):
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop_trigger_score",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+
+    target = tmp_path / "SKILL.md"
+    target.write_text("---\ndescription: trigger scoring test\n---\n")
+    tasks = [
+        {"query": "good query", "should_trigger": True},
+        {"query": "bad query", "should_trigger": False},
+    ]
+
+    def fake_run_trigger_rate(*args, **kwargs):
+        return {
+            "summary": {"total": 2, "passed": 1, "failed": 1},
+            "results": [
+                {"query": "good query", "pass": True, "trigger_rate": 1.0},
+                {"query": "bad query", "pass": False, "trigger_rate": 0.0},
+            ],
+        }
+
+    monkeypatch.setattr(optimize_loop, "_run_trigger_rate", fake_run_trigger_rate)
+
+    scores = optimize_loop.assess_target(
+        target,
+        tasks,
+        "improve routing precision",
+        dry_run=False,
+    )
+
+    assert scores["correctness"] == 5.0
+    assert scores["error_handling"] == 4.0
+    assert scores["language_idioms"] == 3.5
+    assert scores["testing"] == 4.0
+    assert scores["efficiency"] == 3.6
+    assert scores["tests_pass"] is False
+    assert [item["passed"] for item in scores["task_results"]] == [True, False]
+    assert scores["task_results"][0]["query"] == "good query"
+    assert scores["task_results"][0]["should_trigger"] is True
+    assert scores["task_results"][1]["query"] == "bad query"
+    assert scores["task_results"][1]["should_trigger"] is False
+    assert optimize_loop.composite_score(scores) == 4.285
+
+
+def test_assess_target_forwards_parallel_workers_for_behavioral_eval(tmp_path, monkeypatch):
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop_behavioral_parallel",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+
+    target = tmp_path / "SKILL.md"
+    target.write_text("---\ndescription: behavioral scoring test\n---\n")
+    tasks = [
+        {"query": "make a skill", "should_trigger": True, "eval_mode": "behavioral"},
+    ]
+    seen = {}
+
+    def fake_run_behavioral_eval(*args, **kwargs):
+        seen["parallel_workers"] = kwargs["parallel_workers"]
+        return [{"query": "make a skill", "pass": True, "triggered": True, "new_artifacts": ["skills/x/SKILL.md"]}]
+
+    monkeypatch.setattr(optimize_loop, "_run_behavioral_eval", fake_run_behavioral_eval)
+
+    scores = optimize_loop.assess_target(
+        target,
+        tasks,
+        "improve routing precision",
+        parallel_eval_workers=3,
+    )
+
+    assert seen["parallel_workers"] == 3
+    assert scores["tests_pass"] is True
+    assert scores["correctness"] == 10.0
+    assert scores["task_results"][0]["query"] == "make a skill"
+    assert scores["task_results"][0]["should_trigger"] is True
+    assert optimize_loop.composite_score(scores) == 8.45
+
+
+def test_assess_target_scores_blind_compare_results(tmp_path, monkeypatch):
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop_blind_compare",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+
+    target = tmp_path / "SKILL.md"
+    target.write_text("---\ndescription: blind compare test\n---\n")
+    tasks = [{"query": "help me debug this", "eval_mode": "blind_compare", "judge": "socratic_question_only"}]
+
+    def fake_run_blind_compare_eval(
+        target_path, candidate_content, tasks, baseline_content=None, timeout=180, verbose=False
+    ):
+        assert baseline_content == "---\ndescription: baseline\n---\n"
+        return [
+            {
+                "query": "help me debug this",
+                "winner": "candidate",
+                "candidate_score": 0.8,
+                "baseline_score": 0.5,
+                "candidate_output": "What changed recently?",
+                "baseline_output": "The issue is probably your env var rename.",
+                "passed": True,
+            }
+        ]
+
+    monkeypatch.setattr(optimize_loop, "_run_blind_compare_eval", fake_run_blind_compare_eval)
+
+    scores = optimize_loop.assess_target(
+        target,
+        tasks,
+        "improve behavioral quality",
+        candidate_content="---\ndescription: candidate\n---\n",
+        baseline_content="---\ndescription: baseline\n---\n",
+    )
+
+    assert scores["correctness"] == 8.0
+    assert scores["testing"] == 8.0
+    assert scores["tests_pass"] is True
+    assert scores["task_results"][0]["winner"] == "candidate"
+
+
+def test_socratic_question_only_heuristic_penalizes_preamble():
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop_socratic_heuristic",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+
+    clean_score, _ = optimize_loop._score_socratic_question_only_output("What did you expect the test to do?")
+    preamble_score, _ = optimize_loop._score_socratic_question_only_output(
+        "Let me read the skill first. What did you expect the test to do?"
+    )
+
+    assert clean_score > preamble_score
+
+
+def test_contains_fallback_contamination_detects_tool_blocked_text():
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop_contamination",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+
+    contaminated, reasons = optimize_loop._contains_fallback_contamination(
+        "The Skill tool was blocked in this session, so I'll guide you through this directly."
+    )
+
+    assert contaminated is True
+    assert "mentioned blocked skill tool" in reasons
+    assert "fell back to direct guidance" in reasons
+
+
+def test_run_blind_compare_zeroes_untriggered_or_contaminated_runs(tmp_path, monkeypatch):
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop_blind_compare_guardrails",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+
+    target = tmp_path / "skills" / "socratic-debugging" / "SKILL.md"
+    target.parent.mkdir(parents=True)
+    target.write_text("---\nname: socratic-debugging\ndescription: test\n---\n")
+
+    monkeypatch.setattr(optimize_loop, "_find_project_root", lambda: tmp_path)
+
+    @contextlib.contextmanager
+    def fake_worktree(_project_root, _relpath, content):
+        worktree = tmp_path / ("candidate" if "candidate" in content else "baseline")
+        worktree.mkdir(exist_ok=True)
+        yield worktree
+
+    monkeypatch.setattr(optimize_loop, "_candidate_worktree", fake_worktree)
+
+    def fake_capture(query, cwd, accepted_skill_ids, timeout=180):
+        if cwd.name == "baseline":
+            return {
+                "output": "What changed recently?",
+                "triggered": False,
+                "contaminated": False,
+                "contamination_reasons": [],
+            }
+        return {
+            "output": "The Skill tool was blocked in this session, so I'll guide you through this directly. What changed recently?",
+            "triggered": True,
+            "contaminated": True,
+            "contamination_reasons": ["mentioned blocked skill tool", "fell back to direct guidance"],
+        }
+
+    monkeypatch.setattr(optimize_loop, "_run_query_capture_output", fake_capture)
+
+    results = optimize_loop._run_blind_compare_eval(
+        target,
+        "---\nname: socratic-debugging\ndescription: candidate\n# candidate\n",
+        [{"query": "help me debug", "eval_mode": "blind_compare", "judge": "socratic_question_only"}],
+        baseline_content="---\nname: socratic-debugging\ndescription: baseline\n# baseline\n",
+    )
+
+    assert results[0]["baseline_score"] == 0.0
+    assert results[0]["candidate_score"] == 0.0
+    assert results[0]["baseline_triggered"] is False
+    assert results[0]["candidate_contaminated"] is True
+    assert results[0]["winner"] == "tie"
+    assert results[0]["baseline_reasons"][0] == "target skill did not trigger"
+    assert results[0]["candidate_reasons"][0] == "mentioned blocked skill tool"
+
+
+def test_run_optimization_loop_forwards_parallel_eval_to_assessments(tmp_path, monkeypatch):
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop_parallel_forwarding",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+
+    target = tmp_path / "SKILL.md"
+    target.write_text("---\nname: test-skill\ndescription: test description\nversion: 1.0.0\n---\n")
+    tasks_file = tmp_path / "tasks.json"
+    tasks_file.write_text(
+        json.dumps(
+            {
+                "tasks": [
+                    {
+                        "name": "train-positive",
+                        "query": "make a skill",
+                        "should_trigger": True,
+                        "eval_mode": "behavioral",
+                        "split": "train",
+                    },
+                    {
+                        "name": "test-negative",
+                        "query": "debug kubernetes",
+                        "should_trigger": False,
+                        "eval_mode": "behavioral",
+                        "split": "test",
+                    },
+                ]
+            }
+        )
+    )
+
+    calls = []
+
+    def fake_assess_target(
+        path,
+        tasks,
+        goal,
+        verbose=False,
+        dry_run=False,
+        behavioral_runs_per_task=1,
+        behavioral_trigger_threshold=0.5,
+        parallel_eval_workers=0,
+        candidate_content=None,
+        baseline_content=None,
+        eval_mode="auto",
+    ):
+        calls.append(
+            {
+                "path": str(path),
+                "task_count": len(tasks),
+                "parallel_eval_workers": parallel_eval_workers,
+                "candidate_content": candidate_content,
+                "baseline_content": baseline_content,
+                "eval_mode": eval_mode,
+            }
+        )
+        return {
+            "parses": True,
+            "compiles": True,
+            "tests_pass": True,
+            "protected_intact": True,
+            "correctness": 10.0,
+            "error_handling": 8.0,
+            "language_idioms": 7.0,
+            "testing": 8.0,
+            "efficiency": 6.0,
+            "task_results": [{"name": "task", "passed": True}],
+        }
+
+    monkeypatch.setattr(optimize_loop, "assess_target", fake_assess_target)
+
+    result = optimize_loop.run_optimization_loop(
+        target_path=target,
+        goal="improve routing precision",
+        benchmark_tasks_path=tasks_file,
+        max_iterations=1,
+        min_gain=0.0,
+        train_split=0.6,
+        model=None,
+        output_dir=tmp_path / "out",
+        report_path=tmp_path / "out" / "report.html",
+        verbose=False,
+        dry_run=True,
+        parallel_eval=2,
+    )
+
+    assert result["status"] in {"COMPLETE", "CONVERGED"}
+    assert calls
+    assert all(call["parallel_eval_workers"] == 2 for call in calls)
+    assert all(call["candidate_content"] is not None for call in calls)
+    assert any(call["baseline_content"] is not None for call in calls[1:])
+    assert all(call["eval_mode"] == "auto" for call in calls)
+
+
+def test_tiny_end_to_end_autoresearch_improves_real_weak_skill_copy(tmp_path, monkeypatch):
+    optimize_loop = load_module(
+        "agent_comparison_optimize_loop_e2e",
+        "skills/agent-comparison/scripts/optimize_loop.py",
+    )
+    generate_variant = load_module(
+        "agent_comparison_generate_variant_e2e",
+        "skills/agent-comparison/scripts/generate_variant.py",
+    )
+
+    source_skill = REPO_ROOT / "skills" / "socratic-debugging" / "SKILL.md"
+    target = tmp_path / "SKILL.md"
+    target.write_text(source_skill.read_text())
+
+    trigger_query = "help me think through this bug step by step"
+    tasks_file = tmp_path / "tasks.json"
+    tasks_file.write_text(
+        json.dumps({"tasks": [{"name": "positive", "query": trigger_query, "should_trigger": True, "split": "train"}]})
+    )
+
+    def fake_generate_variant_output(
+        current_content,
+        target_path,
+        goal,
+        last_failures,
+        history,
+        model,
+        dry_run,
+        iteration_number,
+        optimization_scope,
+        diversification_note=None,
+    ):
+        improved_description = (
+            "Question-only debugging mode that guides users to find root causes through structured questions. "
+            f'Use when: "{trigger_query}", "rubber duck debug with me", "help me think through this bug".'
+        )
+        return {
+            "variant": generate_variant.replace_description(current_content, improved_description),
+            "summary": "Added exact positive trigger phrase to the description.",
+            "reasoning": "Deterministic test variant",
+            "tokens_used": 0,
+            "deletions": [],
+            "deletion_justification": "",
+        }
+
+    def fake_run_trigger_rate(
+        target_path,
+        description,
+        tasks,
+        candidate_content=None,
+        eval_mode="auto",
+        num_workers=5,
+        timeout=30,
+        verbose=False,
+    ):
+        passed = trigger_query in description
+        return {
+            "results": [
+                {
+                    "query": trigger_query,
+                    "pass": passed,
+                    "trigger_rate": 1.0 if passed else 0.0,
+                }
+            ],
+            "summary": {
+                "total": 1,
+                "passed": 1 if passed else 0,
+                "failed": 0 if passed else 1,
+            },
+        }
+
+    monkeypatch.setattr(optimize_loop, "_generate_variant_output", fake_generate_variant_output)
+    monkeypatch.setattr(optimize_loop, "_run_trigger_rate", fake_run_trigger_rate)
+
+    out_dir = tmp_path / "out"
+    result = optimize_loop.run_optimization_loop(
+        target_path=target,
+        goal="improve routing precision",
+        benchmark_tasks_path=tasks_file,
+        max_iterations=1,
+        min_gain=0.0,
+        train_split=0.6,
+        model=None,
+        output_dir=out_dir,
+        report_path=out_dir / "report.html",
+        verbose=False,
+        dry_run=False,
+    )
+
+    assert result["best_iteration"] == 1
+    assert result["improvements_found"] == 1
+    assert result["baseline_train_score"] == 0.06
+    assert result["best_score"] == 8.45
+
+    results_json = json.loads((out_dir / "results.json").read_text())
+    assert results_json["best_iteration"] == 1
+    assert results_json["iterations"][0]["verdict"] == "ACCEPT"
+
+    best_variant = (out_dir / "best_variant.md").read_text()
+    assert trigger_query in generate_variant.extract_description(best_variant)
+
+    verdict_json = json.loads((out_dir / "001" / "verdict.json").read_text())
+    assert verdict_json["verdict"] == "ACCEPT"
+    assert verdict_json["composite_score"] == 8.45
diff --git a/scripts/tests/test_skill_eval_claude_code.py b/scripts/tests/test_skill_eval_claude_code.py
index a0c9e05c..25aa844c 100644
--- a/scripts/tests/test_skill_eval_claude_code.py
+++ b/scripts/tests/test_skill_eval_claude_code.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
 import json
+import os
 import subprocess
+from contextlib import contextmanager
 from pathlib import Path
 
 
@@ -46,3 +48,435 @@ def fake_run(cmd, capture_output, text, cwd, env, timeout):
     transcript = json.loads((tmp_path / "improve_iter_1.json").read_text())
     assert transcript["raw_result_text"] == "raw result"
     assert transcript["rewrite_raw_result_text"] == "raw result"
+
+
+class _FakeUUID:
+    hex = "deadbeefcafebabe"
+
+
+class _FakePopen:
+    def __init__(self, stdout_bytes: bytes):
+        read_fd, write_fd = os.pipe()
+        os.write(write_fd, stdout_bytes)
+        os.close(write_fd)
+        self.stdout = os.fdopen(read_fd, "rb", buffering=0)
+        self._returncode = None
+
+    def poll(self):
+        return self._returncode
+
+    def kill(self):
+        self._returncode = -9
+
+    def wait(self):
+        return self._returncode
+
+
+def test_run_single_query_ignores_unrelated_stream_tool_use_before_matching_read(monkeypatch, tmp_path):
+    from scripts.skill_eval import run_eval as mod
+
+    clean_name = "demo-skill-skill-deadbeef"
+    stream_lines = [
+        {
+            "type": "stream_event",
+            "event": {"type": "content_block_start", "content_block": {"type": "tool_use", "name": "Bash"}},
+        },
+        {
+            "type": "stream_event",
+            "event": {"type": "content_block_start", "content_block": {"type": "tool_use", "name": "Read"}},
+        },
+        {
+            "type": "stream_event",
+            "event": {
+                "type": "content_block_delta",
+                "delta": {
+                    "type": "input_json_delta",
+                    "partial_json": f'{{"file_path":"/tmp/project/.claude/commands/{clean_name}.md"}}',
+                },
+            },
+        },
+        {"type": "stream_event", "event": {"type": "content_block_stop"}},
+        {"type": "result"},
+    ]
+    payload = ("\n".join(json.dumps(line) for line in stream_lines) + "\n").encode()
+
+    monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID())
+    monkeypatch.setattr(mod.subprocess, "Popen", lambda *_args, **_kwargs: _FakePopen(payload))
+    monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], []))
+
+    triggered = mod.run_single_query(
+        query="help me debug this",
+        skill_name="demo-skill",
+        skill_description="demo description",
+        timeout=5,
+        project_root=str(tmp_path),
+        eval_mode="registered",
+    )
+
+    assert triggered is True
+
+
+def test_run_single_query_scans_all_assistant_tool_uses_before_returning(monkeypatch, tmp_path):
+    from scripts.skill_eval import run_eval as mod
+
+    clean_name = "demo-skill-skill-deadbeef"
+    assistant_lines = [
+        {
+            "type": "assistant",
+            "message": {
+                "content": [
+                    {"type": "tool_use", "name": "Bash", "input": {"command": "echo hi"}},
+                    {
+                        "type": "tool_use",
+                        "name": "Read",
+                        "input": {"file_path": f"/tmp/project/.claude/commands/{clean_name}.md"},
+                    },
+                ]
+            },
+        },
+        {"type": "result"},
+    ]
+    payload = ("\n".join(json.dumps(line) for line in assistant_lines) + "\n").encode()
+
+    monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID())
+    monkeypatch.setattr(mod.subprocess, "Popen", lambda *_args, **_kwargs: _FakePopen(payload))
+    monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], []))
+
+    triggered = mod.run_single_query(
+        query="help me debug this",
+        skill_name="demo-skill",
+        skill_description="demo description",
+        timeout=5,
+        project_root=str(tmp_path),
+        eval_mode="registered",
+    )
+
+    assert triggered is True
+
+
+def test_run_single_query_accepts_real_skill_name_not_just_temporary_alias(monkeypatch, tmp_path):
+    from scripts.skill_eval import run_eval as mod
+
+    assistant_lines = [
+        {
+            "type": "assistant",
+            "message": {
+                "content": [
+                    {
+                        "type": "tool_use",
+                        "name": "Skill",
+                        "input": {"skill": "demo-skill"},
+                    }
+                ]
+            },
+        },
+        {"type": "result"},
+    ]
+    payload = ("\n".join(json.dumps(line) for line in assistant_lines) + "\n").encode()
+
+    monkeypatch.setattr(mod.uuid, "uuid4", lambda: _FakeUUID())
+    monkeypatch.setattr(mod.subprocess, "Popen", lambda *_args, **_kwargs: _FakePopen(payload))
+    monkeypatch.setattr(mod.select, "select", lambda readables, *_args: (readables, [], []))
+
+    triggered = mod.run_single_query(
+        query="help me debug this",
+        skill_name="demo-skill",
+        skill_description="demo description",
+        timeout=5,
+        project_root=str(tmp_path),
+        eval_mode="registered",
+    )
+
+    assert triggered is True
+
+
+def test_resolve_registered_skill_relpath_accepts_repo_skill(tmp_path):
+    from scripts.skill_eval import run_eval as mod
+
+    project_root = tmp_path
+    skill_dir = project_root / "skills" / "demo-skill"
+    skill_dir.mkdir(parents=True)
+    (skill_dir / "SKILL.md").write_text("---\nname: demo-skill\ndescription: demo\n---\n")
+
+    relpath = mod.resolve_registered_skill_relpath(skill_dir, project_root)
+
+    assert relpath == Path("skills/demo-skill/SKILL.md")
+
+
+def test_replace_description_in_skill_md_rewrites_frontmatter_block_scalar():
+    from scripts.skill_eval import run_eval as mod
+
+    original = """---
+name: demo-skill
+description: |
+  old description
+version: 1.0.0
+---
+
+# Skill
+"""
+
+    updated = mod.replace_description_in_skill_md(original, "new description line 1\nnew description line 2")
+
+    assert "description: |\n  new description line 1\n  new description line 2\nversion: 1.0.0" in updated
+    assert "# Skill" in updated
+
+
+def test_load_eval_set_accepts_common_wrapped_formats(tmp_path):
+    from scripts.skill_eval import run_eval as mod
+
+    tasks_path = tmp_path / "tasks.json"
+    tasks_path.write_text(json.dumps({"tasks": [{"query": "q1", "should_trigger": True}]}))
+    queries_path = tmp_path / "queries.json"
+    queries_path.write_text(json.dumps({"queries": [{"query": "q2", "should_trigger": False}]}))
+    split_path = tmp_path / "split.json"
+    split_path.write_text(
+        json.dumps(
+            {
+                "train": [{"query": "q3", "should_trigger": True}],
+                "test": [{"query": "q4", "should_trigger": False}],
+            }
+        )
+    )
+
+    assert mod.load_eval_set(tasks_path) == [{"query": "q1", "should_trigger": True}]
+    assert mod.load_eval_set(queries_path) == [{"query": "q2", "should_trigger": False}]
+    assert mod.load_eval_set(split_path) == [
+        {"query": "q3", "should_trigger": True},
+        {"query": "q4", "should_trigger": False},
+    ]
+
+
+def test_run_eval_auto_uses_registered_worktree_for_repo_skill(monkeypatch, tmp_path):
+    from scripts.skill_eval import run_eval as mod
+
+    skill_dir = tmp_path / "skills" / "demo-skill"
+    skill_dir.mkdir(parents=True)
+    (skill_dir / "SKILL.md").write_text("---\nname: demo-skill\ndescription: demo\n---\n")
+    worktree_root = tmp_path / "worktree"
+    worktree_root.mkdir()
+
+    seen = {"candidate_content": None, "submitted": []}
+
+    @contextmanager
+    def fake_candidate_worktree(project_root, registered_skill_relpath, candidate_content):
+        seen["candidate_content"] = candidate_content
+        seen["registered_skill_relpath"] = registered_skill_relpath
+        yield worktree_root
+
+    class _FakeFuture:
+        def __init__(self, value):
+            self._value = value
+
+        def result(self):
+            return self._value
+
+    class _FakeExecutor:
+        def __init__(self, max_workers):
+            self.max_workers = max_workers
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc, tb):
+            return False
+
+        def submit(self, fn, *args):
+            seen["submitted"].append(args)
+            return _FakeFuture(True)
+
+    monkeypatch.setattr(mod, "candidate_worktree", fake_candidate_worktree)
+    monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor)
+    monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures))
+
+    result = mod.run_eval(
+        eval_set=[{"query": "help me debug this", "should_trigger": True}],
+        skill_name="demo-skill",
+        description="demo description",
+        num_workers=1,
+        timeout=5,
+        project_root=tmp_path,
+        eval_mode="auto",
+        skill_path=skill_dir,
+        candidate_content="candidate body",
+    )
+
+    assert seen["candidate_content"] == "candidate body"
+    assert seen["registered_skill_relpath"] == Path("skills/demo-skill/SKILL.md")
+    assert seen["submitted"]
+    _, _, _, _, submitted_project_root, submitted_eval_mode, _ = seen["submitted"][0]
+    assert submitted_project_root == str(worktree_root)
+    assert submitted_eval_mode == "registered"
+    assert result["summary"]["passed"] == 1
+
+
+def test_run_eval_registered_mode_patches_candidate_from_description_override(monkeypatch, tmp_path):
+    from scripts.skill_eval import run_eval as mod
+
+    skill_dir = tmp_path / "skills" / "demo-skill"
+    skill_dir.mkdir(parents=True)
+    original_content = """---
+name: demo-skill
+description: old description
+version: 1.0.0
+---
+
+# Skill
+"""
+    (skill_dir / "SKILL.md").write_text(original_content)
+    seen = {"candidate_content": None}
+
+    @contextmanager
+    def fake_candidate_worktree(project_root, registered_skill_relpath, candidate_content):
+        seen["candidate_content"] = candidate_content
+        yield tmp_path / "worktree"
+
+    class _FakeFuture:
+        def __init__(self, value):
+            self._value = value
+
+        def result(self):
+            return self._value
+
+    class _FakeExecutor:
+        def __init__(self, max_workers):
+            self.max_workers = max_workers
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc, tb):
+            return False
+
+        def submit(self, fn, *args):
+            return _FakeFuture(True)
+
+    monkeypatch.setattr(mod, "candidate_worktree", fake_candidate_worktree)
+    monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor)
+    monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures))
+
+    mod.run_eval(
+        eval_set=[{"query": "help me debug this", "should_trigger": True}],
+        skill_name="demo-skill",
+        description="new description",
+        num_workers=1,
+        timeout=5,
+        project_root=tmp_path,
+        eval_mode="registered",
+        skill_path=skill_dir,
+        candidate_content=None,
+    )
+
+    assert seen["candidate_content"] is not None
+    assert "description: |\n  new description\nversion: 1.0.0" in seen["candidate_content"]
+
+
+def test_run_eval_registered_mode_patches_current_working_copy_when_no_override(monkeypatch, tmp_path):
+    from scripts.skill_eval import run_eval as mod
+
+    skill_dir = tmp_path / "skills" / "demo-skill"
+    skill_dir.mkdir(parents=True)
+    original_content = """---
+name: demo-skill
+description: current working copy description
+version: 1.0.0
+---
+
+# Skill
+"""
+    (skill_dir / "SKILL.md").write_text(original_content)
+    seen = {"candidate_content": None}
+
+    @contextmanager
+    def fake_candidate_worktree(project_root, registered_skill_relpath, candidate_content):
+        seen["candidate_content"] = candidate_content
+        yield tmp_path / "worktree"
+
+    class _FakeFuture:
+        def __init__(self, value):
+            self._value = value
+
+        def result(self):
+            return self._value
+
+    class _FakeExecutor:
+        def __init__(self, max_workers):
+            self.max_workers = max_workers
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc, tb):
+            return False
+
+        def submit(self, fn, *args):
+            return _FakeFuture(True)
+
+    monkeypatch.setattr(mod, "candidate_worktree", fake_candidate_worktree)
+    monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor)
+    monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures))
+
+    mod.run_eval(
+        eval_set=[{"query": "help me debug this", "should_trigger": True}],
+        skill_name="demo-skill",
+        description="current working copy description",
+        num_workers=1,
+        timeout=5,
+        project_root=tmp_path,
+        eval_mode="registered",
+        skill_path=skill_dir,
+        candidate_content=None,
+    )
+
+    assert seen["candidate_content"] == original_content
+
+
+def test_run_eval_auto_falls_back_to_alias_for_non_registered_skill(monkeypatch, tmp_path):
+    from scripts.skill_eval import run_eval as mod
+
+    skill_dir = tmp_path / "scratch" / "demo-skill"
+    skill_dir.mkdir(parents=True)
+    (skill_dir / "SKILL.md").write_text("---\nname: demo-skill\ndescription: demo\n---\n")
+
+    seen_submissions = []
+
+    class _FakeFuture:
+        def __init__(self, value):
+            self._value = value
+
+        def result(self):
+            return self._value
+
+    class _FakeExecutor:
+        def __init__(self, max_workers):
+            self.max_workers = max_workers
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc, tb):
+            return False
+
+        def submit(self, fn, *args):
+            seen_submissions.append(args)
+            return _FakeFuture(False)
+
+    monkeypatch.setattr(mod, "ProcessPoolExecutor", _FakeExecutor)
+    monkeypatch.setattr(mod, "as_completed", lambda futures: list(futures))
+
+    result = mod.run_eval(
+        eval_set=[{"query": "help me debug this", "should_trigger": True}],
+        skill_name="demo-skill",
+        description="demo description",
+        num_workers=1,
+        timeout=5,
+        project_root=tmp_path,
+        eval_mode="auto",
+        skill_path=skill_dir,
+    )
+
+    assert seen_submissions
+    _, _, _, _, submitted_project_root, submitted_eval_mode, _ = seen_submissions[0]
+    assert submitted_project_root == str(tmp_path)
+    assert submitted_eval_mode == "alias"
+    assert result["summary"]["passed"] == 0
diff --git a/skills/agent-comparison/SKILL.md b/skills/agent-comparison/SKILL.md
index 21e8c150..0c83c132 100644
--- a/skills/agent-comparison/SKILL.md
+++ b/skills/agent-comparison/SKILL.md
@@ -308,15 +308,15 @@ The loop automatically evaluates the unmodified target against the train set bef
 **Step 4: Enter optimization loop**
 
 The `optimize_loop.py` script handles the full loop:
-- Calls `generate_variant.py` to propose changes through `claude -p`
+- Calls `generate_variant.py` to propose a new frontmatter `description` (or, with `--optimization-scope body-only`, a new body) through `claude -p`
 - Evaluates each variant against train tasks
 - Runs either:
   - single-path hill climbing: `--beam-width 1 --candidates-per-parent 1`
   - beam search with top-K retention: keep the best `K` improving candidates each round
-- Keeps variants that beat their parent by more than `--min-gain` (default 0.02)
-- Reverts variants that don't improve, break hard gates, or delete sections without justification
+- Accepts variants that beat their parent by more than `--min-gain` (default 0.02)
+- Rejects variants that don't improve or break hard gates
 - Checks held-out test set every `--holdout-check-cadence` rounds for Goodhart divergence
-- Stops on convergence (`--revert-streak-limit` rounds without any KEEP), Goodhart alarm, or max iterations
+- Stops on convergence (`--revert-streak-limit` rounds without any ACCEPT), Goodhart alarm, or max iterations
 
 ```bash
 python3 skills/agent-comparison/scripts/optimize_loop.py \
@@ -340,23 +340,33 @@ Omit `--model` to use Claude Code's configured default model, or pass it explici
 The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and review/export controls.
 
 Recommended modes:
-- Fast single-path optimization: `--beam-width 1 --candidates-per-parent 1`
+- Short default optimization: default flags only
+- Fast single-path optimization: `--beam-width 1 --candidates-per-parent 1 --max-iterations 3 --revert-streak-limit 3`
 - True autoresearch sweep: `--max-iterations 20 --beam-width 3 --candidates-per-parent 2 --revert-streak-limit 20`
 - Conservative search with strict keeps: raise `--min-gain` above `0.02`
 - Exploratory search that accepts small wins: use `--min-gain 0.0`
 
+Live eval defaults are intentionally short:
+- one optimization round
+- one trigger-eval run per query
+- one trigger-eval worker
+- no holdout cadence unless explicitly requested
+
+For real repo skills at `skills/<name>/SKILL.md`, the live evaluator now prefers an isolated git worktree so the candidate content is scored at the real skill path. This is the default `--eval-mode auto` behavior and avoids scoring the installed skill instead of the candidate.
+The registered-skill path also evaluates the current working copy, not just `HEAD`, so local uncommitted edits are measured correctly.
+
 **Step 5: Present results in UI**
 
 Open the generated `optimization-report.html` in a browser. The report shows:
-- Progress dashboard (status, baseline vs best, kept/reverted counts)
+- Progress dashboard (status, baseline vs best, accepted/rejected counts)
 - Convergence chart (train solid line, held-out dashed line, baseline dotted)
 - Iteration table with verdict, composite score, delta, and change summary
 - Expandable inline diffs per iteration (click any row)
 
-**Step 6: Review kept snapshots**
+**Step 6: Review accepted snapshots**
 
-Not all KEEP iterations are real improvements — some may be harness artifacts. The user reviews the kept iterations as candidate snapshots from the original target:
-- Inspect each kept iteration's diff in the report
+Not all ACCEPT iterations are real improvements — some may be harness artifacts. The user reviews the accepted iterations as candidate snapshots from the original target:
+- Inspect each accepted iteration's diff in the report
 - Use "Preview Selected Snapshot" only as a comparison aid in the UI
 - Use "Export Selected" to download a review JSON describing the selected snapshot diff
 - In beam mode, review the retained frontier candidates first; they are the strongest candidates from the latest round
@@ -365,15 +375,18 @@ Not all KEEP iterations are real improvements — some may be harness artifacts.
 
 Apply one reviewed improvement to the original target file.
 
-- If you want the best single kept variant, use `evals/iterations/best_variant.md`.
-- Beam search still writes a single `best_variant.md`: the highest-scoring kept candidate seen anywhere in the run.
-- If you exported selected diffs, treat that JSON as review material only. It is not auto-applied by the current tooling, and the current workflow does not support merging multiple kept diffs into a generated patch.
+- If you want the best single accepted variant, use `evals/iterations/best_variant.md`.
+- Beam search still writes a single `best_variant.md`: the highest-scoring accepted candidate seen anywhere in the run.
+- Choose the `--optimization-scope` deliberately:
+  - `description-only` for routing-trigger work
+  - `body-only` for behavioral work on the skill instructions themselves
+- If you exported selected diffs, treat that JSON as review material only. It is not auto-applied by the current tooling, and the current workflow does not support merging multiple accepted diffs into a generated patch.
 
 ```bash
-# Review the best kept variant before applying
+# Review the best accepted variant before applying
 cat evals/iterations/best_variant.md | head -20
 
-# Replace the target with the best kept variant
+# Replace the target with the best accepted variant
 cp evals/iterations/best_variant.md skills/{target}/SKILL.md
 ```
 
@@ -397,11 +410,30 @@ Compare final scores to the baseline to confirm net improvement. In beam mode, t
 python3 scripts/learning-db.py learn \
     --skill agent-comparison \
     "autoresearch: {target} improved {baseline}→{best} over {iterations} iterations. \
-     Kept: {kept}/{total}. Stop: {reason}. Changes: {summaries}"
+     Accepted: {accepted}/{total}. Stop: {reason}. Changes: {summaries}"
 ```
 
 **Gate**: Optimization complete. Results reviewed. Cherry-picked improvements applied and verified against full task set. Results recorded.
 
+### Current Reality Check
+
+The current optimizer is in a solid state for:
+- deterministic proof runs
+- isolated live evaluation of existing registered skills
+- short live optimization of `read-only-ops`, with the accepted description change now applied and validated against `references/read-only-ops-short-tasks.json`
+- short live body evaluation of `socratic-debugging` against `references/socratic-debugging-body-short-tasks.json`,
+  which now produces clean skill-triggered first-turn outputs instead of fallback chatter
+
+One live-harness caveat remains:
+- temporary renamed skill copies do not yet show reliable live trigger improvements through the dynamic command alias path
+
+That caveat does not affect deterministic proof runs or live checks against existing registered skills, but it does mean the current system is stronger for optimizing real in-repo skills than arbitrary renamed temp clones.
+
+For body optimization runs, the blind evaluator now rejects responses that:
+- never triggered the target skill
+- mention blocked skill/tool access
+- fall back into generic "I'll guide you directly" behavior
+
 ### Optional Extensions
 
 These are off by default. Enable explicitly when needed:
diff --git a/skills/agent-comparison/references/optimization-guide.md b/skills/agent-comparison/references/optimization-guide.md
index 3aa0f6a8..7d689e2c 100644
--- a/skills/agent-comparison/references/optimization-guide.md
+++ b/skills/agent-comparison/references/optimization-guide.md
@@ -80,8 +80,29 @@ Explicit train/test sets:
 If no split markers are present, the loop performs a reproducible random split
 using `--train-split` and seed `42`.
 
+`run_eval.py` now accepts the same common task-file wrappers:
+
+- raw list: `[{"query": "...", "should_trigger": true}]`
+- task wrapper: `{"tasks": [...]}`
+- query wrapper: `{"queries": [...]}`
+- split wrapper: `{"train": [...], "test": [...]}`
+
 ## Command
 
+Short default run:
+
+```bash
+python3 skills/agent-comparison/scripts/optimize_loop.py \
+  --target skills/go-testing/SKILL.md \
+  --goal "improve routing precision without losing recall" \
+  --benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \
+  --report optimization-report.html \
+  --output-dir evals/iterations \
+  --verbose
+```
+
+Longer search:
+
 ```bash
 python3 skills/agent-comparison/scripts/optimize_loop.py \
   --target skills/go-testing/SKILL.md \
@@ -106,20 +127,45 @@ Useful flags:
 - `--dry-run`: exercise the loop mechanics without calling Claude Code
 - `--report`: write a live HTML report
 - `--output-dir`: persist iteration snapshots and `results.json`
+- `--eval-mode auto|registered|alias`: choose how live trigger eval is isolated
 - `--beam-width`: retain the best K improving candidates per round
 - `--candidates-per-parent`: generate multiple sibling variants from each frontier candidate
-- `--revert-streak-limit`: stop after N rounds without any KEEP candidates
+- `--revert-streak-limit`: stop after N rounds without any ACCEPT candidates
 - `--holdout-check-cadence`: evaluate the global best on held-out tasks every N rounds
+- `--parallel-eval N`: run behavioral eval tasks in parallel isolated worktrees
+
+Defaults (intentionally short):
+
+- `--max-iterations 1`
+- `--revert-streak-limit 1`
+- `--holdout-check-cadence 0`
+- trigger eval `--num-workers 1`
+- trigger eval `--runs-per-query 1`
 
 Recommended search presets:
 
+- Short proof run:
+  - default flags only
 - Single-path local search:
-  - `--beam-width 1 --candidates-per-parent 1`
+  - `--beam-width 1 --candidates-per-parent 1 --max-iterations 3 --revert-streak-limit 3`
 - Balanced beam search:
   - `--beam-width 3 --candidates-per-parent 2`
 - Aggressive exploration:
   - `--beam-width 5 --candidates-per-parent 3 --min-gain 0.0`
 
+## Live Eval Isolation Modes
+
+`run_eval.py` now has three modes:
+
+- `auto`: default. If the target is a real repo skill at `skills/<name>/SKILL.md`, live eval runs in an isolated git worktree with the candidate content patched into the real path. Otherwise it falls back to alias mode.
+- `registered`: force isolated worktree evaluation of a real registered skill.
+- `alias`: force legacy dynamic command-file evaluation.
+
+For real registered skills, `auto` is the preferred mode. It prevents the evaluator
+from accidentally scoring the installed skill instead of the candidate under test.
+It also patches the current working-copy skill content into the isolated worktree,
+so local uncommitted edits are evaluated correctly.
+
 ## Evaluation Model
 
 The loop follows the ADR-131 structure:
@@ -131,11 +177,10 @@ The loop follows the ADR-131 structure:
 
 ### Layer 1: Hard Gates
 
-An iteration is rejected immediately if any of these fail:
+An iteration is rejected immediately if any of these mechanical validity gates fail:
 
 - `parses`
 - `compiles`
-- `tests_pass`
 - `protected_intact`
 
 For description optimization, `parses` and `protected_intact` are the most
@@ -144,9 +189,13 @@ preserved verbatim.
 
 ### Layer 2: Composite Score
 
-The loop converts trigger-rate evaluation results into a weighted composite
-score using the built-in weights in `optimize_loop.py`. A candidate is kept only
-if it beats its parent by more than `--min-gain`.
+The loop converts evaluation results into a weighted composite score using the
+built-in weights in `optimize_loop.py`. Task accuracy affects the component
+dimensions (`correctness`, `error_handling`, `language_idioms`, `testing`,
+`efficiency`) without zeroing the entire score. This preserves optimization
+signal for incremental improvements when a task set is not yet perfect.
+
+A candidate is accepted only if it beats its parent by more than `--min-gain`.
 
 ### Layer 3: Held-Out Regression Check
 
@@ -161,21 +210,26 @@ When beam search is enabled:
 
 - each frontier candidate generates `--candidates-per-parent` siblings
 - every sibling is scored independently
-- the top `--beam-width` KEEP candidates become the next frontier
+- the top `--beam-width` ACCEPT candidates become the next frontier
 - `best_variant.md` still tracks the single best candidate seen anywhere in the run
 
 When `--beam-width 1 --candidates-per-parent 1`, the behavior collapses back to
 the original single-path optimizer.
 
-## Deletion Safety Rule
+## Optimization Scopes
+
+The optimizer supports two mutation scopes:
 
-Deleting sections is allowed only with explicit justification.
+- `description-only`: replace only the YAML frontmatter `description`
+- `body-only`: replace only the markdown body below the frontmatter
 
-- `generate_variant.py` detects removed `##` headings
-- the model must return a `deletion_justification`
-- `optimize_loop.py` rejects deletions without one
+`generate_variant.py` reconstructs the full file around the selected scope so
+the unchanged parts stay intact. Use `description-only` for routing-trigger
+work and `body-only` for behavioral work judged from the skill's actual output.
 
-This enforces ADR-131's "no deletion without justification" rule.
+For body optimization, pair `--optimization-scope body-only` with
+`blind_compare` tasks so generation and evaluation are measuring the same
+surface area.
 
 ## Iteration Artifacts
 
@@ -193,10 +247,54 @@ When `--output-dir` is set, the loop writes:
 
 When `--report` is set, it also writes a live HTML dashboard showing:
 
-- status, baseline, best score, kept/reverted counts
+- status, baseline, best score, accepted/rejected counts
 - convergence chart
 - iteration table with diffs
-- review/export controls for kept snapshot diffs from the original target
+- review/export controls for accepted snapshot diffs from the original target
+
+## Current Validation Status
+
+What is currently demonstrated:
+- deterministic end-to-end improvement runs with readable artifacts
+- isolated live optimization for existing registered skills via temporary git worktrees
+- blind body-eval runs that require actual skill-trigger evidence before scoring
+- score calculations and accept/reject decisions that match the weighted rubric
+- short live proof on `skills/read-only-ops/SKILL.md` using
+  `references/read-only-ops-short-tasks.json`, improving from one failed positive
+  to `2/2` live passes after the accepted description update
+- short live body benchmark on `skills/socratic-debugging/SKILL.md` using
+  `references/socratic-debugging-body-short-tasks.json`, where the current
+  baseline now evaluates cleanly and non-improving body variants are rejected
+
+What remains imperfect:
+- live optimization of temporary renamed skill copies still fails to show measured improvement through the dynamic command alias path
+
+So the current tooling is operational for real registered skills and deterministic proof runs, but not yet fully proven for arbitrary temporary renamed clones.
+
+## Short Live Commands
+
+Routing optimization on a real registered skill:
+
+```bash
+python3 skills/agent-comparison/scripts/optimize_loop.py \
+  --target skills/read-only-ops/SKILL.md \
+  --goal "Improve read-only routing precision for realistic user prompts." \
+  --benchmark-tasks skills/agent-comparison/references/read-only-ops-short-tasks.json
+```
+
+Body optimization on a real registered skill:
+
+```bash
+python3 skills/agent-comparison/scripts/optimize_loop.py \
+  --target skills/socratic-debugging/SKILL.md \
+  --goal "Improve the first response so it asks exactly one question, avoids direct diagnosis, avoids code examples, and does not add tool-permission preamble." \
+  --benchmark-tasks skills/agent-comparison/references/socratic-debugging-body-short-tasks.json \
+  --optimization-scope body-only
+```
+
+The blind body path now fails closed: if the intended skill does not trigger, or
+the response falls back into tool-blocked/direct-guidance chatter, the run is
+scored as a failure instead of being treated as a weak improvement.
 
 ## Choosing Good Eval Tasks
 
diff --git a/skills/agent-comparison/references/read-only-ops-short-tasks.json b/skills/agent-comparison/references/read-only-ops-short-tasks.json
new file mode 100644
index 00000000..dad16d94
--- /dev/null
+++ b/skills/agent-comparison/references/read-only-ops-short-tasks.json
@@ -0,0 +1,16 @@
+{
+  "tasks": [
+    {
+      "name": "positive-read-only-report",
+      "query": "inspect this repository and report back without changing anything",
+      "should_trigger": true,
+      "split": "train"
+    },
+    {
+      "name": "negative-fix-tests",
+      "query": "fix the failing tests in this repository",
+      "should_trigger": false,
+      "split": "train"
+    }
+  ]
+}
diff --git a/skills/agent-comparison/references/socratic-debugging-body-short-tasks.json b/skills/agent-comparison/references/socratic-debugging-body-short-tasks.json
new file mode 100644
index 00000000..457b7240
--- /dev/null
+++ b/skills/agent-comparison/references/socratic-debugging-body-short-tasks.json
@@ -0,0 +1,12 @@
+{
+  "tasks": [
+    {
+      "name": "socratic-first-turn",
+      "query": "Help me think through this bug. My Python script sometimes returns None instead of a dict when the cache is warm. Please do not solve it for me directly.",
+      "eval_mode": "blind_compare",
+      "judge": "heuristic_socratic_debugging",
+      "min_score": 0.7,
+      "split": "train"
+    }
+  ]
+}
diff --git a/skills/agent-comparison/references/socratic-debugging-trigger-tasks.json b/skills/agent-comparison/references/socratic-debugging-trigger-tasks.json
new file mode 100644
index 00000000..ee4ce4b7
--- /dev/null
+++ b/skills/agent-comparison/references/socratic-debugging-trigger-tasks.json
@@ -0,0 +1,98 @@
+[
+  {
+    "query": "help me think through this bug step by step",
+    "should_trigger": true,
+    "complexity": "simple",
+    "description": "explicit request for guided reasoning"
+  },
+  {
+    "query": "walk me through debugging this",
+    "should_trigger": true,
+    "complexity": "simple",
+    "description": "guided debugging with user doing the work"
+  },
+  {
+    "query": "I need coaching on how to debug this problem",
+    "should_trigger": true,
+    "complexity": "simple",
+    "description": "coaching/teaching framing"
+  },
+  {
+    "query": "teach me to find the root cause myself",
+    "should_trigger": true,
+    "complexity": "simple",
+    "description": "explicit teach-me framing"
+  },
+  {
+    "query": "guide me to the root cause with questions",
+    "should_trigger": true,
+    "complexity": "simple",
+    "description": "question-based guidance request"
+  },
+  {
+    "query": "rubber duck debug with me",
+    "should_trigger": true,
+    "complexity": "simple",
+    "description": "rubber duck debugging is a known trigger"
+  },
+  {
+    "query": "ask me questions to help me figure out the bug",
+    "should_trigger": true,
+    "complexity": "simple",
+    "description": "explicit ask-me-questions pattern"
+  },
+  {
+    "query": "help me learn to find bugs myself instead of just telling me the answer",
+    "should_trigger": true,
+    "complexity": "simple",
+    "description": "pedagogical debugging preference"
+  },
+  {
+    "query": "just fix this bug for me",
+    "should_trigger": false,
+    "complexity": "simple",
+    "description": "direct fix request, not guided learning"
+  },
+  {
+    "query": "what's wrong with this code",
+    "should_trigger": false,
+    "complexity": "simple",
+    "description": "direct answer expected, not guided"
+  },
+  {
+    "query": "debug this crash and tell me what to change",
+    "should_trigger": false,
+    "complexity": "simple",
+    "description": "wants answer, not coaching"
+  },
+  {
+    "query": "review my code for bugs",
+    "should_trigger": false,
+    "complexity": "simple",
+    "description": "code review, not debugging coaching"
+  },
+  {
+    "query": "run the tests and find what's failing",
+    "should_trigger": false,
+    "complexity": "simple",
+    "description": "automated test run, not guided debugging"
+  },
+  {
+    "query": "investigate this production failure and give me a root cause analysis",
+    "should_trigger": false,
+    "complexity": "medium",
+    "description": "wants RCA output, not teaching"
+  },
+  {
+    "query": "check for performance bugs in this service",
+    "should_trigger": false,
+    "complexity": "simple",
+    "description": "performance audit, not debugging coaching"
+  },
+  {
+    "query": "find the security issue in this authentication code",
+    "should_trigger": false,
+    "complexity": "simple",
+    "description": "security review, not pedagogical debugging"
+  }
+]
diff --git a/skills/agent-comparison/scripts/generate_variant.py b/skills/agent-comparison/scripts/generate_variant.py
index 31cb2446..1a35aa46 100644
--- a/skills/agent-comparison/scripts/generate_variant.py
+++ b/skills/agent-comparison/scripts/generate_variant.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
-"""Generate a variant of an agent/skill file using Claude Code.
+"""Generate an optimized variant of an agent/skill file using Claude Code.
 
-Proposes modifications to improve the target file based on the optimization
-goal and previous iteration failures. Preserves protected sections marked
-with DO NOT OPTIMIZE markers.
+Supports two optimization scopes:
+- description-only: mutate frontmatter description only
+- body-only: mutate the markdown body only
 
 Pattern: uses `claude -p` so generation runs through Claude Code directly.
 
@@ -17,8 +17,8 @@
 
 Output (JSON to stdout):
     {
-        "variant": "full file content...",
-        "summary": "Added CRITICAL warning for error wrapping",
+        "variant": "full file content with updated description...",
+        "summary": "Added concrete trigger phrases to the description",
         "deletion_justification": "",
         "reasoning": "Extended thinking content...",
         "tokens_used": 12345
@@ -86,6 +86,126 @@ def detect_deletions(original: str, variant: str) -> list[str]:
     return sorted(orig_headings - var_headings)
 
 
+# ---------------------------------------------------------------------------
+# Description-only and body-only optimization helpers
+# ---------------------------------------------------------------------------
+
+
+def extract_description(content: str) -> str:
+    """Extract frontmatter description text from a markdown file."""
+    lines = content.split("\n")
+    if not lines or lines[0].strip() != "---":
+        raise ValueError("Content missing frontmatter opening delimiter")
+
+    end_idx = None
+    for i, line in enumerate(lines[1:], start=1):
+        if line.strip() == "---":
+            end_idx = i
+            break
+    if end_idx is None:
+        raise ValueError("Content missing frontmatter closing delimiter")
+
+    fm_lines = lines[1:end_idx]
+    idx = 0
+    while idx < len(fm_lines):
+        line = fm_lines[idx]
+        if line.startswith("description:"):
+            value = line[len("description:") :].strip()
+            if value in (">", "|", ">-", "|-"):
+                parts: list[str] = []
+                idx += 1
+                while idx < len(fm_lines) and (fm_lines[idx].startswith("  ") or fm_lines[idx].startswith("\t")):
+                    parts.append(fm_lines[idx].strip())
+                    idx += 1
+                return "\n".join(parts).strip()
+            return value.strip('"').strip("'").strip()
+        idx += 1
+
+    raise ValueError("Content missing frontmatter description")
+
+
+def replace_description(content: str, new_description: str) -> str:
+    """Replace the frontmatter description while preserving all other content verbatim."""
+    lines = content.split("\n")
+    if not lines or lines[0].strip() != "---":
+        raise ValueError("Content missing frontmatter opening delimiter")
+
+    end_idx = None
+    for i, line in enumerate(lines[1:], start=1):
+        if line.strip() == "---":
+            end_idx = i
+            break
+    if end_idx is None:
+        raise ValueError("Content missing frontmatter closing delimiter")
+
+    fm_lines = lines[1:end_idx]
+    start_idx = None
+    stop_idx = None
+    idx = 0
+    while idx < len(fm_lines):
+        line = fm_lines[idx]
+        if line.startswith("description:"):
+            start_idx = idx
+            value = line[len("description:") :].strip()
+            stop_idx = idx + 1
+            if value in (">", "|", ">-", "|-"):
+                stop_idx = idx + 1
+                while stop_idx < len(fm_lines) and (
+                    fm_lines[stop_idx].startswith("  ") or fm_lines[stop_idx].startswith("\t")
+                ):
+                    stop_idx += 1
+            break
+        idx += 1
+
+    if start_idx is None or stop_idx is None:
+        raise ValueError("Content missing frontmatter description")
+
+    normalized = new_description.strip()
+    replacement = ["description: |"]
+    if normalized:
+        replacement.extend(f"  {line}" if line else "  " for line in normalized.splitlines())
+    else:
+        replacement.append("  ")
+
+    new_fm_lines = fm_lines[:start_idx] + replacement + fm_lines[stop_idx:]
+    rebuilt_lines = ["---", *new_fm_lines, "---", *lines[end_idx + 1 :]]
+    return "\n".join(rebuilt_lines)
+
+
+def extract_body(content: str) -> str:
+    """Extract markdown body content after frontmatter."""
+    lines = content.split("\n")
+    if not lines or lines[0].strip() != "---":
+        raise ValueError("Content missing frontmatter opening delimiter")
+    end_idx = None
+    for i, line in enumerate(lines[1:], start=1):
+        if line.strip() == "---":
+            end_idx = i
+            break
+    if end_idx is None:
+        raise ValueError("Content missing frontmatter closing delimiter")
+    return "\n".join(lines[end_idx + 1 :])
+
+
+def replace_body(content: str, new_body: str) -> str:
+    """Replace the markdown body while preserving frontmatter verbatim."""
+    lines = content.split("\n")
+    if not lines or lines[0].strip() != "---":
+        raise ValueError("Content missing frontmatter opening delimiter")
+    end_idx = None
+    for i, line in enumerate(lines[1:], start=1):
+        if line.strip() == "---":
+            end_idx = i
+            break
+    if end_idx is None:
+        raise ValueError("Content missing frontmatter closing delimiter")
+    rebuilt_lines = [*lines[: end_idx + 1], *new_body.splitlines()]
+    rebuilt = "\n".join(rebuilt_lines)
+    if content.endswith("\n") and not rebuilt.endswith("\n"):
+        rebuilt += "\n"
+    return rebuilt
+
+
 # ---------------------------------------------------------------------------
 # Variant generation
 # ---------------------------------------------------------------------------
@@ -150,6 +270,7 @@ def generate_variant(
     current_content: str,
     failures: list[dict],
     model: str | None,
+    optimization_scope: str = "description-only",
     history: list[dict] | None = None,
     diversification_note: str | None = None,
 ) -> dict:
@@ -162,7 +283,20 @@ def generate_variant(
     if failures:
         failure_section = "\n\nFailed tasks from the last iteration:\n"
         for f in failures:
-            failure_section += f"  - {f.get('name', 'unnamed')}: {f.get('details', 'failed')}\n"
+            label = f.get("query") or f.get("name", "unnamed")
+            should_trigger = f.get("should_trigger")
+            expectation = ""
+            if should_trigger is True:
+                expectation = " (expected: SHOULD trigger)"
+            elif should_trigger is False:
+                expectation = " (expected: should NOT trigger)"
+            detail_bits = []
+            if f.get("details"):
+                detail_bits.append(str(f["details"]))
+            if "trigger_rate" in f:
+                detail_bits.append(f"raw_trigger_rate={f['trigger_rate']:.2f}")
+            details = "; ".join(detail_bits) if detail_bits else "failed"
+            failure_section += f"  - {label}{expectation}: {details}\n"
 
     history_section = ""
     if history:
@@ -188,7 +322,11 @@ def generate_variant(
 This is non-negotiable: protected sections contain safety gates that must not be
 removed even if removing them would improve test scores."""
 
-    prompt = f"""You are optimizing an agent/skill file to improve its performance.
+    current_description = extract_description(current_content)
+    current_body = extract_body(current_content)
+
+    if optimization_scope == "description-only":
+        prompt = f"""You are optimizing an agent/skill file to improve its trigger performance.
 
 Target file: {target_path}
 Optimization goal: {goal}
@@ -197,36 +335,45 @@ def generate_variant(
 <current_content>
 {current_content}
 </current_content>
+Current description:
+<current_description>
+{current_description}
+</current_description>
 {failure_section}{history_section}{diversification_section}{protected_notice}
 
 SAFETY RULES:
-1. Do NOT delete sections without replacing them with equivalent or better content.
-   If you remove a section heading that exists in the original, you must explain what
-   replaces the removed functionality. Pure deletion degrades unmeasured capabilities.
+1. Optimize ONLY the YAML frontmatter `description` field.
+   Do not modify any other part of the file. The optimizer evaluates description-trigger
+   quality only, so changing routing blocks, body text, or headings is out of scope.
 
-2. Do NOT change the tools, SDKs, or interfaces the agent uses. The variant must work
-   in the same environment as the original (no switching from SDK to curl, etc.).
+2. Keep the description faithful to the file's actual purpose. Improve routing precision
+   by making the description clearer and more triggerable, not by changing the behavior
+   or scope of the skill.
 
-3. Keep YAML frontmatter structure intact (name, description, routing, etc.).
+3. Keep the skill name, routing, tools, instructions, and all protected sections unchanged.
 
-4. Focus on making the agent/skill better at achieving the stated goal. Common
+4. Focus on making the description better at achieving the stated goal. Common
    improvements include:
-   - Moving critical information to more prominent positions (CRITICAL banners)
-   - Adding explicit planning steps before code generation
-   - Improving error handling instructions with specific patterns
-   - Adding concrete examples for ambiguous instructions
-   - Restructuring for clarity when sections are dense
-
-Please respond with the complete modified file content inside <variant> tags,
-and a brief summary of what you changed and why inside <summary> tags.
-
-If you removed any existing `##` section heading, include a brief justification
-inside <deletion_justification> tags. If you did not remove a section, return
-empty tags.
-
-<variant>
-[complete file content here]
-</variant>
+   - Including natural user phrasings that should trigger this skill
+   - Making the first sentence more concrete and specific
+   - Removing vague wording that overlaps with unrelated skills
+   - Adding concise usage examples when they help routing
+
+5. Treat failed eval tasks as primary routing evidence:
+   - If a task SHOULD have triggered but did not, strongly prefer copying the exact
+     user phrasing or a very close paraphrase into the description.
+   - If a task should NOT have triggered, add clarifying language that separates this
+     skill from that request without expanding scope.
+   - Optimize for the smallest description change that would make the failed tasks
+     more likely to score correctly on the next run.
+
+Please respond with ONLY the improved description text inside <description> tags,
+without YAML quoting or frontmatter delimiters, and a brief summary inside <summary> tags.
+Do not return the full file.
+
+<description>
+[improved description only]
+</description>
 
 <summary>
 [1-2 sentence description of the change]
@@ -235,16 +382,72 @@ def generate_variant(
 <deletion_justification>
 [why any removed section was replaced safely, or leave blank]
 </deletion_justification>"""
+        text, raw_result_text, tokens_used = _run_claude_code(prompt, model)
+
+        description_match = re.search(r"<description>(.*?)</description>", text, re.DOTALL)
+        if description_match:
+            new_payload = description_match.group(1).strip()
+        else:
+            variant_match = re.search(r"<variant>(.*?)</variant>", text, re.DOTALL)
+            if not variant_match:
+                print("Error: No <description> or <variant> tags in response", file=sys.stderr)
+                sys.exit(1)
+            legacy_variant = variant_match.group(1).strip()
+            new_payload = extract_description(legacy_variant)
+
+        variant = replace_description(current_content, new_payload)
+    elif optimization_scope == "body-only":
+        prompt = f"""You are optimizing an agent/skill file to improve its behavioral quality.
 
-    text, raw_result_text, tokens_used = _run_claude_code(prompt, model)
+Target file: {target_path}
+Optimization goal: {goal}
 
-    # Parse variant content
-    variant_match = re.search(r"<variant>(.*?)</variant>", text, re.DOTALL)
-    if not variant_match:
-        print("Error: No <variant> tags in response", file=sys.stderr)
-        sys.exit(1)
+Current content of the file:
+<current_content>
+{current_content}
+</current_content>
+Current body:
+<current_body>
+{current_body}
+</current_body>
+{failure_section}{history_section}{diversification_section}{protected_notice}
+
+SAFETY RULES:
+1. Optimize ONLY the markdown body after the YAML frontmatter.
+   Do not modify the frontmatter, skill name, description, routing, tools, or version.
+2. Keep the skill faithful to its current purpose. Improve how it behaves, not what broad domain it covers.
+3. Preserve headings and protected sections unless you have a clear reason to improve the body structure safely.
+4. Prefer the smallest body change that addresses the failed tasks and improves behavioral quality.
+
+Please respond with ONLY the improved body text inside <body> tags and a brief summary inside <summary> tags.
+Do not return the full file.
 
-    variant = variant_match.group(1).strip()
+<body>
+[improved markdown body only]
+</body>
+
+<summary>
+[1-2 sentence description of the change]
+</summary>
+
+<deletion_justification>
+[why any removed section was replaced safely, or leave blank]
+</deletion_justification>"""
+        text, raw_result_text, tokens_used = _run_claude_code(prompt, model)
+        body_match = re.search(r"<body>(.*?)</body>", text, re.DOTALL)
+        if body_match:
+            new_payload = body_match.group(1).strip("\n")
+        else:
+            variant_match = re.search(r"<variant>(.*?)</variant>", text, re.DOTALL)
+            if not variant_match:
+                print("Error: No <body> or <variant> tags in response", file=sys.stderr)
+                sys.exit(1)
+            legacy_variant = variant_match.group(1).strip()
+            new_payload = extract_body(legacy_variant)
+
+        variant = replace_body(current_content, new_payload)
+    else:
+        raise ValueError(f"Unsupported optimization_scope: {optimization_scope}")
 
     # Parse summary
     summary_match = re.search(r"<summary>(.*?)</summary>", text, re.DOTALL)
@@ -253,13 +456,12 @@ def generate_variant(
     deletion_match = re.search(r"<deletion_justification>(.*?)</deletion_justification>", text, re.DOTALL)
     deletion_justification = deletion_match.group(1).strip() if deletion_match else ""
 
-    # Restore protected sections (safety net)
+    # Restore protected sections (safety net); should be a no-op when only the
+    # description changes, but keep it as belt-and-suspenders protection.
     variant = restore_protected(current_content, variant)
 
-    # Check for unauthorized deletions
+    # Record removed headings: description-only runs should yield none, body-only runs may.
     deletions = detect_deletions(current_content, variant)
-    if deletions:
-        print(f"Warning: Deleted sections: {deletions}", file=sys.stderr)
 
     return {
         "variant": variant,
@@ -287,6 +489,12 @@ def main():
     parser.add_argument("--history", default="[]", help="JSON list of previous iterations")
     parser.add_argument("--diversification-note", default=None, help="Optional search diversification hint")
     parser.add_argument("--model", default=None, help="Optional Claude Code model override")
+    parser.add_argument(
+        "--optimization-scope",
+        choices=["description-only", "body-only"],
+        default="description-only",
+        help="Which part of the file to mutate",
+    )
     args = parser.parse_args()
 
     try:
@@ -312,6 +520,7 @@ def main():
         current_content=current_content,
         failures=failures,
         model=args.model,
+        optimization_scope=args.optimization_scope,
         history=history if history else None,
         diversification_note=args.diversification_note,
     )
diff --git a/skills/agent-comparison/scripts/optimize_loop.py b/skills/agent-comparison/scripts/optimize_loop.py
index dd17781f..f4463b1f 100644
--- a/skills/agent-comparison/scripts/optimize_loop.py
+++ b/skills/agent-comparison/scripts/optimize_loop.py
@@ -20,11 +20,15 @@
 from __future__ import annotations
 
 import argparse
+import concurrent.futures
+import contextlib
 import glob
+import hashlib
 import json
 import os
 import random
 import re
+import shutil
 import subprocess
 import sys
 import tempfile
@@ -43,7 +47,10 @@
     "efficiency": 0.10,
 }
 
-HARD_GATE_KEYS = ["parses", "compiles", "tests_pass", "protected_intact"]
+# Hard gates should capture mechanical invalidity, not evaluation quality.
+# Routing/task accuracy is already reflected in the weighted dimensions above;
+# zeroing the whole composite on any failed task destroys the optimization signal.
+HARD_GATE_KEYS = ["parses", "compiles", "protected_intact"]
 
 
 def passes_hard_gates(scores: dict) -> bool:
@@ -162,6 +169,7 @@ def _generate_variant_output(
     model: str | None,
     dry_run: bool,
     iteration_number: int,
+    optimization_scope: str,
     diversification_note: str | None = None,
 ) -> dict:
     """Generate a candidate variant either synthetically or through Claude Code."""
@@ -192,6 +200,8 @@ def _generate_variant_output(
             json.dumps(last_failures),
             "--history",
             json.dumps(history),
+            "--optimization-scope",
+            optimization_scope,
         ]
         if diversification_note:
             variant_cmd.extend(["--diversification-note", diversification_note])
@@ -273,7 +283,7 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str:
     rows = ""
     for it in iterations:
         v = it["verdict"]
-        vcls = {"KEEP": "keep", "REVERT": "revert", "STOP": "stop"}.get(v, "")
+        vcls = {"ACCEPT": "accept", "REJECT": "reject", "STOP": "stop"}.get(v, "")
         sc = it["score"]
         train_score = sc.get("train")
         test_score = sc.get("test")
@@ -284,7 +294,7 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str:
         dcls = "d-pos" if delta.startswith("+") and delta != "+0" else "d-neg" if delta.startswith("-") else "d-zero"
         summary = html_mod.escape(str(it.get("change_summary", ""))[:80])
         diff_esc = html_mod.escape(str(it.get("diff", "")))
-        is_keep = v == "KEEP"
+        is_keep = v == "ACCEPT"
         n = it["number"]
 
         rows += f"""
@@ -310,8 +320,8 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str:
 
     bt = baseline.get("train", 0.0)
     best = max((it["score"].get("train", bt) for it in iterations), default=bt)
-    kept = sum(1 for it in iterations if it["verdict"] == "KEEP")
-    reverted = sum(1 for it in iterations if it["verdict"] == "REVERT")
+    accepted = sum(1 for it in iterations if it["verdict"] == "ACCEPT")
+    rejected = sum(1 for it in iterations if it["verdict"] == "REJECT")
     cur = len(iterations)
     mx = data.get("max_iterations", 20)
     scls = "running" if status == "RUNNING" else "done" if status in ("CONVERGED", "COMPLETE") else "alarm"
@@ -345,8 +355,8 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str:
 .iter-row:hover {{ background:var(--surface-2); }}
 .diff-row td {{ padding:0; }}
 .diff-block {{ background:#080b0f;padding:12px;font-family:var(--font-mono);font-size:11px;max-height:400px;overflow:auto;white-space:pre;line-height:1.5;color:var(--muted); }}
-.verdict-keep {{ color:var(--green);font-weight:600; }}
-.verdict-revert {{ color:var(--red);font-weight:600; }}
+.verdict-accept {{ color:var(--green);font-weight:600; }}
+.verdict-reject {{ color:var(--red);font-weight:600; }}
 .verdict-stop {{ color:var(--yellow);font-weight:600; }}
 .d-pos {{ color:var(--green);font-weight:600; }}
 .d-neg {{ color:var(--red);font-weight:600; }}
@@ -367,8 +377,8 @@ def generate_optimization_report(data: dict, auto_refresh: bool = False) -> str:
   <div class="dash-item"><span class="dash-label">Progress</span><span class="dash-value">{cur}/{mx}</span></div>
   <div class="dash-item"><span class="dash-label">Baseline</span><span class="dash-value">{bt:.2f}</span></div>
   <div class="dash-item"><span class="dash-label">Best</span><span class="dash-value">{best:.2f} ({best - bt:+.2f})</span></div>
-  <div class="dash-item"><span class="dash-label">Kept</span><span class="dash-value">{kept}</span></div>
-  <div class="dash-item"><span class="dash-label">Reverted</span><span class="dash-value">{reverted}</span></div>
+  <div class="dash-item"><span class="dash-label">Accepted</span><span class="dash-value">{accepted}</span></div>
+  <div class="dash-item"><span class="dash-label">Rejected</span><span class="dash-value">{rejected}</span></div>
 </div>
 <p class="subtitle">{score_label}</p>
 <div class="chart-box" id="chart"></div>
@@ -591,6 +601,10 @@ def _is_behavioral_task(task: dict) -> bool:
     return "query" in task and "should_trigger" in task and task.get("eval_mode") == "behavioral"
 
 
+def _is_blind_compare_task(task: dict) -> bool:
+    return "query" in task and task.get("eval_mode") == "blind_compare" and "judge" in task
+
+
 def _validate_task_set(tasks: list[dict]) -> None:
     """Reject unsupported or mixed task formats early with a clear error."""
     if not tasks:
@@ -599,18 +613,24 @@ def _validate_task_set(tasks: list[dict]) -> None:
     trigger_tasks = sum(1 for task in tasks if _is_trigger_task(task))
     pattern_tasks = sum(1 for task in tasks if _is_pattern_task(task))
     behavioral_tasks = sum(1 for task in tasks if _is_behavioral_task(task))
+    blind_compare_tasks = sum(1 for task in tasks if _is_blind_compare_task(task))
 
     # behavioral tasks are a subset of trigger tasks (same base fields), so subtract them
     # to avoid double-counting when checking for pure trigger-rate sets
-    pure_trigger_tasks = trigger_tasks - behavioral_tasks
+    pure_trigger_tasks = trigger_tasks - behavioral_tasks - blind_compare_tasks
 
-    if (pure_trigger_tasks or behavioral_tasks) and pattern_tasks:
+    if (pure_trigger_tasks or behavioral_tasks or blind_compare_tasks) and pattern_tasks:
         raise ValueError(
             "Task file mixes trigger-rate/behavioral and pattern benchmark formats. Use one format per run."
         )
 
-    if behavioral_tasks and pure_trigger_tasks:
-        raise ValueError("Task file mixes trigger-rate and behavioral eval modes. Use one eval_mode per run.")
+    if sum(1 for n in [behavioral_tasks > 0, pure_trigger_tasks > 0, blind_compare_tasks > 0] if n) > 1:
+        raise ValueError(
+            "Task file mixes trigger-rate, behavioral, and blind-compare eval modes. Use one eval_mode per run."
+        )
+
+    if blind_compare_tasks == len(tasks):
+        return
 
     if behavioral_tasks == len(tasks):
         return
@@ -636,7 +656,9 @@ def _run_trigger_rate(
     target_path: Path,
     description: str,
     tasks: list[dict],
-    num_workers: int = 5,
+    candidate_content: str | None = None,
+    eval_mode: str = "auto",
+    num_workers: int = 1,
     timeout: int = 30,
     verbose: bool = False,
 ) -> dict:
@@ -651,39 +673,47 @@ def _run_trigger_rate(
             task_file = f.name
             json.dump(tasks, f)
 
-        with tempfile.TemporaryDirectory() as skill_dir:
-            skill_md = Path(skill_dir) / "SKILL.md"
-            skill_md.write_text(target_path.read_text())
-
-            project_root = Path.cwd()
-            for parent in [project_root, *project_root.parents]:
-                if (parent / ".claude").is_dir():
-                    project_root = parent
-                    break
-
-            cmd = [
-                sys.executable,
-                "-m",
-                "scripts.skill_eval.run_eval",
-                "--eval-set",
-                task_file,
-                "--skill-path",
-                skill_dir,
-                "--description",
-                description,
-                "--num-workers",
-                str(num_workers),
-                "--timeout",
-                str(timeout),
-                "--runs-per-query",
-                "1",
-            ]
-            if verbose:
-                cmd.append("--verbose")
-                print(f"Running trigger assessment: {len(tasks)} queries", file=sys.stderr)
+        project_root = Path.cwd()
+        for parent in [project_root, *project_root.parents]:
+            if (parent / ".claude").is_dir():
+                project_root = parent
+                break
+
+        cmd = [
+            sys.executable,
+            "-m",
+            "scripts.skill_eval.run_eval",
+            "--eval-set",
+            task_file,
+            "--skill-path",
+            str(target_path.parent),
+            "--description",
+            description,
+            "--eval-mode",
+            eval_mode,
+            "--num-workers",
+            str(num_workers),
+            "--timeout",
+            str(timeout),
+            "--runs-per-query",
+            "1",
+        ]
+        if candidate_content is not None:
+            with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as candidate_file:
+                candidate_file.write(candidate_content)
+                candidate_file.flush()
+                cmd.extend(["--candidate-content-file", candidate_file.name])
+                candidate_file_path = Path(candidate_file.name)
+        else:
+            candidate_file_path = None
+
+        if verbose:
+            cmd.append("--verbose")
+            print(f"Running trigger assessment: {len(tasks)} queries", file=sys.stderr)
 
-            env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
+        env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
 
+        try:
             result = subprocess.run(
                 cmd,
                 capture_output=True,
@@ -692,74 +722,441 @@ def _run_trigger_rate(
                 env=env,
                 timeout=600,
             )
+        finally:
+            if candidate_file_path is not None:
+                candidate_file_path.unlink(missing_ok=True)
 
-            if result.returncode != 0:
-                print(f"Trigger assessment failed (exit {result.returncode}): {result.stderr[:300]}", file=sys.stderr)
-                return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}}
+        if result.returncode != 0:
+            print(f"Trigger assessment failed (exit {result.returncode}): {result.stderr[:300]}", file=sys.stderr)
+            return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}}
 
-            try:
-                return json.loads(result.stdout)
-            except json.JSONDecodeError as e:
-                print(f"Trigger assessment returned invalid JSON: {e} — stdout: {result.stdout[:200]}", file=sys.stderr)
-                return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}}
+        try:
+            return json.loads(result.stdout)
+        except json.JSONDecodeError as e:
+            print(f"Trigger assessment returned invalid JSON: {e} — stdout: {result.stdout[:200]}", file=sys.stderr)
+            return {"results": [], "summary": {"total": 0, "passed": 0, "failed": 0}}
     finally:
         if task_file:
             Path(task_file).unlink(missing_ok=True)
 
 
 # ---------------------------------------------------------------------------
-# Behavioral evaluator (runs claude -p and checks for artifact creation)
+# Blind comparative behavioral evaluator
 # ---------------------------------------------------------------------------
 
 
-def _run_behavioral_eval(
+def _find_project_root() -> Path:
+    project_root = Path.cwd()
+    for parent in [project_root, *project_root.parents]:
+        if (parent / ".claude").is_dir():
+            return parent
+    return project_root
+
+
+def _resolve_registered_skill_relpath(target_path: Path, project_root: Path) -> Path:
+    resolved = target_path.resolve()
+    try:
+        rel = resolved.relative_to(project_root.resolve())
+    except ValueError as exc:
+        raise ValueError("blind_compare eval requires a target under the current project root") from exc
+    if len(rel.parts) >= 3 and rel.parts[0] == "skills" and rel.parts[-1] == "SKILL.md":
+        return rel
+    raise ValueError("blind_compare eval currently supports real registered skills under skills/*/SKILL.md only")
+
+
+@contextlib.contextmanager
+def _candidate_worktree(project_root: Path, relpath: Path, content: str):
+    wt_path_str = tempfile.mkdtemp(prefix="blind-eval-wt-", dir="/tmp")
+    wt_path = Path(wt_path_str)
+    wt_path.rmdir()
+    try:
+        subprocess.run(
+            ["git", "worktree", "add", wt_path_str, "HEAD"],
+            cwd=str(project_root),
+            capture_output=True,
+            check=True,
+        )
+        (wt_path / relpath).write_text(content)
+        yield wt_path
+    finally:
+        try:
+            subprocess.run(
+                ["git", "worktree", "remove", "--force", wt_path_str],
+                cwd=str(project_root),
+                capture_output=True,
+            )
+        except Exception:
+            pass
+        shutil.rmtree(wt_path_str, ignore_errors=True)
+
+
+def _extract_registered_skill_ids(relpath: Path, content: str) -> set[str]:
+    ids = {relpath.as_posix()}
+    if len(relpath.parts) >= 2:
+        ids.add(relpath.parts[1])
+    match = re.search(r"^name:\s*(.+)$", content, re.MULTILINE)
+    if match:
+        ids.add(match.group(1).strip().strip("\"'"))
+    return {value for value in ids if value}
+
+
+def _assistant_message_triggered_skill(message: dict, accepted_skill_ids: set[str]) -> bool:
+    for content_item in message.get("content", []):
+        if content_item.get("type") != "tool_use":
+            continue
+        tool_name = content_item.get("name", "")
+        tool_input = content_item.get("input", {})
+        if tool_name == "Skill" and any(skill_id in tool_input.get("skill", "") for skill_id in accepted_skill_ids):
+            return True
+        if tool_name == "Read" and any(skill_id in tool_input.get("file_path", "") for skill_id in accepted_skill_ids):
+            return True
+    return False
+
+
+def _contains_fallback_contamination(output: str) -> tuple[bool, list[str]]:
+    lowered = output.lower()
+    reasons = []
+    contamination_markers = {
+        "skill tool was blocked": "mentioned blocked skill tool",
+        "tool was blocked": "mentioned blocked tool access",
+        "i'll guide you through this directly": "fell back to direct guidance",
+        "i can still help directly": "fell back to direct guidance",
+        "instead of using the skill": "mentioned skill fallback mode",
+        "mode announcement": "included mode/meta announcement",
+        "tool-permission": "mentioned tool permission",
+    }
+    for marker, reason in contamination_markers.items():
+        if marker in lowered:
+            reasons.append(reason)
+    return bool(reasons), reasons
+
+
+def _run_query_capture_output(query: str, cwd: Path, accepted_skill_ids: set[str], timeout: int = 180) -> dict:
+    env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
+    result = subprocess.run(
+        [
+            "claude",
+            "-p",
+            query,
+            "--output-format",
+            "stream-json",
+            "--verbose",
+            "--include-partial-messages",
+            "--permission-mode",
+            "bypassPermissions",
+        ],
+        capture_output=True,
+        text=True,
+        cwd=str(cwd),
+        env=env,
+        timeout=timeout,
+    )
+    if result.returncode != 0:
+        raise RuntimeError(result.stderr.strip() or f"claude -p exited {result.returncode}")
+
+    assistant_text: list[str] = []
+    raw_result = ""
+    triggered = False
+    pending_tool_name = None
+    accumulated_json = ""
+
+    for raw_line in result.stdout.splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+        try:
+            event = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+
+        if event.get("type") == "stream_event":
+            se = event.get("event", {})
+            se_type = se.get("type", "")
+            if se_type == "content_block_start":
+                cb = se.get("content_block", {})
+                if cb.get("type") == "tool_use":
+                    tool_name = cb.get("name", "")
+                    if tool_name in {"Skill", "Read"}:
+                        pending_tool_name = tool_name
+                        accumulated_json = ""
+                    else:
+                        pending_tool_name = None
+                        accumulated_json = ""
+            elif se_type == "content_block_delta" and pending_tool_name:
+                delta = se.get("delta", {})
+                if delta.get("type") == "input_json_delta":
+                    accumulated_json += delta.get("partial_json", "")
+                    if any(skill_id in accumulated_json for skill_id in accepted_skill_ids):
+                        triggered = True
+            elif se_type in {"content_block_stop", "message_stop"} and pending_tool_name:
+                if any(skill_id in accumulated_json for skill_id in accepted_skill_ids):
+                    triggered = True
+                pending_tool_name = None
+                accumulated_json = ""
+
+        if event.get("type") == "assistant":
+            message = event.get("message", {})
+            if _assistant_message_triggered_skill(message, accepted_skill_ids):
+                triggered = True
+            for content in message.get("content", []):
+                if content.get("type") == "text":
+                    assistant_text.append(content.get("text", ""))
+        elif event.get("type") == "result":
+            raw_result = event.get("result", "")
+
+    output = "".join(assistant_text).strip() or raw_result.strip()
+    contaminated, contamination_reasons = _contains_fallback_contamination(output)
+    return {
+        "output": output,
+        "triggered": triggered,
+        "contaminated": contaminated,
+        "contamination_reasons": contamination_reasons,
+    }
+
+
+def _score_socratic_question_only_output(output: str) -> tuple[float, list[str]]:
+    stripped = output.strip()
+    lowered = stripped.lower()
+    reasons: list[str] = []
+    score = 0.0
+
+    question_marks = stripped.count("?")
+    if question_marks == 1:
+        score += 0.45
+        reasons.append("asked exactly one question")
+    elif question_marks == 0:
+        reasons.append("asked no question")
+    else:
+        score += max(0.0, 0.20 - (question_marks - 2) * 0.10)
+        reasons.append(f"asked {question_marks} questions")
+
+    if stripped.endswith("?"):
+        score += 0.15
+        reasons.append("ended on a question")
+    else:
+        reasons.append("did not end on a question")
+
+    starters = ("what ", "when ", "where ", "which ", "can ", "could ", "did ", "is ", "are ", "have ")
+    if any(lowered.startswith(starter) for starter in starters):
+        score += 0.15
+        reasons.append("opened directly with a question")
+    else:
+        reasons.append("did not open directly with a question")
+
+    first_sentence = lowered.split("?")[0]
+    preamble_markers = ["let me", "i'll", "i will", "we'll", "we will", "let's", "before we", "looking at"]
+    if any(marker in first_sentence for marker in preamble_markers):
+        score -= 0.30
+        reasons.append("included preamble before the first question")
+
+    direct_answer_markers = [
+        "common mistake",
+        "classic",
+        "the issue is",
+        "the problem is",
+        "the bug is",
+        "you should",
+        "fix this by",
+        "the root cause",
+        "likely cause",
+        "think about code like",
+        "vs.",
+        "return cache.get",
+        "poison the cache",
+    ]
+    if any(marker in lowered for marker in direct_answer_markers):
+        score -= 0.35
+        reasons.append("gave direct diagnosis/advice")
+    else:
+        score += 0.15
+        reasons.append("avoided direct diagnosis")
+
+    if "```" in output:
+        score -= 0.15
+        reasons.append("included code block")
+    else:
+        score += 0.10
+        reasons.append("no code block")
+
+    if len(stripped) <= 450:
+        score += 0.10
+        reasons.append("kept first turn concise")
+    else:
+        reasons.append("first response was long")
+
+    return max(0.0, min(1.0, round(score, 4))), reasons
+
+
+def _score_output_with_judge(task: dict, output: str) -> tuple[float, list[str]]:
+    judge = task.get("judge")
+    if judge in {"socratic_question_only", "heuristic_socratic_debugging"}:
+        return _score_socratic_question_only_output(output)
+    raise ValueError(f"Unsupported blind_compare judge: {judge}")
+
+
+def _run_blind_compare_eval(
     target_path: Path,
-    description: str,
+    candidate_content: str,
     tasks: list[dict],
-    timeout: int = 240,
+    baseline_content: str | None = None,
+    timeout: int = 180,
     verbose: bool = False,
 ) -> list[dict]:
-    """Run behavioral assessment by invoking claude -p and checking artifact output.
+    """Run blind comparative evaluation for real registered skills."""
+    project_root = _find_project_root()
+    relpath = _resolve_registered_skill_relpath(target_path, project_root)
+    baseline_source = baseline_content if baseline_content is not None else candidate_content
+    candidate_skill_ids = _extract_registered_skill_ids(relpath, candidate_content)
+    baseline_skill_ids = _extract_registered_skill_ids(relpath, baseline_source)
+
+    results: list[dict] = []
+    for task in tasks:
+        query = task["query"]
+        if baseline_source == candidate_content:
+            with _candidate_worktree(project_root, relpath, candidate_content) as candidate_wt:
+                candidate_capture = _run_query_capture_output(query, candidate_wt, candidate_skill_ids, timeout=timeout)
+            baseline_capture = dict(candidate_capture)
+        else:
+            with _candidate_worktree(project_root, relpath, baseline_source) as baseline_wt:
+                baseline_capture = _run_query_capture_output(query, baseline_wt, baseline_skill_ids, timeout=timeout)
+            with _candidate_worktree(project_root, relpath, candidate_content) as candidate_wt:
+                candidate_capture = _run_query_capture_output(query, candidate_wt, candidate_skill_ids, timeout=timeout)
+
+        baseline_output = baseline_capture["output"]
+        candidate_output = candidate_capture["output"]
+
+        baseline_score, baseline_reasons = _score_output_with_judge(task, baseline_output)
+        candidate_score, candidate_reasons = _score_output_with_judge(task, candidate_output)
+
+        if not baseline_capture["triggered"]:
+            baseline_score = 0.0
+            baseline_reasons = ["target skill did not trigger", *baseline_reasons]
+        if baseline_capture["contaminated"]:
+            baseline_score = 0.0
+            baseline_reasons = [*baseline_capture["contamination_reasons"], *baseline_reasons]
+        if not candidate_capture["triggered"]:
+            candidate_score = 0.0
+            candidate_reasons = ["target skill did not trigger", *candidate_reasons]
+        if candidate_capture["contaminated"]:
+            candidate_score = 0.0
+            candidate_reasons = [*candidate_capture["contamination_reasons"], *candidate_reasons]
+
+        seed = int(hashlib.sha256(query.encode()).hexdigest()[:8], 16)
+        if seed % 2 == 0:
+            label_map = {"A": "baseline", "B": "candidate"}
+        else:
+            label_map = {"A": "candidate", "B": "baseline"}
 
-    Each task must have 'query', 'should_trigger', 'artifact_glob', and optionally
-    'query_prefix' fields. Tasks are run sequentially since each claude -p invocation
-    is resource-intensive.
+        if candidate_score > baseline_score:
+            winner = "candidate"
+        elif candidate_score < baseline_score:
+            winner = "baseline"
+        else:
+            winner = "tie"
 
-    Returns a list of per-task result dicts with keys:
-      triggered, should_trigger, pass, new_artifacts
-    """
-    project_root = Path.cwd()
-    for parent in [project_root, *project_root.parents]:
-        if (parent / ".claude").is_dir():
-            project_root = parent
-            break
+        if verbose:
+            print(
+                f"[blind-compare] {query[:60]!r}: baseline={baseline_score:.2f}, candidate={candidate_score:.2f}, winner={winner}",
+                file=sys.stderr,
+            )
 
-    env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
+        results.append(
+            {
+                "query": query,
+                "judge": task.get("judge"),
+                "candidate_score": candidate_score,
+                "baseline_score": baseline_score,
+                "candidate_output": candidate_output,
+                "baseline_output": baseline_output,
+                "candidate_reasons": candidate_reasons,
+                "baseline_reasons": baseline_reasons,
+                "candidate_triggered": candidate_capture["triggered"],
+                "baseline_triggered": baseline_capture["triggered"],
+                "candidate_contaminated": candidate_capture["contaminated"],
+                "baseline_contaminated": baseline_capture["contaminated"],
+                "winner": winner,
+                "label_map": label_map,
+                "passed": candidate_score >= float(task.get("min_score", 0.7)),
+            }
+        )
+    return results
 
-    results = []
-    for task in tasks:
-        query: str = task["query"]
-        should_trigger: bool = task["should_trigger"]
-        artifact_glob: str = task.get("artifact_glob", "adr/*.md")
-        query_prefix: str = task.get("query_prefix", "/do ")
 
-        full_query = f"{query_prefix}{query}"
+# ---------------------------------------------------------------------------
+# Behavioral evaluator (runs claude -p and checks for artifact creation)
+# ---------------------------------------------------------------------------
 
-        # Snapshot existing artifacts before the run
-        before: set[str] = set(glob.glob(str(project_root / artifact_glob)))
 
-        triggered = False
-        new_artifacts: list[str] = []
+def _snapshot_extra_dirs(project_root: Path) -> set[str]:
+    """Snapshot files in directories that creation tasks may write to."""
+    extra_globs = [
+        str(project_root / "agents" / "*.md"),
+        str(project_root / "scripts" / "*.py"),
+    ]
+    snapshot: set[str] = set()
+    for g in extra_globs:
+        snapshot.update(glob.glob(g))
+    snapshot.update(glob.glob(str(project_root / "skills" / "**" / "SKILL.md"), recursive=True))
+    snapshot.update(glob.glob(str(project_root / "pipelines" / "**" / "SKILL.md"), recursive=True))
+    return snapshot
+
+
+def _run_single_behavioral_task(
+    task: dict,
+    project_root: Path,
+    worktree_path: Path,
+    env: dict[str, str],
+    timeout: int,
+    verbose: bool,
+    runs_per_task: int,
+    trigger_threshold: float,
+) -> dict:
+    """Run a single behavioral task and return its result dict.
+
+    Args:
+        task: Task dict with 'query', 'should_trigger', optional 'artifact_glob' and 'query_prefix'.
+        project_root: Canonical project root (used only for worktree creation context).
+        worktree_path: Directory in which claude -p runs and artifact globs are resolved.
+            For sequential execution this equals project_root; for parallel execution
+            this is an isolated git worktree.
+        env: Environment variables to pass to subprocess.
+        timeout: Per-run timeout in seconds for the claude -p invocation.
+        verbose: Print progress to stderr.
+        runs_per_task: Number of times to run the query; outcomes are combined via trigger_threshold.
+        trigger_threshold: Fraction of runs that must trigger to count as triggered.
+
+    Returns:
+        Per-task result dict with keys: query, triggered, should_trigger, pass, new_artifacts.
+    """
+    query: str = task["query"]
+    should_trigger: bool = task["should_trigger"]
+    artifact_glob: str = task.get("artifact_glob", "adr/*.md")
+    query_prefix: str = task.get("query_prefix", "/do ")
 
-        if verbose:
+    full_query = f"{query_prefix}{query}"
+
+    run_results: list[bool] = []
+    all_new_artifacts: list[str] = []
+
+    for run_index in range(runs_per_task):
+        if verbose and runs_per_task > 1:
+            print(f"[behavioral] Run {run_index + 1}/{runs_per_task}: {full_query!r}", file=sys.stderr)
+        elif verbose:
             print(f"[behavioral] Running: claude -p {full_query!r}", file=sys.stderr)
 
+        # Snapshot existing artifacts before the run (primary glob + extra dirs)
+        before: set[str] = set(glob.glob(str(worktree_path / artifact_glob)))
+        before_extra: set[str] = _snapshot_extra_dirs(worktree_path)
+
+        run_triggered = False
+        run_new_artifacts: list[str] = []
+
         try:
             result = subprocess.run(
                 ["claude", "-p", full_query],
                 capture_output=True,
                 text=True,
-                cwd=str(project_root),
+                cwd=str(worktree_path),
                 env=env,
                 timeout=timeout,
             )
@@ -770,42 +1167,196 @@ def _run_behavioral_eval(
                 )
 
             # Check for new files matching the artifact glob
-            after: set[str] = set(glob.glob(str(project_root / artifact_glob)))
-            new_artifacts = sorted(after - before)
-            triggered = len(new_artifacts) > 0
+            after: set[str] = set(glob.glob(str(worktree_path / artifact_glob)))
+            run_new_artifacts = sorted(after - before)
+            run_triggered = len(run_new_artifacts) > 0
 
-            if verbose and new_artifacts:
-                print(f"[behavioral] New artifacts: {new_artifacts}", file=sys.stderr)
+            if verbose and run_new_artifacts:
+                print(f"[behavioral] New artifacts: {run_new_artifacts}", file=sys.stderr)
 
         except subprocess.TimeoutExpired:
             if verbose:
                 print(f"[behavioral] Timed out after {timeout}s for query: {full_query!r}", file=sys.stderr)
             # Still check artifacts — the process may have written them before timing out
-            after_timeout: set[str] = set(glob.glob(str(project_root / artifact_glob)))
-            new_artifacts = sorted(after_timeout - before)
-            triggered = len(new_artifacts) > 0
-            if verbose and triggered:
-                print(f"[behavioral] Artifacts found despite timeout: {new_artifacts}", file=sys.stderr)
-
-        # Clean up artifacts so they don't pollute the before-snapshot of the next task
-        for artifact_path in new_artifacts:
+            after_timeout: set[str] = set(glob.glob(str(worktree_path / artifact_glob)))
+            run_new_artifacts = sorted(after_timeout - before)
+            run_triggered = len(run_new_artifacts) > 0
+            if verbose and run_triggered:
+                print(f"[behavioral] Artifacts found despite timeout: {run_new_artifacts}", file=sys.stderr)
+
+        # Clean up primary-glob artifacts
+        for artifact_path in run_new_artifacts:
             try:
                 Path(artifact_path).unlink(missing_ok=True)
             except OSError:
                 pass
 
-        passed = triggered == should_trigger
-        results.append(
-            {
-                "query": query,
-                "triggered": triggered,
-                "should_trigger": should_trigger,
-                "pass": passed,
-                "new_artifacts": new_artifacts,
-            }
+        # Clean up extra-dir artifacts (agents/, skills/, pipelines/, scripts/)
+        after_extra: set[str] = _snapshot_extra_dirs(worktree_path)
+        new_extra = sorted(after_extra - before_extra)
+        for path in new_extra:
+            try:
+                Path(path).unlink(missing_ok=True)
+            except OSError:
+                pass
+        if verbose and new_extra:
+            print(
+                f"[behavioral] Cleaned up {len(new_extra)} extra artifacts: {new_extra}",
+                file=sys.stderr,
+            )
+
+        run_results.append(run_triggered)
+        all_new_artifacts.extend(run_new_artifacts)
+
+    # Aggregate across runs
+    if runs_per_task > 1:
+        triggered = (sum(run_results) / len(run_results)) >= trigger_threshold
+    else:
+        triggered = run_results[0] if run_results else False
+
+    passed = triggered == should_trigger
+    return {
+        "query": query,
+        "triggered": triggered,
+        "should_trigger": should_trigger,
+        "pass": passed,
+        "new_artifacts": all_new_artifacts,
+    }
+
+
+def _run_single_behavioral_task_in_worktree(
+    task: dict,
+    project_root: Path,
+    env: dict[str, str],
+    timeout: int,
+    verbose: bool,
+    runs_per_task: int,
+    trigger_threshold: float,
+) -> dict:
+    """Create a temporary git worktree, run a behavioral task inside it, then remove it.
+
+    Used by the parallel execution path in _run_behavioral_eval. Each task
+    gets its own isolated worktree so concurrent claude -p invocations do not
+    share working-directory state.
+
+    The worktree is always removed in a finally block regardless of success or failure.
+    """
+    wt_path_str = tempfile.mkdtemp(prefix="eval-wt-", dir="/tmp")
+    wt_path = Path(wt_path_str)
+    # Remove the empty dir so git worktree add can create it
+    wt_path.rmdir()
+    try:
+        subprocess.run(
+            ["git", "worktree", "add", wt_path_str, "HEAD"],
+            cwd=str(project_root),
+            capture_output=True,
+            check=True,
         )
+        return _run_single_behavioral_task(
+            task=task,
+            project_root=project_root,
+            worktree_path=wt_path,
+            env=env,
+            timeout=timeout,
+            verbose=verbose,
+            runs_per_task=runs_per_task,
+            trigger_threshold=trigger_threshold,
+        )
+    finally:
+        try:
+            subprocess.run(
+                ["git", "worktree", "remove", "--force", wt_path_str],
+                cwd=str(project_root),
+                capture_output=True,
+            )
+        except Exception:
+            pass
+        shutil.rmtree(wt_path_str, ignore_errors=True)
 
-    return results
+
+def _run_behavioral_eval(
+    target_path: Path,
+    description: str,
+    tasks: list[dict],
+    timeout: int = 240,
+    verbose: bool = False,
+    runs_per_task: int = 1,
+    trigger_threshold: float = 0.5,
+    parallel_workers: int = 0,
+) -> list[dict]:
+    """Run behavioral assessment by invoking claude -p and checking artifact output.
+
+    Each task must have 'query' and 'should_trigger' fields; 'artifact_glob'
+    (default 'adr/*.md') and 'query_prefix' (default '/do ') are optional.
+
+    When parallel_workers > 1, tasks are dispatched concurrently via ThreadPoolExecutor.
+    Each concurrent task runs in an isolated git worktree created from HEAD so that
+    file-system mutations do not interfere across tasks.
+
+    When runs_per_task > 1, each task query is run that many times. The final
+    triggered value is True iff (sum(results) / runs_per_task) >= trigger_threshold.
+
+    Returns a list of per-task result dicts with keys:
+      query, triggered, should_trigger, pass, new_artifacts
+    """
+    project_root = Path.cwd()
+    for parent in [project_root, *project_root.parents]:
+        if (parent / ".claude").is_dir():
+            project_root = parent
+            break
+
+    env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
+
+    if parallel_workers > 1:
+        # Parallel path: each task runs in its own temporary git worktree.
+        results: list[dict] = [{}] * len(tasks)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_workers) as executor:
+            future_to_index = {
+                executor.submit(
+                    _run_single_behavioral_task_in_worktree,
+                    task,
+                    project_root,
+                    env,
+                    timeout,
+                    verbose,
+                    runs_per_task,
+                    trigger_threshold,
+                ): idx
+                for idx, task in enumerate(tasks)
+            }
+            for future in concurrent.futures.as_completed(future_to_index):
+                idx = future_to_index[future]
+                try:
+                    results[idx] = future.result()
+                except Exception as exc:
+                    task = tasks[idx]
+                    query = task.get("query", "unknown")
+                    print(f"[behavioral] Task {query!r} raised exception: {exc}", file=sys.stderr)
+                    results[idx] = {
+                        "query": query,
+                        "triggered": False,
+                        "should_trigger": task.get("should_trigger", False),
+                        "pass": False,
+                        "new_artifacts": [],
+                    }
+        return results
+
+    # Sequential path (parallel_workers <= 1): run tasks one at a time in project_root.
+    sequential_results = []
+    for task in tasks:
+        sequential_results.append(
+            _run_single_behavioral_task(
+                task=task,
+                project_root=project_root,
+                worktree_path=project_root,
+                env=env,
+                timeout=timeout,
+                verbose=verbose,
+                runs_per_task=runs_per_task,
+                trigger_threshold=trigger_threshold,
+            )
+        )
+    return sequential_results
 
 
 # ---------------------------------------------------------------------------
@@ -819,6 +1370,12 @@ def assess_target(
     goal: str,
     verbose: bool = False,
     dry_run: bool = False,
+    behavioral_runs_per_task: int = 1,
+    behavioral_trigger_threshold: float = 0.5,
+    parallel_eval_workers: int = 0,
+    candidate_content: str | None = None,
+    baseline_content: str | None = None,
+    eval_mode: str = "auto",
 ) -> dict:
     """Assess a target file against tasks.
 
@@ -828,6 +1385,9 @@ def assess_target(
     - Dry-run: returns synthetic scores for testing loop mechanics.
     - Benchmark (NYI): tasks have 'prompt' + 'name' fields.
 
+    When parallel_eval_workers > 1 and the task set is behavioral, tasks are
+    dispatched in parallel via ThreadPoolExecutor, each in its own git worktree.
+
     Returns scores dict with hard gate booleans and quality dimensions.
     """
     scores: dict = {
@@ -843,7 +1403,7 @@ def assess_target(
         "task_results": [],
     }
 
-    content = target_path.read_text()
+    content = candidate_content if candidate_content is not None else target_path.read_text()
     valid, description = _parse_frontmatter(content)
     if not valid or not description:
         scores["parses"] = False
@@ -878,10 +1438,19 @@ def assess_target(
 
     # Detect assessment mode from task format
     is_behavioral = all(_is_behavioral_task(task) for task in tasks)
-    is_trigger = not is_behavioral and all(_is_trigger_task(task) for task in tasks)
+    is_blind_compare = all(_is_blind_compare_task(task) for task in tasks)
+    is_trigger = not is_behavioral and not is_blind_compare and all(_is_trigger_task(task) for task in tasks)
 
     if is_trigger:
-        results = _run_trigger_rate(target_path, description, tasks, verbose=verbose)
+        task_expectations = {task.get("query", ""): task.get("should_trigger") for task in tasks}
+        results = _run_trigger_rate(
+            target_path,
+            description,
+            tasks,
+            candidate_content=content,
+            eval_mode=eval_mode,
+            verbose=verbose,
+        )
         summary = results.get("summary", {})
         total = summary.get("total", 0)
         passed = summary.get("passed", 0)
@@ -900,6 +1469,9 @@ def assess_target(
             scores["task_results"].append(
                 {
                     "name": r.get("query", "unnamed")[:40],
+                    "query": r.get("query", ""),
+                    "should_trigger": r.get("should_trigger", task_expectations.get(r.get("query", ""))),
+                    "trigger_rate": r.get("trigger_rate", 0.0),
                     "passed": r.get("pass", False),
                     "score": 1.0 if r.get("pass", False) else 0.0,
                     "details": f"trigger_rate={r.get('trigger_rate', 0):.2f}",
@@ -908,7 +1480,16 @@ def assess_target(
         return scores
 
     if is_behavioral:
-        behavioral_results = _run_behavioral_eval(target_path, description, tasks, verbose=verbose)
+        task_expectations = {task.get("query", ""): task.get("should_trigger") for task in tasks}
+        behavioral_results = _run_behavioral_eval(
+            target_path,
+            description,
+            tasks,
+            verbose=verbose,
+            runs_per_task=behavioral_runs_per_task,
+            trigger_threshold=behavioral_trigger_threshold,
+            parallel_workers=parallel_eval_workers,
+        )
         total = len(behavioral_results)
         passed = sum(1 for r in behavioral_results if r.get("pass", False))
         if total == 0:
@@ -927,6 +1508,8 @@ def assess_target(
             scores["task_results"].append(
                 {
                     "name": r.get("query", "unnamed")[:40],
+                    "query": r.get("query", ""),
+                    "should_trigger": r.get("should_trigger", task_expectations.get(r.get("query", ""))),
                     "passed": r.get("pass", False),
                     "score": 1.0 if r.get("pass", False) else 0.0,
                     "details": f"triggered={r.get('triggered')}, artifacts={artifact_summary}",
@@ -934,6 +1517,53 @@ def assess_target(
             )
         return scores
 
+    if is_blind_compare:
+        compare_results = _run_blind_compare_eval(
+            target_path,
+            content,
+            tasks,
+            baseline_content=baseline_content,
+            verbose=verbose,
+        )
+        total = len(compare_results)
+        if total == 0:
+            return scores
+
+        absolute_quality = sum(r.get("candidate_score", 0.0) for r in compare_results) / total
+        wins = sum(1 for r in compare_results if r.get("winner") == "candidate")
+        ties = sum(1 for r in compare_results if r.get("winner") == "tie")
+        comparative_quality = (wins + 0.5 * ties) / total
+
+        scores["correctness"] = round(absolute_quality * 10, 2)
+        scores["error_handling"] = round(absolute_quality * 8, 2)
+        scores["language_idioms"] = round(absolute_quality * 7, 2)
+        scores["testing"] = round(comparative_quality * 8.0, 2)
+        scores["efficiency"] = round(min(1.0, absolute_quality + 0.1) * 6, 2)
+        scores["tests_pass"] = all(r.get("passed", False) for r in compare_results)
+
+        for r in compare_results:
+            scores["task_results"].append(
+                {
+                    "name": r.get("query", "unnamed")[:40],
+                    "query": r.get("query", ""),
+                    "passed": r.get("passed", False),
+                    "score": r.get("candidate_score", 0.0),
+                    "details": (
+                        f"winner={r.get('winner')}; candidate={r.get('candidate_score', 0.0):.2f}; "
+                        f"baseline={r.get('baseline_score', 0.0):.2f}; "
+                        f"candidate_reasons={', '.join(r.get('candidate_reasons', []))}"
+                    ),
+                    "winner": r.get("winner"),
+                    "candidate_score": r.get("candidate_score", 0.0),
+                    "baseline_score": r.get("baseline_score", 0.0),
+                    "candidate_output": r.get("candidate_output", ""),
+                    "baseline_output": r.get("baseline_output", ""),
+                    "candidate_reasons": r.get("candidate_reasons", []),
+                    "baseline_reasons": r.get("baseline_reasons", []),
+                }
+            )
+        return scores
+
     # Benchmark behavioral assessment — not yet implemented.
     # Use trigger-rate tasks ('query' + 'should_trigger') or behavioral tasks
     # ('query' + 'should_trigger' + 'eval_mode: behavioral') per ADR-132.
@@ -972,18 +1602,23 @@ def run_optimization_loop(
     target_path: Path,
     goal: str,
     benchmark_tasks_path: Path,
-    max_iterations: int = 20,
+    max_iterations: int = 1,
     min_gain: float = 0.02,
     train_split: float = 0.6,
-    revert_streak_limit: int = 5,
+    revert_streak_limit: int = 1,
     beam_width: int = 1,
     candidates_per_parent: int = 1,
-    holdout_check_cadence: int = 5,
+    holdout_check_cadence: int = 0,
     model: str | None = None,
     verbose: bool = False,
     report_path: Path | None = None,
     output_dir: Path | None = None,
     dry_run: bool = False,
+    behavioral_runs_per_task: int = 1,
+    behavioral_trigger_threshold: float = 0.5,
+    parallel_eval: int = 0,
+    eval_mode: str = "auto",
+    optimization_scope: str = "description-only",
 ) -> dict:
     """Run the autoresearch optimization loop."""
     if beam_width < 1:
@@ -1003,28 +1638,67 @@ def run_optimization_loop(
     _validate_task_set(all_tasks)
     train_tasks, test_tasks = split_tasks(all_tasks, train_split)
 
+    # Warn and fall back to sequential when --parallel-eval is used with non-behavioral tasks.
+    is_all_behavioral = all(_is_behavioral_task(t) for t in all_tasks)
+    effective_parallel_eval = parallel_eval
+    if parallel_eval > 1 and not is_all_behavioral:
+        print(
+            "[parallel-eval] Warning: --parallel-eval requires eval_mode=behavioral tasks. "
+            "Falling back to sequential evaluation.",
+            file=sys.stderr,
+        )
+        effective_parallel_eval = 0
+
     if verbose:
         print(f"Tasks: {len(train_tasks)} train, {len(test_tasks)} test", file=sys.stderr)
+        if effective_parallel_eval > 1:
+            print(f"Parallel behavioral eval: {effective_parallel_eval} workers", file=sys.stderr)
 
     original_content = target_path.read_text()
     target_valid, target_description = _parse_frontmatter(original_content)
     if not target_valid or not target_description:
         raise ValueError(
             "Target must have YAML frontmatter with a non-empty description. "
-            "optimize_loop.py currently supports frontmatter-description optimization only."
+            "optimize_loop.py requires valid SKILL.md-style frontmatter."
         )
     target_label = target_path.name
 
     if verbose:
         print("Running baseline evaluation...", file=sys.stderr)
 
-    baseline_scores = assess_target(target_path, train_tasks, goal, verbose, dry_run)
+    baseline_scores = assess_target(
+        target_path,
+        train_tasks,
+        goal,
+        verbose,
+        dry_run,
+        behavioral_runs_per_task,
+        behavioral_trigger_threshold,
+        effective_parallel_eval,
+        candidate_content=original_content,
+        eval_mode=eval_mode,
+    )
     baseline_composite = composite_score(baseline_scores)
     best_score = baseline_composite
     best_content = original_content
     best_iteration = 0
 
-    baseline_holdout_scores = assess_target(target_path, test_tasks, goal, verbose, dry_run) if test_tasks else None
+    baseline_holdout_scores = (
+        assess_target(
+            target_path,
+            test_tasks,
+            goal,
+            verbose,
+            dry_run,
+            behavioral_runs_per_task,
+            behavioral_trigger_threshold,
+            effective_parallel_eval,
+            candidate_content=original_content,
+            eval_mode=eval_mode,
+        )
+        if test_tasks
+        else None
+    )
     baseline_holdout = composite_score(baseline_holdout_scores) if baseline_holdout_scores else None
 
     if verbose:
@@ -1048,6 +1722,8 @@ def run_optimization_loop(
     status = "RUNNING"
     total_tokens = 0
     iteration_counter = 0
+    # Maps iteration number → variant content for ACCEPT verdicts (used for best-by-test selection)
+    keep_contents: dict[int, str] = {}
 
     for round_number in range(1, max_iterations + 1):
         if verbose:
@@ -1095,6 +1771,7 @@ def run_optimization_loop(
                         model=model,
                         dry_run=dry_run,
                         iteration_number=iteration_counter,
+                        optimization_scope=optimization_scope,
                         diversification_note=diversification_note,
                     )
                     variant_content = variant_output["variant"]
@@ -1108,7 +1785,7 @@ def run_optimization_loop(
                         print(f"Variant generation failed: {e}", file=sys.stderr)
                     iteration_data = {
                         "number": iteration_counter,
-                        "verdict": "REVERT",
+                        "verdict": "REJECT",
                         "score": {"train": parent["score"], "test": None},
                         "delta": "0",
                         "change_summary": str(e),
@@ -1123,7 +1800,7 @@ def run_optimization_loop(
                         iteration_counter,
                         parent["content"],
                         {},
-                        "REVERT",
+                        "REJECT",
                         "",
                         "",
                         str(e),
@@ -1141,7 +1818,7 @@ def run_optimization_loop(
                         print("REJECTED: Protected sections modified", file=sys.stderr)
                     iteration_data = {
                         "number": iteration_counter,
-                        "verdict": "REVERT",
+                        "verdict": "REJECT",
                         "score": {"train": 0.0, "test": None},
                         "delta": "0",
                         "change_summary": "Protected sections modified",
@@ -1156,7 +1833,7 @@ def run_optimization_loop(
                         iteration_counter,
                         variant_content,
                         {"protected_intact": False},
-                        "REVERT",
+                        "REJECT",
                         "Protected sections modified",
                         diff_text,
                         change_summary,
@@ -1171,7 +1848,7 @@ def run_optimization_loop(
                         print(f"REJECTED: Deleted sections without justification: {deletions}", file=sys.stderr)
                     iteration_data = {
                         "number": iteration_counter,
-                        "verdict": "REVERT",
+                        "verdict": "REJECT",
                         "score": {"train": parent["score"], "test": None},
                         "delta": "0",
                         "change_summary": "Deleted sections without justification",
@@ -1188,7 +1865,7 @@ def run_optimization_loop(
                         iteration_counter,
                         variant_content,
                         {"protected_intact": True},
-                        "REVERT",
+                        "REJECT",
                         "Deleted sections without justification",
                         diff_text,
                         change_summary,
@@ -1199,17 +1876,22 @@ def run_optimization_loop(
                     iteration_by_number[iteration_counter] = iteration_data
                     continue
 
-                temp_target = (
-                    target_path.parent / f".{target_path.stem}_variant_{iteration_counter}{target_path.suffix}"
+                t0 = time.time()
+                variant_scores = assess_target(
+                    target_path,
+                    train_tasks,
+                    goal,
+                    verbose,
+                    dry_run,
+                    behavioral_runs_per_task,
+                    behavioral_trigger_threshold,
+                    effective_parallel_eval,
+                    candidate_content=variant_content,
+                    baseline_content=parent["content"],
+                    eval_mode=eval_mode,
                 )
-                try:
-                    temp_target.write_text(variant_content)
-                    t0 = time.time()
-                    variant_scores = assess_target(temp_target, train_tasks, goal, verbose, dry_run)
-                    eval_elapsed = time.time() - t0
-                    variant_composite = composite_score(variant_scores)
-                finally:
-                    temp_target.unlink(missing_ok=True)
+                eval_elapsed = time.time() - t0
+                variant_composite = composite_score(variant_scores)
 
                 gain = variant_composite - parent["score"]
                 if verbose:
@@ -1220,7 +1902,7 @@ def run_optimization_loop(
                         file=sys.stderr,
                     )
 
-                verdict = "KEEP" if gain > min_gain else "REVERT"
+                verdict = "ACCEPT" if gain > min_gain else "REJECT"
                 if deletions and deletion_justification:
                     change_summary = f"{change_summary} [deletion justified]"
                 delta_str = f"{gain:+.2f}" if gain != 0 else "0"
@@ -1261,12 +1943,15 @@ def run_optimization_loop(
                 iterations.append(iteration_data)
                 iteration_by_number[iteration_counter] = iteration_data
 
-                if verdict == "KEEP":
+                if verdict == "ACCEPT":
                     if variant_composite > best_score:
                         best_score = variant_composite
                         best_content = variant_content
                         best_iteration = iteration_counter
 
+                    # Track content for each ACCEPT so best-by-test can look it up later
+                    keep_contents[iteration_counter] = variant_content
+
                     kept_nodes.append(
                         {
                             "content": variant_content,
@@ -1298,15 +1983,22 @@ def run_optimization_loop(
             rounds_without_keep += 1
 
         if test_tasks and holdout_check_cadence > 0 and round_number % holdout_check_cadence == 0:
-            temp_target = target_path.parent / f".{target_path.stem}_holdout_check{target_path.suffix}"
-            try:
-                temp_target.write_text(best_content)
-                holdout_scores = assess_target(temp_target, test_tasks, goal, verbose, dry_run)
-                holdout_composite = composite_score(holdout_scores)
-                if iterations:
-                    iterations[-1]["score"]["test"] = holdout_composite
-            finally:
-                temp_target.unlink(missing_ok=True)
+            holdout_scores = assess_target(
+                target_path,
+                test_tasks,
+                goal,
+                verbose,
+                dry_run,
+                behavioral_runs_per_task,
+                behavioral_trigger_threshold,
+                effective_parallel_eval,
+                candidate_content=best_content,
+                baseline_content=original_content,
+                eval_mode=eval_mode,
+            )
+            holdout_composite = composite_score(holdout_scores)
+            if iterations:
+                iterations[-1]["score"]["test"] = holdout_composite
 
             if holdout_diverges(best_score, holdout_composite, baseline_holdout, baseline_composite):
                 if verbose:
@@ -1319,7 +2011,7 @@ def run_optimization_loop(
                 break
 
         if rounds_without_keep >= revert_streak_limit:
-            exit_reason = f"converged ({revert_streak_limit} rounds without KEEP by round {round_number})"
+            exit_reason = f"converged ({revert_streak_limit} rounds without ACCEPT by round {round_number})"
             status = "CONVERGED"
             break
 
@@ -1370,6 +2062,48 @@ def run_optimization_loop(
         }
         report_path.write_text(generate_optimization_report(rd, auto_refresh=False))
 
+    # Best-by-test selection: if test tasks exist, prefer the ACCEPT iteration with the
+    # highest held-out test score rather than the highest training score (anti-Goodhart).
+    best_test_score: float | None = None
+    if test_tasks and keep_contents:
+        # Find iterations with a recorded test score (set during holdout cadence checks)
+        scored_keeps = [
+            (it["number"], it["score"]["test"])
+            for it in iterations
+            if it["verdict"] == "ACCEPT" and it["score"].get("test") is not None and it["number"] in keep_contents
+        ]
+        if scored_keeps:
+            best_test_iter, best_test_score = max(scored_keeps, key=lambda x: x[1])
+            if best_test_iter != best_iteration:
+                if verbose:
+                    print(
+                        f"\nBest-by-test: switching from train-best iter {best_iteration} "
+                        f"(train={best_score:.4f}) to test-best iter {best_test_iter} "
+                        f"(test={best_test_score:.4f})",
+                        file=sys.stderr,
+                    )
+                best_content = keep_contents[best_test_iter]
+                best_iteration = best_test_iter
+        else:
+            # No holdout-checked ACCEPT iterations — run a final test eval on best_content
+            if best_iteration > 0:
+                final_test_scores = assess_target(
+                    target_path,
+                    test_tasks,
+                    goal,
+                    verbose,
+                    dry_run,
+                    behavioral_runs_per_task,
+                    behavioral_trigger_threshold,
+                    effective_parallel_eval,
+                    candidate_content=best_content,
+                    baseline_content=original_content,
+                    eval_mode=eval_mode,
+                )
+                best_test_score = composite_score(final_test_scores)
+                if verbose:
+                    print(f"Final test eval on best_content: test={best_test_score:.4f}", file=sys.stderr)
+
     if best_iteration > 0:
         best_path = output_dir / "best_variant.md"
         best_path.write_text(best_content)
@@ -1385,15 +2119,17 @@ def run_optimization_loop(
         "baseline_train_score": baseline_composite,
         "baseline_holdout_score": baseline_holdout,
         "best_score": best_score,
+        "best_test_score": best_test_score,
         "best_iteration": best_iteration,
         "iterations_run": len(iterations),
         "max_iterations": max_iterations,
-        "improvements_found": sum(1 for it in iterations if it["verdict"] == "KEEP"),
+        "improvements_found": sum(1 for it in iterations if it["verdict"] == "ACCEPT"),
         "total_tokens": total_tokens,
         "search_strategy": "beam" if beam_width > 1 or candidates_per_parent > 1 else "hill_climb",
         "beam_width": beam_width,
         "candidates_per_parent": candidates_per_parent,
         "holdout_check_cadence": holdout_check_cadence,
+        "optimization_scope": optimization_scope,
         "train_size": len(train_tasks),
         "test_size": len(test_tasks),
         "iterations": iterations,
@@ -1415,18 +2151,18 @@ def main():
     parser.add_argument(
         "--max-iterations",
         type=int,
-        default=20,
-        help="Max optimization rounds (default: 20); each round evaluates up to beam_width x candidates_per_parent candidates",
+        default=1,
+        help="Max optimization rounds (default: 1, short mode); each round evaluates up to beam_width x candidates_per_parent candidates",
     )
     parser.add_argument("--min-gain", type=float, default=0.02, help="Min score gain to keep (default: 0.02)")
     parser.add_argument("--train-split", type=float, default=0.6, help="Train fraction (default: 0.6)")
     parser.add_argument(
         "--revert-streak-limit",
         type=int,
-        default=5,
-        help="Stop after this many rounds without any KEEP candidates (default: 5)",
+        default=1,
+        help="Stop after this many rounds without any ACCEPT candidates (default: 1, short mode)",
     )
-    parser.add_argument("--beam-width", type=int, default=1, help="Number of kept candidates to retain per round")
+    parser.add_argument("--beam-width", type=int, default=1, help="Number of accepted candidates to retain per round")
     parser.add_argument(
         "--candidates-per-parent",
         type=int,
@@ -1436,8 +2172,8 @@ def main():
     parser.add_argument(
         "--holdout-check-cadence",
         type=int,
-        default=5,
-        help="Check held-out tasks every N rounds (default: 5; 0 disables)",
+        default=0,
+        help="Check held-out tasks every N rounds (default: 0, disabled in short mode)",
     )
     parser.add_argument("--model", default=None, help="Optional Claude Code model override for variant generation")
     parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
@@ -1446,6 +2182,36 @@ def main():
     )
     parser.add_argument("--report", default=None, help="Path for live HTML report")
     parser.add_argument("--output-dir", default=None, help="Directory for iteration snapshots")
+    parser.add_argument(
+        "--behavioral-runs-per-task",
+        type=int,
+        default=1,
+        help="Run each behavioral task query this many times and average results (default: 1)",
+    )
+    parser.add_argument(
+        "--behavioral-trigger-threshold",
+        type=float,
+        default=0.5,
+        help="Fraction of runs that must trigger to count as triggered (default: 0.5)",
+    )
+    parser.add_argument(
+        "--parallel-eval",
+        type=int,
+        default=0,
+        help="Run behavioral eval tasks in parallel with isolated git worktrees (default: 0, disabled)",
+    )
+    parser.add_argument(
+        "--eval-mode",
+        choices=["auto", "registered", "alias"],
+        default="auto",
+        help="Trigger evaluator mode (default: auto; prefers registered-skill worktree eval when possible)",
+    )
+    parser.add_argument(
+        "--optimization-scope",
+        choices=["description-only", "body-only"],
+        default="description-only",
+        help="Which part of the file to mutate (default: description-only)",
+    )
     args = parser.parse_args()
 
     target = Path(args.target)
@@ -1475,6 +2241,11 @@ def main():
             report_path=Path(args.report) if args.report else None,
             output_dir=Path(args.output_dir) if args.output_dir else None,
             dry_run=args.dry_run,
+            behavioral_runs_per_task=args.behavioral_runs_per_task,
+            behavioral_trigger_threshold=args.behavioral_trigger_threshold,
+            parallel_eval=args.parallel_eval,
+            eval_mode=args.eval_mode,
+            optimization_scope=args.optimization_scope,
         )
     except ValueError as e:
         print(f"Error: {e}", file=sys.stderr)
diff --git a/skills/do/.SKILL_variant_3.md b/skills/do/.SKILL_variant_3.md
new file mode 100644
index 00000000..7daa8283
--- /dev/null
+++ b/skills/do/.SKILL_variant_3.md
@@ -0,0 +1,311 @@
+---
+name: do
+description: |
+  Classify user requests and route to the correct agent + skill combination.
+  Use for any user request that needs delegation: code changes, debugging,
+  reviews, content creation, research, or multi-step workflows. Invoked as
+  the primary entry point via "/do [request]". Route all code changes to
+  domain agents. Route all requests beyond pure fact lookups and single
+  reads to agents and skills.
+version: 2.0.0
+user-invocable: true
+argument-hint: "<request>"
+allowed-tools:
+  - Read
+  - Bash
+  - Grep
+  - Glob
+  - Skill
+  - Task
+routing:
+  triggers:
+    - "route task"
+    - "classify request"
+  category: meta-tooling
+---
+
+# /do - Smart Router
+
+/do is a **ROUTER**, not a worker. Its ONLY job is to classify requests, select the right agent + skill, and dispatch. It delegates all execution, implementation, debugging, review, and fixes to specialized agents.
+
+**What the main thread does:** (1) Classify, (2) Select agent+skill, (3) Dispatch via Agent tool, (4) Evaluate if more work needed, (5) Route to ANOTHER agent if yes, (6) Report results.
+
+**The main thread delegates to agents:** code reading (Explore agent), file edits (domain agents), test runs (agent with skill), documentation (technical-documentation-engineer), and all Simple+ tasks.
+
+The main thread is an **orchestrator**. If you find yourself reading source code, writing code, or doing analysis — pause and route to an agent instead.
+
+---
+
+## Instructions
+
+### Phase Banners (MANDATORY)
+
+Every phase MUST display a banner BEFORE executing: `/do > Phase N: PHASE_NAME — description...`
+
+After Phase 2, display the full routing decision banner (`===` block). Phase banners tell the user *where they are*; the routing banner tells them *what was decided*. Both are required.
+
+---
+
+### Phase 1: CLASSIFY
+
+**Goal**: Determine request complexity and whether routing is needed.
+
+Read and follow the repository CLAUDE.md before making any routing decision, because it contains project-specific conventions that affect agent selection and skill pairing.
+
+| Complexity | Agent | Skill | Direct Action |
+|------------|-------|-------|---------------|
+| Trivial | No | No | **ONLY reading a file the user named by exact path** |
+| Simple | **Yes** | Yes | Route to agent |
+| Medium | **Required** | **Required** | Route to agent |
+| Complex | Required (2+) | Required (2+) | Route to agent |
+
+**Trivial = reading a file the user named by exact path.** Everything else is Simple+ and MUST use an agent, skill, or pipeline. When uncertain, classify UP not down — because under-routing wastes implementations while over-routing only wastes tokens, and tokens are cheap but bad code is expensive.
+
+**Common misclassifications** (these are NOT Trivial — route them): evaluating repos/URLs, any opinion/recommendation, git operations, codebase questions (`explore-pipeline`), retro lookups (`retro` skill), comparing approaches.
+
+**Maximize skill/agent/pipeline usage.** If a skill or pipeline exists for the task, USE IT — even if handling directly seems faster, because skills encode domain patterns that prevent common mistakes.
+
+**Check for parallel patterns FIRST**, because independent work items can run concurrently, saving significant time, whereas sequential dispatch when parallel is possible wastes wall-clock time needlessly. Patterns to check: 2+ independent failures or 3+ subtasks → `dispatching-parallel-agents`; broad research → `research-coordinator-engineer`; multi-agent coordination → `project-coordinator-engineer`; plan exists + "execute" → `subagent-driven-development`; new feature → `feature-design` (check `.feature/` directory; if present, use `feature-state.py status` for current phase).
+
+**Optional: Force Direct** — OFF by default. When explicitly enabled, overrides routing for trivial operations. Only applies when the user explicitly requests it.
+
+---
+
+**CRITICAL — Creation Request Detection** (MANDATORY scan BEFORE completing Phase 1):
+
+**Primary test**: "Would fulfilling this request produce a NEW FILE that does not currently exist in the repo?" → YES = creation request, ADR required.
+
+Scan the request for creation signals:
+
+| Signal Type | Pattern Examples |
+|-------------|-----------------|
+| Explicit creation verbs | "create", "scaffold", "build", "add new", "implement new" |
+| Domain object targets | agent, skill, pipeline, hook, feature, plugin, workflow, voice profile |
+| Implicit creation | "I need a [component]", "we need a [component]", "build me a [component]" |
+| Purpose patterns | "build a [component] for X", "create a [component] that does Y" |
+
+**Concrete examples — ALL of these ARE creation requests:**
+- `"build a pipeline for automated security"` → new pipeline files
+- `"create a PostToolUse hook that detects SQL injection"` → new hook file
+- `"I need an agent for Ruby on Rails development"` → new agent file
+- `"scaffold a new skill for database migrations"` → new skill files
+- `"add a new feature for user authentication"` → new feature files
+- `"implement a new workflow for code review"` → new workflow files
+
+**NOT a creation request** (operating on files that already exist):
+- `"debug the existing auth hook"` — fix existing file
+- `"review the payment pipeline"` — read-only inspection of existing files
+- `"fix the error handling in the Go agent"` — modify existing file
+- `"refactor the router logic"` — transform existing file
+- `"explain how the retry skill works"` — explanation only
+- `"run the test suite"` — execution only
+- `"audit the security hooks"` — analysis of existing files
+
+**When ambiguous**: ask "does the user want to CREATE something new that doesn't exist yet, OR improve/inspect something that already exists?" If new → creation. The purpose or topic of the new component (e.g., "for security", "for debugging") does NOT make it a non-creation request — the only deciding factor is whether the target file already exists.
+
+If ANY creation signal is found AND complexity is Simple+:
+1. Set an internal flag: `is_creation = true`
+2. **Phase 4 Step 0 is MANDATORY** — write ADR before dispatching any agent
+
+This early detection exists because Phase 4 Step 0 is the most frequently skipped step in /do. Moving detection to Phase 1 ensures the creation protocol fires before routing decisions consume attention. The Gate below enforces acknowledgment before Phase 2.
+
+**Gate**: Complexity classified. If a creation signal was detected, output `[CREATION REQUEST DETECTED]` before displaying the routing banner. Display routing banner (ALL classifications). If not Trivial, proceed to Phase 2. If Trivial, handle directly after showing banner.
+
+<!-- DO NOT OPTIMIZE -->
+
+---
+
+### Phase 2: ROUTE
+
+**Goal**: Select the correct agent + skill combination from the INDEX files and routing tables.
+
+**Step 1: Check force-route triggers**
+
+Force-route triggers are in `skills/INDEX.json` (field: `force_route: true`). If a force-route trigger matches the request, invoke that skill BEFORE any other action, because force-routes encode critical domain patterns that prevent common mistakes — skipping them causes the exact class of bugs they were designed to prevent.
+
+Check triggers literally against the request text. If triggers match, force-route applies — no exceptions, no judgment calls about whether "it applies here."
+
+Trigger phrases must contain only user-language keywords, never sibling skill names, because the router matches triggers against request text and a sibling skill name would cause false matches. Each trigger phrase must map to exactly one skill — duplicates across skills make deterministic routing impossible.
+
+**Critical**: "push", "commit", "create PR", "merge" are NOT trivial git commands. They MUST route through skills that run quality gates, because running raw `git push`, `git commit`, `gh pr create`, or `gh pr merge` directly bypasses lint checks, test runs, review loops, CI verification, and repo classification.
+
+**Step 2: Select agent + skill**
+
+Read the routing tables in `references/routing-tables.md` and the INDEX files (`agents/INDEX.json`, `skills/INDEX.json`, `pipelines/INDEX.json`) to identify candidates by trigger-overlap. Select the best match; use LLM judgment to tiebreak when multiple candidates fit equally well.
+
+Route to the simplest agent+skill that satisfies the request, because over-engineering the routing itself (stacking unnecessary skills) creates more overhead than it prevents.
+
+When `[cross-repo]` output is present, route to `.claude/agents/` local agents because they contain project-specific knowledge that generic agents lack.
+
+Route all code modifications to domain agents, because domain agents carry language-specific expertise, testing methodology, and quality gates that the router lacks.
+
+**Step 3: Apply skill override** (task verb overrides default skill)
+
+When the request verb implies a specific methodology, override the agent's default skill. Common overrides: "review" → systematic-code-review, "debug" → systematic-debugging, "refactor" → systematic-refactoring, "TDD" → test-driven-development. Full override table in `references/routing-tables.md`.
+
+**Step 4: Display routing decision** (MANDATORY — do this NOW, before anything else)
+
+This banner MUST be the FIRST visible output for EVERY /do invocation. Display BEFORE creating plans, BEFORE invoking agents, BEFORE any work begins. No exceptions.
+
+```
+===================================================================
+ ROUTING: [brief summary]
+===================================================================
+ Selected:
+   -> Agent: [name] - [why]
+   -> Skill: [name] - [why]
+   -> Pipeline: PHASE1 → PHASE2 → ... (if pipeline; phases from pipelines/INDEX.json)
+   -> Anti-Rationalization: [auto-injected for code/security/testing]
+ Invoking...
+===================================================================
+```
+
+For Trivial: show `Classification: Trivial - [reason]` and `Handling directly (no agent/skill needed)`.
+
+**Optional: Dry Run Mode** — OFF by default. When enabled, show the routing decision without executing.
+
+**Optional: Verbose Routing** — OFF by default. When enabled, explain why each alternative was rejected.
+
+**Step 5: Record routing decision** (Simple+ only — skip Trivial):
+
+```bash
+python3 ~/.claude/scripts/learning-db.py record \
+    routing "{selected_agent}:{selected_skill}" \
+    "request: {first_200_chars} | complexity: {complexity} | force_used: {0|1} | llm_override: {0|1} | enhancements: {comma_separated_list}" \
+    --category routing-decision \
+    --tags "{applicable_flags}"
+```
+
+Tags: `force-route`, `llm-override`, `auto-pipeline` (as applicable). This call is advisory — if it fails, continue.
+
+**Gate**: Agent and skill selected. Banner displayed. Routing decision recorded. Proceed to Phase 3.
+
+---
+
+### Phase 3: ENHANCE
+
+**Goal**: Stack additional skills based on signals in the request.
+
+Auto-inject retro knowledge from `learning.db` for any substantive work (benchmark: +5.3 avg, 67% win rate), because historical patterns prevent repeat mistakes. Relevance-gated by FTS5 keyword matching — only inject when keywords overlap.
+
+| Signal in Request | Enhancement to Add |
+|-------------------|-------------------|
+| Any substantive work (code, design, plan) | **Auto-inject retro knowledge** (via `retro-knowledge-injector` hook) |
+| "comprehensive" / "thorough" / "full" | Add parallel reviewers (security + business + quality) |
+| "with tests" / "production ready" | Append test-driven-development + verification-before-completion |
+| "research needed" / "investigate first" | Prepend research-coordinator-engineer |
+| Multiple independent problems (2+) | Use dispatching-parallel-agents |
+| "review" with 5+ files | Use parallel-code-review (3 reviewers) |
+| Complex implementation | Offer subagent-driven-development |
+
+Before stacking any enhancement, check the target skill's `pairs_with` field in `skills/INDEX.json`, because some skills have built-in verification gates that make stacking redundant or harmful. Specifically: an empty `pairs_with: []` means no stacking is allowed. Skills with built-in verification gates handle their own verification. The `fast` skill handles its own testing — stack only compatible enhancements.
+
+**Auto-inject anti-rationalization** for these task types, because these categories are where shortcut rationalization causes the most damage:
+
+| Task Type | Patterns Injected |
+|-----------|-------------------|
+| Code modification | anti-rationalization-core, verification-checklist |
+| Code review | anti-rationalization-core, anti-rationalization-review |
+| Security work | anti-rationalization-core, anti-rationalization-security |
+| Testing | anti-rationalization-core, anti-rationalization-testing |
+| Debugging | anti-rationalization-core, verification-checklist |
+| External content evaluation | **untrusted-content-handling** |
+
+For explicit maximum rigor, use `/with-anti-rationalization [task]`.
+
+**Gate**: Enhancements applied. Proceed to Phase 4.
+
+---
+
+### Phase 4: EXECUTE
+
+**Goal**: Invoke the selected agent + skill and deliver results.
+
+**Step 0: Execute Creation Protocol** (for creation requests ONLY)
+
+If the request contains "create", "new", "scaffold", "build pipeline/agent/skill/hook" AND complexity is Simple+, automatically run this sequence: (1) write an ADR at `adr/{kebab-case-name}.md`, (2) register it via `adr-query.py register`, (3) proceed to plan creation. The `adr-context-injector` and `adr-enforcement` hooks handle cross-agent ADR compliance automatically. This protocol fires automatically because creation requests at Simple+ complexity need architectural grounding before implementation begins.
+
+**Step 1: Create plan** (for Simple+ complexity)
+
+Create `task_plan.md` before execution, because executing without a plan produces wrong results faster — not correct results sooner. The `auto-plan-detector.py` hook auto-injects `<auto-plan-required>` context. Skip only for Trivial tasks.
+
+**Step 2: Invoke agent with skill**
+
+Dispatch the agent. MCP tool discovery is the agent's responsibility — each agent's markdown declares which MCP tools it needs. Do not inject MCP instructions from /do.
+
+Route to agents that create feature branches for all commits, because main branch commits affect everyone and bypassing branch protection causes cascading problems.
+
+When dispatching agents for file modifications, explicitly include "commit your changes on the branch" in the agent prompt, because otherwise the agent completes file edits but changes sit unstaged — the orchestrator assumes committed work and moves on, and changes are lost.
+
+When dispatching agents with `isolation: "worktree"`, inject the `worktree-agent` skill rules into the agent prompt. The skill at `skills/worktree-agent/SKILL.md` contains mandatory rules that prevent worktree isolation failures (leaked changes, branch confusion, auto-plan hook interference). At minimum include: "Verify your CWD contains .claude/worktrees/. Create feature branch before edits. Skip task_plan.md creation (handled by orchestrator). Stage specific files only."
+
+For repos without organization-gated workflows, run up to 3 iterations of `/pr-review` → fix before creating a PR, because post-merge fixes cost 2 PRs instead of 1. For repos under protected organizations (as identified by `scripts/classify-repo.py`), require user confirmation before EACH git action — confirm before executing or merging, because organization-gated repos have compliance requirements that demand explicit approval.
+
+**Step 3: Handle multi-part requests**
+
+Detect: "first...then", "and also", numbered lists, semicolons. Sequential dependencies execute in order. For independent items, launch multiple Task tools in a single message. Max parallelism: 10 agents.
+
+**Step 4: Auto-Pipeline Fallback** (when no agent/skill matches AND complexity >= Simple)
+
+Always invoke `auto-pipeline` for unmatched requests, because a missing agent match is a routing gap to report, and routing overhead is always less than the cost of unreviewed code changes. If no pipeline matches either, fall back to the closest agent + verification-before-completion.
+
+When uncertain which route: **ROUTE ANYWAY.** Add verification-before-completion as safety net. Routing overhead is always less than the cost of unreviewed code changes.
+
+**Gate**: Agent invoked, results delivered. Proceed to Phase 5.
+
+---
+
+### Phase 5: LEARN
+
+**Goal**: Ensure session insights are captured to `learning.db`.
+
+**Routing outcome recording** (Simple+ tasks, observable facts only — no self-grading):
+```bash
+python3 ~/.claude/scripts/learning-db.py record \
+    routing "{selected_agent}:{selected_skill}" \
+    "{existing_value} | tool_errors: {0|1} | user_rerouted: {0|1}" \
+    --category routing-decision
+```
+
+Record only observable facts (tool_errors, user_rerouted) — routing outcome quality is measured by user reroutes, not self-assessment.
+
+**Auto-capture** (hooks, zero LLM cost): `error-learner.py` (PostToolUse), `review-capture.py` (PostToolUse), `session-learning-recorder.py` (Stop).
+
+**Skill-scoped recording** (preferred — one-liner):
+```bash
+python3 ~/.claude/scripts/learning-db.py learn --skill go-testing "insight about testing"
+python3 ~/.claude/scripts/learning-db.py learn --agent golang-general-engineer "insight about agent"
+python3 ~/.claude/scripts/learning-db.py learn "general insight without scope"
+```
+
+**Immediate graduation for review findings** (MANDATORY): When a review finds an issue and it gets fixed in the same PR: (1) Record scoped to responsible agent/skill, (2) Boost to 1.0, (3) Embed into agent anti-patterns, (4) Graduate, (5) Stage changes in same PR. One cycle — no waiting for "multiple observations."
+
+**Gate**: After Simple+ tasks, record at least one learning via `learn`. Review findings get immediate graduation.
+
+---
+
+## Error Handling
+
+### Error: "No Agent Matches Request"
+Cause: Request domain not covered by any agent
+Solution: Check INDEX files and `references/routing-tables.md` for near-matches. Route to closest agent with verification-before-completion. Report the gap.
+
+### Error: "Force-Route Conflict"
+Cause: Multiple force-route triggers match the same request
+Solution: Apply most specific force-route first. Stack secondary routes as enhancements if compatible.
+
+### Error: "Plan Required But Not Created"
+Cause: Simple+ task attempted without task_plan.md
+Solution: Stop execution. Create `task_plan.md`. Resume routing after the plan is in place.
+
+---
+
+## References
+
+### Reference Files
+- `${CLAUDE_SKILL_DIR}/references/routing-tables.md`: Complete category-specific skill routing
+- `agents/INDEX.json`: Agent triggers and metadata
+- `skills/INDEX.json`: Skill triggers, force-route flags, pairs_with
+- `pipelines/INDEX.json`: Pipeline phases, triggers, composition chains
+
+<!-- END DO NOT OPTIMIZE -->
\ No newline at end of file
diff --git a/skills/read-only-ops/SKILL.md b/skills/read-only-ops/SKILL.md
index 70375644..115f4b5e 100644
--- a/skills/read-only-ops/SKILL.md
+++ b/skills/read-only-ops/SKILL.md
@@ -1,10 +1,12 @@
 ---
 name: read-only-ops
 description: |
-  Read-only exploration, status checks, and reporting without modifications.
-  Use when user asks to check status, find files, search code, show state,
-  or explicitly requests read-only investigation. Route to other skills when user wants
-  changes, fixes, refactoring, or any write operation.
+  Read-only exploration, inspection, and reporting without modifications.
+  Use when the user wants to inspect, investigate, audit, survey, or analyze code/files/state
+  without making changes. Common triggers: "inspect this", "report back without changing anything",
+  "show me", "look at", "tell me about", "find files", "check status", "list all", "how many",
+  "where is", or "what is the current state of". Route away when the user wants fixes,
+  refactors, writing, or any write operation.
 version: 2.0.0
 user-invocable: false
 allowed-tools: