diff --git a/evolution/core/config.py b/evolution/core/config.py
index 9e880e4d..4c3d8a46 100644
--- a/evolution/core/config.py
+++ b/evolution/core/config.py
@@ -23,7 +23,7 @@ class EvolutionConfig:
     judge_model: str = "openai/gpt-4.1"  # Model for dataset generation
 
     # Constraints
-    max_skill_size: int = 15_000  # 15KB default
+    max_skill_size: int = 50_000  # 50KB default — evolved skills may include few-shot examples
     max_tool_desc_size: int = 500  # chars
     max_param_desc_size: int = 200  # chars
     max_prompt_growth: float = 0.2  # 20% max growth over baseline
diff --git a/evolution/core/constraints.py b/evolution/core/constraints.py
index d6b13459..fe4c3498 100644
--- a/evolution/core/constraints.py
+++ b/evolution/core/constraints.py
@@ -4,6 +4,7 @@
 considered valid. Failed constraints = immediate rejection.
 """
 
+import re
 import subprocess
 from pathlib import Path
 from dataclasses import dataclass
@@ -148,27 +149,65 @@ def _check_non_empty(self, text: str) -> ConstraintResult:
             )
 
     def _check_skill_structure(self, text: str) -> ConstraintResult:
-        """Check that a skill file has valid YAML frontmatter and markdown body."""
+        """Check that a skill file has valid YAML frontmatter AND a substantive body.
+
+        Frontmatter checks (heuristic substring tests, not a YAML parse):
+        - Stripped text must start with ---
+        - 'name:' must occur within the first 500 characters
+        - 'description:' must occur within the first 500 characters
+
+        Body checks (text after the second --- marker; whole text if absent):
+        - At least 2 of 3 must hold: a markdown heading line, a procedural
+          marker substring (e.g. "step", "1."), more than 100 characters.
+        """
         has_frontmatter = text.strip().startswith("---")
         has_name = "name:" in text[:500] if has_frontmatter else False
         has_description = "description:" in text[:500] if has_frontmatter else False
 
-        if has_frontmatter and has_name and has_description:
+        frontmatter_ok = has_frontmatter and has_name and has_description
+
+        # Body = everything after the second "---"; stays the full text otherwise
+        body = text
+        if has_frontmatter:
+            parts = text.split("---", 2)
+            if len(parts) >= 3:
+                body = parts[2].strip()
+
+        # Body must have ≥2 of 3: headings, procedural content, substantial length
+        has_headings = bool(re.search(r"^#+\s", body, re.MULTILINE))
+        has_steps = any(
+            marker in body.lower()
+            for marker in ["step", "1.", "procedure", "how to", "instructions"]
+        )
+        has_content = len(body.strip()) > 100
+
+        body_checks = {
+            "headings": has_headings,
+            "procedural content": has_steps,
+            "substantial content": has_content,
+        }
+        body_passed = sum(body_checks.values()) >= 2
+
+        if frontmatter_ok and body_passed:
             return ConstraintResult(
                 passed=True,
                 constraint_name="skill_structure",
-                message="Skill has valid frontmatter (name + description)",
-            )
-        else:
-            missing = []
-            if not has_frontmatter:
-                missing.append("YAML frontmatter (---)")
-            if not has_name:
-                missing.append("name field")
-            if not has_description:
-                missing.append("description field")
-            return ConstraintResult(
-                passed=False,
-                constraint_name="skill_structure",
-                message=f"Skill missing: {', '.join(missing)}",
+                message="Skill has valid frontmatter (name + description) and substantive body",
             )
+
+        missing = []
+        if not has_frontmatter:
+            missing.append("YAML frontmatter (---)")
+        if not has_name:
+            missing.append("name field")
+        if not has_description:
+            missing.append("description field")
+        if not body_passed:
+            failed_checks = [k for k, v in body_checks.items() if not v]
+            missing.append(f"body lacks: {', '.join(failed_checks)}")
+
+        return ConstraintResult(
+            passed=False,
+            constraint_name="skill_structure",
+            message=f"Skill missing: {', '.join(missing)}",
+        )
diff --git a/evolution/core/dataset_builder.py b/evolution/core/dataset_builder.py
index 3a430ce1..7118b7f8 100644
--- a/evolution/core/dataset_builder.py
+++ b/evolution/core/dataset_builder.py
@@ -6,17 +6,96 @@
 C) Golden sets — hand-curated JSONL files
 """
 
+import ast
 import json
 import random
+import re
 from pathlib import Path
 from dataclasses import dataclass, field
 from typing import Optional
 
 import dspy
+import os
 
 from evolution.core.config import EvolutionConfig
 
 
+def _try_parse_json(text: str) -> Optional[list]:
+    """Parse a JSON array out of LLM output, tolerating common malformations.
+
+    LLMs frequently produce malformed JSON: trailing commas, single quotes,
+    text wrapped in markdown fences, etc. Candidates are tried from least to
+    most invasive so that well-formed output is never rewritten:
+
+    1. the raw text, 2. the text without markdown fences, 3. the outermost
+    ``[{...}]`` span, then 4. each of those after trailing-comma and
+    single-quote repair. Every candidate is fed to ``json.loads`` and then to
+    ``ast.literal_eval`` (which accepts Python-style literals).
+
+    If no candidate yields a list, every flat ``{...}`` object in the text is
+    parsed on its own and the successfully parsed dicts are returned together.
+
+    Args:
+        text: Raw model output expected to contain a JSON array of
+            objects.
+
+    Returns:
+        The parsed list, or None if nothing parseable was found. Callers must
+        check for None.
+    """
+    text = text.strip()
+    if not text:
+        # Nothing to parse; skip running every strategy on empty input.
+        return None
+
+    def _loads(candidate: str):
+        """Return the parsed value of *candidate*, or None if unparseable."""
+        for parser in (json.loads, ast.literal_eval):
+            try:
+                return parser(candidate)
+            except (ValueError, SyntaxError, TypeError, RecursionError):
+                # json.JSONDecodeError is a ValueError subclass.
+                continue
+        return None
+
+    def _repair(candidate: str) -> str:
+        """Drop trailing commas and convert single-quoted tokens to JSON."""
+        fixed = re.sub(r',\s*([}\]])', r'\1', candidate)
+        return re.sub(r"(?<!')\'([^']+?)'(?=\s*[:,\]\}])", r'"\1"', fixed)
+
+    # Strip markdown code fences (with or without a json tag) around the payload.
+    unfenced = re.sub(r'^```(?:json)?\s*', '', text, flags=re.MULTILINE)
+    unfenced = re.sub(r'\s*```$', '', unfenced)
+
+    candidates = [text, unfenced]
+    # Outermost array-of-objects span, ignoring any surrounding chatter.
+    match = re.search(r'\[\s*\{.*\}\s*\]', text, re.DOTALL)
+    if match:
+        candidates.append(match.group())
+    # Repaired variants go last so clean input is parsed untouched.
+    candidates.extend([_repair(c) for c in candidates])
+
+    # The first candidate that parses to a list wins.
+    for candidate in candidates:
+        parsed = _loads(candidate)
+        if isinstance(parsed, list):
+            return parsed
+
+    # Last resort: salvage each flat {...} object individually. A single
+    # object can never be a list, so collect the dicts instead.
+    salvaged = []
+    for block_match in re.finditer(r'\{[^{}]*\}', text):
+        # Try the block verbatim, then with comma/quote repair.
+        for candidate in (block_match.group(), _repair(block_match.group())):
+            parsed = _loads(candidate)
+            if isinstance(parsed, dict) and parsed:
+                # Non-empty dicts only; skips e.g. Python set literals.
+                salvaged.append(parsed)
+                break
+
+    return salvaged or None
+
+
 @dataclass
 class EvalExample:
     """A single evaluation example."""
@@ -123,7 +202,7 @@ def generate(
         n = num_cases or self.config.eval_dataset_size
 
         # Configure DSPy to use the judge model for generation
-        lm = dspy.LM(self.config.judge_model)
+        lm = dspy.LM(self.config.judge_model, api_base=os.getenv("OPENROUTER_BASE_URL")) if os.getenv("OPENROUTER_BASE_URL") else dspy.LM(self.config.judge_model)
 
         with dspy.context(lm=lm):
             result = self.generator(
@@ -132,17 +211,10 @@ def generate(
                 num_cases=n,
             )
 
-        # Parse the generated test cases
-        try:
-            cases_raw = json.loads(result.test_cases)
-        except json.JSONDecodeError:
-            # Try to extract JSON from the response
-            import re
-            match = re.search(r'\[.*\]', result.test_cases, re.DOTALL)
-            if match:
-                cases_raw = json.loads(match.group())
-            else:
-                raise ValueError(f"Could not parse test cases from LLM output: {result.test_cases[:200]}")
+        # Parse the generated test cases using robust multi-strategy parser
+        cases_raw = _try_parse_json(result.test_cases)
+        if cases_raw is None:
+            raise ValueError(f"Could not parse test cases from LLM output: {result.test_cases[:500]}")
 
         examples = [
             EvalExample(
diff --git a/evolution/core/external_importers.py b/evolution/core/external_importers.py
index 65fe0aaa..3c580729 100644
--- a/evolution/core/external_importers.py
+++ b/evolution/core/external_importers.py
@@ -30,6 +30,7 @@
 
 import click
 import dspy
+import os
 from rich.console import Console
 from rich.progress import Progress
 
@@ -490,7 +491,7 @@ def filter_and_score(
         # Stage 2: LLM relevance scoring
         examples = []
         errors = 0
-        lm = dspy.LM(self.model)
+        lm = dspy.LM(self.model, api_base=os.getenv("OPENROUTER_BASE_URL")) if os.getenv("OPENROUTER_BASE_URL") else dspy.LM(self.model)
 
         with Progress() as progress:
             task = progress.add_task("Scoring relevance...", total=len(candidates))
diff --git a/evolution/core/fitness.py b/evolution/core/fitness.py
index 04f2c78b..1046f866 100644
--- a/evolution/core/fitness.py
+++ b/evolution/core/fitness.py
@@ -5,6 +5,7 @@
 """
 
 import dspy
+import os
 from dataclasses import dataclass
 from typing import Optional
 
@@ -72,7 +73,7 @@ def score(
     ) -> FitnessScore:
         """Score an agent output using LLM-as-judge."""
 
-        lm = dspy.LM(self.config.eval_model)
+        lm = dspy.LM(self.config.eval_model, api_base=os.getenv("OPENROUTER_BASE_URL")) if os.getenv("OPENROUTER_BASE_URL") else dspy.LM(self.config.eval_model)
 
         with dspy.context(lm=lm):
             result = self.judge(
@@ -104,10 +105,10 @@ def score(
         )
 
 
-def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> float:
+def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None, pred_name=None, pred_trace=None) -> float:
     """DSPy-compatible metric function for skill optimization.
 
-    This is what gets passed to dspy.GEPA(metric=...).
+    Accepts 5 args for GEPA compatibility: (gold, pred, trace, pred_name, pred_trace).
     Returns a float 0-1 score.
     """
     # The prediction should have an 'output' field with the agent's response
diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index 8ad4d89c..e3db94f8 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -6,6 +6,7 @@
 """
 
 import json
+import os
 import sys
 import time
 from pathlib import Path
@@ -28,11 +29,54 @@
     load_skill,
     find_skill,
     reassemble_skill,
+    _SKILL_BODY_SENTINEL_,
 )
 
 console = Console()
 
 
+def _extract_evolved_skill_body(module, original_skill_text: str) -> str:
+    """Recover the skill body from a GEPA-optimized SkillModule.
+
+    Recovery strategy (in order of priority):
+    1. Slice the body out of the predictor's signature instructions: the text
+       between the wrapper header and the HTML-comment sentinel. The exact
+       sentinel is tried first, then its bare marker in case the optimizer
+       normalized the blank lines around it.
+    2. Return module.skill_body, the body the module was constructed with.
+       NOTE(review): in evolve() that is the baseline body, so this path
+       yields no evolved text -- confirm optimizers never update it.
+    3. Return original_skill_text as last resort.
+
+    The sentinel (HTML comment) is used because skill bodies often contain
+    "---" markdown dividers that would otherwise corrupt the split.
+    """
+    # Walk predictor.predict.signature, falling back to predictor.signature.
+    # Missing attributes mean "no instructions"; unlike a blanket
+    # `except Exception`, unrelated errors are not swallowed.
+    predictor = getattr(module, "predictor", None)
+    inner = getattr(predictor, "predict", predictor)
+    signature = getattr(inner, "signature", None) or getattr(predictor, "signature", None)
+    evolved_instruction = getattr(signature, "instructions", None)
+
+    skill_header = "Follow these skill instructions to complete the task:\n\n"
+    if evolved_instruction and evolved_instruction.startswith(skill_header):
+        rest = evolved_instruction[len(skill_header):]
+        if _SKILL_BODY_SENTINEL_ in rest:
+            evolved_body = rest.split(_SKILL_BODY_SENTINEL_, 1)[0]
+        else:
+            # Whitespace-tolerant match; "" when the marker is missing entirely.
+            head, found, _ = rest.partition(_SKILL_BODY_SENTINEL_.strip())
+            evolved_body = head.rstrip() if found else ""
+        if evolved_body.strip():
+            return evolved_body
+
+    # Strategies 2 and 3: stored body, then the caller-supplied original.
+    if getattr(module, "skill_body", None):
+        return module.skill_body
+    return original_skill_text
+
+
 def evolve(
     skill_name: str,
     iterations: int = 10,
@@ -119,7 +163,7 @@ def evolve(
     # ── 3. Validate constraints on baseline ─────────────────────────────
     console.print(f"\n[bold]Validating baseline constraints[/bold]")
     validator = ConstraintValidator(config)
-    baseline_constraints = validator.validate_all(skill["body"], "skill")
+    baseline_constraints = validator.validate_all(skill["raw"], "skill")
     all_pass = True
     for c in baseline_constraints:
         icon = "✓" if c.passed else "✗"
@@ -137,11 +181,12 @@ def evolve(
     console.print(f"  Optimizer model: {optimizer_model}")
     console.print(f"  Eval model: {eval_model}")
 
-    # Configure DSPy
-    lm = dspy.LM(eval_model)
+    # Configure DSPy LM with retry handling for rate limits (PR #35)
+    _base = os.getenv("OPENROUTER_BASE_URL")
+    lm = dspy.LM(eval_model, api_base=_base, num_retries=8) if _base else dspy.LM(eval_model, num_retries=8)
     dspy.configure(lm=lm)
 
-    # Create the baseline skill module
+    # Create the baseline skill module — skill text embedded in signature instructions
     baseline_module = SkillModule(skill["body"])
 
     # Prepare DSPy examples
@@ -154,9 +199,13 @@ def evolve(
     start_time = time.time()
 
     try:
+        _base = os.getenv("OPENROUTER_BASE_URL")
+        _ref_lm = dspy.LM(optimizer_model, api_base=_base) if _base else dspy.LM(optimizer_model)
+        # PR #35: use max_metric_calls (not max_full_evals); do NOT mix with auto="light"
         optimizer = dspy.GEPA(
             metric=skill_fitness_metric,
-            max_steps=iterations,
+            max_metric_calls=iterations * 10,  # metric calls budget
+            reflection_lm=_ref_lm,
         )
 
         optimized_module = optimizer.compile(
@@ -167,9 +216,11 @@ def evolve(
     except Exception as e:
         # Fall back to MIPROv2 if GEPA isn't available in this DSPy version
         console.print(f"[yellow]GEPA not available ({e}), falling back to MIPROv2[/yellow]")
+        # PR #35: add num_threads=1 to serialize eval calls and avoid rate limits
         optimizer = dspy.MIPROv2(
             metric=skill_fitness_metric,
             auto="light",
+            num_threads=1,
         )
         optimized_module = optimizer.compile(
             baseline_module,
@@ -179,14 +230,25 @@ def evolve(
     elapsed = time.time() - start_time
     console.print(f"\n  Optimization completed in {elapsed:.1f}s")
 
-    # ── 6. Extract evolved skill text ───────────────────────────────────
-    # The optimized module's instructions contain the evolved skill text
-    evolved_body = optimized_module.skill_text
+    # ── 6. Extract evolved skill body ───────────────────────────────────
+    # The skill body is embedded in signature instructions and GEPA mutated it.
+    # Extract it by stripping the wrapper header/separator that SkillModule added.
+    evolved_body = _extract_evolved_skill_body(optimized_module, skill["body"])
+
+    # Fallback only if extraction produced nothing meaningful (empty or null)
+    if not evolved_body.strip():
+        console.print("[yellow]  ⚠ Could not extract evolved body — using baseline[/yellow]")
+        evolved_body = skill["body"]
+    elif evolved_body == skill["body"]:
+        # Extraction worked but GEPA found no better variant — this is normal
+        console.print("[dim]  (baseline body retained — GEPA found no improved variant)[/dim]")
+
     evolved_full = reassemble_skill(skill["frontmatter"], evolved_body)
 
     # ── 7. Validate evolved skill ───────────────────────────────────────
     console.print(f"\n[bold]Validating evolved skill[/bold]")
-    evolved_constraints = validator.validate_all(evolved_body, "skill", baseline_text=skill["body"])
+    # PR #35: pass evolved_full (reassembled with frontmatter), not body-only
+    evolved_constraints = validator.validate_all(evolved_full, "skill", baseline_text=skill["raw"])
     all_pass = True
     for c in evolved_constraints:
         icon = "✓" if c.passed else "✗"
diff --git a/evolution/skills/skill_module.py b/evolution/skills/skill_module.py
index 6d4d22ed..e5a4099f 100644
--- a/evolution/skills/skill_module.py
+++ b/evolution/skills/skill_module.py
@@ -1,8 +1,8 @@
 """Wraps a SKILL.md file as a DSPy module for optimization.
 
 The key abstraction: a skill file becomes a parameterized DSPy module
-where the skill text is the optimizable parameter. GEPA can then
-mutate the skill text and evaluate the results.
+where the skill text is the optimizable parameter via signature instructions.
+GEPA can then mutate the skill text and evaluate the results.
 """
 
 import re
@@ -12,6 +12,11 @@
 import dspy
 
 
+# Sentinel separating the skill body from the base signature instructions.
+# HTML comment format — markdown may contain HTML comments, but this exact marker is very unlikely.
+_SKILL_BODY_SENTINEL_ = "\n\n<!-- ___SKILL_EVOLUTION_SENTINEL___ -->\n\n"
+
+
 def load_skill(skill_path: Path) -> dict:
     """Load a skill file and parse its frontmatter + body.
 
@@ -84,33 +89,46 @@ def find_skill(skill_name: str, hermes_agent_path: Path) -> Optional[Path]:
 class SkillModule(dspy.Module):
     """A DSPy module that wraps a skill file for optimization.
 
-    The skill text (body) is the parameter that GEPA optimizes.
-    On each forward pass, the module:
-    1. Uses the skill text as instructions
-    2. Processes the task input
-    3. Returns the agent's response
+    The skill text body is embedded in the signature's instructions so that
+    GEPA/MIPROv2 can actually propose mutations to it. The skill text is NOT
+    passed as an InputField (which would make it invisible to DSPy optimizers).
+
+    The original skill body is also stored separately in self.skill_body so
+    it can be recovered after optimization even if the instruction text is
+    replaced entirely by the optimizer.
     """
 
+    def __init__(self, skill_text: str):
+        super().__init__()
+        # Store original body separately — needed for recovery after optimization
+        # since optimizer may replace instruction text entirely.
+        self.skill_body = skill_text
+
+        # Embed skill text in the signature instructions so GEPA can optimize it.
+        # Layout: fixed header, skill body, sentinel marking the body's end, base docstring.
+        base_sig = self.TaskWithSkill
+        base_instructions = base_sig.__doc__ or ""
+        enriched_instructions = (
+            f"Follow these skill instructions to complete the task:\n\n"
+            f"{skill_text}"
+            + _SKILL_BODY_SENTINEL_
+            + base_instructions
+        )
+        custom_sig = base_sig.with_instructions(enriched_instructions)
+        self.predictor = dspy.ChainOfThought(custom_sig)
+
     class TaskWithSkill(dspy.Signature):
         """Complete a task following the provided skill instructions.
 
         You are an AI agent following specific skill instructions to complete a task.
         Read the skill instructions carefully and follow the procedure described.
         """
-        skill_instructions: str = dspy.InputField(desc="The skill instructions to follow")
         task_input: str = dspy.InputField(desc="The task to complete")
         output: str = dspy.OutputField(desc="Your response following the skill instructions")
 
-    def __init__(self, skill_text: str):
-        super().__init__()
-        self.skill_text = skill_text
-        self.predictor = dspy.ChainOfThought(self.TaskWithSkill)
-
     def forward(self, task_input: str) -> dspy.Prediction:
-        result = self.predictor(
-            skill_instructions=self.skill_text,
-            task_input=task_input,
-        )
+        # The skill text travels in the signature instructions, so only the task is passed.
+        result = self.predictor(task_input=task_input)
         return dspy.Prediction(output=result.output)
 
 
diff --git a/output/github-code-review/20260424_004628/baseline_skill.md b/output/github-code-review/20260424_004628/baseline_skill.md
new file mode 100644
index 00000000..8041fbb6
--- /dev/null
+++ b/output/github-code-review/20260424_004628/baseline_skill.md
@@ -0,0 +1,480 @@
+---
+name: github-code-review
+description: Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl.
+version: 1.1.0
+author: Hermes Agent
+license: MIT
+metadata:
+  hermes:
+    tags: [GitHub, Code-Review, Pull-Requests, Git, Quality]
+    related_skills: [github-auth, github-pr-workflow]
+---
+
+# GitHub Code Review
+
+Perform code reviews on local changes before pushing, or review open PRs on GitHub. Most of this skill uses plain `git` — the `gh`/`curl` split only matters for PR-level interactions.
+
+## Prerequisites
+
+- Authenticated with GitHub (see `github-auth` skill)
+- Inside a git repository
+
+### Setup (for PR interactions)
+
+```bash
+if command -v gh &>/dev/null && gh auth status &>/dev/null; then
+  AUTH="gh"
+else
+  AUTH="git"
+  if [ -z "$GITHUB_TOKEN" ]; then
+    if [ -f ~/.hermes/.env ] && grep -q "^GITHUB_TOKEN=" ~/.hermes/.env; then
+      GITHUB_TOKEN=$(grep "^GITHUB_TOKEN=" ~/.hermes/.env | head -1 | cut -d= -f2 | tr -d '\n\r')
+    elif grep -q "github.com" ~/.git-credentials 2>/dev/null; then
+      GITHUB_TOKEN=$(grep "github.com" ~/.git-credentials 2>/dev/null | head -1 | sed 's|https://[^:]*:\([^@]*\)@.*|\1|')
+    fi
+  fi
+fi
+
+REMOTE_URL=$(git remote get-url origin)
+OWNER_REPO=$(echo "$REMOTE_URL" | sed -E 's|.*github\.com[:/]||; s|\.git$||')
+OWNER=$(echo "$OWNER_REPO" | cut -d/ -f1)
+REPO=$(echo "$OWNER_REPO" | cut -d/ -f2)
+```
+
+---
+
+## 1. Reviewing Local Changes (Pre-Push)
+
+This is pure `git` — works everywhere, no API needed.
+
+### Get the Diff
+
+```bash
+# Staged changes (what would be committed)
+git diff --staged
+
+# All changes vs main (what a PR would contain)
+git diff main...HEAD
+
+# File names only
+git diff main...HEAD --name-only
+
+# Stat summary (insertions/deletions per file)
+git diff main...HEAD --stat
+```
+
+### Review Strategy
+
+1. **Get the big picture first:**
+
+```bash
+git diff main...HEAD --stat
+git log main..HEAD --oneline
+```
+
+2. **Review file by file** — use `read_file` on changed files for full context, and the diff to see what changed:
+
+```bash
+git diff main...HEAD -- src/auth/login.py
+```
+
+3. **Check for common issues:**
+
+```bash
+# Debug statements, TODOs, console.logs left behind
+git diff main...HEAD | grep -n "print(\|console\.log\|TODO\|FIXME\|HACK\|XXX\|debugger"
+
+# Large files accidentally staged
+git diff main...HEAD --stat | sort -t'|' -k2 -rn | head -10
+
+# Secrets or credential patterns
+git diff main...HEAD | grep -in "password\|secret\|api_key\|token.*=\|private_key"
+
+# Merge conflict markers
+git diff main...HEAD | grep -n "<<<<<<\|>>>>>>\|======="
+```
+
+4. **Present structured feedback** to the user.
+
+### Review Output Format
+
+When reviewing local changes, present findings in this structure:
+
+```
+## Code Review Summary
+
+### Critical
+- **src/auth.py:45** — SQL injection: user input passed directly to query.
+  Suggestion: Use parameterized queries.
+
+### Warnings
+- **src/models/user.py:23** — Password stored in plaintext. Use bcrypt or argon2.
+- **src/api/routes.py:112** — No rate limiting on login endpoint.
+
+### Suggestions
+- **src/utils/helpers.py:8** — Duplicates logic in `src/core/utils.py:34`. Consolidate.
+- **tests/test_auth.py** — Missing edge case: expired token test.
+
+### Looks Good
+- Clean separation of concerns in the middleware layer
+- Good test coverage for the happy path
+```
+
+---
+
+## 2. Reviewing a Pull Request on GitHub
+
+### View PR Details
+
+**With gh:**
+
+```bash
+gh pr view 123
+gh pr diff 123
+gh pr diff 123 --name-only
+```
+
+**With git + curl:**
+
+```bash
+PR_NUMBER=123
+
+# Get PR details
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "
+import sys, json
+pr = json.load(sys.stdin)
+print(f\"Title: {pr['title']}\")
+print(f\"Author: {pr['user']['login']}\")
+print(f\"Branch: {pr['head']['ref']} -> {pr['base']['ref']}\")
+print(f\"State: {pr['state']}\")
+print(f\"Body:\n{pr['body']}\")"
+
+# List changed files
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/files \
+  | python3 -c "
+import sys, json
+for f in json.load(sys.stdin):
+    print(f\"{f['status']:10} +{f['additions']:-4} -{f['deletions']:-4}  {f['filename']}\")"
+```
+
+### Check Out PR Locally for Full Review
+
+This works with plain `git` — no `gh` needed:
+
+```bash
+# Fetch the PR branch and check it out
+git fetch origin pull/123/head:pr-123
+git checkout pr-123
+
+# Now you can use read_file, search_files, run tests, etc.
+
+# View diff against the base branch
+git diff main...pr-123
+```
+
+**With gh (shortcut):**
+
+```bash
+gh pr checkout 123
+```
+
+### Leave Comments on a PR
+
+**General PR comment — with gh:**
+
+```bash
+gh pr comment 123 --body "Overall looks good, a few suggestions below."
+```
+
+**General PR comment — with curl:**
+
+```bash
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/issues/$PR_NUMBER/comments \
+  -d '{"body": "Overall looks good, a few suggestions below."}'
+```
+
+### Leave Inline Review Comments
+
+**Single inline comment — with gh (via API):**
+
+```bash
+HEAD_SHA=$(gh pr view 123 --json headRefOid --jq '.headRefOid')
+
+gh api repos/$OWNER/$REPO/pulls/123/comments \
+  --method POST \
+  -f body="This could be simplified with a list comprehension." \
+  -f path="src/auth/login.py" \
+  -f commit_id="$HEAD_SHA" \
+  -f line=45 \
+  -f side="RIGHT"
+```
+
+**Single inline comment — with curl:**
+
+```bash
+# Get the head commit SHA
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/comments \
+  -d "{
+    \"body\": \"This could be simplified with a list comprehension.\",
+    \"path\": \"src/auth/login.py\",
+    \"commit_id\": \"$HEAD_SHA\",
+    \"line\": 45,
+    \"side\": \"RIGHT\"
+  }"
+```
+
+### Submit a Formal Review (Approve / Request Changes)
+
+**With gh:**
+
+```bash
+gh pr review 123 --approve --body "LGTM!"
+gh pr review 123 --request-changes --body "See inline comments."
+gh pr review 123 --comment --body "Some suggestions, nothing blocking."
+```
+
+**With curl — multi-comment review submitted atomically:**
+
+```bash
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"COMMENT\",
+    \"body\": \"Code review from Hermes Agent\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"Use parameterized queries to prevent SQL injection.\"},
+      {\"path\": \"src/models/user.py\", \"line\": 23, \"body\": \"Hash passwords with bcrypt before storing.\"},
+      {\"path\": \"tests/test_auth.py\", \"line\": 1, \"body\": \"Add test for expired token edge case.\"}
+    ]
+  }"
+```
+
+Event values: `"APPROVE"`, `"REQUEST_CHANGES"`, `"COMMENT"`
+
+The `line` field refers to the line number in the *new* version of the file. For deleted lines, use `"side": "LEFT"`.
+
+---
+
+## 3. Review Checklist
+
+When performing a code review (local or PR), systematically check:
+
+### Correctness
+- Does the code do what it claims?
+- Edge cases handled (empty inputs, nulls, large data, concurrent access)?
+- Error paths handled gracefully?
+
+### Security
+- No hardcoded secrets, credentials, or API keys
+- Input validation on user-facing inputs
+- No SQL injection, XSS, or path traversal
+- Auth/authz checks where needed
+
+### Code Quality
+- Clear naming (variables, functions, classes)
+- No unnecessary complexity or premature abstraction
+- DRY — no duplicated logic that should be extracted
+- Functions are focused (single responsibility)
+
+### Testing
+- New code paths tested?
+- Happy path and error cases covered?
+- Tests readable and maintainable?
+
+### Performance
+- No N+1 queries or unnecessary loops
+- Appropriate caching where beneficial
+- No blocking operations in async code paths
+
+### Documentation
+- Public APIs documented
+- Non-obvious logic has comments explaining "why"
+- README updated if behavior changed
+
+---
+
+## 4. Pre-Push Review Workflow
+
+When the user asks you to "review the code" or "check before pushing":
+
+1. `git diff main...HEAD --stat` — see scope of changes
+2. `git diff main...HEAD` — read the full diff
+3. For each changed file, use `read_file` if you need more context
+4. Apply the checklist above
+5. Present findings in the structured format (Critical / Warnings / Suggestions / Looks Good)
+6. If critical issues found, offer to fix them before the user pushes
+
+---
+
+## 5. PR Review Workflow (End-to-End)
+
+When the user asks you to "review PR #N", "look at this PR", or gives you a PR URL, follow this recipe:
+
+### Step 1: Set up environment
+
+```bash
+source "${HERMES_HOME:-$HOME/.hermes}/skills/github/github-auth/scripts/gh-env.sh"
+# Or run the inline setup block from the top of this skill
+```
+
+### Step 2: Gather PR context
+
+Get the PR metadata, description, and list of changed files to understand scope before diving into code.
+
+**With gh:**
+```bash
+gh pr view 123
+gh pr diff 123 --name-only
+gh pr checks 123
+```
+
+**With curl:**
+```bash
+PR_NUMBER=123
+
+# PR details (title, author, description, branch)
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER
+
+# Changed files with line counts
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/files
+```
+
+### Step 3: Check out the PR locally
+
+This gives you full access to `read_file`, `search_files`, and the ability to run tests.
+
+```bash
+git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER
+git checkout pr-$PR_NUMBER
+```
+
+### Step 4: Read the diff and understand changes
+
+```bash
+# Full diff against the base branch
+git diff main...HEAD
+
+# Or file-by-file for large PRs
+git diff main...HEAD --name-only
+# Then for each file:
+git diff main...HEAD -- path/to/file.py
+```
+
+For each changed file, use `read_file` to see full context around the changes — diffs alone can miss issues visible only with surrounding code.
+
+### Step 5: Run automated checks locally (if applicable)
+
+```bash
+# Run tests if there's a test suite
+python -m pytest 2>&1 | tail -20
+# or: npm test, cargo test, go test ./..., etc.
+
+# Run linter if configured
+ruff check . 2>&1 | head -30
+# or: eslint, clippy, etc.
+```
+
+### Step 6: Apply the review checklist (Section 3)
+
+Go through each category: Correctness, Security, Code Quality, Testing, Performance, Documentation.
+
+### Step 7: Post the review to GitHub
+
+Collect your findings and submit them as a formal review with inline comments.
+
+**With gh:**
+```bash
+# If no issues — approve
+gh pr review $PR_NUMBER --approve --body "Reviewed by Hermes Agent. Code looks clean — good test coverage, no security concerns."
+
+# If issues found — request changes with inline comments
+gh pr review $PR_NUMBER --request-changes --body "Found a few issues — see inline comments."
+```
+
+**With curl — atomic review with multiple inline comments:**
+```bash
+HEAD_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+# Build the review JSON — event is APPROVE, REQUEST_CHANGES, or COMMENT
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"REQUEST_CHANGES\",
+    \"body\": \"## Hermes Agent Review\n\nFound 2 issues, 1 suggestion. See inline comments.\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"🔴 **Critical:** User input passed directly to SQL query — use parameterized queries.\"},
+      {\"path\": \"src/models.py\", \"line\": 23, \"body\": \"⚠️ **Warning:** Password stored without hashing.\"},
+      {\"path\": \"src/utils.py\", \"line\": 8, \"body\": \"💡 **Suggestion:** This duplicates logic in core/utils.py:34.\"}
+    ]
+  }"
+```
+
+### Step 8: Also post a summary comment
+
+In addition to inline comments, leave a top-level summary so the PR author gets the full picture at a glance. Use the review output format from `references/review-output-template.md`.
+
+**With gh:**
+```bash
+gh pr comment $PR_NUMBER --body "$(cat <<'EOF'
+## Code Review Summary
+
+**Verdict: Changes Requested** (2 issues, 1 suggestion)
+
+### 🔴 Critical
+- **src/auth.py:45** — SQL injection vulnerability
+
+### ⚠️ Warnings
+- **src/models.py:23** — Plaintext password storage
+
+### 💡 Suggestions
+- **src/utils.py:8** — Duplicated logic, consider consolidating
+
+### ✅ Looks Good
+- Clean API design
+- Good error handling in the middleware layer
+
+---
+*Reviewed by Hermes Agent*
+EOF
+)"
+```
+
+### Step 9: Clean up
+
+```bash
+git checkout main
+git branch -D pr-$PR_NUMBER
+```
+
+### Decision: Approve vs Request Changes vs Comment
+
+- **Approve** — no critical or warning-level issues, only minor suggestions or all clear
+- **Request Changes** — any critical or warning-level issue that should be fixed before merge
+- **Comment** — observations and suggestions, but nothing blocking (use when you're unsure or the PR is a draft)
diff --git a/output/github-code-review/20260424_004628/evolved_skill.md b/output/github-code-review/20260424_004628/evolved_skill.md
new file mode 100644
index 00000000..8041fbb6
--- /dev/null
+++ b/output/github-code-review/20260424_004628/evolved_skill.md
@@ -0,0 +1,480 @@
+---
+name: github-code-review
+description: Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl.
+version: 1.1.0
+author: Hermes Agent
+license: MIT
+metadata:
+  hermes:
+    tags: [GitHub, Code-Review, Pull-Requests, Git, Quality]
+    related_skills: [github-auth, github-pr-workflow]
+---
+
+# GitHub Code Review
+
+Perform code reviews on local changes before pushing, or review open PRs on GitHub. Most of this skill uses plain `git` — the `gh`/`curl` split only matters for PR-level interactions.
+
+## Prerequisites
+
+- Authenticated with GitHub (see `github-auth` skill)
+- Inside a git repository
+
+### Setup (for PR interactions)
+
+```bash
+if command -v gh &>/dev/null && gh auth status &>/dev/null; then
+  AUTH="gh"
+else
+  AUTH="git"
+  if [ -z "$GITHUB_TOKEN" ]; then
+    if [ -f ~/.hermes/.env ] && grep -q "^GITHUB_TOKEN=" ~/.hermes/.env; then
+      GITHUB_TOKEN=$(grep "^GITHUB_TOKEN=" ~/.hermes/.env | head -1 | cut -d= -f2 | tr -d '\n\r')
+    elif grep -q "github.com" ~/.git-credentials 2>/dev/null; then
+      GITHUB_TOKEN=$(grep "github.com" ~/.git-credentials 2>/dev/null | head -1 | sed 's|https://[^:]*:\([^@]*\)@.*|\1|')
+    fi
+  fi
+fi
+
+REMOTE_URL=$(git remote get-url origin)
+OWNER_REPO=$(echo "$REMOTE_URL" | sed -E 's|.*github\.com[:/]||; s|\.git$||')
+OWNER=$(echo "$OWNER_REPO" | cut -d/ -f1)
+REPO=$(echo "$OWNER_REPO" | cut -d/ -f2)
+```
+
+---
+
+## 1. Reviewing Local Changes (Pre-Push)
+
+This is pure `git` — works everywhere, no API needed.
+
+### Get the Diff
+
+```bash
+# Staged changes (what would be committed)
+git diff --staged
+
+# All changes vs main (what a PR would contain)
+git diff main...HEAD
+
+# File names only
+git diff main...HEAD --name-only
+
+# Stat summary (insertions/deletions per file)
+git diff main...HEAD --stat
+```
+
+### Review Strategy
+
+1. **Get the big picture first:**
+
+```bash
+git diff main...HEAD --stat
+git log main..HEAD --oneline
+```
+
+2. **Review file by file** — use `read_file` on changed files for full context, and the diff to see what changed:
+
+```bash
+git diff main...HEAD -- src/auth/login.py
+```
+
+3. **Check for common issues:**
+
+```bash
+# Debug statements, TODOs, console.logs left behind
+git diff main...HEAD | grep -n "print(\|console\.log\|TODO\|FIXME\|HACK\|XXX\|debugger"
+
+# Files with the most changed lines first (very large additions are often accidental commits)
+git diff main...HEAD --stat | sort -t'|' -k2 -rn | head -10
+
+# Secrets or credential patterns
+git diff main...HEAD | grep -in "password\|secret\|api_key\|token.*=\|private_key"
+
+# Merge conflict markers
+git diff main...HEAD | grep -n "<<<<<<\|>>>>>>\|======="
+```
+
+4. **Present structured feedback** to the user.
+
+### Review Output Format
+
+When reviewing local changes, present findings in this structure:
+
+```
+## Code Review Summary
+
+### Critical
+- **src/auth.py:45** — SQL injection: user input passed directly to query.
+  Suggestion: Use parameterized queries.
+
+### Warnings
+- **src/models/user.py:23** — Password stored in plaintext. Use bcrypt or argon2.
+- **src/api/routes.py:112** — No rate limiting on login endpoint.
+
+### Suggestions
+- **src/utils/helpers.py:8** — Duplicates logic in `src/core/utils.py:34`. Consolidate.
+- **tests/test_auth.py** — Missing edge case: expired token test.
+
+### Looks Good
+- Clean separation of concerns in the middleware layer
+- Good test coverage for the happy path
+```
+
+---
+
+## 2. Reviewing a Pull Request on GitHub
+
+### View PR Details
+
+**With gh:**
+
+```bash
+gh pr view 123
+gh pr diff 123
+gh pr diff 123 --name-only
+```
+
+**With git + curl:**
+
+```bash
+PR_NUMBER=123
+
+# Get PR details
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "
+import sys, json
+pr = json.load(sys.stdin)
+print(f\"Title: {pr['title']}\")
+print(f\"Author: {pr['user']['login']}\")
+print(f\"Branch: {pr['head']['ref']} -> {pr['base']['ref']}\")
+print(f\"State: {pr['state']}\")
+print(f\"Body:\n{pr['body']}\")"
+
+# List changed files
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/files \
+  | python3 -c "
+import sys, json
+for f in json.load(sys.stdin):
+    print(f\"{f['status']:10} +{f['additions']:-4} -{f['deletions']:-4}  {f['filename']}\")"
+```
+
+### Check Out PR Locally for Full Review
+
+This works with plain `git` — no `gh` needed:
+
+```bash
+# Fetch the PR branch and check it out
+git fetch origin pull/123/head:pr-123
+git checkout pr-123
+
+# Now you can use read_file, search_files, run tests, etc.
+
+# View diff against the base branch
+git diff main...pr-123
+```
+
+**With gh (shortcut):**
+
+```bash
+gh pr checkout 123
+```
+
+### Leave Comments on a PR
+
+**General PR comment — with gh:**
+
+```bash
+gh pr comment 123 --body "Overall looks good, a few suggestions below."
+```
+
+**General PR comment — with curl:**
+
+```bash
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/issues/$PR_NUMBER/comments \
+  -d '{"body": "Overall looks good, a few suggestions below."}'
+```
+
+### Leave Inline Review Comments
+
+**Single inline comment — with gh (via API):**
+
+```bash
+HEAD_SHA=$(gh pr view 123 --json headRefOid --jq '.headRefOid')
+
+gh api repos/$OWNER/$REPO/pulls/123/comments \
+  --method POST \
+  -f body="This could be simplified with a list comprehension." \
+  -f path="src/auth/login.py" \
+  -f commit_id="$HEAD_SHA" \
+  -f line=45 \
+  -f side="RIGHT"
+```
+
+**Single inline comment — with curl:**
+
+```bash
+# Get the head commit SHA
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/comments \
+  -d "{
+    \"body\": \"This could be simplified with a list comprehension.\",
+    \"path\": \"src/auth/login.py\",
+    \"commit_id\": \"$HEAD_SHA\",
+    \"line\": 45,
+    \"side\": \"RIGHT\"
+  }"
+```
+
+### Submit a Formal Review (Approve / Request Changes)
+
+**With gh:**
+
+```bash
+gh pr review 123 --approve --body "LGTM!"
+gh pr review 123 --request-changes --body "See inline comments."
+gh pr review 123 --comment --body "Some suggestions, nothing blocking."
+```
+
+**With curl — multi-comment review submitted atomically:**
+
+```bash
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"COMMENT\",
+    \"body\": \"Code review from Hermes Agent\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"Use parameterized queries to prevent SQL injection.\"},
+      {\"path\": \"src/models/user.py\", \"line\": 23, \"body\": \"Hash passwords with bcrypt before storing.\"},
+      {\"path\": \"tests/test_auth.py\", \"line\": 1, \"body\": \"Add test for expired token edge case.\"}
+    ]
+  }"
+```
+
+Event values: `"APPROVE"`, `"REQUEST_CHANGES"`, `"COMMENT"`
+
+The `line` field refers to the line number in the *new* version of the file. For deleted lines, use `"side": "LEFT"`; `line` then refers to the line number in the *old* version of the file.
+
+---
+
+## 3. Review Checklist
+
+When performing a code review (local or PR), systematically check:
+
+### Correctness
+- Does the code do what it claims?
+- Edge cases handled (empty inputs, nulls, large data, concurrent access)?
+- Error paths handled gracefully?
+
+### Security
+- No hardcoded secrets, credentials, or API keys
+- Input validation on user-facing inputs
+- No SQL injection, XSS, or path traversal
+- Auth/authz checks where needed
+
+### Code Quality
+- Clear naming (variables, functions, classes)
+- No unnecessary complexity or premature abstraction
+- DRY — no duplicated logic that should be extracted
+- Functions are focused (single responsibility)
+
+### Testing
+- New code paths tested?
+- Happy path and error cases covered?
+- Tests readable and maintainable?
+
+### Performance
+- No N+1 queries or unnecessary loops
+- Appropriate caching where beneficial
+- No blocking operations in async code paths
+
+### Documentation
+- Public APIs documented
+- Non-obvious logic has comments explaining "why"
+- README updated if behavior changed
+
+---
+
+## 4. Pre-Push Review Workflow
+
+When the user asks you to "review the code" or "check before pushing":
+
+1. `git diff main...HEAD --stat` — see scope of changes
+2. `git diff main...HEAD` — read the full diff
+3. For each changed file, use `read_file` if you need more context
+4. Apply the checklist above
+5. Present findings in the structured format (Critical / Warnings / Suggestions / Looks Good)
+6. If critical issues found, offer to fix them before the user pushes
+
+---
+
+## 5. PR Review Workflow (End-to-End)
+
+When the user asks you to "review PR #N", "look at this PR", or gives you a PR URL, follow this recipe:
+
+### Step 1: Set up environment
+
+```bash
+source "${HERMES_HOME:-$HOME/.hermes}/skills/github/github-auth/scripts/gh-env.sh"
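+# Or run the inline setup block from the top of this skill. Note: that block sets OWNER/REPO, while the curl commands below use GH_OWNER/GH_REPO — export GH_OWNER="$OWNER" GH_REPO="$REPO" after running it.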
+```
+
+### Step 2: Gather PR context
+
+Get the PR metadata, description, and list of changed files to understand scope before diving into code.
+
+**With gh:**
+```bash
+gh pr view 123
+gh pr diff 123 --name-only
+gh pr checks 123
+```
+
+**With curl:**
+```bash
+PR_NUMBER=123
+
+# PR details (title, author, description, branch)
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER
+
+# Changed files with line counts
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/files
+```
+
+### Step 3: Check out the PR locally
+
+This gives you full access to `read_file`, `search_files`, and the ability to run tests.
+
+```bash
+git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER
+git checkout pr-$PR_NUMBER
+```
+
+### Step 4: Read the diff and understand changes
+
+```bash
+# Full diff against the base branch
+git diff main...HEAD
+
+# Or file-by-file for large PRs
+git diff main...HEAD --name-only
+# Then for each file:
+git diff main...HEAD -- path/to/file.py
+```
+
+For each changed file, use `read_file` to see full context around the changes — diffs alone can miss issues visible only with surrounding code.
+
+### Step 5: Run automated checks locally (if applicable)
+
+```bash
+# Run tests if there's a test suite
+python -m pytest 2>&1 | tail -20
+# or: npm test, cargo test, go test ./..., etc.
+
+# Run linter if configured
+ruff check . 2>&1 | head -30
+# or: eslint, clippy, etc.
+```
+
+### Step 6: Apply the review checklist (Section 3)
+
+Go through each category: Correctness, Security, Code Quality, Testing, Performance, Documentation.
+
+### Step 7: Post the review to GitHub
+
+Collect your findings and submit them as a formal review with inline comments.
+
+**With gh:**
+```bash
+# If no issues — approve
+gh pr review $PR_NUMBER --approve --body "Reviewed by Hermes Agent. Code looks clean — good test coverage, no security concerns."
+
+# If issues found — request changes (gh pr review cannot attach inline comments; post those via `gh api` as shown in Section 2)
+gh pr review $PR_NUMBER --request-changes --body "Found a few issues — see inline comments."
+```
+
+**With curl — atomic review with multiple inline comments:**
+```bash
+HEAD_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+# Build the review JSON — event is APPROVE, REQUEST_CHANGES, or COMMENT
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"REQUEST_CHANGES\",
+    \"body\": \"## Hermes Agent Review\n\nFound 2 issues, 1 suggestion. See inline comments.\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"🔴 **Critical:** User input passed directly to SQL query — use parameterized queries.\"},
+      {\"path\": \"src/models.py\", \"line\": 23, \"body\": \"⚠️ **Warning:** Password stored without hashing.\"},
+      {\"path\": \"src/utils.py\", \"line\": 8, \"body\": \"💡 **Suggestion:** This duplicates logic in core/utils.py:34.\"}
+    ]
+  }"
+```
+
+### Step 8: Also post a summary comment
+
+In addition to inline comments, leave a top-level summary so the PR author gets the full picture at a glance. Use the review output format from `references/review-output-template.md`.
+
+**With gh:**
+```bash
+gh pr comment $PR_NUMBER --body "$(cat <<'EOF'
+## Code Review Summary
+
+**Verdict: Changes Requested** (2 issues, 1 suggestion)
+
+### 🔴 Critical
+- **src/auth.py:45** — SQL injection vulnerability
+
+### ⚠️ Warnings
+- **src/models.py:23** — Plaintext password storage
+
+### 💡 Suggestions
+- **src/utils.py:8** — Duplicated logic, consider consolidating
+
+### ✅ Looks Good
+- Clean API design
+- Good error handling in the middleware layer
+
+---
+*Reviewed by Hermes Agent*
+EOF
+)"
+```
+
+### Step 9: Clean up
+
+```bash
+git checkout main
+git branch -D pr-$PR_NUMBER
+```
+
+### Decision: Approve vs Request Changes vs Comment
+
+- **Approve** — no critical or warning-level issues, only minor suggestions or all clear
+- **Request Changes** — any critical or warning-level issue that should be fixed before merge
+- **Comment** — observations and suggestions, but nothing blocking (use when you're unsure or the PR is a draft)
diff --git a/output/github-code-review/20260424_004628/metrics.json b/output/github-code-review/20260424_004628/metrics.json
new file mode 100644
index 00000000..859e7ee7
--- /dev/null
+++ b/output/github-code-review/20260424_004628/metrics.json
@@ -0,0 +1,17 @@
+{
+  "skill_name": "github-code-review",
+  "timestamp": "20260424_004628",
+  "iterations": 2,
+  "optimizer_model": "openrouter/anthropic/claude-sonnet-4",
+  "eval_model": "openrouter/google/gemini-2.5-flash",
+  "baseline_score": 0.46486634460547505,
+  "evolved_score": 0.46486634460547505,
+  "improvement": 0.0,
+  "baseline_size": 13161,
+  "evolved_size": 13161,
+  "train_examples": 10,
+  "val_examples": 5,
+  "holdout_examples": 5,
+  "elapsed_seconds": 121.41262984275818,
+  "constraints_passed": true
+}
\ No newline at end of file
diff --git a/output/github-code-review/20260424_005952/baseline_skill.md b/output/github-code-review/20260424_005952/baseline_skill.md
new file mode 100644
index 00000000..8041fbb6
--- /dev/null
+++ b/output/github-code-review/20260424_005952/baseline_skill.md
@@ -0,0 +1,480 @@
+---
+name: github-code-review
+description: Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl.
+version: 1.1.0
+author: Hermes Agent
+license: MIT
+metadata:
+  hermes:
+    tags: [GitHub, Code-Review, Pull-Requests, Git, Quality]
+    related_skills: [github-auth, github-pr-workflow]
+---
+
+# GitHub Code Review
+
+Perform code reviews on local changes before pushing, or review open PRs on GitHub. Most of this skill uses plain `git` — the `gh`/`curl` split only matters for PR-level interactions.
+
+## Prerequisites
+
+- Authenticated with GitHub (see `github-auth` skill)
+- Inside a git repository
+
+### Setup (for PR interactions)
+
+```bash
+if command -v gh &>/dev/null && gh auth status &>/dev/null; then
+  AUTH="gh"
+else
+  AUTH="git"
+  if [ -z "$GITHUB_TOKEN" ]; then
+    if [ -f ~/.hermes/.env ] && grep -q "^GITHUB_TOKEN=" ~/.hermes/.env; then
+      GITHUB_TOKEN=$(grep "^GITHUB_TOKEN=" ~/.hermes/.env | head -1 | cut -d= -f2 | tr -d '\n\r')
+    elif grep -q "github.com" ~/.git-credentials 2>/dev/null; then
+      GITHUB_TOKEN=$(grep "github.com" ~/.git-credentials 2>/dev/null | head -1 | sed 's|https://[^:]*:\([^@]*\)@.*|\1|')
+    fi
+  fi
+fi
+
+REMOTE_URL=$(git remote get-url origin)
+OWNER_REPO=$(echo "$REMOTE_URL" | sed -E 's|.*github\.com[:/]||; s|\.git$||')
+OWNER=$(echo "$OWNER_REPO" | cut -d/ -f1)
+REPO=$(echo "$OWNER_REPO" | cut -d/ -f2)
+```
+
+---
+
+## 1. Reviewing Local Changes (Pre-Push)
+
+This is pure `git` — works everywhere, no API needed.
+
+### Get the Diff
+
+```bash
+# Staged changes (what would be committed)
+git diff --staged
+
+# All changes vs main (what a PR would contain)
+git diff main...HEAD
+
+# File names only
+git diff main...HEAD --name-only
+
+# Stat summary (insertions/deletions per file)
+git diff main...HEAD --stat
+```
+
+### Review Strategy
+
+1. **Get the big picture first:**
+
+```bash
+git diff main...HEAD --stat
+git log main..HEAD --oneline
+```
+
+2. **Review file by file** — use `read_file` on changed files for full context, and the diff to see what changed:
+
+```bash
+git diff main...HEAD -- src/auth/login.py
+```
+
+3. **Check for common issues:**
+
+```bash
+# Debug statements, TODOs, console.logs left behind
+git diff main...HEAD | grep -n "print(\|console\.log\|TODO\|FIXME\|HACK\|XXX\|debugger"
+
+# Files with the most changed lines first (very large additions are often accidental commits)
+git diff main...HEAD --stat | sort -t'|' -k2 -rn | head -10
+
+# Secrets or credential patterns
+git diff main...HEAD | grep -in "password\|secret\|api_key\|token.*=\|private_key"
+
+# Merge conflict markers
+git diff main...HEAD | grep -n "<<<<<<\|>>>>>>\|======="
+```
+
+4. **Present structured feedback** to the user.
+
+### Review Output Format
+
+When reviewing local changes, present findings in this structure:
+
+```
+## Code Review Summary
+
+### Critical
+- **src/auth.py:45** — SQL injection: user input passed directly to query.
+  Suggestion: Use parameterized queries.
+
+### Warnings
+- **src/models/user.py:23** — Password stored in plaintext. Use bcrypt or argon2.
+- **src/api/routes.py:112** — No rate limiting on login endpoint.
+
+### Suggestions
+- **src/utils/helpers.py:8** — Duplicates logic in `src/core/utils.py:34`. Consolidate.
+- **tests/test_auth.py** — Missing edge case: expired token test.
+
+### Looks Good
+- Clean separation of concerns in the middleware layer
+- Good test coverage for the happy path
+```
+
+---
+
+## 2. Reviewing a Pull Request on GitHub
+
+### View PR Details
+
+**With gh:**
+
+```bash
+gh pr view 123
+gh pr diff 123
+gh pr diff 123 --name-only
+```
+
+**With git + curl:**
+
+```bash
+PR_NUMBER=123
+
+# Get PR details
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "
+import sys, json
+pr = json.load(sys.stdin)
+print(f\"Title: {pr['title']}\")
+print(f\"Author: {pr['user']['login']}\")
+print(f\"Branch: {pr['head']['ref']} -> {pr['base']['ref']}\")
+print(f\"State: {pr['state']}\")
+print(f\"Body:\n{pr['body']}\")"
+
+# List changed files
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/files \
+  | python3 -c "
+import sys, json
+for f in json.load(sys.stdin):
+    print(f\"{f['status']:10} +{f['additions']:-4} -{f['deletions']:-4}  {f['filename']}\")"
+```
+
+### Check Out PR Locally for Full Review
+
+This works with plain `git` — no `gh` needed:
+
+```bash
+# Fetch the PR branch and check it out
+git fetch origin pull/123/head:pr-123
+git checkout pr-123
+
+# Now you can use read_file, search_files, run tests, etc.
+
+# View diff against the base branch
+git diff main...pr-123
+```
+
+**With gh (shortcut):**
+
+```bash
+gh pr checkout 123
+```
+
+### Leave Comments on a PR
+
+**General PR comment — with gh:**
+
+```bash
+gh pr comment 123 --body "Overall looks good, a few suggestions below."
+```
+
+**General PR comment — with curl:**
+
+```bash
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/issues/$PR_NUMBER/comments \
+  -d '{"body": "Overall looks good, a few suggestions below."}'
+```
+
+### Leave Inline Review Comments
+
+**Single inline comment — with gh (via API):**
+
+```bash
+HEAD_SHA=$(gh pr view 123 --json headRefOid --jq '.headRefOid')
+
+gh api repos/$OWNER/$REPO/pulls/123/comments \
+  --method POST \
+  -f body="This could be simplified with a list comprehension." \
+  -f path="src/auth/login.py" \
+  -f commit_id="$HEAD_SHA" \
+  -f line=45 \
+  -f side="RIGHT"
+```
+
+**Single inline comment — with curl:**
+
+```bash
+# Get the head commit SHA
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/comments \
+  -d "{
+    \"body\": \"This could be simplified with a list comprehension.\",
+    \"path\": \"src/auth/login.py\",
+    \"commit_id\": \"$HEAD_SHA\",
+    \"line\": 45,
+    \"side\": \"RIGHT\"
+  }"
+```
+
+### Submit a Formal Review (Approve / Request Changes)
+
+**With gh:**
+
+```bash
+gh pr review 123 --approve --body "LGTM!"
+gh pr review 123 --request-changes --body "See inline comments."
+gh pr review 123 --comment --body "Some suggestions, nothing blocking."
+```
+
+**With curl — multi-comment review submitted atomically:**
+
+```bash
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"COMMENT\",
+    \"body\": \"Code review from Hermes Agent\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"Use parameterized queries to prevent SQL injection.\"},
+      {\"path\": \"src/models/user.py\", \"line\": 23, \"body\": \"Hash passwords with bcrypt before storing.\"},
+      {\"path\": \"tests/test_auth.py\", \"line\": 1, \"body\": \"Add test for expired token edge case.\"}
+    ]
+  }"
+```
+
+Event values: `"APPROVE"`, `"REQUEST_CHANGES"`, `"COMMENT"`
+
+The `line` field refers to the line number in the *new* version of the file. For deleted lines, use `"side": "LEFT"`; `line` then refers to the line number in the *old* version of the file.
+
+---
+
+## 3. Review Checklist
+
+When performing a code review (local or PR), systematically check:
+
+### Correctness
+- Does the code do what it claims?
+- Edge cases handled (empty inputs, nulls, large data, concurrent access)?
+- Error paths handled gracefully?
+
+### Security
+- No hardcoded secrets, credentials, or API keys
+- Input validation on user-facing inputs
+- No SQL injection, XSS, or path traversal
+- Auth/authz checks where needed
+
+### Code Quality
+- Clear naming (variables, functions, classes)
+- No unnecessary complexity or premature abstraction
+- DRY — no duplicated logic that should be extracted
+- Functions are focused (single responsibility)
+
+### Testing
+- New code paths tested?
+- Happy path and error cases covered?
+- Tests readable and maintainable?
+
+### Performance
+- No N+1 queries or unnecessary loops
+- Appropriate caching where beneficial
+- No blocking operations in async code paths
+
+### Documentation
+- Public APIs documented
+- Non-obvious logic has comments explaining "why"
+- README updated if behavior changed
+
+---
+
+## 4. Pre-Push Review Workflow
+
+When the user asks you to "review the code" or "check before pushing":
+
+1. `git diff main...HEAD --stat` — see scope of changes
+2. `git diff main...HEAD` — read the full diff
+3. For each changed file, use `read_file` if you need more context
+4. Apply the checklist above
+5. Present findings in the structured format (Critical / Warnings / Suggestions / Looks Good)
+6. If critical issues found, offer to fix them before the user pushes
+
+---
+
+## 5. PR Review Workflow (End-to-End)
+
+When the user asks you to "review PR #N", "look at this PR", or gives you a PR URL, follow this recipe:
+
+### Step 1: Set up environment
+
+```bash
+source "${HERMES_HOME:-$HOME/.hermes}/skills/github/github-auth/scripts/gh-env.sh"
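+# Or run the inline setup block from the top of this skill. Note: that block sets OWNER/REPO, while the curl commands below use GH_OWNER/GH_REPO — export GH_OWNER="$OWNER" GH_REPO="$REPO" after running it.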
+```
+
+### Step 2: Gather PR context
+
+Get the PR metadata, description, and list of changed files to understand scope before diving into code.
+
+**With gh:**
+```bash
+gh pr view 123
+gh pr diff 123 --name-only
+gh pr checks 123
+```
+
+**With curl:**
+```bash
+PR_NUMBER=123
+
+# PR details (title, author, description, branch)
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER
+
+# Changed files with line counts
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/files
+```
+
+### Step 3: Check out the PR locally
+
+This gives you full access to `read_file`, `search_files`, and the ability to run tests.
+
+```bash
+git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER
+git checkout pr-$PR_NUMBER
+```
+
+### Step 4: Read the diff and understand changes
+
+```bash
+# Full diff against the base branch
+git diff main...HEAD
+
+# Or file-by-file for large PRs
+git diff main...HEAD --name-only
+# Then for each file:
+git diff main...HEAD -- path/to/file.py
+```
+
+For each changed file, use `read_file` to see full context around the changes — diffs alone can miss issues visible only with surrounding code.
+
+### Step 5: Run automated checks locally (if applicable)
+
+```bash
+# Run tests if there's a test suite
+python -m pytest 2>&1 | tail -20
+# or: npm test, cargo test, go test ./..., etc.
+
+# Run linter if configured
+ruff check . 2>&1 | head -30
+# or: eslint, clippy, etc.
+```
+
+### Step 6: Apply the review checklist (Section 3)
+
+Go through each category: Correctness, Security, Code Quality, Testing, Performance, Documentation.
+
+### Step 7: Post the review to GitHub
+
+Collect your findings and submit them as a formal review with inline comments.
+
+**With gh:**
+```bash
+# If no issues — approve
+gh pr review $PR_NUMBER --approve --body "Reviewed by Hermes Agent. Code looks clean — good test coverage, no security concerns."
+
+# If issues found — request changes (gh pr review cannot attach inline comments; post those via `gh api` as shown in Section 2)
+gh pr review $PR_NUMBER --request-changes --body "Found a few issues — see inline comments."
+```
+
+**With curl — atomic review with multiple inline comments:**
+```bash
+HEAD_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+# Build the review JSON — event is APPROVE, REQUEST_CHANGES, or COMMENT
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"REQUEST_CHANGES\",
+    \"body\": \"## Hermes Agent Review\n\nFound 2 issues, 1 suggestion. See inline comments.\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"🔴 **Critical:** User input passed directly to SQL query — use parameterized queries.\"},
+      {\"path\": \"src/models.py\", \"line\": 23, \"body\": \"⚠️ **Warning:** Password stored without hashing.\"},
+      {\"path\": \"src/utils.py\", \"line\": 8, \"body\": \"💡 **Suggestion:** This duplicates logic in core/utils.py:34.\"}
+    ]
+  }"
+```
+
+### Step 8: Also post a summary comment
+
+In addition to inline comments, leave a top-level summary so the PR author gets the full picture at a glance. Use the review output format from `references/review-output-template.md`.
+
+**With gh:**
+```bash
+gh pr comment $PR_NUMBER --body "$(cat <<'EOF'
+## Code Review Summary
+
+**Verdict: Changes Requested** (2 issues, 1 suggestion)
+
+### 🔴 Critical
+- **src/auth.py:45** — SQL injection vulnerability
+
+### ⚠️ Warnings
+- **src/models.py:23** — Plaintext password storage
+
+### 💡 Suggestions
+- **src/utils.py:8** — Duplicated logic, consider consolidating
+
+### ✅ Looks Good
+- Clean API design
+- Good error handling in the middleware layer
+
+---
+*Reviewed by Hermes Agent*
+EOF
+)"
+```
+
+### Step 9: Clean up
+
+```bash
+git checkout main
+git branch -D pr-$PR_NUMBER
+```
+
+### Decision: Approve vs Request Changes vs Comment
+
+- **Approve** — no critical or warning-level issues, only minor suggestions or all clear
+- **Request Changes** — any critical or warning-level issue that should be fixed before merge
+- **Comment** — observations and suggestions, but nothing blocking (use when you're unsure or the PR is a draft)
diff --git a/output/github-code-review/20260424_005952/evolved_skill.md b/output/github-code-review/20260424_005952/evolved_skill.md
new file mode 100644
index 00000000..8041fbb6
--- /dev/null
+++ b/output/github-code-review/20260424_005952/evolved_skill.md
@@ -0,0 +1,480 @@
+---
+name: github-code-review
+description: Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl.
+version: 1.1.0
+author: Hermes Agent
+license: MIT
+metadata:
+  hermes:
+    tags: [GitHub, Code-Review, Pull-Requests, Git, Quality]
+    related_skills: [github-auth, github-pr-workflow]
+---
+
+# GitHub Code Review
+
+Perform code reviews on local changes before pushing, or review open PRs on GitHub. Most of this skill uses plain `git` — the `gh`/`curl` split only matters for PR-level interactions.
+
+## Prerequisites
+
+- Authenticated with GitHub (see `github-auth` skill)
+- Inside a git repository
+
+### Setup (for PR interactions)
+
+```bash
+if command -v gh &>/dev/null && gh auth status &>/dev/null; then
+  AUTH="gh"
+else
+  AUTH="git"
+  if [ -z "$GITHUB_TOKEN" ]; then
+    if [ -f ~/.hermes/.env ] && grep -q "^GITHUB_TOKEN=" ~/.hermes/.env; then
+      GITHUB_TOKEN=$(grep "^GITHUB_TOKEN=" ~/.hermes/.env | head -1 | cut -d= -f2 | tr -d '\n\r')
+    elif grep -q "github.com" ~/.git-credentials 2>/dev/null; then
+      GITHUB_TOKEN=$(grep "github.com" ~/.git-credentials 2>/dev/null | head -1 | sed 's|https://[^:]*:\([^@]*\)@.*|\1|')
+    fi
+  fi
+fi
+
+REMOTE_URL=$(git remote get-url origin)
+OWNER_REPO=$(echo "$REMOTE_URL" | sed -E 's|.*github\.com[:/]||; s|\.git$||')
+OWNER=$(echo "$OWNER_REPO" | cut -d/ -f1)
+REPO=$(echo "$OWNER_REPO" | cut -d/ -f2)
+```
+
+---
+
+## 1. Reviewing Local Changes (Pre-Push)
+
+This is pure `git` — works everywhere, no API needed.
+
+### Get the Diff
+
+```bash
+# Staged changes (what would be committed)
+git diff --staged
+
+# All changes vs main (what a PR would contain)
+git diff main...HEAD
+
+# File names only
+git diff main...HEAD --name-only
+
+# Stat summary (insertions/deletions per file)
+git diff main...HEAD --stat
+```
+
+### Review Strategy
+
+1. **Get the big picture first:**
+
+```bash
+git diff main...HEAD --stat
+git log main..HEAD --oneline
+```
+
+2. **Review file by file** — use `read_file` on changed files for full context, and the diff to see what changed:
+
+```bash
+git diff main...HEAD -- src/auth/login.py
+```
+
+3. **Check for common issues:**
+
+```bash
+# Debug statements, TODOs, console.logs left behind
+git diff main...HEAD | grep -n "print(\|console\.log\|TODO\|FIXME\|HACK\|XXX\|debugger"
+
+# Large files accidentally staged
+git diff main...HEAD --stat | sort -t'|' -k2 -rn | head -10
+
+# Secrets or credential patterns
+git diff main...HEAD | grep -in "password\|secret\|api_key\|token.*=\|private_key"
+
+# Merge conflict markers
+git diff main...HEAD | grep -n "<<<<<<\|>>>>>>\|======="
+```
+
+4. **Present structured feedback** to the user.
+
+### Review Output Format
+
+When reviewing local changes, present findings in this structure:
+
+```
+## Code Review Summary
+
+### Critical
+- **src/auth.py:45** — SQL injection: user input passed directly to query.
+  Suggestion: Use parameterized queries.
+
+### Warnings
+- **src/models/user.py:23** — Password stored in plaintext. Use bcrypt or argon2.
+- **src/api/routes.py:112** — No rate limiting on login endpoint.
+
+### Suggestions
+- **src/utils/helpers.py:8** — Duplicates logic in `src/core/utils.py:34`. Consolidate.
+- **tests/test_auth.py** — Missing edge case: expired token test.
+
+### Looks Good
+- Clean separation of concerns in the middleware layer
+- Good test coverage for the happy path
+```
+
+---
+
+## 2. Reviewing a Pull Request on GitHub
+
+### View PR Details
+
+**With gh:**
+
+```bash
+gh pr view 123
+gh pr diff 123
+gh pr diff 123 --name-only
+```
+
+**With git + curl:**
+
+```bash
+PR_NUMBER=123
+
+# Get PR details
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "
+import sys, json
+pr = json.load(sys.stdin)
+print(f\"Title: {pr['title']}\")
+print(f\"Author: {pr['user']['login']}\")
+print(f\"Branch: {pr['head']['ref']} -> {pr['base']['ref']}\")
+print(f\"State: {pr['state']}\")
+print(f\"Body:\n{pr['body']}\")"
+
+# List changed files
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/files \
+  | python3 -c "
+import sys, json
+for f in json.load(sys.stdin):
+    print(f\"{f['status']:10} +{f['additions']:-4} -{f['deletions']:-4}  {f['filename']}\")"
+```
+
+### Check Out PR Locally for Full Review
+
+This works with plain `git` — no `gh` needed:
+
+```bash
+# Fetch the PR branch and check it out
+git fetch origin pull/123/head:pr-123
+git checkout pr-123
+
+# Now you can use read_file, search_files, run tests, etc.
+
+# View diff against the base branch
+git diff main...pr-123
+```
+
+**With gh (shortcut):**
+
+```bash
+gh pr checkout 123
+```
+
+### Leave Comments on a PR
+
+**General PR comment — with gh:**
+
+```bash
+gh pr comment 123 --body "Overall looks good, a few suggestions below."
+```
+
+**General PR comment — with curl:**
+
+```bash
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/issues/$PR_NUMBER/comments \
+  -d '{"body": "Overall looks good, a few suggestions below."}'
+```
+
+### Leave Inline Review Comments
+
+**Single inline comment — with gh (via API):**
+
+```bash
+HEAD_SHA=$(gh pr view 123 --json headRefOid --jq '.headRefOid')
+
+gh api repos/$OWNER/$REPO/pulls/123/comments \
+  --method POST \
+  -f body="This could be simplified with a list comprehension." \
+  -f path="src/auth/login.py" \
+  -f commit_id="$HEAD_SHA" \
+  -F line=45 \
+  -f side="RIGHT"
+```
+
+**Single inline comment — with curl:**
+
+```bash
+# Get the head commit SHA
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/comments \
+  -d "{
+    \"body\": \"This could be simplified with a list comprehension.\",
+    \"path\": \"src/auth/login.py\",
+    \"commit_id\": \"$HEAD_SHA\",
+    \"line\": 45,
+    \"side\": \"RIGHT\"
+  }"
+```
+
+### Submit a Formal Review (Approve / Request Changes)
+
+**With gh:**
+
+```bash
+gh pr review 123 --approve --body "LGTM!"
+gh pr review 123 --request-changes --body "See inline comments."
+gh pr review 123 --comment --body "Some suggestions, nothing blocking."
+```
+
+**With curl — multi-comment review submitted atomically:**
+
+```bash
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"COMMENT\",
+    \"body\": \"Code review from Hermes Agent\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"Use parameterized queries to prevent SQL injection.\"},
+      {\"path\": \"src/models/user.py\", \"line\": 23, \"body\": \"Hash passwords with bcrypt before storing.\"},
+      {\"path\": \"tests/test_auth.py\", \"line\": 1, \"body\": \"Add test for expired token edge case.\"}
+    ]
+  }"
+```
+
+Event values: `"APPROVE"`, `"REQUEST_CHANGES"`, `"COMMENT"`
+
+The `line` field refers to the line number in the *new* version of the file. To comment on a deleted line, add `"side": "LEFT"`, in which case `line` is the line number in the *old* version of the file.
+
+---
+
+## 3. Review Checklist
+
+When performing a code review (local or PR), systematically check:
+
+### Correctness
+- Does the code do what it claims?
+- Edge cases handled (empty inputs, nulls, large data, concurrent access)?
+- Error paths handled gracefully?
+
+### Security
+- No hardcoded secrets, credentials, or API keys
+- Input validation on user-facing inputs
+- No SQL injection, XSS, or path traversal
+- Auth/authz checks where needed
+
+### Code Quality
+- Clear naming (variables, functions, classes)
+- No unnecessary complexity or premature abstraction
+- DRY — no duplicated logic that should be extracted
+- Functions are focused (single responsibility)
+
+### Testing
+- New code paths tested?
+- Happy path and error cases covered?
+- Tests readable and maintainable?
+
+### Performance
+- No N+1 queries or unnecessary loops
+- Appropriate caching where beneficial
+- No blocking operations in async code paths
+
+### Documentation
+- Public APIs documented
+- Non-obvious logic has comments explaining "why"
+- README updated if behavior changed
+
+---
+
+## 4. Pre-Push Review Workflow
+
+When the user asks you to "review the code" or "check before pushing":
+
+1. `git diff main...HEAD --stat` — see scope of changes
+2. `git diff main...HEAD` — read the full diff
+3. For each changed file, use `read_file` if you need more context
+4. Apply the checklist above
+5. Present findings in the structured format (Critical / Warnings / Suggestions / Looks Good)
+6. If critical issues found, offer to fix them before the user pushes
+
+---
+
+## 5. PR Review Workflow (End-to-End)
+
+When the user asks you to "review PR #N", "look at this PR", or gives you a PR URL, follow this recipe:
+
+### Step 1: Set up environment
+
+```bash
+source "${HERMES_HOME:-$HOME/.hermes}/skills/github/github-auth/scripts/gh-env.sh"
+# Or run the inline setup block from the top of this skill, then alias its variables: GH_OWNER=$OWNER GH_REPO=$REPO
+```
+
+### Step 2: Gather PR context
+
+Get the PR metadata, description, and list of changed files to understand scope before diving into code.
+
+**With gh:**
+```bash
+gh pr view 123
+gh pr diff 123 --name-only
+gh pr checks 123
+```
+
+**With curl:**
+```bash
+PR_NUMBER=123
+
+# PR details (title, author, description, branch)
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER
+
+# Changed files with line counts
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/files
+```
+
+### Step 3: Check out the PR locally
+
+This gives you full access to `read_file`, `search_files`, and the ability to run tests.
+
+```bash
+git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER
+git checkout pr-$PR_NUMBER
+```
+
+### Step 4: Read the diff and understand changes
+
+```bash
+# Full diff against the base branch
+git diff main...HEAD
+
+# Or file-by-file for large PRs
+git diff main...HEAD --name-only
+# Then for each file:
+git diff main...HEAD -- path/to/file.py
+```
+
+For each changed file, use `read_file` to see full context around the changes — diffs alone can miss issues visible only with surrounding code.
+
+### Step 5: Run automated checks locally (if applicable)
+
+```bash
+# Run tests if there's a test suite
+python -m pytest 2>&1 | tail -20
+# or: npm test, cargo test, go test ./..., etc.
+
+# Run linter if configured
+ruff check . 2>&1 | head -30
+# or: eslint, clippy, etc.
+```
+
+### Step 6: Apply the review checklist (Section 3)
+
+Go through each category: Correctness, Security, Code Quality, Testing, Performance, Documentation.
+
+### Step 7: Post the review to GitHub
+
+Collect your findings and submit them as a formal review with inline comments.
+
+**With gh:**
+```bash
+# If no issues — approve
+gh pr review $PR_NUMBER --approve --body "Reviewed by Hermes Agent. Code looks clean — good test coverage, no security concerns."
+
+# If issues found — request changes with inline comments
+gh pr review $PR_NUMBER --request-changes --body "Found a few issues — see inline comments."
+```
+
+**With curl — atomic review with multiple inline comments:**
+```bash
+HEAD_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+# Build the review JSON — event is APPROVE, REQUEST_CHANGES, or COMMENT
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"REQUEST_CHANGES\",
+    \"body\": \"## Hermes Agent Review\n\nFound 2 issues, 1 suggestion. See inline comments.\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"🔴 **Critical:** User input passed directly to SQL query — use parameterized queries.\"},
+      {\"path\": \"src/models.py\", \"line\": 23, \"body\": \"⚠️ **Warning:** Password stored without hashing.\"},
+      {\"path\": \"src/utils.py\", \"line\": 8, \"body\": \"💡 **Suggestion:** This duplicates logic in core/utils.py:34.\"}
+    ]
+  }"
+```
+
+### Step 8: Also post a summary comment
+
+In addition to inline comments, leave a top-level summary so the PR author gets the full picture at a glance. Use the review output format from `references/review-output-template.md`.
+
+**With gh:**
+```bash
+gh pr comment $PR_NUMBER --body "$(cat <<'EOF'
+## Code Review Summary
+
+**Verdict: Changes Requested** (2 issues, 1 suggestion)
+
+### 🔴 Critical
+- **src/auth.py:45** — SQL injection vulnerability
+
+### ⚠️ Warnings
+- **src/models.py:23** — Plaintext password storage
+
+### 💡 Suggestions
+- **src/utils.py:8** — Duplicated logic, consider consolidating
+
+### ✅ Looks Good
+- Clean API design
+- Good error handling in the middleware layer
+
+---
+*Reviewed by Hermes Agent*
+EOF
+)"
+```
+
+### Step 9: Clean up
+
+```bash
+git checkout main
+git branch -D pr-$PR_NUMBER
+```
+
+### Decision: Approve vs Request Changes vs Comment
+
+- **Approve** — no critical or warning-level issues, only minor suggestions or all clear
+- **Request Changes** — any critical or warning-level issue that should be fixed before merge
+- **Comment** — observations and suggestions, but nothing blocking (use when you're unsure or the PR is a draft)
diff --git a/output/github-code-review/20260424_005952/metrics.json b/output/github-code-review/20260424_005952/metrics.json
new file mode 100644
index 00000000..2cf3f020
--- /dev/null
+++ b/output/github-code-review/20260424_005952/metrics.json
@@ -0,0 +1,17 @@
+{
+  "skill_name": "github-code-review",
+  "timestamp": "20260424_005952",
+  "iterations": 2,
+  "optimizer_model": "openrouter/anthropic/claude-sonnet-4",
+  "eval_model": "openrouter/google/gemini-2.5-flash",
+  "baseline_score": 0.5408152861238757,
+  "evolved_score": 0.520067397195711,
+  "improvement": -0.020747888928164704,
+  "baseline_size": 13161,
+  "evolved_size": 13161,
+  "train_examples": 10,
+  "val_examples": 5,
+  "holdout_examples": 5,
+  "elapsed_seconds": 169.20341062545776,
+  "constraints_passed": true
+}
\ No newline at end of file
diff --git a/output/github-code-review/20260424_010955/baseline_skill.md b/output/github-code-review/20260424_010955/baseline_skill.md
new file mode 100644
index 00000000..8041fbb6
--- /dev/null
+++ b/output/github-code-review/20260424_010955/baseline_skill.md
@@ -0,0 +1,480 @@
+---
+name: github-code-review
+description: Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl.
+version: 1.1.0
+author: Hermes Agent
+license: MIT
+metadata:
+  hermes:
+    tags: [GitHub, Code-Review, Pull-Requests, Git, Quality]
+    related_skills: [github-auth, github-pr-workflow]
+---
+
+# GitHub Code Review
+
+Perform code reviews on local changes before pushing, or review open PRs on GitHub. Most of this skill uses plain `git` — the `gh`/`curl` split only matters for PR-level interactions.
+
+## Prerequisites
+
+- Authenticated with GitHub (see `github-auth` skill)
+- Inside a git repository
+
+### Setup (for PR interactions)
+
+```bash
+if command -v gh &>/dev/null && gh auth status &>/dev/null; then
+  AUTH="gh"
+else
+  AUTH="git"
+  if [ -z "$GITHUB_TOKEN" ]; then
+    if [ -f ~/.hermes/.env ] && grep -q "^GITHUB_TOKEN=" ~/.hermes/.env; then
+      GITHUB_TOKEN=$(grep "^GITHUB_TOKEN=" ~/.hermes/.env | head -1 | cut -d= -f2 | tr -d '\n\r')
+    elif grep -q "github.com" ~/.git-credentials 2>/dev/null; then
+      GITHUB_TOKEN=$(grep "github.com" ~/.git-credentials 2>/dev/null | head -1 | sed 's|https://[^:]*:\([^@]*\)@.*|\1|')
+    fi
+  fi
+fi
+
+REMOTE_URL=$(git remote get-url origin)
+OWNER_REPO=$(echo "$REMOTE_URL" | sed -E 's|.*github\.com[:/]||; s|\.git$||')
+OWNER=$(echo "$OWNER_REPO" | cut -d/ -f1)
+REPO=$(echo "$OWNER_REPO" | cut -d/ -f2)
+```
+
+---
+
+## 1. Reviewing Local Changes (Pre-Push)
+
+This is pure `git` — works everywhere, no API needed.
+
+### Get the Diff
+
+```bash
+# Staged changes (what would be committed)
+git diff --staged
+
+# All changes vs main (what a PR would contain)
+git diff main...HEAD
+
+# File names only
+git diff main...HEAD --name-only
+
+# Stat summary (insertions/deletions per file)
+git diff main...HEAD --stat
+```
+
+### Review Strategy
+
+1. **Get the big picture first:**
+
+```bash
+git diff main...HEAD --stat
+git log main..HEAD --oneline
+```
+
+2. **Review file by file** — use `read_file` on changed files for full context, and the diff to see what changed:
+
+```bash
+git diff main...HEAD -- src/auth/login.py
+```
+
+3. **Check for common issues:**
+
+```bash
+# Debug statements, TODOs, console.logs left behind
+git diff main...HEAD | grep -n "print(\|console\.log\|TODO\|FIXME\|HACK\|XXX\|debugger"
+
+# Large files accidentally staged
+git diff main...HEAD --stat | sort -t'|' -k2 -rn | head -10
+
+# Secrets or credential patterns
+git diff main...HEAD | grep -in "password\|secret\|api_key\|token.*=\|private_key"
+
+# Merge conflict markers
+git diff main...HEAD | grep -n "<<<<<<\|>>>>>>\|======="
+```
+
+4. **Present structured feedback** to the user.
+
+### Review Output Format
+
+When reviewing local changes, present findings in this structure:
+
+```
+## Code Review Summary
+
+### Critical
+- **src/auth.py:45** — SQL injection: user input passed directly to query.
+  Suggestion: Use parameterized queries.
+
+### Warnings
+- **src/models/user.py:23** — Password stored in plaintext. Use bcrypt or argon2.
+- **src/api/routes.py:112** — No rate limiting on login endpoint.
+
+### Suggestions
+- **src/utils/helpers.py:8** — Duplicates logic in `src/core/utils.py:34`. Consolidate.
+- **tests/test_auth.py** — Missing edge case: expired token test.
+
+### Looks Good
+- Clean separation of concerns in the middleware layer
+- Good test coverage for the happy path
+```
+
+---
+
+## 2. Reviewing a Pull Request on GitHub
+
+### View PR Details
+
+**With gh:**
+
+```bash
+gh pr view 123
+gh pr diff 123
+gh pr diff 123 --name-only
+```
+
+**With git + curl:**
+
+```bash
+PR_NUMBER=123
+
+# Get PR details
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "
+import sys, json
+pr = json.load(sys.stdin)
+print(f\"Title: {pr['title']}\")
+print(f\"Author: {pr['user']['login']}\")
+print(f\"Branch: {pr['head']['ref']} -> {pr['base']['ref']}\")
+print(f\"State: {pr['state']}\")
+print(f\"Body:\n{pr['body']}\")"
+
+# List changed files
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/files \
+  | python3 -c "
+import sys, json
+for f in json.load(sys.stdin):
+    print(f\"{f['status']:10} +{f['additions']:-4} -{f['deletions']:-4}  {f['filename']}\")"
+```
+
+### Check Out PR Locally for Full Review
+
+This works with plain `git` — no `gh` needed:
+
+```bash
+# Fetch the PR branch and check it out
+git fetch origin pull/123/head:pr-123
+git checkout pr-123
+
+# Now you can use read_file, search_files, run tests, etc.
+
+# View diff against the base branch
+git diff main...pr-123
+```
+
+**With gh (shortcut):**
+
+```bash
+gh pr checkout 123
+```
+
+### Leave Comments on a PR
+
+**General PR comment — with gh:**
+
+```bash
+gh pr comment 123 --body "Overall looks good, a few suggestions below."
+```
+
+**General PR comment — with curl:**
+
+```bash
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/issues/$PR_NUMBER/comments \
+  -d '{"body": "Overall looks good, a few suggestions below."}'
+```
+
+### Leave Inline Review Comments
+
+**Single inline comment — with gh (via API):**
+
+```bash
+HEAD_SHA=$(gh pr view 123 --json headRefOid --jq '.headRefOid')
+
+gh api repos/$OWNER/$REPO/pulls/123/comments \
+  --method POST \
+  -f body="This could be simplified with a list comprehension." \
+  -f path="src/auth/login.py" \
+  -f commit_id="$HEAD_SHA" \
+  -F line=45 \
+  -f side="RIGHT"
+```
+
+**Single inline comment — with curl:**
+
+```bash
+# Get the head commit SHA
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/comments \
+  -d "{
+    \"body\": \"This could be simplified with a list comprehension.\",
+    \"path\": \"src/auth/login.py\",
+    \"commit_id\": \"$HEAD_SHA\",
+    \"line\": 45,
+    \"side\": \"RIGHT\"
+  }"
+```
+
+### Submit a Formal Review (Approve / Request Changes)
+
+**With gh:**
+
+```bash
+gh pr review 123 --approve --body "LGTM!"
+gh pr review 123 --request-changes --body "See inline comments."
+gh pr review 123 --comment --body "Some suggestions, nothing blocking."
+```
+
+**With curl — multi-comment review submitted atomically:**
+
+```bash
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"COMMENT\",
+    \"body\": \"Code review from Hermes Agent\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"Use parameterized queries to prevent SQL injection.\"},
+      {\"path\": \"src/models/user.py\", \"line\": 23, \"body\": \"Hash passwords with bcrypt before storing.\"},
+      {\"path\": \"tests/test_auth.py\", \"line\": 1, \"body\": \"Add test for expired token edge case.\"}
+    ]
+  }"
+```
+
+Event values: `"APPROVE"`, `"REQUEST_CHANGES"`, `"COMMENT"`
+
+The `line` field refers to the line number in the *new* version of the file. To comment on a deleted line, add `"side": "LEFT"`, in which case `line` is the line number in the *old* version of the file.
+
+---
+
+## 3. Review Checklist
+
+When performing a code review (local or PR), systematically check:
+
+### Correctness
+- Does the code do what it claims?
+- Edge cases handled (empty inputs, nulls, large data, concurrent access)?
+- Error paths handled gracefully?
+
+### Security
+- No hardcoded secrets, credentials, or API keys
+- Input validation on user-facing inputs
+- No SQL injection, XSS, or path traversal
+- Auth/authz checks where needed
+
+### Code Quality
+- Clear naming (variables, functions, classes)
+- No unnecessary complexity or premature abstraction
+- DRY — no duplicated logic that should be extracted
+- Functions are focused (single responsibility)
+
+### Testing
+- New code paths tested?
+- Happy path and error cases covered?
+- Tests readable and maintainable?
+
+### Performance
+- No N+1 queries or unnecessary loops
+- Appropriate caching where beneficial
+- No blocking operations in async code paths
+
+### Documentation
+- Public APIs documented
+- Non-obvious logic has comments explaining "why"
+- README updated if behavior changed
+
+---
+
+## 4. Pre-Push Review Workflow
+
+When the user asks you to "review the code" or "check before pushing":
+
+1. `git diff main...HEAD --stat` — see scope of changes
+2. `git diff main...HEAD` — read the full diff
+3. For each changed file, use `read_file` if you need more context
+4. Apply the checklist above
+5. Present findings in the structured format (Critical / Warnings / Suggestions / Looks Good)
+6. If critical issues found, offer to fix them before the user pushes
+
+---
+
+## 5. PR Review Workflow (End-to-End)
+
+When the user asks you to "review PR #N", "look at this PR", or gives you a PR URL, follow this recipe:
+
+### Step 1: Set up environment
+
+```bash
+source "${HERMES_HOME:-$HOME/.hermes}/skills/github/github-auth/scripts/gh-env.sh"
+# Or run the inline setup block from the top of this skill, then alias its variables: GH_OWNER=$OWNER GH_REPO=$REPO
+```
+
+### Step 2: Gather PR context
+
+Get the PR metadata, description, and list of changed files to understand scope before diving into code.
+
+**With gh:**
+```bash
+gh pr view 123
+gh pr diff 123 --name-only
+gh pr checks 123
+```
+
+**With curl:**
+```bash
+PR_NUMBER=123
+
+# PR details (title, author, description, branch)
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER
+
+# Changed files with line counts
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/files
+```
+
+### Step 3: Check out the PR locally
+
+This gives you full access to `read_file`, `search_files`, and the ability to run tests.
+
+```bash
+git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER
+git checkout pr-$PR_NUMBER
+```
+
+### Step 4: Read the diff and understand changes
+
+```bash
+# Full diff against the base branch
+git diff main...HEAD
+
+# Or file-by-file for large PRs
+git diff main...HEAD --name-only
+# Then for each file:
+git diff main...HEAD -- path/to/file.py
+```
+
+For each changed file, use `read_file` to see full context around the changes — diffs alone can miss issues visible only with surrounding code.
+
+### Step 5: Run automated checks locally (if applicable)
+
+```bash
+# Run tests if there's a test suite
+python -m pytest 2>&1 | tail -20
+# or: npm test, cargo test, go test ./..., etc.
+
+# Run linter if configured
+ruff check . 2>&1 | head -30
+# or: eslint, clippy, etc.
+```
+
+### Step 6: Apply the review checklist (Section 3)
+
+Go through each category: Correctness, Security, Code Quality, Testing, Performance, Documentation.
+
+### Step 7: Post the review to GitHub
+
+Collect your findings and submit them as a formal review with inline comments.
+
+**With gh:**
+```bash
+# If no issues — approve
+gh pr review $PR_NUMBER --approve --body "Reviewed by Hermes Agent. Code looks clean — good test coverage, no security concerns."
+
+# If issues found — request changes with inline comments
+gh pr review $PR_NUMBER --request-changes --body "Found a few issues — see inline comments."
+```
+
+**With curl — atomic review with multiple inline comments:**
+```bash
+HEAD_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+# Build the review JSON — event is APPROVE, REQUEST_CHANGES, or COMMENT
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"REQUEST_CHANGES\",
+    \"body\": \"## Hermes Agent Review\n\nFound 2 issues, 1 suggestion. See inline comments.\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"🔴 **Critical:** User input passed directly to SQL query — use parameterized queries.\"},
+      {\"path\": \"src/models.py\", \"line\": 23, \"body\": \"⚠️ **Warning:** Password stored without hashing.\"},
+      {\"path\": \"src/utils.py\", \"line\": 8, \"body\": \"💡 **Suggestion:** This duplicates logic in core/utils.py:34.\"}
+    ]
+  }"
+```
+
+### Step 8: Also post a summary comment
+
+In addition to inline comments, leave a top-level summary so the PR author gets the full picture at a glance. Use the review output format from `references/review-output-template.md`.
+
+**With gh:**
+```bash
+gh pr comment $PR_NUMBER --body "$(cat <<'EOF'
+## Code Review Summary
+
+**Verdict: Changes Requested** (2 issues, 1 suggestion)
+
+### 🔴 Critical
+- **src/auth.py:45** — SQL injection vulnerability
+
+### ⚠️ Warnings
+- **src/models.py:23** — Plaintext password storage
+
+### 💡 Suggestions
+- **src/utils.py:8** — Duplicated logic, consider consolidating
+
+### ✅ Looks Good
+- Clean API design
+- Good error handling in the middleware layer
+
+---
+*Reviewed by Hermes Agent*
+EOF
+)"
+```
+
+### Step 9: Clean up
+
+```bash
+git checkout main
+git branch -D pr-$PR_NUMBER
+```
+
+### Decision: Approve vs Request Changes vs Comment
+
+- **Approve** — no critical or warning-level issues, only minor suggestions or all clear
+- **Request Changes** — any critical or warning-level issue that should be fixed before merge
+- **Comment** — observations and suggestions, but nothing blocking (use when you're unsure or the PR is a draft)
diff --git a/output/github-code-review/20260424_010955/evolved_skill.md b/output/github-code-review/20260424_010955/evolved_skill.md
new file mode 100644
index 00000000..8041fbb6
--- /dev/null
+++ b/output/github-code-review/20260424_010955/evolved_skill.md
@@ -0,0 +1,480 @@
+---
+name: github-code-review
+description: Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl.
+version: 1.1.0
+author: Hermes Agent
+license: MIT
+metadata:
+  hermes:
+    tags: [GitHub, Code-Review, Pull-Requests, Git, Quality]
+    related_skills: [github-auth, github-pr-workflow]
+---
+
+# GitHub Code Review
+
+Perform code reviews on local changes before pushing, or review open PRs on GitHub. Most of this skill uses plain `git` — the `gh`/`curl` split only matters for PR-level interactions.
+
+## Prerequisites
+
+- Authenticated with GitHub (see `github-auth` skill)
+- Inside a git repository
+
+### Setup (for PR interactions)
+
+```bash
+if command -v gh &>/dev/null && gh auth status &>/dev/null; then
+  AUTH="gh"
+else
+  AUTH="git"
+  if [ -z "$GITHUB_TOKEN" ]; then
+    if [ -f ~/.hermes/.env ] && grep -q "^GITHUB_TOKEN=" ~/.hermes/.env; then
+      GITHUB_TOKEN=$(grep "^GITHUB_TOKEN=" ~/.hermes/.env | head -1 | cut -d= -f2 | tr -d '\n\r')
+    elif grep -q "github.com" ~/.git-credentials 2>/dev/null; then
+      GITHUB_TOKEN=$(grep "github.com" ~/.git-credentials 2>/dev/null | head -1 | sed 's|https://[^:]*:\([^@]*\)@.*|\1|')
+    fi
+  fi
+fi
+
+REMOTE_URL=$(git remote get-url origin)
+OWNER_REPO=$(echo "$REMOTE_URL" | sed -E 's|.*github\.com[:/]||; s|\.git$||')
+OWNER=$(echo "$OWNER_REPO" | cut -d/ -f1)
+REPO=$(echo "$OWNER_REPO" | cut -d/ -f2)
+```
+
+---
+
+## 1. Reviewing Local Changes (Pre-Push)
+
+This is pure `git` — works everywhere, no API needed.
+
+### Get the Diff
+
+```bash
+# Staged changes (what would be committed)
+git diff --staged
+
+# All changes vs main (what a PR would contain)
+git diff main...HEAD
+
+# File names only
+git diff main...HEAD --name-only
+
+# Stat summary (insertions/deletions per file)
+git diff main...HEAD --stat
+```
+
+### Review Strategy
+
+1. **Get the big picture first:**
+
+```bash
+git diff main...HEAD --stat
+git log main..HEAD --oneline
+```
+
+2. **Review file by file** — use `read_file` on changed files for full context, and the diff to see what changed:
+
+```bash
+git diff main...HEAD -- src/auth/login.py
+```
+
+3. **Check for common issues:**
+
+```bash
+# Debug statements, TODOs, console.logs left behind
+git diff main...HEAD | grep -n "print(\|console\.log\|TODO\|FIXME\|HACK\|XXX\|debugger"
+
+# Large files accidentally staged
+git diff main...HEAD --stat | sort -t'|' -k2 -rn | head -10
+
+# Secrets or credential patterns
+git diff main...HEAD | grep -in "password\|secret\|api_key\|token.*=\|private_key"
+
+# Merge conflict markers
+git diff main...HEAD | grep -n "<<<<<<\|>>>>>>\|======="
+```
+
+4. **Present structured feedback** to the user.
+
+### Review Output Format
+
+When reviewing local changes, present findings in this structure:
+
+```
+## Code Review Summary
+
+### Critical
+- **src/auth.py:45** — SQL injection: user input passed directly to query.
+  Suggestion: Use parameterized queries.
+
+### Warnings
+- **src/models/user.py:23** — Password stored in plaintext. Use bcrypt or argon2.
+- **src/api/routes.py:112** — No rate limiting on login endpoint.
+
+### Suggestions
+- **src/utils/helpers.py:8** — Duplicates logic in `src/core/utils.py:34`. Consolidate.
+- **tests/test_auth.py** — Missing edge case: expired token test.
+
+### Looks Good
+- Clean separation of concerns in the middleware layer
+- Good test coverage for the happy path
+```
+
+---
+
+## 2. Reviewing a Pull Request on GitHub
+
+### View PR Details
+
+**With gh:**
+
+```bash
+gh pr view 123
+gh pr diff 123
+gh pr diff 123 --name-only
+```
+
+**With git + curl:**
+
+```bash
+PR_NUMBER=123
+
+# Get PR details
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "
+import sys, json
+pr = json.load(sys.stdin)
+print(f\"Title: {pr['title']}\")
+print(f\"Author: {pr['user']['login']}\")
+print(f\"Branch: {pr['head']['ref']} -> {pr['base']['ref']}\")
+print(f\"State: {pr['state']}\")
+print(f\"Body:\n{pr['body']}\")"
+
+# List changed files
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/files \
+  | python3 -c "
+import sys, json
+for f in json.load(sys.stdin):
+    print(f\"{f['status']:10} +{f['additions']:-4} -{f['deletions']:-4}  {f['filename']}\")"
+```
+
+### Check Out PR Locally for Full Review
+
+This works with plain `git` — no `gh` needed:
+
+```bash
+# Fetch the PR branch and check it out
+git fetch origin pull/123/head:pr-123
+git checkout pr-123
+
+# Now you can use read_file, search_files, run tests, etc.
+
+# View diff against the base branch
+git diff main...pr-123
+```
+
+**With gh (shortcut):**
+
+```bash
+gh pr checkout 123
+```
+
+### Leave Comments on a PR
+
+**General PR comment — with gh:**
+
+```bash
+gh pr comment 123 --body "Overall looks good, a few suggestions below."
+```
+
+**General PR comment — with curl:**
+
+```bash
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/issues/$PR_NUMBER/comments \
+  -d '{"body": "Overall looks good, a few suggestions below."}'
+```
+
+### Leave Inline Review Comments
+
+**Single inline comment — with gh (via API):**
+
+```bash
+HEAD_SHA=$(gh pr view 123 --json headRefOid --jq '.headRefOid')
+
+gh api repos/$OWNER/$REPO/pulls/123/comments \
+  --method POST \
+  -f body="This could be simplified with a list comprehension." \
+  -f path="src/auth/login.py" \
+  -f commit_id="$HEAD_SHA" \
+  -F line=45 \
+  -f side="RIGHT"
+```
+
+**Single inline comment — with curl:**
+
+```bash
+# Get the head commit SHA
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/comments \
+  -d "{
+    \"body\": \"This could be simplified with a list comprehension.\",
+    \"path\": \"src/auth/login.py\",
+    \"commit_id\": \"$HEAD_SHA\",
+    \"line\": 45,
+    \"side\": \"RIGHT\"
+  }"
+```
+
+### Submit a Formal Review (Approve / Request Changes)
+
+**With gh:**
+
+```bash
+gh pr review 123 --approve --body "LGTM!"
+gh pr review 123 --request-changes --body "See inline comments."
+gh pr review 123 --comment --body "Some suggestions, nothing blocking."
+```
+
+**With curl — multi-comment review submitted atomically:**
+
+```bash
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"COMMENT\",
+    \"body\": \"Code review from Hermes Agent\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"Use parameterized queries to prevent SQL injection.\"},
+      {\"path\": \"src/models/user.py\", \"line\": 23, \"body\": \"Hash passwords with bcrypt before storing.\"},
+      {\"path\": \"tests/test_auth.py\", \"line\": 1, \"body\": \"Add test for expired token edge case.\"}
+    ]
+  }"
+```
+
+Event values: `"APPROVE"`, `"REQUEST_CHANGES"`, `"COMMENT"`
+
+The `line` field refers to the line number in the *new* version of the file. To comment on a deleted line, set `"side": "LEFT"` and use the line number from the *old* version of the file.
+
+---
+
+## 3. Review Checklist
+
+When performing a code review (local or PR), systematically check:
+
+### Correctness
+- Does the code do what it claims?
+- Edge cases handled (empty inputs, nulls, large data, concurrent access)?
+- Error paths handled gracefully?
+
+### Security
+- No hardcoded secrets, credentials, or API keys
+- Input validation on user-facing inputs
+- No SQL injection, XSS, or path traversal
+- Auth/authz checks where needed
+
+### Code Quality
+- Clear naming (variables, functions, classes)
+- No unnecessary complexity or premature abstraction
+- DRY — no duplicated logic that should be extracted
+- Functions are focused (single responsibility)
+
+### Testing
+- New code paths tested?
+- Happy path and error cases covered?
+- Tests readable and maintainable?
+
+### Performance
+- No N+1 queries or unnecessary loops
+- Appropriate caching where beneficial
+- No blocking operations in async code paths
+
+### Documentation
+- Public APIs documented
+- Non-obvious logic has comments explaining "why"
+- README updated if behavior changed
+
+---
+
+## 4. Pre-Push Review Workflow
+
+When the user asks you to "review the code" or "check before pushing":
+
+1. `git diff main...HEAD --stat` — see scope of changes
+2. `git diff main...HEAD` — read the full diff
+3. For each changed file, use `read_file` if you need more context
+4. Apply the checklist above
+5. Present findings in the structured format (Critical / Warnings / Suggestions / Looks Good)
+6. If critical issues found, offer to fix them before the user pushes
+
+---
+
+## 5. PR Review Workflow (End-to-End)
+
+When the user asks you to "review PR #N", "look at this PR", or gives you a PR URL, follow this recipe:
+
+### Step 1: Set up environment
+
+```bash
+source "${HERMES_HOME:-$HOME/.hermes}/skills/github/github-auth/scripts/gh-env.sh"
+# Or run the inline setup block from the top of this skill, then set GH_OWNER="$OWNER" and GH_REPO="$REPO" (the steps below use the GH_* names)
+```
+
+### Step 2: Gather PR context
+
+Get the PR metadata, description, and list of changed files to understand scope before diving into code.
+
+**With gh:**
+```bash
+gh pr view 123
+gh pr diff 123 --name-only
+gh pr checks 123
+```
+
+**With curl:**
+```bash
+PR_NUMBER=123
+
+# PR details (title, author, description, branch)
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER
+
+# Changed files with line counts
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/files
+```
+
+### Step 3: Check out the PR locally
+
+This gives you full access to `read_file`, `search_files`, and the ability to run tests.
+
+```bash
+git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER
+git checkout pr-$PR_NUMBER
+```
+
+### Step 4: Read the diff and understand changes
+
+```bash
+# Full diff against the base branch
+git diff main...HEAD
+
+# Or file-by-file for large PRs
+git diff main...HEAD --name-only
+# Then for each file:
+git diff main...HEAD -- path/to/file.py
+```
+
+For each changed file, use `read_file` to see full context around the changes — diffs alone can miss issues visible only with surrounding code.
+
+### Step 5: Run automated checks locally (if applicable)
+
+```bash
+# Run tests if there's a test suite
+python -m pytest 2>&1 | tail -20
+# or: npm test, cargo test, go test ./..., etc.
+
+# Run linter if configured
+ruff check . 2>&1 | head -30
+# or: eslint, clippy, etc.
+```
+
+### Step 6: Apply the review checklist (Section 3)
+
+Go through each category: Correctness, Security, Code Quality, Testing, Performance, Documentation.
+
+### Step 7: Post the review to GitHub
+
+Collect your findings and submit them as a formal review with inline comments.
+
+**With gh:**
+```bash
+# If no issues — approve
+gh pr review $PR_NUMBER --approve --body "Reviewed by Hermes Agent. Code looks clean — good test coverage, no security concerns."
+
+# If issues found — request changes with inline comments
+gh pr review $PR_NUMBER --request-changes --body "Found a few issues — see inline comments."
+```
+
+**With curl — atomic review with multiple inline comments:**
+```bash
+HEAD_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+# Build the review JSON — event is APPROVE, REQUEST_CHANGES, or COMMENT
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"REQUEST_CHANGES\",
+    \"body\": \"## Hermes Agent Review\n\nFound 2 issues, 1 suggestion. See inline comments.\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"🔴 **Critical:** User input passed directly to SQL query — use parameterized queries.\"},
+      {\"path\": \"src/models.py\", \"line\": 23, \"body\": \"⚠️ **Warning:** Password stored without hashing.\"},
+      {\"path\": \"src/utils.py\", \"line\": 8, \"body\": \"💡 **Suggestion:** This duplicates logic in core/utils.py:34.\"}
+    ]
+  }"
+```
+
+### Step 8: Also post a summary comment
+
+In addition to inline comments, leave a top-level summary so the PR author gets the full picture at a glance. Use the review output format from `references/review-output-template.md`.
+
+**With gh:**
+```bash
+gh pr comment $PR_NUMBER --body "$(cat <<'EOF'
+## Code Review Summary
+
+**Verdict: Changes Requested** (2 issues, 1 suggestion)
+
+### 🔴 Critical
+- **src/auth.py:45** — SQL injection vulnerability
+
+### ⚠️ Warnings
+- **src/models.py:23** — Plaintext password storage
+
+### 💡 Suggestions
+- **src/utils.py:8** — Duplicated logic, consider consolidating
+
+### ✅ Looks Good
+- Clean API design
+- Good error handling in the middleware layer
+
+---
+*Reviewed by Hermes Agent*
+EOF
+)"
+```
+
+### Step 9: Clean up
+
+```bash
+git checkout main
+git branch -D pr-$PR_NUMBER
+```
+
+### Decision: Approve vs Request Changes vs Comment
+
+- **Approve** — no critical or warning-level issues, only minor suggestions or all clear
+- **Request Changes** — any critical or warning-level issue that should be fixed before merge
+- **Comment** — observations and suggestions, but nothing blocking (use when you're unsure or the PR is a draft)
diff --git a/output/github-code-review/20260424_010955/metrics.json b/output/github-code-review/20260424_010955/metrics.json
new file mode 100644
index 00000000..3c87517b
--- /dev/null
+++ b/output/github-code-review/20260424_010955/metrics.json
@@ -0,0 +1,17 @@
+{
+  "skill_name": "github-code-review",
+  "timestamp": "20260424_010955",
+  "iterations": 10,
+  "optimizer_model": "openrouter/anthropic/claude-sonnet-4",
+  "eval_model": "openrouter/google/gemini-2.5-flash",
+  "baseline_score": 0.0,
+  "evolved_score": 0.0,
+  "improvement": 0.0,
+  "baseline_size": 13161,
+  "evolved_size": 13161,
+  "train_examples": 1,
+  "val_examples": 0,
+  "holdout_examples": 0,
+  "elapsed_seconds": 32.19987154006958,
+  "constraints_passed": true
+}
\ No newline at end of file
diff --git a/output/github-code-review/evolved_FAILED.md b/output/github-code-review/evolved_FAILED.md
new file mode 100644
index 00000000..8041fbb6
--- /dev/null
+++ b/output/github-code-review/evolved_FAILED.md
@@ -0,0 +1,480 @@
+---
+name: github-code-review
+description: Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl.
+version: 1.1.0
+author: Hermes Agent
+license: MIT
+metadata:
+  hermes:
+    tags: [GitHub, Code-Review, Pull-Requests, Git, Quality]
+    related_skills: [github-auth, github-pr-workflow]
+---
+
+# GitHub Code Review
+
+Perform code reviews on local changes before pushing, or review open PRs on GitHub. Most of this skill uses plain `git` — the `gh`/`curl` split only matters for PR-level interactions.
+
+## Prerequisites
+
+- Authenticated with GitHub (see `github-auth` skill)
+- Inside a git repository
+
+### Setup (for PR interactions)
+
+```bash
+if command -v gh &>/dev/null && gh auth status &>/dev/null; then
+  AUTH="gh"
+else
+  AUTH="git"
+  if [ -z "$GITHUB_TOKEN" ]; then
+    if [ -f ~/.hermes/.env ] && grep -q "^GITHUB_TOKEN=" ~/.hermes/.env; then
+      GITHUB_TOKEN=$(grep "^GITHUB_TOKEN=" ~/.hermes/.env | head -1 | cut -d= -f2 | tr -d '\n\r')
+    elif grep -q "github.com" ~/.git-credentials 2>/dev/null; then
+      GITHUB_TOKEN=$(grep "github.com" ~/.git-credentials 2>/dev/null | head -1 | sed 's|https://[^:]*:\([^@]*\)@.*|\1|')
+    fi
+  fi
+fi
+
+REMOTE_URL=$(git remote get-url origin)
+OWNER_REPO=$(echo "$REMOTE_URL" | sed -E 's|.*github\.com[:/]||; s|\.git$||')
+OWNER=$(echo "$OWNER_REPO" | cut -d/ -f1)
+REPO=$(echo "$OWNER_REPO" | cut -d/ -f2)
+```
+
+---
+
+## 1. Reviewing Local Changes (Pre-Push)
+
+This is pure `git` — works everywhere, no API needed.
+
+### Get the Diff
+
+```bash
+# Staged changes (what would be committed)
+git diff --staged
+
+# All changes vs main (what a PR would contain)
+git diff main...HEAD
+
+# File names only
+git diff main...HEAD --name-only
+
+# Stat summary (insertions/deletions per file)
+git diff main...HEAD --stat
+```
+
+### Review Strategy
+
+1. **Get the big picture first:**
+
+```bash
+git diff main...HEAD --stat
+git log main..HEAD --oneline
+```
+
+2. **Review file by file** — use `read_file` on changed files for full context, and the diff to see what changed:
+
+```bash
+git diff main...HEAD -- src/auth/login.py
+```
+
+3. **Check for common issues:**
+
+```bash
+# Debug statements, TODOs, console.logs left behind
+git diff main...HEAD | grep -n "print(\|console\.log\|TODO\|FIXME\|HACK\|XXX\|debugger"
+
+# Large files accidentally staged
+git diff main...HEAD --stat | sort -t'|' -k2 -rn | head -10
+
+# Secrets or credential patterns
+git diff main...HEAD | grep -in "password\|secret\|api_key\|token.*=\|private_key"
+
+# Merge conflict markers
+git diff main...HEAD | grep -n "<<<<<<\|>>>>>>\|======="
+```
+
+4. **Present structured feedback** to the user.
+
+### Review Output Format
+
+When reviewing local changes, present findings in this structure:
+
+```
+## Code Review Summary
+
+### Critical
+- **src/auth.py:45** — SQL injection: user input passed directly to query.
+  Suggestion: Use parameterized queries.
+
+### Warnings
+- **src/models/user.py:23** — Password stored in plaintext. Use bcrypt or argon2.
+- **src/api/routes.py:112** — No rate limiting on login endpoint.
+
+### Suggestions
+- **src/utils/helpers.py:8** — Duplicates logic in `src/core/utils.py:34`. Consolidate.
+- **tests/test_auth.py** — Missing edge case: expired token test.
+
+### Looks Good
+- Clean separation of concerns in the middleware layer
+- Good test coverage for the happy path
+```
+
+---
+
+## 2. Reviewing a Pull Request on GitHub
+
+### View PR Details
+
+**With gh:**
+
+```bash
+gh pr view 123
+gh pr diff 123
+gh pr diff 123 --name-only
+```
+
+**With git + curl:**
+
+```bash
+PR_NUMBER=123
+
+# Get PR details
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "
+import sys, json
+pr = json.load(sys.stdin)
+print(f\"Title: {pr['title']}\")
+print(f\"Author: {pr['user']['login']}\")
+print(f\"Branch: {pr['head']['ref']} -> {pr['base']['ref']}\")
+print(f\"State: {pr['state']}\")
+print(f\"Body:\n{pr['body']}\")"
+
+# List changed files
+curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/files \
+  | python3 -c "
+import sys, json
+for f in json.load(sys.stdin):
+    print(f\"{f['status']:10} +{f['additions']:-4} -{f['deletions']:-4}  {f['filename']}\")"
+```
+
+### Check Out PR Locally for Full Review
+
+This works with plain `git` — no `gh` needed:
+
+```bash
+# Fetch the PR branch and check it out
+git fetch origin pull/123/head:pr-123
+git checkout pr-123
+
+# Now you can use read_file, search_files, run tests, etc.
+
+# View diff against the base branch
+git diff main...pr-123
+```
+
+**With gh (shortcut):**
+
+```bash
+gh pr checkout 123
+```
+
+### Leave Comments on a PR
+
+**General PR comment — with gh:**
+
+```bash
+gh pr comment 123 --body "Overall looks good, a few suggestions below."
+```
+
+**General PR comment — with curl:**
+
+```bash
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/issues/$PR_NUMBER/comments \
+  -d '{"body": "Overall looks good, a few suggestions below."}'
+```
+
+### Leave Inline Review Comments
+
+**Single inline comment — with gh (via API):**
+
+```bash
+HEAD_SHA=$(gh pr view 123 --json headRefOid --jq '.headRefOid')
+
+gh api repos/$OWNER/$REPO/pulls/123/comments \
+  --method POST \
+  -f body="This could be simplified with a list comprehension." \
+  -f path="src/auth/login.py" \
+  -f commit_id="$HEAD_SHA" \
+  -F line=45 \
+  -f side="RIGHT"
+```
+
+**Single inline comment — with curl:**
+
+```bash
+# Get the head commit SHA
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/comments \
+  -d "{
+    \"body\": \"This could be simplified with a list comprehension.\",
+    \"path\": \"src/auth/login.py\",
+    \"commit_id\": \"$HEAD_SHA\",
+    \"line\": 45,
+    \"side\": \"RIGHT\"
+  }"
+```
+
+### Submit a Formal Review (Approve / Request Changes)
+
+**With gh:**
+
+```bash
+gh pr review 123 --approve --body "LGTM!"
+gh pr review 123 --request-changes --body "See inline comments."
+gh pr review 123 --comment --body "Some suggestions, nothing blocking."
+```
+
+**With curl — multi-comment review submitted atomically:**
+
+```bash
+HEAD_SHA=$(curl -s \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$OWNER/$REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"COMMENT\",
+    \"body\": \"Code review from Hermes Agent\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"Use parameterized queries to prevent SQL injection.\"},
+      {\"path\": \"src/models/user.py\", \"line\": 23, \"body\": \"Hash passwords with bcrypt before storing.\"},
+      {\"path\": \"tests/test_auth.py\", \"line\": 1, \"body\": \"Add test for expired token edge case.\"}
+    ]
+  }"
+```
+
+Event values: `"APPROVE"`, `"REQUEST_CHANGES"`, `"COMMENT"`
+
+The `line` field refers to the line number in the *new* version of the file. To comment on a deleted line, set `"side": "LEFT"` and use the line number from the *old* version of the file.
+
+---
+
+## 3. Review Checklist
+
+When performing a code review (local or PR), systematically check:
+
+### Correctness
+- Does the code do what it claims?
+- Edge cases handled (empty inputs, nulls, large data, concurrent access)?
+- Error paths handled gracefully?
+
+### Security
+- No hardcoded secrets, credentials, or API keys
+- Input validation on user-facing inputs
+- No SQL injection, XSS, or path traversal
+- Auth/authz checks where needed
+
+### Code Quality
+- Clear naming (variables, functions, classes)
+- No unnecessary complexity or premature abstraction
+- DRY — no duplicated logic that should be extracted
+- Functions are focused (single responsibility)
+
+### Testing
+- New code paths tested?
+- Happy path and error cases covered?
+- Tests readable and maintainable?
+
+### Performance
+- No N+1 queries or unnecessary loops
+- Appropriate caching where beneficial
+- No blocking operations in async code paths
+
+### Documentation
+- Public APIs documented
+- Non-obvious logic has comments explaining "why"
+- README updated if behavior changed
+
+---
+
+## 4. Pre-Push Review Workflow
+
+When the user asks you to "review the code" or "check before pushing":
+
+1. `git diff main...HEAD --stat` — see scope of changes
+2. `git diff main...HEAD` — read the full diff
+3. For each changed file, use `read_file` if you need more context
+4. Apply the checklist above
+5. Present findings in the structured format (Critical / Warnings / Suggestions / Looks Good)
+6. If critical issues found, offer to fix them before the user pushes
+
+---
+
+## 5. PR Review Workflow (End-to-End)
+
+When the user asks you to "review PR #N", "look at this PR", or gives you a PR URL, follow this recipe:
+
+### Step 1: Set up environment
+
+```bash
+source "${HERMES_HOME:-$HOME/.hermes}/skills/github/github-auth/scripts/gh-env.sh"
+# Or run the inline setup block from the top of this skill, then set GH_OWNER="$OWNER" and GH_REPO="$REPO" (the steps below use the GH_* names)
+```
+
+### Step 2: Gather PR context
+
+Get the PR metadata, description, and list of changed files to understand scope before diving into code.
+
+**With gh:**
+```bash
+gh pr view 123
+gh pr diff 123 --name-only
+gh pr checks 123
+```
+
+**With curl:**
+```bash
+PR_NUMBER=123
+
+# PR details (title, author, description, branch)
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER
+
+# Changed files with line counts
+curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/files
+```
+
+### Step 3: Check out the PR locally
+
+This gives you full access to `read_file`, `search_files`, and the ability to run tests.
+
+```bash
+git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER
+git checkout pr-$PR_NUMBER
+```
+
+### Step 4: Read the diff and understand changes
+
+```bash
+# Full diff against the base branch
+git diff main...HEAD
+
+# Or file-by-file for large PRs
+git diff main...HEAD --name-only
+# Then for each file:
+git diff main...HEAD -- path/to/file.py
+```
+
+For each changed file, use `read_file` to see full context around the changes — diffs alone can miss issues visible only with surrounding code.
+
+### Step 5: Run automated checks locally (if applicable)
+
+```bash
+# Run tests if there's a test suite
+python -m pytest 2>&1 | tail -20
+# or: npm test, cargo test, go test ./..., etc.
+
+# Run linter if configured
+ruff check . 2>&1 | head -30
+# or: eslint, clippy, etc.
+```
+
+### Step 6: Apply the review checklist (Section 3)
+
+Go through each category: Correctness, Security, Code Quality, Testing, Performance, Documentation.
+
+### Step 7: Post the review to GitHub
+
+Collect your findings and submit them as a formal review with inline comments.
+
+**With gh:**
+```bash
+# If no issues — approve
+gh pr review $PR_NUMBER --approve --body "Reviewed by Hermes Agent. Code looks clean — good test coverage, no security concerns."
+
+# If issues found — request changes with inline comments
+gh pr review $PR_NUMBER --request-changes --body "Found a few issues — see inline comments."
+```
+
+**With curl — atomic review with multiple inline comments:**
+```bash
+HEAD_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['head']['sha'])")
+
+# Build the review JSON — event is APPROVE, REQUEST_CHANGES, or COMMENT
+curl -s -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  https://api.github.com/repos/$GH_OWNER/$GH_REPO/pulls/$PR_NUMBER/reviews \
+  -d "{
+    \"commit_id\": \"$HEAD_SHA\",
+    \"event\": \"REQUEST_CHANGES\",
+    \"body\": \"## Hermes Agent Review\n\nFound 2 issues, 1 suggestion. See inline comments.\",
+    \"comments\": [
+      {\"path\": \"src/auth.py\", \"line\": 45, \"body\": \"🔴 **Critical:** User input passed directly to SQL query — use parameterized queries.\"},
+      {\"path\": \"src/models.py\", \"line\": 23, \"body\": \"⚠️ **Warning:** Password stored without hashing.\"},
+      {\"path\": \"src/utils.py\", \"line\": 8, \"body\": \"💡 **Suggestion:** This duplicates logic in core/utils.py:34.\"}
+    ]
+  }"
+```
+
+### Step 8: Also post a summary comment
+
+In addition to inline comments, leave a top-level summary so the PR author gets the full picture at a glance. Use the review output format from `references/review-output-template.md`.
+
+**With gh:**
+```bash
+gh pr comment $PR_NUMBER --body "$(cat <<'EOF'
+## Code Review Summary
+
+**Verdict: Changes Requested** (2 issues, 1 suggestion)
+
+### 🔴 Critical
+- **src/auth.py:45** — SQL injection vulnerability
+
+### ⚠️ Warnings
+- **src/models.py:23** — Plaintext password storage
+
+### 💡 Suggestions
+- **src/utils.py:8** — Duplicated logic, consider consolidating
+
+### ✅ Looks Good
+- Clean API design
+- Good error handling in the middleware layer
+
+---
+*Reviewed by Hermes Agent*
+EOF
+)"
+```
+
+### Step 9: Clean up
+
+```bash
+git checkout main
+git branch -D pr-$PR_NUMBER
+```
+
+### Decision: Approve vs Request Changes vs Comment
+
+- **Approve** — no critical or warning-level issues, only minor suggestions or all clear
+- **Request Changes** — any critical or warning-level issue that should be fixed before merge
+- **Comment** — observations and suggestions, but nothing blocking (use when you're unsure or the PR is a draft)
diff --git a/output/hermes-agent/20260424_012839/baseline_skill.md b/output/hermes-agent/20260424_012839/baseline_skill.md
new file mode 100644
index 00000000..d19471c8
--- /dev/null
+++ b/output/hermes-agent/20260424_012839/baseline_skill.md
@@ -0,0 +1,705 @@
+---
+name: hermes-agent
+description: Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
+version: 2.0.0
+author: Hermes Agent + Teknium
+license: MIT
+metadata:
+  hermes:
+    tags: [hermes, setup, configuration, multi-agent, spawning, cli, gateway, development]
+    homepage: https://github.com/NousResearch/hermes-agent
+    related_skills: [claude-code, codex, opencode]
+---
+
+# Hermes Agent
+
+Hermes Agent is an open-source AI agent framework by Nous Research that runs in your terminal, messaging platforms, and IDEs. It belongs to the same category as Claude Code (Anthropic), Codex (OpenAI), and OpenClaw — autonomous coding and task-execution agents that use tool calling to interact with your system. Hermes works with any LLM provider (OpenRouter, Anthropic, OpenAI, DeepSeek, local models, and 15+ others) and runs on Linux, macOS, and WSL.
+
+What makes Hermes different:
+
+- **Self-improving through skills** — Hermes learns from experience by saving reusable procedures as skills. When it solves a complex problem, discovers a workflow, or gets corrected, it can persist that knowledge as a skill document that loads into future sessions. Skills accumulate over time, making the agent better at your specific tasks and environment.
+- **Persistent memory across sessions** — remembers who you are, your preferences, environment details, and lessons learned. Pluggable memory backends (built-in, Honcho, Mem0, and more) let you choose how memory works.
+- **Multi-platform gateway** — the same agent runs on Telegram, Discord, Slack, WhatsApp, Signal, Matrix, Email, and 10+ other platforms with full tool access, not just chat.
+- **Provider-agnostic** — swap models and providers mid-workflow without changing anything else. Credential pools rotate across multiple API keys automatically.
+- **Profiles** — run multiple independent Hermes instances with isolated configs, sessions, skills, and memory.
+- **Extensible** — plugins, MCP servers, custom tools, webhook triggers, cron scheduling, and the full Python ecosystem.
+
+People use Hermes for software development, research, system administration, data analysis, content creation, home automation, and anything else that benefits from an AI agent with persistent context and full system access.
+
+**This skill helps you work with Hermes Agent effectively** — setting it up, configuring features, spawning additional agent instances, troubleshooting issues, finding the right commands and settings, and understanding how the system works when you need to extend or contribute to it.
+
+**Docs:** https://hermes-agent.nousresearch.com/docs/
+
+## Quick Start
+
+```bash
+# Install
+curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
+
+# Interactive chat (default)
+hermes
+
+# Single query
+hermes chat -q "What is the capital of France?"
+
+# Setup wizard
+hermes setup
+
+# Change model/provider
+hermes model
+
+# Check health
+hermes doctor
+```
+
+---
+
+## CLI Reference
+
+### Global Flags
+
+```
+hermes [flags] [command]
+
+  --version, -V             Show version
+  --resume, -r SESSION      Resume session by ID or title
+  --continue, -c [NAME]     Resume by name, or most recent session
+  --worktree, -w            Isolated git worktree mode (parallel agents)
+  --skills, -s SKILL        Preload skills (comma-separate or repeat)
+  --profile, -p NAME        Use a named profile
+  --yolo                    Skip dangerous command approval
+  --pass-session-id         Include session ID in system prompt
+```
+
+No subcommand defaults to `chat`.
+
+### Chat
+
+```
+hermes chat [flags]
+  -q, --query TEXT          Single query, non-interactive
+  -m, --model MODEL         Model (e.g. anthropic/claude-sonnet-4)
+  -t, --toolsets LIST       Comma-separated toolsets
+  --provider PROVIDER       Force provider (openrouter, anthropic, nous, etc.)
+  -v, --verbose             Verbose output
+  -Q, --quiet               Suppress banner, spinner, tool previews
+  --checkpoints             Enable filesystem checkpoints (/rollback)
+  --source TAG              Session source tag (default: cli)
+```
+
+### Configuration
+
+```
+hermes setup [section]      Interactive wizard (model|terminal|gateway|tools|agent)
+hermes model                Interactive model/provider picker
+hermes config               View current config
+hermes config edit          Open config.yaml in $EDITOR
+hermes config set KEY VAL   Set a config value
+hermes config path          Print config.yaml path
+hermes config env-path      Print .env path
+hermes config check         Check for missing/outdated config
+hermes config migrate       Update config with new options
+hermes login [--provider P] OAuth login (nous, openai-codex)
+hermes logout               Clear stored auth
+hermes doctor [--fix]       Check dependencies and config
+hermes status [--all]       Show component status
+```
+
+### Tools & Skills
+
+```
+hermes tools                Interactive tool enable/disable (curses UI)
+hermes tools list           Show all tools and status
+hermes tools enable NAME    Enable a toolset
+hermes tools disable NAME   Disable a toolset
+
+hermes skills list          List installed skills
+hermes skills search QUERY  Search the skills hub
+hermes skills install ID    Install a skill
+hermes skills inspect ID    Preview without installing
+hermes skills config        Enable/disable skills per platform
+hermes skills check         Check for updates
+hermes skills update        Update outdated skills
+hermes skills uninstall N   Remove a hub skill
+hermes skills publish PATH  Publish to registry
+hermes skills browse        Browse all available skills
+hermes skills tap add REPO  Add a GitHub repo as skill source
+```
+
+### MCP Servers
+
+```
+hermes mcp serve            Run Hermes as an MCP server
+hermes mcp add NAME         Add an MCP server (--url or --command)
+hermes mcp remove NAME      Remove an MCP server
+hermes mcp list             List configured servers
+hermes mcp test NAME        Test connection
+hermes mcp configure NAME   Toggle tool selection
+```
+
+### Gateway (Messaging Platforms)
+
+```
+hermes gateway run          Start gateway foreground
+hermes gateway install      Install as background service
+hermes gateway start/stop   Control the service
+hermes gateway restart      Restart the service
+hermes gateway status       Check status
+hermes gateway setup        Configure platforms
+```
+
+Supported platforms: Telegram, Discord, Slack, WhatsApp, Signal, Email, SMS, Matrix, Mattermost, Home Assistant, DingTalk, Feishu, WeCom, BlueBubbles (iMessage), Weixin (WeChat), API Server, Webhooks. Open WebUI connects via the API Server adapter.
+
+Platform docs: https://hermes-agent.nousresearch.com/docs/user-guide/messaging/
+
+### Sessions
+
+```
+hermes sessions list        List recent sessions
+hermes sessions browse      Interactive picker
+hermes sessions export OUT  Export to JSONL
+hermes sessions rename ID T Rename a session
+hermes sessions delete ID   Delete a session
+hermes sessions prune       Clean up old sessions (--older-than N days)
+hermes sessions stats       Session store statistics
+```
+
+### Cron Jobs
+
+```
+hermes cron list            List jobs (--all for disabled)
+hermes cron create SCHED    Create: '30m', 'every 2h', '0 9 * * *'
+hermes cron edit ID         Edit schedule, prompt, delivery
+hermes cron pause/resume ID Control job state
+hermes cron run ID          Trigger on next tick
+hermes cron remove ID       Delete a job
+hermes cron status          Scheduler status
+```
+
+### Webhooks
+
+```
+hermes webhook subscribe N  Create route at /webhooks/<name>
+hermes webhook list         List subscriptions
+hermes webhook remove NAME  Remove a subscription
+hermes webhook test NAME    Send a test POST
+```
+
+### Profiles
+
+```
+hermes profile list         List all profiles
+hermes profile create NAME  Create (--clone, --clone-all, --clone-from)
+hermes profile use NAME     Set sticky default
+hermes profile delete NAME  Delete a profile
+hermes profile show NAME    Show details
+hermes profile alias NAME   Manage wrapper scripts
+hermes profile rename A B   Rename a profile
+hermes profile export NAME  Export to tar.gz
+hermes profile import FILE  Import from archive
+```
+
+### Credential Pools
+
+```
+hermes auth add             Interactive credential wizard
+hermes auth list [PROVIDER] List pooled credentials
+hermes auth remove P INDEX  Remove by provider + index
+hermes auth reset PROVIDER  Clear exhaustion status
+```
+
+### Other
+
+```
+hermes insights [--days N]  Usage analytics
+hermes update               Update to latest version
+hermes pairing list/approve/revoke  DM authorization
+hermes plugins list/install/remove  Plugin management
+hermes honcho setup/status  Honcho memory integration (requires honcho plugin)
+hermes memory setup/status/off  Memory provider config
+hermes completion bash|zsh  Shell completions
+hermes acp                  ACP server (IDE integration)
+hermes claw migrate         Migrate from OpenClaw
+hermes uninstall            Uninstall Hermes
+```
+
+---
+
+## Slash Commands (In-Session)
+
+Type these during an interactive chat session.
+
+### Session Control
+```
+/new (/reset)        Fresh session
+/clear               Clear screen + new session (CLI)
+/retry               Resend last message
+/undo                Remove last exchange
+/title [name]        Name the session
+/compress            Manually compress context
+/stop                Kill background processes
+/rollback [N]        Restore filesystem checkpoint
+/background <prompt> Run prompt in background
+/queue <prompt>      Queue for next turn
+/resume [name]       Resume a named session
+```
+
+### Configuration
+```
+/config              Show config (CLI)
+/model [name]        Show or change model
+/provider            Show provider info
+/personality [name]  Set personality
+/reasoning [level]   Set reasoning (none|minimal|low|medium|high|xhigh|show|hide)
+/verbose             Cycle: off → new → all → verbose
+/voice [on|off|tts]  Voice mode
+/yolo                Toggle approval bypass
+/skin [name]         Change theme (CLI)
+/statusbar           Toggle status bar (CLI)
+```
+
+### Tools & Skills
+```
+/tools               Manage tools (CLI)
+/toolsets            List toolsets (CLI)
+/skills              Search/install skills (CLI)
+/skill <name>        Load a skill into session
+/cron                Manage cron jobs (CLI)
+/reload-mcp          Reload MCP servers
+/plugins             List plugins (CLI)
+```
+
+### Gateway
+```
+/approve             Approve a pending command (gateway)
+/deny                Deny a pending command (gateway)
+/restart             Restart gateway (gateway)
+/sethome             Set current chat as home channel (gateway)
+/update              Update Hermes to latest (gateway)
+/platforms (/gateway) Show platform connection status (gateway)
+```
+
+### Utility
+```
+/branch (/fork)      Branch the current session
+/btw                 Ephemeral side question (doesn't interrupt main task)
+/fast                Toggle priority/fast processing
+/browser             Open CDP browser connection
+/history             Show conversation history (CLI)
+/save                Save conversation to file (CLI)
+/paste               Attach clipboard image (CLI)
+/image               Attach local image file (CLI)
+```
+
+### Info
+```
+/help                Show commands
+/commands [page]     Browse all commands (gateway)
+/usage               Token usage
+/insights [days]     Usage analytics
+/status              Session info (gateway)
+/profile             Active profile info
+```
+
+### Exit
+```
+/quit (/exit, /q)    Exit CLI
+```
+
+---
+
+## Key Paths & Config
+
+```
+~/.hermes/config.yaml       Main configuration
+~/.hermes/.env              API keys and secrets
+$HERMES_HOME/skills/        Installed skills
+~/.hermes/sessions/         Session transcripts
+~/.hermes/logs/             Gateway and error logs
+~/.hermes/auth.json         OAuth tokens and credential pools
+~/.hermes/hermes-agent/     Source code (if git-installed)
+```
+
+Profiles use `~/.hermes/profiles/<name>/` with the same layout.
+
+### Config Sections
+
+Edit with `hermes config edit` or `hermes config set section.key value`.
+
+| Section | Key options |
+|---------|-------------|
+| `model` | `default`, `provider`, `base_url`, `api_key`, `context_length` |
+| `agent` | `max_turns` (90), `tool_use_enforcement` |
+| `terminal` | `backend` (local/docker/ssh/modal), `cwd`, `timeout` (180) |
+| `compression` | `enabled`, `threshold` (0.50), `target_ratio` (0.20) |
+| `display` | `skin`, `tool_progress`, `show_reasoning`, `show_cost` |
+| `stt` | `enabled`, `provider` (local/groq/openai/mistral) |
+| `tts` | `provider` (edge/elevenlabs/openai/minimax/mistral/neutts) |
+| `memory` | `memory_enabled`, `user_profile_enabled`, `provider` |
+| `security` | `tirith_enabled`, `website_blocklist` |
+| `delegation` | `model`, `provider`, `base_url`, `api_key`, `max_iterations` (50), `reasoning_effort` |
+| `checkpoints` | `enabled`, `max_snapshots` (50) |
+
+Full config reference: https://hermes-agent.nousresearch.com/docs/user-guide/configuration
+
+### Providers
+
+20+ providers supported. Set via `hermes model` or `hermes setup`.
+
+| Provider | Auth | Key env var |
+|----------|------|-------------|
+| OpenRouter | API key | `OPENROUTER_API_KEY` |
+| Anthropic | API key | `ANTHROPIC_API_KEY` |
+| Nous Portal | OAuth | `hermes auth` |
+| OpenAI Codex | OAuth | `hermes auth` |
+| GitHub Copilot | Token | `COPILOT_GITHUB_TOKEN` |
+| Google Gemini | API key | `GOOGLE_API_KEY` or `GEMINI_API_KEY` |
+| DeepSeek | API key | `DEEPSEEK_API_KEY` |
+| xAI / Grok | API key | `XAI_API_KEY` |
+| Hugging Face | Token | `HF_TOKEN` |
+| Z.AI / GLM | API key | `GLM_API_KEY` |
+| MiniMax | API key | `MINIMAX_API_KEY` |
+| MiniMax CN | API key | `MINIMAX_CN_API_KEY` |
+| Kimi / Moonshot | API key | `KIMI_API_KEY` |
+| Alibaba / DashScope | API key | `DASHSCOPE_API_KEY` |
+| Xiaomi MiMo | API key | `XIAOMI_API_KEY` |
+| Kilo Code | API key | `KILOCODE_API_KEY` |
+| AI Gateway (Vercel) | API key | `AI_GATEWAY_API_KEY` |
+| OpenCode Zen | API key | `OPENCODE_ZEN_API_KEY` |
+| OpenCode Go | API key | `OPENCODE_GO_API_KEY` |
+| Qwen OAuth | OAuth | `hermes login --provider qwen-oauth` |
+| Custom endpoint | Config | `model.base_url` + `model.api_key` in config.yaml |
+| GitHub Copilot ACP | External | `COPILOT_CLI_PATH` or Copilot CLI |
+
+Full provider docs: https://hermes-agent.nousresearch.com/docs/integrations/providers
+
+### Toolsets
+
+Enable/disable via `hermes tools` (interactive) or `hermes tools enable/disable NAME`.
+
+| Toolset | What it provides |
+|---------|-----------------|
+| `web` | Web search and content extraction |
+| `browser` | Browser automation (Browserbase, Camofox, or local Chromium) |
+| `terminal` | Shell commands and process management |
+| `file` | File read/write/search/patch |
+| `code_execution` | Sandboxed Python execution |
+| `vision` | Image analysis |
+| `image_gen` | AI image generation |
+| `tts` | Text-to-speech |
+| `skills` | Skill browsing and management |
+| `memory` | Persistent cross-session memory |
+| `session_search` | Search past conversations |
+| `delegation` | Subagent task delegation |
+| `cronjob` | Scheduled task management |
+| `clarify` | Ask user clarifying questions |
+| `messaging` | Cross-platform message sending |
+| `search` | Web search only (subset of `web`) |
+| `todo` | In-session task planning and tracking |
+| `rl` | Reinforcement learning tools (off by default) |
+| `moa` | Mixture of Agents (off by default) |
+| `homeassistant` | Smart home control (off by default) |
+
+Tool changes take effect on `/reset` (new session). To preserve prompt caching, they do NOT apply mid-conversation.
+
+---
+
+## Voice & Transcription
+
+### STT (Voice → Text)
+
+Voice messages from messaging platforms are auto-transcribed.
+
+Provider priority (auto-detected):
+1. **Local faster-whisper** — free, no API key: `pip install faster-whisper`
+2. **Groq Whisper** — free tier: set `GROQ_API_KEY`
+3. **OpenAI Whisper** — paid: set `VOICE_TOOLS_OPENAI_KEY`
+4. **Mistral Voxtral** — set `MISTRAL_API_KEY`
+
+Config:
+```yaml
+stt:
+  enabled: true
+  provider: local        # local, groq, openai, mistral
+  local:
+    model: base          # tiny, base, small, medium, large-v3
+```
+
+### TTS (Text → Voice)
+
+| Provider | Env var | Free? |
+|----------|---------|-------|
+| Edge TTS | None | Yes (default) |
+| ElevenLabs | `ELEVENLABS_API_KEY` | Free tier |
+| OpenAI | `VOICE_TOOLS_OPENAI_KEY` | Paid |
+| MiniMax | `MINIMAX_API_KEY` | Paid |
+| Mistral (Voxtral) | `MISTRAL_API_KEY` | Paid |
+| NeuTTS (local) | None (`pip install neutts[all]` + `espeak-ng`) | Free |
+
+Voice commands: `/voice on` (voice-to-voice), `/voice tts` (always voice), `/voice off`.
+
+---
+
+## Spawning Additional Hermes Instances
+
+Run additional Hermes processes as fully independent subprocesses — separate sessions, tools, and environments.
+
+### When to Use This vs delegate_task
+
+| | `delegate_task` | Spawning `hermes` process |
+|-|-----------------|--------------------------|
+| Isolation | Separate conversation, shared process | Fully independent process |
+| Duration | Minutes (bounded by parent loop) | Hours/days |
+| Tool access | Subset of parent's tools | Full tool access |
+| Interactive | No | Yes (PTY mode) |
+| Use case | Quick parallel subtasks | Long autonomous missions |
+
+### One-Shot Mode
+
+```
+terminal(command="hermes chat -q 'Research GRPO papers and write summary to ~/research/grpo.md'", timeout=300)
+
+# Background for long tasks:
+terminal(command="hermes chat -q 'Set up CI/CD for ~/myapp'", background=true)
+```
+
+### Interactive PTY Mode (via tmux)
+
+Hermes uses prompt_toolkit, which requires a real terminal. Use tmux for interactive spawning:
+
+```
+# Start
+terminal(command="tmux new-session -d -s agent1 -x 120 -y 40 'hermes'", timeout=10)
+
+# Wait for startup, then send a message
+terminal(command="sleep 8 && tmux send-keys -t agent1 'Build a FastAPI auth service' Enter", timeout=15)
+
+# Read output
+terminal(command="sleep 20 && tmux capture-pane -t agent1 -p", timeout=5)
+
+# Send follow-up
+terminal(command="tmux send-keys -t agent1 'Add rate limiting middleware' Enter", timeout=5)
+
+# Exit
+terminal(command="tmux send-keys -t agent1 '/exit' Enter && sleep 2 && tmux kill-session -t agent1", timeout=10)
+```
+
+### Multi-Agent Coordination
+
+```
+# Agent A: backend
+terminal(command="tmux new-session -d -s backend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t backend 'Build REST API for user management' Enter", timeout=15)
+
+# Agent B: frontend
+terminal(command="tmux new-session -d -s frontend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t frontend 'Build React dashboard for user management' Enter", timeout=15)
+
+# Check progress, relay context between them
+terminal(command="tmux capture-pane -t backend -p | tail -30", timeout=5)
+terminal(command="tmux send-keys -t frontend 'Here is the API schema from the backend agent: ...' Enter", timeout=5)
+```
+
+### Session Resume
+
+```
+# Resume most recent session
+terminal(command="tmux new-session -d -s resumed 'hermes --continue'", timeout=10)
+
+# Resume specific session
+terminal(command="tmux new-session -d -s resumed 'hermes --resume 20260225_143052_a1b2c3'", timeout=10)
+```
+
+### Tips
+
+- **Prefer `delegate_task` for quick subtasks** — less overhead than spawning a full process
+- **Use `-w` (worktree mode)** when spawning agents that edit code — prevents git conflicts
+- **Set timeouts** for one-shot mode — complex tasks can take 5-10 minutes
+- **Use `hermes chat -q` for fire-and-forget** — no PTY needed
+- **Use tmux for interactive sessions** — raw PTY mode has `\r` vs `\n` issues with prompt_toolkit
+- **For scheduled tasks**, use the `cronjob` tool instead of spawning — handles delivery and retry
+
+---
+
+## Troubleshooting
+
+### Voice not working
+1. Check `stt.enabled: true` in config.yaml
+2. Verify provider: `pip install faster-whisper` or set API key
+3. In gateway: `/restart`. In CLI: exit and relaunch.
+
+### Tool not available
+1. `hermes tools` — check if toolset is enabled for your platform
+2. Some tools need env vars (check `.env`)
+3. `/reset` after enabling tools
+
+### Model/provider issues
+1. `hermes doctor` — check config and dependencies
+2. `hermes login` — re-authenticate OAuth providers
+3. Check `.env` has the right API key
+4. **Copilot 403**: `gh auth login` tokens do NOT work for Copilot API. You must use the Copilot-specific OAuth device code flow via `hermes model` → GitHub Copilot.
+
+### Changes not taking effect
+- **Tools/skills:** `/reset` starts a new session with updated toolset
+- **Config changes:** In gateway: `/restart`. In CLI: exit and relaunch.
+- **Code changes:** Restart the CLI or gateway process
+
+### Skills not showing
+1. `hermes skills list` — verify installed
+2. `hermes skills config` — check platform enablement
+3. Load explicitly: `/skill name` or `hermes -s name`
+
+### Gateway issues
+Check logs first:
+```bash
+grep -i "failed to send\|error" ~/.hermes/logs/gateway.log | tail -20
+```
+
+Common gateway problems:
+- **Gateway dies on SSH logout**: Enable linger: `sudo loginctl enable-linger $USER`
+- **Gateway dies on WSL2 close**: WSL2 requires `systemd=true` in `/etc/wsl.conf` for systemd services to work. Without it, the gateway falls back to `nohup` (and dies when the session closes).
+- **Gateway crash loop**: Reset the failed state: `systemctl --user reset-failed hermes-gateway`
+
+### Platform-specific issues
+- **Discord bot silent**: Must enable **Message Content Intent** in Bot → Privileged Gateway Intents.
+- **Slack bot only works in DMs**: Must subscribe to `message.channels` event. Without it, the bot ignores public channels.
+- **Windows HTTP 400 "No models provided"**: Config file encoding issue (BOM). Ensure `config.yaml` is saved as UTF-8 without BOM.
+
+### Auxiliary models not working
+If `auxiliary` tasks (vision, compression, session_search) fail silently, the `auto` provider can't find a backend. Either set `OPENROUTER_API_KEY` or `GOOGLE_API_KEY`, or explicitly configure each auxiliary task's provider:
+```bash
+hermes config set auxiliary.vision.provider <your_provider>
+hermes config set auxiliary.vision.model <model_name>
+```
+
+---
+
+## Where to Find Things
+
+| Looking for... | Location |
+|----------------|----------|
+| Config options | `hermes config edit` or [Configuration docs](https://hermes-agent.nousresearch.com/docs/user-guide/configuration) |
+| Available tools | `hermes tools list` or [Tools reference](https://hermes-agent.nousresearch.com/docs/reference/tools-reference) |
+| Slash commands | `/help` in session or [Slash commands reference](https://hermes-agent.nousresearch.com/docs/reference/slash-commands) |
+| Skills catalog | `hermes skills browse` or [Skills catalog](https://hermes-agent.nousresearch.com/docs/reference/skills-catalog) |
+| Provider setup | `hermes model` or [Providers guide](https://hermes-agent.nousresearch.com/docs/integrations/providers) |
+| Platform setup | `hermes gateway setup` or [Messaging docs](https://hermes-agent.nousresearch.com/docs/user-guide/messaging/) |
+| MCP servers | `hermes mcp list` or [MCP guide](https://hermes-agent.nousresearch.com/docs/user-guide/features/mcp) |
+| Profiles | `hermes profile list` or [Profiles docs](https://hermes-agent.nousresearch.com/docs/user-guide/profiles) |
+| Cron jobs | `hermes cron list` or [Cron docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/cron) |
+| Memory | `hermes memory status` or [Memory docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/memory) |
+| Env variables | `hermes config env-path` or [Env vars reference](https://hermes-agent.nousresearch.com/docs/reference/environment-variables) |
+| CLI commands | `hermes --help` or [CLI reference](https://hermes-agent.nousresearch.com/docs/reference/cli-commands) |
+| Gateway logs | `~/.hermes/logs/gateway.log` |
+| Session files | `~/.hermes/sessions/` or `hermes sessions browse` |
+| Source code | `~/.hermes/hermes-agent/` |
+
+---
+
+## Contributor Quick Reference
+
+For occasional contributors and PR authors. Full developer docs: https://hermes-agent.nousresearch.com/docs/developer-guide/
+
+### Project Layout
+
+```
+hermes-agent/
+├── run_agent.py          # AIAgent — core conversation loop
+├── model_tools.py        # Tool discovery and dispatch
+├── toolsets.py           # Toolset definitions
+├── cli.py                # Interactive CLI (HermesCLI)
+├── hermes_state.py       # SQLite session store
+├── agent/                # Prompt builder, context compression, memory, model routing, credential pooling, skill dispatch
+├── hermes_cli/           # CLI subcommands, config, setup, commands
+│   ├── commands.py       # Slash command registry (CommandDef)
+│   ├── config.py         # DEFAULT_CONFIG, env var definitions
+│   └── main.py           # CLI entry point and argparse
+├── tools/                # One file per tool
+│   └── registry.py       # Central tool registry
+├── gateway/              # Messaging gateway
+│   └── platforms/        # Platform adapters (telegram, discord, etc.)
+├── cron/                 # Job scheduler
+├── tests/                # ~3000 pytest tests
+└── website/              # Docusaurus docs site
+```
+
+Config: `~/.hermes/config.yaml` (settings), `~/.hermes/.env` (API keys).
+
+### Adding a Tool (2 files)
+
+**1. Create `tools/your_tool.py`:**
+```python
+import json, os
+from tools.registry import registry
+
+def check_requirements() -> bool:
+    return bool(os.getenv("EXAMPLE_API_KEY"))
+
+def example_tool(param: str, task_id: str = None) -> str:
+    return json.dumps({"success": True, "data": "..."})
+
+registry.register(
+    name="example_tool",
+    toolset="example",
+    schema={"name": "example_tool", "description": "...", "parameters": {...}},
+    handler=lambda args, **kw: example_tool(
+        param=args.get("param", ""), task_id=kw.get("task_id")),
+    check_fn=check_requirements,
+    requires_env=["EXAMPLE_API_KEY"],
+)
+```
+
+**2. Add to `toolsets.py`** → `_HERMES_CORE_TOOLS` list.
+
+Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual import list needed.
+
+All handlers must return JSON strings. Use `get_hermes_home()` for paths, never hardcode `~/.hermes`.
+
+### Adding a Slash Command
+
+1. Add `CommandDef` to `COMMAND_REGISTRY` in `hermes_cli/commands.py`
+2. Add handler in `cli.py` → `process_command()`
+3. (Optional) Add gateway handler in `gateway/run.py`
+
+All consumers (help text, autocomplete, Telegram menu, Slack mapping) derive from the central registry automatically.
+
+### Agent Loop (High Level)
+
+```
+run_conversation():
+  1. Build system prompt
+  2. Loop while iterations < max:
+     a. Call LLM (OpenAI-format messages + tool schemas)
+     b. If tool_calls → dispatch each via handle_function_call() → append results → continue
+     c. If text response → return
+  3. Context compression triggers automatically near token limit
+```
+
+### Testing
+
+```bash
+python -m pytest tests/ -o 'addopts=' -q   # Full suite
+python -m pytest tests/tools/ -q            # Specific area
+```
+
+- Tests auto-redirect `HERMES_HOME` to temp dirs — never touch real `~/.hermes/`
+- Run full suite before pushing any change
+- Use `-o 'addopts='` to clear any baked-in pytest flags
+
+### Commit Conventions
+
+```
+type: concise subject line
+
+Optional body.
+```
+
+Types: `fix:`, `feat:`, `refactor:`, `docs:`, `chore:`
+
+### Key Rules
+
+- **Never break prompt caching** — don't change context, tools, or system prompt mid-conversation
+- **Message role alternation** — never two assistant or two user messages in a row
+- Use `get_hermes_home()` from `hermes_constants` for all paths (profile-safe)
+- Config values go in `config.yaml`, secrets go in `.env`
+- New tools need a `check_fn` so they only appear when requirements are met
diff --git a/output/hermes-agent/20260424_012839/evolved_skill.md b/output/hermes-agent/20260424_012839/evolved_skill.md
new file mode 100644
index 00000000..d19471c8
--- /dev/null
+++ b/output/hermes-agent/20260424_012839/evolved_skill.md
@@ -0,0 +1,705 @@
+---
+name: hermes-agent
+description: Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
+version: 2.0.0
+author: Hermes Agent + Teknium
+license: MIT
+metadata:
+  hermes:
+    tags: [hermes, setup, configuration, multi-agent, spawning, cli, gateway, development]
+    homepage: https://github.com/NousResearch/hermes-agent
+    related_skills: [claude-code, codex, opencode]
+---
+
+# Hermes Agent
+
+Hermes Agent is an open-source AI agent framework by Nous Research that runs in your terminal, messaging platforms, and IDEs. It belongs to the same category as Claude Code (Anthropic), Codex (OpenAI), and OpenClaw — autonomous coding and task-execution agents that use tool calling to interact with your system. Hermes works with any LLM provider (OpenRouter, Anthropic, OpenAI, DeepSeek, local models, and 15+ others) and runs on Linux, macOS, and WSL.
+
+What makes Hermes different:
+
+- **Self-improving through skills** — Hermes learns from experience by saving reusable procedures as skills. When it solves a complex problem, discovers a workflow, or gets corrected, it can persist that knowledge as a skill document that loads into future sessions. Skills accumulate over time, making the agent better at your specific tasks and environment.
+- **Persistent memory across sessions** — remembers who you are, your preferences, environment details, and lessons learned. Pluggable memory backends (built-in, Honcho, Mem0, and more) let you choose how memory works.
+- **Multi-platform gateway** — the same agent runs on Telegram, Discord, Slack, WhatsApp, Signal, Matrix, Email, and 10+ other platforms with full tool access, not just chat.
+- **Provider-agnostic** — swap models and providers mid-workflow without changing anything else. Credential pools rotate across multiple API keys automatically.
+- **Profiles** — run multiple independent Hermes instances with isolated configs, sessions, skills, and memory.
+- **Extensible** — plugins, MCP servers, custom tools, webhook triggers, cron scheduling, and the full Python ecosystem.
+
+People use Hermes for software development, research, system administration, data analysis, content creation, home automation, and anything else that benefits from an AI agent with persistent context and full system access.
+
+**This skill helps you work with Hermes Agent effectively** — setting it up, configuring features, spawning additional agent instances, troubleshooting issues, finding the right commands and settings, and understanding how the system works when you need to extend or contribute to it.
+
+**Docs:** https://hermes-agent.nousresearch.com/docs/
+
+## Quick Start
+
+```bash
+# Install
+curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
+
+# Interactive chat (default)
+hermes
+
+# Single query
+hermes chat -q "What is the capital of France?"
+
+# Setup wizard
+hermes setup
+
+# Change model/provider
+hermes model
+
+# Check health
+hermes doctor
+```
+
+---
+
+## CLI Reference
+
+### Global Flags
+
+```
+hermes [flags] [command]
+
+  --version, -V             Show version
+  --resume, -r SESSION      Resume session by ID or title
+  --continue, -c [NAME]     Resume by name, or most recent session
+  --worktree, -w            Isolated git worktree mode (parallel agents)
+  --skills, -s SKILL        Preload skills (comma-separate or repeat)
+  --profile, -p NAME        Use a named profile
+  --yolo                    Skip dangerous command approval
+  --pass-session-id         Include session ID in system prompt
+```
+
+No subcommand defaults to `chat`.
+
+### Chat
+
+```
+hermes chat [flags]
+  -q, --query TEXT          Single query, non-interactive
+  -m, --model MODEL         Model (e.g. anthropic/claude-sonnet-4)
+  -t, --toolsets LIST       Comma-separated toolsets
+  --provider PROVIDER       Force provider (openrouter, anthropic, nous, etc.)
+  -v, --verbose             Verbose output
+  -Q, --quiet               Suppress banner, spinner, tool previews
+  --checkpoints             Enable filesystem checkpoints (/rollback)
+  --source TAG              Session source tag (default: cli)
+```
+
+### Configuration
+
+```
+hermes setup [section]      Interactive wizard (model|terminal|gateway|tools|agent)
+hermes model                Interactive model/provider picker
+hermes config               View current config
+hermes config edit          Open config.yaml in $EDITOR
+hermes config set KEY VAL   Set a config value
+hermes config path          Print config.yaml path
+hermes config env-path      Print .env path
+hermes config check         Check for missing/outdated config
+hermes config migrate       Update config with new options
+hermes login [--provider P] OAuth login (nous, openai-codex)
+hermes logout               Clear stored auth
+hermes doctor [--fix]       Check dependencies and config
+hermes status [--all]       Show component status
+```
+
+### Tools & Skills
+
+```
+hermes tools                Interactive tool enable/disable (curses UI)
+hermes tools list           Show all tools and status
+hermes tools enable NAME    Enable a toolset
+hermes tools disable NAME   Disable a toolset
+
+hermes skills list          List installed skills
+hermes skills search QUERY  Search the skills hub
+hermes skills install ID    Install a skill
+hermes skills inspect ID    Preview without installing
+hermes skills config        Enable/disable skills per platform
+hermes skills check         Check for updates
+hermes skills update        Update outdated skills
+hermes skills uninstall N   Remove a hub skill
+hermes skills publish PATH  Publish to registry
+hermes skills browse        Browse all available skills
+hermes skills tap add REPO  Add a GitHub repo as skill source
+```
+
+### MCP Servers
+
+```
+hermes mcp serve            Run Hermes as an MCP server
+hermes mcp add NAME         Add an MCP server (--url or --command)
+hermes mcp remove NAME      Remove an MCP server
+hermes mcp list             List configured servers
+hermes mcp test NAME        Test connection
+hermes mcp configure NAME   Toggle tool selection
+```
+
+### Gateway (Messaging Platforms)
+
+```
+hermes gateway run          Start gateway foreground
+hermes gateway install      Install as background service
+hermes gateway start/stop   Control the service
+hermes gateway restart      Restart the service
+hermes gateway status       Check status
+hermes gateway setup        Configure platforms
+```
+
+Supported platforms: Telegram, Discord, Slack, WhatsApp, Signal, Email, SMS, Matrix, Mattermost, Home Assistant, DingTalk, Feishu, WeCom, BlueBubbles (iMessage), Weixin (WeChat), API Server, Webhooks. Open WebUI connects via the API Server adapter.
+
+Platform docs: https://hermes-agent.nousresearch.com/docs/user-guide/messaging/
+
+### Sessions
+
+```
+hermes sessions list        List recent sessions
+hermes sessions browse      Interactive picker
+hermes sessions export OUT  Export to JSONL
+hermes sessions rename ID T Rename a session
+hermes sessions delete ID   Delete a session
+hermes sessions prune       Clean up old sessions (--older-than N days)
+hermes sessions stats       Session store statistics
+```
+
+### Cron Jobs
+
+```
+hermes cron list            List jobs (--all for disabled)
+hermes cron create SCHED    Create: '30m', 'every 2h', '0 9 * * *'
+hermes cron edit ID         Edit schedule, prompt, delivery
+hermes cron pause/resume ID Control job state
+hermes cron run ID          Trigger on next tick
+hermes cron remove ID       Delete a job
+hermes cron status          Scheduler status
+```
+
+### Webhooks
+
+```
+hermes webhook subscribe N  Create route at /webhooks/<name>
+hermes webhook list         List subscriptions
+hermes webhook remove NAME  Remove a subscription
+hermes webhook test NAME    Send a test POST
+```
+
+### Profiles
+
+```
+hermes profile list         List all profiles
+hermes profile create NAME  Create (--clone, --clone-all, --clone-from)
+hermes profile use NAME     Set sticky default
+hermes profile delete NAME  Delete a profile
+hermes profile show NAME    Show details
+hermes profile alias NAME   Manage wrapper scripts
+hermes profile rename A B   Rename a profile
+hermes profile export NAME  Export to tar.gz
+hermes profile import FILE  Import from archive
+```
+
+### Credential Pools
+
+```
+hermes auth add             Interactive credential wizard
+hermes auth list [PROVIDER] List pooled credentials
+hermes auth remove P INDEX  Remove by provider + index
+hermes auth reset PROVIDER  Clear exhaustion status
+```
+
+### Other
+
+```
+hermes insights [--days N]  Usage analytics
+hermes update               Update to latest version
+hermes pairing list/approve/revoke  DM authorization
+hermes plugins list/install/remove  Plugin management
+hermes honcho setup/status  Honcho memory integration (requires honcho plugin)
+hermes memory setup/status/off  Memory provider config
+hermes completion bash|zsh  Shell completions
+hermes acp                  ACP server (IDE integration)
+hermes claw migrate         Migrate from OpenClaw
+hermes uninstall            Uninstall Hermes
+```
+
+---
+
+## Slash Commands (In-Session)
+
+Type these during an interactive chat session.
+
+### Session Control
+```
+/new (/reset)        Fresh session
+/clear               Clear screen + new session (CLI)
+/retry               Resend last message
+/undo                Remove last exchange
+/title [name]        Name the session
+/compress            Manually compress context
+/stop                Kill background processes
+/rollback [N]        Restore filesystem checkpoint
+/background <prompt> Run prompt in background
+/queue <prompt>      Queue for next turn
+/resume [name]       Resume a named session
+```
+
+### Configuration
+```
+/config              Show config (CLI)
+/model [name]        Show or change model
+/provider            Show provider info
+/personality [name]  Set personality
+/reasoning [level]   Set reasoning (none|minimal|low|medium|high|xhigh|show|hide)
+/verbose             Cycle: off → new → all → verbose
+/voice [on|off|tts]  Voice mode
+/yolo                Toggle approval bypass
+/skin [name]         Change theme (CLI)
+/statusbar           Toggle status bar (CLI)
+```
+
+### Tools & Skills
+```
+/tools               Manage tools (CLI)
+/toolsets            List toolsets (CLI)
+/skills              Search/install skills (CLI)
+/skill <name>        Load a skill into session
+/cron                Manage cron jobs (CLI)
+/reload-mcp          Reload MCP servers
+/plugins             List plugins (CLI)
+```
+
+### Gateway
+```
+/approve             Approve a pending command (gateway)
+/deny                Deny a pending command (gateway)
+/restart             Restart gateway (gateway)
+/sethome             Set current chat as home channel (gateway)
+/update              Update Hermes to latest (gateway)
+/platforms (/gateway) Show platform connection status (gateway)
+```
+
+### Utility
+```
+/branch (/fork)      Branch the current session
+/btw                 Ephemeral side question (doesn't interrupt main task)
+/fast                Toggle priority/fast processing
+/browser             Open CDP browser connection
+/history             Show conversation history (CLI)
+/save                Save conversation to file (CLI)
+/paste               Attach clipboard image (CLI)
+/image               Attach local image file (CLI)
+```
+
+### Info
+```
+/help                Show commands
+/commands [page]     Browse all commands (gateway)
+/usage               Token usage
+/insights [days]     Usage analytics
+/status              Session info (gateway)
+/profile             Active profile info
+```
+
+### Exit
+```
+/quit (/exit, /q)    Exit CLI
+```
+
+---
+
+## Key Paths & Config
+
+```
+~/.hermes/config.yaml       Main configuration
+~/.hermes/.env              API keys and secrets
+$HERMES_HOME/skills/        Installed skills
+~/.hermes/sessions/         Session transcripts
+~/.hermes/logs/             Gateway and error logs
+~/.hermes/auth.json         OAuth tokens and credential pools
+~/.hermes/hermes-agent/     Source code (if git-installed)
+```
+
+Profiles use `~/.hermes/profiles/<name>/` with the same layout.
+
+### Config Sections
+
+Edit with `hermes config edit` or `hermes config set section.key value`.
+
+| Section | Key options |
+|---------|-------------|
+| `model` | `default`, `provider`, `base_url`, `api_key`, `context_length` |
+| `agent` | `max_turns` (90), `tool_use_enforcement` |
+| `terminal` | `backend` (local/docker/ssh/modal), `cwd`, `timeout` (180) |
+| `compression` | `enabled`, `threshold` (0.50), `target_ratio` (0.20) |
+| `display` | `skin`, `tool_progress`, `show_reasoning`, `show_cost` |
+| `stt` | `enabled`, `provider` (local/groq/openai/mistral) |
+| `tts` | `provider` (edge/elevenlabs/openai/minimax/mistral/neutts) |
+| `memory` | `memory_enabled`, `user_profile_enabled`, `provider` |
+| `security` | `tirith_enabled`, `website_blocklist` |
+| `delegation` | `model`, `provider`, `base_url`, `api_key`, `max_iterations` (50), `reasoning_effort` |
+| `checkpoints` | `enabled`, `max_snapshots` (50) |
+
+Full config reference: https://hermes-agent.nousresearch.com/docs/user-guide/configuration
+
+### Providers
+
+20+ providers supported. Set via `hermes model` or `hermes setup`.
+
+| Provider | Auth | Key env var |
+|----------|------|-------------|
+| OpenRouter | API key | `OPENROUTER_API_KEY` |
+| Anthropic | API key | `ANTHROPIC_API_KEY` |
+| Nous Portal | OAuth | `hermes auth` |
+| OpenAI Codex | OAuth | `hermes auth` |
+| GitHub Copilot | Token | `COPILOT_GITHUB_TOKEN` |
+| Google Gemini | API key | `GOOGLE_API_KEY` or `GEMINI_API_KEY` |
+| DeepSeek | API key | `DEEPSEEK_API_KEY` |
+| xAI / Grok | API key | `XAI_API_KEY` |
+| Hugging Face | Token | `HF_TOKEN` |
+| Z.AI / GLM | API key | `GLM_API_KEY` |
+| MiniMax | API key | `MINIMAX_API_KEY` |
+| MiniMax CN | API key | `MINIMAX_CN_API_KEY` |
+| Kimi / Moonshot | API key | `KIMI_API_KEY` |
+| Alibaba / DashScope | API key | `DASHSCOPE_API_KEY` |
+| Xiaomi MiMo | API key | `XIAOMI_API_KEY` |
+| Kilo Code | API key | `KILOCODE_API_KEY` |
+| AI Gateway (Vercel) | API key | `AI_GATEWAY_API_KEY` |
+| OpenCode Zen | API key | `OPENCODE_ZEN_API_KEY` |
+| OpenCode Go | API key | `OPENCODE_GO_API_KEY` |
+| Qwen OAuth | OAuth | `hermes login --provider qwen-oauth` |
+| Custom endpoint | Config | `model.base_url` + `model.api_key` in config.yaml |
+| GitHub Copilot ACP | External | `COPILOT_CLI_PATH` or Copilot CLI |
+
+Full provider docs: https://hermes-agent.nousresearch.com/docs/integrations/providers
+
+### Toolsets
+
+Enable/disable via `hermes tools` (interactive) or `hermes tools enable/disable NAME`.
+
+| Toolset | What it provides |
+|---------|-----------------|
+| `web` | Web search and content extraction |
+| `browser` | Browser automation (Browserbase, Camofox, or local Chromium) |
+| `terminal` | Shell commands and process management |
+| `file` | File read/write/search/patch |
+| `code_execution` | Sandboxed Python execution |
+| `vision` | Image analysis |
+| `image_gen` | AI image generation |
+| `tts` | Text-to-speech |
+| `skills` | Skill browsing and management |
+| `memory` | Persistent cross-session memory |
+| `session_search` | Search past conversations |
+| `delegation` | Subagent task delegation |
+| `cronjob` | Scheduled task management |
+| `clarify` | Ask user clarifying questions |
+| `messaging` | Cross-platform message sending |
+| `search` | Web search only (subset of `web`) |
+| `todo` | In-session task planning and tracking |
+| `rl` | Reinforcement learning tools (off by default) |
+| `moa` | Mixture of Agents (off by default) |
+| `homeassistant` | Smart home control (off by default) |
+
+Tool changes take effect on `/reset` (new session). To preserve prompt caching, they are NOT applied mid-conversation.
+
+---
+
+## Voice & Transcription
+
+### STT (Voice → Text)
+
+Voice messages from messaging platforms are auto-transcribed.
+
+Provider priority (auto-detected):
+1. **Local faster-whisper** — free, no API key: `pip install faster-whisper`
+2. **Groq Whisper** — free tier: set `GROQ_API_KEY`
+3. **OpenAI Whisper** — paid: set `VOICE_TOOLS_OPENAI_KEY`
+4. **Mistral Voxtral** — set `MISTRAL_API_KEY`
+
+Config:
+```yaml
+stt:
+  enabled: true
+  provider: local        # local, groq, openai, mistral
+  local:
+    model: base          # tiny, base, small, medium, large-v3
+```
+
+### TTS (Text → Voice)
+
+| Provider | Env var | Free? |
+|----------|---------|-------|
+| Edge TTS | None | Yes (default) |
+| ElevenLabs | `ELEVENLABS_API_KEY` | Free tier |
+| OpenAI | `VOICE_TOOLS_OPENAI_KEY` | Paid |
+| MiniMax | `MINIMAX_API_KEY` | Paid |
+| Mistral (Voxtral) | `MISTRAL_API_KEY` | Paid |
+| NeuTTS (local) | None (`pip install neutts[all]` + `espeak-ng`) | Free |
+
+Voice commands: `/voice on` (voice-to-voice), `/voice tts` (always voice), `/voice off`.
+
+---
+
+## Spawning Additional Hermes Instances
+
+Run additional Hermes processes as fully independent subprocesses — separate sessions, tools, and environments.
+
+### When to Use This vs delegate_task
+
+| | `delegate_task` | Spawning `hermes` process |
+|-|-----------------|--------------------------|
+| Isolation | Separate conversation, shared process | Fully independent process |
+| Duration | Minutes (bounded by parent loop) | Hours/days |
+| Tool access | Subset of parent's tools | Full tool access |
+| Interactive | No | Yes (PTY mode) |
+| Use case | Quick parallel subtasks | Long autonomous missions |
+
+### One-Shot Mode
+
+```
+terminal(command="hermes chat -q 'Research GRPO papers and write summary to ~/research/grpo.md'", timeout=300)
+
+# Background for long tasks:
+terminal(command="hermes chat -q 'Set up CI/CD for ~/myapp'", background=true)
+```
+
+### Interactive PTY Mode (via tmux)
+
+Hermes uses prompt_toolkit, which requires a real terminal. Use tmux for interactive spawning:
+
+```
+# Start
+terminal(command="tmux new-session -d -s agent1 -x 120 -y 40 'hermes'", timeout=10)
+
+# Wait for startup, then send a message
+terminal(command="sleep 8 && tmux send-keys -t agent1 'Build a FastAPI auth service' Enter", timeout=15)
+
+# Read output
+terminal(command="sleep 20 && tmux capture-pane -t agent1 -p", timeout=5)
+
+# Send follow-up
+terminal(command="tmux send-keys -t agent1 'Add rate limiting middleware' Enter", timeout=5)
+
+# Exit
+terminal(command="tmux send-keys -t agent1 '/exit' Enter && sleep 2 && tmux kill-session -t agent1", timeout=10)
+```
+
+### Multi-Agent Coordination
+
+```
+# Agent A: backend
+terminal(command="tmux new-session -d -s backend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t backend 'Build REST API for user management' Enter", timeout=15)
+
+# Agent B: frontend
+terminal(command="tmux new-session -d -s frontend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t frontend 'Build React dashboard for user management' Enter", timeout=15)
+
+# Check progress, relay context between them
+terminal(command="tmux capture-pane -t backend -p | tail -30", timeout=5)
+terminal(command="tmux send-keys -t frontend 'Here is the API schema from the backend agent: ...' Enter", timeout=5)
+```
+
+### Session Resume
+
+```
+# Resume most recent session
+terminal(command="tmux new-session -d -s resumed 'hermes --continue'", timeout=10)
+
+# Resume specific session
+terminal(command="tmux new-session -d -s resumed 'hermes --resume 20260225_143052_a1b2c3'", timeout=10)
+```
+
+### Tips
+
+- **Prefer `delegate_task` for quick subtasks** — less overhead than spawning a full process
+- **Use `-w` (worktree mode)** when spawning agents that edit code — prevents git conflicts
+- **Set timeouts** for one-shot mode — complex tasks can take 5-10 minutes
+- **Use `hermes chat -q` for fire-and-forget** — no PTY needed
+- **Use tmux for interactive sessions** — raw PTY mode has `\r` vs `\n` issues with prompt_toolkit
+- **For scheduled tasks**, use the `cronjob` tool instead of spawning — handles delivery and retry
+
+---
+
+## Troubleshooting
+
+### Voice not working
+1. Check `stt.enabled: true` in config.yaml
+2. Verify provider: `pip install faster-whisper` or set API key
+3. In gateway: `/restart`. In CLI: exit and relaunch.
+
+### Tool not available
+1. `hermes tools` — check if toolset is enabled for your platform
+2. Some tools need env vars (check `.env`)
+3. `/reset` after enabling tools
+
+### Model/provider issues
+1. `hermes doctor` — check config and dependencies
+2. `hermes login` — re-authenticate OAuth providers
+3. Check `.env` has the right API key
+4. **Copilot 403**: `gh auth login` tokens do NOT work for Copilot API. You must use the Copilot-specific OAuth device code flow via `hermes model` → GitHub Copilot.
+
+### Changes not taking effect
+- **Tools/skills:** `/reset` starts a new session with updated toolset
+- **Config changes:** In gateway: `/restart`. In CLI: exit and relaunch.
+- **Code changes:** Restart the CLI or gateway process
+
+### Skills not showing
+1. `hermes skills list` — verify installed
+2. `hermes skills config` — check platform enablement
+3. Load explicitly: `/skill name` or `hermes -s name`
+
+### Gateway issues
+Check logs first:
+```bash
+grep -i "failed to send\|error" ~/.hermes/logs/gateway.log | tail -20
+```
+
+Common gateway problems:
+- **Gateway dies on SSH logout**: Enable linger: `sudo loginctl enable-linger $USER`
+- **Gateway dies on WSL2 close**: WSL2 requires `systemd=true` under the `[boot]` section of `/etc/wsl.conf` for systemd services to work. Without it, the gateway falls back to `nohup` and dies when the session closes.
+- **Gateway crash loop**: Reset the failed state: `systemctl --user reset-failed hermes-gateway`
+
+### Platform-specific issues
+- **Discord bot silent**: Must enable **Message Content Intent** in Bot → Privileged Gateway Intents.
+- **Slack bot only works in DMs**: Must subscribe to `message.channels` event. Without it, the bot ignores public channels.
+- **Windows HTTP 400 "No models provided"**: Config file encoding issue (BOM). Ensure `config.yaml` is saved as UTF-8 without BOM.
+
+### Auxiliary models not working
+If `auxiliary` tasks (vision, compression, session_search) fail silently, the most likely cause is that the `auto` provider can't find a backend. Either set `OPENROUTER_API_KEY` or `GOOGLE_API_KEY`, or explicitly configure each auxiliary task's provider:
+```bash
+hermes config set auxiliary.vision.provider <your_provider>
+hermes config set auxiliary.vision.model <model_name>
+```
+
+---
+
+## Where to Find Things
+
+| Looking for... | Location |
+|----------------|----------|
+| Config options | `hermes config edit` or [Configuration docs](https://hermes-agent.nousresearch.com/docs/user-guide/configuration) |
+| Available tools | `hermes tools list` or [Tools reference](https://hermes-agent.nousresearch.com/docs/reference/tools-reference) |
+| Slash commands | `/help` in session or [Slash commands reference](https://hermes-agent.nousresearch.com/docs/reference/slash-commands) |
+| Skills catalog | `hermes skills browse` or [Skills catalog](https://hermes-agent.nousresearch.com/docs/reference/skills-catalog) |
+| Provider setup | `hermes model` or [Providers guide](https://hermes-agent.nousresearch.com/docs/integrations/providers) |
+| Platform setup | `hermes gateway setup` or [Messaging docs](https://hermes-agent.nousresearch.com/docs/user-guide/messaging/) |
+| MCP servers | `hermes mcp list` or [MCP guide](https://hermes-agent.nousresearch.com/docs/user-guide/features/mcp) |
+| Profiles | `hermes profile list` or [Profiles docs](https://hermes-agent.nousresearch.com/docs/user-guide/profiles) |
+| Cron jobs | `hermes cron list` or [Cron docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/cron) |
+| Memory | `hermes memory status` or [Memory docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/memory) |
+| Env variables | `hermes config env-path` or [Env vars reference](https://hermes-agent.nousresearch.com/docs/reference/environment-variables) |
+| CLI commands | `hermes --help` or [CLI reference](https://hermes-agent.nousresearch.com/docs/reference/cli-commands) |
+| Gateway logs | `~/.hermes/logs/gateway.log` |
+| Session files | `~/.hermes/sessions/` or `hermes sessions browse` |
+| Source code | `~/.hermes/hermes-agent/` |
+
+---
+
+## Contributor Quick Reference
+
+For occasional contributors and PR authors. Full developer docs: https://hermes-agent.nousresearch.com/docs/developer-guide/
+
+### Project Layout
+
+```
+hermes-agent/
+├── run_agent.py          # AIAgent — core conversation loop
+├── model_tools.py        # Tool discovery and dispatch
+├── toolsets.py           # Toolset definitions
+├── cli.py                # Interactive CLI (HermesCLI)
+├── hermes_state.py       # SQLite session store
+├── agent/                # Prompt builder, context compression, memory, model routing, credential pooling, skill dispatch
+├── hermes_cli/           # CLI subcommands, config, setup, commands
+│   ├── commands.py       # Slash command registry (CommandDef)
+│   ├── config.py         # DEFAULT_CONFIG, env var definitions
+│   └── main.py           # CLI entry point and argparse
+├── tools/                # One file per tool
+│   └── registry.py       # Central tool registry
+├── gateway/              # Messaging gateway
+│   └── platforms/        # Platform adapters (telegram, discord, etc.)
+├── cron/                 # Job scheduler
+├── tests/                # ~3000 pytest tests
+└── website/              # Docusaurus docs site
+```
+
+Config: `~/.hermes/config.yaml` (settings), `~/.hermes/.env` (API keys).
+
+### Adding a Tool (2 files)
+
+**1. Create `tools/your_tool.py`:**
+```python
+import json, os
+from tools.registry import registry
+
+def check_requirements() -> bool:
+    return bool(os.getenv("EXAMPLE_API_KEY"))
+
+def example_tool(param: str, task_id: str = None) -> str:
+    return json.dumps({"success": True, "data": "..."})
+
+registry.register(
+    name="example_tool",
+    toolset="example",
+    schema={"name": "example_tool", "description": "...", "parameters": {...}},
+    handler=lambda args, **kw: example_tool(
+        param=args.get("param", ""), task_id=kw.get("task_id")),
+    check_fn=check_requirements,
+    requires_env=["EXAMPLE_API_KEY"],
+)
+```
+
+**2. Add to `toolsets.py`** → `_HERMES_CORE_TOOLS` list.
+
+Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual import list needed.
+
+All handlers must return JSON strings. Use `get_hermes_home()` for paths, never hardcode `~/.hermes`.
+
+### Adding a Slash Command
+
+1. Add `CommandDef` to `COMMAND_REGISTRY` in `hermes_cli/commands.py`
+2. Add handler in `cli.py` → `process_command()`
+3. (Optional) Add gateway handler in `gateway/run.py`
+
+All consumers (help text, autocomplete, Telegram menu, Slack mapping) derive from the central registry automatically.
+
+### Agent Loop (High Level)
+
+```
+run_conversation():
+  1. Build system prompt
+  2. Loop while iterations < max:
+     a. Call LLM (OpenAI-format messages + tool schemas)
+     b. If tool_calls → dispatch each via handle_function_call() → append results → continue
+     c. If text response → return
+  3. Context compression triggers automatically near token limit
+```
+
+### Testing
+
+```bash
+python -m pytest tests/ -o 'addopts=' -q   # Full suite
+python -m pytest tests/tools/ -q            # Specific area
+```
+
+- Tests auto-redirect `HERMES_HOME` to temp dirs — never touch real `~/.hermes/`
+- Run full suite before pushing any change
+- Use `-o 'addopts='` to clear any baked-in pytest flags
+
+### Commit Conventions
+
+```
+type: concise subject line
+
+Optional body.
+```
+
+Types: `fix:`, `feat:`, `refactor:`, `docs:`, `chore:`
+
+### Key Rules
+
+- **Never break prompt caching** — don't change context, tools, or system prompt mid-conversation
+- **Message role alternation** — never two assistant or two user messages in a row
+- Use `get_hermes_home()` from `hermes_constants` for all paths (profile-safe)
+- Config values go in `config.yaml`, secrets go in `.env`
+- New tools need a `check_fn` so they only appear when requirements are met
diff --git a/output/hermes-agent/20260424_012839/metrics.json b/output/hermes-agent/20260424_012839/metrics.json
new file mode 100644
index 00000000..6eed3b34
--- /dev/null
+++ b/output/hermes-agent/20260424_012839/metrics.json
@@ -0,0 +1,17 @@
+{
+  "skill_name": "hermes-agent",
+  "timestamp": "20260424_012839",
+  "iterations": 10,
+  "optimizer_model": "openrouter/anthropic/claude-sonnet-4",
+  "eval_model": "openrouter/google/gemini-2.5-flash",
+  "baseline_score": 0.5979440860215053,
+  "evolved_score": 0.732700358422939,
+  "improvement": 0.13475627240143373,
+  "baseline_size": 27212,
+  "evolved_size": 27212,
+  "train_examples": 10,
+  "val_examples": 5,
+  "holdout_examples": 5,
+  "elapsed_seconds": 892.1851706504822,
+  "constraints_passed": true
+}
\ No newline at end of file
diff --git a/output/hermes-agent/20260424_013726/baseline_skill.md b/output/hermes-agent/20260424_013726/baseline_skill.md
new file mode 100644
index 00000000..d19471c8
--- /dev/null
+++ b/output/hermes-agent/20260424_013726/baseline_skill.md
@@ -0,0 +1,705 @@
+---
+name: hermes-agent
+description: Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
+version: 2.0.0
+author: Hermes Agent + Teknium
+license: MIT
+metadata:
+  hermes:
+    tags: [hermes, setup, configuration, multi-agent, spawning, cli, gateway, development]
+    homepage: https://github.com/NousResearch/hermes-agent
+    related_skills: [claude-code, codex, opencode]
+---
+
+# Hermes Agent
+
+Hermes Agent is an open-source AI agent framework by Nous Research that runs in your terminal, messaging platforms, and IDEs. It belongs to the same category as Claude Code (Anthropic), Codex (OpenAI), and OpenClaw — autonomous coding and task-execution agents that use tool calling to interact with your system. Hermes works with any LLM provider (OpenRouter, Anthropic, OpenAI, DeepSeek, local models, and 15+ others) and runs on Linux, macOS, and WSL.
+
+What makes Hermes different:
+
+- **Self-improving through skills** — Hermes learns from experience by saving reusable procedures as skills. When it solves a complex problem, discovers a workflow, or gets corrected, it can persist that knowledge as a skill document that loads into future sessions. Skills accumulate over time, making the agent better at your specific tasks and environment.
+- **Persistent memory across sessions** — remembers who you are, your preferences, environment details, and lessons learned. Pluggable memory backends (built-in, Honcho, Mem0, and more) let you choose how memory works.
+- **Multi-platform gateway** — the same agent runs on Telegram, Discord, Slack, WhatsApp, Signal, Matrix, Email, and 10+ other platforms with full tool access, not just chat.
+- **Provider-agnostic** — swap models and providers mid-workflow without changing anything else. Credential pools rotate across multiple API keys automatically.
+- **Profiles** — run multiple independent Hermes instances with isolated configs, sessions, skills, and memory.
+- **Extensible** — plugins, MCP servers, custom tools, webhook triggers, cron scheduling, and the full Python ecosystem.
+
+People use Hermes for software development, research, system administration, data analysis, content creation, home automation, and anything else that benefits from an AI agent with persistent context and full system access.
+
+**This skill helps you work with Hermes Agent effectively** — setting it up, configuring features, spawning additional agent instances, troubleshooting issues, finding the right commands and settings, and understanding how the system works when you need to extend or contribute to it.
+
+**Docs:** https://hermes-agent.nousresearch.com/docs/
+
+## Quick Start
+
+```bash
+# Install
+curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
+
+# Interactive chat (default)
+hermes
+
+# Single query
+hermes chat -q "What is the capital of France?"
+
+# Setup wizard
+hermes setup
+
+# Change model/provider
+hermes model
+
+# Check health
+hermes doctor
+```
+
+---
+
+## CLI Reference
+
+### Global Flags
+
+```
+hermes [flags] [command]
+
+  --version, -V             Show version
+  --resume, -r SESSION      Resume session by ID or title
+  --continue, -c [NAME]     Resume by name, or most recent session
+  --worktree, -w            Isolated git worktree mode (parallel agents)
+  --skills, -s SKILL        Preload skills (comma-separate or repeat)
+  --profile, -p NAME        Use a named profile
+  --yolo                    Skip dangerous command approval
+  --pass-session-id         Include session ID in system prompt
+```
+
+No subcommand defaults to `chat`.
+
+### Chat
+
+```
+hermes chat [flags]
+  -q, --query TEXT          Single query, non-interactive
+  -m, --model MODEL         Model (e.g. anthropic/claude-sonnet-4)
+  -t, --toolsets LIST       Comma-separated toolsets
+  --provider PROVIDER       Force provider (openrouter, anthropic, nous, etc.)
+  -v, --verbose             Verbose output
+  -Q, --quiet               Suppress banner, spinner, tool previews
+  --checkpoints             Enable filesystem checkpoints (/rollback)
+  --source TAG              Session source tag (default: cli)
+```
+
+### Configuration
+
+```
+hermes setup [section]      Interactive wizard (model|terminal|gateway|tools|agent)
+hermes model                Interactive model/provider picker
+hermes config               View current config
+hermes config edit          Open config.yaml in $EDITOR
+hermes config set KEY VAL   Set a config value
+hermes config path          Print config.yaml path
+hermes config env-path      Print .env path
+hermes config check         Check for missing/outdated config
+hermes config migrate       Update config with new options
+hermes login [--provider P] OAuth login (nous, openai-codex)
+hermes logout               Clear stored auth
+hermes doctor [--fix]       Check dependencies and config
+hermes status [--all]       Show component status
+```
+
+### Tools & Skills
+
+```
+hermes tools                Interactive tool enable/disable (curses UI)
+hermes tools list           Show all tools and status
+hermes tools enable NAME    Enable a toolset
+hermes tools disable NAME   Disable a toolset
+
+hermes skills list          List installed skills
+hermes skills search QUERY  Search the skills hub
+hermes skills install ID    Install a skill
+hermes skills inspect ID    Preview without installing
+hermes skills config        Enable/disable skills per platform
+hermes skills check         Check for updates
+hermes skills update        Update outdated skills
+hermes skills uninstall N   Remove a hub skill
+hermes skills publish PATH  Publish to registry
+hermes skills browse        Browse all available skills
+hermes skills tap add REPO  Add a GitHub repo as skill source
+```
+
+### MCP Servers
+
+```
+hermes mcp serve            Run Hermes as an MCP server
+hermes mcp add NAME         Add an MCP server (--url or --command)
+hermes mcp remove NAME      Remove an MCP server
+hermes mcp list             List configured servers
+hermes mcp test NAME        Test connection
+hermes mcp configure NAME   Toggle tool selection
+```
+
+### Gateway (Messaging Platforms)
+
+```
+hermes gateway run          Start gateway foreground
+hermes gateway install      Install as background service
+hermes gateway start/stop   Control the service
+hermes gateway restart      Restart the service
+hermes gateway status       Check status
+hermes gateway setup        Configure platforms
+```
+
+Supported platforms: Telegram, Discord, Slack, WhatsApp, Signal, Email, SMS, Matrix, Mattermost, Home Assistant, DingTalk, Feishu, WeCom, BlueBubbles (iMessage), Weixin (WeChat), API Server, Webhooks. Open WebUI connects via the API Server adapter.
+
+Platform docs: https://hermes-agent.nousresearch.com/docs/user-guide/messaging/
+
+### Sessions
+
+```
+hermes sessions list        List recent sessions
+hermes sessions browse      Interactive picker
+hermes sessions export OUT  Export to JSONL
+hermes sessions rename ID T Rename a session
+hermes sessions delete ID   Delete a session
+hermes sessions prune       Clean up old sessions (--older-than N days)
+hermes sessions stats       Session store statistics
+```
+
+### Cron Jobs
+
+```
+hermes cron list            List jobs (--all for disabled)
+hermes cron create SCHED    Create: '30m', 'every 2h', '0 9 * * *'
+hermes cron edit ID         Edit schedule, prompt, delivery
+hermes cron pause/resume ID Control job state
+hermes cron run ID          Trigger on next tick
+hermes cron remove ID       Delete a job
+hermes cron status          Scheduler status
+```
+
+### Webhooks
+
+```
+hermes webhook subscribe N  Create route at /webhooks/<name>
+hermes webhook list         List subscriptions
+hermes webhook remove NAME  Remove a subscription
+hermes webhook test NAME    Send a test POST
+```
+
+### Profiles
+
+```
+hermes profile list         List all profiles
+hermes profile create NAME  Create (--clone, --clone-all, --clone-from)
+hermes profile use NAME     Set sticky default
+hermes profile delete NAME  Delete a profile
+hermes profile show NAME    Show details
+hermes profile alias NAME   Manage wrapper scripts
+hermes profile rename A B   Rename a profile
+hermes profile export NAME  Export to tar.gz
+hermes profile import FILE  Import from archive
+```
+
+### Credential Pools
+
+```
+hermes auth add             Interactive credential wizard
+hermes auth list [PROVIDER] List pooled credentials
+hermes auth remove P INDEX  Remove by provider + index
+hermes auth reset PROVIDER  Clear exhaustion status
+```
+
+### Other
+
+```
+hermes insights [--days N]  Usage analytics
+hermes update               Update to latest version
+hermes pairing list/approve/revoke  DM authorization
+hermes plugins list/install/remove  Plugin management
+hermes honcho setup/status  Honcho memory integration (requires honcho plugin)
+hermes memory setup/status/off  Memory provider config
+hermes completion bash|zsh  Shell completions
+hermes acp                  ACP server (IDE integration)
+hermes claw migrate         Migrate from OpenClaw
+hermes uninstall            Uninstall Hermes
+```
+
+---
+
+## Slash Commands (In-Session)
+
+Type these during an interactive chat session.
+
+### Session Control
+```
+/new (/reset)        Fresh session
+/clear               Clear screen + new session (CLI)
+/retry               Resend last message
+/undo                Remove last exchange
+/title [name]        Name the session
+/compress            Manually compress context
+/stop                Kill background processes
+/rollback [N]        Restore filesystem checkpoint
+/background <prompt> Run prompt in background
+/queue <prompt>      Queue for next turn
+/resume [name]       Resume a named session
+```
+
+### Configuration
+```
+/config              Show config (CLI)
+/model [name]        Show or change model
+/provider            Show provider info
+/personality [name]  Set personality
+/reasoning [level]   Set reasoning (none|minimal|low|medium|high|xhigh|show|hide)
+/verbose             Cycle: off → new → all → verbose
+/voice [on|off|tts]  Voice mode
+/yolo                Toggle approval bypass
+/skin [name]         Change theme (CLI)
+/statusbar           Toggle status bar (CLI)
+```
+
+### Tools & Skills
+```
+/tools               Manage tools (CLI)
+/toolsets            List toolsets (CLI)
+/skills              Search/install skills (CLI)
+/skill <name>        Load a skill into session
+/cron                Manage cron jobs (CLI)
+/reload-mcp          Reload MCP servers
+/plugins             List plugins (CLI)
+```
+
+### Gateway
+```
+/approve             Approve a pending command (gateway)
+/deny                Deny a pending command (gateway)
+/restart             Restart gateway (gateway)
+/sethome             Set current chat as home channel (gateway)
+/update              Update Hermes to latest (gateway)
+/platforms (/gateway) Show platform connection status (gateway)
+```
+
+### Utility
+```
+/branch (/fork)      Branch the current session
+/btw                 Ephemeral side question (doesn't interrupt main task)
+/fast                Toggle priority/fast processing
+/browser             Open CDP browser connection
+/history             Show conversation history (CLI)
+/save                Save conversation to file (CLI)
+/paste               Attach clipboard image (CLI)
+/image               Attach local image file (CLI)
+```
+
+### Info
+```
+/help                Show commands
+/commands [page]     Browse all commands (gateway)
+/usage               Token usage
+/insights [days]     Usage analytics
+/status              Session info (gateway)
+/profile             Active profile info
+```
+
+### Exit
+```
+/quit (/exit, /q)    Exit CLI
+```
+
+---
+
+## Key Paths & Config
+
+```
+~/.hermes/config.yaml       Main configuration
+~/.hermes/.env              API keys and secrets
+$HERMES_HOME/skills/        Installed skills
+~/.hermes/sessions/         Session transcripts
+~/.hermes/logs/             Gateway and error logs
+~/.hermes/auth.json         OAuth tokens and credential pools
+~/.hermes/hermes-agent/     Source code (if git-installed)
+```
+
+Profiles use `~/.hermes/profiles/<name>/` with the same layout.
+
+### Config Sections
+
+Edit with `hermes config edit` or `hermes config set section.key value`.
+
+| Section | Key options |
+|---------|-------------|
+| `model` | `default`, `provider`, `base_url`, `api_key`, `context_length` |
+| `agent` | `max_turns` (90), `tool_use_enforcement` |
+| `terminal` | `backend` (local/docker/ssh/modal), `cwd`, `timeout` (180) |
+| `compression` | `enabled`, `threshold` (0.50), `target_ratio` (0.20) |
+| `display` | `skin`, `tool_progress`, `show_reasoning`, `show_cost` |
+| `stt` | `enabled`, `provider` (local/groq/openai/mistral) |
+| `tts` | `provider` (edge/elevenlabs/openai/minimax/mistral/neutts) |
+| `memory` | `memory_enabled`, `user_profile_enabled`, `provider` |
+| `security` | `tirith_enabled`, `website_blocklist` |
+| `delegation` | `model`, `provider`, `base_url`, `api_key`, `max_iterations` (50), `reasoning_effort` |
+| `checkpoints` | `enabled`, `max_snapshots` (50) |
+
+Full config reference: https://hermes-agent.nousresearch.com/docs/user-guide/configuration
+
+### Providers
+
+20+ providers supported. Set via `hermes model` or `hermes setup`.
+
+| Provider | Auth | Key env var |
+|----------|------|-------------|
+| OpenRouter | API key | `OPENROUTER_API_KEY` |
+| Anthropic | API key | `ANTHROPIC_API_KEY` |
+| Nous Portal | OAuth | `hermes auth` |
+| OpenAI Codex | OAuth | `hermes auth` |
+| GitHub Copilot | Token | `COPILOT_GITHUB_TOKEN` |
+| Google Gemini | API key | `GOOGLE_API_KEY` or `GEMINI_API_KEY` |
+| DeepSeek | API key | `DEEPSEEK_API_KEY` |
+| xAI / Grok | API key | `XAI_API_KEY` |
+| Hugging Face | Token | `HF_TOKEN` |
+| Z.AI / GLM | API key | `GLM_API_KEY` |
+| MiniMax | API key | `MINIMAX_API_KEY` |
+| MiniMax CN | API key | `MINIMAX_CN_API_KEY` |
+| Kimi / Moonshot | API key | `KIMI_API_KEY` |
+| Alibaba / DashScope | API key | `DASHSCOPE_API_KEY` |
+| Xiaomi MiMo | API key | `XIAOMI_API_KEY` |
+| Kilo Code | API key | `KILOCODE_API_KEY` |
+| AI Gateway (Vercel) | API key | `AI_GATEWAY_API_KEY` |
+| OpenCode Zen | API key | `OPENCODE_ZEN_API_KEY` |
+| OpenCode Go | API key | `OPENCODE_GO_API_KEY` |
+| Qwen OAuth | OAuth | `hermes login --provider qwen-oauth` |
+| Custom endpoint | Config | `model.base_url` + `model.api_key` in config.yaml |
+| GitHub Copilot ACP | External | `COPILOT_CLI_PATH` or Copilot CLI |
+
+Full provider docs: https://hermes-agent.nousresearch.com/docs/integrations/providers
+
+### Toolsets
+
+Enable/disable via `hermes tools` (interactive) or `hermes tools enable/disable NAME`.
+
+| Toolset | What it provides |
+|---------|-----------------|
+| `web` | Web search and content extraction |
+| `browser` | Browser automation (Browserbase, Camofox, or local Chromium) |
+| `terminal` | Shell commands and process management |
+| `file` | File read/write/search/patch |
+| `code_execution` | Sandboxed Python execution |
+| `vision` | Image analysis |
+| `image_gen` | AI image generation |
+| `tts` | Text-to-speech |
+| `skills` | Skill browsing and management |
+| `memory` | Persistent cross-session memory |
+| `session_search` | Search past conversations |
+| `delegation` | Subagent task delegation |
+| `cronjob` | Scheduled task management |
+| `clarify` | Ask user clarifying questions |
+| `messaging` | Cross-platform message sending |
+| `search` | Web search only (subset of `web`) |
+| `todo` | In-session task planning and tracking |
+| `rl` | Reinforcement learning tools (off by default) |
+| `moa` | Mixture of Agents (off by default) |
+| `homeassistant` | Smart home control (off by default) |
+
+Tool changes take effect on `/reset` (new session). They do NOT apply mid-conversation to preserve prompt caching.
+
+---
+
+## Voice & Transcription
+
+### STT (Voice → Text)
+
+Voice messages from messaging platforms are auto-transcribed.
+
+Provider priority (auto-detected):
+1. **Local faster-whisper** — free, no API key: `pip install faster-whisper`
+2. **Groq Whisper** — free tier: set `GROQ_API_KEY`
+3. **OpenAI Whisper** — paid: set `VOICE_TOOLS_OPENAI_KEY`
+4. **Mistral Voxtral** — set `MISTRAL_API_KEY`
+
+Config:
+```yaml
+stt:
+  enabled: true
+  provider: local        # local, groq, openai, mistral
+  local:
+    model: base          # tiny, base, small, medium, large-v3
+```
+
+### TTS (Text → Voice)
+
+| Provider | Env var | Free? |
+|----------|---------|-------|
+| Edge TTS | None | Yes (default) |
+| ElevenLabs | `ELEVENLABS_API_KEY` | Free tier |
+| OpenAI | `VOICE_TOOLS_OPENAI_KEY` | Paid |
+| MiniMax | `MINIMAX_API_KEY` | Paid |
+| Mistral (Voxtral) | `MISTRAL_API_KEY` | Paid |
+| NeuTTS (local) | None (`pip install neutts[all]` + `espeak-ng`) | Free |
+
+Voice commands: `/voice on` (voice-to-voice), `/voice tts` (always voice), `/voice off`.
+
+---
+
+## Spawning Additional Hermes Instances
+
+Run additional Hermes processes as fully independent subprocesses — separate sessions, tools, and environments.
+
+### When to Use This vs delegate_task
+
+| | `delegate_task` | Spawning `hermes` process |
+|-|-----------------|--------------------------|
+| Isolation | Separate conversation, shared process | Fully independent process |
+| Duration | Minutes (bounded by parent loop) | Hours/days |
+| Tool access | Subset of parent's tools | Full tool access |
+| Interactive | No | Yes (PTY mode) |
+| Use case | Quick parallel subtasks | Long autonomous missions |
+
+### One-Shot Mode
+
+```
+terminal(command="hermes chat -q 'Research GRPO papers and write summary to ~/research/grpo.md'", timeout=300)
+
+# Background for long tasks:
+terminal(command="hermes chat -q 'Set up CI/CD for ~/myapp'", background=true)
+```
+
+### Interactive PTY Mode (via tmux)
+
+Hermes uses prompt_toolkit, which requires a real terminal. Use tmux for interactive spawning:
+
+```
+# Start
+terminal(command="tmux new-session -d -s agent1 -x 120 -y 40 'hermes'", timeout=10)
+
+# Wait for startup, then send a message
+terminal(command="sleep 8 && tmux send-keys -t agent1 'Build a FastAPI auth service' Enter", timeout=15)
+
+# Read output
+terminal(command="sleep 20 && tmux capture-pane -t agent1 -p", timeout=30)
+
+# Send follow-up
+terminal(command="tmux send-keys -t agent1 'Add rate limiting middleware' Enter", timeout=5)
+
+# Exit
+terminal(command="tmux send-keys -t agent1 '/exit' Enter && sleep 2 && tmux kill-session -t agent1", timeout=10)
+```
+
+### Multi-Agent Coordination
+
+```
+# Agent A: backend
+terminal(command="tmux new-session -d -s backend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t backend 'Build REST API for user management' Enter", timeout=15)
+
+# Agent B: frontend
+terminal(command="tmux new-session -d -s frontend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t frontend 'Build React dashboard for user management' Enter", timeout=15)
+
+# Check progress, relay context between them
+terminal(command="tmux capture-pane -t backend -p | tail -30", timeout=5)
+terminal(command="tmux send-keys -t frontend 'Here is the API schema from the backend agent: ...' Enter", timeout=5)
+```
+
+### Session Resume
+
+```
+# Resume most recent session
+terminal(command="tmux new-session -d -s resumed 'hermes --continue'", timeout=10)
+
+# Resume specific session
+terminal(command="tmux new-session -d -s resumed 'hermes --resume 20260225_143052_a1b2c3'", timeout=10)
+```
+
+### Tips
+
+- **Prefer `delegate_task` for quick subtasks** — less overhead than spawning a full process
+- **Use `-w` (worktree mode)** when spawning agents that edit code — prevents git conflicts
+- **Set timeouts** for one-shot mode — complex tasks can take 5-10 minutes
+- **Use `hermes chat -q` for fire-and-forget** — no PTY needed
+- **Use tmux for interactive sessions** — raw PTY mode has `\r` vs `\n` issues with prompt_toolkit
+- **For scheduled tasks**, use the `cronjob` tool instead of spawning — handles delivery and retry
+
+---
+
+## Troubleshooting
+
+### Voice not working
+1. Check `stt.enabled: true` in config.yaml
+2. Verify provider: `pip install faster-whisper` or set API key
+3. In gateway: `/restart`. In CLI: exit and relaunch.
+
+### Tool not available
+1. `hermes tools` — check if toolset is enabled for your platform
+2. Some tools need env vars (check `.env`)
+3. `/reset` after enabling tools
+
+### Model/provider issues
+1. `hermes doctor` — check config and dependencies
+2. `hermes login` — re-authenticate OAuth providers
+3. Check `.env` has the right API key
+4. **Copilot 403**: `gh auth login` tokens do NOT work for Copilot API. You must use the Copilot-specific OAuth device code flow via `hermes model` → GitHub Copilot.
+
+### Changes not taking effect
+- **Tools/skills:** `/reset` starts a new session with updated toolset
+- **Config changes:** In gateway: `/restart`. In CLI: exit and relaunch.
+- **Code changes:** Restart the CLI or gateway process
+
+### Skills not showing
+1. `hermes skills list` — verify installed
+2. `hermes skills config` — check platform enablement
+3. Load explicitly: `/skill name` or `hermes -s name`
+
+### Gateway issues
+Check logs first:
+```bash
+grep -i "failed to send\|error" ~/.hermes/logs/gateway.log | tail -20
+```
+
+Common gateway problems:
+- **Gateway dies on SSH logout**: Enable linger: `sudo loginctl enable-linger $USER`
+- **Gateway dies on WSL2 close**: WSL2 requires `systemd=true` in `/etc/wsl.conf` for systemd services to work. Without it, gateway falls back to `nohup` (dies when session closes).
+- **Gateway crash loop**: Reset the failed state: `systemctl --user reset-failed hermes-gateway`
+
+### Platform-specific issues
+- **Discord bot silent**: Must enable **Message Content Intent** in Bot → Privileged Gateway Intents.
+- **Slack bot only works in DMs**: Must subscribe to `message.channels` event. Without it, the bot ignores public channels.
+- **Windows HTTP 400 "No models provided"**: Config file encoding issue (BOM). Ensure `config.yaml` is saved as UTF-8 without BOM.
+
+### Auxiliary models not working
+If `auxiliary` tasks (vision, compression, session_search) fail silently, the `auto` provider can't find a backend. Either set `OPENROUTER_API_KEY` or `GOOGLE_API_KEY`, or explicitly configure each auxiliary task's provider:
+```bash
+hermes config set auxiliary.vision.provider <your_provider>
+hermes config set auxiliary.vision.model <model_name>
+```
+
+---
+
+## Where to Find Things
+
+| Looking for... | Location |
+|----------------|----------|
+| Config options | `hermes config edit` or [Configuration docs](https://hermes-agent.nousresearch.com/docs/user-guide/configuration) |
+| Available tools | `hermes tools list` or [Tools reference](https://hermes-agent.nousresearch.com/docs/reference/tools-reference) |
+| Slash commands | `/help` in session or [Slash commands reference](https://hermes-agent.nousresearch.com/docs/reference/slash-commands) |
+| Skills catalog | `hermes skills browse` or [Skills catalog](https://hermes-agent.nousresearch.com/docs/reference/skills-catalog) |
+| Provider setup | `hermes model` or [Providers guide](https://hermes-agent.nousresearch.com/docs/integrations/providers) |
+| Platform setup | `hermes gateway setup` or [Messaging docs](https://hermes-agent.nousresearch.com/docs/user-guide/messaging/) |
+| MCP servers | `hermes mcp list` or [MCP guide](https://hermes-agent.nousresearch.com/docs/user-guide/features/mcp) |
+| Profiles | `hermes profile list` or [Profiles docs](https://hermes-agent.nousresearch.com/docs/user-guide/profiles) |
+| Cron jobs | `hermes cron list` or [Cron docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/cron) |
+| Memory | `hermes memory status` or [Memory docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/memory) |
+| Env variables | `hermes config env-path` or [Env vars reference](https://hermes-agent.nousresearch.com/docs/reference/environment-variables) |
+| CLI commands | `hermes --help` or [CLI reference](https://hermes-agent.nousresearch.com/docs/reference/cli-commands) |
+| Gateway logs | `~/.hermes/logs/gateway.log` |
+| Session files | `~/.hermes/sessions/` or `hermes sessions browse` |
+| Source code | `~/.hermes/hermes-agent/` |
+
+---
+
+## Contributor Quick Reference
+
+For occasional contributors and PR authors. Full developer docs: https://hermes-agent.nousresearch.com/docs/developer-guide/
+
+### Project Layout
+
+```
+hermes-agent/
+├── run_agent.py          # AIAgent — core conversation loop
+├── model_tools.py        # Tool discovery and dispatch
+├── toolsets.py           # Toolset definitions
+├── cli.py                # Interactive CLI (HermesCLI)
+├── hermes_state.py       # SQLite session store
+├── agent/                # Prompt builder, context compression, memory, model routing, credential pooling, skill dispatch
+├── hermes_cli/           # CLI subcommands, config, setup, commands
+│   ├── commands.py       # Slash command registry (CommandDef)
+│   ├── config.py         # DEFAULT_CONFIG, env var definitions
+│   └── main.py           # CLI entry point and argparse
+├── tools/                # One file per tool
+│   └── registry.py       # Central tool registry
+├── gateway/              # Messaging gateway
+│   └── platforms/        # Platform adapters (telegram, discord, etc.)
+├── cron/                 # Job scheduler
+├── tests/                # ~3000 pytest tests
+└── website/              # Docusaurus docs site
+```
+
+Config: `~/.hermes/config.yaml` (settings), `~/.hermes/.env` (API keys).
+
+### Adding a Tool (2 files)
+
+**1. Create `tools/your_tool.py`:**
+```python
+import json, os
+from tools.registry import registry
+
+def check_requirements() -> bool:
+    return bool(os.getenv("EXAMPLE_API_KEY"))
+
+def example_tool(param: str, task_id: str = None) -> str:
+    return json.dumps({"success": True, "data": "..."})
+
+registry.register(
+    name="example_tool",
+    toolset="example",
+    schema={"name": "example_tool", "description": "...", "parameters": {...}},
+    handler=lambda args, **kw: example_tool(
+        param=args.get("param", ""), task_id=kw.get("task_id")),
+    check_fn=check_requirements,
+    requires_env=["EXAMPLE_API_KEY"],
+)
+```
+
+**2. Add to `toolsets.py`** → `_HERMES_CORE_TOOLS` list.
+
+Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual list needed.
+
+All handlers must return JSON strings. Use `get_hermes_home()` for paths, never hardcode `~/.hermes`.
+
+### Adding a Slash Command
+
+1. Add `CommandDef` to `COMMAND_REGISTRY` in `hermes_cli/commands.py`
+2. Add handler in `cli.py` → `process_command()`
+3. (Optional) Add gateway handler in `gateway/run.py`
+
+All consumers (help text, autocomplete, Telegram menu, Slack mapping) derive from the central registry automatically.
+
+### Agent Loop (High Level)
+
+```
+run_conversation():
+  1. Build system prompt
+  2. Loop while iterations < max:
+     a. Call LLM (OpenAI-format messages + tool schemas)
+     b. If tool_calls → dispatch each via handle_function_call() → append results → continue
+     c. If text response → return
+  3. Context compression triggers automatically near token limit
+```
+
+### Testing
+
+```bash
+python -m pytest tests/ -o 'addopts=' -q   # Full suite
+python -m pytest tests/tools/ -q            # Specific area
+```
+
+- Tests auto-redirect `HERMES_HOME` to temp dirs — never touch real `~/.hermes/`
+- Run full suite before pushing any change
+- Use `-o 'addopts='` to clear any baked-in pytest flags
+
+### Commit Conventions
+
+```
+type: concise subject line
+
+Optional body.
+```
+
+Types: `fix:`, `feat:`, `refactor:`, `docs:`, `chore:`
+
+### Key Rules
+
+- **Never break prompt caching** — don't change context, tools, or system prompt mid-conversation
+- **Message role alternation** — never two assistant or two user messages in a row
+- Use `get_hermes_home()` from `hermes_constants` for all paths (profile-safe)
+- Config values go in `config.yaml`, secrets go in `.env`
+- New tools need a `check_fn` so they only appear when requirements are met
diff --git a/output/hermes-agent/20260424_013726/evolved_skill.md b/output/hermes-agent/20260424_013726/evolved_skill.md
new file mode 100644
index 00000000..d19471c8
--- /dev/null
+++ b/output/hermes-agent/20260424_013726/evolved_skill.md
@@ -0,0 +1,705 @@
+---
+name: hermes-agent
+description: Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
+version: 2.0.0
+author: Hermes Agent + Teknium
+license: MIT
+metadata:
+  hermes:
+    tags: [hermes, setup, configuration, multi-agent, spawning, cli, gateway, development]
+    homepage: https://github.com/NousResearch/hermes-agent
+    related_skills: [claude-code, codex, opencode]
+---
+
+# Hermes Agent
+
+Hermes Agent is an open-source AI agent framework by Nous Research that runs in your terminal, messaging platforms, and IDEs. It belongs to the same category as Claude Code (Anthropic), Codex (OpenAI), and OpenClaw — autonomous coding and task-execution agents that use tool calling to interact with your system. Hermes works with any LLM provider (OpenRouter, Anthropic, OpenAI, DeepSeek, local models, and 15+ others) and runs on Linux, macOS, and WSL.
+
+What makes Hermes different:
+
+- **Self-improving through skills** — Hermes learns from experience by saving reusable procedures as skills. When it solves a complex problem, discovers a workflow, or gets corrected, it can persist that knowledge as a skill document that loads into future sessions. Skills accumulate over time, making the agent better at your specific tasks and environment.
+- **Persistent memory across sessions** — remembers who you are, your preferences, environment details, and lessons learned. Pluggable memory backends (built-in, Honcho, Mem0, and more) let you choose how memory works.
+- **Multi-platform gateway** — the same agent runs on Telegram, Discord, Slack, WhatsApp, Signal, Matrix, Email, and 10+ other platforms with full tool access, not just chat.
+- **Provider-agnostic** — swap models and providers mid-workflow without changing anything else. Credential pools rotate across multiple API keys automatically.
+- **Profiles** — run multiple independent Hermes instances with isolated configs, sessions, skills, and memory.
+- **Extensible** — plugins, MCP servers, custom tools, webhook triggers, cron scheduling, and the full Python ecosystem.
+
+People use Hermes for software development, research, system administration, data analysis, content creation, home automation, and anything else that benefits from an AI agent with persistent context and full system access.
+
+**This skill helps you work with Hermes Agent effectively** — setting it up, configuring features, spawning additional agent instances, troubleshooting issues, finding the right commands and settings, and understanding how the system works when you need to extend or contribute to it.
+
+**Docs:** https://hermes-agent.nousresearch.com/docs/
+
+## Quick Start
+
+```bash
+# Install
+curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
+
+# Interactive chat (default)
+hermes
+
+# Single query
+hermes chat -q "What is the capital of France?"
+
+# Setup wizard
+hermes setup
+
+# Change model/provider
+hermes model
+
+# Check health
+hermes doctor
+```
+
+---
+
+## CLI Reference
+
+### Global Flags
+
+```
+hermes [flags] [command]
+
+  --version, -V             Show version
+  --resume, -r SESSION      Resume session by ID or title
+  --continue, -c [NAME]     Resume by name, or most recent session
+  --worktree, -w            Isolated git worktree mode (parallel agents)
+  --skills, -s SKILL        Preload skills (comma-separated, or repeat the flag)
+  --profile, -p NAME        Use a named profile
+  --yolo                    Skip dangerous command approval
+  --pass-session-id         Include session ID in system prompt
+```
+
+No subcommand defaults to `chat`.
+
+### Chat
+
+```
+hermes chat [flags]
+  -q, --query TEXT          Single query, non-interactive
+  -m, --model MODEL         Model (e.g. anthropic/claude-sonnet-4)
+  -t, --toolsets LIST       Comma-separated toolsets
+  --provider PROVIDER       Force provider (openrouter, anthropic, nous, etc.)
+  -v, --verbose             Verbose output
+  -Q, --quiet               Suppress banner, spinner, tool previews
+  --checkpoints             Enable filesystem checkpoints (/rollback)
+  --source TAG              Session source tag (default: cli)
+```
+
+### Configuration
+
+```
+hermes setup [section]      Interactive wizard (model|terminal|gateway|tools|agent)
+hermes model                Interactive model/provider picker
+hermes config               View current config
+hermes config edit          Open config.yaml in $EDITOR
+hermes config set KEY VAL   Set a config value
+hermes config path          Print config.yaml path
+hermes config env-path      Print .env path
+hermes config check         Check for missing/outdated config
+hermes config migrate       Update config with new options
+hermes login [--provider P] OAuth login (nous, openai-codex)
+hermes logout               Clear stored auth
+hermes doctor [--fix]       Check dependencies and config
+hermes status [--all]       Show component status
+```
+
+### Tools & Skills
+
+```
+hermes tools                Interactive tool enable/disable (curses UI)
+hermes tools list           Show all tools and status
+hermes tools enable NAME    Enable a toolset
+hermes tools disable NAME   Disable a toolset
+
+hermes skills list          List installed skills
+hermes skills search QUERY  Search the skills hub
+hermes skills install ID    Install a skill
+hermes skills inspect ID    Preview without installing
+hermes skills config        Enable/disable skills per platform
+hermes skills check         Check for updates
+hermes skills update        Update outdated skills
+hermes skills uninstall N   Remove a hub skill
+hermes skills publish PATH  Publish to registry
+hermes skills browse        Browse all available skills
+hermes skills tap add REPO  Add a GitHub repo as skill source
+```
+
+### MCP Servers
+
+```
+hermes mcp serve            Run Hermes as an MCP server
+hermes mcp add NAME         Add an MCP server (--url or --command)
+hermes mcp remove NAME      Remove an MCP server
+hermes mcp list             List configured servers
+hermes mcp test NAME        Test connection
+hermes mcp configure NAME   Toggle tool selection
+```
+
+### Gateway (Messaging Platforms)
+
+```
+hermes gateway run          Start gateway foreground
+hermes gateway install      Install as background service
+hermes gateway start/stop   Control the service
+hermes gateway restart      Restart the service
+hermes gateway status       Check status
+hermes gateway setup        Configure platforms
+```
+
+Supported platforms: Telegram, Discord, Slack, WhatsApp, Signal, Email, SMS, Matrix, Mattermost, Home Assistant, DingTalk, Feishu, WeCom, BlueBubbles (iMessage), Weixin (WeChat), API Server, Webhooks. Open WebUI connects via the API Server adapter.
+
+Platform docs: https://hermes-agent.nousresearch.com/docs/user-guide/messaging/
+
+### Sessions
+
+```
+hermes sessions list        List recent sessions
+hermes sessions browse      Interactive picker
+hermes sessions export OUT  Export to JSONL
+hermes sessions rename ID T Rename a session
+hermes sessions delete ID   Delete a session
+hermes sessions prune       Clean up old sessions (--older-than N days)
+hermes sessions stats       Session store statistics
+```
+
+### Cron Jobs
+
+```
+hermes cron list            List jobs (--all for disabled)
+hermes cron create SCHED    Create: '30m', 'every 2h', '0 9 * * *'
+hermes cron edit ID         Edit schedule, prompt, delivery
+hermes cron pause/resume ID Control job state
+hermes cron run ID          Trigger on next tick
+hermes cron remove ID       Delete a job
+hermes cron status          Scheduler status
+```
+
+### Webhooks
+
+```
+hermes webhook subscribe N  Create route at /webhooks/<name>
+hermes webhook list         List subscriptions
+hermes webhook remove NAME  Remove a subscription
+hermes webhook test NAME    Send a test POST
+```
+
+### Profiles
+
+```
+hermes profile list         List all profiles
+hermes profile create NAME  Create (--clone, --clone-all, --clone-from)
+hermes profile use NAME     Set sticky default
+hermes profile delete NAME  Delete a profile
+hermes profile show NAME    Show details
+hermes profile alias NAME   Manage wrapper scripts
+hermes profile rename A B   Rename a profile
+hermes profile export NAME  Export to tar.gz
+hermes profile import FILE  Import from archive
+```
+
+### Credential Pools
+
+```
+hermes auth add             Interactive credential wizard
+hermes auth list [PROVIDER] List pooled credentials
+hermes auth remove P INDEX  Remove by provider + index
+hermes auth reset PROVIDER  Clear exhaustion status
+```
+
+### Other
+
+```
+hermes insights [--days N]  Usage analytics
+hermes update               Update to latest version
+hermes pairing list/approve/revoke  DM authorization
+hermes plugins list/install/remove  Plugin management
+hermes honcho setup/status  Honcho memory integration (requires honcho plugin)
+hermes memory setup/status/off  Memory provider config
+hermes completion bash|zsh  Shell completions
+hermes acp                  ACP server (IDE integration)
+hermes claw migrate         Migrate from OpenClaw
+hermes uninstall            Uninstall Hermes
+```
+
+---
+
+## Slash Commands (In-Session)
+
+Type these during an interactive chat session.
+
+### Session Control
+```
+/new (/reset)        Fresh session
+/clear               Clear screen + new session (CLI)
+/retry               Resend last message
+/undo                Remove last exchange
+/title [name]        Name the session
+/compress            Manually compress context
+/stop                Kill background processes
+/rollback [N]        Restore filesystem checkpoint
+/background <prompt> Run prompt in background
+/queue <prompt>      Queue for next turn
+/resume [name]       Resume a named session
+```
+
+### Configuration
+```
+/config              Show config (CLI)
+/model [name]        Show or change model
+/provider            Show provider info
+/personality [name]  Set personality
+/reasoning [level]   Set reasoning (none|minimal|low|medium|high|xhigh|show|hide)
+/verbose             Cycle: off → new → all → verbose
+/voice [on|off|tts]  Voice mode
+/yolo                Toggle approval bypass
+/skin [name]         Change theme (CLI)
+/statusbar           Toggle status bar (CLI)
+```
+
+### Tools & Skills
+```
+/tools               Manage tools (CLI)
+/toolsets            List toolsets (CLI)
+/skills              Search/install skills (CLI)
+/skill <name>        Load a skill into session
+/cron                Manage cron jobs (CLI)
+/reload-mcp          Reload MCP servers
+/plugins             List plugins (CLI)
+```
+
+### Gateway
+```
+/approve             Approve a pending command (gateway)
+/deny                Deny a pending command (gateway)
+/restart             Restart gateway (gateway)
+/sethome             Set current chat as home channel (gateway)
+/update              Update Hermes to latest (gateway)
+/platforms (/gateway) Show platform connection status (gateway)
+```
+
+### Utility
+```
+/branch (/fork)      Branch the current session
+/btw                 Ephemeral side question (doesn't interrupt main task)
+/fast                Toggle priority/fast processing
+/browser             Open CDP browser connection
+/history             Show conversation history (CLI)
+/save                Save conversation to file (CLI)
+/paste               Attach clipboard image (CLI)
+/image               Attach local image file (CLI)
+```
+
+### Info
+```
+/help                Show commands
+/commands [page]     Browse all commands (gateway)
+/usage               Token usage
+/insights [days]     Usage analytics
+/status              Session info (gateway)
+/profile             Active profile info
+```
+
+### Exit
+```
+/quit (/exit, /q)    Exit CLI
+```
+
+---
+
+## Key Paths & Config
+
+```
+~/.hermes/config.yaml       Main configuration
+~/.hermes/.env              API keys and secrets
+$HERMES_HOME/skills/        Installed skills
+~/.hermes/sessions/         Session transcripts
+~/.hermes/logs/             Gateway and error logs
+~/.hermes/auth.json         OAuth tokens and credential pools
+~/.hermes/hermes-agent/     Source code (if git-installed)
+```
+
+Profiles use `~/.hermes/profiles/<name>/` with the same layout.
+
+### Config Sections
+
+Edit with `hermes config edit` or `hermes config set section.key value`.
+
+| Section | Key options |
+|---------|-------------|
+| `model` | `default`, `provider`, `base_url`, `api_key`, `context_length` |
+| `agent` | `max_turns` (90), `tool_use_enforcement` |
+| `terminal` | `backend` (local/docker/ssh/modal), `cwd`, `timeout` (180) |
+| `compression` | `enabled`, `threshold` (0.50), `target_ratio` (0.20) |
+| `display` | `skin`, `tool_progress`, `show_reasoning`, `show_cost` |
+| `stt` | `enabled`, `provider` (local/groq/openai/mistral) |
+| `tts` | `provider` (edge/elevenlabs/openai/minimax/mistral/neutts) |
+| `memory` | `memory_enabled`, `user_profile_enabled`, `provider` |
+| `security` | `tirith_enabled`, `website_blocklist` |
+| `delegation` | `model`, `provider`, `base_url`, `api_key`, `max_iterations` (50), `reasoning_effort` |
+| `checkpoints` | `enabled`, `max_snapshots` (50) |
+
+Full config reference: https://hermes-agent.nousresearch.com/docs/user-guide/configuration
+
+### Providers
+
+20+ providers supported. Set via `hermes model` or `hermes setup`.
+
+| Provider | Auth | Key env var |
+|----------|------|-------------|
+| OpenRouter | API key | `OPENROUTER_API_KEY` |
+| Anthropic | API key | `ANTHROPIC_API_KEY` |
+| Nous Portal | OAuth | `hermes auth` |
+| OpenAI Codex | OAuth | `hermes auth` |
+| GitHub Copilot | Token | `COPILOT_GITHUB_TOKEN` |
+| Google Gemini | API key | `GOOGLE_API_KEY` or `GEMINI_API_KEY` |
+| DeepSeek | API key | `DEEPSEEK_API_KEY` |
+| xAI / Grok | API key | `XAI_API_KEY` |
+| Hugging Face | Token | `HF_TOKEN` |
+| Z.AI / GLM | API key | `GLM_API_KEY` |
+| MiniMax | API key | `MINIMAX_API_KEY` |
+| MiniMax CN | API key | `MINIMAX_CN_API_KEY` |
+| Kimi / Moonshot | API key | `KIMI_API_KEY` |
+| Alibaba / DashScope | API key | `DASHSCOPE_API_KEY` |
+| Xiaomi MiMo | API key | `XIAOMI_API_KEY` |
+| Kilo Code | API key | `KILOCODE_API_KEY` |
+| AI Gateway (Vercel) | API key | `AI_GATEWAY_API_KEY` |
+| OpenCode Zen | API key | `OPENCODE_ZEN_API_KEY` |
+| OpenCode Go | API key | `OPENCODE_GO_API_KEY` |
+| Qwen OAuth | OAuth | `hermes login --provider qwen-oauth` |
+| Custom endpoint | Config | `model.base_url` + `model.api_key` in config.yaml |
+| GitHub Copilot ACP | External | `COPILOT_CLI_PATH` or Copilot CLI |
+
+Full provider docs: https://hermes-agent.nousresearch.com/docs/integrations/providers
+
+### Toolsets
+
+Enable/disable via `hermes tools` (interactive) or `hermes tools enable/disable NAME`.
+
+| Toolset | What it provides |
+|---------|-----------------|
+| `web` | Web search and content extraction |
+| `browser` | Browser automation (Browserbase, Camofox, or local Chromium) |
+| `terminal` | Shell commands and process management |
+| `file` | File read/write/search/patch |
+| `code_execution` | Sandboxed Python execution |
+| `vision` | Image analysis |
+| `image_gen` | AI image generation |
+| `tts` | Text-to-speech |
+| `skills` | Skill browsing and management |
+| `memory` | Persistent cross-session memory |
+| `session_search` | Search past conversations |
+| `delegation` | Subagent task delegation |
+| `cronjob` | Scheduled task management |
+| `clarify` | Ask user clarifying questions |
+| `messaging` | Cross-platform message sending |
+| `search` | Web search only (subset of `web`) |
+| `todo` | In-session task planning and tracking |
+| `rl` | Reinforcement learning tools (off by default) |
+| `moa` | Mixture of Agents (off by default) |
+| `homeassistant` | Smart home control (off by default) |
+
+Tool changes take effect on `/reset` (new session). To preserve prompt caching, they do NOT apply mid-conversation.
+
+---
+
+## Voice & Transcription
+
+### STT (Voice → Text)
+
+Voice messages from messaging platforms are auto-transcribed.
+
+Provider priority (auto-detected):
+1. **Local faster-whisper** — free, no API key: `pip install faster-whisper`
+2. **Groq Whisper** — free tier: set `GROQ_API_KEY`
+3. **OpenAI Whisper** — paid: set `VOICE_TOOLS_OPENAI_KEY`
+4. **Mistral Voxtral** — set `MISTRAL_API_KEY`
+
+Config:
+```yaml
+stt:
+  enabled: true
+  provider: local        # local, groq, openai, mistral
+  local:
+    model: base          # tiny, base, small, medium, large-v3
+```
+
+### TTS (Text → Voice)
+
+| Provider | Env var | Free? |
+|----------|---------|-------|
+| Edge TTS | None | Yes (default) |
+| ElevenLabs | `ELEVENLABS_API_KEY` | Free tier |
+| OpenAI | `VOICE_TOOLS_OPENAI_KEY` | Paid |
+| MiniMax | `MINIMAX_API_KEY` | Paid |
+| Mistral (Voxtral) | `MISTRAL_API_KEY` | Paid |
+| NeuTTS (local) | None (`pip install neutts[all]` + `espeak-ng`) | Free |
+
+Voice commands: `/voice on` (voice-to-voice), `/voice tts` (always voice), `/voice off`.
+
+---
+
+## Spawning Additional Hermes Instances
+
+Run additional Hermes processes as fully independent subprocesses — separate sessions, tools, and environments.
+
+### When to Use This vs delegate_task
+
+| | `delegate_task` | Spawning `hermes` process |
+|-|-----------------|--------------------------|
+| Isolation | Separate conversation, shared process | Fully independent process |
+| Duration | Minutes (bounded by parent loop) | Hours/days |
+| Tool access | Subset of parent's tools | Full tool access |
+| Interactive | No | Yes (PTY mode) |
+| Use case | Quick parallel subtasks | Long autonomous missions |
+
+### One-Shot Mode
+
+```
+terminal(command="hermes chat -q 'Research GRPO papers and write summary to ~/research/grpo.md'", timeout=300)
+
+# Background for long tasks:
+terminal(command="hermes chat -q 'Set up CI/CD for ~/myapp'", background=true)
+```
+
+### Interactive PTY Mode (via tmux)
+
+Hermes uses prompt_toolkit, which requires a real terminal. Use tmux for interactive spawning:
+
+```
+# Start
+terminal(command="tmux new-session -d -s agent1 -x 120 -y 40 'hermes'", timeout=10)
+
+# Wait for startup, then send a message
+terminal(command="sleep 8 && tmux send-keys -t agent1 'Build a FastAPI auth service' Enter", timeout=15)
+
+# Read output
+terminal(command="sleep 20 && tmux capture-pane -t agent1 -p", timeout=5)
+
+# Send follow-up
+terminal(command="tmux send-keys -t agent1 'Add rate limiting middleware' Enter", timeout=5)
+
+# Exit
+terminal(command="tmux send-keys -t agent1 '/exit' Enter && sleep 2 && tmux kill-session -t agent1", timeout=10)
+```
+
+### Multi-Agent Coordination
+
+```
+# Agent A: backend
+terminal(command="tmux new-session -d -s backend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t backend 'Build REST API for user management' Enter", timeout=15)
+
+# Agent B: frontend
+terminal(command="tmux new-session -d -s frontend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t frontend 'Build React dashboard for user management' Enter", timeout=15)
+
+# Check progress, relay context between them
+terminal(command="tmux capture-pane -t backend -p | tail -30", timeout=5)
+terminal(command="tmux send-keys -t frontend 'Here is the API schema from the backend agent: ...' Enter", timeout=5)
+```
+
+### Session Resume
+
+```
+# Resume most recent session
+terminal(command="tmux new-session -d -s resumed 'hermes --continue'", timeout=10)
+
+# Resume specific session
+terminal(command="tmux new-session -d -s resumed 'hermes --resume 20260225_143052_a1b2c3'", timeout=10)
+```
+
+### Tips
+
+- **Prefer `delegate_task` for quick subtasks** — less overhead than spawning a full process
+- **Use `-w` (worktree mode)** when spawning agents that edit code — prevents git conflicts
+- **Set timeouts** for one-shot mode — complex tasks can take 5-10 minutes
+- **Use `hermes chat -q` for fire-and-forget** — no PTY needed
+- **Use tmux for interactive sessions** — raw PTY mode has `\r` vs `\n` issues with prompt_toolkit
+- **For scheduled tasks**, use the `cronjob` tool instead of spawning — handles delivery and retry
+
+---
+
+## Troubleshooting
+
+### Voice not working
+1. Check `stt.enabled: true` in config.yaml
+2. Verify provider: `pip install faster-whisper` or set API key
+3. In gateway: `/restart`. In CLI: exit and relaunch.
+
+### Tool not available
+1. `hermes tools` — check if toolset is enabled for your platform
+2. Some tools need env vars (check `.env`)
+3. `/reset` after enabling tools
+
+### Model/provider issues
+1. `hermes doctor` — check config and dependencies
+2. `hermes login` — re-authenticate OAuth providers
+3. Check `.env` has the right API key
+4. **Copilot 403**: `gh auth login` tokens do NOT work for Copilot API. You must use the Copilot-specific OAuth device code flow via `hermes model` → GitHub Copilot.
+
+### Changes not taking effect
+- **Tools/skills:** `/reset` starts a new session with updated toolset
+- **Config changes:** In gateway: `/restart`. In CLI: exit and relaunch.
+- **Code changes:** Restart the CLI or gateway process
+
+### Skills not showing
+1. `hermes skills list` — verify installed
+2. `hermes skills config` — check platform enablement
+3. Load explicitly: `/skill name` or `hermes -s name`
+
+### Gateway issues
+Check logs first:
+```bash
+grep -i "failed to send\|error" ~/.hermes/logs/gateway.log | tail -20
+```
+
+Common gateway problems:
+- **Gateway dies on SSH logout**: Enable linger: `sudo loginctl enable-linger $USER`
+- **Gateway dies on WSL2 close**: WSL2 requires `systemd=true` in `/etc/wsl.conf` for systemd services to work. Without it, gateway falls back to `nohup` (dies when session closes).
+- **Gateway crash loop**: Reset the failed state: `systemctl --user reset-failed hermes-gateway`
+
+### Platform-specific issues
+- **Discord bot silent**: Must enable **Message Content Intent** in Bot → Privileged Gateway Intents.
+- **Slack bot only works in DMs**: Must subscribe to `message.channels` event. Without it, the bot ignores public channels.
+- **Windows HTTP 400 "No models provided"**: Config file encoding issue (BOM). Ensure `config.yaml` is saved as UTF-8 without BOM.
+
+### Auxiliary models not working
+If `auxiliary` tasks (vision, compression, session_search) fail silently, the likely cause is that the `auto` provider can't find a backend. Either set `OPENROUTER_API_KEY` or `GOOGLE_API_KEY`, or explicitly configure each auxiliary task's provider:
+```bash
+hermes config set auxiliary.vision.provider <your_provider>
+hermes config set auxiliary.vision.model <model_name>
+```
+
+---
+
+## Where to Find Things
+
+| Looking for... | Location |
+|----------------|----------|
+| Config options | `hermes config edit` or [Configuration docs](https://hermes-agent.nousresearch.com/docs/user-guide/configuration) |
+| Available tools | `hermes tools list` or [Tools reference](https://hermes-agent.nousresearch.com/docs/reference/tools-reference) |
+| Slash commands | `/help` in session or [Slash commands reference](https://hermes-agent.nousresearch.com/docs/reference/slash-commands) |
+| Skills catalog | `hermes skills browse` or [Skills catalog](https://hermes-agent.nousresearch.com/docs/reference/skills-catalog) |
+| Provider setup | `hermes model` or [Providers guide](https://hermes-agent.nousresearch.com/docs/integrations/providers) |
+| Platform setup | `hermes gateway setup` or [Messaging docs](https://hermes-agent.nousresearch.com/docs/user-guide/messaging/) |
+| MCP servers | `hermes mcp list` or [MCP guide](https://hermes-agent.nousresearch.com/docs/user-guide/features/mcp) |
+| Profiles | `hermes profile list` or [Profiles docs](https://hermes-agent.nousresearch.com/docs/user-guide/profiles) |
+| Cron jobs | `hermes cron list` or [Cron docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/cron) |
+| Memory | `hermes memory status` or [Memory docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/memory) |
+| Env variables | `hermes config env-path` or [Env vars reference](https://hermes-agent.nousresearch.com/docs/reference/environment-variables) |
+| CLI commands | `hermes --help` or [CLI reference](https://hermes-agent.nousresearch.com/docs/reference/cli-commands) |
+| Gateway logs | `~/.hermes/logs/gateway.log` |
+| Session files | `~/.hermes/sessions/` or `hermes sessions browse` |
+| Source code | `~/.hermes/hermes-agent/` |
+
+---
+
+## Contributor Quick Reference
+
+For occasional contributors and PR authors. Full developer docs: https://hermes-agent.nousresearch.com/docs/developer-guide/
+
+### Project Layout
+
+```
+hermes-agent/
+├── run_agent.py          # AIAgent — core conversation loop
+├── model_tools.py        # Tool discovery and dispatch
+├── toolsets.py           # Toolset definitions
+├── cli.py                # Interactive CLI (HermesCLI)
+├── hermes_state.py       # SQLite session store
+├── agent/                # Prompt builder, context compression, memory, model routing, credential pooling, skill dispatch
+├── hermes_cli/           # CLI subcommands, config, setup, commands
+│   ├── commands.py       # Slash command registry (CommandDef)
+│   ├── config.py         # DEFAULT_CONFIG, env var definitions
+│   └── main.py           # CLI entry point and argparse
+├── tools/                # One file per tool
+│   └── registry.py       # Central tool registry
+├── gateway/              # Messaging gateway
+│   └── platforms/        # Platform adapters (telegram, discord, etc.)
+├── cron/                 # Job scheduler
+├── tests/                # ~3000 pytest tests
+└── website/              # Docusaurus docs site
+```
+
+Config: `~/.hermes/config.yaml` (settings), `~/.hermes/.env` (API keys).
+
+### Adding a Tool (2 files)
+
+**1. Create `tools/your_tool.py`:**
+```python
+import json, os
+from tools.registry import registry
+
+def check_requirements() -> bool:
+    return bool(os.getenv("EXAMPLE_API_KEY"))
+
+def example_tool(param: str, task_id: str = None) -> str:
+    return json.dumps({"success": True, "data": "..."})
+
+registry.register(
+    name="example_tool",
+    toolset="example",
+    schema={"name": "example_tool", "description": "...", "parameters": {...}},
+    handler=lambda args, **kw: example_tool(
+        param=args.get("param", ""), task_id=kw.get("task_id")),
+    check_fn=check_requirements,
+    requires_env=["EXAMPLE_API_KEY"],
+)
+```
+
+**2. Add to `toolsets.py`** → `_HERMES_CORE_TOOLS` list.
+
+Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual import list needed.
+
+All handlers must return JSON strings. Use `get_hermes_home()` for paths, never hardcode `~/.hermes`.
+
+### Adding a Slash Command
+
+1. Add `CommandDef` to `COMMAND_REGISTRY` in `hermes_cli/commands.py`
+2. Add handler in `cli.py` → `process_command()`
+3. (Optional) Add gateway handler in `gateway/run.py`
+
+All consumers (help text, autocomplete, Telegram menu, Slack mapping) derive from the central registry automatically.
+
+### Agent Loop (High Level)
+
+```
+run_conversation():
+  1. Build system prompt
+  2. Loop while iterations < max:
+     a. Call LLM (OpenAI-format messages + tool schemas)
+     b. If tool_calls → dispatch each via handle_function_call() → append results → continue
+     c. If text response → return
+  3. Context compression triggers automatically near token limit
+```
+
+### Testing
+
+```bash
+python -m pytest tests/ -o 'addopts=' -q   # Full suite
+python -m pytest tests/tools/ -q            # Specific area
+```
+
+- Tests auto-redirect `HERMES_HOME` to temp dirs — never touch real `~/.hermes/`
+- Run full suite before pushing any change
+- Use `-o 'addopts='` to clear any baked-in pytest flags
+
+### Commit Conventions
+
+```
+type: concise subject line
+
+Optional body.
+```
+
+Types: `fix:`, `feat:`, `refactor:`, `docs:`, `chore:`
+
+### Key Rules
+
+- **Never break prompt caching** — don't change context, tools, or system prompt mid-conversation
+- **Message role alternation** — never two assistant or two user messages in a row
+- Use `get_hermes_home()` from `hermes_constants` for all paths (profile-safe)
+- Config values go in `config.yaml`, secrets go in `.env`
+- New tools need a `check_fn` so they only appear when requirements are met
diff --git a/output/hermes-agent/20260424_013726/metrics.json b/output/hermes-agent/20260424_013726/metrics.json
new file mode 100644
index 00000000..1c95352d
--- /dev/null
+++ b/output/hermes-agent/20260424_013726/metrics.json
@@ -0,0 +1,17 @@
+{
+  "skill_name": "hermes-agent",
+  "timestamp": "20260424_013726",
+  "iterations": 10,
+  "optimizer_model": "openrouter/anthropic/claude-sonnet-4",
+  "eval_model": "openrouter/google/gemini-2.5-flash",
+  "baseline_score": 0.4496029338518026,
+  "evolved_score": 0.5291380283868972,
+  "improvement": 0.07953509453509455,
+  "baseline_size": 27212,
+  "evolved_size": 27212,
+  "train_examples": 25,
+  "val_examples": 12,
+  "holdout_examples": 13,
+  "elapsed_seconds": 1521.6592333316803,
+  "constraints_passed": true
+}
\ No newline at end of file
diff --git a/output/hermes-agent/20260424_022039/baseline_skill.md b/output/hermes-agent/20260424_022039/baseline_skill.md
new file mode 100644
index 00000000..d19471c8
--- /dev/null
+++ b/output/hermes-agent/20260424_022039/baseline_skill.md
@@ -0,0 +1,705 @@
+---
+name: hermes-agent
+description: Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
+version: 2.0.0
+author: Hermes Agent + Teknium
+license: MIT
+metadata:
+  hermes:
+    tags: [hermes, setup, configuration, multi-agent, spawning, cli, gateway, development]
+    homepage: https://github.com/NousResearch/hermes-agent
+    related_skills: [claude-code, codex, opencode]
+---
+
+# Hermes Agent
+
+Hermes Agent is an open-source AI agent framework by Nous Research that runs in your terminal, messaging platforms, and IDEs. It belongs to the same category as Claude Code (Anthropic), Codex (OpenAI), and OpenClaw — autonomous coding and task-execution agents that use tool calling to interact with your system. Hermes works with any LLM provider (OpenRouter, Anthropic, OpenAI, DeepSeek, local models, and 15+ others) and runs on Linux, macOS, and WSL.
+
+What makes Hermes different:
+
+- **Self-improving through skills** — Hermes learns from experience by saving reusable procedures as skills. When it solves a complex problem, discovers a workflow, or gets corrected, it can persist that knowledge as a skill document that loads into future sessions. Skills accumulate over time, making the agent better at your specific tasks and environment.
+- **Persistent memory across sessions** — remembers who you are, your preferences, environment details, and lessons learned. Pluggable memory backends (built-in, Honcho, Mem0, and more) let you choose how memory works.
+- **Multi-platform gateway** — the same agent runs on Telegram, Discord, Slack, WhatsApp, Signal, Matrix, Email, and 10+ other platforms with full tool access, not just chat.
+- **Provider-agnostic** — swap models and providers mid-workflow without changing anything else. Credential pools rotate across multiple API keys automatically.
+- **Profiles** — run multiple independent Hermes instances with isolated configs, sessions, skills, and memory.
+- **Extensible** — plugins, MCP servers, custom tools, webhook triggers, cron scheduling, and the full Python ecosystem.
+
+People use Hermes for software development, research, system administration, data analysis, content creation, home automation, and anything else that benefits from an AI agent with persistent context and full system access.
+
+**This skill helps you work with Hermes Agent effectively** — setting it up, configuring features, spawning additional agent instances, troubleshooting issues, finding the right commands and settings, and understanding how the system works when you need to extend or contribute to it.
+
+**Docs:** https://hermes-agent.nousresearch.com/docs/
+
+## Quick Start
+
+```bash
+# Install
+curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
+
+# Interactive chat (default)
+hermes
+
+# Single query
+hermes chat -q "What is the capital of France?"
+
+# Setup wizard
+hermes setup
+
+# Change model/provider
+hermes model
+
+# Check health
+hermes doctor
+```
+
+---
+
+## CLI Reference
+
+### Global Flags
+
+```
+hermes [flags] [command]
+
+  --version, -V             Show version
+  --resume, -r SESSION      Resume session by ID or title
+  --continue, -c [NAME]     Resume by name, or most recent session
+  --worktree, -w            Isolated git worktree mode (parallel agents)
+  --skills, -s SKILL        Preload skills (comma-separate or repeat)
+  --profile, -p NAME        Use a named profile
+  --yolo                    Skip dangerous command approval
+  --pass-session-id         Include session ID in system prompt
+```
+
+No subcommand defaults to `chat`.
+
+### Chat
+
+```
+hermes chat [flags]
+  -q, --query TEXT          Single query, non-interactive
+  -m, --model MODEL         Model (e.g. anthropic/claude-sonnet-4)
+  -t, --toolsets LIST       Comma-separated toolsets
+  --provider PROVIDER       Force provider (openrouter, anthropic, nous, etc.)
+  -v, --verbose             Verbose output
+  -Q, --quiet               Suppress banner, spinner, tool previews
+  --checkpoints             Enable filesystem checkpoints (/rollback)
+  --source TAG              Session source tag (default: cli)
+```
+
+### Configuration
+
+```
+hermes setup [section]      Interactive wizard (model|terminal|gateway|tools|agent)
+hermes model                Interactive model/provider picker
+hermes config               View current config
+hermes config edit          Open config.yaml in $EDITOR
+hermes config set KEY VAL   Set a config value
+hermes config path          Print config.yaml path
+hermes config env-path      Print .env path
+hermes config check         Check for missing/outdated config
+hermes config migrate       Update config with new options
+hermes login [--provider P] OAuth login (nous, openai-codex)
+hermes logout               Clear stored auth
+hermes doctor [--fix]       Check dependencies and config
+hermes status [--all]       Show component status
+```
+
+### Tools & Skills
+
+```
+hermes tools                Interactive tool enable/disable (curses UI)
+hermes tools list           Show all tools and status
+hermes tools enable NAME    Enable a toolset
+hermes tools disable NAME   Disable a toolset
+
+hermes skills list          List installed skills
+hermes skills search QUERY  Search the skills hub
+hermes skills install ID    Install a skill
+hermes skills inspect ID    Preview without installing
+hermes skills config        Enable/disable skills per platform
+hermes skills check         Check for updates
+hermes skills update        Update outdated skills
+hermes skills uninstall N   Remove a hub skill
+hermes skills publish PATH  Publish to registry
+hermes skills browse        Browse all available skills
+hermes skills tap add REPO  Add a GitHub repo as skill source
+```
+
+### MCP Servers
+
+```
+hermes mcp serve            Run Hermes as an MCP server
+hermes mcp add NAME         Add an MCP server (--url or --command)
+hermes mcp remove NAME      Remove an MCP server
+hermes mcp list             List configured servers
+hermes mcp test NAME        Test connection
+hermes mcp configure NAME   Toggle tool selection
+```
+
+### Gateway (Messaging Platforms)
+
+```
+hermes gateway run          Start gateway foreground
+hermes gateway install      Install as background service
+hermes gateway start/stop   Control the service
+hermes gateway restart      Restart the service
+hermes gateway status       Check status
+hermes gateway setup        Configure platforms
+```
+
+Supported platforms: Telegram, Discord, Slack, WhatsApp, Signal, Email, SMS, Matrix, Mattermost, Home Assistant, DingTalk, Feishu, WeCom, BlueBubbles (iMessage), Weixin (WeChat), API Server, Webhooks. Open WebUI connects via the API Server adapter.
+
+Platform docs: https://hermes-agent.nousresearch.com/docs/user-guide/messaging/
+
+### Sessions
+
+```
+hermes sessions list        List recent sessions
+hermes sessions browse      Interactive picker
+hermes sessions export OUT  Export to JSONL
+hermes sessions rename ID T Rename a session
+hermes sessions delete ID   Delete a session
+hermes sessions prune       Clean up old sessions (--older-than N days)
+hermes sessions stats       Session store statistics
+```
+
+### Cron Jobs
+
+```
+hermes cron list            List jobs (--all for disabled)
+hermes cron create SCHED    Create: '30m', 'every 2h', '0 9 * * *'
+hermes cron edit ID         Edit schedule, prompt, delivery
+hermes cron pause/resume ID Control job state
+hermes cron run ID          Trigger on next tick
+hermes cron remove ID       Delete a job
+hermes cron status          Scheduler status
+```
+
+### Webhooks
+
+```
+hermes webhook subscribe N  Create route at /webhooks/<name>
+hermes webhook list         List subscriptions
+hermes webhook remove NAME  Remove a subscription
+hermes webhook test NAME    Send a test POST
+```
+
+### Profiles
+
+```
+hermes profile list         List all profiles
+hermes profile create NAME  Create (--clone, --clone-all, --clone-from)
+hermes profile use NAME     Set sticky default
+hermes profile delete NAME  Delete a profile
+hermes profile show NAME    Show details
+hermes profile alias NAME   Manage wrapper scripts
+hermes profile rename A B   Rename a profile
+hermes profile export NAME  Export to tar.gz
+hermes profile import FILE  Import from archive
+```
+
+### Credential Pools
+
+```
+hermes auth add             Interactive credential wizard
+hermes auth list [PROVIDER] List pooled credentials
+hermes auth remove P INDEX  Remove by provider + index
+hermes auth reset PROVIDER  Clear exhaustion status
+```
+
+### Other
+
+```
+hermes insights [--days N]  Usage analytics
+hermes update               Update to latest version
+hermes pairing list/approve/revoke  DM authorization
+hermes plugins list/install/remove  Plugin management
+hermes honcho setup/status  Honcho memory integration (requires honcho plugin)
+hermes memory setup/status/off  Memory provider config
+hermes completion bash|zsh  Shell completions
+hermes acp                  ACP server (IDE integration)
+hermes claw migrate         Migrate from OpenClaw
+hermes uninstall            Uninstall Hermes
+```
+
+---
+
+## Slash Commands (In-Session)
+
+Type these during an interactive chat session.
+
+### Session Control
+```
+/new (/reset)        Fresh session
+/clear               Clear screen + new session (CLI)
+/retry               Resend last message
+/undo                Remove last exchange
+/title [name]        Name the session
+/compress            Manually compress context
+/stop                Kill background processes
+/rollback [N]        Restore filesystem checkpoint
+/background <prompt> Run prompt in background
+/queue <prompt>      Queue for next turn
+/resume [name]       Resume a named session
+```
+
+### Configuration
+```
+/config              Show config (CLI)
+/model [name]        Show or change model
+/provider            Show provider info
+/personality [name]  Set personality
+/reasoning [level]   Set reasoning (none|minimal|low|medium|high|xhigh|show|hide)
+/verbose             Cycle: off → new → all → verbose
+/voice [on|off|tts]  Voice mode
+/yolo                Toggle approval bypass
+/skin [name]         Change theme (CLI)
+/statusbar           Toggle status bar (CLI)
+```
+
+### Tools & Skills
+```
+/tools               Manage tools (CLI)
+/toolsets            List toolsets (CLI)
+/skills              Search/install skills (CLI)
+/skill <name>        Load a skill into session
+/cron                Manage cron jobs (CLI)
+/reload-mcp          Reload MCP servers
+/plugins             List plugins (CLI)
+```
+
+### Gateway
+```
+/approve             Approve a pending command (gateway)
+/deny                Deny a pending command (gateway)
+/restart             Restart gateway (gateway)
+/sethome             Set current chat as home channel (gateway)
+/update              Update Hermes to latest (gateway)
+/platforms (/gateway) Show platform connection status (gateway)
+```
+
+### Utility
+```
+/branch (/fork)      Branch the current session
+/btw                 Ephemeral side question (doesn't interrupt main task)
+/fast                Toggle priority/fast processing
+/browser             Open CDP browser connection
+/history             Show conversation history (CLI)
+/save                Save conversation to file (CLI)
+/paste               Attach clipboard image (CLI)
+/image               Attach local image file (CLI)
+```
+
+### Info
+```
+/help                Show commands
+/commands [page]     Browse all commands (gateway)
+/usage               Token usage
+/insights [days]     Usage analytics
+/status              Session info (gateway)
+/profile             Active profile info
+```
+
+### Exit
+```
+/quit (/exit, /q)    Exit CLI
+```
+
+---
+
+## Key Paths & Config
+
+```
+~/.hermes/config.yaml       Main configuration
+~/.hermes/.env              API keys and secrets
+$HERMES_HOME/skills/        Installed skills
+~/.hermes/sessions/         Session transcripts
+~/.hermes/logs/             Gateway and error logs
+~/.hermes/auth.json         OAuth tokens and credential pools
+~/.hermes/hermes-agent/     Source code (if git-installed)
+```
+
+Profiles use `~/.hermes/profiles/<name>/` with the same layout.
+
+### Config Sections
+
+Edit with `hermes config edit` or `hermes config set section.key value`.
+
+| Section | Key options |
+|---------|-------------|
+| `model` | `default`, `provider`, `base_url`, `api_key`, `context_length` |
+| `agent` | `max_turns` (90), `tool_use_enforcement` |
+| `terminal` | `backend` (local/docker/ssh/modal), `cwd`, `timeout` (180) |
+| `compression` | `enabled`, `threshold` (0.50), `target_ratio` (0.20) |
+| `display` | `skin`, `tool_progress`, `show_reasoning`, `show_cost` |
+| `stt` | `enabled`, `provider` (local/groq/openai/mistral) |
+| `tts` | `provider` (edge/elevenlabs/openai/minimax/mistral/neutts) |
+| `memory` | `memory_enabled`, `user_profile_enabled`, `provider` |
+| `security` | `tirith_enabled`, `website_blocklist` |
+| `delegation` | `model`, `provider`, `base_url`, `api_key`, `max_iterations` (50), `reasoning_effort` |
+| `checkpoints` | `enabled`, `max_snapshots` (50) |
+
+Full config reference: https://hermes-agent.nousresearch.com/docs/user-guide/configuration
+
+### Providers
+
+20+ providers supported. Set via `hermes model` or `hermes setup`.
+
+| Provider | Auth | Key env var |
+|----------|------|-------------|
+| OpenRouter | API key | `OPENROUTER_API_KEY` |
+| Anthropic | API key | `ANTHROPIC_API_KEY` |
+| Nous Portal | OAuth | `hermes auth` |
+| OpenAI Codex | OAuth | `hermes auth` |
+| GitHub Copilot | Token | `COPILOT_GITHUB_TOKEN` |
+| Google Gemini | API key | `GOOGLE_API_KEY` or `GEMINI_API_KEY` |
+| DeepSeek | API key | `DEEPSEEK_API_KEY` |
+| xAI / Grok | API key | `XAI_API_KEY` |
+| Hugging Face | Token | `HF_TOKEN` |
+| Z.AI / GLM | API key | `GLM_API_KEY` |
+| MiniMax | API key | `MINIMAX_API_KEY` |
+| MiniMax CN | API key | `MINIMAX_CN_API_KEY` |
+| Kimi / Moonshot | API key | `KIMI_API_KEY` |
+| Alibaba / DashScope | API key | `DASHSCOPE_API_KEY` |
+| Xiaomi MiMo | API key | `XIAOMI_API_KEY` |
+| Kilo Code | API key | `KILOCODE_API_KEY` |
+| AI Gateway (Vercel) | API key | `AI_GATEWAY_API_KEY` |
+| OpenCode Zen | API key | `OPENCODE_ZEN_API_KEY` |
+| OpenCode Go | API key | `OPENCODE_GO_API_KEY` |
+| Qwen OAuth | OAuth | `hermes login --provider qwen-oauth` |
+| Custom endpoint | Config | `model.base_url` + `model.api_key` in config.yaml |
+| GitHub Copilot ACP | External | `COPILOT_CLI_PATH` or Copilot CLI |
+
+Full provider docs: https://hermes-agent.nousresearch.com/docs/integrations/providers
+
+### Toolsets
+
+Enable/disable via `hermes tools` (interactive) or `hermes tools enable/disable NAME`.
+
+| Toolset | What it provides |
+|---------|-----------------|
+| `web` | Web search and content extraction |
+| `browser` | Browser automation (Browserbase, Camofox, or local Chromium) |
+| `terminal` | Shell commands and process management |
+| `file` | File read/write/search/patch |
+| `code_execution` | Sandboxed Python execution |
+| `vision` | Image analysis |
+| `image_gen` | AI image generation |
+| `tts` | Text-to-speech |
+| `skills` | Skill browsing and management |
+| `memory` | Persistent cross-session memory |
+| `session_search` | Search past conversations |
+| `delegation` | Subagent task delegation |
+| `cronjob` | Scheduled task management |
+| `clarify` | Ask user clarifying questions |
+| `messaging` | Cross-platform message sending |
+| `search` | Web search only (subset of `web`) |
+| `todo` | In-session task planning and tracking |
+| `rl` | Reinforcement learning tools (off by default) |
+| `moa` | Mixture of Agents (off by default) |
+| `homeassistant` | Smart home control (off by default) |
+
+Tool changes take effect on `/reset` (new session). They do NOT apply mid-conversation to preserve prompt caching.
+
+---
+
+## Voice & Transcription
+
+### STT (Voice → Text)
+
+Voice messages from messaging platforms are auto-transcribed.
+
+Provider priority (auto-detected):
+1. **Local faster-whisper** — free, no API key: `pip install faster-whisper`
+2. **Groq Whisper** — free tier: set `GROQ_API_KEY`
+3. **OpenAI Whisper** — paid: set `VOICE_TOOLS_OPENAI_KEY`
+4. **Mistral Voxtral** — set `MISTRAL_API_KEY`
+
+Config:
+```yaml
+stt:
+  enabled: true
+  provider: local        # local, groq, openai, mistral
+  local:
+    model: base          # tiny, base, small, medium, large-v3
+```
+
+### TTS (Text → Voice)
+
+| Provider | Env var | Free? |
+|----------|---------|-------|
+| Edge TTS | None | Yes (default) |
+| ElevenLabs | `ELEVENLABS_API_KEY` | Free tier |
+| OpenAI | `VOICE_TOOLS_OPENAI_KEY` | Paid |
+| MiniMax | `MINIMAX_API_KEY` | Paid |
+| Mistral (Voxtral) | `MISTRAL_API_KEY` | Paid |
+| NeuTTS (local) | None (`pip install neutts[all]` + `espeak-ng`) | Free |
+
+Voice commands: `/voice on` (voice-to-voice), `/voice tts` (always voice), `/voice off`.
+
+---
+
+## Spawning Additional Hermes Instances
+
+Run additional Hermes processes as fully independent subprocesses — separate sessions, tools, and environments.
+
+### When to Use This vs delegate_task
+
+| | `delegate_task` | Spawning `hermes` process |
+|-|-----------------|--------------------------|
+| Isolation | Separate conversation, shared process | Fully independent process |
+| Duration | Minutes (bounded by parent loop) | Hours/days |
+| Tool access | Subset of parent's tools | Full tool access |
+| Interactive | No | Yes (PTY mode) |
+| Use case | Quick parallel subtasks | Long autonomous missions |
+
+### One-Shot Mode
+
+```
+terminal(command="hermes chat -q 'Research GRPO papers and write summary to ~/research/grpo.md'", timeout=300)
+
+# Background for long tasks:
+terminal(command="hermes chat -q 'Set up CI/CD for ~/myapp'", background=true)
+```
+
+### Interactive PTY Mode (via tmux)
+
+Hermes uses prompt_toolkit, which requires a real terminal. Use tmux for interactive spawning:
+
+```
+# Start
+terminal(command="tmux new-session -d -s agent1 -x 120 -y 40 'hermes'", timeout=10)
+
+# Wait for startup, then send a message
+terminal(command="sleep 8 && tmux send-keys -t agent1 'Build a FastAPI auth service' Enter", timeout=15)
+
+# Read output
+terminal(command="sleep 20 && tmux capture-pane -t agent1 -p", timeout=5)
+
+# Send follow-up
+terminal(command="tmux send-keys -t agent1 'Add rate limiting middleware' Enter", timeout=5)
+
+# Exit
+terminal(command="tmux send-keys -t agent1 '/exit' Enter && sleep 2 && tmux kill-session -t agent1", timeout=10)
+```
+
+### Multi-Agent Coordination
+
+```
+# Agent A: backend
+terminal(command="tmux new-session -d -s backend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t backend 'Build REST API for user management' Enter", timeout=15)
+
+# Agent B: frontend
+terminal(command="tmux new-session -d -s frontend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t frontend 'Build React dashboard for user management' Enter", timeout=15)
+
+# Check progress, relay context between them
+terminal(command="tmux capture-pane -t backend -p | tail -30", timeout=5)
+terminal(command="tmux send-keys -t frontend 'Here is the API schema from the backend agent: ...' Enter", timeout=5)
+```
+
+### Session Resume
+
+```
+# Resume most recent session
+terminal(command="tmux new-session -d -s resumed 'hermes --continue'", timeout=10)
+
+# Resume specific session
+terminal(command="tmux new-session -d -s resumed 'hermes --resume 20260225_143052_a1b2c3'", timeout=10)
+```
+
+### Tips
+
+- **Prefer `delegate_task` for quick subtasks** — less overhead than spawning a full process
+- **Use `-w` (worktree mode)** when spawning agents that edit code — prevents git conflicts
+- **Set timeouts** for one-shot mode — complex tasks can take 5-10 minutes
+- **Use `hermes chat -q` for fire-and-forget** — no PTY needed
+- **Use tmux for interactive sessions** — raw PTY mode has `\r` vs `\n` issues with prompt_toolkit
+- **For scheduled tasks**, use the `cronjob` tool instead of spawning — handles delivery and retry
+
+---
+
+## Troubleshooting
+
+### Voice not working
+1. Check `stt.enabled: true` in config.yaml
+2. Verify provider: `pip install faster-whisper` or set API key
+3. In gateway: `/restart`. In CLI: exit and relaunch.
+
+### Tool not available
+1. `hermes tools` — check if toolset is enabled for your platform
+2. Some tools need env vars (check `.env`)
+3. `/reset` after enabling tools
+
+### Model/provider issues
+1. `hermes doctor` — check config and dependencies
+2. `hermes login` — re-authenticate OAuth providers
+3. Check `.env` has the right API key
+4. **Copilot 403**: `gh auth login` tokens do NOT work for Copilot API. You must use the Copilot-specific OAuth device code flow via `hermes model` → GitHub Copilot.
+
+### Changes not taking effect
+- **Tools/skills:** `/reset` starts a new session with updated toolset
+- **Config changes:** In gateway: `/restart`. In CLI: exit and relaunch.
+- **Code changes:** Restart the CLI or gateway process
+
+### Skills not showing
+1. `hermes skills list` — verify installed
+2. `hermes skills config` — check platform enablement
+3. Load explicitly: `/skill name` or `hermes -s name`
+
+### Gateway issues
+Check logs first:
+```bash
+grep -i "failed to send\|error" ~/.hermes/logs/gateway.log | tail -20
+```
+
+Common gateway problems:
+- **Gateway dies on SSH logout**: Enable linger: `sudo loginctl enable-linger $USER`
+- **Gateway dies on WSL2 close**: WSL2 requires `systemd=true` in `/etc/wsl.conf` for systemd services to work. Without it, gateway falls back to `nohup` (dies when session closes).
+- **Gateway crash loop**: Reset the failed state: `systemctl --user reset-failed hermes-gateway`
+
+### Platform-specific issues
+- **Discord bot silent**: Must enable **Message Content Intent** in Bot → Privileged Gateway Intents.
+- **Slack bot only works in DMs**: Must subscribe to `message.channels` event. Without it, the bot ignores public channels.
+- **Windows HTTP 400 "No models provided"**: Config file encoding issue (BOM). Ensure `config.yaml` is saved as UTF-8 without BOM.
+
+### Auxiliary models not working
+If `auxiliary` tasks (vision, compression, session_search) fail silently, the `auto` provider can't find a backend. Either set `OPENROUTER_API_KEY` or `GOOGLE_API_KEY`, or explicitly configure each auxiliary task's provider:
+```bash
+hermes config set auxiliary.vision.provider <your_provider>
+hermes config set auxiliary.vision.model <model_name>
+```
+
+---
+
+## Where to Find Things
+
+| Looking for... | Location |
+|----------------|----------|
+| Config options | `hermes config edit` or [Configuration docs](https://hermes-agent.nousresearch.com/docs/user-guide/configuration) |
+| Available tools | `hermes tools list` or [Tools reference](https://hermes-agent.nousresearch.com/docs/reference/tools-reference) |
+| Slash commands | `/help` in session or [Slash commands reference](https://hermes-agent.nousresearch.com/docs/reference/slash-commands) |
+| Skills catalog | `hermes skills browse` or [Skills catalog](https://hermes-agent.nousresearch.com/docs/reference/skills-catalog) |
+| Provider setup | `hermes model` or [Providers guide](https://hermes-agent.nousresearch.com/docs/integrations/providers) |
+| Platform setup | `hermes gateway setup` or [Messaging docs](https://hermes-agent.nousresearch.com/docs/user-guide/messaging/) |
+| MCP servers | `hermes mcp list` or [MCP guide](https://hermes-agent.nousresearch.com/docs/user-guide/features/mcp) |
+| Profiles | `hermes profile list` or [Profiles docs](https://hermes-agent.nousresearch.com/docs/user-guide/profiles) |
+| Cron jobs | `hermes cron list` or [Cron docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/cron) |
+| Memory | `hermes memory status` or [Memory docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/memory) |
+| Env variables | `hermes config env-path` or [Env vars reference](https://hermes-agent.nousresearch.com/docs/reference/environment-variables) |
+| CLI commands | `hermes --help` or [CLI reference](https://hermes-agent.nousresearch.com/docs/reference/cli-commands) |
+| Gateway logs | `~/.hermes/logs/gateway.log` |
+| Session files | `~/.hermes/sessions/` or `hermes sessions browse` |
+| Source code | `~/.hermes/hermes-agent/` |
+
+---
+
+## Contributor Quick Reference
+
+For occasional contributors and PR authors. Full developer docs: https://hermes-agent.nousresearch.com/docs/developer-guide/
+
+### Project Layout
+
+```
+hermes-agent/
+├── run_agent.py          # AIAgent — core conversation loop
+├── model_tools.py        # Tool discovery and dispatch
+├── toolsets.py           # Toolset definitions
+├── cli.py                # Interactive CLI (HermesCLI)
+├── hermes_state.py       # SQLite session store
+├── agent/                # Prompt builder, context compression, memory, model routing, credential pooling, skill dispatch
+├── hermes_cli/           # CLI subcommands, config, setup, commands
+│   ├── commands.py       # Slash command registry (CommandDef)
+│   ├── config.py         # DEFAULT_CONFIG, env var definitions
+│   └── main.py           # CLI entry point and argparse
+├── tools/                # One file per tool
+│   └── registry.py       # Central tool registry
+├── gateway/              # Messaging gateway
+│   └── platforms/        # Platform adapters (telegram, discord, etc.)
+├── cron/                 # Job scheduler
+├── tests/                # ~3000 pytest tests
+└── website/              # Docusaurus docs site
+```
+
+Config: `~/.hermes/config.yaml` (settings), `~/.hermes/.env` (API keys).
+
+### Adding a Tool (2 files)
+
+**1. Create `tools/your_tool.py`:**
+```python
+import json, os
+from tools.registry import registry
+
+def check_requirements() -> bool:
+    return bool(os.getenv("EXAMPLE_API_KEY"))
+
+def example_tool(param: str, task_id: str = None) -> str:
+    return json.dumps({"success": True, "data": "..."})
+
+registry.register(
+    name="example_tool",
+    toolset="example",
+    schema={"name": "example_tool", "description": "...", "parameters": {...}},
+    handler=lambda args, **kw: example_tool(
+        param=args.get("param", ""), task_id=kw.get("task_id")),
+    check_fn=check_requirements,
+    requires_env=["EXAMPLE_API_KEY"],
+)
+```
+
+**2. Add to `toolsets.py`** → `_HERMES_CORE_TOOLS` list.
+
+Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual import list needed (the `toolsets.py` entry from step 2 is still required).
+
+All handlers must return JSON strings. Use `get_hermes_home()` for paths, never hardcode `~/.hermes`.
+
+### Adding a Slash Command
+
+1. Add `CommandDef` to `COMMAND_REGISTRY` in `hermes_cli/commands.py`
+2. Add handler in `cli.py` → `process_command()`
+3. (Optional) Add gateway handler in `gateway/run.py`
+
+All consumers (help text, autocomplete, Telegram menu, Slack mapping) derive from the central registry automatically.
+
+### Agent Loop (High Level)
+
+```
+run_conversation():
+  1. Build system prompt
+  2. Loop while iterations < max:
+     a. Call LLM (OpenAI-format messages + tool schemas)
+     b. If tool_calls → dispatch each via handle_function_call() → append results → continue
+     c. If text response → return
+  3. Context compression triggers automatically near token limit
+```
+
+### Testing
+
+```bash
+python -m pytest tests/ -o 'addopts=' -q   # Full suite
+python -m pytest tests/tools/ -q            # Specific area
+```
+
+- Tests auto-redirect `HERMES_HOME` to temp dirs — never touch real `~/.hermes/`
+- Run full suite before pushing any change
+- Use `-o 'addopts='` to clear any baked-in pytest flags
+
+### Commit Conventions
+
+```
+type: concise subject line
+
+Optional body.
+```
+
+Types: `fix:`, `feat:`, `refactor:`, `docs:`, `chore:`
+
+### Key Rules
+
+- **Never break prompt caching** — don't change context, tools, or system prompt mid-conversation
+- **Message role alternation** — never two assistant or two user messages in a row
+- Use `get_hermes_home()` from `hermes_constants` for all paths (profile-safe)
+- Config values go in `config.yaml`, secrets go in `.env`
+- New tools need a `check_fn` so they only appear when requirements are met
diff --git a/output/hermes-agent/20260424_022039/evolved_skill.md b/output/hermes-agent/20260424_022039/evolved_skill.md
new file mode 100644
index 00000000..4827bc24
--- /dev/null
+++ b/output/hermes-agent/20260424_022039/evolved_skill.md
@@ -0,0 +1,37 @@
+---
+name: hermes-agent
+description: Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
+version: 2.0.0
+author: Hermes Agent + Teknium
+license: MIT
+metadata:
+  hermes:
+    tags: [hermes, setup, configuration, multi-agent, spawning, cli, gateway, development]
+    homepage: https://github.com/NousResearch/hermes-agent
+    related_skills: [claude-code, codex, opencode]
+---
+
+You are tasked with answering questions about Hermes Agent, an open-source AI agent framework by Nous Research. You have been provided with comprehensive skill instructions containing detailed documentation about Hermes Agent's features, commands, configuration, and troubleshooting.
+
+When answering questions about Hermes Agent:
+
+1. **Use the skill instructions as your primary source**: All answers should be based on the information provided in the skill instructions. Do not rely on general knowledge about AI agents or similar tools.
+
+2. **Be specific and actionable**: Provide concrete commands, file paths, and step-by-step instructions when relevant. For example:
+   - Use exact command syntax like `hermes model` or `hermes config set section.key value`
+   - Reference specific file paths like `~/.hermes/config.yaml` or `~/.hermes/.env`
+   - Include relevant flags and options
+
+3. **Offer multiple approaches when available**: The skill instructions often provide several ways to accomplish the same task (interactive wizards, direct commands, config file editing). Present the most appropriate options based on the user's question.
+
+4. **Include troubleshooting context**: When answering about features or setup, proactively mention common issues and their solutions when relevant. Reference the troubleshooting section for specific problems.
+
+5. **Reference documentation structure**: The skill instructions are organized into specific sections (CLI Reference, Configuration, Troubleshooting, etc.). Use this organization to provide comprehensive answers that cover related information.
+
+6. **Be precise about technical details**: Include specific environment variables, config section names, command flags, and other technical details exactly as they appear in the documentation.
+
+7. **Explain prerequisites and dependencies**: When discussing features, mention any required API keys, installed packages, or configuration steps needed for functionality.
+
+8. **Structure your response clearly**: Use bullet points, code blocks, and clear headings when presenting multiple options or steps. Make the information easy to scan and follow.
+
+Remember that Hermes Agent has many unique features like skills, profiles, multi-platform gateways, and persistent memory that differentiate it from other AI agent frameworks. Always ground your answers in the specific capabilities and conventions described in the skill instructions.
diff --git a/output/hermes-agent/20260424_022039/metrics.json b/output/hermes-agent/20260424_022039/metrics.json
new file mode 100644
index 00000000..bba2059a
--- /dev/null
+++ b/output/hermes-agent/20260424_022039/metrics.json
@@ -0,0 +1,17 @@
+{
+  "skill_name": "hermes-agent",
+  "timestamp": "20260424_022039",
+  "iterations": 2,
+  "optimizer_model": "openrouter/anthropic/claude-sonnet-4",
+  "eval_model": "openrouter/google/gemini-2.5-flash",
+  "baseline_score": 0.6528416666666667,
+  "evolved_score": 0.5867083333333334,
+  "improvement": -0.06613333333333327,
+  "baseline_size": 27212,
+  "evolved_size": 2433,
+  "train_examples": 10,
+  "val_examples": 5,
+  "holdout_examples": 5,
+  "elapsed_seconds": 105.49460625648499,
+  "constraints_passed": true
+}
\ No newline at end of file
diff --git a/output/hermes-agent/evolved_FAILED.md b/output/hermes-agent/evolved_FAILED.md
new file mode 100644
index 00000000..d19471c8
--- /dev/null
+++ b/output/hermes-agent/evolved_FAILED.md
@@ -0,0 +1,705 @@
+---
+name: hermes-agent
+description: Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
+version: 2.0.0
+author: Hermes Agent + Teknium
+license: MIT
+metadata:
+  hermes:
+    tags: [hermes, setup, configuration, multi-agent, spawning, cli, gateway, development]
+    homepage: https://github.com/NousResearch/hermes-agent
+    related_skills: [claude-code, codex, opencode]
+---
+
+# Hermes Agent
+
+Hermes Agent is an open-source AI agent framework by Nous Research that runs in your terminal, messaging platforms, and IDEs. It belongs to the same category as Claude Code (Anthropic), Codex (OpenAI), and OpenClaw — autonomous coding and task-execution agents that use tool calling to interact with your system. Hermes works with any LLM provider (OpenRouter, Anthropic, OpenAI, DeepSeek, local models, and 15+ others) and runs on Linux, macOS, and WSL.
+
+What makes Hermes different:
+
+- **Self-improving through skills** — Hermes learns from experience by saving reusable procedures as skills. When it solves a complex problem, discovers a workflow, or gets corrected, it can persist that knowledge as a skill document that loads into future sessions. Skills accumulate over time, making the agent better at your specific tasks and environment.
+- **Persistent memory across sessions** — remembers who you are, your preferences, environment details, and lessons learned. Pluggable memory backends (built-in, Honcho, Mem0, and more) let you choose how memory works.
+- **Multi-platform gateway** — the same agent runs on Telegram, Discord, Slack, WhatsApp, Signal, Matrix, Email, and 10+ other platforms with full tool access, not just chat.
+- **Provider-agnostic** — swap models and providers mid-workflow without changing anything else. Credential pools rotate across multiple API keys automatically.
+- **Profiles** — run multiple independent Hermes instances with isolated configs, sessions, skills, and memory.
+- **Extensible** — plugins, MCP servers, custom tools, webhook triggers, cron scheduling, and the full Python ecosystem.
+
+People use Hermes for software development, research, system administration, data analysis, content creation, home automation, and anything else that benefits from an AI agent with persistent context and full system access.
+
+**This skill helps you work with Hermes Agent effectively** — setting it up, configuring features, spawning additional agent instances, troubleshooting issues, finding the right commands and settings, and understanding how the system works when you need to extend or contribute to it.
+
+**Docs:** https://hermes-agent.nousresearch.com/docs/
+
+## Quick Start
+
+```bash
+# Install
+curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
+
+# Interactive chat (default)
+hermes
+
+# Single query
+hermes chat -q "What is the capital of France?"
+
+# Setup wizard
+hermes setup
+
+# Change model/provider
+hermes model
+
+# Check health
+hermes doctor
+```
+
+---
+
+## CLI Reference
+
+### Global Flags
+
+```
+hermes [flags] [command]
+
+  --version, -V             Show version
+  --resume, -r SESSION      Resume session by ID or title
+  --continue, -c [NAME]     Resume by name, or most recent session
+  --worktree, -w            Isolated git worktree mode (parallel agents)
+  --skills, -s SKILL        Preload skills (comma-separate or repeat)
+  --profile, -p NAME        Use a named profile
+  --yolo                    Skip dangerous command approval
+  --pass-session-id         Include session ID in system prompt
+```
+
+No subcommand defaults to `chat`.
+
+### Chat
+
+```
+hermes chat [flags]
+  -q, --query TEXT          Single query, non-interactive
+  -m, --model MODEL         Model (e.g. anthropic/claude-sonnet-4)
+  -t, --toolsets LIST       Comma-separated toolsets
+  --provider PROVIDER       Force provider (openrouter, anthropic, nous, etc.)
+  -v, --verbose             Verbose output
+  -Q, --quiet               Suppress banner, spinner, tool previews
+  --checkpoints             Enable filesystem checkpoints (/rollback)
+  --source TAG              Session source tag (default: cli)
+```
+
+### Configuration
+
+```
+hermes setup [section]      Interactive wizard (model|terminal|gateway|tools|agent)
+hermes model                Interactive model/provider picker
+hermes config               View current config
+hermes config edit          Open config.yaml in $EDITOR
+hermes config set KEY VAL   Set a config value
+hermes config path          Print config.yaml path
+hermes config env-path      Print .env path
+hermes config check         Check for missing/outdated config
+hermes config migrate       Update config with new options
+hermes login [--provider P] OAuth login (nous, openai-codex)
+hermes logout               Clear stored auth
+hermes doctor [--fix]       Check dependencies and config
+hermes status [--all]       Show component status
+```
+
+### Tools & Skills
+
+```
+hermes tools                Interactive tool enable/disable (curses UI)
+hermes tools list           Show all tools and status
+hermes tools enable NAME    Enable a toolset
+hermes tools disable NAME   Disable a toolset
+
+hermes skills list          List installed skills
+hermes skills search QUERY  Search the skills hub
+hermes skills install ID    Install a skill
+hermes skills inspect ID    Preview without installing
+hermes skills config        Enable/disable skills per platform
+hermes skills check         Check for updates
+hermes skills update        Update outdated skills
+hermes skills uninstall N   Remove a hub skill
+hermes skills publish PATH  Publish to registry
+hermes skills browse        Browse all available skills
+hermes skills tap add REPO  Add a GitHub repo as skill source
+```
+
+### MCP Servers
+
+```
+hermes mcp serve            Run Hermes as an MCP server
+hermes mcp add NAME         Add an MCP server (--url or --command)
+hermes mcp remove NAME      Remove an MCP server
+hermes mcp list             List configured servers
+hermes mcp test NAME        Test connection
+hermes mcp configure NAME   Toggle tool selection
+```
+
+### Gateway (Messaging Platforms)
+
+```
+hermes gateway run          Start gateway foreground
+hermes gateway install      Install as background service
+hermes gateway start/stop   Control the service
+hermes gateway restart      Restart the service
+hermes gateway status       Check status
+hermes gateway setup        Configure platforms
+```
+
+Supported platforms: Telegram, Discord, Slack, WhatsApp, Signal, Email, SMS, Matrix, Mattermost, Home Assistant, DingTalk, Feishu, WeCom, BlueBubbles (iMessage), Weixin (WeChat), API Server, Webhooks. Open WebUI connects via the API Server adapter.
+
+Platform docs: https://hermes-agent.nousresearch.com/docs/user-guide/messaging/
+
+### Sessions
+
+```
+hermes sessions list        List recent sessions
+hermes sessions browse      Interactive picker
+hermes sessions export OUT  Export to JSONL
+hermes sessions rename ID T Rename a session
+hermes sessions delete ID   Delete a session
+hermes sessions prune       Clean up old sessions (--older-than N days)
+hermes sessions stats       Session store statistics
+```
+
+### Cron Jobs
+
+```
+hermes cron list            List jobs (--all for disabled)
+hermes cron create SCHED    Create: '30m', 'every 2h', '0 9 * * *'
+hermes cron edit ID         Edit schedule, prompt, delivery
+hermes cron pause/resume ID Control job state
+hermes cron run ID          Trigger on next tick
+hermes cron remove ID       Delete a job
+hermes cron status          Scheduler status
+```
+
+### Webhooks
+
+```
+hermes webhook subscribe N  Create route at /webhooks/<name>
+hermes webhook list         List subscriptions
+hermes webhook remove NAME  Remove a subscription
+hermes webhook test NAME    Send a test POST
+```
+
+### Profiles
+
+```
+hermes profile list         List all profiles
+hermes profile create NAME  Create (--clone, --clone-all, --clone-from)
+hermes profile use NAME     Set sticky default
+hermes profile delete NAME  Delete a profile
+hermes profile show NAME    Show details
+hermes profile alias NAME   Manage wrapper scripts
+hermes profile rename A B   Rename a profile
+hermes profile export NAME  Export to tar.gz
+hermes profile import FILE  Import from archive
+```
+
+### Credential Pools
+
+```
+hermes auth add             Interactive credential wizard
+hermes auth list [PROVIDER] List pooled credentials
+hermes auth remove P INDEX  Remove by provider + index
+hermes auth reset PROVIDER  Clear exhaustion status
+```
+
+### Other
+
+```
+hermes insights [--days N]  Usage analytics
+hermes update               Update to latest version
+hermes pairing list/approve/revoke  DM authorization
+hermes plugins list/install/remove  Plugin management
+hermes honcho setup/status  Honcho memory integration (requires honcho plugin)
+hermes memory setup/status/off  Memory provider config
+hermes completion bash|zsh  Shell completions
+hermes acp                  ACP server (IDE integration)
+hermes claw migrate         Migrate from OpenClaw
+hermes uninstall            Uninstall Hermes
+```
+
+---
+
+## Slash Commands (In-Session)
+
+Type these during an interactive chat session.
+
+### Session Control
+```
+/new (/reset)        Fresh session
+/clear               Clear screen + new session (CLI)
+/retry               Resend last message
+/undo                Remove last exchange
+/title [name]        Name the session
+/compress            Manually compress context
+/stop                Kill background processes
+/rollback [N]        Restore filesystem checkpoint
+/background <prompt> Run prompt in background
+/queue <prompt>      Queue for next turn
+/resume [name]       Resume a named session
+```
+
+### Configuration
+```
+/config              Show config (CLI)
+/model [name]        Show or change model
+/provider            Show provider info
+/personality [name]  Set personality
+/reasoning [level]   Set reasoning (none|minimal|low|medium|high|xhigh|show|hide)
+/verbose             Cycle: off → new → all → verbose
+/voice [on|off|tts]  Voice mode
+/yolo                Toggle approval bypass
+/skin [name]         Change theme (CLI)
+/statusbar           Toggle status bar (CLI)
+```
+
+### Tools & Skills
+```
+/tools               Manage tools (CLI)
+/toolsets            List toolsets (CLI)
+/skills              Search/install skills (CLI)
+/skill <name>        Load a skill into session
+/cron                Manage cron jobs (CLI)
+/reload-mcp          Reload MCP servers
+/plugins             List plugins (CLI)
+```
+
+### Gateway
+```
+/approve             Approve a pending command (gateway)
+/deny                Deny a pending command (gateway)
+/restart             Restart gateway (gateway)
+/sethome             Set current chat as home channel (gateway)
+/update              Update Hermes to latest (gateway)
+/platforms (/gateway) Show platform connection status (gateway)
+```
+
+### Utility
+```
+/branch (/fork)      Branch the current session
+/btw                 Ephemeral side question (doesn't interrupt main task)
+/fast                Toggle priority/fast processing
+/browser             Open CDP browser connection
+/history             Show conversation history (CLI)
+/save                Save conversation to file (CLI)
+/paste               Attach clipboard image (CLI)
+/image               Attach local image file (CLI)
+```
+
+### Info
+```
+/help                Show commands
+/commands [page]     Browse all commands (gateway)
+/usage               Token usage
+/insights [days]     Usage analytics
+/status              Session info (gateway)
+/profile             Active profile info
+```
+
+### Exit
+```
+/quit (/exit, /q)    Exit CLI
+```
+
+---
+
+## Key Paths & Config
+
+```
+~/.hermes/config.yaml       Main configuration
+~/.hermes/.env              API keys and secrets
+$HERMES_HOME/skills/        Installed skills
+~/.hermes/sessions/         Session transcripts
+~/.hermes/logs/             Gateway and error logs
+~/.hermes/auth.json         OAuth tokens and credential pools
+~/.hermes/hermes-agent/     Source code (if git-installed)
+```
+
+Profiles use `~/.hermes/profiles/<name>/` with the same layout.
+
+### Config Sections
+
+Edit with `hermes config edit` or `hermes config set section.key value`.
+
+| Section | Key options |
+|---------|-------------|
+| `model` | `default`, `provider`, `base_url`, `api_key`, `context_length` |
+| `agent` | `max_turns` (90), `tool_use_enforcement` |
+| `terminal` | `backend` (local/docker/ssh/modal), `cwd`, `timeout` (180) |
+| `compression` | `enabled`, `threshold` (0.50), `target_ratio` (0.20) |
+| `display` | `skin`, `tool_progress`, `show_reasoning`, `show_cost` |
+| `stt` | `enabled`, `provider` (local/groq/openai/mistral) |
+| `tts` | `provider` (edge/elevenlabs/openai/minimax/mistral/neutts) |
+| `memory` | `memory_enabled`, `user_profile_enabled`, `provider` |
+| `security` | `tirith_enabled`, `website_blocklist` |
+| `delegation` | `model`, `provider`, `base_url`, `api_key`, `max_iterations` (50), `reasoning_effort` |
+| `checkpoints` | `enabled`, `max_snapshots` (50) |
+
+Full config reference: https://hermes-agent.nousresearch.com/docs/user-guide/configuration
+
+### Providers
+
+20+ providers supported. Set via `hermes model` or `hermes setup`.
+
+| Provider | Auth | Key env var |
+|----------|------|-------------|
+| OpenRouter | API key | `OPENROUTER_API_KEY` |
+| Anthropic | API key | `ANTHROPIC_API_KEY` |
+| Nous Portal | OAuth | `hermes auth` |
+| OpenAI Codex | OAuth | `hermes auth` |
+| GitHub Copilot | Token | `COPILOT_GITHUB_TOKEN` |
+| Google Gemini | API key | `GOOGLE_API_KEY` or `GEMINI_API_KEY` |
+| DeepSeek | API key | `DEEPSEEK_API_KEY` |
+| xAI / Grok | API key | `XAI_API_KEY` |
+| Hugging Face | Token | `HF_TOKEN` |
+| Z.AI / GLM | API key | `GLM_API_KEY` |
+| MiniMax | API key | `MINIMAX_API_KEY` |
+| MiniMax CN | API key | `MINIMAX_CN_API_KEY` |
+| Kimi / Moonshot | API key | `KIMI_API_KEY` |
+| Alibaba / DashScope | API key | `DASHSCOPE_API_KEY` |
+| Xiaomi MiMo | API key | `XIAOMI_API_KEY` |
+| Kilo Code | API key | `KILOCODE_API_KEY` |
+| AI Gateway (Vercel) | API key | `AI_GATEWAY_API_KEY` |
+| OpenCode Zen | API key | `OPENCODE_ZEN_API_KEY` |
+| OpenCode Go | API key | `OPENCODE_GO_API_KEY` |
+| Qwen OAuth | OAuth | `hermes login --provider qwen-oauth` |
+| Custom endpoint | Config | `model.base_url` + `model.api_key` in config.yaml |
+| GitHub Copilot ACP | External | `COPILOT_CLI_PATH` or Copilot CLI |
+
+Full provider docs: https://hermes-agent.nousresearch.com/docs/integrations/providers
+
+### Toolsets
+
+Enable/disable via `hermes tools` (interactive) or `hermes tools enable/disable NAME`.
+
+| Toolset | What it provides |
+|---------|-----------------|
+| `web` | Web search and content extraction |
+| `browser` | Browser automation (Browserbase, Camofox, or local Chromium) |
+| `terminal` | Shell commands and process management |
+| `file` | File read/write/search/patch |
+| `code_execution` | Sandboxed Python execution |
+| `vision` | Image analysis |
+| `image_gen` | AI image generation |
+| `tts` | Text-to-speech |
+| `skills` | Skill browsing and management |
+| `memory` | Persistent cross-session memory |
+| `session_search` | Search past conversations |
+| `delegation` | Subagent task delegation |
+| `cronjob` | Scheduled task management |
+| `clarify` | Ask user clarifying questions |
+| `messaging` | Cross-platform message sending |
+| `search` | Web search only (subset of `web`) |
+| `todo` | In-session task planning and tracking |
+| `rl` | Reinforcement learning tools (off by default) |
+| `moa` | Mixture of Agents (off by default) |
+| `homeassistant` | Smart home control (off by default) |
+
+Tool changes take effect on `/reset` (new session). To preserve prompt caching, they do NOT apply mid-conversation.
+
+---
+
+## Voice & Transcription
+
+### STT (Voice → Text)
+
+Voice messages from messaging platforms are auto-transcribed.
+
+Provider priority (auto-detected):
+1. **Local faster-whisper** — free, no API key: `pip install faster-whisper`
+2. **Groq Whisper** — free tier: set `GROQ_API_KEY`
+3. **OpenAI Whisper** — paid: set `VOICE_TOOLS_OPENAI_KEY`
+4. **Mistral Voxtral** — set `MISTRAL_API_KEY`
+
+Config:
+```yaml
+stt:
+  enabled: true
+  provider: local        # local, groq, openai, mistral
+  local:
+    model: base          # tiny, base, small, medium, large-v3
+```
+
+### TTS (Text → Voice)
+
+| Provider | Env var | Free? |
+|----------|---------|-------|
+| Edge TTS | None | Yes (default) |
+| ElevenLabs | `ELEVENLABS_API_KEY` | Free tier |
+| OpenAI | `VOICE_TOOLS_OPENAI_KEY` | Paid |
+| MiniMax | `MINIMAX_API_KEY` | Paid |
+| Mistral (Voxtral) | `MISTRAL_API_KEY` | Paid |
+| NeuTTS (local) | None (`pip install neutts[all]` + `espeak-ng`) | Free |
+
+Voice commands: `/voice on` (voice-to-voice), `/voice tts` (always voice), `/voice off`.
+
+---
+
+## Spawning Additional Hermes Instances
+
+Run additional Hermes processes as fully independent subprocesses — separate sessions, tools, and environments.
+
+### When to Use This vs delegate_task
+
+| | `delegate_task` | Spawning `hermes` process |
+|-|-----------------|--------------------------|
+| Isolation | Separate conversation, shared process | Fully independent process |
+| Duration | Minutes (bounded by parent loop) | Hours/days |
+| Tool access | Subset of parent's tools | Full tool access |
+| Interactive | No | Yes (PTY mode) |
+| Use case | Quick parallel subtasks | Long autonomous missions |
+
+### One-Shot Mode
+
+```
+terminal(command="hermes chat -q 'Research GRPO papers and write summary to ~/research/grpo.md'", timeout=300)
+
+# Background for long tasks:
+terminal(command="hermes chat -q 'Set up CI/CD for ~/myapp'", background=true)
+```
+
+### Interactive PTY Mode (via tmux)
+
+Hermes uses prompt_toolkit, which requires a real terminal. Use tmux for interactive spawning:
+
+```
+# Start
+terminal(command="tmux new-session -d -s agent1 -x 120 -y 40 'hermes'", timeout=10)
+
+# Wait for startup, then send a message
+terminal(command="sleep 8 && tmux send-keys -t agent1 'Build a FastAPI auth service' Enter", timeout=15)
+
+# Read output
+terminal(command="sleep 20 && tmux capture-pane -t agent1 -p", timeout=5)
+
+# Send follow-up
+terminal(command="tmux send-keys -t agent1 'Add rate limiting middleware' Enter", timeout=5)
+
+# Exit
+terminal(command="tmux send-keys -t agent1 '/exit' Enter && sleep 2 && tmux kill-session -t agent1", timeout=10)
+```
+
+### Multi-Agent Coordination
+
+```
+# Agent A: backend
+terminal(command="tmux new-session -d -s backend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t backend 'Build REST API for user management' Enter", timeout=15)
+
+# Agent B: frontend
+terminal(command="tmux new-session -d -s frontend -x 120 -y 40 'hermes -w'", timeout=10)
+terminal(command="sleep 8 && tmux send-keys -t frontend 'Build React dashboard for user management' Enter", timeout=15)
+
+# Check progress, relay context between them
+terminal(command="tmux capture-pane -t backend -p | tail -30", timeout=5)
+terminal(command="tmux send-keys -t frontend 'Here is the API schema from the backend agent: ...' Enter", timeout=5)
+```
+
+### Session Resume
+
+```
+# Resume most recent session
+terminal(command="tmux new-session -d -s resumed 'hermes --continue'", timeout=10)
+
+# Resume specific session
+terminal(command="tmux new-session -d -s resumed 'hermes --resume 20260225_143052_a1b2c3'", timeout=10)
+```
+
+### Tips
+
+- **Prefer `delegate_task` for quick subtasks** — less overhead than spawning a full process
+- **Use `-w` (worktree mode)** when spawning agents that edit code — prevents git conflicts
+- **Set timeouts** for one-shot mode — complex tasks can take 5-10 minutes
+- **Use `hermes chat -q` for fire-and-forget** — no PTY needed
+- **Use tmux for interactive sessions** — raw PTY mode has `\r` vs `\n` issues with prompt_toolkit
+- **For scheduled tasks**, use the `cronjob` tool instead of spawning — handles delivery and retry
+
+---
+
+## Troubleshooting
+
+### Voice not working
+1. Check `stt.enabled: true` in config.yaml
+2. Verify provider: `pip install faster-whisper` or set API key
+3. In gateway: `/restart`. In CLI: exit and relaunch.
+
+### Tool not available
+1. `hermes tools` — check if toolset is enabled for your platform
+2. Some tools need env vars (check `.env`)
+3. `/reset` after enabling tools
+
+### Model/provider issues
+1. `hermes doctor` — check config and dependencies
+2. `hermes login` — re-authenticate OAuth providers
+3. Check `.env` has the right API key
+4. **Copilot 403**: `gh auth login` tokens do NOT work for Copilot API. You must use the Copilot-specific OAuth device code flow via `hermes model` → GitHub Copilot.
+
+### Changes not taking effect
+- **Tools/skills:** `/reset` starts a new session with updated toolset
+- **Config changes:** In gateway: `/restart`. In CLI: exit and relaunch.
+- **Code changes:** Restart the CLI or gateway process
+
+### Skills not showing
+1. `hermes skills list` — verify installed
+2. `hermes skills config` — check platform enablement
+3. Load explicitly: `/skill name` or `hermes -s name`
+
+### Gateway issues
+Check logs first:
+```bash
+grep -i "failed to send\|error" ~/.hermes/logs/gateway.log | tail -20
+```
+
+Common gateway problems:
+- **Gateway dies on SSH logout**: Enable linger: `sudo loginctl enable-linger $USER`
+- **Gateway dies on WSL2 close**: WSL2 requires `systemd=true` under the `[boot]` section of `/etc/wsl.conf` for systemd services to work. Without it, the gateway falls back to `nohup` (dies when the session closes).
+- **Gateway crash loop**: Reset the failed state: `systemctl --user reset-failed hermes-gateway`
+
+### Platform-specific issues
+- **Discord bot silent**: Must enable **Message Content Intent** in Bot → Privileged Gateway Intents.
+- **Slack bot only works in DMs**: Must subscribe to `message.channels` event. Without it, the bot ignores public channels.
+- **Windows HTTP 400 "No models provided"**: Config file encoding issue (BOM). Ensure `config.yaml` is saved as UTF-8 without BOM.
+
+### Auxiliary models not working
+If `auxiliary` tasks (vision, compression, session_search) fail silently, the `auto` provider can't find a backend. Either set `OPENROUTER_API_KEY` or `GOOGLE_API_KEY`, or explicitly configure each auxiliary task's provider:
+```bash
+hermes config set auxiliary.vision.provider <your_provider>
+hermes config set auxiliary.vision.model <model_name>
+```
+
+---
+
+## Where to Find Things
+
+| Looking for... | Location |
+|----------------|----------|
+| Config options | `hermes config edit` or [Configuration docs](https://hermes-agent.nousresearch.com/docs/user-guide/configuration) |
+| Available tools | `hermes tools list` or [Tools reference](https://hermes-agent.nousresearch.com/docs/reference/tools-reference) |
+| Slash commands | `/help` in session or [Slash commands reference](https://hermes-agent.nousresearch.com/docs/reference/slash-commands) |
+| Skills catalog | `hermes skills browse` or [Skills catalog](https://hermes-agent.nousresearch.com/docs/reference/skills-catalog) |
+| Provider setup | `hermes model` or [Providers guide](https://hermes-agent.nousresearch.com/docs/integrations/providers) |
+| Platform setup | `hermes gateway setup` or [Messaging docs](https://hermes-agent.nousresearch.com/docs/user-guide/messaging/) |
+| MCP servers | `hermes mcp list` or [MCP guide](https://hermes-agent.nousresearch.com/docs/user-guide/features/mcp) |
+| Profiles | `hermes profile list` or [Profiles docs](https://hermes-agent.nousresearch.com/docs/user-guide/profiles) |
+| Cron jobs | `hermes cron list` or [Cron docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/cron) |
+| Memory | `hermes memory status` or [Memory docs](https://hermes-agent.nousresearch.com/docs/user-guide/features/memory) |
+| Env variables | `hermes config env-path` or [Env vars reference](https://hermes-agent.nousresearch.com/docs/reference/environment-variables) |
+| CLI commands | `hermes --help` or [CLI reference](https://hermes-agent.nousresearch.com/docs/reference/cli-commands) |
+| Gateway logs | `~/.hermes/logs/gateway.log` |
+| Session files | `~/.hermes/sessions/` or `hermes sessions browse` |
+| Source code | `~/.hermes/hermes-agent/` |
+
+---
+
+## Contributor Quick Reference
+
+For occasional contributors and PR authors. Full developer docs: https://hermes-agent.nousresearch.com/docs/developer-guide/
+
+### Project Layout
+
+```
+hermes-agent/
+├── run_agent.py          # AIAgent — core conversation loop
+├── model_tools.py        # Tool discovery and dispatch
+├── toolsets.py           # Toolset definitions
+├── cli.py                # Interactive CLI (HermesCLI)
+├── hermes_state.py       # SQLite session store
+├── agent/                # Prompt builder, context compression, memory, model routing, credential pooling, skill dispatch
+├── hermes_cli/           # CLI subcommands, config, setup, commands
+│   ├── commands.py       # Slash command registry (CommandDef)
+│   ├── config.py         # DEFAULT_CONFIG, env var definitions
+│   └── main.py           # CLI entry point and argparse
+├── tools/                # One file per tool
+│   └── registry.py       # Central tool registry
+├── gateway/              # Messaging gateway
+│   └── platforms/        # Platform adapters (telegram, discord, etc.)
+├── cron/                 # Job scheduler
+├── tests/                # ~3000 pytest tests
+└── website/              # Docusaurus docs site
+```
+
+Config: `~/.hermes/config.yaml` (settings), `~/.hermes/.env` (API keys).
+
+### Adding a Tool (2 files)
+
+**1. Create `tools/your_tool.py`:**
+```python
+import json, os
+from tools.registry import registry
+
+def check_requirements() -> bool:
+    return bool(os.getenv("EXAMPLE_API_KEY"))
+
+def example_tool(param: str, task_id: str = None) -> str:
+    return json.dumps({"success": True, "data": "..."})
+
+registry.register(
+    name="example_tool",
+    toolset="example",
+    schema={"name": "example_tool", "description": "...", "parameters": {...}},
+    handler=lambda args, **kw: example_tool(
+        param=args.get("param", ""), task_id=kw.get("task_id")),
+    check_fn=check_requirements,
+    requires_env=["EXAMPLE_API_KEY"],
+)
+```
+
+**2. Add to `toolsets.py`** → `_HERMES_CORE_TOOLS` list.
+
+Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual list needed.
+
+All handlers must return JSON strings. Use `get_hermes_home()` for paths, never hardcode `~/.hermes`.
+
+### Adding a Slash Command
+
+1. Add `CommandDef` to `COMMAND_REGISTRY` in `hermes_cli/commands.py`
+2. Add handler in `cli.py` → `process_command()`
+3. (Optional) Add gateway handler in `gateway/run.py`
+
+All consumers (help text, autocomplete, Telegram menu, Slack mapping) derive from the central registry automatically.
+
+### Agent Loop (High Level)
+
+```
+run_conversation():
+  1. Build system prompt
+  2. Loop while iterations < max:
+     a. Call LLM (OpenAI-format messages + tool schemas)
+     b. If tool_calls → dispatch each via handle_function_call() → append results → continue
+     c. If text response → return
+  3. Context compression triggers automatically near token limit
+```
+
+### Testing
+
+```bash
+python -m pytest tests/ -o 'addopts=' -q   # Full suite
+python -m pytest tests/tools/ -q            # Specific area
+```
+
+- Tests auto-redirect `HERMES_HOME` to temp dirs — never touch real `~/.hermes/`
+- Run full suite before pushing any change
+- Use `-o 'addopts='` to clear any baked-in pytest flags
+
+### Commit Conventions
+
+```
+type: concise subject line
+
+Optional body.
+```
+
+Types: `fix:`, `feat:`, `refactor:`, `docs:`, `chore:`
+
+### Key Rules
+
+- **Never break prompt caching** — don't change context, tools, or system prompt mid-conversation
+- **Message role alternation** — never two assistant or two user messages in a row
+- Use `get_hermes_home()` from `hermes_constants` for all paths (profile-safe)
+- Config values go in `config.yaml`, secrets go in `.env`
+- New tools need a `check_fn` so they only appear when requirements are met
diff --git a/output/systematic-debugging/20260424_222559/baseline_skill.md b/output/systematic-debugging/20260424_222559/baseline_skill.md
new file mode 100644
index 00000000..70a68d58
--- /dev/null
+++ b/output/systematic-debugging/20260424_222559/baseline_skill.md
@@ -0,0 +1,366 @@
+---
+name: systematic-debugging
+description: Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first.
+version: 1.1.0
+author: Hermes Agent (adapted from obra/superpowers)
+license: MIT
+metadata:
+  hermes:
+    tags: [debugging, troubleshooting, problem-solving, root-cause, investigation]
+    related_skills: [test-driven-development, writing-plans, subagent-driven-development]
+---
+
+# Systematic Debugging
+
+## Overview
+
+Random fixes waste time and create new bugs. Quick patches mask underlying issues.
+
+**Core principle:** ALWAYS find root cause before attempting fixes. Symptom fixes are failure.
+
+**Violating the letter of this process is violating the spirit of debugging.**
+
+## The Iron Law
+
+```
+NO FIXES WITHOUT ROOT CAUSE INVESTIGATION FIRST
+```
+
+If you haven't completed Phase 1, you cannot propose fixes.
+
+## When to Use
+
+Use for ANY technical issue:
+- Test failures
+- Bugs in production
+- Unexpected behavior
+- Performance problems
+- Build failures
+- Integration issues
+
+**Use this ESPECIALLY when:**
+- Under time pressure (emergencies make guessing tempting)
+- "Just one quick fix" seems obvious
+- You've already tried multiple fixes
+- Previous fix didn't work
+- You don't fully understand the issue
+
+**Don't skip when:**
+- Issue seems simple (simple bugs have root causes too)
+- You're in a hurry (rushing guarantees rework)
+- Someone wants it fixed NOW (systematic is faster than thrashing)
+
+## The Four Phases
+
+You MUST complete each phase before proceeding to the next.
+
+---
+
+## Phase 1: Root Cause Investigation
+
+**BEFORE attempting ANY fix:**
+
+### 1. Read Error Messages Carefully
+
+- Don't skip past errors or warnings
+- They often contain the exact solution
+- Read stack traces completely
+- Note line numbers, file paths, error codes
+
+**Action:** Use `read_file` on the relevant source files. Use `search_files` to find the error string in the codebase.
+
+### 2. Reproduce Consistently
+
+- Can you trigger it reliably?
+- What are the exact steps?
+- Does it happen every time?
+- If not reproducible → gather more data, don't guess
+
+**Action:** Use the `terminal` tool to run the failing test or trigger the bug:
+
+```bash
+# Run specific failing test
+pytest tests/test_module.py::test_name -v
+
+# Run with verbose output
+pytest tests/test_module.py -v --tb=long
+```
+
+### 3. Check Recent Changes
+
+- What changed that could cause this?
+- Git diff, recent commits
+- New dependencies, config changes
+
+**Action:**
+
+```bash
+# Recent commits
+git log --oneline -10
+
+# Uncommitted changes
+git diff
+
+# Changes in specific file
+git log -p --follow src/problematic_file.py | head -100
+```
+
+### 4. Gather Evidence in Multi-Component Systems
+
+**WHEN system has multiple components (API → service → database, CI → build → deploy):**
+
+**BEFORE proposing fixes, add diagnostic instrumentation:**
+
+For EACH component boundary:
+- Log what data enters the component
+- Log what data exits the component
+- Verify environment/config propagation
+- Check state at each layer
+
+Run once to gather evidence showing WHERE it breaks.
+THEN analyze evidence to identify the failing component.
+THEN investigate that specific component.
+
+### 5. Trace Data Flow
+
+**WHEN error is deep in the call stack:**
+
+- Where does the bad value originate?
+- What called this function with the bad value?
+- Keep tracing upstream until you find the source
+- Fix at the source, not at the symptom
+
+**Action:** Use `search_files` to trace references:
+
+```python
+# Find where the function is called
+search_files("function_name\\(", path="src/", file_glob="*.py")
+
+# Find where the variable is set
+search_files("variable_name\\s*=", path="src/", file_glob="*.py")
+```
+
+### Phase 1 Completion Checklist
+
+- [ ] Error messages fully read and understood
+- [ ] Issue reproduced consistently
+- [ ] Recent changes identified and reviewed
+- [ ] Evidence gathered (logs, state, data flow)
+- [ ] Problem isolated to specific component/code
+- [ ] Root cause hypothesis formed
+
+**STOP:** Do not proceed to Phase 2 until you understand WHY it's happening.
+
+---
+
+## Phase 2: Pattern Analysis
+
+**Find the pattern before fixing:**
+
+### 1. Find Working Examples
+
+- Locate similar working code in the same codebase
+- What works that's similar to what's broken?
+
+**Action:** Use `search_files` to find comparable patterns:
+
+```python
+search_files("similar_pattern", path="src/", file_glob="*.py")
+```
+
+### 2. Compare Against References
+
+- If implementing a pattern, read the reference implementation COMPLETELY
+- Don't skim — read every line
+- Understand the pattern fully before applying
+
+### 3. Identify Differences
+
+- What's different between working and broken?
+- List every difference, however small
+- Don't assume "that can't matter"
+
+### 4. Understand Dependencies
+
+- What other components does this need?
+- What settings, config, environment?
+- What assumptions does it make?
+
+---
+
+## Phase 3: Hypothesis and Testing
+
+**Scientific method:**
+
+### 1. Form a Single Hypothesis
+
+- State clearly: "I think X is the root cause because Y"
+- Write it down
+- Be specific, not vague
+
+### 2. Test Minimally
+
+- Make the SMALLEST possible change to test the hypothesis
+- One variable at a time
+- Don't fix multiple things at once
+
+### 3. Verify Before Continuing
+
+- Did it work? → Phase 4
+- Didn't work? → Form NEW hypothesis
+- DON'T add more fixes on top
+
+### 4. When You Don't Know
+
+- Say "I don't understand X"
+- Don't pretend to know
+- Ask the user for help
+- Research more
+
+---
+
+## Phase 4: Implementation
+
+**Fix the root cause, not the symptom:**
+
+### 1. Create Failing Test Case
+
+- Simplest possible reproduction
+- Automated test if possible
+- MUST have before fixing
+- Use the `test-driven-development` skill
+
+### 2. Implement Single Fix
+
+- Address the root cause identified
+- ONE change at a time
+- No "while I'm here" improvements
+- No bundled refactoring
+
+### 3. Verify Fix
+
+```bash
+# Run the specific regression test
+pytest tests/test_module.py::test_regression -v
+
+# Run full suite — no regressions
+pytest tests/ -q
+```
+
+### 4. If Fix Doesn't Work — The Rule of Three
+
+- **STOP.**
+- Count: How many fixes have you tried?
+- If < 3: Return to Phase 1, re-analyze with new information
+- **If ≥ 3: STOP and question the architecture (step 5 below)**
+- DON'T attempt Fix #4 without architectural discussion
+
+### 5. If 3+ Fixes Failed: Question Architecture
+
+**Pattern indicating an architectural problem:**
+- Each fix reveals new shared state/coupling in a different place
+- Fixes require "massive refactoring" to implement
+- Each fix creates new symptoms elsewhere
+
+**STOP and question fundamentals:**
+- Is this pattern fundamentally sound?
+- Are we "sticking with it through sheer inertia"?
+- Should we refactor the architecture vs. continue fixing symptoms?
+
+**Discuss with the user before attempting more fixes.**
+
+This is NOT a failed hypothesis — this is a wrong architecture.
+
+---
+
+## Red Flags — STOP and Follow Process
+
+If you catch yourself thinking:
+- "Quick fix for now, investigate later"
+- "Just try changing X and see if it works"
+- "Add multiple changes, run tests"
+- "Skip the test, I'll manually verify"
+- "It's probably X, let me fix that"
+- "I don't fully understand but this might work"
+- "Pattern says X but I'll adapt it differently"
+- "Here are the main problems: [lists fixes without investigation]"
+- Proposing solutions before tracing data flow
+- **"One more fix attempt" (when already tried 2+)**
+- **Each fix reveals a new problem in a different place**
+
+**ALL of these mean: STOP. Return to Phase 1.**
+
+**If 3+ fixes failed:** Question the architecture (Phase 4 step 5).
+
+## Common Rationalizations
+
+| Excuse | Reality |
+|--------|---------|
+| "Issue is simple, don't need process" | Simple issues have root causes too. Process is fast for simple bugs. |
+| "Emergency, no time for process" | Systematic debugging is FASTER than guess-and-check thrashing. |
+| "Just try this first, then investigate" | First fix sets the pattern. Do it right from the start. |
+| "I'll write test after confirming fix works" | Untested fixes don't stick. Test first proves it. |
+| "Multiple fixes at once saves time" | Can't isolate what worked. Causes new bugs. |
+| "Reference too long, I'll adapt the pattern" | Partial understanding guarantees bugs. Read it completely. |
+| "I see the problem, let me fix it" | Seeing symptoms ≠ understanding root cause. |
+| "One more fix attempt" (after 2+ failures) | 3+ failures = architectural problem. Question the pattern, don't fix again. |
+
+## Quick Reference
+
+| Phase | Key Activities | Success Criteria |
+|-------|---------------|------------------|
+| **1. Root Cause** | Read errors, reproduce, check changes, gather evidence, trace data flow | Understand WHAT and WHY |
+| **2. Pattern** | Find working examples, compare, identify differences | Know what's different |
+| **3. Hypothesis** | Form theory, test minimally, one variable at a time | Confirmed or new hypothesis |
+| **4. Implementation** | Create regression test, fix root cause, verify | Bug resolved, all tests pass |
+
+## Hermes Agent Integration
+
+### Investigation Tools
+
+Use these Hermes tools during Phase 1:
+
+- **`search_files`** — Find error strings, trace function calls, locate patterns
+- **`read_file`** — Read source code with line numbers for precise analysis
+- **`terminal`** — Run tests, check git history, reproduce bugs
+- **`web_search`/`web_extract`** — Research error messages, library docs
+
+### With delegate_task
+
+For complex multi-component debugging, dispatch investigation subagents:
+
+```python
+delegate_task(
+    goal="Investigate why [specific test/behavior] fails",
+    context="""
+    Follow systematic-debugging skill:
+    1. Read the error message carefully
+    2. Reproduce the issue
+    3. Trace the data flow to find root cause
+    4. Report findings — do NOT fix yet
+
+    Error: [paste full error]
+    File: [path to failing code]
+    Test command: [exact command]
+    """,
+    toolsets=['terminal', 'file']
+)
+```
+
+### With test-driven-development
+
+When fixing bugs:
+1. Write a test that reproduces the bug (RED)
+2. Debug systematically to find root cause
+3. Fix the root cause (GREEN)
+4. The test proves the fix and prevents regression
+
+## Real-World Impact
+
+From debugging sessions:
+- Systematic approach: 15-30 minutes to fix
+- Random fixes approach: 2-3 hours of thrashing
+- First-time fix rate: 95% vs 40%
+- New bugs introduced: Near zero vs common
+
+**No shortcuts. No guessing. Systematic always wins.**
diff --git a/output/systematic-debugging/20260424_222559/evolved_skill.md b/output/systematic-debugging/20260424_222559/evolved_skill.md
new file mode 100644
index 00000000..f8599b2e
--- /dev/null
+++ b/output/systematic-debugging/20260424_222559/evolved_skill.md
@@ -0,0 +1,16 @@
+---
+name: systematic-debugging
+description: Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first.
+version: 1.1.0
+author: Hermes Agent (adapted from obra/superpowers)
+license: MIT
+metadata:
+  hermes:
+    tags: [debugging, troubleshooting, problem-solving, root-cause, investigation]
+    related_skills: [test-driven-development, writing-plans, subagent-driven-development]
+---
+
+Complete a task following the provided skill instructions.
+
+You are an AI agent following specific skill instructions to complete a task.
+Read the skill instructions carefully and follow the procedure described.
diff --git a/output/systematic-debugging/20260424_222559/metrics.json b/output/systematic-debugging/20260424_222559/metrics.json
new file mode 100644
index 00000000..b8d05a66
--- /dev/null
+++ b/output/systematic-debugging/20260424_222559/metrics.json
@@ -0,0 +1,17 @@
+{
+  "skill_name": "systematic-debugging",
+  "timestamp": "20260424_222559",
+  "iterations": 2,
+  "optimizer_model": "openrouter/anthropic/claude-sonnet-4",
+  "eval_model": "openrouter/google/gemini-2.5-flash",
+  "baseline_score": 0.5941389497026612,
+  "evolved_score": 0.5941389497026612,
+  "improvement": 0.0,
+  "baseline_size": 10020,
+  "evolved_size": 211,
+  "train_examples": 10,
+  "val_examples": 5,
+  "holdout_examples": 5,
+  "elapsed_seconds": 135.27510261535645,
+  "constraints_passed": true
+}
\ No newline at end of file
diff --git a/output/systematic-debugging/20260424_231312/baseline_skill.md b/output/systematic-debugging/20260424_231312/baseline_skill.md
new file mode 100644
index 00000000..70a68d58
--- /dev/null
+++ b/output/systematic-debugging/20260424_231312/baseline_skill.md
@@ -0,0 +1,366 @@
+---
+name: systematic-debugging
+description: Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first.
+version: 1.1.0
+author: Hermes Agent (adapted from obra/superpowers)
+license: MIT
+metadata:
+  hermes:
+    tags: [debugging, troubleshooting, problem-solving, root-cause, investigation]
+    related_skills: [test-driven-development, writing-plans, subagent-driven-development]
+---
+
+# Systematic Debugging
+
+## Overview
+
+Random fixes waste time and create new bugs. Quick patches mask underlying issues.
+
+**Core principle:** ALWAYS find root cause before attempting fixes. Symptom fixes are failure.
+
+**Violating the letter of this process is violating the spirit of debugging.**
+
+## The Iron Law
+
+```
+NO FIXES WITHOUT ROOT CAUSE INVESTIGATION FIRST
+```
+
+If you haven't completed Phase 1, you cannot propose fixes.
+
+## When to Use
+
+Use for ANY technical issue:
+- Test failures
+- Bugs in production
+- Unexpected behavior
+- Performance problems
+- Build failures
+- Integration issues
+
+**Use this ESPECIALLY when:**
+- Under time pressure (emergencies make guessing tempting)
+- "Just one quick fix" seems obvious
+- You've already tried multiple fixes
+- Previous fix didn't work
+- You don't fully understand the issue
+
+**Don't skip when:**
+- Issue seems simple (simple bugs have root causes too)
+- You're in a hurry (rushing guarantees rework)
+- Someone wants it fixed NOW (systematic is faster than thrashing)
+
+## The Four Phases
+
+You MUST complete each phase before proceeding to the next.
+
+---
+
+## Phase 1: Root Cause Investigation
+
+**BEFORE attempting ANY fix:**
+
+### 1. Read Error Messages Carefully
+
+- Don't skip past errors or warnings
+- They often contain the exact solution
+- Read stack traces completely
+- Note line numbers, file paths, error codes
+
+**Action:** Use `read_file` on the relevant source files. Use `search_files` to find the error string in the codebase.
+
+### 2. Reproduce Consistently
+
+- Can you trigger it reliably?
+- What are the exact steps?
+- Does it happen every time?
+- If not reproducible → gather more data, don't guess
+
+**Action:** Use the `terminal` tool to run the failing test or trigger the bug:
+
+```bash
+# Run specific failing test
+pytest tests/test_module.py::test_name -v
+
+# Run with verbose output
+pytest tests/test_module.py -v --tb=long
+```
+
+### 3. Check Recent Changes
+
+- What changed that could cause this?
+- Git diff, recent commits
+- New dependencies, config changes
+
+**Action:**
+
+```bash
+# Recent commits
+git log --oneline -10
+
+# Uncommitted changes
+git diff
+
+# Changes in specific file
+git log -p --follow src/problematic_file.py | head -100
+```
+
+### 4. Gather Evidence in Multi-Component Systems
+
+**WHEN system has multiple components (API → service → database, CI → build → deploy):**
+
+**BEFORE proposing fixes, add diagnostic instrumentation:**
+
+For EACH component boundary:
+- Log what data enters the component
+- Log what data exits the component
+- Verify environment/config propagation
+- Check state at each layer
+
+Run once to gather evidence showing WHERE it breaks.
+THEN analyze evidence to identify the failing component.
+THEN investigate that specific component.
+
+### 5. Trace Data Flow
+
+**WHEN error is deep in the call stack:**
+
+- Where does the bad value originate?
+- What called this function with the bad value?
+- Keep tracing upstream until you find the source
+- Fix at the source, not at the symptom
+
+**Action:** Use `search_files` to trace references:
+
+```python
+# Find where the function is called
+search_files("function_name\\(", path="src/", file_glob="*.py")
+
+# Find where the variable is set
+search_files("variable_name\\s*=", path="src/", file_glob="*.py")
+```
+
+### Phase 1 Completion Checklist
+
+- [ ] Error messages fully read and understood
+- [ ] Issue reproduced consistently
+- [ ] Recent changes identified and reviewed
+- [ ] Evidence gathered (logs, state, data flow)
+- [ ] Problem isolated to specific component/code
+- [ ] Root cause hypothesis formed
+
+**STOP:** Do not proceed to Phase 2 until you understand WHY it's happening.
+
+---
+
+## Phase 2: Pattern Analysis
+
+**Find the pattern before fixing:**
+
+### 1. Find Working Examples
+
+- Locate similar working code in the same codebase
+- What works that's similar to what's broken?
+
+**Action:** Use `search_files` to find comparable patterns:
+
+```python
+search_files("similar_pattern", path="src/", file_glob="*.py")
+```
+
+### 2. Compare Against References
+
+- If implementing a pattern, read the reference implementation COMPLETELY
+- Don't skim — read every line
+- Understand the pattern fully before applying
+
+### 3. Identify Differences
+
+- What's different between working and broken?
+- List every difference, however small
+- Don't assume "that can't matter"
+
+### 4. Understand Dependencies
+
+- What other components does this need?
+- What settings, config, environment?
+- What assumptions does it make?
+
+---
+
+## Phase 3: Hypothesis and Testing
+
+**Scientific method:**
+
+### 1. Form a Single Hypothesis
+
+- State clearly: "I think X is the root cause because Y"
+- Write it down
+- Be specific, not vague
+
+### 2. Test Minimally
+
+- Make the SMALLEST possible change to test the hypothesis
+- One variable at a time
+- Don't fix multiple things at once
+
+### 3. Verify Before Continuing
+
+- Did it work? → Phase 4
+- Didn't work? → Form NEW hypothesis
+- DON'T add more fixes on top
+
+### 4. When You Don't Know
+
+- Say "I don't understand X"
+- Don't pretend to know
+- Ask the user for help
+- Research more
+
+---
+
+## Phase 4: Implementation
+
+**Fix the root cause, not the symptom:**
+
+### 1. Create Failing Test Case
+
+- Simplest possible reproduction
+- Automated test if possible
+- MUST have before fixing
+- Use the `test-driven-development` skill
+
+### 2. Implement Single Fix
+
+- Address the root cause identified
+- ONE change at a time
+- No "while I'm here" improvements
+- No bundled refactoring
+
+### 3. Verify Fix
+
+```bash
+# Run the specific regression test
+pytest tests/test_module.py::test_regression -v
+
+# Run full suite — no regressions
+pytest tests/ -q
+```
+
+### 4. If Fix Doesn't Work — The Rule of Three
+
+- **STOP.**
+- Count: How many fixes have you tried?
+- If < 3: Return to Phase 1, re-analyze with new information
+- **If ≥ 3: STOP and question the architecture (step 5 below)**
+- DON'T attempt Fix #4 without architectural discussion
+
+### 5. If 3+ Fixes Failed: Question Architecture
+
+**Pattern indicating an architectural problem:**
+- Each fix reveals new shared state/coupling in a different place
+- Fixes require "massive refactoring" to implement
+- Each fix creates new symptoms elsewhere
+
+**STOP and question fundamentals:**
+- Is this pattern fundamentally sound?
+- Are we "sticking with it through sheer inertia"?
+- Should we refactor the architecture vs. continue fixing symptoms?
+
+**Discuss with the user before attempting more fixes.**
+
+This is NOT a failed hypothesis — this is a wrong architecture.
+
+---
+
+## Red Flags — STOP and Follow Process
+
+If you catch yourself thinking:
+- "Quick fix for now, investigate later"
+- "Just try changing X and see if it works"
+- "Add multiple changes, run tests"
+- "Skip the test, I'll manually verify"
+- "It's probably X, let me fix that"
+- "I don't fully understand but this might work"
+- "Pattern says X but I'll adapt it differently"
+- "Here are the main problems: [lists fixes without investigation]"
+- Proposing solutions before tracing data flow
+- **"One more fix attempt" (when already tried 2+)**
+- **Each fix reveals a new problem in a different place**
+
+**ALL of these mean: STOP. Return to Phase 1.**
+
+**If 3+ fixes failed:** Question the architecture (Phase 4 step 5).
+
+## Common Rationalizations
+
+| Excuse | Reality |
+|--------|---------|
+| "Issue is simple, don't need process" | Simple issues have root causes too. Process is fast for simple bugs. |
+| "Emergency, no time for process" | Systematic debugging is FASTER than guess-and-check thrashing. |
+| "Just try this first, then investigate" | First fix sets the pattern. Do it right from the start. |
+| "I'll write test after confirming fix works" | Untested fixes don't stick. Test first proves it. |
+| "Multiple fixes at once saves time" | Can't isolate what worked. Causes new bugs. |
+| "Reference too long, I'll adapt the pattern" | Partial understanding guarantees bugs. Read it completely. |
+| "I see the problem, let me fix it" | Seeing symptoms ≠ understanding root cause. |
+| "One more fix attempt" (after 2+ failures) | 3+ failures = architectural problem. Question the pattern, don't fix again. |
+
+## Quick Reference
+
+| Phase | Key Activities | Success Criteria |
+|-------|---------------|------------------|
+| **1. Root Cause** | Read errors, reproduce, check changes, gather evidence, trace data flow | Understand WHAT and WHY |
+| **2. Pattern** | Find working examples, compare, identify differences | Know what's different |
+| **3. Hypothesis** | Form theory, test minimally, one variable at a time | Confirmed or new hypothesis |
+| **4. Implementation** | Create regression test, fix root cause, verify | Bug resolved, all tests pass |
+
+## Hermes Agent Integration
+
+### Investigation Tools
+
+Use these Hermes tools during Phase 1:
+
+- **`search_files`** — Find error strings, trace function calls, locate patterns
+- **`read_file`** — Read source code with line numbers for precise analysis
+- **`terminal`** — Run tests, check git history, reproduce bugs
+- **`web_search`/`web_extract`** — Research error messages, library docs
+
+### With delegate_task
+
+For complex multi-component debugging, dispatch investigation subagents:
+
+```python
+delegate_task(
+    goal="Investigate why [specific test/behavior] fails",
+    context="""
+    Follow systematic-debugging skill:
+    1. Read the error message carefully
+    2. Reproduce the issue
+    3. Trace the data flow to find root cause
+    4. Report findings — do NOT fix yet
+
+    Error: [paste full error]
+    File: [path to failing code]
+    Test command: [exact command]
+    """,
+    toolsets=['terminal', 'file']
+)
+```
+
+### With test-driven-development
+
+When fixing bugs:
+1. Write a test that reproduces the bug (RED)
+2. Debug systematically to find root cause
+3. Fix the root cause (GREEN)
+4. The test proves the fix and prevents regression
+
+## Real-World Impact
+
+From debugging sessions:
+- Systematic approach: 15-30 minutes to fix
+- Random fixes approach: 2-3 hours of thrashing
+- First-time fix rate: 95% vs 40%
+- New bugs introduced: Near zero vs common
+
+**No shortcuts. No guessing. Systematic always wins.**
diff --git a/output/systematic-debugging/20260424_231312/evolved_skill.md b/output/systematic-debugging/20260424_231312/evolved_skill.md
new file mode 100644
index 00000000..70a68d58
--- /dev/null
+++ b/output/systematic-debugging/20260424_231312/evolved_skill.md
@@ -0,0 +1,366 @@
+---
+name: systematic-debugging
+description: Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first.
+version: 1.1.0
+author: Hermes Agent (adapted from obra/superpowers)
+license: MIT
+metadata:
+  hermes:
+    tags: [debugging, troubleshooting, problem-solving, root-cause, investigation]
+    related_skills: [test-driven-development, writing-plans, subagent-driven-development]
+---
+
+# Systematic Debugging
+
+## Overview
+
+Random fixes waste time and create new bugs. Quick patches mask underlying issues.
+
+**Core principle:** ALWAYS find root cause before attempting fixes. Symptom fixes are failure.
+
+**Violating the letter of this process is violating the spirit of debugging.**
+
+## The Iron Law
+
+```
+NO FIXES WITHOUT ROOT CAUSE INVESTIGATION FIRST
+```
+
+If you haven't completed Phase 1, you cannot propose fixes.
+
+## When to Use
+
+Use for ANY technical issue:
+- Test failures
+- Bugs in production
+- Unexpected behavior
+- Performance problems
+- Build failures
+- Integration issues
+
+**Use this ESPECIALLY when:**
+- Under time pressure (emergencies make guessing tempting)
+- "Just one quick fix" seems obvious
+- You've already tried multiple fixes
+- Previous fix didn't work
+- You don't fully understand the issue
+
+**Don't skip when:**
+- Issue seems simple (simple bugs have root causes too)
+- You're in a hurry (rushing guarantees rework)
+- Someone wants it fixed NOW (systematic is faster than thrashing)
+
+## The Four Phases
+
+You MUST complete each phase before proceeding to the next.
+
+---
+
+## Phase 1: Root Cause Investigation
+
+**BEFORE attempting ANY fix:**
+
+### 1. Read Error Messages Carefully
+
+- Don't skip past errors or warnings
+- They often contain the exact solution
+- Read stack traces completely
+- Note line numbers, file paths, error codes
+
+**Action:** Use `read_file` on the relevant source files. Use `search_files` to find the error string in the codebase.
+
+### 2. Reproduce Consistently
+
+- Can you trigger it reliably?
+- What are the exact steps?
+- Does it happen every time?
+- If not reproducible → gather more data, don't guess
+
+**Action:** Use the `terminal` tool to run the failing test or trigger the bug:
+
+```bash
+# Run specific failing test
+pytest tests/test_module.py::test_name -v
+
+# Run with verbose output
+pytest tests/test_module.py -v --tb=long
+```
+
+### 3. Check Recent Changes
+
+- What changed that could cause this?
+- Git diff, recent commits
+- New dependencies, config changes
+
+**Action:**
+
+```bash
+# Recent commits
+git log --oneline -10
+
+# Uncommitted changes
+git diff
+
+# Changes in specific file
+git log -p --follow src/problematic_file.py | head -100
+```
+
+### 4. Gather Evidence in Multi-Component Systems
+
+**WHEN system has multiple components (API → service → database, CI → build → deploy):**
+
+**BEFORE proposing fixes, add diagnostic instrumentation:**
+
+For EACH component boundary:
+- Log what data enters the component
+- Log what data exits the component
+- Verify environment/config propagation
+- Check state at each layer
+
+Run once to gather evidence showing WHERE it breaks.
+THEN analyze evidence to identify the failing component.
+THEN investigate that specific component.
+
+### 5. Trace Data Flow
+
+**WHEN error is deep in the call stack:**
+
+- Where does the bad value originate?
+- What called this function with the bad value?
+- Keep tracing upstream until you find the source
+- Fix at the source, not at the symptom
+
+**Action:** Use `search_files` to trace references:
+
+```python
+# Find where the function is called
+search_files("function_name\\(", path="src/", file_glob="*.py")
+
+# Find where the variable is set
+search_files("variable_name\\s*=", path="src/", file_glob="*.py")
+```
+
+### Phase 1 Completion Checklist
+
+- [ ] Error messages fully read and understood
+- [ ] Issue reproduced consistently
+- [ ] Recent changes identified and reviewed
+- [ ] Evidence gathered (logs, state, data flow)
+- [ ] Problem isolated to specific component/code
+- [ ] Root cause hypothesis formed
+
+**STOP:** Do not proceed to Phase 2 until you understand WHY it's happening.
+
+---
+
+## Phase 2: Pattern Analysis
+
+**Find the pattern before fixing:**
+
+### 1. Find Working Examples
+
+- Locate similar working code in the same codebase
+- What works that's similar to what's broken?
+
+**Action:** Use `search_files` to find comparable patterns:
+
+```python
+search_files("similar_pattern", path="src/", file_glob="*.py")
+```
+
+### 2. Compare Against References
+
+- If implementing a pattern, read the reference implementation COMPLETELY
+- Don't skim — read every line
+- Understand the pattern fully before applying
+
+### 3. Identify Differences
+
+- What's different between working and broken?
+- List every difference, however small
+- Don't assume "that can't matter"
+
+### 4. Understand Dependencies
+
+- What other components does this need?
+- What settings, config, environment?
+- What assumptions does it make?
+
+---
+
+## Phase 3: Hypothesis and Testing
+
+**Scientific method:**
+
+### 1. Form a Single Hypothesis
+
+- State clearly: "I think X is the root cause because Y"
+- Write it down
+- Be specific, not vague
+
+### 2. Test Minimally
+
+- Make the SMALLEST possible change to test the hypothesis
+- One variable at a time
+- Don't fix multiple things at once
+
+### 3. Verify Before Continuing
+
+- Did it work? → Phase 4
+- Didn't work? → Form NEW hypothesis
+- DON'T add more fixes on top
+
+### 4. When You Don't Know
+
+- Say "I don't understand X"
+- Don't pretend to know
+- Ask the user for help
+- Research more
+
+---
+
+## Phase 4: Implementation
+
+**Fix the root cause, not the symptom:**
+
+### 1. Create Failing Test Case
+
+- Simplest possible reproduction
+- Automated test if possible
+- MUST have before fixing
+- Use the `test-driven-development` skill
+
+### 2. Implement Single Fix
+
+- Address the root cause identified
+- ONE change at a time
+- No "while I'm here" improvements
+- No bundled refactoring
+
+### 3. Verify Fix
+
+```bash
+# Run the specific regression test
+pytest tests/test_module.py::test_regression -v
+
+# Run full suite — no regressions
+pytest tests/ -q
+```
+
+### 4. If Fix Doesn't Work — The Rule of Three
+
+- **STOP.**
+- Count: How many fixes have you tried?
+- If < 3: Return to Phase 1, re-analyze with new information
+- **If ≥ 3: STOP and question the architecture (step 5 below)**
+- DON'T attempt Fix #4 without architectural discussion
+
+### 5. If 3+ Fixes Failed: Question Architecture
+
+**Pattern indicating an architectural problem:**
+- Each fix reveals new shared state/coupling in a different place
+- Fixes require "massive refactoring" to implement
+- Each fix creates new symptoms elsewhere
+
+**STOP and question fundamentals:**
+- Is this pattern fundamentally sound?
+- Are we "sticking with it through sheer inertia"?
+- Should we refactor the architecture vs. continue fixing symptoms?
+
+**Discuss with the user before attempting more fixes.**
+
+This is NOT a failed hypothesis — this is a wrong architecture.
+
+---
+
+## Red Flags — STOP and Follow Process
+
+If you catch yourself thinking:
+- "Quick fix for now, investigate later"
+- "Just try changing X and see if it works"
+- "Add multiple changes, run tests"
+- "Skip the test, I'll manually verify"
+- "It's probably X, let me fix that"
+- "I don't fully understand but this might work"
+- "Pattern says X but I'll adapt it differently"
+- "Here are the main problems: [lists fixes without investigation]"
+- Proposing solutions before tracing data flow
+- **"One more fix attempt" (when already tried 2+)**
+- **Each fix reveals a new problem in a different place**
+
+**ALL of these mean: STOP. Return to Phase 1.**
+
+**If 3+ fixes failed:** Question the architecture (Phase 4 step 5).
+
+## Common Rationalizations
+
+| Excuse | Reality |
+|--------|---------|
+| "Issue is simple, don't need process" | Simple issues have root causes too. Process is fast for simple bugs. |
+| "Emergency, no time for process" | Systematic debugging is FASTER than guess-and-check thrashing. |
+| "Just try this first, then investigate" | First fix sets the pattern. Do it right from the start. |
+| "I'll write test after confirming fix works" | Untested fixes don't stick. Test first proves it. |
+| "Multiple fixes at once saves time" | Can't isolate what worked. Causes new bugs. |
+| "Reference too long, I'll adapt the pattern" | Partial understanding guarantees bugs. Read it completely. |
+| "I see the problem, let me fix it" | Seeing symptoms ≠ understanding root cause. |
+| "One more fix attempt" (after 2+ failures) | 3+ failures = architectural problem. Question the pattern, don't fix again. |
+
+## Quick Reference
+
+| Phase | Key Activities | Success Criteria |
+|-------|---------------|------------------|
+| **1. Root Cause** | Read errors, reproduce, check changes, gather evidence, trace data flow | Understand WHAT and WHY |
+| **2. Pattern** | Find working examples, compare, identify differences | Know what's different |
+| **3. Hypothesis** | Form theory, test minimally, one variable at a time | Confirmed or new hypothesis |
+| **4. Implementation** | Create regression test, fix root cause, verify | Bug resolved, all tests pass |
+
+## Hermes Agent Integration
+
+### Investigation Tools
+
+Use these Hermes tools during Phase 1:
+
+- **`search_files`** — Find error strings, trace function calls, locate patterns
+- **`read_file`** — Read source code with line numbers for precise analysis
+- **`terminal`** — Run tests, check git history, reproduce bugs
+- **`web_search`/`web_extract`** — Research error messages, library docs
+
+### With delegate_task
+
+For complex multi-component debugging, dispatch investigation subagents:
+
+```python
+delegate_task(
+    goal="Investigate why [specific test/behavior] fails",
+    context="""
+    Follow systematic-debugging skill:
+    1. Read the error message carefully
+    2. Reproduce the issue
+    3. Trace the data flow to find root cause
+    4. Report findings — do NOT fix yet
+
+    Error: [paste full error]
+    File: [path to failing code]
+    Test command: [exact command]
+    """,
+    toolsets=['terminal', 'file']
+)
+```
+
+### With test-driven-development
+
+When fixing bugs:
+1. Write a test that reproduces the bug (RED)
+2. Debug systematically to find root cause
+3. Fix the root cause (GREEN)
+4. The test proves the fix and prevents regression
+
+## Real-World Impact
+
+From debugging sessions:
+- Systematic approach: 15-30 minutes to fix
+- Random fixes approach: 2-3 hours of thrashing
+- First-time fix rate: 95% vs 40%
+- New bugs introduced: Near zero vs common
+
+**No shortcuts. No guessing. Systematic always wins.**
diff --git a/output/systematic-debugging/20260424_231312/metrics.json b/output/systematic-debugging/20260424_231312/metrics.json
new file mode 100644
index 00000000..3194dbb1
--- /dev/null
+++ b/output/systematic-debugging/20260424_231312/metrics.json
@@ -0,0 +1,17 @@
+{
+  "skill_name": "systematic-debugging",
+  "timestamp": "20260424_231312",
+  "iterations": 3,
+  "optimizer_model": "openrouter/anthropic/claude-sonnet-4",
+  "eval_model": "openrouter/google/gemini-2.5-flash",
+  "baseline_score": 0.519781361749106,
+  "evolved_score": 0.519781361749106,
+  "improvement": 0.0,
+  "baseline_size": 10020,
+  "evolved_size": 10020,
+  "train_examples": 10,
+  "val_examples": 5,
+  "holdout_examples": 5,
+  "elapsed_seconds": 112.58387398719788,
+  "constraints_passed": true
+}
\ No newline at end of file
diff --git a/output/systematic-debugging/20260424_232123/baseline_skill.md b/output/systematic-debugging/20260424_232123/baseline_skill.md
new file mode 100644
index 00000000..70a68d58
--- /dev/null
+++ b/output/systematic-debugging/20260424_232123/baseline_skill.md
@@ -0,0 +1,366 @@
+---
+name: systematic-debugging
+description: Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first.
+version: 1.1.0
+author: Hermes Agent (adapted from obra/superpowers)
+license: MIT
+metadata:
+  hermes:
+    tags: [debugging, troubleshooting, problem-solving, root-cause, investigation]
+    related_skills: [test-driven-development, writing-plans, subagent-driven-development]
+---
+
+# Systematic Debugging
+
+## Overview
+
+Random fixes waste time and create new bugs. Quick patches mask underlying issues.
+
+**Core principle:** ALWAYS find root cause before attempting fixes. Symptom fixes are failure.
+
+**Violating the letter of this process is violating the spirit of debugging.**
+
+## The Iron Law
+
+```
+NO FIXES WITHOUT ROOT CAUSE INVESTIGATION FIRST
+```
+
+If you haven't completed Phase 1, you cannot propose fixes.
+
+## When to Use
+
+Use for ANY technical issue:
+- Test failures
+- Bugs in production
+- Unexpected behavior
+- Performance problems
+- Build failures
+- Integration issues
+
+**Use this ESPECIALLY when:**
+- Under time pressure (emergencies make guessing tempting)
+- "Just one quick fix" seems obvious
+- You've already tried multiple fixes
+- Previous fix didn't work
+- You don't fully understand the issue
+
+**Don't skip when:**
+- Issue seems simple (simple bugs have root causes too)
+- You're in a hurry (rushing guarantees rework)
+- Someone wants it fixed NOW (systematic is faster than thrashing)
+
+## The Four Phases
+
+You MUST complete each phase before proceeding to the next.
+
+---
+
+## Phase 1: Root Cause Investigation
+
+**BEFORE attempting ANY fix:**
+
+### 1. Read Error Messages Carefully
+
+- Don't skip past errors or warnings
+- They often contain the exact solution
+- Read stack traces completely
+- Note line numbers, file paths, error codes
+
+**Action:** Use `read_file` on the relevant source files. Use `search_files` to find the error string in the codebase.
+
+### 2. Reproduce Consistently
+
+- Can you trigger it reliably?
+- What are the exact steps?
+- Does it happen every time?
+- If not reproducible → gather more data, don't guess
+
+**Action:** Use the `terminal` tool to run the failing test or trigger the bug:
+
+```bash
+# Run specific failing test
+pytest tests/test_module.py::test_name -v
+
+# Run with verbose output
+pytest tests/test_module.py -v --tb=long
+```
+
+### 3. Check Recent Changes
+
+- What changed that could cause this?
+- Git diff, recent commits
+- New dependencies, config changes
+
+**Action:**
+
+```bash
+# Recent commits
+git log --oneline -10
+
+# Uncommitted changes
+git diff
+
+# Changes in specific file
+git log -p --follow src/problematic_file.py | head -100
+```
+
+### 4. Gather Evidence in Multi-Component Systems
+
+**WHEN system has multiple components (API → service → database, CI → build → deploy):**
+
+**BEFORE proposing fixes, add diagnostic instrumentation:**
+
+For EACH component boundary:
+- Log what data enters the component
+- Log what data exits the component
+- Verify environment/config propagation
+- Check state at each layer
+
+Run once to gather evidence showing WHERE it breaks.
+THEN analyze evidence to identify the failing component.
+THEN investigate that specific component.
+
+### 5. Trace Data Flow
+
+**WHEN error is deep in the call stack:**
+
+- Where does the bad value originate?
+- What called this function with the bad value?
+- Keep tracing upstream until you find the source
+- Fix at the source, not at the symptom
+
+**Action:** Use `search_files` to trace references:
+
+```python
+# Find where the function is called
+search_files("function_name(", path="src/", file_glob="*.py")
+
+# Find where the variable is set
+search_files("variable_name\\s*=", path="src/", file_glob="*.py")
+```
+
+### Phase 1 Completion Checklist
+
+- [ ] Error messages fully read and understood
+- [ ] Issue reproduced consistently
+- [ ] Recent changes identified and reviewed
+- [ ] Evidence gathered (logs, state, data flow)
+- [ ] Problem isolated to specific component/code
+- [ ] Root cause hypothesis formed
+
+**STOP:** Do not proceed to Phase 2 until you understand WHY it's happening.
+
+---
+
+## Phase 2: Pattern Analysis
+
+**Find the pattern before fixing:**
+
+### 1. Find Working Examples
+
+- Locate similar working code in the same codebase
+- What works that's similar to what's broken?
+
+**Action:** Use `search_files` to find comparable patterns:
+
+```python
+search_files("similar_pattern", path="src/", file_glob="*.py")
+```
+
+### 2. Compare Against References
+
+- If implementing a pattern, read the reference implementation COMPLETELY
+- Don't skim — read every line
+- Understand the pattern fully before applying
+
+### 3. Identify Differences
+
+- What's different between working and broken?
+- List every difference, however small
+- Don't assume "that can't matter"
+
+### 4. Understand Dependencies
+
+- What other components does this need?
+- What settings, config, environment?
+- What assumptions does it make?
+
+---
+
+## Phase 3: Hypothesis and Testing
+
+**Scientific method:**
+
+### 1. Form a Single Hypothesis
+
+- State clearly: "I think X is the root cause because Y"
+- Write it down
+- Be specific, not vague
+
+### 2. Test Minimally
+
+- Make the SMALLEST possible change to test the hypothesis
+- One variable at a time
+- Don't fix multiple things at once
+
+### 3. Verify Before Continuing
+
+- Did it work? → Phase 4
+- Didn't work? → Form NEW hypothesis
+- DON'T add more fixes on top
+
+### 4. When You Don't Know
+
+- Say "I don't understand X"
+- Don't pretend to know
+- Ask the user for help
+- Research more
+
+---
+
+## Phase 4: Implementation
+
+**Fix the root cause, not the symptom:**
+
+### 1. Create Failing Test Case
+
+- Simplest possible reproduction
+- Automated test if possible
+- MUST have before fixing
+- Use the `test-driven-development` skill
+
+### 2. Implement Single Fix
+
+- Address the root cause identified
+- ONE change at a time
+- No "while I'm here" improvements
+- No bundled refactoring
+
+### 3. Verify Fix
+
+```bash
+# Run the specific regression test
+pytest tests/test_module.py::test_regression -v
+
+# Run full suite — no regressions
+pytest tests/ -q
+```
+
+### 4. If Fix Doesn't Work — The Rule of Three
+
+- **STOP.**
+- Count: How many fixes have you tried?
+- If < 3: Return to Phase 1, re-analyze with new information
+- **If ≥ 3: STOP and question the architecture (step 5 below)**
+- DON'T attempt Fix #4 without architectural discussion
+
+### 5. If 3+ Fixes Failed: Question Architecture
+
+**Pattern indicating an architectural problem:**
+- Each fix reveals new shared state/coupling in a different place
+- Fixes require "massive refactoring" to implement
+- Each fix creates new symptoms elsewhere
+
+**STOP and question fundamentals:**
+- Is this pattern fundamentally sound?
+- Are we "sticking with it through sheer inertia"?
+- Should we refactor the architecture vs. continue fixing symptoms?
+
+**Discuss with the user before attempting more fixes.**
+
+This is NOT a failed hypothesis — this is a wrong architecture.
+
+---
+
+## Red Flags — STOP and Follow Process
+
+If you catch yourself thinking:
+- "Quick fix for now, investigate later"
+- "Just try changing X and see if it works"
+- "Add multiple changes, run tests"
+- "Skip the test, I'll manually verify"
+- "It's probably X, let me fix that"
+- "I don't fully understand but this might work"
+- "Pattern says X but I'll adapt it differently"
+- "Here are the main problems: [lists fixes without investigation]"
+- Proposing solutions before tracing data flow
+- **"One more fix attempt" (when already tried 2+)**
+- **Each fix reveals a new problem in a different place**
+
+**ALL of these mean: STOP. Return to Phase 1.**
+
+**If 3+ fixes failed:** Question the architecture (Phase 4 step 5).
+
+## Common Rationalizations
+
+| Excuse | Reality |
+|--------|---------|
+| "Issue is simple, don't need process" | Simple issues have root causes too. Process is fast for simple bugs. |
+| "Emergency, no time for process" | Systematic debugging is FASTER than guess-and-check thrashing. |
+| "Just try this first, then investigate" | First fix sets the pattern. Do it right from the start. |
+| "I'll write test after confirming fix works" | Untested fixes don't stick. Writing the test first proves the fix works. |
+| "Multiple fixes at once saves time" | Can't isolate what worked. Causes new bugs. |
+| "Reference too long, I'll adapt the pattern" | Partial understanding guarantees bugs. Read it completely. |
+| "I see the problem, let me fix it" | Seeing symptoms ≠ understanding root cause. |
+| "One more fix attempt" (after 2+ failures) | 3+ failures = architectural problem. Question the pattern, don't fix again. |
+
+## Quick Reference
+
+| Phase | Key Activities | Success Criteria |
+|-------|---------------|------------------|
+| **1. Root Cause** | Read errors, reproduce, check changes, gather evidence, trace data flow | Understand WHAT and WHY |
+| **2. Pattern** | Find working examples, compare, identify differences | Know what's different |
+| **3. Hypothesis** | Form theory, test minimally, one variable at a time | Confirmed or new hypothesis |
+| **4. Implementation** | Create regression test, fix root cause, verify | Bug resolved, all tests pass |
+
+## Hermes Agent Integration
+
+### Investigation Tools
+
+Use these Hermes tools during Phase 1:
+
+- **`search_files`** — Find error strings, trace function calls, locate patterns
+- **`read_file`** — Read source code with line numbers for precise analysis
+- **`terminal`** — Run tests, check git history, reproduce bugs
+- **`web_search`/`web_extract`** — Research error messages, library docs
+
+### With delegate_task
+
+For complex multi-component debugging, dispatch investigation subagents:
+
+```python
+delegate_task(
+    goal="Investigate why [specific test/behavior] fails",
+    context="""
+    Follow systematic-debugging skill:
+    1. Read the error message carefully
+    2. Reproduce the issue
+    3. Trace the data flow to find root cause
+    4. Report findings — do NOT fix yet
+
+    Error: [paste full error]
+    File: [path to failing code]
+    Test command: [exact command]
+    """,
+    toolsets=['terminal', 'file']
+)
+```
+
+### With test-driven-development
+
+When fixing bugs:
+1. Write a test that reproduces the bug (RED)
+2. Debug systematically to find root cause
+3. Fix the root cause (GREEN)
+4. The test proves the fix and prevents regression
+
+## Real-World Impact
+
+From debugging sessions:
+- Systematic approach: 15-30 minutes to fix
+- Random fixes approach: 2-3 hours of thrashing
+- First-time fix rate: 95% (systematic) vs 40% (random fixes)
+- New bugs introduced: Near zero (systematic) vs common (random fixes)
+
+**No shortcuts. No guessing. Systematic always wins.**
diff --git a/output/systematic-debugging/20260424_232123/evolved_skill.md b/output/systematic-debugging/20260424_232123/evolved_skill.md
new file mode 100644
index 00000000..70a68d58
--- /dev/null
+++ b/output/systematic-debugging/20260424_232123/evolved_skill.md
@@ -0,0 +1,366 @@
+---
+name: systematic-debugging
+description: Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first.
+version: 1.1.0
+author: Hermes Agent (adapted from obra/superpowers)
+license: MIT
+metadata:
+  hermes:
+    tags: [debugging, troubleshooting, problem-solving, root-cause, investigation]
+    related_skills: [test-driven-development, writing-plans, subagent-driven-development]
+---
+
+# Systematic Debugging
+
+## Overview
+
+Random fixes waste time and create new bugs. Quick patches mask underlying issues.
+
+**Core principle:** ALWAYS find root cause before attempting fixes. Symptom fixes are failure.
+
+**Violating the letter of this process is violating the spirit of debugging.**
+
+## The Iron Law
+
+```
+NO FIXES WITHOUT ROOT CAUSE INVESTIGATION FIRST
+```
+
+If you haven't completed Phase 1, you cannot propose fixes.
+
+## When to Use
+
+Use for ANY technical issue:
+- Test failures
+- Bugs in production
+- Unexpected behavior
+- Performance problems
+- Build failures
+- Integration issues
+
+**Use this ESPECIALLY when:**
+- Under time pressure (emergencies make guessing tempting)
+- "Just one quick fix" seems obvious
+- You've already tried multiple fixes
+- Previous fix didn't work
+- You don't fully understand the issue
+
+**Don't skip when:**
+- Issue seems simple (simple bugs have root causes too)
+- You're in a hurry (rushing guarantees rework)
+- Someone wants it fixed NOW (systematic is faster than thrashing)
+
+## The Four Phases
+
+You MUST complete each phase before proceeding to the next.
+
+---
+
+## Phase 1: Root Cause Investigation
+
+**BEFORE attempting ANY fix:**
+
+### 1. Read Error Messages Carefully
+
+- Don't skip past errors or warnings
+- They often contain the exact solution
+- Read stack traces completely
+- Note line numbers, file paths, error codes
+
+**Action:** Use `read_file` on the relevant source files. Use `search_files` to find the error string in the codebase.
+
+### 2. Reproduce Consistently
+
+- Can you trigger it reliably?
+- What are the exact steps?
+- Does it happen every time?
+- If not reproducible → gather more data, don't guess
+
+**Action:** Use the `terminal` tool to run the failing test or trigger the bug:
+
+```bash
+# Run specific failing test
+pytest tests/test_module.py::test_name -v
+
+# Run with verbose output
+pytest tests/test_module.py -v --tb=long
+```
+
+### 3. Check Recent Changes
+
+- What changed that could cause this?
+- Git diff, recent commits
+- New dependencies, config changes
+
+**Action:**
+
+```bash
+# Recent commits
+git log --oneline -10
+
+# Uncommitted changes
+git diff
+
+# Changes in specific file
+git log -p --follow src/problematic_file.py | head -100
+```
+
+### 4. Gather Evidence in Multi-Component Systems
+
+**WHEN system has multiple components (API → service → database, CI → build → deploy):**
+
+**BEFORE proposing fixes, add diagnostic instrumentation:**
+
+For EACH component boundary:
+- Log what data enters the component
+- Log what data exits the component
+- Verify environment/config propagation
+- Check state at each layer
+
+Run once to gather evidence showing WHERE it breaks.
+THEN analyze evidence to identify the failing component.
+THEN investigate that specific component.
+
+### 5. Trace Data Flow
+
+**WHEN error is deep in the call stack:**
+
+- Where does the bad value originate?
+- What called this function with the bad value?
+- Keep tracing upstream until you find the source
+- Fix at the source, not at the symptom
+
+**Action:** Use `search_files` to trace references:
+
+```python
+# Find where the function is called
+search_files("function_name(", path="src/", file_glob="*.py")
+
+# Find where the variable is set
+search_files("variable_name\\s*=", path="src/", file_glob="*.py")
+```
+
+### Phase 1 Completion Checklist
+
+- [ ] Error messages fully read and understood
+- [ ] Issue reproduced consistently
+- [ ] Recent changes identified and reviewed
+- [ ] Evidence gathered (logs, state, data flow)
+- [ ] Problem isolated to specific component/code
+- [ ] Root cause hypothesis formed
+
+**STOP:** Do not proceed to Phase 2 until you understand WHY it's happening.
+
+---
+
+## Phase 2: Pattern Analysis
+
+**Find the pattern before fixing:**
+
+### 1. Find Working Examples
+
+- Locate similar working code in the same codebase
+- What works that's similar to what's broken?
+
+**Action:** Use `search_files` to find comparable patterns:
+
+```python
+search_files("similar_pattern", path="src/", file_glob="*.py")
+```
+
+### 2. Compare Against References
+
+- If implementing a pattern, read the reference implementation COMPLETELY
+- Don't skim — read every line
+- Understand the pattern fully before applying
+
+### 3. Identify Differences
+
+- What's different between working and broken?
+- List every difference, however small
+- Don't assume "that can't matter"
+
+### 4. Understand Dependencies
+
+- What other components does this need?
+- What settings, config, environment?
+- What assumptions does it make?
+
+---
+
+## Phase 3: Hypothesis and Testing
+
+**Scientific method:**
+
+### 1. Form a Single Hypothesis
+
+- State clearly: "I think X is the root cause because Y"
+- Write it down
+- Be specific, not vague
+
+### 2. Test Minimally
+
+- Make the SMALLEST possible change to test the hypothesis
+- One variable at a time
+- Don't fix multiple things at once
+
+### 3. Verify Before Continuing
+
+- Did it work? → Phase 4
+- Didn't work? → Form NEW hypothesis
+- DON'T add more fixes on top
+
+### 4. When You Don't Know
+
+- Say "I don't understand X"
+- Don't pretend to know
+- Ask the user for help
+- Research more
+
+---
+
+## Phase 4: Implementation
+
+**Fix the root cause, not the symptom:**
+
+### 1. Create Failing Test Case
+
+- Simplest possible reproduction
+- Automated test if possible
+- MUST have before fixing
+- Use the `test-driven-development` skill
+
+### 2. Implement Single Fix
+
+- Address the root cause identified
+- ONE change at a time
+- No "while I'm here" improvements
+- No bundled refactoring
+
+### 3. Verify Fix
+
+```bash
+# Run the specific regression test
+pytest tests/test_module.py::test_regression -v
+
+# Run full suite — no regressions
+pytest tests/ -q
+```
+
+### 4. If Fix Doesn't Work — The Rule of Three
+
+- **STOP.**
+- Count: How many fixes have you tried?
+- If < 3: Return to Phase 1, re-analyze with new information
+- **If ≥ 3: STOP and question the architecture (step 5 below)**
+- DON'T attempt Fix #4 without architectural discussion
+
+### 5. If 3+ Fixes Failed: Question Architecture
+
+**Pattern indicating an architectural problem:**
+- Each fix reveals new shared state/coupling in a different place
+- Fixes require "massive refactoring" to implement
+- Each fix creates new symptoms elsewhere
+
+**STOP and question fundamentals:**
+- Is this pattern fundamentally sound?
+- Are we "sticking with it through sheer inertia"?
+- Should we refactor the architecture vs. continue fixing symptoms?
+
+**Discuss with the user before attempting more fixes.**
+
+This is NOT a failed hypothesis — this is a wrong architecture.
+
+---
+
+## Red Flags — STOP and Follow Process
+
+If you catch yourself thinking:
+- "Quick fix for now, investigate later"
+- "Just try changing X and see if it works"
+- "Add multiple changes, run tests"
+- "Skip the test, I'll manually verify"
+- "It's probably X, let me fix that"
+- "I don't fully understand but this might work"
+- "Pattern says X but I'll adapt it differently"
+- "Here are the main problems: [lists fixes without investigation]"
+- Proposing solutions before tracing data flow
+- **"One more fix attempt" (when already tried 2+)**
+- **Each fix reveals a new problem in a different place**
+
+**ALL of these mean: STOP. Return to Phase 1.**
+
+**If 3+ fixes failed:** Question the architecture (Phase 4 step 5).
+
+## Common Rationalizations
+
+| Excuse | Reality |
+|--------|---------|
+| "Issue is simple, don't need process" | Simple issues have root causes too. Process is fast for simple bugs. |
+| "Emergency, no time for process" | Systematic debugging is FASTER than guess-and-check thrashing. |
+| "Just try this first, then investigate" | First fix sets the pattern. Do it right from the start. |
+| "I'll write test after confirming fix works" | Untested fixes don't stick. Writing the test first proves the fix works. |
+| "Multiple fixes at once saves time" | Can't isolate what worked. Causes new bugs. |
+| "Reference too long, I'll adapt the pattern" | Partial understanding guarantees bugs. Read it completely. |
+| "I see the problem, let me fix it" | Seeing symptoms ≠ understanding root cause. |
+| "One more fix attempt" (after 2+ failures) | 3+ failures = architectural problem. Question the pattern, don't fix again. |
+
+## Quick Reference
+
+| Phase | Key Activities | Success Criteria |
+|-------|---------------|------------------|
+| **1. Root Cause** | Read errors, reproduce, check changes, gather evidence, trace data flow | Understand WHAT and WHY |
+| **2. Pattern** | Find working examples, compare, identify differences | Know what's different |
+| **3. Hypothesis** | Form theory, test minimally, one variable at a time | Confirmed or new hypothesis |
+| **4. Implementation** | Create regression test, fix root cause, verify | Bug resolved, all tests pass |
+
+## Hermes Agent Integration
+
+### Investigation Tools
+
+Use these Hermes tools during Phase 1:
+
+- **`search_files`** — Find error strings, trace function calls, locate patterns
+- **`read_file`** — Read source code with line numbers for precise analysis
+- **`terminal`** — Run tests, check git history, reproduce bugs
+- **`web_search`/`web_extract`** — Research error messages, library docs
+
+### With delegate_task
+
+For complex multi-component debugging, dispatch investigation subagents:
+
+```python
+delegate_task(
+    goal="Investigate why [specific test/behavior] fails",
+    context="""
+    Follow systematic-debugging skill:
+    1. Read the error message carefully
+    2. Reproduce the issue
+    3. Trace the data flow to find root cause
+    4. Report findings — do NOT fix yet
+
+    Error: [paste full error]
+    File: [path to failing code]
+    Test command: [exact command]
+    """,
+    toolsets=['terminal', 'file']
+)
+```
+
+### With test-driven-development
+
+When fixing bugs:
+1. Write a test that reproduces the bug (RED)
+2. Debug systematically to find root cause
+3. Fix the root cause (GREEN)
+4. The test proves the fix and prevents regression
+
+## Real-World Impact
+
+From debugging sessions:
+- Systematic approach: 15-30 minutes to fix
+- Random fixes approach: 2-3 hours of thrashing
+- First-time fix rate: 95% (systematic) vs 40% (random fixes)
+- New bugs introduced: Near zero (systematic) vs common (random fixes)
+
+**No shortcuts. No guessing. Systematic always wins.**
diff --git a/output/systematic-debugging/20260424_232123/metrics.json b/output/systematic-debugging/20260424_232123/metrics.json
new file mode 100644
index 00000000..bfa61e13
--- /dev/null
+++ b/output/systematic-debugging/20260424_232123/metrics.json
@@ -0,0 +1,17 @@
+{
+  "skill_name": "systematic-debugging",
+  "timestamp": "20260424_232123",
+  "iterations": 3,
+  "optimizer_model": "openrouter/anthropic/claude-sonnet-4",
+  "eval_model": "openrouter/google/gemini-2.5-flash",
+  "baseline_score": 0.5561533244876898,
+  "evolved_score": 0.5561533244876898,
+  "improvement": 0.0,
+  "baseline_size": 10020,
+  "evolved_size": 10020,
+  "train_examples": 10,
+  "val_examples": 5,
+  "holdout_examples": 5,
+  "elapsed_seconds": 81.4948399066925,
+  "constraints_passed": true
+}
\ No newline at end of file
diff --git a/output/systematic-debugging/20260424_232423/baseline_skill.md b/output/systematic-debugging/20260424_232423/baseline_skill.md
new file mode 100644
index 00000000..70a68d58
--- /dev/null
+++ b/output/systematic-debugging/20260424_232423/baseline_skill.md
@@ -0,0 +1,366 @@
+---
+name: systematic-debugging
+description: Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first.
+version: 1.1.0
+author: Hermes Agent (adapted from obra/superpowers)
+license: MIT
+metadata:
+  hermes:
+    tags: [debugging, troubleshooting, problem-solving, root-cause, investigation]
+    related_skills: [test-driven-development, writing-plans, subagent-driven-development]
+---
+
+# Systematic Debugging
+
+## Overview
+
+Random fixes waste time and create new bugs. Quick patches mask underlying issues.
+
+**Core principle:** ALWAYS find root cause before attempting fixes. Symptom fixes are failure.
+
+**Violating the letter of this process is violating the spirit of debugging.**
+
+## The Iron Law
+
+```
+NO FIXES WITHOUT ROOT CAUSE INVESTIGATION FIRST
+```
+
+If you haven't completed Phase 1, you cannot propose fixes.
+
+## When to Use
+
+Use for ANY technical issue:
+- Test failures
+- Bugs in production
+- Unexpected behavior
+- Performance problems
+- Build failures
+- Integration issues
+
+**Use this ESPECIALLY when:**
+- Under time pressure (emergencies make guessing tempting)
+- "Just one quick fix" seems obvious
+- You've already tried multiple fixes
+- Previous fix didn't work
+- You don't fully understand the issue
+
+**Don't skip when:**
+- Issue seems simple (simple bugs have root causes too)
+- You're in a hurry (rushing guarantees rework)
+- Someone wants it fixed NOW (systematic is faster than thrashing)
+
+## The Four Phases
+
+You MUST complete each phase before proceeding to the next.
+
+---
+
+## Phase 1: Root Cause Investigation
+
+**BEFORE attempting ANY fix:**
+
+### 1. Read Error Messages Carefully
+
+- Don't skip past errors or warnings
+- They often contain the exact solution
+- Read stack traces completely
+- Note line numbers, file paths, error codes
+
+**Action:** Use `read_file` on the relevant source files. Use `search_files` to find the error string in the codebase.
+
+### 2. Reproduce Consistently
+
+- Can you trigger it reliably?
+- What are the exact steps?
+- Does it happen every time?
+- If not reproducible → gather more data, don't guess
+
+**Action:** Use the `terminal` tool to run the failing test or trigger the bug:
+
+```bash
+# Run specific failing test
+pytest tests/test_module.py::test_name -v
+
+# Run with verbose output
+pytest tests/test_module.py -v --tb=long
+```
+
+### 3. Check Recent Changes
+
+- What changed that could cause this?
+- Git diff, recent commits
+- New dependencies, config changes
+
+**Action:**
+
+```bash
+# Recent commits
+git log --oneline -10
+
+# Uncommitted changes
+git diff
+
+# Changes in specific file
+git log -p --follow src/problematic_file.py | head -100
+```
+
+### 4. Gather Evidence in Multi-Component Systems
+
+**WHEN system has multiple components (API → service → database, CI → build → deploy):**
+
+**BEFORE proposing fixes, add diagnostic instrumentation:**
+
+For EACH component boundary:
+- Log what data enters the component
+- Log what data exits the component
+- Verify environment/config propagation
+- Check state at each layer
+
+Run once to gather evidence showing WHERE it breaks.
+THEN analyze evidence to identify the failing component.
+THEN investigate that specific component.
+
+### 5. Trace Data Flow
+
+**WHEN error is deep in the call stack:**
+
+- Where does the bad value originate?
+- What called this function with the bad value?
+- Keep tracing upstream until you find the source
+- Fix at the source, not at the symptom
+
+**Action:** Use `search_files` to trace references:
+
+```python
+# Find where the function is called
+search_files("function_name(", path="src/", file_glob="*.py")
+
+# Find where the variable is set
+search_files("variable_name\\s*=", path="src/", file_glob="*.py")
+```
+
+### Phase 1 Completion Checklist
+
+- [ ] Error messages fully read and understood
+- [ ] Issue reproduced consistently
+- [ ] Recent changes identified and reviewed
+- [ ] Evidence gathered (logs, state, data flow)
+- [ ] Problem isolated to specific component/code
+- [ ] Root cause hypothesis formed
+
+**STOP:** Do not proceed to Phase 2 until you understand WHY it's happening.
+
+---
+
+## Phase 2: Pattern Analysis
+
+**Find the pattern before fixing:**
+
+### 1. Find Working Examples
+
+- Locate similar working code in the same codebase
+- What works that's similar to what's broken?
+
+**Action:** Use `search_files` to find comparable patterns:
+
+```python
+search_files("similar_pattern", path="src/", file_glob="*.py")
+```
+
+### 2. Compare Against References
+
+- If implementing a pattern, read the reference implementation COMPLETELY
+- Don't skim — read every line
+- Understand the pattern fully before applying
+
+### 3. Identify Differences
+
+- What's different between working and broken?
+- List every difference, however small
+- Don't assume "that can't matter"
+
+### 4. Understand Dependencies
+
+- What other components does this need?
+- What settings, config, environment?
+- What assumptions does it make?
+
+---
+
+## Phase 3: Hypothesis and Testing
+
+**Scientific method:**
+
+### 1. Form a Single Hypothesis
+
+- State clearly: "I think X is the root cause because Y"
+- Write it down
+- Be specific, not vague
+
+### 2. Test Minimally
+
+- Make the SMALLEST possible change to test the hypothesis
+- One variable at a time
+- Don't fix multiple things at once
+
+### 3. Verify Before Continuing
+
+- Did it work? → Phase 4
+- Didn't work? → Form NEW hypothesis
+- DON'T add more fixes on top
+
+### 4. When You Don't Know
+
+- Say "I don't understand X"
+- Don't pretend to know
+- Ask the user for help
+- Research more
+
+---
+
+## Phase 4: Implementation
+
+**Fix the root cause, not the symptom:**
+
+### 1. Create Failing Test Case
+
+- Simplest possible reproduction
+- Automated test if possible
+- MUST have before fixing
+- Use the `test-driven-development` skill
+
+### 2. Implement Single Fix
+
+- Address the root cause identified
+- ONE change at a time
+- No "while I'm here" improvements
+- No bundled refactoring
+
+### 3. Verify Fix
+
+```bash
+# Run the specific regression test
+pytest tests/test_module.py::test_regression -v
+
+# Run full suite — no regressions
+pytest tests/ -q
+```
+
+### 4. If Fix Doesn't Work — The Rule of Three
+
+- **STOP.**
+- Count: How many fixes have you tried?
+- If < 3: Return to Phase 1, re-analyze with new information
+- **If ≥ 3: STOP and question the architecture (step 5 below)**
+- DON'T attempt Fix #4 without architectural discussion
+
+### 5. If 3+ Fixes Failed: Question Architecture
+
+**Pattern indicating an architectural problem:**
+- Each fix reveals new shared state/coupling in a different place
+- Fixes require "massive refactoring" to implement
+- Each fix creates new symptoms elsewhere
+
+**STOP and question fundamentals:**
+- Is this pattern fundamentally sound?
+- Are we "sticking with it through sheer inertia"?
+- Should we refactor the architecture vs. continue fixing symptoms?
+
+**Discuss with the user before attempting more fixes.**
+
+This is NOT a failed hypothesis — this is a wrong architecture.
+
+---
+
+## Red Flags — STOP and Follow Process
+
+If you catch yourself thinking:
+- "Quick fix for now, investigate later"
+- "Just try changing X and see if it works"
+- "Add multiple changes, run tests"
+- "Skip the test, I'll manually verify"
+- "It's probably X, let me fix that"
+- "I don't fully understand but this might work"
+- "Pattern says X but I'll adapt it differently"
+- "Here are the main problems: [lists fixes without investigation]"
+- Proposing solutions before tracing data flow
+- **"One more fix attempt" (when already tried 2+)**
+- **Each fix reveals a new problem in a different place**
+
+**ALL of these mean: STOP. Return to Phase 1.**
+
+**If 3+ fixes failed:** Question the architecture (Phase 4 step 5).
+
+## Common Rationalizations
+
+| Excuse | Reality |
+|--------|---------|
+| "Issue is simple, don't need process" | Simple issues have root causes too. Process is fast for simple bugs. |
+| "Emergency, no time for process" | Systematic debugging is FASTER than guess-and-check thrashing. |
+| "Just try this first, then investigate" | First fix sets the pattern. Do it right from the start. |
+| "I'll write test after confirming fix works" | Untested fixes don't stick. Writing the test first proves the fix works. |
+| "Multiple fixes at once saves time" | Can't isolate what worked. Causes new bugs. |
+| "Reference too long, I'll adapt the pattern" | Partial understanding guarantees bugs. Read it completely. |
+| "I see the problem, let me fix it" | Seeing symptoms ≠ understanding root cause. |
+| "One more fix attempt" (after 2+ failures) | 3+ failures = architectural problem. Question the pattern, don't fix again. |
+
+## Quick Reference
+
+| Phase | Key Activities | Success Criteria |
+|-------|---------------|------------------|
+| **1. Root Cause** | Read errors, reproduce, check changes, gather evidence, trace data flow | Understand WHAT and WHY |
+| **2. Pattern** | Find working examples, compare, identify differences | Know what's different |
+| **3. Hypothesis** | Form theory, test minimally, one variable at a time | Confirmed or new hypothesis |
+| **4. Implementation** | Create regression test, fix root cause, verify | Bug resolved, all tests pass |
+
+## Hermes Agent Integration
+
+### Investigation Tools
+
+Use these Hermes tools during Phase 1:
+
+- **`search_files`** — Find error strings, trace function calls, locate patterns
+- **`read_file`** — Read source code with line numbers for precise analysis
+- **`terminal`** — Run tests, check git history, reproduce bugs
+- **`web_search`/`web_extract`** — Research error messages, library docs
+
+### With delegate_task
+
+For complex multi-component debugging, dispatch investigation subagents:
+
+```python
+delegate_task(
+    goal="Investigate why [specific test/behavior] fails",
+    context="""
+    Follow systematic-debugging skill:
+    1. Read the error message carefully
+    2. Reproduce the issue
+    3. Trace the data flow to find root cause
+    4. Report findings — do NOT fix yet
+
+    Error: [paste full error]
+    File: [path to failing code]
+    Test command: [exact command]
+    """,
+    toolsets=['terminal', 'file']
+)
+```
+
+### With test-driven-development
+
+When fixing bugs:
+1. Write a test that reproduces the bug (RED)
+2. Debug systematically to find root cause
+3. Fix the root cause (GREEN)
+4. The test proves the fix and prevents regression
+
+## Real-World Impact
+
+From debugging sessions:
+- Systematic approach: 15-30 minutes to fix
+- Random fixes approach: 2-3 hours of thrashing
+- First-time fix rate: 95% (systematic) vs 40% (random fixes)
+- New bugs introduced: Near zero (systematic) vs common (random fixes)
+
+**No shortcuts. No guessing. Systematic always wins.**
diff --git a/output/systematic-debugging/20260424_232423/evolved_skill.md b/output/systematic-debugging/20260424_232423/evolved_skill.md
new file mode 100644
index 00000000..70a68d58
--- /dev/null
+++ b/output/systematic-debugging/20260424_232423/evolved_skill.md
@@ -0,0 +1,366 @@
+---
+name: systematic-debugging
+description: Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first.
+version: 1.1.0
+author: Hermes Agent (adapted from obra/superpowers)
+license: MIT
+metadata:
+  hermes:
+    tags: [debugging, troubleshooting, problem-solving, root-cause, investigation]
+    related_skills: [test-driven-development, writing-plans, subagent-driven-development]
+---
+
+# Systematic Debugging
+
+## Overview
+
+Random fixes waste time and create new bugs. Quick patches mask underlying issues.
+
+**Core principle:** ALWAYS find root cause before attempting fixes. Symptom fixes are failure.
+
+**Violating the letter of this process is violating the spirit of debugging.**
+
+## The Iron Law
+
+```
+NO FIXES WITHOUT ROOT CAUSE INVESTIGATION FIRST
+```
+
+If you haven't completed Phase 1, you cannot propose fixes.
+
+## When to Use
+
+Use for ANY technical issue:
+- Test failures
+- Bugs in production
+- Unexpected behavior
+- Performance problems
+- Build failures
+- Integration issues
+
+**Use this ESPECIALLY when:**
+- Under time pressure (emergencies make guessing tempting)
+- "Just one quick fix" seems obvious
+- You've already tried multiple fixes
+- Previous fix didn't work
+- You don't fully understand the issue
+
+**Don't skip when:**
+- Issue seems simple (simple bugs have root causes too)
+- You're in a hurry (rushing guarantees rework)
+- Someone wants it fixed NOW (systematic is faster than thrashing)
+
+## The Four Phases
+
+You MUST complete each phase before proceeding to the next.
+
+---
+
+## Phase 1: Root Cause Investigation
+
+**BEFORE attempting ANY fix:**
+
+### 1. Read Error Messages Carefully
+
+- Don't skip past errors or warnings
+- They often contain the exact solution
+- Read stack traces completely
+- Note line numbers, file paths, error codes
+
+**Action:** Use `read_file` on the relevant source files. Use `search_files` to find the error string in the codebase.
+
+### 2. Reproduce Consistently
+
+- Can you trigger it reliably?
+- What are the exact steps?
+- Does it happen every time?
+- If not reproducible → gather more data, don't guess
+
+**Action:** Use the `terminal` tool to run the failing test or trigger the bug:
+
+```bash
+# Run specific failing test
+pytest tests/test_module.py::test_name -v
+
+# Run with verbose output
+pytest tests/test_module.py -v --tb=long
+```
+
+### 3. Check Recent Changes
+
+- What changed that could cause this?
+- Git diff, recent commits
+- New dependencies, config changes
+
+**Action:**
+
+```bash
+# Recent commits
+git log --oneline -10
+
+# Uncommitted changes
+git diff
+
+# Changes in specific file
+git log -p --follow src/problematic_file.py | head -100
+```
+
+### 4. Gather Evidence in Multi-Component Systems
+
+**WHEN system has multiple components (API → service → database, CI → build → deploy):**
+
+**BEFORE proposing fixes, add diagnostic instrumentation:**
+
+For EACH component boundary:
+- Log what data enters the component
+- Log what data exits the component
+- Verify environment/config propagation
+- Check state at each layer
+
+Run once to gather evidence showing WHERE it breaks.
+THEN analyze evidence to identify the failing component.
+THEN investigate that specific component.
+
+### 5. Trace Data Flow
+
+**WHEN error is deep in the call stack:**
+
+- Where does the bad value originate?
+- What called this function with the bad value?
+- Keep tracing upstream until you find the source
+- Fix at the source, not at the symptom
+
+**Action:** Use `search_files` to trace references:
+
+```python
+# Find where the function is called
+search_files("function_name(", path="src/", file_glob="*.py")
+
+# Find where the variable is set
+search_files("variable_name\\s*=", path="src/", file_glob="*.py")
+```
+
+### Phase 1 Completion Checklist
+
+- [ ] Error messages fully read and understood
+- [ ] Issue reproduced consistently
+- [ ] Recent changes identified and reviewed
+- [ ] Evidence gathered (logs, state, data flow)
+- [ ] Problem isolated to specific component/code
+- [ ] Root cause hypothesis formed
+
+**STOP:** Do not proceed to Phase 2 until you understand WHY it's happening.
+
+---
+
+## Phase 2: Pattern Analysis
+
+**Find the pattern before fixing:**
+
+### 1. Find Working Examples
+
+- Locate similar working code in the same codebase
+- What works that's similar to what's broken?
+
+**Action:** Use `search_files` to find comparable patterns:
+
+```python
+search_files("similar_pattern", path="src/", file_glob="*.py")
+```
+
+### 2. Compare Against References
+
+- If implementing a pattern, read the reference implementation COMPLETELY
+- Don't skim — read every line
+- Understand the pattern fully before applying
+
+### 3. Identify Differences
+
+- What's different between working and broken?
+- List every difference, however small
+- Don't assume "that can't matter"
+
+### 4. Understand Dependencies
+
+- What other components does this need?
+- What settings, config, environment?
+- What assumptions does it make?
+
+---
+
+## Phase 3: Hypothesis and Testing
+
+**Scientific method:**
+
+### 1. Form a Single Hypothesis
+
+- State clearly: "I think X is the root cause because Y"
+- Write it down
+- Be specific, not vague
+
+### 2. Test Minimally
+
+- Make the SMALLEST possible change to test the hypothesis
+- One variable at a time
+- Don't fix multiple things at once
+
+### 3. Verify Before Continuing
+
+- Did it work? → Phase 4
+- Didn't work? → Form NEW hypothesis
+- DON'T add more fixes on top
+
+### 4. When You Don't Know
+
+- Say "I don't understand X"
+- Don't pretend to know
+- Ask the user for help
+- Research more
+
+---
+
+## Phase 4: Implementation
+
+**Fix the root cause, not the symptom:**
+
+### 1. Create Failing Test Case
+
+- Simplest possible reproduction
+- Automated test if possible
+- MUST have before fixing
+- Use the `test-driven-development` skill
+
+### 2. Implement Single Fix
+
+- Address the root cause identified
+- ONE change at a time
+- No "while I'm here" improvements
+- No bundled refactoring
+
+### 3. Verify Fix
+
+```bash
+# Run the specific regression test
+pytest tests/test_module.py::test_regression -v
+
+# Run full suite — no regressions
+pytest tests/ -q
+```
+
+### 4. If Fix Doesn't Work — The Rule of Three
+
+- **STOP.**
+- Count: How many fixes have you tried?
+- If < 3: Return to Phase 1, re-analyze with new information
+- **If ≥ 3: STOP and question the architecture (step 5 below)**
+- DON'T attempt Fix #4 without architectural discussion
+
+### 5. If 3+ Fixes Failed: Question Architecture
+
+**Pattern indicating an architectural problem:**
+- Each fix reveals new shared state/coupling in a different place
+- Fixes require "massive refactoring" to implement
+- Each fix creates new symptoms elsewhere
+
+**STOP and question fundamentals:**
+- Is this pattern fundamentally sound?
+- Are we "sticking with it through sheer inertia"?
+- Should we refactor the architecture vs. continue fixing symptoms?
+
+**Discuss with the user before attempting more fixes.**
+
+This is NOT a failed hypothesis — this is a wrong architecture.
+
+---
+
+## Red Flags — STOP and Follow Process
+
+If you catch yourself thinking:
+- "Quick fix for now, investigate later"
+- "Just try changing X and see if it works"
+- "Add multiple changes, run tests"
+- "Skip the test, I'll manually verify"
+- "It's probably X, let me fix that"
+- "I don't fully understand but this might work"
+- "Pattern says X but I'll adapt it differently"
+- "Here are the main problems: [lists fixes without investigation]"
+- Proposing solutions before tracing data flow
+- **"One more fix attempt" (when already tried 2+)**
+- **Each fix reveals a new problem in a different place**
+
+**ALL of these mean: STOP. Return to Phase 1.**
+
+**If 3+ fixes failed:** Question the architecture (Phase 4 step 5).
+
+## Common Rationalizations
+
+| Excuse | Reality |
+|--------|---------|
+| "Issue is simple, don't need process" | Simple issues have root causes too. Process is fast for simple bugs. |
+| "Emergency, no time for process" | Systematic debugging is FASTER than guess-and-check thrashing. |
+| "Just try this first, then investigate" | First fix sets the pattern. Do it right from the start. |
+| "I'll write test after confirming fix works" | Untested fixes don't stick. Test first proves it. |
+| "Multiple fixes at once saves time" | Can't isolate what worked. Causes new bugs. |
+| "Reference too long, I'll adapt the pattern" | Partial understanding guarantees bugs. Read it completely. |
+| "I see the problem, let me fix it" | Seeing symptoms ≠ understanding root cause. |
+| "One more fix attempt" (after 2+ failures) | 3+ failures = architectural problem. Question the pattern, don't fix again. |
+
+## Quick Reference
+
+| Phase | Key Activities | Success Criteria |
+|-------|---------------|------------------|
+| **1. Root Cause** | Read errors, reproduce, check changes, gather evidence, trace data flow | Understand WHAT and WHY |
+| **2. Pattern** | Find working examples, compare, identify differences | Know what's different |
+| **3. Hypothesis** | Form theory, test minimally, one variable at a time | Confirmed or new hypothesis |
+| **4. Implementation** | Create regression test, fix root cause, verify | Bug resolved, all tests pass |
+
+## Hermes Agent Integration
+
+### Investigation Tools
+
+Use these Hermes tools during Phase 1:
+
+- **`search_files`** — Find error strings, trace function calls, locate patterns
+- **`read_file`** — Read source code with line numbers for precise analysis
+- **`terminal`** — Run tests, check git history, reproduce bugs
+- **`web_search`/`web_extract`** — Research error messages, library docs
+
+### With delegate_task
+
+For complex multi-component debugging, dispatch investigation subagents:
+
+```python
+delegate_task(
+    goal="Investigate why [specific test/behavior] fails",
+    context="""
+    Follow systematic-debugging skill:
+    1. Read the error message carefully
+    2. Reproduce the issue
+    3. Trace the data flow to find root cause
+    4. Report findings — do NOT fix yet
+
+    Error: [paste full error]
+    File: [path to failing code]
+    Test command: [exact command]
+    """,
+    toolsets=['terminal', 'file']
+)
+```
+
+### With test-driven-development
+
+When fixing bugs:
+1. Write a test that reproduces the bug (RED)
+2. Debug systematically to find root cause
+3. Fix the root cause (GREEN)
+4. The test proves the fix and prevents regression
+
+## Real-World Impact
+
+From debugging sessions:
+- Systematic approach: 15-30 minutes to fix
+- Random fixes approach: 2-3 hours of thrashing
+- First-time fix rate: 95% (systematic) vs 40% (random fixes)
+- New bugs introduced: Near zero (systematic) vs common (random fixes)
+
+**No shortcuts. No guessing. Systematic always wins.**
diff --git a/output/systematic-debugging/20260424_232423/metrics.json b/output/systematic-debugging/20260424_232423/metrics.json
new file mode 100644
index 00000000..6c73fd2e
--- /dev/null
+++ b/output/systematic-debugging/20260424_232423/metrics.json
@@ -0,0 +1,17 @@
+{
+  "skill_name": "systematic-debugging",
+  "timestamp": "20260424_232423",
+  "iterations": 3,
+  "optimizer_model": "openrouter/anthropic/claude-sonnet-4",
+  "eval_model": "openrouter/google/gemini-2.5-flash",
+  "baseline_score": 0.4787204285302781,
+  "evolved_score": 0.4787204285302781,
+  "improvement": 0.0,
+  "baseline_size": 10020,
+  "evolved_size": 10020,
+  "train_examples": 10,
+  "val_examples": 5,
+  "holdout_examples": 5,
+  "elapsed_seconds": 93.34848666191101,
+  "constraints_passed": true
+}
\ No newline at end of file
diff --git a/run-evolution.sh b/run-evolution.sh
new file mode 100755
index 00000000..1c780732
--- /dev/null
+++ b/run-evolution.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# Wrapper to run hermes-agent-self-evolution with provider selection.
+#set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Load Hermes' secrets, then local config; `set -a` exports whatever each file assigns.
+for env_file in "$HOME/.hermes/.env" "$SCRIPT_DIR/.env"; do
+    [[ -f "$env_file" ]] || continue
+    set -a
+    source "$env_file"
+    set +a
+done
+
+PROVIDER="${PROVIDER:-openrouter}"
+if [[ "$PROVIDER" == "nous" ]]; then
+    # Nous Research: OpenRouter-powered with coding-plan discounts
+    OPTIMIZER_MODEL="${OPTIMIZER_MODEL:-anthropic/claude-sonnet-4.6}"
+    EVAL_MODEL="${EVAL_MODEL:-moonshotai/kimi-k2.6}"
+    if [[ -n "${NOUS_API_KEY:-}" ]]; then
+        export OPENROUTER_API_KEY="$NOUS_API_KEY"
+    fi
+    echo "🧬 Provider: Nous Research (Kimi K2.6 evaluator is FREE)"
+else
+    # OpenRouter defaults
+    OPTIMIZER_MODEL="${OPTIMIZER_MODEL:-anthropic/claude-sonnet-4.6}"
+    EVAL_MODEL="${EVAL_MODEL:-google/gemini-3.1-flash-lite}"
+    # NOTE(review): JUDGE_MODEL is never passed to evolve_skill below — confirm how it is consumed.
+    JUDGE_MODEL="${JUDGE_MODEL:-anthropic/claude-sonnet-4.6}"
+    echo "🧬 Provider: OpenRouter"
+fi
+
+# Both branches rely on OPENROUTER_API_KEY (nous copies NOUS_API_KEY into it), so check it once.
+if [[ -z "${OPENROUTER_API_KEY:-}" ]]; then
+    echo "ERROR: OPENROUTER_API_KEY is not set. Set it (or NOUS_API_KEY for PROVIDER=nous) in ~/.hermes/.env or $SCRIPT_DIR/.env" >&2
+    exit 1
+fi
+
+# Prefer the project venv; if it is missing, say so instead of emitting a bare `source` error.
+if [[ -f "$SCRIPT_DIR/venv/bin/activate" ]]; then
+    source "$SCRIPT_DIR/venv/bin/activate"
+else
+    echo "WARNING: $SCRIPT_DIR/venv/bin/activate not found; using 'python' from PATH" >&2
+fi
+
+exec python -m evolution.skills.evolve_skill \
+    --optimizer-model "$OPTIMIZER_MODEL" \
+    --eval-model "$EVAL_MODEL" \
+    "$@"