From 64b5d27e67c13b777521b94ad0b5b2ec7e074730 Mon Sep 17 00:00:00 2001
From: Evan Senter <evansenter@gmail.com>
Date: Sun, 4 Jan 2026 04:17:18 +0000
Subject: [PATCH 1/2] feat: Close drill-down gaps in MCP API (RFC #49)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Self-play testing revealed that LLMs could see aggregate counts but
couldn't drill down to actionable specifics. This closes those gaps:

- **error_examples in analyze_failures**: When errors_by_tool shows
  "Bash: 5 errors", error_examples now reveals WHICH commands failed
  (top 5 per tool with counts)

- **classification_factors in classify_sessions**: Sessions now include
  the trigger threshold and relevant metrics explaining WHY they were
  categorized (e.g., "error_rate > 15%", error_rate: 33.2%)

- **fnmatch for permission_gaps**: Patterns like Bash(make*) now
  correctly match commands using glob patterns, not just exact matches

- **Clearer sample_sequences error**: Message now says "must be
  alphanumeric or underscores" (the old text said only "alphanumeric"
  even though underscores were always accepted)

Closes #45, #46, #48, #49
Supersedes #47 (was already working via tool_id joins)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/session_analytics/guide.md    |  34 ++++--
 src/session_analytics/patterns.py | 166 ++++++++++++++++++++++++------
 src/session_analytics/queries.py  |  47 ++++++++-
 tests/test_patterns.py            | 137 ++++++++++++++++++++++--
 tests/test_queries.py             |  61 +++++++++++
 5 files changed, 389 insertions(+), 56 deletions(-)

diff --git a/src/session_analytics/guide.md b/src/session_analytics/guide.md
index c233f14..b33eab0 100644
--- a/src/session_analytics/guide.md
+++ b/src/session_analytics/guide.md
@@ -38,20 +38,29 @@ identify permission gaps.
 |------|---------|
 | `get_tool_sequences(days?, min_count?, length?)` | Common tool chains (e.g., Read → Edit → Bash) |
 | `sample_sequences(pattern, limit?, context_events?)` | Random samples of a pattern with surrounding context |
-| `get_permission_gaps(days?, min_count?)` | Commands that should be in settings.json |
+| `get_permission_gaps(days?, min_count?)` | Commands not covered by settings.json (supports glob patterns) |
 | `get_insights(days?, refresh?)` | Pre-computed patterns for /improve-workflow |
 
 ### Failure Analysis
 
 | Tool | Purpose |
 |------|---------|
-| `analyze_failures(days?, project?)` | Failure patterns, rework, and correlations |
+| `analyze_failures(days?, project?)` | Failure patterns with drill-down to specific commands |
+
+Returns:
+- `errors_by_tool`: Count of errors per tool
+- `error_examples`: Up to 5 top failing commands (Bash) or files (Edit/Read/Write) per tool, with counts, for drill-down
+- `rework_patterns`: Files edited 3+ times within 10 minutes
 
 ### Session Classification
 
 | Tool | Purpose |
 |------|---------|
-| `classify_sessions(days?, project?)` | Categorize sessions (debugging, development, research, maintenance) |
+| `classify_sessions(days?, project?)` | Categorize sessions with explanation of why |
+
+Each session includes `classification_factors` explaining WHY it was categorized:
+- `trigger`: The threshold that was exceeded (e.g., "error_rate > 15%"), or "no_dominant_pattern" for mixed sessions
+- Relevant metrics (error_rate, edit_rate, etc.)
 
 ### Trend Analysis
 
@@ -168,15 +177,15 @@ analyze_trends()      → "Usage is increasing/decreasing"
 
 ### Session Categories
 
-`classify_sessions()` returns one of these categories:
+`classify_sessions()` returns one of these categories, with `classification_factors` explaining why:
 
-| Category | Criteria |
-|----------|----------|
-| **debugging** | High error rate (>15%) or 5+ errors |
-| **development** | Heavy editing (>30% edits or 3+ writes) |
-| **maintenance** | Git/build focus without much editing |
-| **research** | Mostly reading/searching codebase |
-| **mixed** | No dominant pattern |
+| Category | Criteria | Trigger Example |
+|----------|----------|-----------------|
+| **debugging** | High error rate (>15%) or more than 5 errors | `"error_rate > 15%"` |
+| **development** | Heavy editing (>30% edits or more than 3 writes) | `"edit_rate > 30%"` |
+| **maintenance** | Git/build focus without much editing | `"git_build_rate > 30%"` |
+| **research** | Mostly reading/searching codebase | `"read_search_rate > 50%"` |
+| **mixed** | No dominant pattern | `"no_dominant_pattern"` |
 
 ### Permission Gaps
 
@@ -189,6 +198,9 @@ get_permission_gaps(min_count=5)
 
 Add suggestions to `permissions.allow` in your settings.
 
+**Note:** Supports glob pattern matching. Patterns like `Bash(make*)` will correctly
+match commands `make`, `make-test`, etc. using fnmatch.
+
 ### Git Integration
 
 Git correlation requires two steps:
diff --git a/src/session_analytics/patterns.py b/src/session_analytics/patterns.py
index f3e9384..935a989 100644
--- a/src/session_analytics/patterns.py
+++ b/src/session_analytics/patterns.py
@@ -1,5 +1,6 @@
 """Pattern detection and insight generation for session analytics."""
 
+import fnmatch
 import json
 import logging
 import random
@@ -241,12 +242,12 @@ def sample_sequences(
     else:
         target_tools = [t.strip() for t in pattern.split(",")]
 
-    # Validate individual tool names (alphanumeric + underscore only)
+    # Validate individual tool names (alphanumeric and underscores only)
     for tool in target_tools:
         if not tool or not all(c.isalnum() or c == "_" for c in tool):
             return {
                 "pattern": pattern,
-                "error": f"Invalid tool name: '{tool}' (must be alphanumeric)",
+                "error": f"Invalid tool name: '{tool}' (must be alphanumeric or underscores)",
                 "total_occurrences": 0,
                 "samples": [],
             }
@@ -389,6 +390,7 @@ def analyze_failures(
 
     Identifies:
     - Tool errors (is_error=True in tool_result)
+    - Error examples showing top failing commands/files per tool
     - Rework patterns (same file edited multiple times quickly)
     - Error clustering by tool/command
 
@@ -398,7 +400,10 @@ def analyze_failures(
         rework_window_minutes: Time window for detecting rework (default: 10)
 
     Returns:
-        Dict with failure analysis including error counts, rework patterns, recovery times
+        Dict with:
+        - errors_by_tool: Count of errors per tool
+        - error_examples: Top failing commands (Bash) or files (Edit/Read/Write) per tool
+        - rework_patterns: Instances of same file edited 3+ times quickly
     """
     cutoff = get_cutoff(days=days)
 
@@ -452,6 +457,54 @@ def analyze_failures(
         if row["tool_name"]
     ]
 
+    # Get error examples: top failing commands/files for drill-down
+    # For Bash, group by command; for file tools, group by file_path
+    error_examples_rows = storage.execute_query(
+        """
+        SELECT
+            e2.tool_name,
+            e2.command,
+            e2.file_path,
+            COUNT(*) as error_count
+        FROM events e1
+        JOIN events e2 ON e1.tool_id = e2.tool_id AND e2.entry_type = 'tool_use'
+        WHERE e1.timestamp >= ?
+          AND e1.is_error = 1
+          AND e1.entry_type = 'tool_result'
+        GROUP BY e2.tool_name, e2.command, e2.file_path
+        ORDER BY e2.tool_name, error_count DESC
+        """,
+        (cutoff,),
+    )
+
+    # Organize error examples by tool with top 5 examples each
+    error_examples: dict[str, list[dict]] = {}
+    tool_example_counts: dict[str, int] = {}
+
+    for row in error_examples_rows:
+        tool = row["tool_name"]
+        if not tool:
+            continue
+
+        # Limit to 5 examples per tool
+        if tool_example_counts.get(tool, 0) >= 5:
+            continue
+
+        if tool not in error_examples:
+            error_examples[tool] = []
+            tool_example_counts[tool] = 0
+
+        # Build example based on tool type
+        if tool == "Bash" and row["command"]:
+            error_examples[tool].append({"command": row["command"], "count": row["error_count"]})
+        elif row["file_path"]:
+            error_examples[tool].append({"file": row["file_path"], "count": row["error_count"]})
+        else:
+            # Generic fallback
+            error_examples[tool].append({"count": row["error_count"]})
+
+        tool_example_counts[tool] += 1
+
     # Detect rework patterns: same file edited multiple times in quick succession
     rework_window = timedelta(minutes=rework_window_minutes)
 
@@ -542,6 +595,7 @@ def analyze_failures(
         "sessions_with_errors": sessions_with_errors,
         "avg_errors_per_session": round(avg_errors_per_session, 2),
         "errors_by_tool": errors_by_tool[:10],
+        "error_examples": error_examples,
         "rework_patterns": {
             "instances_detected": len(rework_instances),
             "rework_window_minutes": rework_window_minutes,
@@ -550,50 +604,101 @@ def analyze_failures(
     }
 
 
-def load_allowed_commands(settings_path: Path = DEFAULT_SETTINGS_PATH) -> set[str]:
-    """Load allowed base commands from Claude Code settings.json.
-
-    Parses Bash permission patterns and extracts base commands:
-    - Bash(gh:*) → gh
-    - Bash(gh pr view:*) → gh
-    - Bash(git status:*) → git
-
-    This means a command like `gh` won't be reported as a permission gap
-    if ANY pattern for `gh` exists (e.g., `Bash(gh pr view:*)`).
+def load_allowed_commands(
+    settings_path: Path = DEFAULT_SETTINGS_PATH,
+) -> tuple[set[str], list[str]]:
+    """Load allowed base commands and glob patterns from Claude Code settings.json.
+
+    Parses Bash permission patterns and extracts:
+    1. Base commands for simple matching:
+       - Bash(gh:*) → gh
+       - Bash(gh pr view:*) → gh
+       - Bash(git status:*) → git
+    2. Glob patterns for fnmatch matching:
+       - Bash(make*) → make*
+       - Bash(./scripts/*.sh:*) → ./scripts/*.sh
 
     Args:
         settings_path: Path to settings.json
 
     Returns:
-        Set of base commands that have any configured pattern
+        Tuple of (base_commands set, glob_patterns list for fnmatch)
     """
     if not settings_path.exists():
-        return set()
+        return set(), []
 
     try:
         with open(settings_path) as f:
             settings = json.load(f)
 
-        base_commands = set()
+        base_commands: set[str] = set()
+        glob_patterns: list[str] = []
         permissions = settings.get("permissions", {})
 
         for pattern in permissions.get("allow", []):
-            if pattern.startswith("Bash(") and ":*)" in pattern:
-                # Extract full command from "Bash(command args:*)"
-                # Find the position of ":*)" to handle patterns correctly
-                start = 5  # len("Bash(")
-                end = pattern.find(":*)")
-                if end > start:
-                    full_cmd = pattern[start:end]
-                    # Extract base command (first word)
-                    base_cmd = full_cmd.split()[0] if full_cmd else None
+            if not pattern.startswith("Bash(") or not pattern.endswith(")"):
+                continue
+
+            # Extract content from Bash(...)
+            content = pattern[5:-1]  # Remove "Bash(" and ")"
+            if not content:
+                continue
+
+            # Handle different formats
+            if ":*" in content:
+                # Standard format: Bash(cmd:*) or Bash(cmd args:*)
+                full_cmd = content.split(":*")[0]
+                # Extract base command (first word)
+                base_cmd = full_cmd.split()[0] if full_cmd else None
+                if base_cmd:
+                    base_commands.add(base_cmd)
+                    # Also store as glob pattern for fnmatch
+                    glob_patterns.append(base_cmd)
+            elif "*" in content or "?" in content or "[" in content:
+                # Glob pattern: Bash(make*), Bash(./scripts/*.sh)
+                # Extract base command (remove glob chars for base matching)
+                base = content.rstrip("*").rstrip()
+                if base:
+                    # For patterns like "make*", base is "make"
+                    base_cmd = base.split()[0] if base else None
                     if base_cmd:
                         base_commands.add(base_cmd)
+                # Store full pattern for fnmatch
+                glob_patterns.append(content)
+            else:
+                # Exact match: Bash(cmd)
+                base_cmd = content.split()[0] if content else None
+                if base_cmd:
+                    base_commands.add(base_cmd)
+                    glob_patterns.append(base_cmd)
 
-        return base_commands
+        return base_commands, glob_patterns
     except (json.JSONDecodeError, OSError) as e:
         logger.warning(f"Could not load settings.json: {e}")
-        return set()
+        return set(), []
+
+
+def _command_matches_patterns(cmd: str, base_commands: set[str], glob_patterns: list[str]) -> bool:
+    """Check if a command is covered by allowed patterns.
+
+    Args:
+        cmd: The base command to check (e.g., "git", "make")
+        base_commands: Set of allowed base commands
+        glob_patterns: List of glob patterns for fnmatch
+
+    Returns:
+        True if command is allowed by any pattern
+    """
+    # First check simple base command membership
+    if cmd in base_commands:
+        return True
+
+    # Then check glob patterns using fnmatch
+    for pattern in glob_patterns:
+        if fnmatch.fnmatch(cmd, pattern):
+            return True
+
+    return False
 
 
 def compute_permission_gaps(
@@ -604,6 +709,9 @@ def compute_permission_gaps(
 ) -> list[Pattern]:
     """Find commands that are frequently used but not in settings.json.
 
+    Uses fnmatch for glob pattern matching, so patterns like Bash(make*)
+    will correctly match commands like 'make', 'make-test', etc.
+
     Args:
         storage: Storage instance
         days: Number of days to analyze
@@ -616,7 +724,7 @@ def compute_permission_gaps(
     cutoff = get_cutoff(days=days)
     now = datetime.now()
 
-    allowed_commands = load_allowed_commands(settings_path)
+    base_commands, glob_patterns = load_allowed_commands(settings_path)
 
     rows = storage.execute_query(
         """
@@ -633,7 +741,7 @@ def compute_permission_gaps(
     patterns = []
     for row in rows:
         cmd = row["command"]
-        if cmd not in allowed_commands:
+        if not _command_matches_patterns(cmd, base_commands, glob_patterns):
             patterns.append(
                 Pattern(
                     id=None,
diff --git a/src/session_analytics/queries.py b/src/session_analytics/queries.py
index 57a041a..d413c51 100644
--- a/src/session_analytics/queries.py
+++ b/src/session_analytics/queries.py
@@ -1052,13 +1052,18 @@ def classify_sessions(
     - research: Read/search heavy, exploring codebase
     - maintenance: CI/git heavy, infrastructure work
 
+    Each session includes `classification_factors` explaining WHY it was
+    categorized, including the trigger threshold and relevant metrics.
+
     Args:
         storage: Storage instance
         days: Number of days to analyze (default: 7)
         project: Optional project filter
 
     Returns:
-        Dict with session classifications and category distribution
+        Dict with:
+        - sessions: List with category, confidence, classification_factors, and stats
+        - category_distribution: Count of sessions per category
     """
     cutoff = get_cutoff(days=days)
 
@@ -1123,21 +1128,52 @@ def classify_sessions(
         # - Maintenance: Git/build focus without editing (>30% combined)
         # - Research: Mostly reading/searching codebase (>50% combined)
         # - Mixed: No dominant pattern, balanced activity
-        if error_pct > 0.15 or (row["error_count"] or 0) > 5:
+        error_count = row["error_count"] or 0
+        write_count = row["write_count"] or 0
+
+        if error_pct > 0.15 or error_count > 5:
             category = "debugging"
             confidence = min(1.0, error_pct * 3)
-        elif edit_pct > 0.3 or (row["write_count"] or 0) > 3:
+            classification_factors = {
+                "trigger": "error_rate > 15%" if error_pct > 0.15 else "error_count > 5",
+                "error_rate": round(error_pct * 100, 1),
+                "error_count": error_count,
+            }
+        elif edit_pct > 0.3 or write_count > 3:
             category = "development"
-            confidence = min(1.0, (edit_pct + (row["write_count"] or 0) / total) * 2)
+            confidence = min(1.0, (edit_pct + write_count / total) * 2)
+            classification_factors = {
+                "trigger": "edit_rate > 30%" if edit_pct > 0.3 else "write_count > 3",
+                "edit_rate": round(edit_pct * 100, 1),
+                "write_count": write_count,
+            }
         elif git_pct + build_pct > 0.3:
             category = "maintenance"
             confidence = min(1.0, (git_pct + build_pct) * 2)
+            classification_factors = {
+                "trigger": "git_build_rate > 30%",
+                "git_rate": round(git_pct * 100, 1),
+                "build_rate": round(build_pct * 100, 1),
+            }
         elif read_pct + search_pct > 0.5:
             category = "research"
             confidence = min(1.0, (read_pct + search_pct) * 1.5)
+            classification_factors = {
+                "trigger": "read_search_rate > 50%",
+                "read_rate": round(read_pct * 100, 1),
+                "search_rate": round(search_pct * 100, 1),
+            }
         else:
             category = "mixed"
             confidence = 0.5
+            classification_factors = {
+                "trigger": "no_dominant_pattern",
+                "top_activities": {
+                    "edit_rate": round(edit_pct * 100, 1),
+                    "read_rate": round(read_pct * 100, 1),
+                    "search_rate": round(search_pct * 100, 1),
+                },
+            }
 
         category_counts[category] += 1
 
@@ -1147,13 +1183,14 @@ def classify_sessions(
                 "project": row["project_path"],
                 "category": category,
                 "confidence": round(confidence, 2),
+                "classification_factors": classification_factors,
                 "stats": {
                     "total_events": row["total_events"],
                     "edit_count": row["edit_count"] or 0,
                     "read_count": row["read_count"] or 0,
                     "search_count": row["search_count"] or 0,
                     "git_count": row["git_count"] or 0,
-                    "error_count": row["error_count"] or 0,
+                    "error_count": error_count,
                 },
                 "first_seen": _format_timestamp(row["first_seen"]),
                 "last_seen": _format_timestamp(row["last_seen"]),
diff --git a/tests/test_patterns.py b/tests/test_patterns.py
index 1e29466..e82c0e2 100644
--- a/tests/test_patterns.py
+++ b/tests/test_patterns.py
@@ -92,17 +92,18 @@ def test_load_allowed_commands_missing_file(self):
         """Test loading allowed commands from non-existent file."""
         with tempfile.TemporaryDirectory() as tmpdir:
             missing_path = Path(tmpdir) / "nonexistent.json"
-            allowed = load_allowed_commands(missing_path)
-            assert allowed == set()
+            base_commands, glob_patterns = load_allowed_commands(missing_path)
+            assert base_commands == set()
+            assert glob_patterns == []
 
     def test_load_allowed_commands(self):
         """Test loading allowed commands from settings.json."""
         with tempfile.TemporaryDirectory() as tmpdir:
             settings_path = Path(tmpdir) / "settings.json"
             settings_path.write_text('{"permissions": {"allow": ["Bash(git:*)", "Bash(make:*)"]}}')
-            allowed = load_allowed_commands(settings_path)
-            assert "git" in allowed
-            assert "make" in allowed
+            base_commands, glob_patterns = load_allowed_commands(settings_path)
+            assert "git" in base_commands
+            assert "make" in base_commands
 
     def test_compute_permission_gaps(self, pattern_storage):
         """Test computing permission gaps."""
@@ -150,16 +151,16 @@ def test_load_allowed_commands_extracts_base_from_subcommands(self):
                 '"Bash(cargo build:*)"'
                 "]}}"
             )
-            allowed = load_allowed_commands(settings_path)
+            base_commands, glob_patterns = load_allowed_commands(settings_path)
 
             # Should extract base commands, not full subcommands
-            assert "gh" in allowed
-            assert "git" in allowed
-            assert "cargo" in allowed
+            assert "gh" in base_commands
+            assert "git" in base_commands
+            assert "cargo" in base_commands
 
             # Should NOT contain full subcommand strings
-            assert "gh pr view" not in allowed
-            assert "git status" not in allowed
+            assert "gh pr view" not in base_commands
+            assert "git status" not in base_commands
 
     def test_permission_gaps_filters_subcommand_patterns(self, pattern_storage):
         """Test that gaps are filtered when subcommand patterns exist.
@@ -184,6 +185,52 @@ def test_permission_gaps_filters_subcommand_patterns(self, pattern_storage):
             # make has no patterns, should still be a gap
             assert "make" in pattern_keys
 
+    def test_load_allowed_commands_handles_glob_patterns(self):
+        """Test that glob patterns (without :*) are handled correctly.
+
+        Patterns like Bash(make*) should be recognized and used for
+        fnmatch-based matching.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            settings_path = Path(tmpdir) / "settings.json"
+            settings_path.write_text(
+                '{"permissions": {"allow": ['
+                '"Bash(make*)", '
+                '"Bash(./scripts/*.sh:*)", '
+                '"Bash(cargo)"'
+                "]}}"
+            )
+            base_commands, glob_patterns = load_allowed_commands(settings_path)
+
+            # Should extract base commands
+            assert "make" in base_commands
+            assert "cargo" in base_commands
+
+            # Glob patterns should be stored for fnmatch
+            assert "make*" in glob_patterns
+            assert "cargo" in glob_patterns
+
+    def test_permission_gaps_uses_fnmatch(self, pattern_storage):
+        """Test that permission gaps uses fnmatch for glob pattern matching.
+
+        If settings has Bash(make*), then 'make' should NOT be reported
+        as a permission gap because it matches the glob pattern.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            settings_path = Path(tmpdir) / "settings.json"
+            # Use glob pattern without :*
+            settings_path.write_text('{"permissions": {"allow": ["Bash(make*)"]}}')
+
+            patterns = compute_permission_gaps(
+                pattern_storage, days=7, threshold=1, settings_path=settings_path
+            )
+
+            pattern_keys = {p.pattern_key for p in patterns}
+            # make should be filtered out by fnmatch against "make*"
+            assert "make" not in pattern_keys
+            # git has no matching pattern, should still be a gap
+            assert "git" in pattern_keys
+
 
 class TestComputeAllPatterns:
     """Tests for computing all patterns."""
@@ -503,6 +550,74 @@ def test_rework_not_detected_different_files(self, storage):
         # Different files shouldn't count as rework
         assert result["rework_patterns"]["instances_detected"] == 0
 
+    def test_analyze_failures_error_examples(self, storage):
+        """Test that error_examples provides drill-down to specific failing commands/files.
+
+        RFC #49: When errors_by_tool shows 'Bash: 5 errors', error_examples should
+        reveal WHICH commands failed, enabling actionable diagnosis.
+        """
+        from session_analytics.patterns import analyze_failures
+
+        now = datetime.now()
+        events = [
+            # Bash error with command
+            Event(
+                id=None,
+                uuid="bash-use-1",
+                timestamp=now - timedelta(hours=1),
+                session_id="s1",
+                entry_type="tool_use",
+                tool_name="Bash",
+                tool_id="bash-1",
+                command="make test",
+            ),
+            Event(
+                id=None,
+                uuid="bash-result-1",
+                timestamp=now - timedelta(hours=1, minutes=-1),
+                session_id="s1",
+                entry_type="tool_result",
+                tool_id="bash-1",
+                is_error=True,
+            ),
+            # Read error with file_path
+            Event(
+                id=None,
+                uuid="read-use-1",
+                timestamp=now - timedelta(hours=2),
+                session_id="s1",
+                entry_type="tool_use",
+                tool_name="Read",
+                tool_id="read-1",
+                file_path="/nonexistent/file.py",
+            ),
+            Event(
+                id=None,
+                uuid="read-result-1",
+                timestamp=now - timedelta(hours=2, minutes=-1),
+                session_id="s1",
+                entry_type="tool_result",
+                tool_id="read-1",
+                is_error=True,
+            ),
+        ]
+        storage.add_events_batch(events)
+
+        result = analyze_failures(storage, days=7)
+
+        # Verify error_examples exists
+        assert "error_examples" in result
+
+        # Bash errors should include the failing command
+        bash_examples = result["error_examples"].get("Bash", [])
+        assert len(bash_examples) >= 1
+        assert any(ex.get("command") == "make test" for ex in bash_examples)
+
+        # Read errors should include the failing file
+        read_examples = result["error_examples"].get("Read", [])
+        assert len(read_examples) >= 1
+        assert any(ex.get("file") == "/nonexistent/file.py" for ex in read_examples)
+
 
 class TestAnalyzeTrends:
     """Tests for the analyze_trends function (Phase 7: Trend Analysis)."""
diff --git a/tests/test_queries.py b/tests/test_queries.py
index 10ae3f0..9119316 100644
--- a/tests/test_queries.py
+++ b/tests/test_queries.py
@@ -1106,6 +1106,67 @@ def test_min_event_threshold(self, storage):
         # Session with only 3 events should be excluded
         assert result["session_count"] == 0
 
+    def test_classification_factors_included(self, storage):
+        """Test that classification_factors explains WHY sessions were categorized.
+
+        RFC #49: Without classification_factors, an LLM seeing 'category: debugging'
+        cannot explain to the user why it was classified that way.
+        """
+        from session_analytics.queries import classify_sessions
+
+        now = datetime.now()
+        events = []
+        # Create session with >15% error rate to trigger debugging classification
+        for i in range(6):
+            events.append(
+                Event(
+                    id=None,
+                    uuid=f"factors-tool-{i}",
+                    timestamp=now - timedelta(hours=1, minutes=i),
+                    session_id="factors-session",
+                    project_path="/factors/project",
+                    entry_type="tool_use",
+                    tool_name="Bash",
+                    tool_id=f"tool-{i}",
+                )
+            )
+        # Add 2 error results (33% error rate)
+        for i in range(2):
+            events.append(
+                Event(
+                    id=None,
+                    uuid=f"factors-error-{i}",
+                    timestamp=now - timedelta(hours=1, minutes=i + 10),
+                    session_id="factors-session",
+                    project_path="/factors/project",
+                    entry_type="tool_result",
+                    tool_id=f"tool-{i}",
+                    is_error=True,
+                )
+            )
+        storage.add_events_batch(events)
+
+        result = classify_sessions(storage, days=7)
+
+        session = next(
+            (s for s in result["sessions"] if s["session_id"] == "factors-session"),
+            None,
+        )
+        assert session is not None
+        assert session["category"] == "debugging"
+
+        # Verify classification_factors exists and explains WHY
+        assert "classification_factors" in session
+        factors = session["classification_factors"]
+
+        # Should include the trigger that caused this classification
+        assert "trigger" in factors
+        assert "error_rate" in factors["trigger"] or "error_count" in factors["trigger"]
+
+        # Should include the relevant metrics
+        assert "error_rate" in factors
+        assert factors["error_rate"] > 15  # Should be ~33%
+
 
 class TestGetUserJourneyIncludeProjects:
     """Test for get_user_journey with include_projects=False."""

From 3cceb366f4150d4ec6584e0f1279b040010dfa7f Mon Sep 17 00:00:00 2001
From: Evan Senter <evansenter@gmail.com>
Date: Sun, 4 Jan 2026 04:28:22 +0000
Subject: [PATCH 2/2] style: Clarify confusing timedelta in test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address reviewer feedback: use timedelta(minutes=59) instead of
timedelta(hours=1, minutes=-1) for clarity in test timestamps.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 tests/test_patterns.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_patterns.py b/tests/test_patterns.py
index e82c0e2..7f707b2 100644
--- a/tests/test_patterns.py
+++ b/tests/test_patterns.py
@@ -574,7 +574,7 @@ def test_analyze_failures_error_examples(self, storage):
             Event(
                 id=None,
                 uuid="bash-result-1",
-                timestamp=now - timedelta(hours=1, minutes=-1),
+                timestamp=now - timedelta(minutes=59),  # Shortly after bash-use-1
                 session_id="s1",
                 entry_type="tool_result",
                 tool_id="bash-1",
@@ -594,7 +594,7 @@ def test_analyze_failures_error_examples(self, storage):
             Event(
                 id=None,
                 uuid="read-result-1",
-                timestamp=now - timedelta(hours=2, minutes=-1),
+                timestamp=now - timedelta(minutes=119),  # Shortly after read-use-1
                 session_id="s1",
                 entry_type="tool_result",
                 tool_id="read-1",