From 8c68210f8ffaced079f9bf8452cae746986c1b05 Mon Sep 17 00:00:00 2001 From: Evan Senter Date: Sat, 10 Jan 2026 01:30:15 +0000 Subject: [PATCH 1/3] feat: Add assistant messages to get_session_messages (closes #68) Add unified `message_text` column for all entry types (user, assistant, tool_result, summary) with FTS indexing across all message types. Key changes: - Add migration v8 for message_text column and rebuilt FTS index - Update ingestion to extract text from assistant, tool_result, summary entries - Add entry_types parameter to filter by message type (default: user, assistant) - Add max_message_length parameter for truncation (default: 500, 0=no limit) - Update search_messages to search across all message types - Update CLI with --entry-types and --max-length flags - Add 5 new tests for the new functionality Co-Authored-By: Claude Opus 4.5 --- docs/SCHEMA.md | 14 +- src/session_analytics/cli.py | 68 ++++++-- src/session_analytics/guide.md | 14 +- src/session_analytics/ingest.py | 89 +++++++++- src/session_analytics/queries.py | 41 ++++- src/session_analytics/server.py | 34 +++- src/session_analytics/storage.py | 272 ++++++++++++++++++++++--------- tests/conftest.py | 4 +- tests/test_queries.py | 133 +++++++++++++-- tests/test_storage.py | 117 ++++++++++--- 10 files changed, 631 insertions(+), 155 deletions(-) diff --git a/docs/SCHEMA.md b/docs/SCHEMA.md index e73ae94..b9ff784 100644 --- a/docs/SCHEMA.md +++ b/docs/SCHEMA.md @@ -70,8 +70,9 @@ CREATE TABLE events ( cwd TEXT, -- User journey (RFC #17) - user_message_text TEXT, -- For FTS search + user_message_text TEXT, -- Deprecated: use message_text exit_code INTEGER, -- Reserved for future extraction + message_text TEXT, -- Unified text for all entry types (Issue #68) -- Agent tracking (RFC #41) parent_uuid TEXT, -- Links tool_use to parent assistant event @@ -86,7 +87,7 @@ CREATE TABLE events ( **Key patterns**: - `entry_type='tool_use'` + `entry_type='tool_result'` are correlated by `tool_id` - Token columns only populated on `entry_type='assistant'` to avoid double-counting -- `user_message_text` enables FTS via `events_fts` virtual table +- `message_text` enables FTS via `events_fts` virtual table for all entry types - `tool_input_json` preserves full parameters for drill-down queries ### sessions @@ -201,7 +202,7 @@ Performance-critical indexes on the `events` table: | `idx_events_tool_id` | `tool_id` | Self-join for tool_use ↔ tool_result correlation | | `idx_events_parent_uuid` | `parent_uuid` | Token deduplication queries | | `idx_events_agent_id` | `agent_id` | Agent activity breakdown | -| `idx_events_has_user_message` | Partial on `id` | FTS join optimization | +| `idx_events_has_message_text` | Partial on `id` | FTS join optimization (WHERE message_text IS NOT NULL) | **Performance note**: The `idx_events_tool_id` index is critical for `query_error_details()` which self-joins events to correlate errors with their input parameters. Without it, queries take ~25s on 160K rows; with it, ~0.3s. @@ -223,18 +224,18 @@ Performance-critical indexes on the `events` table: ## Full-Text Search -User messages are indexed via FTS5: +All message types (user, assistant, tool_result, summary) are indexed via FTS5: ```sql CREATE VIRTUAL TABLE events_fts USING fts5( - user_message_text, + message_text, content='events', content_rowid='id' ) ``` Sync triggers maintain index consistency: -- `events_fts_insert`: Populates FTS on new events +- `events_fts_insert`: Populates FTS on new events with message_text - `events_fts_delete`: Removes from FTS on delete - `events_fts_update`: Handles message text changes @@ -251,6 +252,7 @@ Sync triggers maintain index consistency: | 5 | add_agent_tracking | parent_uuid, agent_id, is_sidechain, version | | 6 | add_event_bus_integration | bus_events table | | 7 | add_tool_id_index | Performance index for self-joins | +| 8 | add_unified_message_text | Unified message_text column, rebuilt FTS on all entry types (Issue #68) | --- diff --git a/src/session_analytics/cli.py b/src/session_analytics/cli.py index 60dfece..da4e5b4 100644 --- a/src/session_analytics/cli.py +++ b/src/session_analytics/cli.py @@ -280,9 +280,11 @@ def _format_sample_sequences(data: dict) -> list[str]: @_register_formatter(lambda d: "journey" in d and "message_count" in d) def _format_user_journey(data: dict) -> list[str]: + entry_types = data.get("entry_types", ["user", "assistant"]) lines = [ - f"User Journey (last {data['hours']} hours)", + f"Session Messages (last {data['hours']} hours)", f"Messages: {data['message_count']}", + f"Types: {', '.join(entry_types)}", ] if data.get("projects_visited"): lines.append(f"Projects: {len(data['projects_visited'])}") @@ -292,29 +294,36 @@ def _format_user_journey(data: dict) -> list[str]: for event in data.get("journey", []): ts = event.get("timestamp", "")[:16] if event.get("timestamp") else "unknown" msg = event.get("message", "") if event.get("message") else "" + msg_type = event.get("type", "user") project = event.get("project", "") + type_prefix = f"[{msg_type[0].upper()}]" # [U], [A], [T], [S] if project: - lines.append(f" [{ts}] ({project}) {msg}") + lines.append(f" [{ts}] {type_prefix} ({project}) {msg}") else: - lines.append(f" [{ts}] {msg}") + lines.append(f" [{ts}] {type_prefix} {msg}") return lines @_register_formatter(lambda d: "query" in d and "messages" in d and "count" in d) def _format_search_results(data: dict) -> list[str]: + entry_types = data.get("entry_types") lines = [ f"Search: {data['query']}", f"Results: {data['count']}", - "", ] + if entry_types: + lines.append(f"Types: {', '.join(entry_types)}") + lines.append("") for msg in data.get("messages", []): ts = msg.get("timestamp", "")[:16] if msg.get("timestamp") else "unknown" text = msg.get("message", "") if msg.get("message") else "" + msg_type = msg.get("type", "user") project = msg.get("project", "") + type_prefix = f"[{msg_type[0].upper()}]" # [U], [A], [T], [S] if project: - lines.append(f" [{ts}] ({project}) {text}") + lines.append(f" [{ts}] {type_prefix} ({project}) {text}") else: - lines.append(f" [{ts}] {text}") + lines.append(f" [{ts}] {type_prefix} {text}") return lines @@ -795,25 +804,36 @@ def cmd_sample_sequences(args): def cmd_journey(args): - """Show user messages across sessions.""" + """Show messages across sessions.""" storage = SQLiteStorage() hours = int(args.days * 24) + entry_types = getattr(args, "entry_types", None) + if entry_types: + entry_types = [t.strip() for t in entry_types.split(",")] + max_length = getattr(args, "max_length", 500) result = get_user_journey( storage, hours=hours, include_projects=not args.no_projects, session_id=getattr(args, "session_id", None), limit=args.limit, + entry_types=entry_types, + max_message_length=max_length, ) print(format_output(result, args.json)) def cmd_search(args): - """Search user messages using full-text search.""" + """Search messages using full-text search.""" storage = SQLiteStorage() project = getattr(args, "project", None) + entry_types = getattr(args, "entry_types", None) + if entry_types: + entry_types = [t.strip() for t in entry_types.split(",")] try: - results = storage.search_user_messages(args.query, limit=args.limit, project=project) + results = storage.search_messages( + args.query, limit=args.limit, project=project, entry_types=entry_types + ) except sqlite3.OperationalError as e: # Catch FTS5-related errors (syntax, unterminated strings, etc.) output = { @@ -826,13 +846,15 @@ def cmd_search(args): output = { "query": args.query, "project": project, + "entry_types": entry_types, "count": len(results), "messages": [ { "timestamp": e.timestamp.isoformat() if e.timestamp else None, "session_id": e.session_id, "project": e.project_path, - "message": e.user_message_text, + "type": e.entry_type, + "message": e.message_text, } for e in results ], @@ -1136,8 +1158,16 @@ def cmd_benchmark(args): storage, pattern="Read → Edit", count=2 ), "get_permission_gaps": lambda: patterns_compute_permission_gaps(storage, days=7), - "get_session_messages": lambda: queries_get_user_journey(storage, hours=24), - "search_messages": lambda: storage.search_user_messages("test", limit=10), + "get_session_messages": lambda: queries_get_user_journey( + storage, hours=24, entry_types=["user", "assistant"] + ), + "get_session_messages_all": lambda: queries_get_user_journey( + storage, hours=24, entry_types=["user", "assistant", "tool_result"] + ), + "search_messages": lambda: storage.search_messages("test", limit=10), + "search_messages_filtered": lambda: storage.search_messages( + "test", limit=10, entry_types=["user", "assistant"] + ), "detect_parallel_sessions": lambda: queries_detect_parallel_sessions(storage, hours=24), "get_insights": lambda: patterns_get_insights(storage, refresh=False, days=7), "analyze_failures": lambda: patterns_analyze_failures(storage, days=7), @@ -1287,20 +1317,30 @@ def main(): sub.set_defaults(func=cmd_sample_sequences) # journey (maps to get_session_messages MCP tool) - sub = subparsers.add_parser("journey", help="Show user messages across sessions") + sub = subparsers.add_parser("journey", help="Show messages across sessions") sub.add_argument( "--days", type=float, default=1, help="Days to look back (default: 1, supports 0.5 for 12h)" ) sub.add_argument("--limit", type=int, default=100, help="Max messages (default: 100)") sub.add_argument("--no-projects", action="store_true", help="Exclude project info") sub.add_argument("--session-id", help="Filter to specific session ID") + sub.add_argument( + "--entry-types", + help="Entry types to include, comma-separated (default: user,assistant)", + ) + sub.add_argument( + "--max-length", type=int, default=500, help="Max message length (default: 500, 0=no limit)" + ) sub.set_defaults(func=cmd_journey) # search - sub = subparsers.add_parser("search", help="Search user messages (FTS)") + sub = subparsers.add_parser("search", help="Search messages (FTS)") sub.add_argument("query", help="FTS5 query (e.g., 'auth', '\"fix bug\"', 'skip OR defer')") sub.add_argument("--limit", type=int, default=50, help="Max results (default: 50)") sub.add_argument("--project", help="Project path filter") + sub.add_argument( + "--entry-types", help="Entry types to search, comma-separated (default: all)" + ) sub.set_defaults(func=cmd_search) # parallel diff --git a/src/session_analytics/guide.md b/src/session_analytics/guide.md index a60de0f..8d6051e 100644 --- a/src/session_analytics/guide.md +++ b/src/session_analytics/guide.md @@ -74,12 +74,16 @@ Each session includes `classification_factors` explaining WHY it was categorized |------|---------| | `analyze_trends(days?, compare_to?)` | Token/event trends with growth rates | -### User Messages +### Session Messages | Tool | Purpose | |------|---------| -| `get_session_messages(days?, project?, session_id?)` | User messages across sessions chronologically | -| `search_messages(query, limit?)` | Full-text search on user messages (FTS5) | +| `get_session_messages(days?, session_id?, entry_types?, max_message_length?)` | Messages across sessions chronologically (user + assistant by default) | +| `search_messages(query, limit?, entry_types?)` | Full-text search across all message types (FTS5) | + +**entry_types**: Filter by `["user"]`, `["assistant"]`, `["tool_result"]`, `["summary"]` or any combination. Default: `["user", "assistant"]`. + +**max_message_length**: Truncate messages (default: 500, 0=no limit). ### Session Relationships @@ -169,9 +173,9 @@ use the APIs however best fits your needs. │ DRILL INTO SPECIFICS │ ├─────────────────────────────────────────────────────────────────┤ │ get_session_events(session_id=X) → Full event trace │ -│ get_session_messages(session_id=X) → User intent │ +│ get_session_messages(session_id=X) → User+assistant messages │ │ get_session_commits(session_id=X) → Work products │ -│ search_messages("query") → Find specific topics │ +│ search_messages("query") → Find across all messages │ └─────────────────────────────────────────────────────────────────┘ ``` diff --git a/src/session_analytics/ingest.py b/src/session_analytics/ingest.py index 224e552..46eed30 100644 --- a/src/session_analytics/ingest.py +++ b/src/session_analytics/ingest.py @@ -1,5 +1,7 @@ """JSONL log ingestion for Claude Code session analytics.""" +from __future__ import annotations + import json import logging import re @@ -17,6 +19,74 @@ # Maximum length for user message text to prevent DB bloat while preserving context USER_MESSAGE_MAX_LENGTH = 2000 +# No limit for message_text - user requested full content including tool results +# Set to None to indicate no truncation +MESSAGE_TEXT_MAX_LENGTH = None + + +def extract_text_from_content(content) -> str | None: + """Extract text content from various message content formats. + + Handles: + - Plain strings + - List of content blocks (text, tool_use, tool_result, etc.) + + Returns concatenated text from all text blocks, or None if no text found. + """ + if content is None: + return None + + if isinstance(content, str): + return content if content else None + + if isinstance(content, list): + text_parts = [] + for item in content: + if isinstance(item, str): + text_parts.append(item) + elif isinstance(item, dict): + item_type = item.get("type") + if item_type == "text": + text_parts.append(item.get("text", "")) + # Skip tool_use, tool_result - they're handled separately + if text_parts: + return " ".join(text_parts) + + return None + + +def extract_tool_result_content(tool_result: dict) -> str | None: + """Extract content from a tool_result block. + + Tool results can have various content formats: + - String content directly + - List of content blocks + - Nested content structures + """ + content = tool_result.get("content") + if content is None: + return None + + if isinstance(content, str): + return content if content else None + + if isinstance(content, list): + text_parts = [] + for item in content: + if isinstance(item, str): + text_parts.append(item) + elif isinstance(item, dict): + item_type = item.get("type") + if item_type == "text": + text_parts.append(item.get("text", "")) + elif item_type == "image": + # Image results - just note that an image was returned + text_parts.append("[image]") + if text_parts: + return "\n".join(text_parts) + + return None + def find_log_files( logs_dir: Path = DEFAULT_LOGS_DIR, @@ -212,6 +282,9 @@ def parse_entry(raw: dict, project_path: str) -> list[Event]: content = message.get("content", []) tool_uses = [c for c in content if isinstance(c, dict) and c.get("type") == "tool_use"] + # Extract assistant's text response (Issue #68) + assistant_text = extract_text_from_content(content) + # ALWAYS create assistant event with tokens (fixes token duplication) events.append( Event( @@ -228,6 +301,7 @@ def parse_entry(raw: dict, project_path: str) -> list[Event]: model=model, git_branch=git_branch, cwd=cwd, + message_text=assistant_text, # Issue #68: unified message text # RFC #41: Agent tracking fields parent_uuid=None, # Assistant events have no parent agent_id=agent_id, @@ -275,7 +349,7 @@ def parse_entry(raw: dict, project_path: str) -> list[Event]: elif entry_type == "user": content = message.get("content", "") - # Extract user message text for user journey tracking + # Extract user message text for user journey tracking (truncated for backwards compat) user_message_text = None if isinstance(content, str): user_message_text = content[:USER_MESSAGE_MAX_LENGTH] if content else None @@ -290,6 +364,9 @@ def parse_entry(raw: dict, project_path: str) -> list[Event]: if text_parts: user_message_text = " ".join(text_parts)[:USER_MESSAGE_MAX_LENGTH] + # Issue #68: Extract full message text (no truncation) + message_text = extract_text_from_content(content) + # Extract command name from isMeta user messages (slash command expansions) # e.g., /status-report expands to a user message starting with "# Status Report" is_meta = raw.get("isMeta", False) @@ -304,6 +381,8 @@ def parse_entry(raw: dict, project_path: str) -> list[Event]: for tr in tool_results: # Check for error is_error = tr.get("is_error", False) + # Issue #68: Extract tool result content + tool_result_text = extract_tool_result_content(tr) events.append( Event( id=None, @@ -316,6 +395,7 @@ def parse_entry(raw: dict, project_path: str) -> list[Event]: is_error=is_error, git_branch=git_branch, cwd=cwd, + message_text=tool_result_text, # Issue #68: full tool result # RFC #41: Agent tracking fields agent_id=agent_id, is_sidechain=is_sidechain, @@ -334,6 +414,7 @@ def parse_entry(raw: dict, project_path: str) -> list[Event]: entry_type="command" if command_name else "user", skill_name=command_name, # Reuse skill_name for command tracking user_message_text=user_message_text, + message_text=message_text, # Issue #68: unified message text git_branch=git_branch, cwd=cwd, # RFC #41: Agent tracking fields @@ -354,6 +435,7 @@ def parse_entry(raw: dict, project_path: str) -> list[Event]: entry_type="command" if command_name else "user", skill_name=command_name, # Reuse skill_name for command tracking user_message_text=user_message_text, + message_text=message_text, # Issue #68: unified message text git_branch=git_branch, cwd=cwd, # RFC #41: Agent tracking fields @@ -365,6 +447,10 @@ def parse_entry(raw: dict, project_path: str) -> list[Event]: # Handle summary entries elif entry_type == "summary": + # Issue #68: Extract summary text + summary_content = message.get("content", "") if message else raw.get("summary", "") + summary_text = extract_text_from_content(summary_content) + events.append( Event( id=None, @@ -373,6 +459,7 @@ def parse_entry(raw: dict, project_path: str) -> list[Event]: session_id=session_id if session_id else "unknown", project_path=project_path, entry_type="summary", + message_text=summary_text, # Issue #68: unified message text # RFC #41: Agent tracking fields agent_id=agent_id, is_sidechain=is_sidechain, diff --git a/src/session_analytics/queries.py b/src/session_analytics/queries.py index 81c488c..27c6ad6 100644 --- a/src/session_analytics/queries.py +++ b/src/session_analytics/queries.py @@ -1,5 +1,7 @@ """Query implementations for session analytics.""" +from __future__ import annotations + import re from datetime import datetime, timedelta @@ -657,11 +659,14 @@ def get_user_journey( include_projects: bool = True, session_id: str | None = None, limit: int = 100, + entry_types: list[str] | None = None, + max_message_length: int = 500, ) -> dict: - """Get all user messages chronologically across sessions. + """Get messages chronologically across sessions. Shows how the user moved across sessions and projects over time, revealing task switching, project interleaving, and work patterns. + Includes both user messages and assistant responses for conversation replay. Args: storage: Storage instance @@ -669,33 +674,44 @@ def get_user_journey( include_projects: Include project info in output (default: True) session_id: Optional session ID filter (get messages from specific session) limit: Maximum messages to return (default: 100) + entry_types: Which entry types to include (default: ["user", "assistant"]) + max_message_length: Truncate messages to this length (default: 500, 0=no limit) Returns: Dict with journey events and pattern analysis """ cutoff = get_cutoff(hours=hours) + # Default to user and assistant messages + if entry_types is None: + entry_types = ["user", "assistant"] + # Build query with optional session_id filter session_filter = "" params: list = [cutoff] if session_id: session_filter = "AND session_id = ?" params.append(session_id) + + # Build entry_type filter + type_placeholders = ",".join("?" * len(entry_types)) + params.extend(entry_types) params.append(limit) - # Query user messages ordered by timestamp + # Query messages ordered by timestamp rows = storage.execute_query( f""" SELECT timestamp, session_id, project_path, - user_message_text + entry_type, + message_text FROM events WHERE timestamp >= ? - AND entry_type = 'user' - AND user_message_text IS NOT NULL + AND message_text IS NOT NULL {session_filter} + AND entry_type IN ({type_placeholders}) ORDER BY timestamp ASC LIMIT ? """, @@ -716,10 +732,16 @@ def get_user_journey( project_switches += 1 last_project = project + # Truncate message if max_message_length is set + message_text = row["message_text"] + if message_text and max_message_length > 0: + message_text = message_text[:max_message_length] + event = { "timestamp": row["timestamp"].isoformat() if row["timestamp"] else None, "session_id": row["session_id"], - "message": row["user_message_text"][:200] if row["user_message_text"] else None, + "type": row["entry_type"], + "message": message_text, } if include_projects: event["project"] = project @@ -728,6 +750,7 @@ def get_user_journey( return { "hours": hours, "session_id": session_id, + "entry_types": entry_types, "message_count": len(journey), "projects_visited": list(projects_seen) if include_projects else None, "project_switches": project_switches if include_projects else None, @@ -1288,11 +1311,11 @@ def get_handoff_context( # Get recent user messages messages = storage.execute_query( """ - SELECT timestamp, user_message_text + SELECT timestamp, message_text FROM events WHERE session_id = ? AND entry_type = 'user' - AND user_message_text IS NOT NULL + AND message_text IS NOT NULL ORDER BY timestamp DESC LIMIT ? """, @@ -1302,7 +1325,7 @@ def get_handoff_context( recent_messages = [ { "timestamp": _format_timestamp(m["timestamp"]), - "message": m["user_message_text"][:200] if m["user_message_text"] else None, + "message": m["message_text"][:200] if m["message_text"] else None, } for m in messages ] diff --git a/src/session_analytics/server.py b/src/session_analytics/server.py index 31cabb3..de02e05 100644 --- a/src/session_analytics/server.py +++ b/src/session_analytics/server.py @@ -294,17 +294,22 @@ def get_session_messages( include_projects: bool = True, session_id: str | None = None, limit: int = 100, + entry_types: list[str] | None = None, + max_message_length: int = 500, ) -> dict: - """Get all user messages chronologically across sessions. + """Get messages chronologically across sessions. Shows how the user moved across sessions and projects over time, revealing task switching, project interleaving, and work patterns. + Includes both user messages and assistant responses for conversation replay. Args: days: Number of days to look back (default: 1, supports fractions like 0.5 for 12h) include_projects: Include project info in output (default: True) session_id: Optional session ID filter (get messages from specific session) limit: Maximum messages to return (default: 100) + entry_types: Which entry types to include (default: ["user", "assistant"]) + max_message_length: Truncate messages to this length (default: 500, 0=no limit) Returns: Journey events with timestamps, sessions, and messages @@ -317,18 +322,24 @@ def get_session_messages( include_projects=include_projects, session_id=session_id, limit=limit, + entry_types=entry_types, + max_message_length=max_message_length, ) return {"status": "ok", **result} @mcp.tool() -def search_messages(query: str, limit: int = 50, project: str | None = None) -> dict: - """Search user messages using full-text search. - - Uses FTS5 to efficiently search across all user messages. Useful for finding - discussions about specific topics, decisions, or patterns across sessions. +def search_messages( + query: str, + limit: int = 50, + project: str | None = None, + entry_types: list[str] | None = None, +) -> dict: + """Search messages using full-text search. - Note: Searches user messages only, not assistant responses. + Uses FTS5 to efficiently search across all message types (user, assistant, + tool_result, summary). Useful for finding discussions about specific topics, + decisions, or patterns across sessions. Args: query: FTS5 query string. Supports: @@ -338,13 +349,16 @@ def search_messages(query: str, limit: int = 50, project: str | None = None) -> - Prefix: "implement*" limit: Maximum results to return (default: 50) project: Optional project path filter + entry_types: Optional list of entry types to filter (e.g., ["user", "assistant"]) Returns: Matching messages with session context and timestamps """ queries.ensure_fresh_data(storage) try: - results = storage.search_user_messages(query, limit=limit, project=project) + results = storage.search_messages( + query, limit=limit, project=project, entry_types=entry_types + ) except sqlite3.OperationalError as e: # Catch FTS5-related errors (syntax, unterminated strings, etc.) return { @@ -356,13 +370,15 @@ def search_messages(query: str, limit: int = 50, project: str | None = None) -> "status": "ok", "query": query, "project": project, + "entry_types": entry_types, "count": len(results), "messages": [ { "timestamp": e.timestamp.isoformat() if e.timestamp else None, "session_id": e.session_id, "project": e.project_path, - "message": e.user_message_text, + "type": e.entry_type, + "message": e.message_text, } for e in results ], diff --git a/src/session_analytics/storage.py b/src/session_analytics/storage.py index 02f9514..19b1abc 100644 --- a/src/session_analytics/storage.py +++ b/src/session_analytics/storage.py @@ -1,5 +1,7 @@ """SQLite storage backend for session analytics.""" +from __future__ import annotations + import json import logging import os @@ -63,7 +65,8 @@ class Event: cwd: str | None = None # RFC #17 Phase 1 additions - user_message_text: str | None = None # For user journey tracking + user_message_text: str | None = None # For user journey tracking (deprecated, use message_text) + message_text: str | None = None # Unified text content for all entry types (user/assistant/tool_result/summary) # TODO(Phase 4): exit_code is not currently available in Claude Code JSONL format. # The toolUseResult has stdout/stderr/interrupted but no exit code. # This field is reserved for future extraction when format changes or @@ -166,7 +169,7 @@ class BusEvent: DEFAULT_DB_PATH = Path.home() / ".claude" / "contrib" / "analytics" / "data.db" # Schema version for migrations -SCHEMA_VERSION = 7 +SCHEMA_VERSION = 8 # Migration functions: dict of version -> (migration_name, migration_func) # Each migration upgrades FROM version-1 TO version @@ -390,6 +393,89 @@ def migrate_v7(conn): conn.execute("CREATE INDEX IF NOT EXISTS idx_events_tool_id ON events(tool_id)") +@migration(8, "add_unified_message_text") +def migrate_v8(conn): + """Add unified message_text column for all entry types and rebuild FTS. + + Issue #68: Previously only user messages had text captured. This migration: + 1. Adds message_text column for user/assistant/tool_result/summary text + 2. Copies existing user_message_text to message_text + 3. Rebuilds FTS index on the new unified column + + The old user_message_text column is kept for backwards compatibility + but is deprecated. Data will be backfilled via re-ingestion. + """ + # Add message_text column + existing_cols = {row[1] for row in conn.execute("PRAGMA table_info(events)")} + if "message_text" not in existing_cols: + conn.execute("ALTER TABLE events ADD COLUMN message_text TEXT") + + # Copy existing user_message_text to message_text + conn.execute(""" + UPDATE events + SET message_text = user_message_text + WHERE user_message_text IS NOT NULL AND message_text IS NULL + """) + + # Drop old FTS triggers + conn.execute("DROP TRIGGER IF EXISTS events_fts_insert") + conn.execute("DROP TRIGGER IF EXISTS events_fts_delete") + conn.execute("DROP TRIGGER IF EXISTS events_fts_update") + + # Drop old FTS table + conn.execute("DROP TABLE IF EXISTS events_fts") + + # Create new FTS table on message_text + conn.execute(""" + CREATE VIRTUAL TABLE IF NOT EXISTS events_fts USING fts5( + message_text, + content='events', + content_rowid='id' + ) + """) + + # Populate FTS index from all events with message_text + conn.execute(""" + INSERT INTO events_fts(rowid, message_text) + SELECT id, message_text FROM events WHERE message_text IS NOT NULL + """) + + # Create triggers to keep FTS in sync + conn.execute(""" + CREATE TRIGGER IF NOT EXISTS events_fts_insert AFTER INSERT ON events + WHEN NEW.message_text IS NOT NULL + BEGIN + INSERT INTO events_fts(rowid, message_text) VALUES (NEW.id, NEW.message_text); + END + """) + + conn.execute(""" + CREATE TRIGGER IF NOT EXISTS events_fts_delete AFTER DELETE ON events + WHEN OLD.message_text IS NOT NULL + BEGIN + INSERT INTO events_fts(events_fts, rowid, message_text) + VALUES ('delete', OLD.id, OLD.message_text); + END + """) + + conn.execute(""" + CREATE TRIGGER IF NOT EXISTS events_fts_update AFTER UPDATE OF message_text ON events + WHEN OLD.message_text IS NOT NULL OR NEW.message_text IS NOT NULL + BEGIN + INSERT INTO events_fts(events_fts, rowid, message_text) + SELECT 'delete', OLD.id, OLD.message_text WHERE OLD.message_text IS NOT NULL; + INSERT INTO events_fts(rowid, message_text) + SELECT NEW.id, NEW.message_text WHERE NEW.message_text IS NOT NULL; + END + """) + + # Add partial index for efficient message_text queries + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_events_has_message_text + ON events(id) WHERE message_text IS NOT NULL + """) + + class SQLiteStorage: """SQLite-backed storage for session analytics.""" @@ -468,8 +554,10 @@ def executemany(self, sql: str, params_seq: list[tuple]) -> int: def _get_schema_version(self, conn: sqlite3.Connection) -> int: """Get current schema version from database.""" try: - row = conn.execute("SELECT version FROM schema_version LIMIT 1").fetchone() - return row[0] if row else 0 + row = conn.execute( + "SELECT MAX(version) FROM schema_version" + ).fetchone() + return row[0] if row and row[0] else 0 except sqlite3.OperationalError: # Table doesn't exist yet return 0 @@ -530,6 +618,7 @@ def _init_db(self): -- RFC #17 Phase 1 additions user_message_text TEXT, + message_text TEXT, exit_code INTEGER, -- RFC #41: Agent tracking and token deduplication @@ -632,50 +721,6 @@ def _init_db(self): "ON session_commits(commit_sha)" ) - # FTS5 full-text search on user_message_text (RFC #17 Phase 1) - conn.execute(""" - CREATE VIRTUAL TABLE IF NOT EXISTS events_fts USING fts5( - user_message_text, - content='events', - content_rowid='id' - ) - """) - - # Triggers to keep FTS in sync with events table - conn.execute(""" - CREATE TRIGGER IF NOT EXISTS events_fts_insert AFTER INSERT ON events - WHEN NEW.user_message_text IS NOT NULL - BEGIN - INSERT INTO events_fts(rowid, user_message_text) VALUES (NEW.id, NEW.user_message_text); - END - """) - - conn.execute(""" - CREATE TRIGGER IF NOT EXISTS events_fts_delete AFTER DELETE ON events - WHEN OLD.user_message_text IS NOT NULL - BEGIN - INSERT INTO events_fts(events_fts, rowid, user_message_text) - VALUES ('delete', OLD.id, OLD.user_message_text); - END - """) - - conn.execute(""" - CREATE TRIGGER IF NOT EXISTS events_fts_update AFTER UPDATE OF user_message_text ON events - WHEN OLD.user_message_text IS NOT NULL OR NEW.user_message_text IS NOT NULL - BEGIN - INSERT INTO events_fts(events_fts, rowid, user_message_text) - SELECT 'delete', OLD.id, OLD.user_message_text WHERE OLD.user_message_text IS NOT NULL; - INSERT INTO events_fts(rowid, user_message_text) - SELECT NEW.id, NEW.user_message_text WHERE NEW.user_message_text IS NOT NULL; - END - """) - - # Partial index for efficiently querying events with user messages - conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_events_has_user_message - ON events(id) WHERE user_message_text IS NOT NULL - """) - # Event-bus integration for cross-session insights (RFC #54) conn.execute(""" CREATE TABLE IF NOT EXISTS bus_events ( @@ -719,6 +764,51 @@ def _init_db(self): # Performance index for tool_use ↔ tool_result self-joins (migration v7) conn.execute("CREATE INDEX IF NOT EXISTS idx_events_tool_id ON events(tool_id)") + # FTS5 full-text search on message_text (Issue #68: unified text for all entry types) + # Run AFTER migrations so message_text column exists on both fresh and migrated DBs + conn.execute(""" + CREATE VIRTUAL TABLE IF NOT EXISTS events_fts USING fts5( + message_text, + content='events', + content_rowid='id' + ) + """) + + # Triggers to keep FTS in sync with events table + conn.execute(""" + CREATE TRIGGER IF NOT EXISTS events_fts_insert AFTER INSERT ON events + WHEN NEW.message_text IS NOT NULL + BEGIN + INSERT INTO events_fts(rowid, message_text) VALUES (NEW.id, NEW.message_text); + END + """) + + conn.execute(""" + CREATE TRIGGER IF NOT EXISTS events_fts_delete AFTER DELETE ON events + WHEN OLD.message_text IS NOT NULL + BEGIN + INSERT INTO events_fts(events_fts, rowid, message_text) + VALUES ('delete', OLD.id, OLD.message_text); + END + """) + + conn.execute(""" + CREATE TRIGGER IF NOT EXISTS events_fts_update AFTER UPDATE OF message_text ON events + WHEN OLD.message_text IS NOT NULL OR NEW.message_text IS NOT NULL + BEGIN + INSERT INTO events_fts(events_fts, rowid, message_text) + SELECT 'delete', OLD.id, OLD.message_text WHERE OLD.message_text IS NOT NULL; + INSERT INTO events_fts(rowid, message_text) + SELECT NEW.id, NEW.message_text WHERE NEW.message_text IS NOT NULL; + END + """) + + # Partial index for efficiently querying events with messages + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_events_has_message_text + ON events(id) WHERE message_text IS NOT NULL + """) + # Event operations def add_event(self, event: Event) -> Event: @@ -731,9 +821,9 @@ def add_event(self, event: Event) -> Event: tool_name, tool_input_json, tool_id, is_error, command, command_args, file_path, skill_name, input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens, model, - git_branch, cwd, user_message_text, exit_code, + git_branch, cwd, user_message_text, message_text, exit_code, parent_uuid, agent_id, is_sidechain, version - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( event.uuid, @@ -757,6 +847,7 @@ def add_event(self, event: Event) -> Event: event.git_branch, event.cwd, event.user_message_text, + event.message_text, event.exit_code, event.parent_uuid, event.agent_id, @@ -777,9 +868,9 @@ def add_events_batch(self, events: list[Event]) -> int: tool_name, tool_input_json, tool_id, is_error, command, command_args, file_path, skill_name, input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens, model, - git_branch, cwd, user_message_text, exit_code, + git_branch, cwd, user_message_text, message_text, exit_code, parent_uuid, agent_id, is_sidechain, version - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, [ ( @@ -804,6 +895,7 @@ def add_events_batch(self, events: list[Event]) -> int: e.git_branch, e.cwd, e.user_message_text, + e.message_text, e.exit_code, e.parent_uuid, e.agent_id, @@ -900,6 +992,7 @@ def get_col(name: str, default=None): git_branch=row["git_branch"], cwd=row["cwd"], user_message_text=get_col("user_message_text"), + message_text=get_col("message_text"), exit_code=get_col("exit_code"), # RFC #41: Agent tracking parent_uuid=get_col("parent_uuid"), @@ -1293,47 +1386,70 @@ def get_commits_for_sessions( # Full-text search operations - def search_user_messages( - self, query: str, limit: int = 100, project: str | None = None + def search_messages( + self, + query: str, + limit: int = 100, + project: str | None = None, + entry_types: list[str] | None = None, ) -> list[Event]: - """Search user messages using full-text search. + """Search messages using full-text search. + + Searches across all message types (user, assistant, tool_result, summary) + stored in the message_text column. Args: query: FTS5 query string (supports AND, OR, NOT, phrases, etc.) limit: Maximum number of results project: Optional project path filter (LIKE %project%) + entry_types: Optional list of entry types to filter (e.g., ["user", "assistant"]) Returns: List of Event objects matching the search query """ with self._connect() as conn: - # Use FTS5 MATCH to search, join back to events for full data + # Build base query with optional filters + params: list = [query] + filters = [] + if project: - rows = conn.execute( - """ - SELECT events.* FROM events - INNER JOIN events_fts ON events.id = events_fts.rowid - WHERE events_fts MATCH ? - AND events.project_path LIKE ? - ORDER BY rank - LIMIT ? - """, - (query, f"%{project}%", limit), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT events.* FROM events - INNER JOIN events_fts ON events.id = events_fts.rowid - WHERE events_fts MATCH ? - ORDER BY rank - LIMIT ? - """, - (query, limit), - ).fetchall() + filters.append("events.project_path LIKE ?") + params.append(f"%{project}%") + + if entry_types: + placeholders = ",".join("?" * len(entry_types)) + filters.append(f"events.entry_type IN ({placeholders})") + params.extend(entry_types) + + filter_clause = "" + if filters: + filter_clause = "AND " + " AND ".join(filters) + + params.append(limit) + + rows = conn.execute( + f""" + SELECT events.* FROM events + INNER JOIN events_fts ON events.id = events_fts.rowid + WHERE events_fts MATCH ? + {filter_clause} + ORDER BY rank + LIMIT ? + """, + tuple(params), + ).fetchall() return [self._row_to_event(row) for row in rows] + def search_user_messages( + self, query: str, limit: int = 100, project: str | None = None + ) -> list[Event]: + """Search user messages using full-text search. + + Deprecated: Use search_messages() instead. + """ + return self.search_messages(query, limit, project, entry_types=["user"]) + # Utility operations def get_db_stats(self) -> dict: diff --git a/tests/conftest.py b/tests/conftest.py index 9fb8fbd..4dc5255 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -124,7 +124,7 @@ def populated_storage(storage): session_id="session-1", project_path="-test-project", entry_type="user", - user_message_text="Fix the authentication bug in the login flow", + message_text="Fix the authentication bug in the login flow", ), Event( id=None, @@ -133,7 +133,7 @@ def populated_storage(storage): session_id="session-1", project_path="-test-project", entry_type="user", - user_message_text="Add unit tests for the API endpoints", + message_text="Add unit tests for the API endpoints", ), ] storage.add_events_batch(events) diff --git a/tests/test_queries.py b/tests/test_queries.py index f1cf7bd..7b06059 100644 --- a/tests/test_queries.py +++ b/tests/test_queries.py @@ -217,7 +217,7 @@ def test_basic_journey(self, storage): session_id="s1", project_path="project-a", entry_type="user", - user_message_text="Start working on feature", + message_text="Start working on feature", ), Event( id=None, @@ -226,7 +226,7 @@ def test_basic_journey(self, storage): session_id="s2", project_path="project-b", entry_type="user", - user_message_text="Fix bug in other project", + message_text="Fix bug in other project", ), ] storage.add_events_batch(events) @@ -249,7 +249,7 @@ def test_journey_excludes_tool_events(self, storage): timestamp=now - timedelta(hours=1), session_id="s1", entry_type="user", - user_message_text="User message", + message_text="User message", ), Event( id=None, @@ -281,7 +281,7 @@ def test_journey_with_session_id_filter(self, storage): session_id="session-target", project_path="project-a", entry_type="user", - user_message_text="Message from target session", + message_text="Message from target session", ) ) storage.add_event( @@ -292,7 +292,7 @@ def test_journey_with_session_id_filter(self, storage): session_id="session-other", project_path="project-a", entry_type="user", - user_message_text="Message from other session", + message_text="Message from other session", ) ) @@ -303,6 +303,119 @@ def test_journey_with_session_id_filter(self, storage): assert result["message_count"] == 1 assert result["journey"][0]["session_id"] == "session-target" + def test_journey_includes_assistant_messages(self, storage): + """Test that journey includes assistant messages by default.""" + from session_analytics.queries import get_user_journey + + now = datetime.now() + events = [ + Event( + id=None, + uuid="u1", + timestamp=now - timedelta(hours=1), + session_id="s1", + entry_type="user", + message_text="User question", + ), + Event( + id=None, + uuid="a1", + timestamp=now - timedelta(minutes=59), + session_id="s1", + entry_type="assistant", + message_text="Assistant response", + ), + ] + storage.add_events_batch(events) + + result = get_user_journey(storage, hours=24) + + # Default should include both user and assistant + assert result["message_count"] == 2 + assert result["entry_types"] == ["user", "assistant"] + types = [e["type"] for e in result["journey"]] + assert "user" in types + assert "assistant" in types + + def test_journey_custom_entry_types(self, storage): + """Test filtering by custom entry_types.""" + from session_analytics.queries import get_user_journey + + now = datetime.now() + events = [ + Event( + id=None, + uuid="u1", + timestamp=now - timedelta(hours=1), + session_id="s1", + entry_type="user", + message_text="User message", + ), + Event( + id=None, + uuid="a1", + timestamp=now - timedelta(minutes=50), + session_id="s1", + entry_type="assistant", + message_text="Assistant message", + ), + Event( + id=None, + uuid="tr1", + timestamp=now - timedelta(minutes=40), + session_id="s1", + entry_type="tool_result", + message_text="Tool output content", + ), + ] + storage.add_events_batch(events) + + # Only user messages + result = get_user_journey(storage, hours=24, entry_types=["user"]) + assert result["message_count"] == 1 + assert result["journey"][0]["type"] == "user" + + # Only tool_result + result = get_user_journey(storage, hours=24, entry_types=["tool_result"]) + assert result["message_count"] == 1 + assert result["journey"][0]["type"] == "tool_result" + + # All three + result = get_user_journey( + storage, hours=24, entry_types=["user", "assistant", "tool_result"] + ) + assert result["message_count"] == 3 + + def test_journey_max_message_length_truncation(self, storage): + """Test message truncation with max_message_length.""" + from session_analytics.queries import get_user_journey + + now = datetime.now() + long_message = "A" * 1000 + events = [ + Event( + id=None, + uuid="u1", + timestamp=now - timedelta(hours=1), + session_id="s1", + entry_type="user", + message_text=long_message, + ), + ] + storage.add_events_batch(events) + + # Default truncation (500) + result = get_user_journey(storage, hours=24) + assert len(result["journey"][0]["message"]) == 500 + + # Custom truncation + result = get_user_journey(storage, hours=24, max_message_length=100) + assert len(result["journey"][0]["message"]) == 100 + + # No truncation (0) + result = get_user_journey(storage, hours=24, max_message_length=0) + assert len(result["journey"][0]["message"]) == 1000 + class TestDetectParallelSessions: """Tests for detect_parallel_sessions function.""" @@ -623,7 +736,7 @@ def test_returns_session_info(self, storage): session_id="test-session", project_path="/test/project", entry_type="user", - user_message_text="Hello, let's start", + message_text="Hello, let's start", ), Event( id=None, @@ -667,7 +780,7 @@ def test_returns_recent_messages(self, storage): timestamp=now - timedelta(hours=1), session_id="msg-session", entry_type="user", - user_message_text="First message", + message_text="First message", ), Event( id=None, @@ -675,7 +788,7 @@ def test_returns_recent_messages(self, storage): timestamp=now - timedelta(minutes=30), session_id="msg-session", entry_type="user", - user_message_text="Second message", + message_text="Second message", ), ] storage.add_events_batch(events) @@ -1185,7 +1298,7 @@ def test_journey_without_projects(self, storage): session_id="s1", project_path="project-a", entry_type="user", - user_message_text="First message", + message_text="First message", ), Event( id=None, @@ -1194,7 +1307,7 @@ def test_journey_without_projects(self, storage): session_id="s2", project_path="project-b", entry_type="user", - user_message_text="Second message", + message_text="Second message", ), ] storage.add_events_batch(events) diff --git a/tests/test_storage.py b/tests/test_storage.py index 18e80c5..b792760 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -435,14 +435,14 @@ def test_event_with_user_message_text(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="Hello, please help me with something", + message_text="Hello, please help me with something", ) stored = storage.add_event(event) assert stored.id is not None events = storage.get_events_in_range() assert len(events) == 1 - assert events[0].user_message_text == "Hello, please help me with something" + assert events[0].message_text == "Hello, please help me with something" def test_event_with_exit_code(self, storage): """Test storing and retrieving exit_code.""" @@ -469,13 +469,13 @@ def test_event_with_all_new_fields(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="Run a command", + message_text="Run a command", exit_code=0, ) storage.add_event(event) events = storage.get_events_in_range() - assert events[0].user_message_text == "Run a command" + assert events[0].message_text == "Run a command" assert events[0].exit_code == 0 def test_event_with_null_new_fields(self, storage): @@ -492,7 +492,7 @@ def test_event_with_null_new_fields(self, storage): events = storage.get_events_in_range() assert len(events) == 1 - assert events[0].user_message_text is None + assert events[0].message_text is None assert events[0].exit_code is None @@ -509,7 +509,7 @@ def test_search_user_messages_basic(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="Help me debug the authentication error", + message_text="Help me debug the authentication error", ) ) storage.add_event( @@ -519,7 +519,7 @@ def test_search_user_messages_basic(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="Fix the database connection issue", + message_text="Fix the database connection issue", ) ) storage.add_event( @@ -529,19 +529,19 @@ def test_search_user_messages_basic(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="Another error message to debug", + message_text="Another error message to debug", ) ) # Search for "debug" results = storage.search_user_messages("debug") assert len(results) == 2 - assert all("debug" in r.user_message_text.lower() for r in results) + assert all("debug" in r.message_text.lower() for r in results) # Search for "authentication" results = storage.search_user_messages("authentication") assert len(results) == 1 - assert "authentication" in results[0].user_message_text.lower() + assert "authentication" in results[0].message_text.lower() def test_search_user_messages_no_match(self, storage): """Test search returns empty when no matches found.""" @@ -552,7 +552,7 @@ def test_search_user_messages_no_match(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="This is a test message", + message_text="This is a test message", ) ) @@ -568,7 +568,7 @@ def test_search_user_messages_phrase(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="Run the unit tests", + message_text="Run the unit tests", ) ) storage.add_event( @@ -578,14 +578,89 @@ def test_search_user_messages_phrase(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="Unit testing is important", + message_text="Unit testing is important", ) ) # Search for phrase "unit tests" results = storage.search_user_messages('"unit tests"') assert len(results) == 1 - assert "unit tests" in results[0].user_message_text.lower() + assert "unit tests" in results[0].message_text.lower() + + def test_search_messages_all_types(self, storage): + """Test search_messages finds content in all message types.""" + # Add events of different types with searchable text + storage.add_event( + Event( + id=None, + uuid="user-msg", + timestamp=datetime.now(), + session_id="session-1", + entry_type="user", + message_text="User asks about authentication", + ) + ) + storage.add_event( + Event( + id=None, + uuid="assistant-msg", + timestamp=datetime.now(), + session_id="session-1", + entry_type="assistant", + message_text="Assistant explains authentication flow", + ) + ) + storage.add_event( + Event( + id=None, + uuid="tool-result", + timestamp=datetime.now(), + session_id="session-1", + entry_type="tool_result", + message_text="Tool output: authentication successful", + ) + ) + + # Search all message types (default) + results = storage.search_messages("authentication") + assert len(results) == 3 + + def test_search_messages_with_entry_types_filter(self, storage): + """Test search_messages with entry_types filter.""" + storage.add_event( + Event( + id=None, + uuid="user-search", + timestamp=datetime.now(), + session_id="session-1", + entry_type="user", + message_text="User database query", + ) + ) + storage.add_event( + Event( + id=None, + uuid="assistant-search", + timestamp=datetime.now(), + session_id="session-1", + entry_type="assistant", + message_text="Assistant database response", + ) + ) + + # Only user messages + results = storage.search_messages("database", entry_types=["user"]) + assert len(results) == 1 + assert results[0].entry_type == "user" + + # Only assistant messages + results = storage.search_messages("database", entry_types=["assistant"]) + assert len(results) == 1 + assert results[0].entry_type == "assistant" + + # Both + results = storage.search_messages("database", entry_types=["user", "assistant"]) + assert len(results) == 2 class TestFTSTriggers: @@ -600,7 +675,7 @@ def test_fts_trigger_on_insert(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="searchable insert content", + message_text="searchable insert content", ) ) @@ -619,7 +694,7 @@ def test_fts_trigger_on_update_null_to_value(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text=None, + message_text=None, ) ) @@ -629,7 +704,7 @@ def test_fts_trigger_on_update_null_to_value(self, storage): # Update to add user_message_text storage.execute_write( - "UPDATE events SET user_message_text = ? WHERE uuid = ?", + "UPDATE events SET message_text = ? WHERE uuid = ?", ("updated content here", "update-null-test"), ) @@ -647,7 +722,7 @@ def test_fts_trigger_on_update_value_to_different(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="original searchterm", + message_text="original searchterm", ) ) @@ -657,7 +732,7 @@ def test_fts_trigger_on_update_value_to_different(self, storage): # Update to different value storage.execute_write( - "UPDATE events SET user_message_text = ? WHERE uuid = ?", + "UPDATE events SET message_text = ? WHERE uuid = ?", ("replacement searchterm", "update-value-test"), ) @@ -679,7 +754,7 @@ def test_fts_trigger_on_update_value_to_null(self, storage): timestamp=datetime.now(), session_id="session-1", entry_type="user", - user_message_text="removable content", + message_text="removable content", ) ) @@ -689,7 +764,7 @@ def test_fts_trigger_on_update_value_to_null(self, storage): # Update to NULL storage.execute_write( - "UPDATE events SET user_message_text = NULL WHERE uuid = ?", + "UPDATE events SET message_text = NULL WHERE uuid = ?", ("update-to-null-test",), ) From 368ab8f2cde7c8e72b643b25f5fef8985234d42c Mon Sep 17 00:00:00 2001 From: Evan Senter Date: Sat, 10 Jan 2026 01:44:14 +0000 Subject: [PATCH 2/3] style: Fix formatting in cli.py and storage.py Co-Authored-By: Claude Opus 4.5 --- src/session_analytics/cli.py | 4 +--- src/session_analytics/storage.py | 8 ++++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/session_analytics/cli.py b/src/session_analytics/cli.py index da4e5b4..9b9f432 100644 --- a/src/session_analytics/cli.py +++ b/src/session_analytics/cli.py @@ -1338,9 +1338,7 @@ def main(): sub.add_argument("query", help="FTS5 query (e.g., 'auth', '\"fix bug\"', 'skip OR defer')") sub.add_argument("--limit", type=int, default=50, help="Max results (default: 50)") sub.add_argument("--project", help="Project path filter") - sub.add_argument( - "--entry-types", help="Entry types to search, comma-separated (default: all)" - ) + sub.add_argument("--entry-types", help="Entry types to search, comma-separated (default: all)") sub.set_defaults(func=cmd_search) # parallel diff --git a/src/session_analytics/storage.py b/src/session_analytics/storage.py index 19b1abc..93fae89 100644 --- a/src/session_analytics/storage.py +++ b/src/session_analytics/storage.py @@ -66,7 +66,9 @@ class Event: # RFC #17 Phase 1 additions user_message_text: str | None = None # For user journey tracking (deprecated, use message_text) - message_text: str | None = None # Unified text content for all entry types (user/assistant/tool_result/summary) + message_text: str | None = ( + None # Unified text content for all entry types (user/assistant/tool_result/summary) + ) # TODO(Phase 4): exit_code is not currently available in Claude Code JSONL format. # The toolUseResult has stdout/stderr/interrupted but no exit code. # This field is reserved for future extraction when format changes or @@ -554,9 +556,7 @@ def executemany(self, sql: str, params_seq: list[tuple]) -> int: def _get_schema_version(self, conn: sqlite3.Connection) -> int: """Get current schema version from database.""" try: - row = conn.execute( - "SELECT MAX(version) FROM schema_version" - ).fetchone() + row = conn.execute("SELECT MAX(version) FROM schema_version").fetchone() return row[0] if row and row[0] else 0 except sqlite3.OperationalError: # Table doesn't exist yet From 96529fc1903539eea2ba6b06fd76f0889f76b485 Mon Sep 17 00:00:00 2001 From: Evan Senter Date: Sat, 10 Jan 2026 02:00:33 +0000 Subject: [PATCH 3/3] Address reviewer feedback: tests, cleanup, docs - Add 21 unit tests for extract_text_from_content() and extract_tool_result_content() covering edge cases (empty lists, mixed content, image placeholders, etc.) - Remove unused MESSAGE_TEXT_MAX_LENGTH constant from ingest.py - Clarify entry_types defaults in guide.md: get_session_messages defaults to ["user", "assistant"], search_messages defaults to all types (no filter) Co-Authored-By: Claude Opus 4.5 --- src/session_analytics/guide.md | 4 +- src/session_analytics/ingest.py | 4 - tests/test_ingest.py | 145 ++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 5 deletions(-) diff --git a/src/session_analytics/guide.md b/src/session_analytics/guide.md index 8d6051e..56e4a53 100644 --- a/src/session_analytics/guide.md +++ b/src/session_analytics/guide.md @@ -81,7 +81,9 @@ Each session includes `classification_factors` explaining WHY it was categorized | `get_session_messages(days?, session_id?, entry_types?, max_message_length?)` | Messages across sessions chronologically (user + assistant by default) | | `search_messages(query, limit?, entry_types?)` | Full-text search across all message types (FTS5) | -**entry_types**: Filter by `["user"]`, `["assistant"]`, `["tool_result"]`, `["summary"]` or any combination. Default: `["user", "assistant"]`. +**entry_types**: Filter by `["user"]`, `["assistant"]`, `["tool_result"]`, `["summary"]` or any combination. +- `get_session_messages`: Default: `["user", "assistant"]` (conversational context) +- `search_messages`: Default: all types (no filter) for comprehensive search **max_message_length**: Truncate messages (default: 500, 0=no limit). diff --git a/src/session_analytics/ingest.py b/src/session_analytics/ingest.py index 46eed30..7d502ec 100644 --- a/src/session_analytics/ingest.py +++ b/src/session_analytics/ingest.py @@ -19,10 +19,6 @@ # Maximum length for user message text to prevent DB bloat while preserving context USER_MESSAGE_MAX_LENGTH = 2000 -# No limit for message_text - user requested full content including tool results -# Set to None to indicate no truncation -MESSAGE_TEXT_MAX_LENGTH = None - def extract_text_from_content(content) -> str | None: """Extract text content from various message content formats. diff --git a/tests/test_ingest.py b/tests/test_ingest.py index 6d85364..2670a0d 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -8,6 +8,8 @@ from session_analytics.ingest import ( extract_command_name, + extract_text_from_content, + extract_tool_result_content, find_log_files, ingest_file, parse_entry, @@ -17,6 +19,149 @@ # Uses fixtures from conftest.py: storage +class TestExtractTextFromContent: + """Tests for extract_text_from_content() edge cases.""" + + def test_none_returns_none(self): + """None content returns None.""" + assert extract_text_from_content(None) is None + + def test_empty_string_returns_none(self): + """Empty string returns None.""" + assert extract_text_from_content("") is None + + def test_plain_string_returns_string(self): + """Plain string content is returned as-is.""" + assert extract_text_from_content("Hello world") == "Hello world" + + def test_list_of_strings(self): + """List of strings is concatenated with spaces.""" + result = extract_text_from_content(["Hello", "world"]) + assert result == "Hello world" + + def test_list_of_text_blocks(self): + """List of text blocks extracts and concatenates text.""" + content = [ + {"type": "text", "text": "First part"}, + {"type": "text", "text": "Second part"}, + ] + result = extract_text_from_content(content) + assert result == "First part Second part" + + def test_mixed_string_and_dict_content(self): + """Mixed strings and dicts are handled correctly.""" + content = [ + "Plain string", + {"type": "text", "text": "Text block"}, + ] + result = extract_text_from_content(content) + assert result == "Plain string Text block" + + def test_skips_tool_use_blocks(self): + """Tool use blocks are skipped (not extracted as text).""" + content = [ + {"type": "text", "text": "Before"}, + {"type": "tool_use", "id": "t1", "name": "Bash", "input": {}}, + {"type": "text", "text": "After"}, + ] + result = extract_text_from_content(content) + assert result == "Before After" + + def test_skips_tool_result_blocks(self): + """Tool result blocks are skipped (handled separately).""" + content = [ + {"type": "text", "text": "Message"}, + {"type": "tool_result", "tool_use_id": "t1", "content": "output"}, + ] + result = extract_text_from_content(content) + assert result == "Message" + + def test_empty_list_returns_none(self): + """Empty list returns None.""" + assert extract_text_from_content([]) is None + + def test_list_with_only_tool_blocks_returns_none(self): + """List containing only tool blocks returns None.""" + content = [ + {"type": "tool_use", "id": "t1", "name": "Bash", "input": {}}, + {"type": "tool_result", "tool_use_id": "t1", "content": "output"}, + ] + assert extract_text_from_content(content) is None + + def test_text_block_with_empty_text(self): + """Text block with empty text is included but results in empty string.""" + content = [{"type": "text", "text": ""}] + # Empty string from join of [""] is "" which is falsy + # So it returns the joined string "" which is then caught by "if text_parts:" + result = extract_text_from_content(content) + assert result == "" + + +class TestExtractToolResultContent: + """Tests for extract_tool_result_content() edge cases.""" + + def test_none_content_returns_none(self): + """Tool result with no content returns None.""" + assert extract_tool_result_content({}) is None + assert extract_tool_result_content({"content": None}) is None + + def test_empty_string_content_returns_none(self): + """Tool result with empty string content returns None.""" + assert extract_tool_result_content({"content": ""}) is None + + def test_string_content(self): + """Tool result with string content returns the string.""" + result = extract_tool_result_content({"content": "Command output"}) + assert result == "Command output" + + def test_list_of_strings(self): + """Tool result with list of strings concatenates with newlines.""" + result = extract_tool_result_content({"content": ["Line 1", "Line 2"]}) + assert result == "Line 1\nLine 2" + + def test_list_of_text_blocks(self): + """Tool result with text blocks extracts text.""" + content = [ + {"type": "text", "text": "First line"}, + {"type": "text", "text": "Second line"}, + ] + result = extract_tool_result_content({"content": content}) + assert result == "First line\nSecond line" + + def test_image_content_placeholder(self): + """Image content is replaced with [image] placeholder.""" + content = [{"type": "image", "source": {"type": "base64", "data": "..."}}] + result = extract_tool_result_content({"content": content}) + assert result == "[image]" + + def test_mixed_text_and_image(self): + """Mixed text and image content is handled correctly.""" + content = [ + {"type": "text", "text": "Screenshot taken:"}, + {"type": "image", "source": {"type": "base64", "data": "..."}}, + ] + result = extract_tool_result_content({"content": content}) + assert result == "Screenshot taken:\n[image]" + + def test_empty_list_returns_none(self): + """Tool result with empty list content returns None.""" + assert extract_tool_result_content({"content": []}) is None + + def test_list_with_unsupported_types_skipped(self): + """Unsupported content types in list are skipped.""" + content = [ + {"type": "text", "text": "Valid text"}, + {"type": "unknown", "data": "mystery"}, + ] + result = extract_tool_result_content({"content": content}) + assert result == "Valid text" + + def test_nested_structure_not_extracted(self): + """Non-list, non-string content returns None.""" + # dict content that isn't handled + assert extract_tool_result_content({"content": {"nested": "data"}}) is None + + @pytest.fixture def sample_logs_dir(): """Create a temporary directory with sample JSONL files."""