From 595c660fd140cb15b97a644450a785d5811487f7 Mon Sep 17 00:00:00 2001 From: Ed Dowding Date: Tue, 31 Mar 2026 18:15:47 -0500 Subject: [PATCH 01/65] Add WhatsApp import from decrypted backup (#160) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Import WhatsApp messages from decrypted msgstore.db backups into msgvault. Reads contacts, messages, and group metadata from the WhatsApp SQLite database and maps them into the existing msgvault schema with message_type='whatsapp'. Includes: - WhatsApp SQLite queries for messages, contacts, group metadata - Contact resolution (phone → name) with WhatsApp contact DB support - Conversation/thread mapping for 1:1 and group chats - TUI and query engine updates for multi-source message types - Schema migrations for phone_number, message_type, conversation title Co-Authored-By: Ed Dowding --- cmd/msgvault/cmd/build_cache.go | 55 +- cmd/msgvault/cmd/build_cache_test.go | 34 +- cmd/msgvault/cmd/export_attachments.go | 4 + cmd/msgvault/cmd/export_eml.go | 4 + cmd/msgvault/cmd/import.go | 237 ++++++ cmd/msgvault/cmd/list_domains.go | 4 + cmd/msgvault/cmd/list_labels.go | 4 + cmd/msgvault/cmd/list_senders.go | 4 + cmd/msgvault/cmd/repair_encoding.go | 4 + cmd/msgvault/cmd/show_message.go | 4 + cmd/msgvault/cmd/update_account.go | 4 + cmd/msgvault/cmd/verify.go | 4 + .../2026-02-17-multi-source-messaging.md | 164 ++++ internal/mcp/handlers.go | 7 +- internal/query/duckdb.go | 432 +++++++++-- internal/query/duckdb_test.go | 150 +++- internal/query/models.go | 5 +- internal/query/sqlite.go | 87 ++- internal/query/testfixtures_test.go | 53 +- internal/store/messages.go | 168 ++++ internal/store/store.go | 8 + internal/textutil/encoding.go | 82 ++ internal/textutil/encoding_test.go | 34 + internal/tui/view.go | 38 +- internal/whatsapp/contacts.go | 243 ++++++ internal/whatsapp/contacts_test.go | 241 ++++++ internal/whatsapp/importer.go | 720 ++++++++++++++++++ internal/whatsapp/mapping.go | 215 
++++++ internal/whatsapp/mapping_test.go | 347 +++++++++ internal/whatsapp/queries.go | 365 +++++++++ internal/whatsapp/queries_test.go | 89 +++ internal/whatsapp/types.go | 155 ++++ 32 files changed, 3793 insertions(+), 172 deletions(-) create mode 100644 cmd/msgvault/cmd/import.go create mode 100644 docs/plans/2026-02-17-multi-source-messaging.md create mode 100644 internal/whatsapp/contacts.go create mode 100644 internal/whatsapp/contacts_test.go create mode 100644 internal/whatsapp/importer.go create mode 100644 internal/whatsapp/mapping.go create mode 100644 internal/whatsapp/mapping_test.go create mode 100644 internal/whatsapp/queries.go create mode 100644 internal/whatsapp/queries_test.go create mode 100644 internal/whatsapp/types.go diff --git a/cmd/msgvault/cmd/build_cache.go b/cmd/msgvault/cmd/build_cache.go index 49f76703..7cd6c952 100644 --- a/cmd/msgvault/cmd/build_cache.go +++ b/cmd/msgvault/cmd/build_cache.go @@ -17,6 +17,7 @@ import ( "github.com/spf13/cobra" "github.com/wesm/msgvault/internal/config" "github.com/wesm/msgvault/internal/query" + "github.com/wesm/msgvault/internal/store" ) var fullRebuild bool @@ -27,10 +28,17 @@ var fullRebuild bool // files (_last_sync.json, parquet directories) can corrupt the cache. var buildCacheMu sync.Mutex +// cacheSchemaVersion tracks the Parquet schema layout. Bump this whenever +// columns are added/removed/renamed in the COPY queries below so that +// incremental builds automatically trigger a full rebuild instead of +// producing Parquet files with mismatched schemas. +const cacheSchemaVersion = 4 // v4: add source_type to sources Parquet; strip \r\n in SanitizeTerminal + // syncState tracks the message and sync-run watermarks covered by the cache. 
type syncState struct { LastMessageID int64 `json:"last_message_id"` LastSyncAt time.Time `json:"last_sync_at"` + SchemaVersion int `json:"schema_version,omitempty"` LastCompletedSyncRunID int64 `json:"last_completed_sync_run_id,omitempty"` } @@ -63,6 +71,20 @@ Use --full-rebuild to recreate all cache files from scratch.`, return fmt.Errorf("database not found: %s\nRun 'msgvault init-db' first", dbPath) } + // Ensure schema is up to date before building cache. + // Legacy databases may be missing columns (e.g. attachment_count, + // sender_id, message_type, phone_number) that the export queries + // reference. Running migrations first adds them. + s, err := store.Open(dbPath) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + if err := s.InitSchema(); err != nil { + _ = s.Close() + return fmt.Errorf("init schema: %w", err) + } + _ = s.Close() + result, err := buildCache(dbPath, analyticsDir, fullRebuild) if err != nil { return err @@ -102,8 +124,16 @@ func buildCache(dbPath, analyticsDir string, fullRebuild bool) (*buildResult, er if data, err := os.ReadFile(stateFile); err == nil { var state syncState if json.Unmarshal(data, &state) == nil { - lastMessageID = state.LastMessageID - fmt.Printf("Incremental export from message_id > %d\n", lastMessageID) + if state.SchemaVersion != cacheSchemaVersion { + // Schema has changed — force a full rebuild. + fmt.Printf("Cache schema version mismatch (have v%d, need v%d). 
Forcing full rebuild.\n", + state.SchemaVersion, cacheSchemaVersion) + fullRebuild = true + lastMessageID = 0 + } else { + lastMessageID = state.LastMessageID + fmt.Printf("Incremental export from message_id > %d\n", lastMessageID) + } } } } @@ -264,7 +294,10 @@ func buildCache(dbPath, analyticsDir string, fullRebuild bool) (*buildResult, er m.sent_at, m.size_estimate, m.has_attachments, + COALESCE(TRY_CAST(m.attachment_count AS INTEGER), 0) as attachment_count, m.deleted_from_source_at, + m.sender_id, + COALESCE(TRY_CAST(m.message_type AS VARCHAR), '') as message_type, CAST(EXTRACT(YEAR FROM m.sent_at) AS INTEGER) as year, CAST(EXTRACT(MONTH FROM m.sent_at) AS INTEGER) as month FROM sqlite_db.messages m @@ -354,7 +387,8 @@ func buildCache(dbPath, analyticsDir string, fullRebuild bool) (*buildResult, er id, COALESCE(TRY_CAST(email_address AS VARCHAR), '') as email_address, COALESCE(TRY_CAST(domain AS VARCHAR), '') as domain, - COALESCE(TRY_CAST(display_name AS VARCHAR), '') as display_name + COALESCE(TRY_CAST(display_name AS VARCHAR), '') as display_name, + COALESCE(TRY_CAST(phone_number AS VARCHAR), '') as phone_number FROM sqlite_db.participants ) TO '%s/participants.parquet' ( FORMAT PARQUET, @@ -388,7 +422,8 @@ func buildCache(dbPath, analyticsDir string, fullRebuild bool) (*buildResult, er COPY ( SELECT id, - identifier as account_email + identifier as account_email, + COALESCE(TRY_CAST(source_type AS VARCHAR), 'gmail') as source_type FROM sqlite_db.sources ) TO '%s/sources.parquet' ( FORMAT PARQUET, @@ -405,7 +440,8 @@ func buildCache(dbPath, analyticsDir string, fullRebuild bool) (*buildResult, er COPY ( SELECT id, - COALESCE(TRY_CAST(source_conversation_id AS VARCHAR), '') as source_conversation_id + COALESCE(TRY_CAST(source_conversation_id AS VARCHAR), '') as source_conversation_id, + COALESCE(TRY_CAST(title AS VARCHAR), '') as title FROM sqlite_db.conversations ) TO '%s/conversations.parquet' ( FORMAT PARQUET, @@ -438,6 +474,7 @@ func buildCache(dbPath, 
analyticsDir string, fullRebuild bool) (*buildResult, er state := syncState{ LastMessageID: maxID, LastSyncAt: cacheWatermark, + SchemaVersion: cacheSchemaVersion, LastCompletedSyncRunID: lastCompletedSyncRunID, } stateData, _ := json.Marshal(state) @@ -636,15 +673,15 @@ func setupSQLiteSource(duckDB *sql.DB, dbPath string) (cleanup func(), err error query string typeOverrides string // DuckDB types parameter for read_csv_auto (empty = infer all) }{ - {"messages", "SELECT id, source_id, source_message_id, conversation_id, subject, snippet, sent_at, size_estimate, has_attachments, deleted_from_source_at FROM messages WHERE sent_at IS NOT NULL", + {"messages", "SELECT id, source_id, source_message_id, conversation_id, subject, snippet, sent_at, size_estimate, has_attachments, attachment_count, deleted_from_source_at, sender_id, message_type FROM messages WHERE sent_at IS NOT NULL", "types={'sent_at': 'TIMESTAMP', 'deleted_from_source_at': 'TIMESTAMP'}"}, {"message_recipients", "SELECT message_id, participant_id, recipient_type, display_name FROM message_recipients", ""}, {"message_labels", "SELECT message_id, label_id FROM message_labels", ""}, {"attachments", "SELECT message_id, size, filename FROM attachments", ""}, - {"participants", "SELECT id, email_address, domain, display_name FROM participants", ""}, + {"participants", "SELECT id, email_address, domain, display_name, phone_number FROM participants", ""}, {"labels", "SELECT id, name FROM labels", ""}, - {"sources", "SELECT id, identifier FROM sources", ""}, - {"conversations", "SELECT id, source_conversation_id FROM conversations", ""}, + {"sources", "SELECT id, identifier, source_type FROM sources", ""}, + {"conversations", "SELECT id, source_conversation_id, title FROM conversations", ""}, } for _, t := range tables { diff --git a/cmd/msgvault/cmd/build_cache_test.go b/cmd/msgvault/cmd/build_cache_test.go index 2369f81c..71f82aa7 100644 --- a/cmd/msgvault/cmd/build_cache_test.go +++ 
b/cmd/msgvault/cmd/build_cache_test.go @@ -51,7 +51,10 @@ func setupTestSQLite(t *testing.T) (string, func()) { received_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN DEFAULT FALSE, + attachment_count INTEGER DEFAULT 0, deleted_from_source_at TIMESTAMP, + sender_id INTEGER, + message_type TEXT NOT NULL DEFAULT 'email', UNIQUE(source_id, source_message_id) ); @@ -59,7 +62,8 @@ func setupTestSQLite(t *testing.T) (string, func()) { id INTEGER PRIMARY KEY, email_address TEXT NOT NULL UNIQUE, domain TEXT, - display_name TEXT + display_name TEXT, + phone_number TEXT ); CREATE TABLE message_recipients ( @@ -1128,13 +1132,13 @@ func TestBuildCache_EmptyDatabase(t *testing.T) { db, _ := sql.Open("sqlite3", dbPath) _, _ = db.Exec(` CREATE TABLE sources (id INTEGER PRIMARY KEY, identifier TEXT); - CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP); - CREATE TABLE participants (id INTEGER PRIMARY KEY, email_address TEXT, domain TEXT, display_name TEXT); + CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP, attachment_count INTEGER DEFAULT 0, sender_id INTEGER, message_type TEXT NOT NULL DEFAULT 'email'); + CREATE TABLE participants (id INTEGER PRIMARY KEY, email_address TEXT, domain TEXT, display_name TEXT, phone_number TEXT); CREATE TABLE message_recipients (message_id INTEGER, participant_id INTEGER, recipient_type TEXT, display_name TEXT); CREATE TABLE labels (id INTEGER PRIMARY KEY, name TEXT); CREATE TABLE message_labels (message_id INTEGER, label_id INTEGER); CREATE TABLE attachments (message_id INTEGER, size INTEGER, filename TEXT); - CREATE TABLE conversations (id INTEGER 
PRIMARY KEY, source_conversation_id TEXT); + CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT, title TEXT); `) _ = db.Close() @@ -1328,13 +1332,13 @@ func BenchmarkBuildCache(b *testing.B) { // Create schema _, _ = db.Exec(` CREATE TABLE sources (id INTEGER PRIMARY KEY, identifier TEXT); - CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP); - CREATE TABLE participants (id INTEGER PRIMARY KEY, email_address TEXT UNIQUE, domain TEXT, display_name TEXT); + CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP, attachment_count INTEGER DEFAULT 0, sender_id INTEGER, message_type TEXT NOT NULL DEFAULT 'email'); + CREATE TABLE participants (id INTEGER PRIMARY KEY, email_address TEXT UNIQUE, domain TEXT, display_name TEXT, phone_number TEXT); CREATE TABLE message_recipients (message_id INTEGER, participant_id INTEGER, recipient_type TEXT, display_name TEXT); CREATE TABLE labels (id INTEGER PRIMARY KEY, name TEXT); CREATE TABLE message_labels (message_id INTEGER, label_id INTEGER); CREATE TABLE attachments (message_id INTEGER, size INTEGER, filename TEXT); - CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT); + CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT, title TEXT); INSERT INTO sources VALUES (1, 'test@gmail.com'); INSERT INTO labels VALUES (1, 'INBOX'), (2, 'Work'); `) @@ -1418,14 +1422,18 @@ func setupTestSQLiteEmpty(t *testing.T) (string, func()) { received_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN DEFAULT FALSE, + attachment_count INTEGER DEFAULT 0, deleted_from_source_at 
TIMESTAMP, + sender_id INTEGER, + message_type TEXT NOT NULL DEFAULT 'email', UNIQUE(source_id, source_message_id) ); CREATE TABLE participants ( id INTEGER PRIMARY KEY, email_address TEXT NOT NULL UNIQUE, domain TEXT, - display_name TEXT + display_name TEXT, + phone_number TEXT ); CREATE TABLE message_recipients ( id INTEGER PRIMARY KEY, @@ -1955,17 +1963,17 @@ func BenchmarkBuildCacheIncremental(b *testing.B) { // Create schema and initial data (10000 messages) _, _ = db.Exec(` CREATE TABLE sources (id INTEGER PRIMARY KEY, identifier TEXT); - CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP); - CREATE TABLE participants (id INTEGER PRIMARY KEY, email_address TEXT UNIQUE, domain TEXT, display_name TEXT); + CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP, attachment_count INTEGER DEFAULT 0, sender_id INTEGER, message_type TEXT NOT NULL DEFAULT 'email'); + CREATE TABLE participants (id INTEGER PRIMARY KEY, email_address TEXT UNIQUE, domain TEXT, display_name TEXT, phone_number TEXT); CREATE TABLE message_recipients (message_id INTEGER, participant_id INTEGER, recipient_type TEXT, display_name TEXT); CREATE TABLE labels (id INTEGER PRIMARY KEY, name TEXT); CREATE TABLE message_labels (message_id INTEGER, label_id INTEGER); CREATE TABLE attachments (message_id INTEGER, size INTEGER, filename TEXT); - CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT); + CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT, title TEXT); INSERT INTO sources VALUES (1, 'test@gmail.com'); INSERT INTO labels VALUES (1, 'INBOX'); - INSERT INTO 
participants VALUES (1, 'alice@example.com', 'example.com', 'Alice'); - INSERT INTO participants VALUES (2, 'bob@example.com', 'example.com', 'Bob'); + INSERT INTO participants VALUES (1, 'alice@example.com', 'example.com', 'Alice', NULL); + INSERT INTO participants VALUES (2, 'bob@example.com', 'example.com', 'Bob', NULL); `) // Insert conversations to match messages diff --git a/cmd/msgvault/cmd/export_attachments.go b/cmd/msgvault/cmd/export_attachments.go index 1482c9fc..e27ef57b 100644 --- a/cmd/msgvault/cmd/export_attachments.go +++ b/cmd/msgvault/cmd/export_attachments.go @@ -42,6 +42,10 @@ func runExportAttachments(cmd *cobra.Command, args []string) error { } defer func() { _ = s.Close() }() + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + engine := query.NewSQLiteEngine(s.DB()) // Resolve message ID — try numeric first, fallback to Gmail ID diff --git a/cmd/msgvault/cmd/export_eml.go b/cmd/msgvault/cmd/export_eml.go index a76ae43e..82b610f7 100644 --- a/cmd/msgvault/cmd/export_eml.go +++ b/cmd/msgvault/cmd/export_eml.go @@ -91,6 +91,10 @@ func runExportEML(cmd *cobra.Command, messageRef, outputPath string) error { } defer func() { _ = s.Close() }() + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + engine := query.NewSQLiteEngine(s.DB()) resolved, err := resolveMessage(engine, cmd, messageRef) diff --git a/cmd/msgvault/cmd/import.go b/cmd/msgvault/cmd/import.go new file mode 100644 index 00000000..3ba45d30 --- /dev/null +++ b/cmd/msgvault/cmd/import.go @@ -0,0 +1,237 @@ +package cmd + +import ( + "context" + "fmt" + "os" + "os/signal" + "strings" + "syscall" + "time" + + "github.com/spf13/cobra" + "github.com/wesm/msgvault/internal/store" + "github.com/wesm/msgvault/internal/textutil" + "github.com/wesm/msgvault/internal/whatsapp" +) + +var ( + importType string + importPhone string + importMediaDir string + importContacts string + importLimit int + importDisplayName string 
+) + +var importCmd = &cobra.Command{ + Use: "import [path]", + Short: "Import messages from external sources", + Long: `Import messages from external message databases. + +Currently supported types: + whatsapp Import from a decrypted WhatsApp msgstore.db + +Examples: + msgvault import --type whatsapp --phone "+447700900000" /path/to/msgstore.db + msgvault import --type whatsapp --phone "+447700900000" --contacts ~/contacts.vcf /path/to/msgstore.db + msgvault import --type whatsapp --phone "+447700900000" --media-dir /path/to/Media /path/to/msgstore.db`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := MustBeLocal("import"); err != nil { + return err + } + + sourcePath := args[0] + + // Validate source file exists. + if _, err := os.Stat(sourcePath); err != nil { + return fmt.Errorf("source file not found: %w", err) + } + + switch strings.ToLower(importType) { + case "whatsapp": + return runWhatsAppImport(cmd, sourcePath) + default: + return fmt.Errorf("unsupported import type %q (supported: whatsapp)", importType) + } + }, +} + +func runWhatsAppImport(cmd *cobra.Command, sourcePath string) error { + // Validate phone number. + if importPhone == "" { + return fmt.Errorf("--phone is required for WhatsApp import (E.164 format, e.g., +447700900000)") + } + if !strings.HasPrefix(importPhone, "+") { + return fmt.Errorf("phone number must be in E.164 format (starting with +), got %q", importPhone) + } + + // Validate media dir if provided. + if importMediaDir != "" { + if info, err := os.Stat(importMediaDir); err != nil || !info.IsDir() { + return fmt.Errorf("media directory not found or not a directory: %s", importMediaDir) + } + } + + // Open database. 
+ dbPath := cfg.DatabaseDSN() + s, err := store.Open(dbPath) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer func() { _ = s.Close() }() + + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + + // Set up context with cancellation. + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + // Handle Ctrl+C gracefully. + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-sigChan + fmt.Println("\nInterrupted. Saving checkpoint...") + cancel() + }() + + // Build import options. + opts := whatsapp.DefaultOptions() + opts.Phone = importPhone + opts.DisplayName = importDisplayName + opts.MediaDir = importMediaDir + opts.AttachmentsDir = cfg.AttachmentsDir() + opts.Limit = importLimit + + // Create importer with CLI progress. + progress := &ImportCLIProgress{} + importer := whatsapp.NewImporter(s, progress) + + fmt.Printf("Importing WhatsApp messages from %s\n", sourcePath) + fmt.Printf("Phone: %s\n", importPhone) + if importMediaDir != "" { + fmt.Printf("Media: %s\n", importMediaDir) + } + if importLimit > 0 { + fmt.Printf("Limit: %d messages\n", importLimit) + } + fmt.Println() + + summary, err := importer.Import(ctx, sourcePath, opts) + if err != nil { + if ctx.Err() != nil { + fmt.Println("\nImport interrupted. Run again to continue.") + return nil + } + return fmt.Errorf("import failed: %w", err) + } + + // Import contacts if provided. + if importContacts != "" { + fmt.Printf("\nImporting contacts from %s...\n", importContacts) + matched, total, err := whatsapp.ImportContacts(s, importContacts) + if err != nil { + return fmt.Errorf("contact import: %w", err) + } else { + fmt.Printf(" Contacts: %d in file, %d phone numbers matched to participants\n", total, matched) + } + } + + // Print summary. 
+ fmt.Println() + fmt.Println("Import complete!") + fmt.Printf(" Duration: %s\n", summary.Duration.Round(time.Second)) + fmt.Printf(" Chats: %d\n", summary.ChatsProcessed) + fmt.Printf(" Messages: %d processed, %d added, %d skipped\n", + summary.MessagesProcessed, summary.MessagesAdded, summary.MessagesSkipped) + fmt.Printf(" Participants: %d\n", summary.Participants) + fmt.Printf(" Reactions: %d\n", summary.ReactionsAdded) + fmt.Printf(" Attachments: %d found", summary.AttachmentsFound) + if summary.MediaCopied > 0 { + fmt.Printf(", %d files copied", summary.MediaCopied) + } + fmt.Println() + if summary.Errors > 0 { + fmt.Printf(" Errors: %d\n", summary.Errors) + } + + if summary.MessagesAdded > 0 { + rate := float64(summary.MessagesAdded) / summary.Duration.Seconds() + fmt.Printf(" Rate: %.0f messages/sec\n", rate) + } + + return nil +} + +// ImportCLIProgress implements whatsapp.ImportProgress for terminal output. +type ImportCLIProgress struct { + startTime time.Time + lastPrint time.Time + currentChat string +} + +func (p *ImportCLIProgress) OnStart() { + p.startTime = time.Now() + p.lastPrint = time.Now() +} + +func (p *ImportCLIProgress) OnChatStart(chatJID, chatTitle string, messageCount int) { + p.currentChat = chatTitle + // Don't print every chat start — too noisy for 13k+ chats. +} + +func (p *ImportCLIProgress) OnProgress(processed, added, skipped int64) { + // Throttle output to every 2 seconds. + if time.Since(p.lastPrint) < 2*time.Second { + return + } + p.lastPrint = time.Now() + + elapsed := time.Since(p.startTime) + rate := 0.0 + if elapsed.Seconds() >= 1 { + rate = float64(added) / elapsed.Seconds() + } + + elapsedStr := formatDuration(elapsed) + + chatStr := "" + if p.currentChat != "" { + // Truncate long chat names and sanitize to prevent terminal injection. + name := textutil.SanitizeTerminal(p.currentChat) + if len(name) > 30 { + name = name[:27] + "..." 
+ } + chatStr = fmt.Sprintf(" | Chat: %s", name) + } + + fmt.Printf("\r Processed: %d | Added: %d | Skipped: %d | Rate: %.0f/s | Elapsed: %s%s ", + processed, added, skipped, rate, elapsedStr, chatStr) +} + +func (p *ImportCLIProgress) OnChatComplete(chatJID string, messagesAdded int64) { + // Quiet — progress line shows the aggregate. +} + +func (p *ImportCLIProgress) OnComplete(summary *whatsapp.ImportSummary) { + fmt.Println() // Clear the progress line. +} + +func (p *ImportCLIProgress) OnError(err error) { + fmt.Printf("\nWarning: %s\n", textutil.SanitizeTerminal(err.Error())) +} + +func init() { + importCmd.Flags().StringVar(&importType, "type", "", "import source type (required: whatsapp)") + importCmd.Flags().StringVar(&importPhone, "phone", "", "your phone number in E.164 format (required for whatsapp)") + importCmd.Flags().StringVar(&importMediaDir, "media-dir", "", "path to decrypted Media folder (optional)") + importCmd.Flags().StringVar(&importContacts, "contacts", "", "path to contacts .vcf file for name resolution (optional)") + importCmd.Flags().IntVar(&importLimit, "limit", 0, "limit number of messages (for testing)") + importCmd.Flags().StringVar(&importDisplayName, "display-name", "", "display name for the phone owner") + _ = importCmd.MarkFlagRequired("type") + rootCmd.AddCommand(importCmd) +} diff --git a/cmd/msgvault/cmd/list_domains.go b/cmd/msgvault/cmd/list_domains.go index 71207f67..a45c30bf 100644 --- a/cmd/msgvault/cmd/list_domains.go +++ b/cmd/msgvault/cmd/list_domains.go @@ -34,6 +34,10 @@ Examples: } defer func() { _ = s.Close() }() + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + // Create query engine engine := query.NewSQLiteEngine(s.DB()) diff --git a/cmd/msgvault/cmd/list_labels.go b/cmd/msgvault/cmd/list_labels.go index 0c6e54fb..efebeeeb 100644 --- a/cmd/msgvault/cmd/list_labels.go +++ b/cmd/msgvault/cmd/list_labels.go @@ -34,6 +34,10 @@ Examples: } defer func() { _ = s.Close() }() + if 
err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + // Create query engine engine := query.NewSQLiteEngine(s.DB()) diff --git a/cmd/msgvault/cmd/list_senders.go b/cmd/msgvault/cmd/list_senders.go index 24f37bbd..e00b8154 100644 --- a/cmd/msgvault/cmd/list_senders.go +++ b/cmd/msgvault/cmd/list_senders.go @@ -34,6 +34,10 @@ Examples: } defer func() { _ = s.Close() }() + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + // Create query engine engine := query.NewSQLiteEngine(s.DB()) diff --git a/cmd/msgvault/cmd/repair_encoding.go b/cmd/msgvault/cmd/repair_encoding.go index 57119ce3..321afaa6 100644 --- a/cmd/msgvault/cmd/repair_encoding.go +++ b/cmd/msgvault/cmd/repair_encoding.go @@ -44,6 +44,10 @@ charset detection issues in the MIME parser.`, } defer func() { _ = s.Close() }() + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + return repairEncoding(s) }, } diff --git a/cmd/msgvault/cmd/show_message.go b/cmd/msgvault/cmd/show_message.go index 676a8f43..8b325f17 100644 --- a/cmd/msgvault/cmd/show_message.go +++ b/cmd/msgvault/cmd/show_message.go @@ -82,6 +82,10 @@ func showLocalMessage(cmd *cobra.Command, idStr string) error { } defer func() { _ = s.Close() }() + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + // Create query engine engine := query.NewSQLiteEngine(s.DB()) diff --git a/cmd/msgvault/cmd/update_account.go b/cmd/msgvault/cmd/update_account.go index 5416f2f1..08f2e645 100644 --- a/cmd/msgvault/cmd/update_account.go +++ b/cmd/msgvault/cmd/update_account.go @@ -34,6 +34,10 @@ Examples: } defer func() { _ = s.Close() }() + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + source, err := s.GetSourceByIdentifier(email) if err != nil { return fmt.Errorf("get account: %w", err) diff --git a/cmd/msgvault/cmd/verify.go b/cmd/msgvault/cmd/verify.go index 936260cf..056e7112 
100644 --- a/cmd/msgvault/cmd/verify.go +++ b/cmd/msgvault/cmd/verify.go @@ -72,6 +72,10 @@ Examples: return err } + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + // Create OAuth manager and get token source oauthMgr, err := oauth.NewManager(clientSecretsPath, cfg.TokensDir(), logger) if err != nil { diff --git a/docs/plans/2026-02-17-multi-source-messaging.md b/docs/plans/2026-02-17-multi-source-messaging.md new file mode 100644 index 00000000..48b7b415 --- /dev/null +++ b/docs/plans/2026-02-17-multi-source-messaging.md @@ -0,0 +1,164 @@ +# Multi-Source Messaging Support + +**Issue:** [wesm/msgvault#136](https://github.com/wesm/msgvault/issues/136) +**Author:** Ed Dowding +**Date:** 2026-02-17 +**Status:** Draft for review + +## Goal + +Make msgvault a universal message archive — not just Gmail. Starting with WhatsApp, but ensuring the design works for iMessage, Telegram, SMS, and other chat platforms. + +## Good News: The Schema Is Already Ready + +The existing schema was designed for this. 
Key fields already in place: + +| Table | Multi-source fields | +|-------|-------------------| +| `sources` | `source_type` ('gmail', 'whatsapp', 'apple_messages', 'google_messages'), `identifier` (email or phone), `sync_cursor` (platform-agnostic) | +| `messages` | `message_type` ('email', 'imessage', 'sms', 'mms', 'rcs', 'whatsapp'), `is_edited`, `is_forwarded`, `delivered_at`, `read_at` | +| `conversations` | `conversation_type` ('email_thread', 'group_chat', 'direct_chat', 'channel') | +| `participants` | `phone_number` (E.164), `canonical_id` (cross-platform dedup) | +| `participant_identifiers` | `identifier_type` ('email', 'phone', 'apple_id', 'whatsapp') | +| `attachments` | `media_type` ('image', 'video', 'audio', 'sticker', 'gif', 'voice_note') | +| `reactions` | `reaction_type` ('tapback', 'emoji', 'like') | +| `message_raw` | `raw_format` ('mime', 'imessage_archive', 'whatsapp_json', 'rcs_json') | + +**No schema migrations needed.** The store layer (`UpsertMessage`, `GetOrCreateSource`, etc.) is already generic — it accepts any `source_type` and `message_type`. The tight coupling to Gmail is only in the sync pipeline and CLI commands. + +## CLI Design + +Per Wes's feedback, use `--type` not `--whatsapp`: + +```bash +# Add accounts +msgvault add-account user@gmail.com # default: --type gmail +msgvault add-account --type whatsapp "+447700900000" # WhatsApp via phone +msgvault add-account --type imessage # no identifier needed (local DB) + +# Sync +msgvault sync-full # all sources +msgvault sync-full user@gmail.com # specific account +msgvault sync-full "+447700900000" # auto-detects type from sources table +msgvault sync-full --type whatsapp # all WhatsApp accounts +msgvault sync-incremental # incremental for all sources +``` + +**Account identifiers** use E.164 phone numbers for phone-based sources (`+447700900000`), email addresses for email-based sources. 
The existing `UNIQUE(source_type, identifier)` constraint means the same phone number can be both a WhatsApp and iMessage account. + +## How Each Platform Syncs + +The fundamental difference: Gmail is pull-based (fetch any message anytime), most chat platforms are push-based (stream messages in real time). Each platform gets its own package under `internal/` that knows how to sync into the shared store. + +| Platform | Sync model | History access | Auth | Identifier | +|----------|-----------|---------------|------|------------| +| **Gmail** | Pull via API | Full random access | OAuth2 (browser or device flow) | Email address | +| **WhatsApp** | Connect + stream | One-time dump at pairing, then forward-only | QR code or phone pairing code | E.164 phone | +| **iMessage** | Read local SQLite | Full (reads `~/Library/Messages/chat.db`) | macOS Full Disk Access | None (local) | +| **Telegram** | Pull via TDLib | Full history via API | Phone + code | E.164 phone | +| **SMS/Android** | Read local SQLite | Full (reads `mmssms.db` from backup) | File access | E.164 phone | + +No abstract `Provider` interface up front — just build each platform's sync as a standalone package, and extract common patterns once we have two working. YAGNI. + +## WhatsApp Specifics (Phase 1) + +### Library: whatsmeow + +[whatsmeow](https://github.com/tulir/whatsmeow) is a pure Go implementation of the WhatsApp Web multi-device protocol. Production-grade — it powers the [mautrix-whatsapp](https://github.com/mautrix/whatsapp) Matrix bridge (2,200+ stars). Actively maintained (last commit: Feb 2026). + +### Auth Flow + +1. User runs `msgvault add-account --type whatsapp "+447700900000"` +2. Terminal displays QR code (or pairing code with `--headless`) +3. User scans with WhatsApp on their phone +4. Session credentials stored in SQLite (alongside msgvault's main DB) +5. 
Session persists across restarts — no re-scanning needed + +Session expires if the primary phone doesn't connect to internet for 14 days, or after ~30 days of inactivity. + +### Sync Model + +**Critical constraint:** WhatsApp history is a one-time dump, not an on-demand API. + +``` +First sync: + connect → receive history dump (HistorySync event) → stream until caught up → disconnect + +Subsequent syncs: + connect → stream new messages since last cursor → disconnect +``` + +On-demand historical backfill exists (`BuildHistorySyncRequest`) but is documented as unreliable, especially for groups. Design accordingly: treat initial history as best-effort, then reliably capture everything going forward. + +### Media Must Be Downloaded Immediately + +WhatsApp media URLs expire after ~30 days. Unlike Gmail where you can fetch any attachment anytime, WhatsApp media must be downloaded and stored locally at sync time. The existing content-addressed attachment storage (SHA-256 dedup) works perfectly for this. 
+ +### Message Type Mapping + +| WhatsApp | msgvault field | Value | +|----------|---------------|-------| +| Text message | `messages.message_type` | `'whatsapp'` | +| Image/Video/Audio | `attachments.media_type` | `'image'`, `'video'`, `'audio'` | +| Voice note | `attachments.media_type` | `'voice_note'` | +| Sticker | `attachments.media_type` | `'sticker'` | +| Document | `attachments.media_type` | `'document'` | +| Reaction (emoji) | `reactions.reaction_type` | `'emoji'` | +| Reply/Quote | `messages.reply_to_message_id` | FK to parent message | +| Forwarded | `messages.is_forwarded` | `true` | +| Edited | `messages.is_edited` | `true` | +| Read receipt | `messages.read_at` | Timestamp | +| Delivery receipt | `messages.delivered_at` | Timestamp | +| Group chat | `conversations.conversation_type` | `'group_chat'` | +| 1:1 chat | `conversations.conversation_type` | `'direct_chat'` | +| Sender JID | `participant_identifiers.identifier_type` | `'whatsapp'`, value = `447700900000@s.whatsapp.net` | +| Sender phone | `participants.phone_number` | `+447700900000` (E.164) | +| Raw protobuf | `message_raw.raw_format` | `'whatsapp_protobuf'` | + +### What Changes in Existing Code + +**New package:** `internal/whatsapp/` — self-contained, no changes to existing Gmail code. + +**Small changes needed:** +- `cmd/msgvault/cmd/addaccount.go`: Add `--type` flag, dispatch to WhatsApp auth when type is `"whatsapp"` +- `cmd/msgvault/cmd/syncfull.go`: Currently hardcodes `ListSources("gmail")` — change to `ListSources("")` (all types) with a type-based dispatcher +- `internal/store/`: Add `EnsureParticipantByPhone()` method (currently only handles email-based participants) +- `internal/store/`: Add `'member'` as a valid `recipient_type` for group chat participants + +**No changes to:** schema, query engine, TUI, MCP server, HTTP API, or any consumer. 
Messages from WhatsApp will appear in search, aggregation, and all views automatically because consumers operate on the generic `messages` table. + +## Risks + +| Risk | Severity | Mitigation | +|------|----------|------------| +| **Account ban/warning** | High | WhatsApp TOS prohibits unofficial clients. Read-only archival is lower risk than bots, but not zero. Document prominently. Recommend a dedicated/secondary number for testing. | +| **History dump is incomplete** | Medium | WhatsApp server decides how much history to send at pairing. Design as "best effort snapshot + reliable stream forward." | +| **whatsmeow protocol breakage** | Medium | WhatsApp changes their protocol regularly. Pin whatsmeow version, expect occasional breakage, track upstream releases. | +| **Media URL expiration** | Low | Download everything at sync time. Already mitigated by design. | +| **Phone must be online every 14 days** | Low | Document requirement. Could add a warning in `sync` output if session is stale. | + +## How Other Platforms Would Plug In Later + +Each gets its own `internal/<platform>/` package that syncs into the store. Brief notes on feasibility: + +**iMessage** (macOS only): Read `~/Library/Messages/chat.db` directly. Full history available. Timestamps use Apple epoch (nanoseconds since 2001-01-01). Tapbacks stored as separate messages referencing parent via `associated_message_guid` — would map to `reactions` table. Requires Full Disk Access permission. No network needed. + +**Telegram**: TDLib (official C++ library with Go bindings) or import from Desktop export JSON. Full history available via API. Unique features: channels, supergroups, forums, scheduled messages, silent messages. User IDs are numeric (not phone-based) but phone is the auth method. + +**SMS/Android**: Import from `mmssms.db` backup. Simple data model (phone, timestamp, body). MMS attachments in `part` table. No reactions, no threading, no edits. + +**Signal**: Hardest. Desktop DB is SQLCipher-encrypted.
Schema changes frequently (215+ migration versions). No official export API. Feasible but fragile. + +## Implementation Phases + +**Phase 1 — CLI + dispatcher (no new platforms):** +Add `--type` flag. Change sync dispatch from Gmail-only to type-based. All existing behavior unchanged. + +**Phase 2 — WhatsApp sync:** +`internal/whatsapp/` package. QR pairing. History dump. Forward streaming. Media download. Phone participant handling. + +**Phase 3 — WhatsApp features:** +Reactions, replies, groups with metadata, voice notes, stickers, read receipts. + +**Phase 4 — Next platform (iMessage or Telegram):** +By this point we'll have two implementations and can extract common patterns if they emerge naturally. Not before. diff --git a/internal/mcp/handlers.go b/internal/mcp/handlers.go index 2ca99e71..40ee2047 100644 --- a/internal/mcp/handlers.go +++ b/internal/mcp/handlers.go @@ -323,7 +323,12 @@ func (h *handlers) listMessages(ctx context.Context, req mcp.CallToolRequest) (* } if v, ok := args["from"].(string); ok && v != "" { - filter.Sender = v + // If it looks like an email address (contains "@") or a phone number (leading "+"), filter by sender address; otherwise by display name. + if strings.Contains(v, "@") || strings.HasPrefix(v, "+") { + filter.Sender = v + } else { + filter.SenderName = v + } } if v, ok := args["to"].(string); ok && v != "" { filter.Recipient = v diff --git a/internal/query/duckdb.go b/internal/query/duckdb.go index 87af1cd7..375f1c08 100644 --- a/internal/query/duckdb.go +++ b/internal/query/duckdb.go @@ -40,6 +40,12 @@ type DuckDBEngine struct { hasSQLiteScanner bool // true if DuckDB's sqlite extension is loaded tempTableSeq atomic.Uint64 // Unique suffix for temp tables to avoid concurrent collisions + // optionalCols tracks which columns exist in each Parquet table's schema. + // Used to gracefully handle stale cache files that lack newer columns + // (e.g. phone_number, attachment_count, sender_id, message_type added in PR #160).
+ // Map: table_name -> column_name -> exists_in_parquet + optionalCols map[string]map[string]bool + // Search result cache: keeps the materialized temp table alive across // pagination calls for the same search query, avoiding repeated Parquet scans. searchCacheMu sync.Mutex // protects cache fields from concurrent goroutines @@ -122,14 +128,41 @@ func NewDuckDBEngine(analyticsDir string, sqlitePath string, sqliteDB *sql.DB, o sqliteEngine = NewSQLiteEngine(sqliteDB) } - return &DuckDBEngine{ + engine := &DuckDBEngine{ db: db, analyticsDir: analyticsDir, sqlitePath: sqlitePath, sqliteDB: sqliteDB, sqliteEngine: sqliteEngine, hasSQLiteScanner: hasSQLiteScanner, - }, nil + } + + // Probe Parquet schemas for optional columns added in PR #160 (WhatsApp import). + // Old cache files may lack these columns; we'll supply defaults in parquetCTEs(). + engine.optionalCols = map[string]map[string]bool{ + "participants": engine.probeParquetColumns(engine.parquetPath("participants"), false), + "messages": engine.probeParquetColumns(engine.parquetGlob(), true), + "conversations": engine.probeParquetColumns(engine.parquetPath("conversations"), false), + "sources": engine.probeParquetColumns(engine.parquetPath("sources"), false), + } + var missing []string + for _, col := range []struct{ table, col string }{ + {"participants", "phone_number"}, + {"messages", "attachment_count"}, + {"messages", "sender_id"}, + {"messages", "message_type"}, + {"conversations", "title"}, + {"sources", "source_type"}, + } { + if !engine.optionalCols[col.table][col.col] { + missing = append(missing, col.table+"."+col.col) + } + } + if len(missing) > 0 { + log.Printf("[warn] Parquet cache missing columns %v — run 'msgvault build-cache --full-rebuild' to update", missing) + } + + return engine, nil } // Close releases DuckDB resources, including any cached search temp table. 
@@ -156,6 +189,48 @@ func (e *DuckDBEngine) parquetPath(table string) string { return filepath.Join(e.analyticsDir, table, "*.parquet") } +// probeParquetColumns checks which columns exist in a Parquet table's files. +// Returns a map of column_name -> true for columns that exist. +// On any error (files missing, unreadable, etc.), returns an empty map — callers +// should treat absent keys as "column does not exist" and supply defaults. +func (e *DuckDBEngine) probeParquetColumns(pathPattern string, hivePartitioning bool) map[string]bool { + cols := make(map[string]bool) + hiveOpt := "" + if hivePartitioning { + hiveOpt = ", hive_partitioning=true" + } + escapedPath := strings.ReplaceAll(pathPattern, "'", "''") + query := fmt.Sprintf("DESCRIBE SELECT * FROM read_parquet('%s'%s)", escapedPath, hiveOpt) + rows, err := e.db.Query(query) + if err != nil { + // No Parquet files or unreadable — treat all optional cols as missing. + return cols + } + defer func() { _ = rows.Close() }() + for rows.Next() { + var colName, colType, isNull, key, dflt, extra sql.NullString + if err := rows.Scan(&colName, &colType, &isNull, &key, &dflt, &extra); err != nil { + continue + } + if colName.Valid { + cols[colName.String] = true + } + } + return cols +} + +// hasCol returns true if the named column exists in the Parquet schema for the given table. +func (e *DuckDBEngine) hasCol(table, col string) bool { + if e.optionalCols == nil { + return true // no probe data — assume present (backwards compatible) + } + tbl, ok := e.optionalCols[table] + if !ok { + return true // table not probed — assume present + } + return tbl[col] +} + // parquetCTEs returns common CTEs for reading all Parquet tables. // This is used by aggregate queries that need to join across tables. 
// parquetCTEs returns the WITH clause body that defines CTEs for all Parquet @@ -163,19 +238,99 @@ func (e *DuckDBEngine) parquetPath(table string) string { // REPLACE syntax, because Parquet schema inference from SQLite can store // integer/boolean columns as VARCHAR, causing type mismatch errors in JOINs // and COALESCE expressions. +// +// Optional columns (phone_number, attachment_count, sender_id, message_type) +// are handled gracefully: if the Parquet file predates their addition, they +// are synthesised with sensible defaults instead of causing a binder error. func (e *DuckDBEngine) parquetCTEs() string { + // --- messages CTE --- + msgReplace := []string{ + "CAST(id AS BIGINT) AS id", + "CAST(source_id AS BIGINT) AS source_id", + "CAST(source_message_id AS VARCHAR) AS source_message_id", + "CAST(conversation_id AS BIGINT) AS conversation_id", + "CAST(subject AS VARCHAR) AS subject", + "CAST(snippet AS VARCHAR) AS snippet", + "CAST(size_estimate AS BIGINT) AS size_estimate", + "COALESCE(TRY_CAST(has_attachments AS BOOLEAN), false) AS has_attachments", + } + var msgExtra []string + if e.hasCol("messages", "attachment_count") { + msgReplace = append(msgReplace, "COALESCE(TRY_CAST(attachment_count AS INTEGER), 0) AS attachment_count") + } else { + msgExtra = append(msgExtra, "0 AS attachment_count") + } + if e.hasCol("messages", "sender_id") { + msgReplace = append(msgReplace, "TRY_CAST(sender_id AS BIGINT) AS sender_id") + } else { + msgExtra = append(msgExtra, "NULL::BIGINT AS sender_id") + } + if e.hasCol("messages", "message_type") { + msgReplace = append(msgReplace, "COALESCE(CAST(message_type AS VARCHAR), '') AS message_type") + } else { + msgExtra = append(msgExtra, "'' AS message_type") + } + msgCTE := fmt.Sprintf("SELECT * REPLACE (\n\t\t\t\t%s\n\t\t\t)", strings.Join(msgReplace, ",\n\t\t\t\t")) + if len(msgExtra) > 0 { + msgCTE += ", " + strings.Join(msgExtra, ", ") + } + msgCTE += fmt.Sprintf(" FROM read_parquet('%s', hive_partitioning=true, 
union_by_name=true)", e.parquetGlob()) + + // --- participants CTE --- + pReplace := []string{ + "CAST(id AS BIGINT) AS id", + "CAST(email_address AS VARCHAR) AS email_address", + "CAST(domain AS VARCHAR) AS domain", + "CAST(display_name AS VARCHAR) AS display_name", + } + var pExtra []string + if e.hasCol("participants", "phone_number") { + pReplace = append(pReplace, "COALESCE(CAST(phone_number AS VARCHAR), '') AS phone_number") + } else { + pExtra = append(pExtra, "'' AS phone_number") + } + pCTE := fmt.Sprintf("SELECT * REPLACE (\n\t\t\t\t%s\n\t\t\t)", strings.Join(pReplace, ",\n\t\t\t\t")) + if len(pExtra) > 0 { + pCTE += ", " + strings.Join(pExtra, ", ") + } + pCTE += fmt.Sprintf(" FROM read_parquet('%s')", e.parquetPath("participants")) + + // --- conversations CTE --- + convReplace := []string{ + "CAST(id AS BIGINT) AS id", + "CAST(source_conversation_id AS VARCHAR) AS source_conversation_id", + } + var convExtra []string + if e.hasCol("conversations", "title") { + convReplace = append(convReplace, "COALESCE(CAST(title AS VARCHAR), '') AS title") + } else { + convExtra = append(convExtra, "'' AS title") + } + convCTE := fmt.Sprintf("SELECT * REPLACE (\n\t\t\t\t%s\n\t\t\t)", strings.Join(convReplace, ",\n\t\t\t\t")) + if len(convExtra) > 0 { + convCTE += ", " + strings.Join(convExtra, ", ") + } + convCTE += fmt.Sprintf(" FROM read_parquet('%s')", e.parquetPath("conversations")) + + // --- sources CTE --- + srcReplace := []string{ + "CAST(id AS BIGINT) AS id", + } + var srcExtra []string + if e.hasCol("sources", "source_type") { + srcReplace = append(srcReplace, "COALESCE(CAST(source_type AS VARCHAR), 'gmail') AS source_type") + } else { + srcExtra = append(srcExtra, "'gmail' AS source_type") + } + srcCTE := fmt.Sprintf("SELECT * REPLACE (\n\t\t\t\t%s\n\t\t\t)", strings.Join(srcReplace, ",\n\t\t\t\t")) + if len(srcExtra) > 0 { + srcCTE += ", " + strings.Join(srcExtra, ", ") + } + srcCTE += fmt.Sprintf(" FROM read_parquet('%s')", e.parquetPath("sources")) + 
return fmt.Sprintf(` msg AS ( - SELECT * REPLACE ( - CAST(id AS BIGINT) AS id, - CAST(source_id AS BIGINT) AS source_id, - CAST(source_message_id AS VARCHAR) AS source_message_id, - CAST(conversation_id AS BIGINT) AS conversation_id, - CAST(subject AS VARCHAR) AS subject, - CAST(snippet AS VARCHAR) AS snippet, - CAST(size_estimate AS BIGINT) AS size_estimate, - COALESCE(TRY_CAST(has_attachments AS BOOLEAN), false) AS has_attachments - ) FROM read_parquet('%s', hive_partitioning=true) + %s ), mr AS ( SELECT * REPLACE ( @@ -186,12 +341,7 @@ func (e *DuckDBEngine) parquetCTEs() string { ) FROM read_parquet('%s') ), p AS ( - SELECT * REPLACE ( - CAST(id AS BIGINT) AS id, - CAST(email_address AS VARCHAR) AS email_address, - CAST(domain AS VARCHAR) AS domain, - CAST(display_name AS VARCHAR) AS display_name - ) FROM read_parquet('%s') + %s ), lbl AS ( SELECT * REPLACE ( @@ -213,24 +363,19 @@ func (e *DuckDBEngine) parquetCTEs() string { GROUP BY 1 ), src AS ( - SELECT * REPLACE ( - CAST(id AS BIGINT) AS id - ) FROM read_parquet('%s') + %s ), conv AS ( - SELECT * REPLACE ( - CAST(id AS BIGINT) AS id, - CAST(source_conversation_id AS VARCHAR) AS source_conversation_id - ) FROM read_parquet('%s') + %s ) - `, e.parquetGlob(), + `, msgCTE, e.parquetPath("message_recipients"), - e.parquetPath("participants"), + pCTE, e.parquetPath("labels"), e.parquetPath("message_labels"), e.parquetPath("attachments"), - e.parquetPath("sources"), - e.parquetPath("conversations")) + srcCTE, + convCTE) } // escapeILIKE escapes ILIKE wildcard characters (% and _) in user input. @@ -686,45 +831,62 @@ func (e *DuckDBEngine) buildFilterConditions(filter MessageFilter) (string, []in conditions = append(conditions, "msg.deleted_from_source_at IS NULL") } - // Sender filter - use EXISTS subquery (becomes semi-join) + // Sender filter - check both message_recipients (email) and direct sender_id (WhatsApp/chat) + // Also checks phone_number for phone-based lookups (e.g., from:+447...) 
if filter.Sender != "" { - conditions = append(conditions, `EXISTS ( + conditions = append(conditions, `(EXISTS ( SELECT 1 FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.message_id = msg.id AND mr.recipient_type = 'from' - AND p.email_address = ? - )`) - args = append(args, filter.Sender) + AND (p.email_address = ? OR p.phone_number = ?) + ) OR EXISTS ( + SELECT 1 FROM p + WHERE p.id = msg.sender_id + AND (p.email_address = ? OR p.phone_number = ?) + ))`) + args = append(args, filter.Sender, filter.Sender, filter.Sender, filter.Sender) } else if filter.MatchesEmpty(ViewSenders) { - conditions = append(conditions, `NOT EXISTS ( + // A message has an "empty sender" only if it has no from-recipient AND no direct sender_id. + conditions = append(conditions, `(NOT EXISTS ( SELECT 1 FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.message_id = msg.id AND mr.recipient_type = 'from' - AND p.email_address IS NOT NULL - AND p.email_address != '' - )`) + AND ( + (p.email_address IS NOT NULL AND p.email_address != '') OR + (p.phone_number IS NOT NULL AND p.phone_number != '') + ) + ) AND msg.sender_id IS NULL)`) } - // Sender name filter - use EXISTS subquery (becomes semi-join) + // Sender name filter - check both message_recipients (email) and direct sender_id (WhatsApp/chat) if filter.SenderName != "" { - conditions = append(conditions, `EXISTS ( + conditions = append(conditions, `(EXISTS ( SELECT 1 FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.message_id = msg.id AND mr.recipient_type = 'from' AND COALESCE(NULLIF(TRIM(p.display_name), ''), p.email_address) = ? - )`) - args = append(args, filter.SenderName) + ) OR EXISTS ( + SELECT 1 FROM p + WHERE p.id = msg.sender_id + AND COALESCE(NULLIF(TRIM(p.display_name), ''), p.email_address) = ? 
+ ))`) + args = append(args, filter.SenderName, filter.SenderName) } else if filter.MatchesEmpty(ViewSenderNames) { - conditions = append(conditions, `NOT EXISTS ( + // A message has an "empty sender name" only if it has no from-recipient name AND no direct sender_id with a name. + conditions = append(conditions, `(NOT EXISTS ( SELECT 1 FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.message_id = msg.id AND mr.recipient_type = 'from' AND COALESCE(NULLIF(TRIM(p.display_name), ''), p.email_address) IS NOT NULL - )`) + ) AND NOT EXISTS ( + SELECT 1 FROM p + WHERE p.id = msg.sender_id + AND COALESCE(NULLIF(TRIM(p.display_name), ''), p.email_address) IS NOT NULL + ))`) } // Recipient filter - use EXISTS subquery (becomes semi-join) @@ -1053,12 +1215,24 @@ func (e *DuckDBEngine) ListMessages(ctx context.Context, filter MessageFilter) ( msg_sender AS ( SELECT mr.message_id, FIRST(p.email_address) as from_email, - FIRST(COALESCE(mr.display_name, p.display_name, '')) as from_name + FIRST(COALESCE(mr.display_name, p.display_name, '')) as from_name, + FIRST(COALESCE(p.phone_number, '')) as from_phone FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.recipient_type = 'from' AND mr.message_id IN (SELECT id FROM filtered_msgs) GROUP BY mr.message_id + ), + direct_sender AS ( + SELECT msg.id as message_id, + COALESCE(p.email_address, '') as from_email, + COALESCE(p.display_name, '') as from_name, + COALESCE(p.phone_number, '') as from_phone + FROM msg + JOIN filtered_msgs fm ON fm.id = msg.id + JOIN p ON p.id = msg.sender_id + WHERE msg.sender_id IS NOT NULL + AND msg.id NOT IN (SELECT message_id FROM msg_sender) ) SELECT msg.id, @@ -1067,15 +1241,20 @@ func (e *DuckDBEngine) ListMessages(ctx context.Context, filter MessageFilter) ( COALESCE(c.source_conversation_id, '') as source_conversation_id, COALESCE(msg.subject, '') as subject, COALESCE(msg.snippet, '') as snippet, - COALESCE(ms.from_email, '') as from_email, - COALESCE(ms.from_name, '') as from_name, + 
COALESCE(ms.from_email, ds.from_email, '') as from_email, + COALESCE(ms.from_name, ds.from_name, '') as from_name, + COALESCE(ms.from_phone, ds.from_phone, '') as from_phone, msg.sent_at, COALESCE(msg.size_estimate, 0) as size_estimate, COALESCE(msg.has_attachments, false) as has_attachments, - msg.deleted_from_source_at + COALESCE(msg.attachment_count, 0) as attachment_count, + msg.deleted_from_source_at, + COALESCE(msg.message_type, '') as message_type, + COALESCE(c.title, '') as conv_title FROM msg JOIN filtered_msgs fm ON fm.id = msg.id LEFT JOIN msg_sender ms ON ms.message_id = msg.id + LEFT JOIN direct_sender ds ON ds.message_id = msg.id LEFT JOIN conv c ON c.id = msg.conversation_id ORDER BY %s `, e.parquetCTEs(), where, orderBy, orderBy) @@ -1102,10 +1281,14 @@ func (e *DuckDBEngine) ListMessages(ctx context.Context, filter MessageFilter) ( &msg.Snippet, &msg.FromEmail, &msg.FromName, + &msg.FromPhone, &sentAt, &msg.SizeEstimate, &msg.HasAttachments, + &msg.AttachmentCount, &deletedAt, + &msg.MessageType, + &msg.ConversationTitle, ); err != nil { return nil, fmt.Errorf("scan message: %w", err) } @@ -1425,6 +1608,10 @@ func (e *DuckDBEngine) GetGmailIDsByFilter(ctx context.Context, filter MessageFi // Always exclude deleted messages conditions = append(conditions, "msg.deleted_from_source_at IS NULL") + // Gmail scoping is handled by JOIN src in the query below — this function + // is used for Gmail-specific deletion/staging workflows and must not + // return WhatsApp or other source IDs. 
+ if filter.SourceID != nil { conditions = append(conditions, "msg.source_id = ?") args = append(args, *filter.SourceID) @@ -1432,25 +1619,33 @@ func (e *DuckDBEngine) GetGmailIDsByFilter(ctx context.Context, filter MessageFi // Use EXISTS subqueries for filtering (becomes semi-joins, no duplicates) if filter.Sender != "" { - conditions = append(conditions, `EXISTS ( + conditions = append(conditions, `(EXISTS ( SELECT 1 FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.message_id = msg.id AND mr.recipient_type = 'from' - AND p.email_address = ? - )`) - args = append(args, filter.Sender) + AND (p.email_address = ? OR p.phone_number = ?) + ) OR EXISTS ( + SELECT 1 FROM p + WHERE p.id = msg.sender_id + AND (p.email_address = ? OR p.phone_number = ?) + ))`) + args = append(args, filter.Sender, filter.Sender, filter.Sender, filter.Sender) } if filter.SenderName != "" { - conditions = append(conditions, `EXISTS ( + conditions = append(conditions, `(EXISTS ( SELECT 1 FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.message_id = msg.id AND mr.recipient_type = 'from' AND COALESCE(NULLIF(TRIM(p.display_name), ''), p.email_address) = ? - )`) - args = append(args, filter.SenderName) + ) OR EXISTS ( + SELECT 1 FROM p + WHERE p.id = msg.sender_id + AND COALESCE(NULLIF(TRIM(p.display_name), ''), p.email_address) = ? + ))`) + args = append(args, filter.SenderName, filter.SenderName) } if filter.Recipient != "" { @@ -1512,11 +1707,12 @@ func (e *DuckDBEngine) GetGmailIDsByFilter(ctx context.Context, filter MessageFi args = append(args, filter.TimeRange.Period) } - // Build query + // Build query — JOIN src to scope to Gmail sources authoritatively. 
query := fmt.Sprintf(` WITH %s SELECT msg.source_message_id FROM msg + JOIN src ON src.id = msg.source_id AND COALESCE(src.source_type, 'gmail') = 'gmail' WHERE %s ORDER BY msg.sent_at DESC, msg.id DESC `, e.parquetCTEs(), strings.Join(conditions, " AND ")) @@ -1612,11 +1808,22 @@ func (e *DuckDBEngine) SearchFast(ctx context.Context, q *search.Query, filter M msg_sender AS ( SELECT mr.message_id, FIRST(p.email_address) as from_email, - FIRST(COALESCE(mr.display_name, p.display_name, '')) as from_name + FIRST(COALESCE(mr.display_name, p.display_name, '')) as from_name, + FIRST(COALESCE(p.phone_number, '')) as from_phone FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.recipient_type = 'from' GROUP BY mr.message_id + ), + direct_sender AS ( + SELECT msg.id as message_id, + COALESCE(p.email_address, '') as from_email, + COALESCE(p.display_name, '') as from_name, + COALESCE(p.phone_number, '') as from_phone + FROM msg + JOIN p ON p.id = msg.sender_id + WHERE msg.sender_id IS NOT NULL + AND msg.id NOT IN (SELECT message_id FROM msg_sender) ) SELECT COALESCE(msg.id, 0) as id, @@ -1625,16 +1832,20 @@ func (e *DuckDBEngine) SearchFast(ctx context.Context, q *search.Query, filter M COALESCE(c.source_conversation_id, '') as source_conversation_id, COALESCE(msg.subject, '') as subject, COALESCE(msg.snippet, '') as snippet, - COALESCE(ms.from_email, '') as from_email, - COALESCE(ms.from_name, '') as from_name, + COALESCE(ms.from_email, ds.from_email, '') as from_email, + COALESCE(ms.from_name, ds.from_name, '') as from_name, + COALESCE(ms.from_phone, ds.from_phone, '') as from_phone, msg.sent_at, COALESCE(msg.size_estimate, 0) as size_estimate, COALESCE(msg.has_attachments, false) as has_attachments, COALESCE(att.attachment_count, 0) as attachment_count, CAST(COALESCE(to_json(mlbl.labels), '[]') AS VARCHAR) as labels, - msg.deleted_from_source_at + msg.deleted_from_source_at, + COALESCE(msg.message_type, '') as message_type, + COALESCE(c.title, '') as conv_title FROM msg 
LEFT JOIN msg_sender ms ON ms.message_id = msg.id + LEFT JOIN direct_sender ds ON ds.message_id = msg.id LEFT JOIN att ON att.message_id = msg.id LEFT JOIN msg_labels mlbl ON mlbl.message_id = msg.id LEFT JOIN conv c ON c.id = msg.conversation_id @@ -1666,12 +1877,15 @@ func (e *DuckDBEngine) SearchFast(ctx context.Context, q *search.Query, filter M &msg.Snippet, &msg.FromEmail, &msg.FromName, + &msg.FromPhone, &sentAt, &msg.SizeEstimate, &msg.HasAttachments, &msg.AttachmentCount, &labelsJSON, &deletedAt, + &msg.MessageType, + &msg.ConversationTitle, ); err != nil { return nil, fmt.Errorf("scan message: %w", err) } @@ -1702,15 +1916,29 @@ func (e *DuckDBEngine) SearchFastCount(ctx context.Context, q *search.Query, fil query := fmt.Sprintf(` WITH %s, msg_sender AS ( - SELECT mr.message_id, FIRST(p.email_address) as from_email, FIRST(p.display_name) as from_name + SELECT mr.message_id, + FIRST(p.email_address) as from_email, + FIRST(COALESCE(mr.display_name, p.display_name, '')) as from_name, + FIRST(COALESCE(p.phone_number, '')) as from_phone FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.recipient_type = 'from' GROUP BY mr.message_id + ), + direct_sender AS ( + SELECT msg.id as message_id, + COALESCE(p.email_address, '') as from_email, + COALESCE(p.display_name, '') as from_name, + COALESCE(p.phone_number, '') as from_phone + FROM msg + JOIN p ON p.id = msg.sender_id + WHERE msg.sender_id IS NOT NULL + AND msg.id NOT IN (SELECT message_id FROM msg_sender) ) SELECT COUNT(*) as cnt FROM msg LEFT JOIN msg_sender ms ON ms.message_id = msg.id + LEFT JOIN direct_sender ds ON ds.message_id = msg.id WHERE %s `, e.parquetCTEs(), strings.Join(conditions, " AND ")) @@ -1779,12 +2007,15 @@ func (e *DuckDBEngine) searchPageFromCache(ctx context.Context, limit, offset in sm.snippet, sm.from_email, sm.from_name, + COALESCE(sm.from_phone, '') as from_phone, sm.sent_at, sm.size_estimate, sm.has_attachments, COALESCE(att.attachment_count, 0) as attachment_count, 
CAST(COALESCE(to_json(mlbl.labels), '[]') AS VARCHAR) as labels, - sm.deleted_from_source_at + sm.deleted_from_source_at, + COALESCE(sm.message_type, '') as message_type, + COALESCE(c.title, '') as conv_title FROM %s sm JOIN page p ON p.id = sm.id LEFT JOIN att ON att.message_id = sm.id @@ -1825,12 +2056,15 @@ func (e *DuckDBEngine) searchPageFromCache(ctx context.Context, limit, offset in &msg.Snippet, &msg.FromEmail, &msg.FromName, + &msg.FromPhone, &sentAt, &msg.SizeEstimate, &msg.HasAttachments, &msg.AttachmentCount, &labelsJSON, &deletedAt, + &msg.MessageType, + &msg.ConversationTitle, ); err != nil { return nil, fmt.Errorf("scan message: %w", err) } @@ -1946,11 +2180,22 @@ func (e *DuckDBEngine) SearchFastWithStats(ctx context.Context, q *search.Query, msg_sender AS ( SELECT mr.message_id, FIRST(p.email_address) as from_email, - FIRST(COALESCE(mr.display_name, p.display_name, '')) as from_name + FIRST(COALESCE(mr.display_name, p.display_name, '')) as from_name, + FIRST(COALESCE(p.phone_number, '')) as from_phone FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.recipient_type = 'from' GROUP BY mr.message_id + ), + direct_sender AS ( + SELECT msg.id as message_id, + COALESCE(p.email_address, '') as from_email, + COALESCE(p.display_name, '') as from_name, + COALESCE(p.phone_number, '') as from_phone + FROM msg + JOIN p ON p.id = msg.sender_id + WHERE msg.sender_id IS NOT NULL + AND msg.id NOT IN (SELECT message_id FROM msg_sender) ) SELECT msg.id, @@ -1958,15 +2203,18 @@ func (e *DuckDBEngine) SearchFastWithStats(ctx context.Context, q *search.Query, COALESCE(msg.conversation_id, 0) as conversation_id, COALESCE(msg.subject, '') as subject, COALESCE(msg.snippet, '') as snippet, - COALESCE(ms.from_email, '') as from_email, - COALESCE(ms.from_name, '') as from_name, + COALESCE(ms.from_email, ds.from_email, '') as from_email, + COALESCE(ms.from_name, ds.from_name, '') as from_name, + COALESCE(ms.from_phone, ds.from_phone, '') as from_phone, msg.sent_at, 
COALESCE(CAST(msg.size_estimate AS BIGINT), 0) as size_estimate, COALESCE(msg.has_attachments, false) as has_attachments, msg.deleted_from_source_at, - CAST(msg.source_id AS BIGINT) as source_id + CAST(msg.source_id AS BIGINT) as source_id, + COALESCE(msg.message_type, '') as message_type FROM msg LEFT JOIN msg_sender ms ON ms.message_id = msg.id + LEFT JOIN direct_sender ds ON ds.message_id = msg.id WHERE %s `, tempTable, e.parquetCTEs(), strings.Join(conditions, " AND ")) @@ -2022,24 +2270,35 @@ func (e *DuckDBEngine) buildSearchConditions(q *search.Query, filter MessageFilt if filter.HideDeletedFromSource { conditions = append(conditions, "msg.deleted_from_source_at IS NULL") } + // Sender filter - check both message_recipients (email/phone) and direct sender_id (WhatsApp/chat) if filter.Sender != "" { - conditions = append(conditions, "ms.from_email = ?") - args = append(args, filter.Sender) + conditions = append(conditions, `(EXISTS ( + SELECT 1 FROM mr + JOIN p ON p.id = mr.participant_id + WHERE mr.message_id = msg.id + AND mr.recipient_type = 'from' + AND (p.email_address = ? OR p.phone_number = ?) + ) OR EXISTS ( + SELECT 1 FROM p + WHERE p.id = msg.sender_id + AND (p.email_address = ? OR p.phone_number = ?) + ))`) + args = append(args, filter.Sender, filter.Sender, filter.Sender, filter.Sender) } if filter.Domain != "" { conditions = append(conditions, "ms.from_email ILIKE ?") args = append(args, "%@"+filter.Domain) } - // Recipient filter - use EXISTS subquery for drill-down context + // Recipient filter - use EXISTS subquery for drill-down context (checks email and phone) if filter.Recipient != "" { conditions = append(conditions, `EXISTS ( SELECT 1 FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.message_id = msg.id AND mr.recipient_type IN ('to', 'cc', 'bcc') - AND p.email_address = ? + AND (p.email_address = ? OR p.phone_number = ?) 
)`) - args = append(args, filter.Recipient) + args = append(args, filter.Recipient, filter.Recipient) } // Label filter - use EXISTS subquery for drill-down context if filter.Label != "" { @@ -2063,31 +2322,44 @@ func (e *DuckDBEngine) buildSearchConditions(q *search.Query, filter MessageFilt termPattern := "%" + escapeILIKE(term) + "%" conditions = append(conditions, `( msg.subject ILIKE ? ESCAPE '\' OR - ms.from_email ILIKE ? ESCAPE '\' OR - ms.from_name ILIKE ? ESCAPE '\' + COALESCE(ms.from_email, ds.from_email, '') ILIKE ? ESCAPE '\' OR + COALESCE(ms.from_name, ds.from_name, '') ILIKE ? ESCAPE '\' OR + COALESCE(ms.from_phone, ds.from_phone, '') ILIKE ? ESCAPE '\' )`) - args = append(args, termPattern, termPattern, termPattern) + args = append(args, termPattern, termPattern, termPattern, termPattern) } } - // From filter + // From filter - check email, phone, display name via message_recipients and direct sender_id if len(q.FromAddrs) > 0 { for _, addr := range q.FromAddrs { - conditions = append(conditions, "ms.from_email ILIKE ? ESCAPE '\\'") - args = append(args, "%"+escapeILIKE(addr)+"%") + pattern := "%" + escapeILIKE(addr) + "%" + conditions = append(conditions, `(EXISTS ( + SELECT 1 FROM mr + JOIN p ON p.id = mr.participant_id + WHERE mr.message_id = msg.id + AND mr.recipient_type = 'from' + AND (p.email_address ILIKE ? ESCAPE '\' OR p.phone_number ILIKE ? ESCAPE '\' OR p.display_name ILIKE ? ESCAPE '\') + ) OR EXISTS ( + SELECT 1 FROM p + WHERE p.id = msg.sender_id + AND (p.email_address ILIKE ? ESCAPE '\' OR p.phone_number ILIKE ? ESCAPE '\' OR p.display_name ILIKE ? 
ESCAPE '\') + ))`) + args = append(args, pattern, pattern, pattern, pattern, pattern, pattern) } } - // To filter - use EXISTS subquery to check recipients + // To filter - use EXISTS subquery to check recipients (email and phone) if len(q.ToAddrs) > 0 { for _, addr := range q.ToAddrs { + pattern := "%" + escapeILIKE(addr) + "%" conditions = append(conditions, `EXISTS ( SELECT 1 FROM mr JOIN p ON p.id = mr.participant_id WHERE mr.message_id = msg.id AND mr.recipient_type IN ('to', 'cc', 'bcc') - AND p.email_address ILIKE ? ESCAPE '\' + AND (p.email_address ILIKE ? ESCAPE '\' OR p.phone_number ILIKE ? ESCAPE '\') )`) - args = append(args, "%"+escapeILIKE(addr)+"%") + args = append(args, pattern, pattern) } } diff --git a/internal/query/duckdb_test.go b/internal/query/duckdb_test.go index fb2d3d88..54df6bba 100644 --- a/internal/query/duckdb_test.go +++ b/internal/query/duckdb_test.go @@ -2030,7 +2030,7 @@ func TestBuildSearchConditions_EscapedWildcards(t *testing.T) { query: &search.Query{ FromAddrs: []string{"test_user%"}, }, - wantClauses: []string{"ms.from_email ILIKE", "ESCAPE"}, + wantClauses: []string{"p.email_address ILIKE", "ESCAPE"}, wantInArgs: []string{"test\\_user\\%"}, }, { @@ -2156,16 +2156,16 @@ func TestDuckDBEngine_AggregateByRecipientName_EmptyStringFallback(t *testing.T) // Build Parquet data with empty-string and whitespace display_names on recipients engine := createEngineFromBuilder(t, newParquetBuilder(t). 
addTable("messages", "messages/year=2024", "data.parquet", messagesCols, ` - (1::BIGINT, 1::BIGINT, 'msg1', 100::BIGINT, 'Hello', 'Snippet', TIMESTAMP '2024-01-15 10:00:00', 1000::BIGINT, false, NULL::TIMESTAMP, 2024, 1), - (2::BIGINT, 1::BIGINT, 'msg2', 101::BIGINT, 'World', 'Snippet', TIMESTAMP '2024-01-16 10:00:00', 1000::BIGINT, false, NULL::TIMESTAMP, 2024, 1) + (1::BIGINT, 1::BIGINT, 'msg1', 100::BIGINT, 'Hello', 'Snippet', TIMESTAMP '2024-01-15 10:00:00', 1000::BIGINT, false, 0, NULL::TIMESTAMP, NULL::BIGINT, 'email', 2024, 1), + (2::BIGINT, 1::BIGINT, 'msg2', 101::BIGINT, 'World', 'Snippet', TIMESTAMP '2024-01-16 10:00:00', 1000::BIGINT, false, 0, NULL::TIMESTAMP, NULL::BIGINT, 'email', 2024, 1) `). addTable("sources", "sources", "sources.parquet", sourcesCols, ` - (1::BIGINT, 'test@gmail.com') + (1::BIGINT, 'test@gmail.com', 'gmail') `). addTable("participants", "participants", "participants.parquet", participantsCols, ` - (1::BIGINT, 'sender@test.com', 'test.com', 'Sender'), - (2::BIGINT, 'empty@test.com', 'test.com', ''), - (3::BIGINT, 'spaces@test.com', 'test.com', ' ') + (1::BIGINT, 'sender@test.com', 'test.com', 'Sender', ''), + (2::BIGINT, 'empty@test.com', 'test.com', '', ''), + (3::BIGINT, 'spaces@test.com', 'test.com', ' ', '') `). addTable("message_recipients", "message_recipients", "message_recipients.parquet", messageRecipientsCols, ` (1::BIGINT, 1::BIGINT, 'from', 'Sender'), @@ -2177,8 +2177,8 @@ func TestDuckDBEngine_AggregateByRecipientName_EmptyStringFallback(t *testing.T) addEmptyTable("message_labels", "message_labels", "message_labels.parquet", messageLabelsCols, `(1::BIGINT, 1::BIGINT)`). addEmptyTable("attachments", "attachments", "attachments.parquet", attachmentsCols, `(1::BIGINT, 100::BIGINT, 'x')`). 
addTable("conversations", "conversations", "conversations.parquet", conversationsCols, ` - (100::BIGINT, 'thread100'), - (101::BIGINT, 'thread101') + (100::BIGINT, 'thread100', ''), + (101::BIGINT, 'thread101', '') `)) ctx := context.Background() @@ -2208,15 +2208,15 @@ func TestDuckDBEngine_ListMessages_MatchEmptyRecipientName(t *testing.T) { // Build Parquet data with a message that has no recipients engine := createEngineFromBuilder(t, newParquetBuilder(t). addTable("messages", "messages/year=2024", "data.parquet", messagesCols, ` - (1::BIGINT, 1::BIGINT, 'msg1', 100::BIGINT, 'Has Recipient', 'Snippet', TIMESTAMP '2024-01-15 10:00:00', 1000::BIGINT, false, NULL::TIMESTAMP, 2024, 1), - (2::BIGINT, 1::BIGINT, 'msg2', 101::BIGINT, 'No Recipient', 'Snippet', TIMESTAMP '2024-01-16 10:00:00', 1000::BIGINT, false, NULL::TIMESTAMP, 2024, 1) + (1::BIGINT, 1::BIGINT, 'msg1', 100::BIGINT, 'Has Recipient', 'Snippet', TIMESTAMP '2024-01-15 10:00:00', 1000::BIGINT, false, 0, NULL::TIMESTAMP, NULL::BIGINT, 'email', 2024, 1), + (2::BIGINT, 1::BIGINT, 'msg2', 101::BIGINT, 'No Recipient', 'Snippet', TIMESTAMP '2024-01-16 10:00:00', 1000::BIGINT, false, 0, NULL::TIMESTAMP, NULL::BIGINT, 'email', 2024, 1) `). addTable("sources", "sources", "sources.parquet", sourcesCols, ` - (1::BIGINT, 'test@gmail.com') + (1::BIGINT, 'test@gmail.com', 'gmail') `). addTable("participants", "participants", "participants.parquet", participantsCols, ` - (1::BIGINT, 'alice@test.com', 'test.com', 'Alice'), - (2::BIGINT, 'bob@test.com', 'test.com', 'Bob') + (1::BIGINT, 'alice@test.com', 'test.com', 'Alice', ''), + (2::BIGINT, 'bob@test.com', 'test.com', 'Bob', '') `). 
addTable("message_recipients", "message_recipients", "message_recipients.parquet", messageRecipientsCols, ` (1::BIGINT, 1::BIGINT, 'from', 'Alice'), @@ -2226,8 +2226,8 @@ func TestDuckDBEngine_ListMessages_MatchEmptyRecipientName(t *testing.T) { addEmptyTable("message_labels", "message_labels", "message_labels.parquet", messageLabelsCols, `(1::BIGINT, 1::BIGINT)`). addEmptyTable("attachments", "attachments", "attachments.parquet", attachmentsCols, `(1::BIGINT, 100::BIGINT, 'x')`). addTable("conversations", "conversations", "conversations.parquet", conversationsCols, ` - (100::BIGINT, 'thread100'), - (101::BIGINT, 'thread101') + (100::BIGINT, 'thread100', ''), + (101::BIGINT, 'thread101', '') `)) ctx := context.Background() @@ -2805,14 +2805,14 @@ func TestDuckDBEngine_VARCHARParquetColumns(t *testing.T) { // string, to reproduce type mismatches in COALESCE, JOINs, and TRY_CAST paths. engine := createEngineFromBuilder(t, newParquetBuilder(t). addTable("messages", "messages/year=2024", "data.parquet", messagesCols, ` - (1::BIGINT, 1::BIGINT, 'msg1', '100', 'Hello World', 'snippet1', TIMESTAMP '2024-01-15 10:00:00', '1000', '0', NULL::TIMESTAMP, 2024, 1), - (2::BIGINT, 1::BIGINT, 'msg2', '101', 'Goodbye', 'snippet2', TIMESTAMP '2024-01-16 10:00:00', '2000', '1', NULL::TIMESTAMP, 2024, 1) + (1::BIGINT, 1::BIGINT, 'msg1', '100', 'Hello World', 'snippet1', TIMESTAMP '2024-01-15 10:00:00', '1000', '0', '0', NULL::TIMESTAMP, NULL::BIGINT, 'email', 2024, 1), + (2::BIGINT, 1::BIGINT, 'msg2', '101', 'Goodbye', 'snippet2', TIMESTAMP '2024-01-16 10:00:00', '2000', '1', '0', NULL::TIMESTAMP, NULL::BIGINT, 'email', 2024, 1) `). addTable("sources", "sources", "sources.parquet", sourcesCols, ` - (1::BIGINT, 'test@gmail.com') + (1::BIGINT, 'test@gmail.com', 'gmail') `). addTable("participants", "participants", "participants.parquet", participantsCols, ` - (1::BIGINT, 'alice@test.com', 'test.com', 'Alice') + (1::BIGINT, 'alice@test.com', 'test.com', 'Alice', '') `). 
addTable("message_recipients", "message_recipients", "message_recipients.parquet", messageRecipientsCols, ` (1::BIGINT, 1::BIGINT, 'from', 'Alice'), @@ -2822,8 +2822,8 @@ func TestDuckDBEngine_VARCHARParquetColumns(t *testing.T) { addEmptyTable("message_labels", "message_labels", "message_labels.parquet", messageLabelsCols, `(1::BIGINT, 1::BIGINT)`). addEmptyTable("attachments", "attachments", "attachments.parquet", attachmentsCols, `(1::BIGINT, '100', 'x')`). addTable("conversations", "conversations", "conversations.parquet", conversationsCols, ` - (100::BIGINT, 'thread100'), - (101::BIGINT, 'thread101') + (100::BIGINT, 'thread100', ''), + (101::BIGINT, 'thread101', '') `)) ctx := context.Background() @@ -3208,3 +3208,109 @@ func TestDuckDBEngine_HideDeletedFromSource(t *testing.T) { t.Errorf("GetTotalStats with hide-deleted: expected 2 messages, got %d", stats.MessageCount) } } + +// TestDuckDBEngine_StaleParquetSchema verifies that a DuckDB engine can query +// Parquet files written BEFORE PR #160 added phone_number, attachment_count, +// sender_id, and message_type columns. The engine should synthesise sensible +// defaults instead of failing with a binder error. +func TestDuckDBEngine_StaleParquetSchema(t *testing.T) { + // Old-style column definitions (pre-WhatsApp). + const oldMessagesCols = "id, source_id, source_message_id, conversation_id, subject, snippet, sent_at, size_estimate, has_attachments, deleted_from_source_at, year, month" + const oldParticipantsCols = "id, email_address, domain, display_name" + const oldConversationsCols = "id, source_conversation_id" + + engine := createEngineFromBuilder(t, newParquetBuilder(t). 
+ addTable("messages", "messages/year=2024", "data.parquet", oldMessagesCols, ` + (1::BIGINT, 1::BIGINT, 'msg1', 100::BIGINT, 'Stale Hello', 'snip1', TIMESTAMP '2024-01-15 10:00:00', 1000::BIGINT, false, NULL::TIMESTAMP, 2024, 1), + (2::BIGINT, 1::BIGINT, 'msg2', 101::BIGINT, 'Stale Goodbye', 'snip2', TIMESTAMP '2024-01-16 10:00:00', 2000::BIGINT, true, NULL::TIMESTAMP, 2024, 1) + `). + addTable("sources", "sources", "sources.parquet", sourcesCols, ` + (1::BIGINT, 'test@gmail.com', 'gmail') + `). + addTable("participants", "participants", "participants.parquet", oldParticipantsCols, ` + (1::BIGINT, 'alice@test.com', 'test.com', 'Alice') + `). + addTable("message_recipients", "message_recipients", "message_recipients.parquet", messageRecipientsCols, ` + (1::BIGINT, 1::BIGINT, 'from', 'Alice'), + (2::BIGINT, 1::BIGINT, 'from', 'Alice') + `). + addEmptyTable("labels", "labels", "labels.parquet", labelsCols, `(1::BIGINT, 'x')`). + addEmptyTable("message_labels", "message_labels", "message_labels.parquet", messageLabelsCols, `(1::BIGINT, 1::BIGINT)`). + addEmptyTable("attachments", "attachments", "attachments.parquet", attachmentsCols, `(1::BIGINT, 100::BIGINT, 'x')`). 
+ addTable("conversations", "conversations", "conversations.parquet", oldConversationsCols, ` + (100::BIGINT, 'thread100'), + (101::BIGINT, 'thread101') + `)) + + ctx := context.Background() + + t.Run("ListMessages", func(t *testing.T) { + results, err := engine.ListMessages(ctx, MessageFilter{}) + if err != nil { + t.Fatalf("ListMessages with stale Parquet schema: %v", err) + } + if len(results) != 2 { + t.Fatalf("expected 2 messages, got %d", len(results)) + } + }) + + t.Run("SearchFast", func(t *testing.T) { + q := search.Parse("Stale Hello") + results, err := engine.SearchFast(ctx, q, MessageFilter{}, 100, 0) + if err != nil { + t.Fatalf("SearchFast with stale Parquet schema: %v", err) + } + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if results[0].Subject != "Stale Hello" { + t.Fatalf("unexpected subject: %s", results[0].Subject) + } + }) + + t.Run("SearchFastCount", func(t *testing.T) { + q := search.Parse("Stale") + count, err := engine.SearchFastCount(ctx, q, MessageFilter{}) + if err != nil { + t.Fatalf("SearchFastCount with stale Parquet schema: %v", err) + } + if count != 2 { + t.Fatalf("expected count 2, got %d", count) + } + }) + + t.Run("Aggregate", func(t *testing.T) { + results, err := engine.Aggregate(ctx, ViewSenders, DefaultAggregateOptions()) + if err != nil { + t.Fatalf("Aggregate with stale Parquet schema: %v", err) + } + if len(results) != 1 { + t.Fatalf("expected 1 sender, got %d", len(results)) + } + }) + + t.Run("GetTotalStats", func(t *testing.T) { + stats, err := engine.GetTotalStats(ctx, StatsOptions{}) + if err != nil { + t.Fatalf("GetTotalStats with stale Parquet schema: %v", err) + } + if stats.MessageCount != 2 { + t.Fatalf("expected 2 messages, got %d", stats.MessageCount) + } + }) + + // Verify that optionalCols correctly detected the missing columns. 
+ t.Run("ProbeDetectedMissing", func(t *testing.T) { + for _, col := range []struct{ table, col string }{ + {"participants", "phone_number"}, + {"messages", "attachment_count"}, + {"messages", "sender_id"}, + {"messages", "message_type"}, + {"conversations", "title"}, + } { + if engine.hasCol(col.table, col.col) { + t.Errorf("expected %s.%s to be detected as missing", col.table, col.col) + } + } + }) +} diff --git a/internal/query/models.go b/internal/query/models.go index 3d2cce5c..26b29879 100644 --- a/internal/query/models.go +++ b/internal/query/models.go @@ -28,12 +28,15 @@ type MessageSummary struct { Snippet string `json:"snippet"` FromEmail string `json:"from_email"` FromName string `json:"from_name"` + FromPhone string `json:"from_phone,omitempty"` // Phone number (for WhatsApp/chat sources) SentAt time.Time `json:"sent_at"` SizeEstimate int64 `json:"size_estimate"` HasAttachments bool `json:"has_attachments"` AttachmentCount int `json:"attachment_count"` Labels []string `json:"labels"` - DeletedAt *time.Time `json:"deleted_at,omitempty"` // When message was deleted from server (nil if not deleted) + DeletedAt *time.Time `json:"deleted_at,omitempty"` // When message was deleted from server (nil if not deleted) + MessageType string `json:"message_type,omitempty"` // e.g., "email", "whatsapp" — from messages.message_type + ConversationTitle string `json:"conversation_title,omitempty"` // Group/chat name from conversations.title } // MessageDetail represents a full message with body and attachments. 
diff --git a/internal/query/sqlite.go b/internal/query/sqlite.go index 65504297..e47548e3 100644 --- a/internal/query/sqlite.go +++ b/internal/query/sqlite.go @@ -280,40 +280,56 @@ func buildFilterJoinsAndConditions(filter MessageFilter, tableAlias string) (str conditions = append(conditions, prefix+"deleted_from_source_at IS NULL") } - // Sender filter + // Sender filter - check both message_recipients (email) and direct sender_id (WhatsApp/chat) + // Also checks phone_number for phone-based lookups (e.g., from:+447...) if filter.Sender != "" { joins = append(joins, ` - JOIN message_recipients mr_filter_from ON mr_filter_from.message_id = m.id AND mr_filter_from.recipient_type = 'from' - JOIN participants p_filter_from ON p_filter_from.id = mr_filter_from.participant_id + LEFT JOIN message_recipients mr_filter_from ON mr_filter_from.message_id = m.id AND mr_filter_from.recipient_type = 'from' + LEFT JOIN participants p_filter_from ON p_filter_from.id = mr_filter_from.participant_id + LEFT JOIN participants p_direct_sender ON p_direct_sender.id = m.sender_id `) - conditions = append(conditions, "p_filter_from.email_address = ?") - args = append(args, filter.Sender) + conditions = append(conditions, "(p_filter_from.email_address = ? OR p_filter_from.phone_number = ? OR p_direct_sender.email_address = ? OR p_direct_sender.phone_number = ?)") + args = append(args, filter.Sender, filter.Sender, filter.Sender, filter.Sender) } else if filter.MatchesEmpty(ViewSenders) { + // A message has an "empty sender" only if it has no from-recipient AND no direct sender_id. 
joins = append(joins, ` LEFT JOIN message_recipients mr_filter_from ON mr_filter_from.message_id = m.id AND mr_filter_from.recipient_type = 'from' LEFT JOIN participants p_filter_from ON p_filter_from.id = mr_filter_from.participant_id + LEFT JOIN participants p_direct_sender ON p_direct_sender.id = m.sender_id `) - conditions = append(conditions, "(mr_filter_from.id IS NULL OR p_filter_from.email_address IS NULL OR p_filter_from.email_address = '')") + conditions = append(conditions, `((mr_filter_from.id IS NULL OR ( + (p_filter_from.email_address IS NULL OR p_filter_from.email_address = '') AND + (p_filter_from.phone_number IS NULL OR p_filter_from.phone_number = '') + )) AND m.sender_id IS NULL)`) } - // Sender name filter + // Sender name filter - check both message_recipients (email) and direct sender_id (WhatsApp/chat) if filter.SenderName != "" { if filter.Sender == "" && !filter.MatchesEmpty(ViewSenders) { joins = append(joins, ` - JOIN message_recipients mr_filter_from ON mr_filter_from.message_id = m.id AND mr_filter_from.recipient_type = 'from' - JOIN participants p_filter_from ON p_filter_from.id = mr_filter_from.participant_id + LEFT JOIN message_recipients mr_filter_from ON mr_filter_from.message_id = m.id AND mr_filter_from.recipient_type = 'from' + LEFT JOIN participants p_filter_from ON p_filter_from.id = mr_filter_from.participant_id + LEFT JOIN participants p_direct_sender ON p_direct_sender.id = m.sender_id `) } - conditions = append(conditions, "COALESCE(NULLIF(TRIM(p_filter_from.display_name), ''), p_filter_from.email_address) = ?") - args = append(args, filter.SenderName) + conditions = append(conditions, `( + COALESCE(NULLIF(TRIM(p_filter_from.display_name), ''), p_filter_from.email_address) = ? + OR COALESCE(NULLIF(TRIM(p_direct_sender.display_name), ''), p_direct_sender.email_address) = ? 
+ )`) + args = append(args, filter.SenderName, filter.SenderName) } else if filter.MatchesEmpty(ViewSenderNames) { - conditions = append(conditions, `NOT EXISTS ( + // A message has an "empty sender name" only if it has no from-recipient name AND no direct sender_id with a name. + conditions = append(conditions, `(NOT EXISTS ( SELECT 1 FROM message_recipients mr_sn JOIN participants p_sn ON p_sn.id = mr_sn.participant_id WHERE mr_sn.message_id = m.id AND mr_sn.recipient_type = 'from' AND COALESCE(NULLIF(TRIM(p_sn.display_name), ''), p_sn.email_address) IS NOT NULL - )`) + ) AND NOT EXISTS ( + SELECT 1 FROM participants p_ds + WHERE p_ds.id = m.sender_id + AND COALESCE(NULLIF(TRIM(p_ds.display_name), ''), p_ds.email_address) IS NOT NULL + ))`) } // Recipient filter @@ -600,14 +616,17 @@ func (e *SQLiteEngine) ListMessages(ctx context.Context, filter MessageFilter) ( COALESCE(m.snippet, ''), COALESCE(p_sender.email_address, ''), COALESCE(p_sender.display_name, ''), + COALESCE(p_sender.phone_number, ''), m.sent_at, COALESCE(m.size_estimate, 0), m.has_attachments, m.attachment_count, - m.deleted_from_source_at + m.deleted_from_source_at, + COALESCE(m.message_type, ''), + COALESCE(conv.title, '') FROM messages m LEFT JOIN message_recipients mr_sender ON mr_sender.message_id = m.id AND mr_sender.recipient_type = 'from' - LEFT JOIN participants p_sender ON p_sender.id = mr_sender.participant_id + LEFT JOIN participants p_sender ON p_sender.id = COALESCE(mr_sender.participant_id, m.sender_id) LEFT JOIN conversations conv ON conv.id = m.conversation_id %s WHERE %s @@ -637,11 +656,14 @@ func (e *SQLiteEngine) ListMessages(ctx context.Context, filter MessageFilter) ( &msg.Snippet, &msg.FromEmail, &msg.FromName, + &msg.FromPhone, &sentAt, &msg.SizeEstimate, &msg.HasAttachments, &msg.AttachmentCount, &deletedAt, + &msg.MessageType, + &msg.ConversationTitle, ); err != nil { return nil, fmt.Errorf("scan message: %w", err) } @@ -879,24 +901,33 @@ func (e *SQLiteEngine) 
GetGmailIDsByFilter(ctx context.Context, filter MessageFi // Build JOIN clauses based on filter type var joins []string + // Scope to Gmail sources only — this function is used for Gmail-specific + // deletion/staging workflows and must not return WhatsApp or other source IDs. + joins = append(joins, `JOIN sources s_gmail ON s_gmail.id = m.source_id AND s_gmail.source_type = 'gmail'`) + if filter.Sender != "" { joins = append(joins, ` - JOIN message_recipients mr_from ON mr_from.message_id = m.id AND mr_from.recipient_type = 'from' - JOIN participants p_from ON p_from.id = mr_from.participant_id + LEFT JOIN message_recipients mr_from ON mr_from.message_id = m.id AND mr_from.recipient_type = 'from' + LEFT JOIN participants p_from ON p_from.id = mr_from.participant_id + LEFT JOIN participants p_ds ON p_ds.id = m.sender_id `) - conditions = append(conditions, "p_from.email_address = ?") - args = append(args, filter.Sender) + conditions = append(conditions, "(p_from.email_address = ? OR p_from.phone_number = ? OR p_ds.email_address = ? OR p_ds.phone_number = ?)") + args = append(args, filter.Sender, filter.Sender, filter.Sender, filter.Sender) } if filter.SenderName != "" { if filter.Sender == "" { joins = append(joins, ` - JOIN message_recipients mr_from ON mr_from.message_id = m.id AND mr_from.recipient_type = 'from' - JOIN participants p_from ON p_from.id = mr_from.participant_id + LEFT JOIN message_recipients mr_from ON mr_from.message_id = m.id AND mr_from.recipient_type = 'from' + LEFT JOIN participants p_from ON p_from.id = mr_from.participant_id + LEFT JOIN participants p_ds ON p_ds.id = m.sender_id `) } - conditions = append(conditions, "COALESCE(NULLIF(TRIM(p_from.display_name), ''), p_from.email_address) = ?") - args = append(args, filter.SenderName) + conditions = append(conditions, `( + COALESCE(NULLIF(TRIM(p_from.display_name), ''), p_from.email_address) = ? + OR COALESCE(NULLIF(TRIM(p_ds.display_name), ''), p_ds.email_address) = ? 
+ )`) + args = append(args, filter.SenderName, filter.SenderName) } if filter.Recipient != "" { @@ -1194,14 +1225,17 @@ func (e *SQLiteEngine) executeSearchQuery(ctx context.Context, conditions []stri COALESCE(m.snippet, ''), COALESCE(p_sender.email_address, ''), COALESCE(p_sender.display_name, ''), + COALESCE(p_sender.phone_number, ''), m.sent_at, COALESCE(m.size_estimate, 0), m.has_attachments, m.attachment_count, - m.deleted_from_source_at + m.deleted_from_source_at, + COALESCE(m.message_type, ''), + COALESCE(conv.title, '') FROM messages m LEFT JOIN message_recipients mr_sender ON mr_sender.message_id = m.id AND mr_sender.recipient_type = 'from' - LEFT JOIN participants p_sender ON p_sender.id = mr_sender.participant_id + LEFT JOIN participants p_sender ON p_sender.id = COALESCE(mr_sender.participant_id, m.sender_id) LEFT JOIN conversations conv ON conv.id = m.conversation_id %s %s @@ -1232,11 +1266,14 @@ func (e *SQLiteEngine) executeSearchQuery(ctx context.Context, conditions []stri &msg.Snippet, &msg.FromEmail, &msg.FromName, + &msg.FromPhone, &sentAt, &msg.SizeEstimate, &msg.HasAttachments, &msg.AttachmentCount, &deletedAt, + &msg.MessageType, + &msg.ConversationTitle, ); err != nil { return nil, fmt.Errorf("scan message: %w", err) } diff --git a/internal/query/testfixtures_test.go b/internal/query/testfixtures_test.go index c022fdfc..5a1b87ca 100644 --- a/internal/query/testfixtures_test.go +++ b/internal/query/testfixtures_test.go @@ -27,7 +27,10 @@ type MessageFixture struct { SentAt time.Time SizeEstimate int64 HasAttachments bool + AttachmentCount int DeletedAt *time.Time // nil = NULL + SenderID *int64 // nil = NULL (direct sender for WhatsApp/chat messages) + MessageType string // e.g. "email", "whatsapp" Year int Month int } @@ -36,6 +39,7 @@ type MessageFixture struct { type SourceFixture struct { ID int64 AccountEmail string + SourceType string // "gmail", "whatsapp", etc. Defaults to "gmail". 
} // ParticipantFixture defines a participant row for Parquet test data. @@ -44,6 +48,7 @@ type ParticipantFixture struct { Email string Domain string DisplayName string + PhoneNumber string // E.164 phone number (for WhatsApp/chat participants) } // RecipientFixture defines a message_recipients row for Parquet test data. @@ -77,6 +82,7 @@ type AttachmentFixture struct { type ConversationFixture struct { ID int64 SourceConversationID string + Title string // Group/chat name (for WhatsApp/chat conversations) } // --------------------------------------------------------------------------- @@ -119,9 +125,14 @@ func NewTestDataBuilder(t testing.TB) *TestDataBuilder { // AddSource adds a source and returns its ID. func (b *TestDataBuilder) AddSource(email string) int64 { + return b.AddSourceWithType(email, "gmail") +} + +// AddSourceWithType adds a source with a specific type and returns its ID. +func (b *TestDataBuilder) AddSourceWithType(email, sourceType string) int64 { id := b.nextSrcID b.nextSrcID++ - b.sources = append(b.sources, SourceFixture{ID: id, AccountEmail: email}) + b.sources = append(b.sources, SourceFixture{ID: id, AccountEmail: email, SourceType: sourceType}) return id } @@ -292,24 +303,36 @@ func (m MessageFixture) toSQL() string { if m.DeletedAt != nil { deletedAt = fmt.Sprintf("TIMESTAMP '%s'", m.DeletedAt.Format("2006-01-02 15:04:05")) } - return fmt.Sprintf("(%d::BIGINT, %d::BIGINT, %s, %d::BIGINT, %s, %s, TIMESTAMP '%s', %d::BIGINT, %v, %s, %d, %d)", + senderID := "NULL::BIGINT" + if m.SenderID != nil { + senderID = fmt.Sprintf("%d::BIGINT", *m.SenderID) + } + msgType := m.MessageType + if msgType == "" { + msgType = "email" + } + return fmt.Sprintf("(%d::BIGINT, %d::BIGINT, %s, %d::BIGINT, %s, %s, TIMESTAMP '%s', %d::BIGINT, %v, %d, %s, %s, %s, %d, %d)", m.ID, m.SourceID, sqlStr(m.SourceMessageID), m.ConversationID, sqlStr(m.Subject), sqlStr(m.Snippet), m.SentAt.Format("2006-01-02 15:04:05"), m.SizeEstimate, - m.HasAttachments, deletedAt, 
m.Year, m.Month, + m.HasAttachments, m.AttachmentCount, deletedAt, senderID, sqlStr(msgType), m.Year, m.Month, ) } func (b *TestDataBuilder) sourcesSQL() string { return joinRows(b.sources, func(s SourceFixture) string { - return fmt.Sprintf("(%d::BIGINT, %s)", s.ID, sqlStr(s.AccountEmail)) + st := s.SourceType + if st == "" { + st = "gmail" + } + return fmt.Sprintf("(%d::BIGINT, %s, %s)", s.ID, sqlStr(s.AccountEmail), sqlStr(st)) }) } func (b *TestDataBuilder) participantsSQL() string { return joinRows(b.participants, func(p ParticipantFixture) string { - return fmt.Sprintf("(%d::BIGINT, %s, %s, %s)", - p.ID, sqlStr(p.Email), sqlStr(p.Domain), sqlStr(p.DisplayName)) + return fmt.Sprintf("(%d::BIGINT, %s, %s, %s, %s)", + p.ID, sqlStr(p.Email), sqlStr(p.Domain), sqlStr(p.DisplayName), sqlStr(p.PhoneNumber)) }) } @@ -341,8 +364,8 @@ func (b *TestDataBuilder) attachmentsSQL() string { func (b *TestDataBuilder) conversationsSQL() string { return joinRows(b.conversations, func(c ConversationFixture) string { - return fmt.Sprintf("(%d::BIGINT, %s)", - c.ID, sqlStr(c.SourceConversationID)) + return fmt.Sprintf("(%d::BIGINT, %s, %s)", + c.ID, sqlStr(c.SourceConversationID), sqlStr(c.Title)) }) } @@ -352,14 +375,14 @@ func (b *TestDataBuilder) conversationsSQL() string { // column definitions (coupled to SQL generation methods above) const ( - messagesCols = "id, source_id, source_message_id, conversation_id, subject, snippet, sent_at, size_estimate, has_attachments, deleted_from_source_at, year, month" - sourcesCols = "id, account_email" - participantsCols = "id, email_address, domain, display_name" + messagesCols = "id, source_id, source_message_id, conversation_id, subject, snippet, sent_at, size_estimate, has_attachments, attachment_count, deleted_from_source_at, sender_id, message_type, year, month" + sourcesCols = "id, account_email, source_type" + participantsCols = "id, email_address, domain, display_name, phone_number" messageRecipientsCols = "message_id, 
participant_id, recipient_type, display_name" labelsCols = "id, name" messageLabelsCols = "message_id, label_id" attachmentsCols = "message_id, size, filename" - conversationsCols = "id, source_conversation_id" + conversationsCols = "id, source_conversation_id, title" ) // Build generates Parquet files from the accumulated data and returns the @@ -395,12 +418,12 @@ func (b *TestDataBuilder) addAuxiliaryTables(pb *parquetBuilder) { name, subdir, file, cols, dummy, sql string empty bool }{ - {"sources", "sources", "sources.parquet", sourcesCols, "(0::BIGINT, '')", b.sourcesSQL(), len(b.sources) == 0}, - {"participants", "participants", "participants.parquet", participantsCols, "(0::BIGINT, '', '', '')", b.participantsSQL(), len(b.participants) == 0}, + {"sources", "sources", "sources.parquet", sourcesCols, "(0::BIGINT, '', 'gmail')", b.sourcesSQL(), len(b.sources) == 0}, + {"participants", "participants", "participants.parquet", participantsCols, "(0::BIGINT, '', '', '', '')", b.participantsSQL(), len(b.participants) == 0}, {"message_recipients", "message_recipients", "message_recipients.parquet", messageRecipientsCols, "(0::BIGINT, 0::BIGINT, '', '')", b.recipientsSQL(), len(b.recipients) == 0}, {"labels", "labels", "labels.parquet", labelsCols, "(0::BIGINT, '')", b.labelsSQL(), len(b.labels) == 0}, {"message_labels", "message_labels", "message_labels.parquet", messageLabelsCols, "(0::BIGINT, 0::BIGINT)", b.messageLabelsSQL(), len(b.msgLabels) == 0}, - {"conversations", "conversations", "conversations.parquet", conversationsCols, "(0::BIGINT, '')", b.conversationsSQL(), len(b.conversations) == 0}, + {"conversations", "conversations", "conversations.parquet", conversationsCols, "(0::BIGINT, '', '')", b.conversationsSQL(), len(b.conversations) == 0}, } for _, a := range auxTables { if a.empty { diff --git a/internal/store/messages.go b/internal/store/messages.go index 1543ffd6..6b25ae7d 100644 --- a/internal/store/messages.go +++ b/internal/store/messages.go @@ -860,6 
+860,174 @@ func (s *Store) backfillFTSBatch(fromID, toID int64) (int64, error) { return result.RowsAffected() } +// EnsureConversationWithType gets or creates a conversation with an explicit conversation_type. +// Unlike EnsureConversation (which hardcodes 'email_thread'), this accepts the type as a parameter, +// making it suitable for WhatsApp and other messaging platforms. +func (s *Store) EnsureConversationWithType(sourceID int64, sourceConversationID, conversationType, title string) (int64, error) { + // Try to get existing + var id int64 + err := s.db.QueryRow(` + SELECT id FROM conversations + WHERE source_id = ? AND source_conversation_id = ? + `, sourceID, sourceConversationID).Scan(&id) + + if err == nil { + // Update conversation_type and title if they've changed. + // Only update title when the new value is non-empty (don't blank out existing titles). + if title != "" { + _, _ = s.db.Exec(` + UPDATE conversations SET conversation_type = ?, title = ?, updated_at = datetime('now') + WHERE id = ? AND (conversation_type != ? OR title != ? OR title IS NULL) + `, conversationType, title, id, conversationType, title) + } else { + _, _ = s.db.Exec(` + UPDATE conversations SET conversation_type = ?, updated_at = datetime('now') + WHERE id = ? AND conversation_type != ? + `, conversationType, id, conversationType) + } + return id, nil + } + if err != sql.ErrNoRows { + return 0, err + } + + // Create new + result, err := s.db.Exec(` + INSERT INTO conversations (source_id, source_conversation_id, conversation_type, title, created_at, updated_at) + VALUES (?, ?, ?, ?, datetime('now'), datetime('now')) + `, sourceID, sourceConversationID, conversationType, title) + if err != nil { + return 0, err + } + + return result.LastInsertId() +} + +// EnsureParticipantByPhone gets or creates a participant by phone number. +// Phone must start with "+" (E.164 format). Returns an error for empty or +// invalid phone numbers to prevent database pollution. 
+// Also creates a participant_identifiers row with identifier_type='whatsapp'. +func (s *Store) EnsureParticipantByPhone(phone, displayName string) (int64, error) { + if phone == "" { + return 0, fmt.Errorf("phone number is required") + } + if !strings.HasPrefix(phone, "+") { + return 0, fmt.Errorf("phone number must be in E.164 format (starting with +), got %q", phone) + } + + // Try to get existing by phone + var id int64 + err := s.db.QueryRow(` + SELECT id FROM participants WHERE phone_number = ? + `, phone).Scan(&id) + + if err == nil { + // Update display name if provided and currently empty + if displayName != "" { + _, _ = s.db.Exec(` + UPDATE participants SET display_name = ? + WHERE id = ? AND (display_name IS NULL OR display_name = '') + `, displayName, id) // best-effort display name update, ignore error + } + return id, nil + } + if err != sql.ErrNoRows { + return 0, err + } + + // Create new participant + result, err := s.db.Exec(` + INSERT INTO participants (phone_number, display_name, created_at, updated_at) + VALUES (?, ?, datetime('now'), datetime('now')) + `, phone, displayName) + if err != nil { + return 0, fmt.Errorf("insert participant: %w", err) + } + + id, err = result.LastInsertId() + if err != nil { + return 0, err + } + + // Also create a participant_identifiers row + _, err = s.db.Exec(` + INSERT OR IGNORE INTO participant_identifiers (participant_id, identifier_type, identifier_value, is_primary) + VALUES (?, 'whatsapp', ?, TRUE) + `, id, phone) + if err != nil { + return 0, fmt.Errorf("insert participant identifier: %w", err) + } + + return id, nil +} + +// UpdateParticipantDisplayNameByPhone updates the display_name for an existing +// participant identified by phone number. Only updates if display_name is currently +// empty. Returns true if a participant was found and updated, false if not found +// or name was already set. Does NOT create new participants. 
+func (s *Store) UpdateParticipantDisplayNameByPhone(phone, displayName string) (bool, error) { + if phone == "" || displayName == "" { + return false, nil + } + + result, err := s.db.Exec(` + UPDATE participants SET display_name = ?, updated_at = datetime('now') + WHERE phone_number = ? AND (display_name IS NULL OR display_name = '') + `, displayName, phone) + if err != nil { + return false, err + } + + rows, err := result.RowsAffected() + if err != nil { + return false, err + } + return rows > 0, nil +} + +// EnsureConversationParticipant adds a participant to a conversation. +// Uses INSERT OR IGNORE to be idempotent. +func (s *Store) EnsureConversationParticipant(conversationID, participantID int64, role string) error { + _, err := s.db.Exec(` + INSERT OR IGNORE INTO conversation_participants (conversation_id, participant_id, role, joined_at) + VALUES (?, ?, ?, datetime('now')) + `, conversationID, participantID, role) + return err +} + +// UpsertReaction inserts or ignores a reaction. +func (s *Store) UpsertReaction(messageID, participantID int64, reactionType, reactionValue string, createdAt time.Time) error { + _, err := s.db.Exec(` + INSERT OR IGNORE INTO reactions (message_id, participant_id, reaction_type, reaction_value, created_at) + VALUES (?, ?, ?, ?, ?) + `, messageID, participantID, reactionType, reactionValue, createdAt) + return err +} + +// UpsertMessageRawWithFormat stores compressed raw data with an explicit format. +// Unlike UpsertMessageRaw (which hardcodes 'mime'), this accepts the format as a parameter. 
+func (s *Store) UpsertMessageRawWithFormat(messageID int64, rawData []byte, format string) error { + // Compress with zlib + var compressed bytes.Buffer + w := zlib.NewWriter(&compressed) + if _, err := w.Write(rawData); err != nil { + return fmt.Errorf("compress: %w", err) + } + if err := w.Close(); err != nil { + return fmt.Errorf("close compressor: %w", err) + } + + _, err := s.db.Exec(` + INSERT INTO message_raw (message_id, raw_data, raw_format, compression) + VALUES (?, ?, ?, 'zlib') + ON CONFLICT(message_id) DO UPDATE SET + raw_data = excluded.raw_data, + raw_format = excluded.raw_format, + compression = excluded.compression + `, messageID, compressed.Bytes(), format) + return err +} + // UpsertAttachment stores an attachment record. func (s *Store) UpsertAttachment(messageID int64, filename, mimeType, storagePath, contentHash string, size int) error { // Check if attachment already exists (by message_id and content_hash) diff --git a/internal/store/store.go b/internal/store/store.go index 7e71f3e7..f55b164a 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -260,6 +260,14 @@ func (s *Store) InitSchema() error { {`ALTER TABLE sources ADD COLUMN sync_config JSON`, "sync_config"}, {`ALTER TABLE messages ADD COLUMN rfc822_message_id TEXT`, "rfc822_message_id"}, {`ALTER TABLE sources ADD COLUMN oauth_app TEXT`, "oauth_app"}, + {`ALTER TABLE participants ADD COLUMN phone_number TEXT`, "phone_number"}, + {`ALTER TABLE participants ADD COLUMN canonical_id TEXT`, "canonical_id"}, + {`ALTER TABLE messages ADD COLUMN sender_id INTEGER REFERENCES participants(id)`, "sender_id"}, + {`ALTER TABLE messages ADD COLUMN message_type TEXT NOT NULL DEFAULT 'email'`, "message_type"}, + {`ALTER TABLE messages ADD COLUMN attachment_count INTEGER DEFAULT 0`, "attachment_count"}, + {`ALTER TABLE messages ADD COLUMN deleted_from_source_at DATETIME`, "deleted_from_source_at"}, + {`ALTER TABLE messages ADD COLUMN delete_batch_id TEXT`, "delete_batch_id"}, + {`ALTER 
TABLE conversations ADD COLUMN title TEXT`, "title"}, } { if _, err := s.db.Exec(m.sql); err != nil { if !isSQLiteError(err, "duplicate column name") { diff --git a/internal/textutil/encoding.go b/internal/textutil/encoding.go index bafc04b8..fb0a9033 100644 --- a/internal/textutil/encoding.go +++ b/internal/textutil/encoding.go @@ -147,3 +147,85 @@ func FirstLine(s string) string { } return TruncateRunes(s, 200) } + +// SanitizeTerminal strips ANSI escape sequences and C0/C1 control characters +// from a string, preventing terminal injection via untrusted data (e.g., +// WhatsApp chat names, message snippets). Preserves printable characters +// and tabs. Replaces \r and \n with spaces to prevent line overwrite/break +// attacks in single-line contexts (TUI rows, progress output). +// +// C1 control characters (U+0080–U+009F) are checked on the decoded rune, not +// the raw leading byte, so that UTF-8 encoded C1 chars (e.g., U+009B CSI +// encoded as 0xC2 0x9B) are correctly stripped. +func SanitizeTerminal(s string) string { + var b strings.Builder + b.Grow(len(s)) + i := 0 + for i < len(s) { + c := s[i] + // Strip ESC-initiated sequences (CSI, OSC, etc.). + if c == 0x1b && i+1 < len(s) { + next := s[i+1] + switch next { + case '[': // CSI sequence: ESC [ ... + i += 2 + for i < len(s) && (s[i] < 0x40 || s[i] > 0x7E) { + i++ + } + if i < len(s) { + i++ // skip final byte + } + continue + case ']': // OSC sequence: ESC ] ... (ST or BEL) + i += 2 + for i < len(s) { + if s[i] == 0x07 { // BEL terminates OSC + i++ + break + } + if s[i] == 0x1b && i+1 < len(s) && s[i+1] == '\\' { // ST terminates OSC + i += 2 + break + } + i++ + } + continue + default: // Other ESC sequences (2-byte): skip both + i += 2 + continue + } + } + + // Decode the full rune so we can check C1 control characters that + // span multiple bytes in UTF-8 (e.g., U+009B is 0xC2 0x9B). + r, size := utf8.DecodeRuneInString(s[i:]) + if r == utf8.RuneError && size == 1 { + // Invalid UTF-8 byte — skip it. 
+ i++ + continue + } + + // Allow tab; strip newline and carriage return (all callers use this + // in single-line contexts such as TUI rows and progress output where + // \r can overwrite lines and \n can break layout). + if r == '\t' { + b.WriteRune(r) + i += size + continue + } + if r == '\n' || r == '\r' { + // Replace with space to preserve word boundaries. + b.WriteByte(' ') + i += size + continue + } + if r < 0x20 || (r >= 0x7f && r <= 0x9f) { + i += size + continue + } + + b.WriteString(s[i : i+size]) + i += size + } + return b.String() +} diff --git a/internal/textutil/encoding_test.go b/internal/textutil/encoding_test.go index 16f5faf1..0237befe 100644 --- a/internal/textutil/encoding_test.go +++ b/internal/textutil/encoding_test.go @@ -500,3 +500,37 @@ func TestFirstLine(t *testing.T) { }) } } + +func TestSanitizeTerminal(t *testing.T) { + tests := []struct { + name string + input string + want string + }{ + {"plain text", "Hello World", "Hello World"}, + {"preserves tabs", "col1\tcol2", "col1\tcol2"}, + {"replaces newlines with spaces", "line1\nline2", "line1 line2"}, + {"replaces CR with space", "over\rwrite", "over write"}, + {"strips CSI color", "\x1b[31mred\x1b[0m", "red"}, + {"strips CSI cursor move", "\x1b[2Ahello", "hello"}, + {"strips OSC title (BEL)", "\x1b]0;evil title\x07safe", "safe"}, + {"strips OSC title (ST)", "\x1b]0;evil\x1b\\safe", "safe"}, + {"strips BEL", "\x07beep", "beep"}, + {"strips null bytes", "a\x00b", "ab"}, + {"strips C1 control byte", "a\x8fb", "ab"}, + {"strips UTF-8 encoded C1 CSI (U+009B)", "a\xc2\x9bb", "ab"}, + {"strips UTF-8 encoded C1 0x80-0x9F range", "a\xc2\x80z\xc2\x9fb", "azb"}, + {"preserves unicode", "café ☕ 日本語", "café ☕ 日本語"}, + {"strips embedded ESC seq", "before\x1b[1;32mgreen\x1b[0mafter", "beforegreenafter"}, + {"empty string", "", ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := SanitizeTerminal(tt.input) + if got != tt.want { + t.Errorf("SanitizeTerminal(%q) = %q, 
want %q", tt.input, got, tt.want) + } + }) + } +} diff --git a/internal/tui/view.go b/internal/tui/view.go index eae7f8af..57f3a536 100644 --- a/internal/tui/view.go +++ b/internal/tui/view.go @@ -6,6 +6,7 @@ import ( "github.com/charmbracelet/lipgloss" "github.com/wesm/msgvault/internal/query" + "github.com/wesm/msgvault/internal/textutil" ) // Monochrome theme - adaptive for light and dark terminals @@ -527,16 +528,34 @@ func (m Model) messageListView() string { date := msg.SentAt.Format("2006-01-02 15:04") // Format from (rune-aware for international names) - from := msg.FromEmail + // Sanitize untrusted metadata to prevent terminal control-sequence injection. + from := textutil.SanitizeTerminal(msg.FromEmail) if msg.FromName != "" { - from = msg.FromName + from = textutil.SanitizeTerminal(msg.FromName) + } + // For chat messages: fall back to phone number, then group title + if from == "" && msg.FromPhone != "" { + from = textutil.SanitizeTerminal(msg.FromPhone) + } + if from == "" && msg.ConversationTitle != "" { + from = textutil.SanitizeTerminal(msg.ConversationTitle) } from = truncateRunes(from, fromWidth) from = fmt.Sprintf("%-*s", fromWidth, from) from = highlightTerms(from, m.searchQuery) // Format subject with indicators (rune-aware) - subject := msg.Subject + // For chat messages without a subject, show snippet or group title + subject := textutil.SanitizeTerminal(msg.Subject) + if subject == "" && msg.MessageType == "whatsapp" { + title := textutil.SanitizeTerminal(msg.ConversationTitle) + snippet := textutil.SanitizeTerminal(msg.Snippet) + if title != "" { + subject = title + ": " + snippet + } else { + subject = snippet + } + } if msg.DeletedAt != nil { subject = "🗑 " + subject // Deleted from server indicator } @@ -878,12 +897,19 @@ func (m Model) threadView() string { dateStr := msg.SentAt.Format("2006-01-02 15:04") // Format from/subject with deleted indicator - fromSubject := msg.FromEmail + // Sanitize untrusted metadata to prevent terminal 
control-sequence injection. + fromSubject := textutil.SanitizeTerminal(msg.FromEmail) if msg.FromName != "" { - fromSubject = msg.FromName + fromSubject = textutil.SanitizeTerminal(msg.FromName) + } + // For chat messages: fall back to phone number + if fromSubject == "" && msg.FromPhone != "" { + fromSubject = textutil.SanitizeTerminal(msg.FromPhone) } if msg.Subject != "" { - fromSubject = truncateRunes(fromSubject, 18) + ": " + msg.Subject + fromSubject = truncateRunes(fromSubject, 18) + ": " + textutil.SanitizeTerminal(msg.Subject) + } else if msg.MessageType == "whatsapp" && msg.Snippet != "" { + fromSubject = truncateRunes(fromSubject, 18) + ": " + textutil.SanitizeTerminal(msg.Snippet) } if msg.DeletedAt != nil { fromSubject = "🗑 " + fromSubject // Deleted from server indicator diff --git a/internal/whatsapp/contacts.go b/internal/whatsapp/contacts.go new file mode 100644 index 00000000..5219b33f --- /dev/null +++ b/internal/whatsapp/contacts.go @@ -0,0 +1,243 @@ +package whatsapp + +import ( + "bufio" + "fmt" + "os" + "regexp" + "strings" + + "github.com/wesm/msgvault/internal/store" +) + +// vcardContact represents a parsed contact from a vCard file. +type vcardContact struct { + FullName string + Phones []string // normalized to E.164 +} + +// ImportContacts reads a .vcf file and updates participant display names +// for any phone numbers that match existing participants in the store. +// Only updates existing participants — does not create new ones. +// Returns the number of existing participants whose names were updated. 
+func ImportContacts(s *store.Store, vcfPath string) (matched, total int, err error) { + contacts, err := parseVCardFile(vcfPath) + if err != nil { + return 0, 0, fmt.Errorf("parse vcard: %w", err) + } + + total = len(contacts) + var errCount int + for _, c := range contacts { + if c.FullName == "" { + continue + } + for _, phone := range c.Phones { + if phone == "" { + continue + } + // Only update display_name for participants that already exist. + // Does not create new participants — those are created during message import. + updated, updateErr := s.UpdateParticipantDisplayNameByPhone(phone, c.FullName) + if updateErr != nil { + errCount++ + continue + } + if updated { + matched++ + } + } + } + + if errCount > 0 { + return matched, total, fmt.Errorf("contact import completed with %d database errors", errCount) + } + + return matched, total, nil +} + +// parseVCardFile reads a .vcf file and returns parsed contacts. +// Handles vCard 2.1 and 3.0 formats, including RFC 2425 line folding +// and QUOTED-PRINTABLE encoded values. +func parseVCardFile(path string) ([]vcardContact, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer func() { _ = f.Close() }() + + // Read all lines and unfold continuation lines (RFC 2425: lines starting + // with a space or tab are continuations of the previous line). + var rawLines []string + scanner := bufio.NewScanner(f) + scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) + + for scanner.Scan() { + line := scanner.Text() + if len(line) > 0 && (line[0] == ' ' || line[0] == '\t') { + // Continuation line — append to previous. + if len(rawLines) > 0 { + rawLines[len(rawLines)-1] += strings.TrimLeft(line, " \t") + continue + } + } + rawLines = append(rawLines, line) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("scan vcard: %w", err) + } + + // Handle QUOTED-PRINTABLE soft line breaks: a trailing '=' means the + // value continues on the next line (vCard 2.1 convention). 
The scanner + // already consumed the newline, so we rejoin here. + var qpJoined []string + for i := 0; i < len(rawLines); i++ { + line := rawLines[i] + for strings.HasSuffix(line, "=") && i+1 < len(rawLines) { + line = line[:len(line)-1] + rawLines[i+1] + i++ + } + qpJoined = append(qpJoined, line) + } + rawLines = qpJoined + + var contacts []vcardContact + var current *vcardContact + + for _, line := range rawLines { + line = strings.TrimSpace(line) + // vCard field names are case-insensitive (RFC 2426). + // Uppercase the key portion for matching, but preserve original value bytes. + upper := strings.ToUpper(line) + + switch { + case upper == "BEGIN:VCARD": + current = &vcardContact{} + + case upper == "END:VCARD": + if current != nil && (current.FullName != "" || len(current.Phones) > 0) { + contacts = append(contacts, *current) + } + current = nil + + case current == nil: + continue + + case strings.HasPrefix(upper, "FN:") || strings.HasPrefix(upper, "FN;"): + // FN (formatted name) — preferred over N because it's the display name. + name := extractVCardValue(line) + if isQuotedPrintable(line) { + name = decodeQuotedPrintable(name) + } + if name != "" { + current.FullName = name + } + + case strings.HasPrefix(upper, "TEL"): + // TEL;CELL:+447... or TEL;TYPE=CELL:+447... or TEL:+447... + raw := extractVCardValue(line) + phone := normalizeVCardPhone(raw) + if phone != "" { + current.Phones = append(current.Phones, phone) + } + } + } + + return contacts, nil +} + +// extractVCardValue extracts the value part from a vCard line. +// Handles both "KEY:value" and "KEY;params:value" formats. +func extractVCardValue(line string) string { + // Find the first colon that separates key from value. + idx := strings.Index(line, ":") + if idx < 0 { + return "" + } + return strings.TrimSpace(line[idx+1:]) +} + +// isQuotedPrintable returns true if a vCard line indicates QUOTED-PRINTABLE encoding. 
+func isQuotedPrintable(line string) bool { + upper := strings.ToUpper(line) + return strings.Contains(upper, "ENCODING=QUOTED-PRINTABLE") || + strings.Contains(upper, ";QUOTED-PRINTABLE") +} + +// decodeQuotedPrintable decodes a QUOTED-PRINTABLE encoded string. +// Handles =XX hex sequences (e.g., =C3=A9 → é). +func decodeQuotedPrintable(s string) string { + var b strings.Builder + b.Grow(len(s)) + for i := 0; i < len(s); i++ { + if s[i] == '=' && i+2 < len(s) { + hi := unhex(s[i+1]) + lo := unhex(s[i+2]) + if hi >= 0 && lo >= 0 { + b.WriteByte(byte(hi<<4 | lo)) + i += 2 + continue + } + } + b.WriteByte(s[i]) + } + return b.String() +} + +// unhex returns the numeric value of a hex digit, or -1 if invalid. +func unhex(c byte) int { + switch { + case c >= '0' && c <= '9': + return int(c - '0') + case c >= 'A' && c <= 'F': + return int(c - 'A' + 10) + case c >= 'a' && c <= 'f': + return int(c - 'a' + 10) + default: + return -1 + } +} + +// nonDigitRe matches any non-digit character. +var nonDigitRe = regexp.MustCompile(`[^\d]`) + +// normalizeVCardPhone normalizes a phone number from a vCard to E.164 format. +// Handles various formats: +447..., 003-362-..., 077-380-06043, etc. +func normalizeVCardPhone(raw string) string { + raw = strings.TrimSpace(raw) + if raw == "" { + return "" + } + + // Check if it starts with + (already has country code). + hasPlus := strings.HasPrefix(raw, "+") + + // Strip trunk prefix "(0)" before digit extraction. + // Common in UK/European numbers: +44 (0)7700 means +447700, not +4407700. + raw = strings.ReplaceAll(raw, "(0)", "") + + // Strip everything except digits. + digits := nonDigitRe.ReplaceAllString(raw, "") + if digits == "" { + return "" + } + + // If originally had +, it's already E.164-ish. + if hasPlus { + return "+" + digits + } + + // Handle 00-prefixed international format (e.g., 003-362-4921221 → +33624921221). 
+ if strings.HasPrefix(digits, "00") && len(digits) > 4 { + return "+" + digits[2:] + } + + // Local numbers starting with 0 (e.g., 07738006043) are country-specific + // and cannot be reliably normalized without knowing the country code. + // Skip these rather than hardcoding a country assumption. + + // Without an explicit country code indicator (+ or 00), we cannot + // reliably determine the country code. Skip ambiguous numbers rather + // than guessing — a wrong prefix would match the wrong participant. + return "" +} diff --git a/internal/whatsapp/contacts_test.go b/internal/whatsapp/contacts_test.go new file mode 100644 index 00000000..584e60e6 --- /dev/null +++ b/internal/whatsapp/contacts_test.go @@ -0,0 +1,241 @@ +package whatsapp + +import ( + "os" + "path/filepath" + "testing" +) + +func TestNormalizeVCardPhone(t *testing.T) { + tests := []struct { + raw string + want string + }{ + // Already E.164 + {"+447700900000", "+447700900000"}, + {"+12025551234", "+12025551234"}, + {"+33624921221", "+33624921221"}, + + // With dashes/spaces + {"+44 7700 900000", "+447700900000"}, + {"+1-202-555-1234", "+12025551234"}, + + // Trunk prefix (0) — common in UK/European numbers + {"+44 (0)7700 900000", "+447700900000"}, + {"+44(0)20 7123 4567", "+442071234567"}, + + // 00 prefix (international) + {"003-362-4921221", "+33624921221"}, + {"0033624921221", "+33624921221"}, + {"004-479-35975580", "+447935975580"}, + + // 0 prefix (local) — skipped, country-ambiguous + {"011-585-73843", ""}, + {"07738006043", ""}, + {"077-380-06043", ""}, + + // No explicit country code indicator — ambiguous, skip + {"447700900000", ""}, + {"2025551234", ""}, + + // Empty/invalid + {"", ""}, + {" ", ""}, + {"abc", ""}, + {"12", ""}, // too short + } + + for _, tt := range tests { + got := normalizeVCardPhone(tt.raw) + if got != tt.want { + t.Errorf("normalizeVCardPhone(%q) = %q, want %q", tt.raw, got, tt.want) + } + } +} + +func TestParseVCardFile(t *testing.T) { + // Write a test vCard 
file. + vcf := `BEGIN:VCARD +VERSION:2.1 +N:McGregor;Alastair;;; +FN:Alastair McGregor +TEL;CELL:+447984959428 +END:VCARD +BEGIN:VCARD +VERSION:2.1 +N:France;Geoff;;; +FN:Geoff France +TEL;X-Mobile:+33562645735 +END:VCARD +BEGIN:VCARD +VERSION:2.1 +N:Studios;Claire Mohacek -;Amazon;; +FN:Claire Mohacek - Amazon Studios +TEL;CELL:077-380-06043 +END:VCARD +BEGIN:VCARD +VERSION:2.1 +TEL;CELL: +END:VCARD +BEGIN:VCARD +VERSION:3.0 +FN:Multi Phone Person +TEL;TYPE=CELL:+447700900001 +TEL;TYPE=WORK:+442071234567 +END:VCARD +` + dir := t.TempDir() + path := filepath.Join(dir, "test.vcf") + if err := os.WriteFile(path, []byte(vcf), 0644); err != nil { + t.Fatal(err) + } + + contacts, err := parseVCardFile(path) + if err != nil { + t.Fatalf("parseVCardFile() error: %v", err) + } + + if len(contacts) != 4 { // 4 with names/phones, 1 empty entry skipped + t.Fatalf("got %d contacts, want 4", len(contacts)) + } + + // First contact + if contacts[0].FullName != "Alastair McGregor" { + t.Errorf("contact 0 name = %q, want %q", contacts[0].FullName, "Alastair McGregor") + } + if len(contacts[0].Phones) != 1 || contacts[0].Phones[0] != "+447984959428" { + t.Errorf("contact 0 phones = %v, want [+447984959428]", contacts[0].Phones) + } + + // Third contact — local number (0-prefix) is now skipped (country-ambiguous) + if contacts[2].FullName != "Claire Mohacek - Amazon Studios" { + t.Errorf("contact 2 name = %q", contacts[2].FullName) + } + if len(contacts[2].Phones) != 0 { + t.Errorf("contact 2 phones = %v, want [] (local numbers skipped)", contacts[2].Phones) + } + + // Multi phone contact + if contacts[3].FullName != "Multi Phone Person" { + t.Errorf("contact 3 name = %q", contacts[3].FullName) + } + if len(contacts[3].Phones) != 2 { + t.Errorf("contact 3 phone count = %d, want 2", len(contacts[3].Phones)) + } +} + +func TestParseVCardFile_FoldedAndEncoded(t *testing.T) { + // Test RFC 2425 line folding and QUOTED-PRINTABLE encoding. 
+ vcf := "BEGIN:VCARD\r\n" + + "VERSION:2.1\r\n" + + "FN:José\r\n" + + " García\r\n" + // folded continuation line + "TEL;CELL:+34\r\n" + + " 612345678\r\n" + // folded phone + "END:VCARD\r\n" + + "BEGIN:VCARD\r\n" + + "VERSION:2.1\r\n" + + "FN;ENCODING=QUOTED-PRINTABLE:Ren=C3=A9 Dupont\r\n" + + "TEL;CELL:+33612345678\r\n" + + "END:VCARD\r\n" + + dir := t.TempDir() + path := filepath.Join(dir, "folded.vcf") + if err := os.WriteFile(path, []byte(vcf), 0644); err != nil { + t.Fatal(err) + } + + contacts, err := parseVCardFile(path) + if err != nil { + t.Fatalf("parseVCardFile() error: %v", err) + } + + if len(contacts) != 2 { + t.Fatalf("got %d contacts, want 2", len(contacts)) + } + + // Folded name (RFC 2425: leading whitespace is stripped, content concatenated) + if contacts[0].FullName != "JoséGarcía" { + t.Errorf("folded name = %q, want %q", contacts[0].FullName, "JoséGarcía") + } + if len(contacts[0].Phones) != 1 || contacts[0].Phones[0] != "+34612345678" { + t.Errorf("folded phone = %v, want [+34612345678]", contacts[0].Phones) + } + + // QUOTED-PRINTABLE encoded name + if contacts[1].FullName != "René Dupont" { + t.Errorf("QP name = %q, want %q", contacts[1].FullName, "René Dupont") + } +} + +func TestParseVCardFile_QPSoftBreaks(t *testing.T) { + // Test QUOTED-PRINTABLE soft line breaks (= at end of line). + // vCard 2.1 uses = at EOL to wrap long QP values across lines. + vcf := "BEGIN:VCARD\r\n" + + "VERSION:2.1\r\n" + + "FN;ENCODING=QUOTED-PRINTABLE:Jo=C3=A3o da =\r\n" + + "Silva\r\n" + + "TEL;CELL:+5511999887766\r\n" + + "END:VCARD\r\n" + + dir := t.TempDir() + path := filepath.Join(dir, "qp-soft.vcf") + if err := os.WriteFile(path, []byte(vcf), 0644); err != nil { + t.Fatal(err) + } + + contacts, err := parseVCardFile(path) + if err != nil { + t.Fatalf("parseVCardFile() error: %v", err) + } + + if len(contacts) != 1 { + t.Fatalf("got %d contacts, want 1", len(contacts)) + } + + // Soft break should be stripped, continuation joined, then QP decoded. 
+ want := "João da Silva" + if contacts[0].FullName != want { + t.Errorf("QP soft break name = %q, want %q", contacts[0].FullName, want) + } +} + +func TestDecodeQuotedPrintable(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"hello", "hello"}, + {"Ren=C3=A9", "René"}, + {"=C3=A9=C3=A8", "éè"}, + {"no=encoding", "no=encoding"}, // invalid hex after = — kept as-is + {"end=", "end="}, // trailing = — kept as-is + } + for _, tt := range tests { + got := decodeQuotedPrintable(tt.input) + if got != tt.want { + t.Errorf("decodeQuotedPrintable(%q) = %q, want %q", tt.input, got, tt.want) + } + } +} + +func TestExtractVCardValue(t *testing.T) { + tests := []struct { + line string + want string + }{ + {"FN:John Doe", "John Doe"}, + {"FN;CHARSET=UTF-8:John Doe", "John Doe"}, + {"TEL;CELL:+447700900000", "+447700900000"}, + {"TEL;TYPE=CELL:+447700900000", "+447700900000"}, + {"TEL:+447700900000", "+447700900000"}, + {"NO_COLON", ""}, + } + + for _, tt := range tests { + got := extractVCardValue(tt.line) + if got != tt.want { + t.Errorf("extractVCardValue(%q) = %q, want %q", tt.line, got, tt.want) + } + } +} diff --git a/internal/whatsapp/importer.go b/internal/whatsapp/importer.go new file mode 100644 index 00000000..9fa005a8 --- /dev/null +++ b/internal/whatsapp/importer.go @@ -0,0 +1,720 @@ +package whatsapp + +import ( + "context" + "crypto/sha256" + "database/sql" + "encoding/json" + "fmt" + "io" + "net/url" + "os" + "path/filepath" + "strings" + "time" + + "github.com/wesm/msgvault/internal/store" +) + +// Importer handles importing WhatsApp messages from a decrypted msgstore.db +// into the msgvault store. +type Importer struct { + store *store.Store + progress ImportProgress +} + +// NewImporter creates a new WhatsApp importer. 
+func NewImporter(s *store.Store, progress ImportProgress) *Importer { + if progress == nil { + progress = NullProgress{} + } + return &Importer{ + store: s, + progress: progress, + } +} + +// Import performs the full WhatsApp import from a decrypted msgstore.db. +func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOptions) (*ImportSummary, error) { + startTime := time.Now() + summary := &ImportSummary{} + + // Open WhatsApp DB read-only. + // Use file: URI to safely handle paths containing '?' or other special characters. + dsn := (&url.URL{ + Scheme: "file", + OmitHost: true, + Path: waDBPath, + RawQuery: "mode=ro&_journal_mode=WAL&_busy_timeout=5000", + }).String() + waDB, err := sql.Open("sqlite3", dsn) + if err != nil { + return nil, fmt.Errorf("open whatsapp db: %w", err) + } + defer func() { _ = waDB.Close() }() + + // Verify it's a valid WhatsApp DB. + if err := verifyWhatsAppDB(waDB); err != nil { + return nil, err + } + + // Create or get the WhatsApp source. + source, err := imp.store.GetOrCreateSource("whatsapp", opts.Phone) + if err != nil { + return nil, fmt.Errorf("get or create source: %w", err) + } + + if opts.DisplayName != "" { + _ = imp.store.UpdateSourceDisplayName(source.ID, opts.DisplayName) + } + + // Start a sync run for tracking. + syncID, err := imp.store.StartSync(source.ID, "whatsapp_import") + if err != nil { + return nil, fmt.Errorf("start sync: %w", err) + } + + // Ensure we complete/fail the sync run on exit. + var syncErr error + defer func() { + if syncErr != nil { + _ = imp.store.FailSync(syncID, syncErr.Error()) + } else { + _ = imp.store.CompleteSync(syncID, "") + } + }() + + imp.progress.OnStart() + + // Create participant for the phone owner (self). 
+ selfParticipantID, err := imp.store.EnsureParticipantByPhone(opts.Phone, opts.DisplayName) + if err != nil { + syncErr = err + return nil, fmt.Errorf("ensure self participant: %w", err) + } + summary.Participants++ + + // Fetch all chats from WhatsApp DB. + chats, err := fetchChats(waDB) + if err != nil { + syncErr = err + return nil, fmt.Errorf("fetch chats: %w", err) + } + + // Load lid → phone mapping for resolving "lid" senders. + lidMap, err := fetchLidMap(waDB) + if err != nil { + syncErr = err + return nil, fmt.Errorf("fetch lid map: %w", err) + } + + batchSize := opts.BatchSize + if batchSize <= 0 { + batchSize = 1000 + } + + // Track key_id → message_id for reply threading within each chat. + // Scoped per chat to bound memory; cross-chat quotes won't thread + // but that's rare and the quoted text is still in the message body. + keyIDToMsgID := make(map[string]int64) + + totalLimit := opts.Limit + totalAdded := int64(0) + + for _, chat := range chats { + // Clear reply map per chat to prevent unbounded growth. + clear(keyIDToMsgID) + if ctx.Err() != nil { + syncErr = ctx.Err() + return nil, ctx.Err() + } + + // Check global limit. + if totalLimit > 0 && totalAdded >= int64(totalLimit) { + break + } + + summary.ChatsProcessed++ + + // Map chat to conversation. + sourceConvID, convType, title := mapConversation(chat) + conversationID, err := imp.store.EnsureConversationWithType(source.ID, sourceConvID, convType, title) + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("ensure conversation %s: %w", sourceConvID, err)) + continue + } + + imp.progress.OnChatStart(chat.RawString, chatTitle(chat), 0) + + // For direct chats: add the remote participant. + if !isGroupChat(chat) && chat.User != "" { + phone := normalizePhone(chat.User, chat.Server) + if phone == "" { + // Non-phone JID (e.g., lid:..., broadcast) — skip. 
+ } else if participantID, err := imp.store.EnsureParticipantByPhone(phone, ""); err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("ensure participant %s: %w", phone, err)) + } else { + summary.Participants++ + _ = imp.store.EnsureConversationParticipant(conversationID, participantID, "member") + _ = imp.store.EnsureConversationParticipant(conversationID, selfParticipantID, "member") + } + } + + // For group chats: add all group participants. + if isGroupChat(chat) { + members, err := fetchGroupParticipants(waDB, chat.RawString) + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("fetch group participants for %s: %w", sourceConvID, err)) + } else { + for _, member := range members { + phone := normalizePhone(member.MemberUser, member.MemberServer) + if phone == "" { + continue // Non-phone JID — skip. + } + participantID, err := imp.store.EnsureParticipantByPhone(phone, "") + if err != nil { + summary.Errors++ + continue + } + summary.Participants++ + role := mapGroupRole(member.Admin) + _ = imp.store.EnsureConversationParticipant(conversationID, participantID, role) + } + } + } + + // Track resolved sender participant IDs for participant fallback. + // After the message loop, we ensure each sender is a conversation + // participant — covers groups where group_participants is empty. + chatSenderIDs := make(map[int64]struct{}) + + // Process messages in batches. + chatAdded := int64(0) + afterID := int64(0) + + for { + if ctx.Err() != nil { + syncErr = ctx.Err() + return nil, ctx.Err() + } + + // Check global limit for this batch. 
+ remaining := batchSize + if totalLimit > 0 { + left := int64(totalLimit) - totalAdded + if left <= 0 { + break + } + if left < int64(remaining) { + remaining = int(left) + } + } + + messages, err := fetchMessages(waDB, chat.RowID, afterID, remaining) + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("fetch messages for chat %s after %d: %w", sourceConvID, afterID, err)) + break + } + + if len(messages) == 0 { + break + } + + // Collect message row IDs for batch media/reaction/quote lookups. + msgRowIDs := make([]int64, len(messages)) + for i, m := range messages { + msgRowIDs[i] = m.RowID + } + + // Batch-fetch media, reactions, and quotes. + mediaMap, err := fetchMedia(waDB, msgRowIDs) + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("fetch media: %w", err)) + mediaMap = make(map[int64]waMedia) + } + + reactionMap, err := fetchReactions(waDB, msgRowIDs) + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("fetch reactions: %w", err)) + reactionMap = make(map[int64][]waReaction) + } + + quotedMap, err := fetchQuotedMessages(waDB, msgRowIDs) + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("fetch quoted messages: %w", err)) + quotedMap = make(map[int64]waQuoted) + } + + for _, waMsg := range messages { + summary.MessagesProcessed++ + afterID = waMsg.RowID + + // Skip system messages and calls. + if isSkippedType(waMsg.MessageType) { + summary.MessagesSkipped++ + continue + } + + // Skip messages with empty key_id — they can't be uniquely + // identified for upsert and would collide with each other. + if waMsg.KeyID == "" { + summary.MessagesSkipped++ + continue + } + + // Resolve sender. 
+ var senderID sql.NullInt64 + if waMsg.FromMe == 1 { + senderID = sql.NullInt64{Int64: selfParticipantID, Valid: true} + } else if waMsg.SenderServer.Valid && waMsg.SenderServer.String == "lid" { + // Lid JID — resolve via jid_map before trying normalizePhone, + // because lid user strings can be 15 digits and pass E.164 + // validation despite not being real phone numbers. + phone := resolveLidSender(waMsg.SenderJIDRowID, waMsg.SenderServer.String, lidMap) + if phone != "" { + pid, err := imp.store.EnsureParticipantByPhone(phone, "") + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("ensure sender participant %s: %w", phone, err)) + } else { + senderID = sql.NullInt64{Int64: pid, Valid: true} + } + } + } else if waMsg.SenderUser.Valid && waMsg.SenderUser.String != "" { + phone := normalizePhone(waMsg.SenderUser.String, waMsg.SenderServer.String) + if phone != "" { + pid, err := imp.store.EnsureParticipantByPhone(phone, "") + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("ensure sender participant %s: %w", phone, err)) + } else { + senderID = sql.NullInt64{Int64: pid, Valid: true} + } + } + } else if !isGroupChat(chat) && waMsg.FromMe == 0 { + // In a direct chat, the other person is the sender. + phone := normalizePhone(chat.User, chat.Server) + if phone != "" { + pid, err := imp.store.EnsureParticipantByPhone(phone, "") + if err == nil { + senderID = sql.NullInt64{Int64: pid, Valid: true} + } + } + } + + // Track sender for participant fallback. + if senderID.Valid { + chatSenderIDs[senderID.Int64] = struct{}{} + } + + // Build and upsert the message. + msg := mapMessage(waMsg, conversationID, source.ID, senderID) + messageID, err := imp.store.UpsertMessage(&msg) + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("upsert message %s: %w", waMsg.KeyID, err)) + continue + } + + // Track for reply threading. 
+ keyIDToMsgID[waMsg.KeyID] = messageID + + summary.MessagesAdded++ + chatAdded++ + totalAdded++ + + // Store message body. + bodyText := sql.NullString{} + if waMsg.TextData.Valid && waMsg.TextData.String != "" { + bodyText = waMsg.TextData + } + // Check for media caption as additional body text. + if media, ok := mediaMap[waMsg.RowID]; ok { + if media.MediaCaption.Valid && media.MediaCaption.String != "" { + if bodyText.Valid && bodyText.String != "" { + // Append caption to body. + bodyText.String += "\n\n" + media.MediaCaption.String + } else { + bodyText = media.MediaCaption + } + } + } + if bodyText.Valid { + _ = imp.store.UpsertMessageBody(messageID, bodyText, sql.NullString{}) + } + + // Store raw JSON for re-parsing. + rawJSON, err := json.Marshal(waMsg) + if err == nil { + _ = imp.store.UpsertMessageRawWithFormat(messageID, rawJSON, "whatsapp_json") + } + + // Handle media/attachments. + if media, ok := mediaMap[waMsg.RowID]; ok { + summary.AttachmentsFound++ + mediaType := mapMediaType(waMsg.MessageType) + + storagePath, contentHash := imp.handleMediaFile(media, opts) + if storagePath != "" { + summary.MediaCopied++ + } + + mimeType := "" + if media.MimeType.Valid { + mimeType = media.MimeType.String + } + + filename := "" + if media.FilePath.Valid { + filename = filepath.Base(media.FilePath.String) + } + + size := 0 + if media.FileSize.Valid { + size = int(media.FileSize.Int64) + } + + // Use UpsertAttachment — it handles dedup by content_hash. + err := imp.store.UpsertAttachment(messageID, filename, mimeType, storagePath, contentHash, size) + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("upsert attachment for message %s: %w", waMsg.KeyID, err)) + } + + // Store media metadata in the attachments table is done above. + // For extra metadata (width, height, duration, media_type), + // update via a direct SQL call since UpsertAttachment doesn't have those fields. 
+ if mediaType != "" || (media.Width.Valid && media.Width.Int64 > 0) { + imp.updateAttachmentMetadata(messageID, contentHash, mediaType, media) + } + } + + // Handle quoted/reply messages. + if quoted, ok := quotedMap[waMsg.RowID]; ok { + if replyToMsgID, found := keyIDToMsgID[quoted.QuotedKeyID]; found { + imp.setReplyTo(messageID, replyToMsgID) + } else if dbMsgID, lookupErr := imp.lookupMessageByKeyID(source.ID, quoted.QuotedKeyID); lookupErr == nil && dbMsgID > 0 { + // Found in DB from a previous import run or another chat. + imp.setReplyTo(messageID, dbMsgID) + } + } + + // Handle reactions. + if reactions, ok := reactionMap[waMsg.RowID]; ok { + for _, r := range reactions { + reactionType, reactionValue := mapReaction(r) + if reactionValue == "" { + continue + } + + var reactorID int64 + if r.SenderServer.Valid && r.SenderServer.String == "lid" { + // Lid JID — resolve via jid_map first. + phone := resolveLidSender(r.SenderJIDRowID, r.SenderServer.String, lidMap) + if phone == "" { + continue + } + pid, err := imp.store.EnsureParticipantByPhone(phone, "") + if err != nil { + summary.Errors++ + continue + } + reactorID = pid + } else if r.SenderUser.Valid && r.SenderUser.String != "" { + phone := normalizePhone(r.SenderUser.String, r.SenderServer.String) + if phone == "" { + continue // Non-phone JID — skip reaction. + } + pid, err := imp.store.EnsureParticipantByPhone(phone, "") + if err != nil { + summary.Errors++ + continue + } + reactorID = pid + } else { + // Self reaction. + reactorID = selfParticipantID + } + + createdAt := time.Unix(r.Timestamp/1000, 0) + if err := imp.store.UpsertReaction(messageID, reactorID, reactionType, reactionValue, createdAt); err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("upsert reaction: %w", err)) + } else { + summary.ReactionsAdded++ + } + } + } + + // FTS indexing. 
+ if bodyText.Valid { + senderAddr := "" + if waMsg.FromMe == 1 { + senderAddr = opts.Phone + } else if waMsg.SenderServer.Valid && waMsg.SenderServer.String == "lid" { + senderAddr = resolveLidSender(waMsg.SenderJIDRowID, waMsg.SenderServer.String, lidMap) + } else if waMsg.SenderUser.Valid { + senderAddr = normalizePhone(waMsg.SenderUser.String, waMsg.SenderServer.String) + } + _ = imp.store.UpsertFTS(messageID, "", bodyText.String, senderAddr, "", "") + } + } + + // Update sync run progress counters (for monitoring, not resume). + // Resume is not implemented yet — re-running is safe due to upsert dedup. + _ = imp.store.UpdateSyncCheckpoint(syncID, &store.Checkpoint{ + MessagesProcessed: summary.MessagesProcessed, + MessagesAdded: summary.MessagesAdded, + }) + + imp.progress.OnProgress(summary.MessagesProcessed, summary.MessagesAdded, summary.MessagesSkipped) + + // If we got fewer than requested, we've finished this chat. + if len(messages) < remaining { + break + } + } + + // Participant fallback: ensure every resolved sender is a conversation + // participant. Covers groups where group_participants is empty (newer + // WhatsApp versions) and any senders discovered via lid resolution. + for pid := range chatSenderIDs { + _ = imp.store.EnsureConversationParticipant(conversationID, pid, "member") + } + // Always include self as participant. + _ = imp.store.EnsureConversationParticipant(conversationID, selfParticipantID, "member") + + imp.progress.OnChatComplete(chat.RawString, chatAdded) + } + + // Update denormalised conversation counts for the WhatsApp source. 
+ _, _ = imp.store.DB().Exec(` + UPDATE conversations SET + message_count = ( + SELECT COUNT(*) FROM messages + WHERE conversation_id = conversations.id + ), + participant_count = ( + SELECT COUNT(*) FROM conversation_participants + WHERE conversation_id = conversations.id + ), + last_message_at = ( + SELECT MAX(COALESCE(sent_at, received_at, internal_date)) + FROM messages + WHERE conversation_id = conversations.id + ) + WHERE source_id = ? + `, source.ID) + + summary.Duration = time.Since(startTime) + imp.progress.OnComplete(summary) + + return summary, nil +} + +// handleMediaFile attempts to find and copy a media file to content-addressed storage. +// Returns (storagePath, contentHash). Both empty if file not found. +func (imp *Importer) handleMediaFile(media waMedia, opts ImportOptions) (string, string) { + if opts.MediaDir == "" || opts.AttachmentsDir == "" || !media.FilePath.Valid || media.FilePath.String == "" { + return "", "" + } + + mediaDir := opts.MediaDir + + // Sanitize the path from the WhatsApp DB (untrusted data). + relPath := filepath.Clean(media.FilePath.String) + + // Reject absolute paths — the DB should only contain relative paths. + if filepath.IsAbs(relPath) { + relPath = filepath.Base(relPath) + } + + // Reject directory traversal. + if relPath == ".." || strings.HasPrefix(relPath, ".."+string(filepath.Separator)) { + relPath = filepath.Base(relPath) + } + + // Build candidate path and verify it stays within mediaDir. + fullPath := filepath.Join(mediaDir, relPath) + absMediaDir, err := filepath.Abs(mediaDir) + if err != nil { + return "", "" + } + absFullPath, err := filepath.Abs(fullPath) + if err != nil { + return "", "" + } + if !strings.HasPrefix(absFullPath, absMediaDir+string(filepath.Separator)) && absFullPath != absMediaDir { + // Path escapes mediaDir — fall back to base filename only. 
+ fullPath = filepath.Join(mediaDir, filepath.Base(relPath)) + absFullPath, _ = filepath.Abs(fullPath) + if !strings.HasPrefix(absFullPath, absMediaDir+string(filepath.Separator)) { + return "", "" + } + } + + // Check file exists. + info, err := os.Stat(fullPath) + if err != nil { + // Try just the filename as fallback. + fullPath = filepath.Join(mediaDir, filepath.Base(relPath)) + info, err = os.Stat(fullPath) + if err != nil { + return "", "" + } + } + + // Enforce max file size to prevent OOM. + maxSize := opts.MaxMediaFileSize + if maxSize <= 0 { + maxSize = 100 * 1024 * 1024 // 100MB default + } + if info.Size() > maxSize { + return "", "" + } + + // Open file and compute hash by streaming (no full-file read into memory). + f, err := os.Open(fullPath) + if err != nil { + return "", "" + } + defer func() { _ = f.Close() }() + + h := sha256.New() + if _, err := io.Copy(h, io.LimitReader(f, maxSize+1)); err != nil { + return "", "" + } + contentHash := fmt.Sprintf("%x", h.Sum(nil)) + + // Content-addressed storage: // + // The storage_path stored in DB is the relative portion: / + relStoragePath := filepath.Join(contentHash[:2], contentHash) + absStoragePath := filepath.Join(opts.AttachmentsDir, relStoragePath) + + // Check for dedup — file already stored. + if _, err := os.Stat(absStoragePath); err == nil { + return relStoragePath, contentHash + } + + // Create directory and stream-copy the file. + absStorageDir := filepath.Dir(absStoragePath) + if err := os.MkdirAll(absStorageDir, 0750); err != nil { + return "", contentHash + } + + // Seek back to beginning of source file for the copy. + if _, err := f.Seek(0, io.SeekStart); err != nil { + return "", contentHash + } + + dst, err := os.OpenFile(absStoragePath, os.O_CREATE|os.O_WRONLY|os.O_EXCL, 0600) + if err != nil { + if os.IsExist(err) { + // Race: another goroutine already wrote it. 
+ return relStoragePath, contentHash + } + return "", contentHash + } + + if _, err := io.Copy(dst, io.LimitReader(f, maxSize)); err != nil { + _ = dst.Close() + _ = os.Remove(absStoragePath) + return "", contentHash + } + if err := dst.Close(); err != nil { + _ = os.Remove(absStoragePath) + return "", contentHash + } + + return relStoragePath, contentHash +} + +// updateAttachmentMetadata updates media-specific metadata on an attachment record. +func (imp *Importer) updateAttachmentMetadata(messageID int64, contentHash, mediaType string, media waMedia) { + var width, height, durationMS sql.NullInt64 + if media.Width.Valid && media.Width.Int64 > 0 { + width = media.Width + } + if media.Height.Valid && media.Height.Int64 > 0 { + height = media.Height + } + if media.MediaDuration.Valid && media.MediaDuration.Int64 > 0 { + // WhatsApp stores duration in seconds; msgvault uses milliseconds. + durationMS = sql.NullInt64{Int64: media.MediaDuration.Int64 * 1000, Valid: true} + } + + _, _ = imp.store.DB().Exec(` + UPDATE attachments SET media_type = ?, width = ?, height = ?, duration_ms = ? + WHERE message_id = ? AND (content_hash = ? OR content_hash IS NULL) + `, mediaType, width, height, durationMS, messageID, contentHash) +} + +// lookupMessageByKeyID looks up a previously imported message by its WhatsApp key_id. +// Returns 0 if not found. +func (imp *Importer) lookupMessageByKeyID(sourceID int64, keyID string) (int64, error) { + var msgID int64 + err := imp.store.DB().QueryRow( + `SELECT id FROM messages WHERE source_id = ? AND source_message_id = ?`, + sourceID, keyID, + ).Scan(&msgID) + if err == sql.ErrNoRows { + return 0, nil + } + return msgID, err +} + +// setReplyTo sets the reply_to_message_id on a message. +func (imp *Importer) setReplyTo(messageID, replyToID int64) { + _, _ = imp.store.DB().Exec(` + UPDATE messages SET reply_to_message_id = ? WHERE id = ? 
+ `, replyToID, messageID) +} + +// verifyWhatsAppDB checks that the database looks like a WhatsApp msgstore.db. +func verifyWhatsAppDB(db *sql.DB) error { + // Check for the 'message' table with expected columns. + var count int + err := db.QueryRow(` + SELECT COUNT(*) FROM sqlite_master + WHERE type = 'table' AND name = 'message' + `).Scan(&count) + if err != nil { + return fmt.Errorf("check whatsapp db: %w", err) + } + if count == 0 { + return fmt.Errorf("not a valid WhatsApp database: 'message' table not found") + } + + // Check for the 'jid' table. + err = db.QueryRow(` + SELECT COUNT(*) FROM sqlite_master + WHERE type = 'table' AND name = 'jid' + `).Scan(&count) + if err != nil { + return fmt.Errorf("check whatsapp db: %w", err) + } + if count == 0 { + return fmt.Errorf("not a valid WhatsApp database: 'jid' table not found") + } + + // Check for the 'chat' table. + err = db.QueryRow(` + SELECT COUNT(*) FROM sqlite_master + WHERE type = 'table' AND name = 'chat' + `).Scan(&count) + if err != nil { + return fmt.Errorf("check whatsapp db: %w", err) + } + if count == 0 { + return fmt.Errorf("not a valid WhatsApp database: 'chat' table not found") + } + + return nil +} diff --git a/internal/whatsapp/mapping.go b/internal/whatsapp/mapping.go new file mode 100644 index 00000000..aa4e195b --- /dev/null +++ b/internal/whatsapp/mapping.go @@ -0,0 +1,215 @@ +package whatsapp + +import ( + "database/sql" + "strings" + "time" + "unicode/utf8" + + "github.com/wesm/msgvault/internal/store" +) + +// isGroupChat returns true if the chat represents a group conversation. +// A chat is a group if group_type > 0 OR if the JID server is "g.us". +// Some groups (e.g. WhatsApp Communities and their sub-groups) have +// group_type = 0 despite being groups; the JID server is the +// definitive signal. +func isGroupChat(chat waChat) bool { + return chat.GroupType > 0 || chat.Server == "g.us" +} + +// mapConversation maps a WhatsApp chat to a msgvault conversation. 
+// Returns the source_conversation_id, conversation_type, and title. +func mapConversation(chat waChat) (sourceConvID, convType, title string) { + sourceConvID = chat.RawString + + if isGroupChat(chat) { + convType = "group_chat" + if chat.Subject.Valid { + title = chat.Subject.String + } + } else { + convType = "direct_chat" + // No title for direct chats (resolved via participant lookup) + } + + return sourceConvID, convType, title +} + +// mapMessage maps a WhatsApp message to a msgvault Message struct. +// The conversationID and sourceID must be resolved before calling. +func mapMessage(msg waMessage, conversationID, sourceID int64, senderID sql.NullInt64) store.Message { + sentAt := sql.NullTime{} + if msg.Timestamp > 0 { + // WhatsApp timestamps are in milliseconds since epoch. + sentAt = sql.NullTime{ + Time: time.Unix(msg.Timestamp/1000, (msg.Timestamp%1000)*1e6), + Valid: true, + } + } + + snippet := sql.NullString{} + if msg.TextData.Valid && msg.TextData.String != "" { + s := msg.TextData.String + if utf8.RuneCountInString(s) > 100 { + // Truncate to 100 runes, preserving multi-byte characters. + runes := []rune(s) + s = string(runes[:100]) + } + snippet = sql.NullString{String: s, Valid: true} + } + + return store.Message{ + ConversationID: conversationID, + SourceID: sourceID, + SourceMessageID: msg.KeyID, + MessageType: "whatsapp", + SentAt: sentAt, + SenderID: senderID, + IsFromMe: msg.FromMe == 1, + Snippet: snippet, + HasAttachments: isMediaType(msg.MessageType), + AttachmentCount: boolToInt(isMediaType(msg.MessageType)), + ArchivedAt: time.Now(), + } +} + +// mapMediaType maps a WhatsApp message_type integer to a media type string. +// Returns empty string for non-media types. 
+func mapMediaType(waMessageType int) string { + switch waMessageType { + case 1: + return "image" + case 2: + return "video" + case 3: + return "audio" + case 4: + return "gif" + case 5: + return "voice_note" + case 13: + return "document" + case 90: + return "sticker" + default: + return "" + } +} + +// isMediaType returns true if the WhatsApp message_type represents media. +func isMediaType(waMessageType int) bool { + return mapMediaType(waMessageType) != "" +} + +// isSkippedType returns true if the message type should be skipped during import. +// System messages, calls, locations, contacts, and polls are not imported. +func isSkippedType(waMessageType int) bool { + switch waMessageType { + case 7: // system message + return true + case 9: // location share + return true + case 10: // contact card + return true + case 15: // voice/video call + return true + case 64: // call (missed) + return true + case 66: // call (group) + return true + case 99: // poll + return true + case 11: // status/story + return true + default: + return false + } +} + +// normalizePhone normalizes a WhatsApp JID user+server to an E.164 phone number. +// Input: user="447700900000", server="s.whatsapp.net" +// Output: "+447700900000" +// Returns empty string for non-phone JIDs (e.g., lid:..., status@broadcast). +func normalizePhone(user, server string) string { + if user == "" { + return "" + } + + // Strip the @server suffix if present in user. + user = strings.TrimSuffix(user, "@"+server) + + // Already in E.164 format? + if strings.HasPrefix(user, "+") { + return user + } + + // Reject non-numeric JID users (e.g., "lid:123", "status", broadcast addresses). + // Valid phone numbers contain only digits. + for _, c := range user { + if c < '0' || c > '9' { + return "" + } + } + + // Must be at least a few digits to be a plausible phone number, + // and no more than 15 (E.164 max) to prevent data pollution. 
+ if len(user) < 4 || len(user) > 15 { + return "" + } + + // Prepend + for E.164. + return "+" + user +} + +// resolveLidSender resolves a "lid" JID sender to a phone number via the +// jid_map lookup table. Only activates when the sender's server is "lid". +// Returns a normalised E.164 phone number, or empty string if unresolvable. +func resolveLidSender(jidRowID sql.NullInt64, server string, lidMap map[int64]waLidMapping) string { + if server != "lid" || !jidRowID.Valid { + return "" + } + mapping, ok := lidMap[jidRowID.Int64] + if !ok { + return "" + } + return normalizePhone(mapping.PhoneUser, mapping.PhoneServer) +} + +// mapReaction maps a WhatsApp reaction to reaction_type and reaction_value. +func mapReaction(r waReaction) (reactionType, reactionValue string) { + if r.ReactionValue.Valid && r.ReactionValue.String != "" { + return "emoji", r.ReactionValue.String + } + return "emoji", "" +} + +// mapGroupRole maps a WhatsApp admin level to a conversation participant role. +func mapGroupRole(admin int) string { + switch admin { + case 1: + return "admin" + case 2: + return "admin" // superadmin → admin (msgvault doesn't distinguish) + default: + return "member" + } +} + +// chatTitle returns a display title for a chat for progress reporting. 
+func chatTitle(chat waChat) string { + if chat.Subject.Valid && chat.Subject.String != "" { + return chat.Subject.String + } + if chat.User != "" { + return normalizePhone(chat.User, chat.Server) + } + return chat.RawString +} + +func boolToInt(b bool) int { + if b { + return 1 + } + return 0 +} diff --git a/internal/whatsapp/mapping_test.go b/internal/whatsapp/mapping_test.go new file mode 100644 index 00000000..b15ea5d6 --- /dev/null +++ b/internal/whatsapp/mapping_test.go @@ -0,0 +1,347 @@ +package whatsapp + +import ( + "database/sql" + "testing" +) + +func TestNormalizePhone(t *testing.T) { + tests := []struct { + user, server string + want string + }{ + {"447700900000", "s.whatsapp.net", "+447700900000"}, + {"12025551234", "s.whatsapp.net", "+12025551234"}, + {"+447700900000", "s.whatsapp.net", "+447700900000"}, + {"", "s.whatsapp.net", ""}, + {"447700900000", "g.us", "+447700900000"}, + } + + for _, tt := range tests { + got := normalizePhone(tt.user, tt.server) + if got != tt.want { + t.Errorf("normalizePhone(%q, %q) = %q, want %q", tt.user, tt.server, got, tt.want) + } + } +} + +func TestMapMediaType(t *testing.T) { + tests := []struct { + waType int + want string + }{ + {0, ""}, // text + {1, "image"}, + {2, "video"}, + {3, "audio"}, + {4, "gif"}, + {5, "voice_note"}, + {13, "document"}, + {90, "sticker"}, + {7, ""}, // system (no media type) + {15, ""}, // call + {99, ""}, // poll + } + + for _, tt := range tests { + got := mapMediaType(tt.waType) + if got != tt.want { + t.Errorf("mapMediaType(%d) = %q, want %q", tt.waType, got, tt.want) + } + } +} + +func TestIsMediaType(t *testing.T) { + if !isMediaType(1) { + t.Error("isMediaType(1) should be true (image)") + } + if isMediaType(0) { + t.Error("isMediaType(0) should be false (text)") + } + if isMediaType(7) { + t.Error("isMediaType(7) should be false (system)") + } +} + +func TestIsSkippedType(t *testing.T) { + skipped := []int{7, 9, 10, 15, 64, 66, 99, 11} + for _, typ := range skipped { + if 
!isSkippedType(typ) { + t.Errorf("isSkippedType(%d) should be true", typ) + } + } + + notSkipped := []int{0, 1, 2, 3, 4, 5, 13, 90} + for _, typ := range notSkipped { + if isSkippedType(typ) { + t.Errorf("isSkippedType(%d) should be false", typ) + } + } +} + +func TestIsGroupChat(t *testing.T) { + tests := []struct { + name string + chat waChat + want bool + }{ + { + name: "direct chat", + chat: waChat{Server: "s.whatsapp.net", GroupType: 0}, + want: false, + }, + { + name: "standard group", + chat: waChat{Server: "g.us", GroupType: 1}, + want: true, + }, + { + name: "community sub-group (g.us + type=0)", + chat: waChat{Server: "g.us", GroupType: 0}, + want: true, + }, + { + name: "broadcast", + chat: waChat{Server: "broadcast", GroupType: 0}, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isGroupChat(tt.chat) + if got != tt.want { + t.Errorf("isGroupChat() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestMapConversation(t *testing.T) { + // Direct chat. + direct := waChat{ + RawString: "447700900000@s.whatsapp.net", + GroupType: 0, + } + id, typ, title := mapConversation(direct) + if id != "447700900000@s.whatsapp.net" { + t.Errorf("direct chat sourceConvID = %q, want %q", id, "447700900000@s.whatsapp.net") + } + if typ != "direct_chat" { + t.Errorf("direct chat convType = %q, want %q", typ, "direct_chat") + } + if title != "" { + t.Errorf("direct chat title = %q, want empty", title) + } + + // Group chat. 
+ group := waChat{ + RawString: "120363001234567890@g.us", + Server: "g.us", + GroupType: 1, + Subject: sql.NullString{String: "Family Group", Valid: true}, + } + id, typ, title = mapConversation(group) + if id != "120363001234567890@g.us" { + t.Errorf("group chat sourceConvID = %q", id) + } + if typ != "group_chat" { + t.Errorf("group chat convType = %q, want %q", typ, "group_chat") + } + if title != "Family Group" { + t.Errorf("group chat title = %q, want %q", title, "Family Group") + } + + // Group with group_type=0 but g.us server (e.g. WhatsApp Community sub-groups). + community := waChat{ + RawString: "120363377259312783@g.us", + Server: "g.us", + GroupType: 0, + Subject: sql.NullString{String: "AI Impact", Valid: true}, + } + _, typ, title = mapConversation(community) + if typ != "group_chat" { + t.Errorf("g.us with group_type=0: convType = %q, want %q", typ, "group_chat") + } + if title != "AI Impact" { + t.Errorf("g.us with group_type=0: title = %q, want %q", title, "AI Impact") + } +} + +func TestMapMessage(t *testing.T) { + msg := waMessage{ + RowID: 42, + ChatRowID: 1, + FromMe: 1, + KeyID: "ABC123", + Timestamp: 1700000000000, // ms + MessageType: 0, + TextData: sql.NullString{String: "Hello world", Valid: true}, + } + + senderID := sql.NullInt64{Int64: 99, Valid: true} + result := mapMessage(msg, 10, 20, senderID) + + if result.ConversationID != 10 { + t.Errorf("ConversationID = %d, want 10", result.ConversationID) + } + if result.SourceID != 20 { + t.Errorf("SourceID = %d, want 20", result.SourceID) + } + if result.SourceMessageID != "ABC123" { + t.Errorf("SourceMessageID = %q, want %q", result.SourceMessageID, "ABC123") + } + if result.MessageType != "whatsapp" { + t.Errorf("MessageType = %q, want %q", result.MessageType, "whatsapp") + } + if !result.IsFromMe { + t.Error("IsFromMe should be true") + } + if !result.SentAt.Valid { + t.Error("SentAt should be valid") + } + if result.SentAt.Time.Unix() != 1700000000 { + t.Errorf("SentAt Unix = %d, want 
1700000000", result.SentAt.Time.Unix()) + } + if !result.Snippet.Valid || result.Snippet.String != "Hello world" { + t.Errorf("Snippet = %v, want 'Hello world'", result.Snippet) + } + if result.HasAttachments { + t.Error("HasAttachments should be false for text message") + } +} + +func TestMapMessageSnippetTruncation(t *testing.T) { + // Create a message with text longer than 100 characters. + longText := "" + for i := 0; i < 150; i++ { + longText += "x" + } + + msg := waMessage{ + KeyID: "LONG1", + Timestamp: 1700000000000, + MessageType: 0, + TextData: sql.NullString{String: longText, Valid: true}, + } + + result := mapMessage(msg, 1, 1, sql.NullInt64{}) + if !result.Snippet.Valid { + t.Fatal("Snippet should be valid") + } + if len([]rune(result.Snippet.String)) != 100 { + t.Errorf("Snippet rune count = %d, want 100", len([]rune(result.Snippet.String))) + } +} + +func TestMapGroupRole(t *testing.T) { + tests := []struct { + admin int + want string + }{ + {0, "member"}, + {1, "admin"}, + {2, "admin"}, // superadmin + {3, "member"}, + } + + for _, tt := range tests { + got := mapGroupRole(tt.admin) + if got != tt.want { + t.Errorf("mapGroupRole(%d) = %q, want %q", tt.admin, got, tt.want) + } + } +} + +func TestMapReaction(t *testing.T) { + r := waReaction{ + ReactionValue: sql.NullString{String: "❤️", Valid: true}, + } + typ, val := mapReaction(r) + if typ != "emoji" { + t.Errorf("reaction type = %q, want %q", typ, "emoji") + } + if val != "❤️" { + t.Errorf("reaction value = %q, want %q", val, "❤️") + } + + // Empty reaction. 
+ empty := waReaction{ + ReactionValue: sql.NullString{}, + } + _, val = mapReaction(empty) + if val != "" { + t.Errorf("empty reaction value = %q, want empty", val) + } +} + +func TestResolveLidSender(t *testing.T) { + lidMap := map[int64]waLidMapping{ + 100: {LidRowID: 100, PhoneUser: "447957366403", PhoneServer: "s.whatsapp.net"}, + 200: {LidRowID: 200, PhoneUser: "12025551234", PhoneServer: "s.whatsapp.net"}, + } + + tests := []struct { + name string + jidRowID sql.NullInt64 + server string + want string + }{ + { + name: "lid sender found in map", + jidRowID: sql.NullInt64{Int64: 100, Valid: true}, + server: "lid", + want: "+447957366403", + }, + { + name: "lid sender not in map", + jidRowID: sql.NullInt64{Int64: 999, Valid: true}, + server: "lid", + want: "", + }, + { + name: "non-lid server ignored", + jidRowID: sql.NullInt64{Int64: 100, Valid: true}, + server: "s.whatsapp.net", + want: "", + }, + { + name: "null jid row id", + jidRowID: sql.NullInt64{Valid: false}, + server: "lid", + want: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := resolveLidSender(tt.jidRowID, tt.server, lidMap) + if got != tt.want { + t.Errorf("resolveLidSender() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestChatTitle(t *testing.T) { + // Group with subject. + group := waChat{ + Subject: sql.NullString{String: "Work Chat", Valid: true}, + User: "120363001234567890", + Server: "g.us", + RawString: "120363001234567890@g.us", + } + if chatTitle(group) != "Work Chat" { + t.Errorf("chatTitle(group) = %q, want %q", chatTitle(group), "Work Chat") + } + + // Direct chat. 
+ direct := waChat{ + User: "447700900000", + Server: "s.whatsapp.net", + RawString: "447700900000@s.whatsapp.net", + } + if chatTitle(direct) != "+447700900000" { + t.Errorf("chatTitle(direct) = %q, want %q", chatTitle(direct), "+447700900000") + } +} diff --git a/internal/whatsapp/queries.go b/internal/whatsapp/queries.go new file mode 100644 index 00000000..48f5a78e --- /dev/null +++ b/internal/whatsapp/queries.go @@ -0,0 +1,365 @@ +package whatsapp + +import ( + "database/sql" + "fmt" + "strings" +) + +// fetchChats returns all non-hidden chats from the WhatsApp database. +// Joins with the jid table to get JID details for each chat. +func fetchChats(db *sql.DB) ([]waChat, error) { + rows, err := db.Query(` + SELECT + c._id, + c.jid_row_id, + j.raw_string, + COALESCE(j.user, ''), + COALESCE(j.server, ''), + c.subject, + COALESCE(c.group_type, 0), + COALESCE(c.hidden, 0), + COALESCE(c.sort_timestamp, 0) + FROM chat c + JOIN jid j ON c.jid_row_id = j._id + WHERE COALESCE(c.hidden, 0) = 0 + ORDER BY c.sort_timestamp DESC + `) + if err != nil { + return nil, fmt.Errorf("fetch chats: %w", err) + } + defer func() { _ = rows.Close() }() + + var chats []waChat + for rows.Next() { + var c waChat + if err := rows.Scan( + &c.RowID, &c.JIDRowID, &c.RawString, &c.User, &c.Server, + &c.Subject, &c.GroupType, &c.Hidden, + &c.LastMessageTimestamp, + ); err != nil { + return nil, fmt.Errorf("scan chat: %w", err) + } + chats = append(chats, c) + } + return chats, rows.Err() +} + +// fetchMessages returns messages for a chat, batched after a given _id. +// Messages are ordered by _id ascending for deterministic resumability. +// Joins with jid to resolve sender information. 
+func fetchMessages(db *sql.DB, chatRowID int64, afterID int64, limit int) ([]waMessage, error) { + rows, err := db.Query(` + SELECT + m._id, + m.chat_row_id, + COALESCE(m.from_me, 0), + COALESCE(m.key_id, ''), + m.sender_jid_row_id, + sj.raw_string, + sj.user, + sj.server, + COALESCE(m.timestamp, 0), + COALESCE(m.message_type, 0), + m.text_data, + COALESCE(m.status, 0), + COALESCE(m.starred, 0) + FROM message m + LEFT JOIN jid sj ON m.sender_jid_row_id = sj._id + WHERE m.chat_row_id = ? + AND m._id > ? + ORDER BY m._id ASC + LIMIT ? + `, chatRowID, afterID, limit) + if err != nil { + return nil, fmt.Errorf("fetch messages: %w", err) + } + defer func() { _ = rows.Close() }() + + var messages []waMessage + for rows.Next() { + var m waMessage + if err := rows.Scan( + &m.RowID, &m.ChatRowID, &m.FromMe, &m.KeyID, + &m.SenderJIDRowID, &m.SenderRawString, &m.SenderUser, &m.SenderServer, + &m.Timestamp, &m.MessageType, &m.TextData, + &m.Status, &m.Starred, + ); err != nil { + return nil, fmt.Errorf("scan message: %w", err) + } + messages = append(messages, m) + } + return messages, rows.Err() +} + +// fetchMedia returns media metadata for a batch of message row IDs. +// Returns a map of message_row_id → waMedia. +func fetchMedia(db *sql.DB, messageRowIDs []int64) (map[int64]waMedia, error) { + if len(messageRowIDs) == 0 { + return make(map[int64]waMedia), nil + } + + result := make(map[int64]waMedia) + + // Process in chunks to stay within SQLite's parameter limit. + const chunkSize = 500 + for i := 0; i < len(messageRowIDs); i += chunkSize { + end := i + chunkSize + if end > len(messageRowIDs) { + end = len(messageRowIDs) + } + chunk := messageRowIDs[i:end] + + placeholders := make([]string, len(chunk)) + args := make([]interface{}, len(chunk)) + for j, id := range chunk { + placeholders[j] = "?" 
+ args[j] = id + } + + query := fmt.Sprintf(` + SELECT + mm.message_row_id, + mm.mime_type, + mm.media_caption, + mm.file_size, + mm.file_path, + mm.width, + mm.height, + mm.media_duration + FROM message_media mm + WHERE mm.message_row_id IN (%s) + `, strings.Join(placeholders, ",")) + + rows, err := db.Query(query, args...) + if err != nil { + return nil, fmt.Errorf("fetch media: %w", err) + } + + for rows.Next() { + var m waMedia + if err := rows.Scan( + &m.MessageRowID, &m.MimeType, &m.MediaCaption, + &m.FileSize, &m.FilePath, &m.Width, &m.Height, + &m.MediaDuration, + ); err != nil { + _ = rows.Close() + return nil, fmt.Errorf("scan media: %w", err) + } + result[m.MessageRowID] = m + } + _ = rows.Close() + if err := rows.Err(); err != nil { + return nil, err + } + } + + return result, nil +} + +// fetchReactions returns reactions for a batch of message row IDs. +// Returns a map of message_row_id → []waReaction. +func fetchReactions(db *sql.DB, messageRowIDs []int64) (map[int64][]waReaction, error) { + if len(messageRowIDs) == 0 { + return make(map[int64][]waReaction), nil + } + + result := make(map[int64][]waReaction) + + const chunkSize = 500 + for i := 0; i < len(messageRowIDs); i += chunkSize { + end := i + chunkSize + if end > len(messageRowIDs) { + end = len(messageRowIDs) + } + chunk := messageRowIDs[i:end] + + placeholders := make([]string, len(chunk)) + args := make([]interface{}, len(chunk)) + for j, id := range chunk { + placeholders[j] = "?" + args[j] = id + } + + // WhatsApp stores reactions in message_add_on (metadata) joined with + // message_add_on_reaction (the actual emoji). The link to the original + // message is via parent_message_row_id. 
+ query := fmt.Sprintf(` + SELECT + ao.parent_message_row_id, + ao.sender_jid_row_id, + sj.raw_string, + sj.user, + sj.server, + ar.reaction, + COALESCE(ar.sender_timestamp, 0) + FROM message_add_on ao + JOIN message_add_on_reaction ar ON ar.message_add_on_row_id = ao._id + LEFT JOIN jid sj ON ao.sender_jid_row_id = sj._id + WHERE ao.parent_message_row_id IN (%s) + AND ar.reaction IS NOT NULL + AND ar.reaction != '' + `, strings.Join(placeholders, ",")) + + rows, err := db.Query(query, args...) + if err != nil { + // Table might not exist in older DB versions + if isTableNotFound(err) { + return result, nil + } + return nil, fmt.Errorf("fetch reactions: %w", err) + } + + for rows.Next() { + var r waReaction + if err := rows.Scan( + &r.MessageRowID, &r.SenderJIDRowID, + &r.SenderRawString, &r.SenderUser, &r.SenderServer, + &r.ReactionValue, &r.Timestamp, + ); err != nil { + _ = rows.Close() + return nil, fmt.Errorf("scan reaction: %w", err) + } + result[r.MessageRowID] = append(result[r.MessageRowID], r) + } + _ = rows.Close() + if err := rows.Err(); err != nil { + return nil, err + } + } + + return result, nil +} + +// fetchGroupParticipants returns all participants of a group chat. +// In the WhatsApp schema, group_participants.gjid and .jid are TEXT fields +// containing raw JID strings (e.g., "447700900000@s.whatsapp.net"), +// not integer row IDs. +func fetchGroupParticipants(db *sql.DB, groupJIDRawString string) ([]waGroupMember, error) { + rows, err := db.Query(` + SELECT + gp.gjid, + gp.jid, + COALESCE(j.user, ''), + COALESCE(j.server, ''), + COALESCE(gp.admin, 0) + FROM group_participants gp + LEFT JOIN jid j ON gp.jid = j.raw_string + WHERE gp.gjid = ? 
+ `, groupJIDRawString) + if err != nil { + return nil, fmt.Errorf("fetch group participants: %w", err) + } + defer func() { _ = rows.Close() }() + + var members []waGroupMember + for rows.Next() { + var m waGroupMember + if err := rows.Scan( + &m.GroupJID, &m.MemberJID, + &m.MemberUser, &m.MemberServer, &m.Admin, + ); err != nil { + return nil, fmt.Errorf("scan group participant: %w", err) + } + members = append(members, m) + } + return members, rows.Err() +} + +// fetchQuotedMessages returns quoted message references for a batch of message row IDs. +// Returns a map of message_row_id → waQuoted (the message that contains the quote). +func fetchQuotedMessages(db *sql.DB, messageRowIDs []int64) (map[int64]waQuoted, error) { + if len(messageRowIDs) == 0 { + return make(map[int64]waQuoted), nil + } + + result := make(map[int64]waQuoted) + + const chunkSize = 500 + for i := 0; i < len(messageRowIDs); i += chunkSize { + end := i + chunkSize + if end > len(messageRowIDs) { + end = len(messageRowIDs) + } + chunk := messageRowIDs[i:end] + + placeholders := make([]string, len(chunk)) + args := make([]interface{}, len(chunk)) + for j, id := range chunk { + placeholders[j] = "?" + args[j] = id + } + + query := fmt.Sprintf(` + SELECT + mq.message_row_id, + mq.key_id + FROM message_quoted mq + WHERE mq.message_row_id IN (%s) + AND mq.key_id IS NOT NULL + AND mq.key_id != '' + `, strings.Join(placeholders, ",")) + + rows, err := db.Query(query, args...) 
+ if err != nil { + if isTableNotFound(err) { + return result, nil + } + return nil, fmt.Errorf("fetch quoted messages: %w", err) + } + + for rows.Next() { + var q waQuoted + if err := rows.Scan(&q.MessageRowID, &q.QuotedKeyID); err != nil { + _ = rows.Close() + return nil, fmt.Errorf("scan quoted message: %w", err) + } + result[q.MessageRowID] = q + } + _ = rows.Close() + if err := rows.Err(); err != nil { + return nil, err + } + } + + return result, nil +} + +// fetchLidMap reads the WhatsApp jid_map table to build a mapping from +// lid JID row IDs to their corresponding phone JIDs. The jid_map table +// links two jid rows: one with server="lid" and one with the real phone +// (server="s.whatsapp.net"). Returns an empty map gracefully if the +// jid_map table doesn't exist (older WhatsApp DB versions). +func fetchLidMap(db *sql.DB) (map[int64]waLidMapping, error) { + result := make(map[int64]waLidMapping) + + rows, err := db.Query(` + SELECT + jm.lid_row_id, + COALESCE(phone_jid.user, ''), + COALESCE(phone_jid.server, '') + FROM jid_map jm + JOIN jid phone_jid ON jm.jid_row_id = phone_jid._id + `) + if err != nil { + if isTableNotFound(err) { + return result, nil + } + return nil, fmt.Errorf("fetch lid map: %w", err) + } + defer func() { _ = rows.Close() }() + + for rows.Next() { + var m waLidMapping + if err := rows.Scan(&m.LidRowID, &m.PhoneUser, &m.PhoneServer); err != nil { + return nil, fmt.Errorf("scan lid mapping: %w", err) + } + result[m.LidRowID] = m + } + return result, rows.Err() +} + +// isTableNotFound returns true if the error indicates a missing table. 
+func isTableNotFound(err error) bool { + return err != nil && strings.Contains(err.Error(), "no such table") +} diff --git a/internal/whatsapp/queries_test.go b/internal/whatsapp/queries_test.go new file mode 100644 index 00000000..5f7d062b --- /dev/null +++ b/internal/whatsapp/queries_test.go @@ -0,0 +1,89 @@ +package whatsapp + +import ( + "database/sql" + "testing" + + _ "github.com/mattn/go-sqlite3" +) + +func TestFetchLidMap(t *testing.T) { + db, err := sql.Open("sqlite3", ":memory:") + if err != nil { + t.Fatal(err) + } + defer func() { _ = db.Close() }() + + // Create the jid and jid_map tables matching WhatsApp's actual schema. + // In WhatsApp: jid_map.lid_row_id is PK (= jid._id for the lid entry), + // jid_map.jid_row_id points to the phone jid._id. + _, err = db.Exec(` + CREATE TABLE jid ( + _id INTEGER PRIMARY KEY, + user TEXT, + server TEXT, + raw_string TEXT + ); + CREATE TABLE jid_map ( + lid_row_id INTEGER PRIMARY KEY NOT NULL, + jid_row_id INTEGER NOT NULL + ); + + -- lid JID entries (these are the lid_row_id values) + INSERT INTO jid (_id, user, server, raw_string) VALUES (10, '12345abcde', 'lid', '12345abcde@lid'); + INSERT INTO jid (_id, user, server, raw_string) VALUES (20, '67890fghij', 'lid', '67890fghij@lid'); + + -- phone JID entries (these are the jid_row_id values) + INSERT INTO jid (_id, user, server, raw_string) VALUES (11, '447957366403', 's.whatsapp.net', '447957366403@s.whatsapp.net'); + INSERT INTO jid (_id, user, server, raw_string) VALUES (21, '12025551234', 's.whatsapp.net', '12025551234@s.whatsapp.net'); + + -- Map lid → phone + INSERT INTO jid_map (lid_row_id, jid_row_id) VALUES (10, 11); + INSERT INTO jid_map (lid_row_id, jid_row_id) VALUES (20, 21); + `) + if err != nil { + t.Fatal(err) + } + + lidMap, err := fetchLidMap(db) + if err != nil { + t.Fatal(err) + } + + if len(lidMap) != 2 { + t.Fatalf("expected 2 lid mappings, got %d", len(lidMap)) + } + + m1, ok := lidMap[10] + if !ok { + t.Fatal("expected lid row 10 in map") 
+ } + if m1.PhoneUser != "447957366403" || m1.PhoneServer != "s.whatsapp.net" { + t.Errorf("lid 10: got user=%q server=%q, want 447957366403@s.whatsapp.net", m1.PhoneUser, m1.PhoneServer) + } + + m2, ok := lidMap[20] + if !ok { + t.Fatal("expected lid row 20 in map") + } + if m2.PhoneUser != "12025551234" { + t.Errorf("lid 20: got user=%q, want 12025551234", m2.PhoneUser) + } +} + +func TestFetchLidMapMissingTable(t *testing.T) { + db, err := sql.Open("sqlite3", ":memory:") + if err != nil { + t.Fatal(err) + } + defer func() { _ = db.Close() }() + + // No jid_map table — should return empty map, not error. + lidMap, err := fetchLidMap(db) + if err != nil { + t.Fatalf("expected no error for missing table, got: %v", err) + } + if len(lidMap) != 0 { + t.Errorf("expected empty map, got %d entries", len(lidMap)) + } +} diff --git a/internal/whatsapp/types.go b/internal/whatsapp/types.go new file mode 100644 index 00000000..1fbc74b8 --- /dev/null +++ b/internal/whatsapp/types.go @@ -0,0 +1,155 @@ +// Package whatsapp provides import functionality for WhatsApp message backups. +// It reads from a decrypted WhatsApp msgstore.db (SQLite) and maps messages +// into the msgvault unified schema. +package whatsapp + +import ( + "database/sql" + "time" +) + +// waChat represents a chat/conversation from the WhatsApp jid + chat tables. +type waChat struct { + RowID int64 // chat._id + JIDRowID int64 // chat.jid_row_id → jid._id + RawString string // jid.raw_string (e.g., "447700900000@s.whatsapp.net") + User string // jid.user (phone number part) + Server string // jid.server (s.whatsapp.net or g.us) + Subject sql.NullString // chat.subject (group name) + GroupType int // chat.group_type: 0=individual (but see Server), >0=group + Hidden int // chat.hidden + LastMessageTimestamp int64 // chat.sort_timestamp +} + +// waMessage represents a message from the WhatsApp message table. 
//
// Each field mirrors one column of the message row or of the sender's jid
// row. Fields typed sql.Null* tolerate NULL column values.
type waMessage struct {
	RowID           int64          // message._id
	ChatRowID       int64          // message.chat_row_id
	FromMe          int            // message.from_me (0=received, 1=sent)
	KeyID           string         // message.key_id (unique message ID)
	SenderJIDRowID  sql.NullInt64  // message.sender_jid_row_id → jid._id
	SenderRawString sql.NullString // jid.raw_string of sender
	SenderUser      sql.NullString // jid.user of sender
	SenderServer    sql.NullString // jid.server of sender
	Timestamp       int64          // message.timestamp (ms since epoch)
	MessageType     int            // message.message_type
	TextData        sql.NullString // message.text_data
	Status          int            // message.status
	Starred         int            // message.starred
}

// waMedia represents media metadata from the message_media table.
// Every column except the owning message row ID is nullable.
type waMedia struct {
	MessageRowID  int64          // message_media.message_row_id
	MimeType      sql.NullString // message_media.mime_type
	MediaCaption  sql.NullString // message_media.media_caption
	FileSize      sql.NullInt64  // message_media.file_size
	FilePath      sql.NullString // message_media.file_path
	Width         sql.NullInt64  // message_media.width
	Height        sql.NullInt64  // message_media.height
	MediaDuration sql.NullInt64  // message_media.media_duration (seconds)
}

// waReaction represents a reaction to a message. Rows are read from
// message_add_on joined to message_add_on_reaction (which holds the reaction
// text and its timestamp) and LEFT JOINed to jid for the sender, so the
// sender string fields are NULL when no jid row matches.
type waReaction struct {
	MessageRowID    int64          // message_add_on.parent_message_row_id (the message reacted to)
	SenderJIDRowID  sql.NullInt64  // message_add_on.sender_jid_row_id → jid._id of the reactor
	SenderRawString sql.NullString // jid.raw_string of the reactor
	SenderUser      sql.NullString // jid.user of the reactor
	SenderServer    sql.NullString // jid.server of the reactor
	ReactionValue   sql.NullString // message_add_on_reaction.reaction (emoji character)
	Timestamp       int64          // message_add_on_reaction.sender_timestamp (ms); 0 when the column is NULL
}

// waGroupMember represents a member of a group chat.
//
// GroupJID and MemberJID are stored as text in group_participants. The user
// and server parts are not parsed from MemberJID: they are looked up through
// a LEFT JOIN on jid.raw_string and come back as "" when no jid row matches.
type waGroupMember struct {
	GroupJID     string // group_participants.gjid (text, raw JID string)
	MemberJID    string // group_participants.jid (text, raw JID string)
	MemberUser   string // jid.user of the row whose raw_string equals MemberJID ("" if none)
	MemberServer string // jid.server of the row whose raw_string equals MemberJID ("" if none)
	Admin        int    // group_participants.admin (0=member, 1=admin, 2=superadmin); 0 when NULL
}

// waQuoted represents a quoted/replied-to message reference read from the
// message_quoted table. Rows with a NULL or empty key_id are filtered out by
// the query, so QuotedKeyID is never empty for fetched values.
type waQuoted struct {
	MessageRowID int64  // message_quoted.message_row_id: the message that quotes
	QuotedKeyID  string // message_quoted.key_id of the quoted message
}

// waLidMapping maps a "lid" JID row to its corresponding phone JID,
// populated from the WhatsApp jid_map table.
type waLidMapping struct {
	LidRowID    int64  // jid_map.lid_row_id (= jid._id for the lid entry)
	PhoneUser   string // jid.user for the phone entry (e.g., "447700900000"); "" when NULL
	PhoneServer string // jid.server for the phone entry (e.g., "s.whatsapp.net"); "" when NULL
}

// ImportOptions configures the WhatsApp import process.
// The zero value carries no defaults; DefaultOptions fills in BatchSize and
// MaxMediaFileSize.
type ImportOptions struct {
	// Phone is the user's own phone number in E.164 format (e.g., "+447700900000").
	Phone string

	// DisplayName is an optional display name for the user.
	DisplayName string

	// MediaDir is an optional path to the decrypted Media folder.
	// If set, media files will be copied to content-addressed storage.
	MediaDir string

	// AttachmentsDir is the root directory for content-addressed attachment storage.
	// This should be cfg.AttachmentsDir() (e.g., ~/.msgvault/attachments/).
	// Required when MediaDir is set.
	AttachmentsDir string

	// MaxMediaFileSize is the maximum size of a single media file to copy (in bytes).
	// Files larger than this are skipped. DefaultOptions sets it to 100MB.
	MaxMediaFileSize int64

	// Limit limits the number of messages imported (0 = no limit, for testing).
	Limit int

	// BatchSize is the number of messages to process per batch.
	// DefaultOptions sets it to 1000.
	BatchSize int
}

// DefaultOptions returns ImportOptions with sensible defaults.
+func DefaultOptions() ImportOptions { + return ImportOptions{ + BatchSize: 1000, + MaxMediaFileSize: 100 * 1024 * 1024, // 100MB + } +} + +// ImportSummary holds statistics from a completed import. +type ImportSummary struct { + Duration time.Duration + ChatsProcessed int64 + MessagesProcessed int64 + MessagesAdded int64 + MessagesSkipped int64 + ReactionsAdded int64 + AttachmentsFound int64 + MediaCopied int64 + Participants int64 + Errors int64 +} + +// ImportProgress provides callbacks for import progress reporting. +type ImportProgress interface { + OnStart() + OnChatStart(chatJID string, chatTitle string, messageCount int) + OnProgress(processed, added, skipped int64) + OnChatComplete(chatJID string, messagesAdded int64) + OnComplete(summary *ImportSummary) + OnError(err error) +} + +// NullProgress is a no-op implementation of ImportProgress. +type NullProgress struct{} + +func (NullProgress) OnStart() {} +func (NullProgress) OnChatStart(string, string, int) {} +func (NullProgress) OnProgress(int64, int64, int64) {} +func (NullProgress) OnChatComplete(string, int64) {} +func (NullProgress) OnComplete(*ImportSummary) {} +func (NullProgress) OnError(error) {} From 19b1012d65fcdc01752d04129dd768cc2304f457 Mon Sep 17 00:00:00 2001 From: Ryan Stern <206953196+vanboompow@users.noreply.github.com> Date: Tue, 31 Mar 2026 18:16:54 -0500 Subject: [PATCH 02/65] Add iMessage sync support (#224) Sync iMessage history from the local macOS chat.db into msgvault. Reads conversations, messages, and participants from the iMessage SQLite database and stores them using the existing schema. 
Includes: - iMessage SQLite client with timestamp format auto-detection - Message and conversation parsing with participant resolution - CLI command (sync-imessage) with incremental sync support - Parser tests for message extraction and formatting Co-Authored-By: Ryan Stern <206953196+vanboompow@users.noreply.github.com> --- cmd/msgvault/cmd/sync_imessage.go | 210 ++++++++++++++ go.mod | 1 + go.sum | 4 + internal/imessage/client.go | 460 ++++++++++++++++++++++++++++++ internal/imessage/models.go | 21 ++ internal/imessage/parser.go | 218 ++++++++++++++ internal/imessage/parser_test.go | 300 +++++++++++++++++++ internal/sync/sync.go | 1 + 8 files changed, 1215 insertions(+) create mode 100644 cmd/msgvault/cmd/sync_imessage.go create mode 100644 internal/imessage/client.go create mode 100644 internal/imessage/models.go create mode 100644 internal/imessage/parser.go create mode 100644 internal/imessage/parser_test.go diff --git a/cmd/msgvault/cmd/sync_imessage.go b/cmd/msgvault/cmd/sync_imessage.go new file mode 100644 index 00000000..425453fc --- /dev/null +++ b/cmd/msgvault/cmd/sync_imessage.go @@ -0,0 +1,210 @@ +package cmd + +import ( + "context" + "fmt" + "os" + "os/signal" + "path/filepath" + "syscall" + "time" + + "github.com/spf13/cobra" + "github.com/wesm/msgvault/internal/imessage" + "github.com/wesm/msgvault/internal/store" + "github.com/wesm/msgvault/internal/sync" +) + +var ( + imessageDBPath string + imessageBefore string + imessageAfter string + imessageLimit int + imessageMe string + imessageNoResume bool +) + +var syncImessageCmd = &cobra.Command{ + Use: "sync-imessage", + Short: "Import iMessages from local database", + Long: `Import iMessages from macOS's local Messages database (chat.db). + +Reads messages from ~/Library/Messages/chat.db and stores them in the +msgvault archive alongside Gmail messages. This is a read-only operation +that does not modify the iMessage database. 
+ +Requires Full Disk Access permission in System Settings > Privacy & Security. + +Date filters: + --after 2024-01-01 Only messages on or after this date + --before 2024-12-31 Only messages before this date + +Examples: + msgvault sync-imessage + msgvault sync-imessage --after 2024-01-01 + msgvault sync-imessage --limit 100 + msgvault sync-imessage --me "+15551234567" + msgvault sync-imessage --db-path /path/to/chat.db`, + RunE: func(cmd *cobra.Command, args []string) error { + // Open msgvault database + dbPath := cfg.DatabaseDSN() + s, err := store.Open(dbPath) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer func() { _ = s.Close() }() + + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + + // Resolve chat.db path + chatDBPath := imessageDBPath + if chatDBPath == "" { + home, err := os.UserHomeDir() + if err != nil { + return fmt.Errorf("get home directory: %w", err) + } + chatDBPath = filepath.Join(home, "Library", "Messages", "chat.db") + } + + // Check chat.db exists + if _, err := os.Stat(chatDBPath); os.IsNotExist(err) { + return fmt.Errorf("iMessage database not found at %s\n\nMake sure you're running on macOS with Messages enabled", chatDBPath) + } + + // Build client options + var clientOpts []imessage.ClientOption + clientOpts = append(clientOpts, imessage.WithImessageLogger(logger)) + + if imessageMe != "" { + clientOpts = append(clientOpts, imessage.WithMyAddress(imessageMe)) + } + + if imessageAfter != "" { + t, err := time.ParseInLocation("2006-01-02", imessageAfter, time.Local) + if err != nil { + return fmt.Errorf("invalid --after date: %w (use YYYY-MM-DD format)", err) + } + clientOpts = append(clientOpts, imessage.WithAfterDate(t)) + } + + if imessageBefore != "" { + t, err := time.ParseInLocation("2006-01-02", imessageBefore, time.Local) + if err != nil { + return fmt.Errorf("invalid --before date: %w (use YYYY-MM-DD format)", err) + } + clientOpts = append(clientOpts, 
imessage.WithBeforeDate(t)) + } + + if imessageLimit > 0 { + clientOpts = append(clientOpts, imessage.WithLimit(imessageLimit)) + } + + // Determine source identifier + identifier := "local" + if imessageMe != "" { + identifier = imessageMe + } + + // Create iMessage client + imsgClient, err := imessage.NewClient(chatDBPath, identifier, clientOpts...) + if err != nil { + return fmt.Errorf("open iMessage database: %w", err) + } + defer func() { _ = imsgClient.Close() }() + + // Set up context with cancellation + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + // Handle Ctrl+C gracefully + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-sigChan + fmt.Println("\nInterrupted. Saving checkpoint...") + cancel() + }() + + // Set up sync options + opts := sync.DefaultOptions() + opts.NoResume = imessageNoResume + opts.SourceType = "apple_messages" + opts.AttachmentsDir = cfg.AttachmentsDir() + + // Create syncer with progress reporter + syncer := sync.New(imsgClient, s, opts). + WithLogger(logger). + WithProgress(&CLIProgress{}) + + // Run sync + startTime := time.Now() + fmt.Printf("Starting iMessage sync from %s\n", chatDBPath) + if imessageAfter != "" || imessageBefore != "" { + parts := []string{} + if imessageAfter != "" { + parts = append(parts, "after "+imessageAfter) + } + if imessageBefore != "" { + parts = append(parts, "before "+imessageBefore) + } + fmt.Printf("Date filter: %s\n", joinParts(parts)) + } + if imessageLimit > 0 { + fmt.Printf("Limit: %d messages\n", imessageLimit) + } + fmt.Println() + + summary, err := syncer.Full(ctx, identifier) + if err != nil { + if ctx.Err() != nil { + fmt.Println("\nSync interrupted. 
Run again to resume.") + return nil + } + return fmt.Errorf("sync failed: %w", err) + } + + // Print summary + fmt.Println() + fmt.Println("iMessage sync complete!") + fmt.Printf(" Duration: %s\n", summary.Duration.Round(time.Second)) + fmt.Printf(" Messages: %d found, %d added, %d skipped\n", + summary.MessagesFound, summary.MessagesAdded, summary.MessagesSkipped) + if summary.Errors > 0 { + fmt.Printf(" Errors: %d\n", summary.Errors) + } + if summary.WasResumed { + fmt.Printf(" (Resumed from checkpoint)\n") + } + + if summary.MessagesAdded > 0 { + elapsed := time.Since(startTime) + messagesPerSec := float64(summary.MessagesAdded) / elapsed.Seconds() + fmt.Printf(" Rate: %.1f messages/sec\n", messagesPerSec) + } + + return nil + }, +} + +func joinParts(parts []string) string { + result := "" + for i, p := range parts { + if i > 0 { + result += ", " + } + result += p + } + return result +} + +func init() { + syncImessageCmd.Flags().StringVar(&imessageDBPath, "db-path", "", "path to chat.db (default: ~/Library/Messages/chat.db)") + syncImessageCmd.Flags().StringVar(&imessageBefore, "before", "", "only messages before this date (YYYY-MM-DD)") + syncImessageCmd.Flags().StringVar(&imessageAfter, "after", "", "only messages after this date (YYYY-MM-DD)") + syncImessageCmd.Flags().IntVar(&imessageLimit, "limit", 0, "limit number of messages (for testing)") + syncImessageCmd.Flags().StringVar(&imessageMe, "me", "", "your phone number or email (e.g., +15551234567)") + syncImessageCmd.Flags().BoolVar(&imessageNoResume, "noresume", false, "force fresh sync (don't resume)") + rootCmd.AddCommand(syncImessageCmd) +} diff --git a/go.mod b/go.mod index 99258ee7..b44c42e0 100644 --- a/go.mod +++ b/go.mod @@ -28,6 +28,7 @@ require ( golang.org/x/sys v0.42.0 golang.org/x/text v0.35.0 golang.org/x/time v0.15.0 + howett.net/plist v1.0.1 ) require ( diff --git a/go.sum b/go.sum index 3754b6ef..3b411cc4 100644 --- a/go.sum +++ b/go.sum @@ -95,6 +95,7 @@ 
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 h1:iCHtR9CQyktQ5+f3dMVZfwD2KWJUgm7M0gdL9NGr8KA= github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk= +github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= github.com/jhillyerd/enmime v1.3.0 h1:LV5kzfLidiOr8qRGIpYYmUZCnhrPbcFAnAFUnWn99rw= github.com/jhillyerd/enmime v1.3.0/go.mod h1:6c6jg5HdRRV2FtvVL69LjiX1M8oE0xDX9VEhV3oy4gs= github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= @@ -226,5 +227,8 @@ golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6f gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +howett.net/plist v1.0.1 h1:37GdZ8tP09Q35o9ych3ehygcsL+HqKSwzctveSlarvM= +howett.net/plist v1.0.1/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g= diff --git a/internal/imessage/client.go b/internal/imessage/client.go new file mode 100644 index 00000000..a70131e1 --- /dev/null +++ b/internal/imessage/client.go @@ -0,0 +1,460 @@ +package imessage + +import ( + "context" + "database/sql" + "fmt" + "log/slog" + "strconv" + "time" + + _ "github.com/mattn/go-sqlite3" + "github.com/wesm/msgvault/internal/gmail" +) + +const defaultPageSize = 500 + +// Client reads from macOS's iMessage chat.db and 
// implements the gmail.API
// interface so it can be used with the existing sync infrastructure.
//
// A Client is stateful: ListMessages increments returned on every call so
// that limit is enforced across pages, and nothing visible here resets it.
type Client struct {
	db             *sql.DB      // handle to chat.db, opened by NewClient
	identifier     string       // source identifier for GetProfile (e.g., "local")
	myAddress      string       // normalized email-like address for the device owner
	afterDate      time.Time    // inclusive lower bound on message date (zero = no bound)
	beforeDate     time.Time    // exclusive upper bound on message date (zero = no bound)
	limit          int          // max total messages to return (0 = unlimited)
	returned       int          // messages returned so far (for limit tracking)
	useNanoseconds bool         // whether chat.db uses nanosecond timestamps
	logger         *slog.Logger // destination for warnings; NewClient defaults it to slog.Default()
	pageSize       int          // max rows per ListMessages page; NewClient sets defaultPageSize
}

// ClientOption configures a Client. Options are applied by NewClient after
// the defaults are set, so an option overrides the corresponding default.
type ClientOption func(*Client)

// WithAfterDate filters to messages on or after this date.
func WithAfterDate(t time.Time) ClientOption {
	return func(c *Client) { c.afterDate = t }
}

// WithBeforeDate filters to messages before this date.
func WithBeforeDate(t time.Time) ClientOption {
	return func(c *Client) { c.beforeDate = t }
}

// WithLimit sets the maximum number of messages to return across all pages.
// A value of 0 leaves the result unlimited.
func WithLimit(n int) ClientOption {
	return func(c *Client) { c.limit = n }
}

// WithMyAddress sets the owner's email-like address for MIME From headers
// on is_from_me messages. Without it NewClient uses "me@imessage.local".
func WithMyAddress(addr string) ClientOption {
	return func(c *Client) { c.myAddress = addr }
}

// WithImessageLogger sets the logger for the client.
func WithImessageLogger(l *slog.Logger) ClientOption {
	return func(c *Client) { c.logger = l }
}

// NewClient opens a read-only connection to an iMessage chat.db file
// and returns a Client that implements gmail.API.
+func NewClient(dbPath string, identifier string, opts ...ClientOption) (*Client, error) { + // Open chat.db read-only + dsn := fmt.Sprintf("file:%s?mode=ro&_journal_mode=WAL&_busy_timeout=5000", dbPath) + db, err := sql.Open("sqlite3", dsn) + if err != nil { + return nil, fmt.Errorf("open chat.db: %w", err) + } + + if err := db.Ping(); err != nil { + _ = db.Close() + return nil, fmt.Errorf("connect to chat.db: %w (check Full Disk Access permissions)", err) + } + + c := &Client{ + db: db, + identifier: identifier, + myAddress: "me@imessage.local", + logger: slog.Default(), + pageSize: defaultPageSize, + } + + for _, opt := range opts { + opt(c) + } + + // Detect timestamp format (nanoseconds vs seconds) + if err := c.detectTimestampFormat(); err != nil { + _ = db.Close() + return nil, fmt.Errorf("detect timestamp format: %w", err) + } + + return c, nil +} + +// Close closes the database connection. +func (c *Client) Close() error { + return c.db.Close() +} + +// detectTimestampFormat checks whether chat.db uses nanosecond timestamps +// (macOS High Sierra+) or second timestamps (older macOS). +func (c *Client) detectTimestampFormat() error { + var maxDate sql.NullInt64 + err := c.db.QueryRow("SELECT MAX(date) FROM message WHERE date > 0").Scan(&maxDate) + if err != nil { + return fmt.Errorf("query max date: %w", err) + } + if maxDate.Valid { + c.useNanoseconds = maxDate.Int64 > 1_000_000_000_000 + } + return nil +} + +// GetProfile returns a profile with the message count and max ROWID as history ID. 
+func (c *Client) GetProfile(ctx context.Context) (*gmail.Profile, error) { + var count int64 + if err := c.db.QueryRowContext(ctx, "SELECT COUNT(*) FROM message").Scan(&count); err != nil { + return nil, fmt.Errorf("count messages: %w", err) + } + + var maxROWID sql.NullInt64 + if err := c.db.QueryRowContext(ctx, "SELECT MAX(ROWID) FROM message").Scan(&maxROWID); err != nil { + return nil, fmt.Errorf("get max rowid: %w", err) + } + + historyID := uint64(0) + if maxROWID.Valid { + historyID = uint64(maxROWID.Int64) + } + + return &gmail.Profile{ + EmailAddress: c.identifier, + MessagesTotal: count, + HistoryID: historyID, + }, nil +} + +// ListLabels returns iMessage and SMS as labels. +func (c *Client) ListLabels(ctx context.Context) ([]*gmail.Label, error) { + return []*gmail.Label{ + {ID: "iMessage", Name: "iMessage", Type: "user"}, + {ID: "SMS", Name: "SMS", Type: "user"}, + }, nil +} + +// ListMessages returns a page of message IDs from chat.db, ordered by ROWID. +// The pageToken is the string representation of the last seen ROWID. +// The query parameter is ignored (date filtering is done via client options). +func (c *Client) ListMessages(ctx context.Context, query string, pageToken string) (*gmail.MessageListResponse, error) { + // Check limit + if c.limit > 0 && c.returned >= c.limit { + return &gmail.MessageListResponse{}, nil + } + + lastROWID := int64(0) + if pageToken != "" { + var err error + lastROWID, err = strconv.ParseInt(pageToken, 10, 64) + if err != nil { + return nil, fmt.Errorf("invalid page token: %w", err) + } + } + + // Build query + sqlQuery := ` + SELECT m.ROWID, COALESCE(c.guid, 'no-chat-' || CAST(m.ROWID AS TEXT)) as chat_guid + FROM message m + LEFT JOIN chat_message_join cmj ON cmj.message_id = m.ROWID + LEFT JOIN chat c ON c.ROWID = cmj.chat_id + WHERE m.ROWID > ?` + args := []interface{}{lastROWID} + + if !c.afterDate.IsZero() { + appleTS := timeToAppleTimestamp(c.afterDate, c.useNanoseconds) + sqlQuery += " AND m.date >= ?" 
+ args = append(args, appleTS) + } + if !c.beforeDate.IsZero() { + appleTS := timeToAppleTimestamp(c.beforeDate, c.useNanoseconds) + sqlQuery += " AND m.date < ?" + args = append(args, appleTS) + } + + sqlQuery += " ORDER BY m.ROWID ASC LIMIT ?" + + // Calculate page size respecting limit + pageSize := c.pageSize + if c.limit > 0 { + remaining := c.limit - c.returned + if remaining < pageSize { + pageSize = remaining + } + } + args = append(args, pageSize) + + rows, err := c.db.QueryContext(ctx, sqlQuery, args...) + if err != nil { + return nil, fmt.Errorf("list messages: %w", err) + } + defer func() { _ = rows.Close() }() + + var messages []gmail.MessageID + var maxRowID int64 + for rows.Next() { + var rowID int64 + var chatGUID string + if err := rows.Scan(&rowID, &chatGUID); err != nil { + return nil, fmt.Errorf("scan message: %w", err) + } + messages = append(messages, gmail.MessageID{ + ID: strconv.FormatInt(rowID, 10), + ThreadID: chatGUID, + }) + maxRowID = rowID + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("iterate messages: %w", err) + } + + c.returned += len(messages) + + // Determine next page token + var nextPageToken string + if len(messages) == pageSize { + nextPageToken = strconv.FormatInt(maxRowID, 10) + } + + // Get total estimate for progress reporting + totalEstimate := int64(len(messages)) + if pageToken == "" { + totalEstimate = c.countFilteredMessages(ctx) + } + + return &gmail.MessageListResponse{ + Messages: messages, + NextPageToken: nextPageToken, + ResultSizeEstimate: totalEstimate, + }, nil +} + +// countFilteredMessages returns the total count of messages matching the date filters. +func (c *Client) countFilteredMessages(ctx context.Context) int64 { + sqlQuery := "SELECT COUNT(*) FROM message WHERE 1=1" + var args []interface{} + + if !c.afterDate.IsZero() { + appleTS := timeToAppleTimestamp(c.afterDate, c.useNanoseconds) + sqlQuery += " AND date >= ?" 
+ args = append(args, appleTS) + } + if !c.beforeDate.IsZero() { + appleTS := timeToAppleTimestamp(c.beforeDate, c.useNanoseconds) + sqlQuery += " AND date < ?" + args = append(args, appleTS) + } + + var count int64 + if err := c.db.QueryRowContext(ctx, sqlQuery, args...).Scan(&count); err != nil { + return 0 + } + return count +} + +// GetMessageRaw fetches a single message and builds synthetic MIME data. +func (c *Client) GetMessageRaw(ctx context.Context, messageID string) (*gmail.RawMessage, error) { + rowID, err := strconv.ParseInt(messageID, 10, 64) + if err != nil { + return nil, fmt.Errorf("invalid message ID: %w", err) + } + + // Query message with handle and chat info + var msg messageRow + err = c.db.QueryRowContext(ctx, ` + SELECT + m.ROWID, m.guid, m.text, m.attributedBody, m.date, m.is_from_me, m.service, + m.cache_has_attachments, + h.id, + c.ROWID, c.guid, c.display_name, c.chat_identifier + FROM message m + LEFT JOIN handle h ON h.ROWID = m.handle_id + LEFT JOIN chat_message_join cmj ON cmj.message_id = m.ROWID + LEFT JOIN chat c ON c.ROWID = cmj.chat_id + WHERE m.ROWID = ? 
+ `, rowID).Scan( + &msg.ROWID, &msg.GUID, &msg.Text, &msg.AttributedBody, &msg.Date, &msg.IsFromMe, &msg.Service, + &msg.HasAttachments, + &msg.HandleID, + &msg.ChatROWID, &msg.ChatGUID, &msg.ChatDisplayName, &msg.ChatIdentifier, + ) + if err == sql.ErrNoRows { + return nil, &gmail.NotFoundError{Path: "/messages/" + messageID} + } + if err != nil { + return nil, fmt.Errorf("get message %s: %w", messageID, err) + } + + // Warn about attachments that won't be archived + if msg.HasAttachments != 0 { + c.logger.Warn("message has attachments that will not be archived (attachment extraction not yet implemented)", "id", messageID, "guid", msg.GUID) + } + + // Determine sender and recipients + fromAddr, toAddrs := c.resolveParticipants(ctx, &msg) + + // Convert Apple timestamp to time + msgDate := appleTimestampToTime(msg.Date) + + // Get message body: prefer plain-text column, fall back to attributedBody blob + // (macOS Ventura+ / iOS 16+ stopped populating m.text for many message types). + body := "" + if msg.Text != nil { + body = *msg.Text + } else if len(msg.AttributedBody) > 0 { + body = extractAttributedBodyText(msg.AttributedBody) + } + + // Build MIME + mimeData := buildMIME(fromAddr, toAddrs, msgDate, msg.GUID, body) + + // Determine thread ID + threadID := "no-chat-" + messageID + if msg.ChatGUID != nil { + threadID = *msg.ChatGUID + } + + // Build label based on service + var labelIDs []string + if msg.Service != nil && *msg.Service != "" { + labelIDs = []string{*msg.Service} + } + + // InternalDate as Unix milliseconds + internalDate := int64(0) + if !msgDate.IsZero() { + internalDate = msgDate.UnixMilli() + } + + return &gmail.RawMessage{ + ID: messageID, + ThreadID: threadID, + LabelIDs: labelIDs, + Snippet: snippet(body, 100), + HistoryID: uint64(msg.ROWID), + InternalDate: internalDate, + SizeEstimate: int64(len(mimeData)), + Raw: mimeData, + }, nil +} + +// resolveParticipants determines the From and To addresses for a message. 
+func (c *Client) resolveParticipants(ctx context.Context, msg *messageRow) (from []string, to []string) { + if msg.IsFromMe != 0 { + // Sender is the device owner + from = []string{c.myAddress} + // Recipients are the chat participants + if msg.ChatROWID != nil { + to = c.getChatParticipants(ctx, *msg.ChatROWID) + } else if msg.HandleID != nil { + email, _, _ := normalizeIdentifier(*msg.HandleID) + if email != "" { + to = []string{email} + } + } + } else { + // Sender is from the handle table + if msg.HandleID != nil { + email, _, _ := normalizeIdentifier(*msg.HandleID) + if email != "" { + from = []string{email} + } + } + // Recipient is the device owner (and possibly other participants in group chats) + to = []string{c.myAddress} + if msg.ChatROWID != nil { + others := c.getChatParticipants(ctx, *msg.ChatROWID) + // Add other participants (exclude the sender) + senderAddr := "" + if len(from) > 0 { + senderAddr = from[0] + } + for _, addr := range others { + if addr != senderAddr && addr != c.myAddress { + to = append(to, addr) + } + } + } + } + return from, to +} + +// getChatParticipants returns the normalized email addresses of all participants +// in a chat (excluding the device owner). +func (c *Client) getChatParticipants(ctx context.Context, chatROWID int64) []string { + rows, err := c.db.QueryContext(ctx, ` + SELECT h.id + FROM chat_handle_join chj + JOIN handle h ON h.ROWID = chj.handle_id + WHERE chj.chat_id = ? + `, chatROWID) + if err != nil { + c.logger.Warn("failed to get chat participants", "chat_id", chatROWID, "error", err) + return nil + } + defer func() { _ = rows.Close() }() + + var addrs []string + for rows.Next() { + var handleID string + if err := rows.Scan(&handleID); err != nil { + continue + } + email, _, _ := normalizeIdentifier(handleID) + if email != "" { + addrs = append(addrs, email) + } + } + return addrs +} + +// GetMessagesRawBatch fetches multiple messages sequentially. 
+// Since we're reading from a local database, parallelism adds no benefit. +func (c *Client) GetMessagesRawBatch(ctx context.Context, messageIDs []string) ([]*gmail.RawMessage, error) { + results := make([]*gmail.RawMessage, 0, len(messageIDs)) + for _, id := range messageIDs { + msg, err := c.GetMessageRaw(ctx, id) + if err != nil { + c.logger.Warn("failed to fetch message", "id", id, "error", err) + continue + } + results = append(results, msg) + } + return results, nil +} + +// ListHistory is not supported for iMessage (no incremental sync yet). +func (c *Client) ListHistory(ctx context.Context, startHistoryID uint64, pageToken string) (*gmail.HistoryResponse, error) { + return &gmail.HistoryResponse{ + HistoryID: startHistoryID, + }, nil +} + +// TrashMessage is not supported for iMessage. +func (c *Client) TrashMessage(ctx context.Context, messageID string) error { + return fmt.Errorf("trash not supported for iMessage") +} + +// DeleteMessage is not supported for iMessage. +func (c *Client) DeleteMessage(ctx context.Context, messageID string) error { + return fmt.Errorf("delete not supported for iMessage") +} + +// BatchDeleteMessages is not supported for iMessage. +func (c *Client) BatchDeleteMessages(ctx context.Context, messageIDs []string) error { + return fmt.Errorf("batch delete not supported for iMessage") +} + +// Ensure Client implements gmail.API. +var _ gmail.API = (*Client)(nil) diff --git a/internal/imessage/models.go b/internal/imessage/models.go new file mode 100644 index 00000000..3a62a688 --- /dev/null +++ b/internal/imessage/models.go @@ -0,0 +1,21 @@ +// Package imessage provides an iMessage client that reads from macOS's chat.db +// and implements the gmail.API interface for use with the existing sync infrastructure. +package imessage + +// messageRow holds a row from the iMessage chat.db message table +// joined with chat and handle info. 
+type messageRow struct { + ROWID int64 + GUID string + Text *string // nullable - some messages only have attributedBody + AttributedBody []byte // NSKeyedArchiver blob; fallback when Text is nil (macOS Ventura+) + Date int64 // Apple epoch timestamp (seconds or nanoseconds) + IsFromMe int + Service *string // "iMessage", "SMS", or NULL for system messages + HasAttachments int + HandleID *string // handle.id (phone or email), NULL for is_from_me + ChatROWID *int64 // chat.ROWID for participant lookup + ChatGUID *string // chat.guid, used as conversation/thread ID + ChatDisplayName *string // chat.display_name (set for group chats) + ChatIdentifier *string // chat.chat_identifier +} diff --git a/internal/imessage/parser.go b/internal/imessage/parser.go new file mode 100644 index 00000000..0d2f9f96 --- /dev/null +++ b/internal/imessage/parser.go @@ -0,0 +1,218 @@ +package imessage + +import ( + "crypto/sha256" + "encoding/hex" + "net/mail" + "strings" + "time" + + "howett.net/plist" +) + +// appleEpochOffset is the number of seconds between Unix epoch (1970-01-01) +// and Apple/Core Data epoch (2001-01-01). +const appleEpochOffset int64 = 978307200 + +// appleTimestampToTime converts an Apple epoch timestamp to time.Time. +// macOS High Sierra+ stores dates as nanoseconds since Apple epoch; +// older versions use seconds. We detect the format by checking magnitude. +func appleTimestampToTime(ts int64) time.Time { + if ts == 0 { + return time.Time{} + } + // Values > 1e12 are nanoseconds (1e12 ns = ~16 minutes from epoch, + // while 1e12 seconds from epoch = year ~33700). + if ts > 1_000_000_000_000 { + sec := ts / 1_000_000_000 + nsec := ts % 1_000_000_000 + return time.Unix(sec+appleEpochOffset, nsec).UTC() + } + return time.Unix(ts+appleEpochOffset, 0).UTC() +} + +// timeToAppleTimestamp converts a time.Time to an Apple epoch timestamp. +// If useNano is true, returns nanoseconds; otherwise returns seconds. 
+func timeToAppleTimestamp(t time.Time, useNano bool) int64 { + appleSec := t.Unix() - appleEpochOffset + if useNano { + return appleSec*1_000_000_000 + int64(t.Nanosecond()) + } + return appleSec +} + +// normalizeIdentifier converts a phone number or email address from iMessage's +// handle table into a normalized email-like identifier for the participants table. +// Returns the normalized email, domain, and display name. +func normalizeIdentifier(handleID string) (email, domain, displayName string) { + handleID = strings.TrimSpace(handleID) + if handleID == "" { + return "", "", "" + } + + // Email addresses: use as-is (lowercased) + if strings.Contains(handleID, "@") { + email = strings.ToLower(handleID) + if idx := strings.LastIndex(email, "@"); idx >= 0 { + domain = email[idx+1:] + } + return email, domain, "" + } + + // Phone numbers: normalize and use a synthetic domain + phone := normalizePhone(handleID) + return phone + "@phone.imessage", "phone.imessage", phone +} + +// normalizePhone strips non-digit characters from a phone number and attempts +// to produce a consistent E.164-like format. +func normalizePhone(phone string) string { + // Preserve leading + + hasPlus := strings.HasPrefix(phone, "+") + + // Extract digits only + var digits strings.Builder + for _, r := range phone { + if r >= '0' && r <= '9' { + digits.WriteRune(r) + } + } + d := digits.String() + if d == "" { + return phone // Return original if no digits found + } + + // Try to normalize to E.164 + if hasPlus { + return "+" + d + } + // 10-digit US number + if len(d) == 10 { + return "+1" + d + } + // 11-digit number starting with 1 (US with country code) + if len(d) == 11 && d[0] == '1' { + return "+" + d + } + // Other: prefix with + + return "+" + d +} + +// buildMIME constructs a minimal RFC 2822 message from iMessage data. +// The resulting bytes can be parsed by enmime for the sync pipeline. 
//
// Headers are written in a fixed order (From, To, Date, Subject, Message-ID,
// MIME-Version, Content-Type) with CRLF line endings. From, To, Date and
// Message-ID are omitted when their input is empty/zero; only the first entry
// of fromAddr is used. The body is appended verbatim after the blank line.
func buildMIME(fromAddr, toAddrs []string, date time.Time, messageID, body string) []byte {
	var msg strings.Builder

	// header writes one "<prefix><value>\r\n" header line.
	header := func(prefix, value string) {
		msg.WriteString(prefix)
		msg.WriteString(value)
		msg.WriteString("\r\n")
	}

	if len(fromAddr) > 0 {
		header("From: ", formatMIMEAddress(fromAddr[0]))
	}

	if len(toAddrs) > 0 {
		recipients := make([]string, len(toAddrs))
		for i, addr := range toAddrs {
			recipients[i] = formatMIMEAddress(addr)
		}
		header("To: ", strings.Join(recipients, ", "))
	}

	if !date.IsZero() {
		header("Date: ", date.Format(time.RFC1123Z))
	}

	// iMessages have no subject, but the header is always present.
	header("Subject: ", "")

	// iMessage GUIDs contain characters such as ':' and '/' that are not
	// valid in an RFC 5322 msg-id, so a truncated SHA-256 (12 bytes, 24 hex
	// characters) of the GUID is used instead.
	if messageID != "" {
		digest := sha256.Sum256([]byte(messageID))
		header("Message-ID: <", hex.EncodeToString(digest[:12])+"@imessage.local>")
	}

	msg.WriteString("MIME-Version: 1.0\r\n")
	msg.WriteString("Content-Type: text/plain; charset=utf-8\r\n")

	// Blank line separating headers from the body, then the body itself.
	msg.WriteString("\r\n")
	msg.WriteString(body)

	return []byte(msg.String())
}

// formatMIMEAddress renders a bare address in header form using net/mail,
// with no display name.
func formatMIMEAddress(addr string) string {
	formatted := mail.Address{Address: addr}
	return formatted.String()
}

// extractAttributedBodyText decodes an NSKeyedArchiver binary plist blob from
// chat.db's attributedBody column and returns the plain text string.
//
// macOS Ventura+ / iOS 16+ stopped populating the plain-text "text" column for
// most iMessages; the content lives exclusively in attributedBody as an
// NSAttributedString archived via NSKeyedArchiver.
+func extractAttributedBodyText(data []byte) string { + if len(data) == 0 { + return "" + } + + var archive struct { + Top map[string]plist.UID `plist:"$top"` + Objects []interface{} `plist:"$objects"` + } + if _, err := plist.Unmarshal(data, &archive); err != nil { + return "" + } + + rootUID, ok := archive.Top["root"] + if !ok || int(rootUID) >= len(archive.Objects) { + return "" + } + + rootObj, ok := archive.Objects[rootUID].(map[string]interface{}) + if !ok { + return "" + } + + nsStringUID, ok := rootObj["NS.string"].(plist.UID) + if !ok { + return "" + } + if int(nsStringUID) >= len(archive.Objects) { + return "" + } + + text, ok := archive.Objects[nsStringUID].(string) + if !ok { + return "" + } + return text +} + +// snippet returns the first n characters of s, suitable for message preview. +func snippet(s string, maxLen int) string { + // Normalize whitespace + s = strings.Join(strings.Fields(s), " ") + runes := []rune(s) + if len(runes) > maxLen { + return string(runes[:maxLen]) + } + return s +} diff --git a/internal/imessage/parser_test.go b/internal/imessage/parser_test.go new file mode 100644 index 00000000..8c4b3beb --- /dev/null +++ b/internal/imessage/parser_test.go @@ -0,0 +1,300 @@ +package imessage + +import ( + "strings" + "testing" + "time" + + "howett.net/plist" +) + +func TestAppleTimestampToTime(t *testing.T) { + tests := []struct { + name string + ts int64 + wantYear int + wantZero bool + }{ + { + name: "zero returns zero time", + ts: 0, + wantZero: true, + }, + { + name: "nanoseconds - 2024-01-01", + ts: 725760000000000000, // 2024-01-01 00:00:00 UTC in Apple nanoseconds + wantYear: 2024, + }, + { + name: "seconds - 2024-01-01", + ts: 725760000, // 2024-01-01 00:00:00 UTC in Apple seconds + wantYear: 2024, + }, + { + name: "nanoseconds - 2020-06-15", + ts: 613872000000000000, // 2020-06-15 in Apple nanoseconds + wantYear: 2020, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := 
appleTimestampToTime(tt.ts) + if tt.wantZero { + if !got.IsZero() { + t.Errorf("expected zero time, got %v", got) + } + return + } + if got.Year() != tt.wantYear { + t.Errorf("expected year %d, got %d (time: %v)", tt.wantYear, got.Year(), got) + } + }) + } +} + +func TestTimeToAppleTimestamp(t *testing.T) { + // 2024-01-01 00:00:00 UTC + tm := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC) + + // In seconds + gotSec := timeToAppleTimestamp(tm, false) + wantSec := int64(725760000) // 1704067200 - 978307200 + if gotSec != wantSec { + t.Errorf("seconds: got %d, want %d", gotSec, wantSec) + } + + // In nanoseconds + gotNano := timeToAppleTimestamp(tm, true) + wantNano := int64(725760000000000000) + if gotNano != wantNano { + t.Errorf("nanoseconds: got %d, want %d", gotNano, wantNano) + } +} + +func TestRoundTripTimestamp(t *testing.T) { + original := time.Date(2024, 6, 15, 12, 30, 45, 0, time.UTC) + + // Round trip through nanoseconds + appleNano := timeToAppleTimestamp(original, true) + recovered := appleTimestampToTime(appleNano) + if !recovered.Equal(original) { + t.Errorf("nanosecond round trip: got %v, want %v", recovered, original) + } + + // Round trip through seconds (loses sub-second precision) + appleSec := timeToAppleTimestamp(original, false) + recoveredSec := appleTimestampToTime(appleSec) + expected := time.Date(2024, 6, 15, 12, 30, 45, 0, time.UTC) + if !recoveredSec.Equal(expected) { + t.Errorf("second round trip: got %v, want %v", recoveredSec, expected) + } +} + +func TestNormalizeIdentifier(t *testing.T) { + tests := []struct { + name string + handleID string + wantEmail string + wantDomain string + }{ + { + name: "email address", + handleID: "John@Example.com", + wantEmail: "john@example.com", + wantDomain: "example.com", + }, + { + name: "US phone with +1", + handleID: "+15551234567", + wantEmail: "+15551234567@phone.imessage", + wantDomain: "phone.imessage", + }, + { + name: "US phone 10 digits", + handleID: "5551234567", + wantEmail: 
"+15551234567@phone.imessage", + wantDomain: "phone.imessage", + }, + { + name: "US phone with formatting", + handleID: "(555) 123-4567", + wantEmail: "+15551234567@phone.imessage", + wantDomain: "phone.imessage", + }, + { + name: "US phone 11 digits with 1", + handleID: "15551234567", + wantEmail: "+15551234567@phone.imessage", + wantDomain: "phone.imessage", + }, + { + name: "international phone", + handleID: "+447911123456", + wantEmail: "+447911123456@phone.imessage", + wantDomain: "phone.imessage", + }, + { + name: "empty string", + handleID: "", + wantEmail: "", + wantDomain: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotEmail, gotDomain, _ := normalizeIdentifier(tt.handleID) + if gotEmail != tt.wantEmail { + t.Errorf("email: got %q, want %q", gotEmail, tt.wantEmail) + } + if gotDomain != tt.wantDomain { + t.Errorf("domain: got %q, want %q", gotDomain, tt.wantDomain) + } + }) + } +} + +func TestBuildMIME(t *testing.T) { + date := time.Date(2024, 6, 15, 14, 30, 0, 0, time.UTC) + mime := buildMIME( + []string{"sender@example.com"}, + []string{"recipient@example.com", "other@example.com"}, + date, + "p:0/ABC123", + "Hello, world!", + ) + + mimeStr := string(mime) + + // Check required headers + if !strings.Contains(mimeStr, "From: ") { + t.Error("missing or incorrect From header") + } + if !strings.Contains(mimeStr, "To: , ") { + t.Error("missing or incorrect To header") + } + if !strings.Contains(mimeStr, "Date: ") { + t.Error("missing Date header") + } + // Message-ID is a hash of the GUID (RFC 5322 safe) + if !strings.Contains(mimeStr, "Message-ID: <") || !strings.Contains(mimeStr, "@imessage.local>") { + t.Error("missing Message-ID header") + } + // Verify the raw GUID with invalid chars is NOT present + if strings.Contains(mimeStr, "p:0/ABC123@imessage.local") { + t.Error("Message-ID should not contain raw GUID with invalid chars") + } + if !strings.Contains(mimeStr, "Content-Type: text/plain; charset=utf-8") { + 
t.Error("missing Content-Type header") + } + if !strings.Contains(mimeStr, "MIME-Version: 1.0") { + t.Error("missing MIME-Version header") + } + // Check body is after blank line + if !strings.Contains(mimeStr, "\r\n\r\nHello, world!") { + t.Error("body not found after header separator") + } +} + +func TestBuildMIME_EmptyBody(t *testing.T) { + date := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC) + mime := buildMIME( + []string{"sender@example.com"}, + []string{"recipient@example.com"}, + date, + "test-guid", + "", + ) + + mimeStr := string(mime) + + // Should still have headers and separator + if !strings.Contains(mimeStr, "\r\n\r\n") { + t.Error("missing header/body separator") + } + // Body should be empty + parts := strings.SplitN(mimeStr, "\r\n\r\n", 2) + if len(parts) != 2 || parts[1] != "" { + t.Errorf("expected empty body, got %q", parts[1]) + } +} + +// makeAttributedBodyBlob builds a minimal NSKeyedArchiver binary plist blob +// equivalent to an NSAttributedString with the given text. 
+func makeAttributedBodyBlob(text string) []byte { + archive := struct { + Archiver string `plist:"$archiver"` + Version uint64 `plist:"$version"` + Top map[string]plist.UID `plist:"$top"` + Objects []interface{} `plist:"$objects"` + }{ + Archiver: "NSKeyedArchiver", + Version: 100000, + Top: map[string]plist.UID{"root": 1}, + Objects: []interface{}{ + "$null", + map[string]interface{}{ + "$class": plist.UID(3), + "NS.string": plist.UID(2), + }, + text, + map[string]interface{}{ + "$classname": "NSAttributedString", + "$classes": []string{"NSAttributedString", "NSObject"}, + }, + }, + } + data, err := plist.Marshal(archive, plist.BinaryFormat) + if err != nil { + panic("makeAttributedBodyBlob: " + err.Error()) + } + return data +} + +func TestExtractAttributedBodyText(t *testing.T) { + tests := []struct { + name string + input []byte + want string + }{ + {"nil blob", nil, ""}, + {"empty blob", []byte{}, ""}, + {"invalid plist", []byte("not a plist"), ""}, + {"plain ASCII message", makeAttributedBodyBlob("Hello from iMessage"), "Hello from iMessage"}, + {"unicode and emoji", makeAttributedBodyBlob("Hey! \xf0\x9f\x98\x8a"), "Hey! 
\xf0\x9f\x98\x8a"}, + {"multiline", makeAttributedBodyBlob("Line one\nLine two"), "Line one\nLine two"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := extractAttributedBodyText(tt.input) + if got != tt.want { + t.Errorf("got %q, want %q", got, tt.want) + } + }) + } +} + +func TestSnippet(t *testing.T) { + tests := []struct { + name string + input string + maxLen int + want string + }{ + {"short text", "hello", 100, "hello"}, + {"long text", "hello world this is a long message", 10, "hello worl"}, + {"empty", "", 100, ""}, + {"multiline", "line1\nline2\nline3", 100, "line1 line2 line3"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := snippet(tt.input, tt.maxLen) + if got != tt.want { + t.Errorf("got %q, want %q", got, tt.want) + } + }) + } +} diff --git a/internal/sync/sync.go b/internal/sync/sync.go index f1d9e2fc..15cf307a 100644 --- a/internal/sync/sync.go +++ b/internal/sync/sync.go @@ -55,6 +55,7 @@ func DefaultOptions() *Options { return &Options{ BatchSize: 10, CheckpointInterval: 200, + SourceType: "gmail", } } From fb1d5a90acdd463639c01e1ff3c63d502af176ac Mon Sep 17 00:00:00 2001 From: Ryan Stern <206953196+vanboompow@users.noreply.github.com> Date: Tue, 31 Mar 2026 18:17:40 -0500 Subject: [PATCH 03/65] Add Google Voice Takeout import support (#225) Import Google Voice history from Google Takeout exports into msgvault. Parses HTML conversation files, VCF contacts, and call logs from the Takeout directory structure. 
Includes: - Takeout directory parser for texts, voicemails, and calls - HTML conversation parser with timestamp and participant extraction - VCF contact parser for Google Voice number detection - CLI command (sync-gvoice) with conversation deduplication - Parser tests for HTML and VCF extraction Co-Authored-By: Ryan Stern <206953196+vanboompow@users.noreply.github.com> --- cmd/msgvault/cmd/sync_gvoice.go | 179 +++++++++ internal/gvoice/client.go | 627 ++++++++++++++++++++++++++++++++ internal/gvoice/models.go | 98 +++++ internal/gvoice/parser.go | 533 +++++++++++++++++++++++++++ internal/gvoice/parser_test.go | 446 +++++++++++++++++++++++ 5 files changed, 1883 insertions(+) create mode 100644 cmd/msgvault/cmd/sync_gvoice.go create mode 100644 internal/gvoice/client.go create mode 100644 internal/gvoice/models.go create mode 100644 internal/gvoice/parser.go create mode 100644 internal/gvoice/parser_test.go diff --git a/cmd/msgvault/cmd/sync_gvoice.go b/cmd/msgvault/cmd/sync_gvoice.go new file mode 100644 index 00000000..de1cf744 --- /dev/null +++ b/cmd/msgvault/cmd/sync_gvoice.go @@ -0,0 +1,179 @@ +package cmd + +import ( + "context" + "fmt" + "os" + "os/signal" + "strings" + "syscall" + "time" + + "github.com/spf13/cobra" + "github.com/wesm/msgvault/internal/gvoice" + "github.com/wesm/msgvault/internal/store" + "github.com/wesm/msgvault/internal/sync" +) + +var ( + gvoiceBefore string + gvoiceAfter string + gvoiceLimit int + gvoiceNoResume bool +) + +var syncGvoiceCmd = &cobra.Command{ + Use: "sync-gvoice ", + Short: "Import Google Voice messages from Takeout export", + Long: `Import Google Voice SMS, MMS, and call records from a Google Takeout export. + +Reads HTML files from the Voice/Calls/ directory in a Takeout archive and +stores them in the msgvault archive alongside Gmail and iMessage data. + +The takeout-voice-dir argument should point to the "Voice" directory inside +the extracted Takeout archive, which contains "Calls/" and "Phones.vcf". 
+ +Date filters: + --after 2020-01-01 Only messages on or after this date + --before 2024-12-31 Only messages before this date + +Examples: + msgvault sync-gvoice /path/to/Takeout/Voice + msgvault sync-gvoice /path/to/Takeout/Voice --after 2020-01-01 + msgvault sync-gvoice /path/to/Takeout/Voice --limit 100`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + takeoutDir := args[0] + + // Open msgvault database + dbPath := cfg.DatabaseDSN() + s, err := store.Open(dbPath) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer func() { _ = s.Close() }() + + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + + // Check takeout directory exists + if _, err := os.Stat(takeoutDir); os.IsNotExist(err) { + return fmt.Errorf("takeout directory not found: %s", takeoutDir) + } + + // Build client options + var clientOpts []gvoice.ClientOption + clientOpts = append(clientOpts, gvoice.WithLogger(logger)) + + if gvoiceAfter != "" { + t, err := time.Parse("2006-01-02", gvoiceAfter) + if err != nil { + return fmt.Errorf("invalid --after date: %w (use YYYY-MM-DD format)", err) + } + clientOpts = append(clientOpts, gvoice.WithAfterDate(t)) + } + + if gvoiceBefore != "" { + t, err := time.Parse("2006-01-02", gvoiceBefore) + if err != nil { + return fmt.Errorf("invalid --before date: %w (use YYYY-MM-DD format)", err) + } + clientOpts = append(clientOpts, gvoice.WithBeforeDate(t)) + } + + if gvoiceLimit > 0 { + clientOpts = append(clientOpts, gvoice.WithLimit(gvoiceLimit)) + } + + // Create Google Voice client + gvClient, err := gvoice.NewClient(takeoutDir, clientOpts...) 
+ if err != nil { + return fmt.Errorf("open Google Voice takeout: %w", err) + } + defer func() { _ = gvClient.Close() }() + + identifier := gvClient.Identifier() + + // Set up context with cancellation + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + // Handle Ctrl+C gracefully + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-sigChan + fmt.Println("\nInterrupted. Saving checkpoint...") + cancel() + }() + + // Set up sync options + opts := sync.DefaultOptions() + opts.NoResume = gvoiceNoResume + opts.SourceType = "google_voice" + opts.AttachmentsDir = cfg.AttachmentsDir() + + // Create syncer with progress reporter + syncer := sync.New(gvClient, s, opts). + WithLogger(logger). + WithProgress(&CLIProgress{}) + + // Run sync + startTime := time.Now() + fmt.Printf("Starting Google Voice import from %s\n", takeoutDir) + fmt.Printf("Google Voice number: %s\n", identifier) + if gvoiceAfter != "" || gvoiceBefore != "" { + parts := []string{} + if gvoiceAfter != "" { + parts = append(parts, "after "+gvoiceAfter) + } + if gvoiceBefore != "" { + parts = append(parts, "before "+gvoiceBefore) + } + fmt.Printf("Date filter: %s\n", strings.Join(parts, ", ")) + } + if gvoiceLimit > 0 { + fmt.Printf("Limit: %d messages\n", gvoiceLimit) + } + fmt.Println() + + summary, err := syncer.Full(ctx, identifier) + if err != nil { + if ctx.Err() != nil { + fmt.Println("\nSync interrupted. 
Run again to resume.") + return nil + } + return fmt.Errorf("sync failed: %w", err) + } + + // Print summary + fmt.Println() + fmt.Println("Google Voice import complete!") + fmt.Printf(" Duration: %s\n", summary.Duration.Round(time.Second)) + fmt.Printf(" Messages: %d found, %d added, %d skipped\n", + summary.MessagesFound, summary.MessagesAdded, summary.MessagesSkipped) + if summary.Errors > 0 { + fmt.Printf(" Errors: %d\n", summary.Errors) + } + if summary.WasResumed { + fmt.Printf(" (Resumed from checkpoint)\n") + } + + if summary.MessagesAdded > 0 { + elapsed := time.Since(startTime) + messagesPerSec := float64(summary.MessagesAdded) / elapsed.Seconds() + fmt.Printf(" Rate: %.1f messages/sec\n", messagesPerSec) + } + + return nil + }, +} + +func init() { + syncGvoiceCmd.Flags().StringVar(&gvoiceBefore, "before", "", "only messages before this date (YYYY-MM-DD)") + syncGvoiceCmd.Flags().StringVar(&gvoiceAfter, "after", "", "only messages after this date (YYYY-MM-DD)") + syncGvoiceCmd.Flags().IntVar(&gvoiceLimit, "limit", 0, "limit number of messages (for testing)") + syncGvoiceCmd.Flags().BoolVar(&gvoiceNoResume, "noresume", false, "force fresh sync (don't resume)") + rootCmd.AddCommand(syncGvoiceCmd) +} diff --git a/internal/gvoice/client.go b/internal/gvoice/client.go new file mode 100644 index 00000000..4a7dd032 --- /dev/null +++ b/internal/gvoice/client.go @@ -0,0 +1,627 @@ +package gvoice + +import ( + "context" + "fmt" + "log/slog" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "time" + + "github.com/wesm/msgvault/internal/gmail" +) + +const defaultPageSize = 500 + +// Client reads from a Google Voice Takeout export and implements the gmail.API +// interface so it can be used with the existing sync infrastructure. 
+type Client struct { + takeoutDir string + owner ownerPhones + identifier string // GV phone number used as source identifier + afterDate time.Time + beforeDate time.Time + limit int + returned int + index []indexEntry + indexBuilt bool + logger *slog.Logger + pageSize int + + // LRU cache for parsed HTML files (avoid re-parsing when consecutive + // messages come from the same file) + lastFilePath string + lastMessages []textMessage + lastGroupPar []string +} + +// ClientOption configures a Client. +type ClientOption func(*Client) + +// WithAfterDate filters to messages on or after this date. +func WithAfterDate(t time.Time) ClientOption { + return func(c *Client) { c.afterDate = t } +} + +// WithBeforeDate filters to messages before this date. +func WithBeforeDate(t time.Time) ClientOption { + return func(c *Client) { c.beforeDate = t } +} + +// WithLimit sets the maximum number of messages to return across all pages. +func WithLimit(n int) ClientOption { + return func(c *Client) { c.limit = n } +} + +// WithLogger sets the logger for the client. +func WithLogger(l *slog.Logger) ClientOption { + return func(c *Client) { c.logger = l } +} + +// NewClient creates a Client from a Google Voice Takeout directory. +// The directory should be the "Voice" folder containing "Calls/" and "Phones.vcf". 
+func NewClient(takeoutDir string, opts ...ClientOption) (*Client, error) { + // Validate directory exists + info, err := os.Stat(takeoutDir) + if err != nil { + return nil, fmt.Errorf("takeout directory: %w", err) + } + if !info.IsDir() { + return nil, fmt.Errorf("not a directory: %s", takeoutDir) + } + + // Check for Calls subdirectory + callsDir := filepath.Join(takeoutDir, "Calls") + if _, err := os.Stat(callsDir); err != nil { + return nil, fmt.Errorf("calls directory not found in %s: %w", takeoutDir, err) + } + + // Parse Phones.vcf + vcfPath := filepath.Join(takeoutDir, "Phones.vcf") + vcfData, err := os.ReadFile(vcfPath) + if err != nil { + return nil, fmt.Errorf("read Phones.vcf: %w", err) + } + + owner, err := parseVCF(vcfData) + if err != nil { + return nil, fmt.Errorf("parse Phones.vcf: %w", err) + } + + c := &Client{ + takeoutDir: takeoutDir, + owner: owner, + identifier: owner.GoogleVoice, + logger: slog.Default(), + pageSize: defaultPageSize, + } + + for _, opt := range opts { + opt(c) + } + + return c, nil +} + +// Identifier returns the Google Voice phone number used as source identifier. +func (c *Client) Identifier() string { + return c.identifier +} + +// Close is a no-op for the Takeout client. +func (c *Client) Close() error { + return nil +} + +// buildIndex walks the Calls directory, parses each HTML file, and builds +// a sorted index of all messages and call records. This is done lazily +// on the first call to ListMessages. 
+func (c *Client) buildIndex() error { + if c.indexBuilt { + return nil + } + + callsDir := filepath.Join(c.takeoutDir, "Calls") + entries, err := os.ReadDir(callsDir) + if err != nil { + return fmt.Errorf("read Calls directory: %w", err) + } + + c.logger.Info("building index", "files", len(entries)) + + var index []indexEntry + skipped := 0 + + for _, entry := range entries { + if entry.IsDir() { + continue + } + + name, ft, err := classifyFile(entry.Name()) + if err != nil { + skipped++ + continue + } + + filePath := filepath.Join(callsDir, entry.Name()) + + switch ft { + case fileTypeText, fileTypeGroup: + entries, err := c.indexTextFile(filePath, name, ft) + if err != nil { + c.logger.Warn("failed to index text file", "file", entry.Name(), "error", err) + continue + } + index = append(index, entries...) + + case fileTypeReceived, fileTypePlaced, fileTypeMissed, fileTypeVoicemail: + entry, err := c.indexCallFile(filePath, name, ft) + if err != nil { + c.logger.Warn("failed to index call file", "file", entry.ID, "error", err) + continue + } + index = append(index, *entry) + } + } + + // Apply date filters + var filtered []indexEntry + for _, e := range index { + if !c.afterDate.IsZero() && e.Timestamp.Before(c.afterDate) { + continue + } + if !c.beforeDate.IsZero() && !e.Timestamp.Before(c.beforeDate) { + continue + } + filtered = append(filtered, e) + } + + // Sort by timestamp + sort.Slice(filtered, func(i, j int) bool { + return filtered[i].Timestamp.Before(filtered[j].Timestamp) + }) + + c.index = filtered + c.indexBuilt = true + + c.logger.Info("index built", + "total_entries", len(index), + "filtered_entries", len(filtered), + "skipped_files", skipped, + ) + + return nil +} + +// indexTextFile parses a text/group conversation HTML and returns index entries +// for each individual message within it. 
+func (c *Client) indexTextFile(filePath, contactName string, ft fileType) ([]indexEntry, error) { + f, err := os.Open(filePath) + if err != nil { + return nil, err + } + defer func() { _ = f.Close() }() + + messages, groupParticipants, err := parseTextHTML(f) + if err != nil { + return nil, err + } + + var entries []indexEntry + for i, msg := range messages { + // Compute deterministic message ID + bodyPrefix := msg.Body + if len(bodyPrefix) > 50 { + bodyPrefix = bodyPrefix[:50] + } + id := computeMessageID(msg.SenderPhone, msg.Timestamp.Format(time.RFC3339Nano), bodyPrefix) + + // Compute thread ID + var threadID string + if ft == fileTypeGroup { + threadID = computeThreadID(c.owner.Cell, fileTypeGroup, "", groupParticipants) + } else { + // For 1:1 texts, use the non-owner phone + otherPhone := msg.SenderPhone + if msg.IsMe { + // Need to find the other party — look through all messages + for _, m := range messages { + if !m.IsMe { + otherPhone = m.SenderPhone + break + } + } + } + threadID = computeThreadID(c.owner.Cell, fileTypeText, otherPhone, nil) + } + + label := labelForFileType(ft) + + entries = append(entries, indexEntry{ + ID: id, + ThreadID: threadID, + FilePath: filePath, + MessageIndex: i, + Timestamp: msg.Timestamp, + FileType: ft, + Labels: []string{label}, + }) + } + + return entries, nil +} + +// indexCallFile parses a call log HTML and returns a single index entry. 
+func (c *Client) indexCallFile(filePath, contactName string, ft fileType) (*indexEntry, error) { + f, err := os.Open(filePath) + if err != nil { + return nil, err + } + defer func() { _ = f.Close() }() + + record, err := parseCallHTML(f) + if err != nil { + return nil, err + } + + // Override file type from HTML if the filename didn't include it + if record.CallType != 0 { + ft = record.CallType + } + + id := computeMessageID(ft.String(), record.Phone, record.Timestamp.Format(time.RFC3339Nano)) + threadID := computeThreadID(c.owner.Cell, ft, record.Phone, nil) + label := labelForFileType(ft) + + return &indexEntry{ + ID: id, + ThreadID: threadID, + FilePath: filePath, + Timestamp: record.Timestamp, + FileType: ft, + Labels: []string{label}, + }, nil +} + +// GetProfile returns a profile with the GV phone as identifier and index size as total. +func (c *Client) GetProfile(ctx context.Context) (*gmail.Profile, error) { + if err := c.buildIndex(); err != nil { + return nil, err + } + + return &gmail.Profile{ + EmailAddress: c.identifier, + MessagesTotal: int64(len(c.index)), + HistoryID: uint64(len(c.index)), + }, nil +} + +// ListLabels returns the set of labels used for Google Voice messages. +func (c *Client) ListLabels(ctx context.Context) ([]*gmail.Label, error) { + return []*gmail.Label{ + {ID: "sms", Name: "SMS", Type: "user"}, + {ID: "call_received", Name: "Call Received", Type: "user"}, + {ID: "call_placed", Name: "Call Placed", Type: "user"}, + {ID: "call_missed", Name: "Call Missed", Type: "user"}, + {ID: "voicemail", Name: "Voicemail", Type: "user"}, + {ID: "mms", Name: "MMS", Type: "user"}, + }, nil +} + +// ListMessages returns a page of message IDs from the sorted index. +// The pageToken is the string representation of the offset into the index. 
+func (c *Client) ListMessages(ctx context.Context, query string, pageToken string) (*gmail.MessageListResponse, error) { + if err := c.buildIndex(); err != nil { + return nil, fmt.Errorf("build index: %w", err) + } + + // Check limit + if c.limit > 0 && c.returned >= c.limit { + return &gmail.MessageListResponse{}, nil + } + + offset := 0 + if pageToken != "" { + var err error + offset, err = strconv.Atoi(pageToken) + if err != nil { + return nil, fmt.Errorf("invalid page token: %w", err) + } + } + + if offset >= len(c.index) { + return &gmail.MessageListResponse{}, nil + } + + // Calculate page size respecting limit + pageSize := c.pageSize + if c.limit > 0 { + remaining := c.limit - c.returned + if remaining < pageSize { + pageSize = remaining + } + } + + end := offset + pageSize + if end > len(c.index) { + end = len(c.index) + } + + page := c.index[offset:end] + messages := make([]gmail.MessageID, len(page)) + for i, entry := range page { + messages[i] = gmail.MessageID{ + ID: entry.ID, + ThreadID: entry.ThreadID, + } + } + + c.returned += len(messages) + + var nextPageToken string + if end < len(c.index) && (c.limit <= 0 || c.returned < c.limit) { + nextPageToken = strconv.Itoa(end) + } + + totalEstimate := int64(len(c.index)) + + return &gmail.MessageListResponse{ + Messages: messages, + NextPageToken: nextPageToken, + ResultSizeEstimate: totalEstimate, + }, nil +} + +// GetMessageRaw fetches a single message by ID and builds synthetic MIME data. 
+func (c *Client) GetMessageRaw(ctx context.Context, messageID string) (*gmail.RawMessage, error) { + if err := c.buildIndex(); err != nil { + return nil, fmt.Errorf("build index: %w", err) + } + + // Linear scan for the entry (index is typically <300k entries) + var entry *indexEntry + for i := range c.index { + if c.index[i].ID == messageID { + entry = &c.index[i] + break + } + } + if entry == nil { + return nil, &gmail.NotFoundError{Path: "/messages/" + messageID} + } + + switch entry.FileType { + case fileTypeText, fileTypeGroup: + return c.buildTextMessage(entry) + case fileTypeReceived, fileTypePlaced, fileTypeMissed, fileTypeVoicemail: + return c.buildCallMessage(entry) + default: + return nil, fmt.Errorf("unknown file type for message %s", messageID) + } +} + +// buildTextMessage constructs a RawMessage from a text/group conversation entry. +func (c *Client) buildTextMessage(entry *indexEntry) (*gmail.RawMessage, error) { + messages, groupParticipants, err := c.getCachedMessages(entry.FilePath) + if err != nil { + return nil, err + } + + if entry.MessageIndex >= len(messages) { + return nil, fmt.Errorf("message index %d out of range (file has %d messages)", entry.MessageIndex, len(messages)) + } + + msg := messages[entry.MessageIndex] + + // Determine from and to addresses + var fromAddrs, toAddrs []string + + ownerEmail, _ := normalizeIdentifier(c.owner.GoogleVoice) + + if msg.IsMe { + fromAddrs = []string{ownerEmail} + if entry.FileType == fileTypeGroup { + for _, phone := range groupParticipants { + email, _ := normalizeIdentifier(phone) + toAddrs = append(toAddrs, email) + } + } else { + // 1:1 text — find the other party + for _, m := range messages { + if !m.IsMe { + email, _ := normalizeIdentifier(m.SenderPhone) + toAddrs = []string{email} + break + } + } + } + } else { + senderEmail, _ := normalizeIdentifier(msg.SenderPhone) + fromAddrs = []string{senderEmail} + toAddrs = []string{ownerEmail} + // In group conversations, add other participants + if 
entry.FileType == fileTypeGroup { + for _, phone := range groupParticipants { + email, _ := normalizeIdentifier(phone) + if email != senderEmail { + toAddrs = append(toAddrs, email) + } + } + } + } + + mimeData := buildMIME(fromAddrs, toAddrs, msg.Timestamp, entry.ID, msg.Body) + + internalDate := int64(0) + if !msg.Timestamp.IsZero() { + internalDate = msg.Timestamp.UnixMilli() + } + + // Check for MMS attachments + labels := entry.Labels + if len(msg.Attachments) > 0 { + labels = append(labels, "mms") + } + + return &gmail.RawMessage{ + ID: entry.ID, + ThreadID: entry.ThreadID, + LabelIDs: labels, + Snippet: snippet(msg.Body, 100), + HistoryID: uint64(entry.Timestamp.UnixNano()), + InternalDate: internalDate, + SizeEstimate: int64(len(mimeData)), + Raw: mimeData, + }, nil +} + +// buildCallMessage constructs a RawMessage from a call record entry. +func (c *Client) buildCallMessage(entry *indexEntry) (*gmail.RawMessage, error) { + f, err := os.Open(entry.FilePath) + if err != nil { + return nil, err + } + defer func() { _ = f.Close() }() + + record, err := parseCallHTML(f) + if err != nil { + return nil, err + } + + // Build a descriptive body for the call + var body strings.Builder + switch record.CallType { + case fileTypeReceived: + fmt.Fprintf(&body, "Received call from %s", record.Name) + case fileTypePlaced: + fmt.Fprintf(&body, "Placed call to %s", record.Name) + case fileTypeMissed: + fmt.Fprintf(&body, "Missed call from %s", record.Name) + case fileTypeVoicemail: + fmt.Fprintf(&body, "Voicemail from %s", record.Name) + } + if record.Duration != "" { + fmt.Fprintf(&body, " (%s)", formatDuration(record.Duration)) + } + + ownerEmail, _ := normalizeIdentifier(c.owner.GoogleVoice) + contactEmail, _ := normalizeIdentifier(record.Phone) + + var fromAddrs, toAddrs []string + switch record.CallType { + case fileTypeReceived, fileTypeMissed, fileTypeVoicemail: + fromAddrs = []string{contactEmail} + toAddrs = []string{ownerEmail} + case fileTypePlaced: + fromAddrs = 
[]string{ownerEmail} + toAddrs = []string{contactEmail} + } + + mimeData := buildMIME(fromAddrs, toAddrs, record.Timestamp, entry.ID, body.String()) + + internalDate := int64(0) + if !record.Timestamp.IsZero() { + internalDate = record.Timestamp.UnixMilli() + } + + return &gmail.RawMessage{ + ID: entry.ID, + ThreadID: entry.ThreadID, + LabelIDs: entry.Labels, + Snippet: snippet(body.String(), 100), + HistoryID: uint64(record.Timestamp.UnixNano()), + InternalDate: internalDate, + SizeEstimate: int64(len(mimeData)), + Raw: mimeData, + }, nil +} + +// getCachedMessages returns parsed messages for a file, using a simple cache. +func (c *Client) getCachedMessages(filePath string) ([]textMessage, []string, error) { + if c.lastFilePath == filePath { + return c.lastMessages, c.lastGroupPar, nil + } + + f, err := os.Open(filePath) + if err != nil { + return nil, nil, err + } + defer func() { _ = f.Close() }() + + messages, groupParticipants, err := parseTextHTML(f) + if err != nil { + return nil, nil, err + } + + c.lastFilePath = filePath + c.lastMessages = messages + c.lastGroupPar = groupParticipants + + return messages, groupParticipants, nil +} + +// GetMessagesRawBatch fetches multiple messages sequentially. +func (c *Client) GetMessagesRawBatch(ctx context.Context, messageIDs []string) ([]*gmail.RawMessage, error) { + results := make([]*gmail.RawMessage, len(messageIDs)) + for i, id := range messageIDs { + msg, err := c.GetMessageRaw(ctx, id) + if err != nil { + c.logger.Warn("failed to fetch message", "id", id, "error", err) + continue + } + results[i] = msg + } + return results, nil +} + +// ListHistory is not supported for Google Voice Takeout (static export). +func (c *Client) ListHistory(ctx context.Context, startHistoryID uint64, pageToken string) (*gmail.HistoryResponse, error) { + return &gmail.HistoryResponse{ + HistoryID: startHistoryID, + }, nil +} + +// TrashMessage is not supported for Google Voice Takeout. 
+func (c *Client) TrashMessage(ctx context.Context, messageID string) error { + return fmt.Errorf("trash not supported for Google Voice Takeout") +} + +// DeleteMessage is not supported for Google Voice Takeout. +func (c *Client) DeleteMessage(ctx context.Context, messageID string) error { + return fmt.Errorf("delete not supported for Google Voice Takeout") +} + +// BatchDeleteMessages is not supported for Google Voice Takeout. +func (c *Client) BatchDeleteMessages(ctx context.Context, messageIDs []string) error { + return fmt.Errorf("batch delete not supported for Google Voice Takeout") +} + +// formatDuration converts ISO 8601 duration (PT1M23S) to human-readable format. +func formatDuration(iso string) string { + // Parse PT{hours}H{minutes}M{seconds}S + iso = strings.TrimPrefix(iso, "PT") + var parts []string + + if i := strings.Index(iso, "H"); i >= 0 { + parts = append(parts, iso[:i]+"h") + iso = iso[i+1:] + } + if i := strings.Index(iso, "M"); i >= 0 { + parts = append(parts, iso[:i]+"m") + iso = iso[i+1:] + } + if i := strings.Index(iso, "S"); i >= 0 { + parts = append(parts, iso[:i]+"s") + } + + if len(parts) == 0 { + return "0s" + } + return strings.Join(parts, " ") +} + +// Ensure Client implements gmail.API. +var _ gmail.API = (*Client)(nil) diff --git a/internal/gvoice/models.go b/internal/gvoice/models.go new file mode 100644 index 00000000..91e7ecae --- /dev/null +++ b/internal/gvoice/models.go @@ -0,0 +1,98 @@ +package gvoice + +import "time" + +// fileType classifies a Google Voice Takeout HTML file. 
//
// The zero value is fileTypeText. The numeric order of the constants below is
// fixed by iota and must not be rearranged.
type fileType int

const (
	fileTypeText      fileType = iota // SMS/MMS conversation
	fileTypeReceived                  // Received call
	fileTypePlaced                    // Placed call
	fileTypeMissed                    // Missed call
	fileTypeVoicemail                 // Voicemail
	fileTypeGroup                     // Group conversation
)

// String returns the lowercase name of the file type, or "unknown" for a
// value outside the declared constants.
func (ft fileType) String() string {
	names := [...]string{
		fileTypeText:      "text",
		fileTypeReceived:  "received",
		fileTypePlaced:    "placed",
		fileTypeMissed:    "missed",
		fileTypeVoicemail: "voicemail",
		fileTypeGroup:     "group",
	}
	if ft < 0 || int(ft) >= len(names) {
		return "unknown"
	}
	return names[ft]
}

// labelForFileType returns the label string used in ListLabels and message labels.
// Text and group conversations share the "sms" label; an unrecognized type
// maps to "unknown".
func labelForFileType(ft fileType) string {
	switch ft {
	case fileTypeText, fileTypeGroup:
		return "sms"
	case fileTypeReceived:
		return "call_received"
	case fileTypePlaced:
		return "call_placed"
	case fileTypeMissed:
		return "call_missed"
	case fileTypeVoicemail:
		return "voicemail"
	}
	return "unknown"
}

// ownerPhones holds phone numbers parsed from Phones.vcf.
type ownerPhones struct {
	GoogleVoice string // Google Voice number (e.g., +17026083638)
	Cell        string // Cell number (e.g., +15753222266)
}

// indexEntry is a pre-indexed reference to a single message or call record
// within a Takeout HTML file. One HTML file may contain many messages.
type indexEntry struct {
	ID           string    // deterministic dedup ID (sha256-based)
	ThreadID     string    // conversation grouping key
	FilePath     string    // path to HTML file
	MessageIndex int       // position within the HTML file (text files hold many messages)
	Timestamp    time.Time // when the message or call happened
	FileType     fileType  // kind of Takeout file the entry came from
	Labels       []string  // label IDs, e.g. ["sms"]
}

// textMessage is a parsed individual SMS/MMS from a text conversation HTML file.
+type textMessage struct { + Timestamp time.Time + SenderPhone string + SenderName string + Body string + Attachments []attachmentRef + IsMe bool // true if sender is the device owner +} + +// attachmentRef references an MMS attachment found in the HTML. +type attachmentRef struct { + HrefInHTML string // href attribute value (no extension in HTML) + MediaType string // "video", "image", etc. +} + +// callRecord is a parsed call log entry. +type callRecord struct { + CallType fileType // received, placed, missed, voicemail + Phone string // contact phone number + Name string // contact display name + Timestamp time.Time + Duration string // ISO 8601 duration (e.g., "PT1M23S") + Labels []string // from the HTML tags section +} diff --git a/internal/gvoice/parser.go b/internal/gvoice/parser.go new file mode 100644 index 00000000..65f4974c --- /dev/null +++ b/internal/gvoice/parser.go @@ -0,0 +1,533 @@ +package gvoice + +import ( + "bufio" + "bytes" + "crypto/sha256" + "fmt" + "io" + "net/mail" + "regexp" + "sort" + "strings" + "time" + + "golang.org/x/net/html" +) + +// Filename classification patterns. 
+var ( + // "{Name} - Text - {Timestamp}.html" + reText = regexp.MustCompile(`^(.+) - Text - (\d{4}-\d{2}-\d{2}T\d{2}_\d{2}_\d{2}Z?)\.html$`) + // "{Name} - Received - {Timestamp}.html" + reReceived = regexp.MustCompile(`^(.+) - Received - (\d{4}-\d{2}-\d{2}T\d{2}_\d{2}_\d{2}Z?)\.html$`) + // "{Name} - Placed - {Timestamp}.html" + rePlaced = regexp.MustCompile(`^(.+) - Placed - (\d{4}-\d{2}-\d{2}T\d{2}_\d{2}_\d{2}Z?)\.html$`) + // "{Name} - Missed - {Timestamp}.html" + reMissed = regexp.MustCompile(`^(.+) - Missed - (\d{4}-\d{2}-\d{2}T\d{2}_\d{2}_\d{2}Z?)\.html$`) + // "{Name} - Voicemail - {Timestamp}.html" + reVoicemail = regexp.MustCompile(`^(.+) - Voicemail - (\d{4}-\d{2}-\d{2}T\d{2}_\d{2}_\d{2}Z?)\.html$`) + // "Group Conversation - {Timestamp}.html" + reGroup = regexp.MustCompile(`^Group Conversation - (\d{4}-\d{2}-\d{2}T\d{2}_\d{2}_\d{2}Z?)\.html$`) + // "{Name} - {Timestamp}.html" (call files without explicit type — classify from HTML title) + reNameOnly = regexp.MustCompile(`^(.+) - (\d{4}-\d{2}-\d{2}T\d{2}_\d{2}_\d{2}Z?)\.html$`) +) + +// classifyFile classifies a Google Voice Takeout filename. +// Returns the contact name (empty for groups), the file type, and any error. +// Returns an error for non-HTML files or files to skip (e.g., Bills.html). 
+func classifyFile(filename string) (name string, ft fileType, err error) { + if !strings.HasSuffix(filename, ".html") { + return "", 0, fmt.Errorf("not an HTML file: %s", filename) + } + + // Skip known non-message files + base := filename + if strings.EqualFold(base, "Bills.html") { + return "", 0, fmt.Errorf("skipping Bills.html") + } + + if m := reText.FindStringSubmatch(base); m != nil { + return m[1], fileTypeText, nil + } + if m := reReceived.FindStringSubmatch(base); m != nil { + return m[1], fileTypeReceived, nil + } + if m := rePlaced.FindStringSubmatch(base); m != nil { + return m[1], fileTypePlaced, nil + } + if m := reMissed.FindStringSubmatch(base); m != nil { + return m[1], fileTypeMissed, nil + } + if m := reVoicemail.FindStringSubmatch(base); m != nil { + return m[1], fileTypeVoicemail, nil + } + if m := reGroup.FindStringSubmatch(base); m != nil { + _ = m[1] // timestamp + return "", fileTypeGroup, nil + } + + // Fallback: "{Name} - {Timestamp}.html" — these are call files without + // the explicit type keyword. Return as unknown and let the caller + // determine the type from the HTML . + if m := reNameOnly.FindStringSubmatch(base); m != nil { + return m[1], fileTypePlaced, nil // default to placed, caller can override from HTML + } + + return "", 0, fmt.Errorf("unrecognized filename pattern: %s", filename) +} + +// parseVCF parses a Google Voice Phones.vcf file to extract phone numbers. +// The VCF uses itemN.TEL and itemN.X-ABLabel pairs where the label may +// appear before or after the TEL line, so we collect all items first. 
+func parseVCF(data []byte) (ownerPhones, error) { + var phones ownerPhones + scanner := bufio.NewScanner(bytes.NewReader(data)) + + // Collect itemN.TEL and itemN.X-ABLabel pairs + itemTels := make(map[string]string) // "item1" -> phone + itemLabels := make(map[string]string) // "item1" -> label + + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + + // Match itemN.TEL:value + if idx := strings.Index(line, ".TEL:"); idx > 0 { + prefix := line[:idx] // e.g., "item1" + value := line[idx+5:] + itemTels[prefix] = value + } + // Match itemN.X-ABLabel:value + if idx := strings.Index(line, ".X-ABLabel:"); idx > 0 { + prefix := line[:idx] + value := line[idx+11:] + itemLabels[prefix] = value + } + + if strings.HasPrefix(line, "TEL;TYPE=CELL:") { + phones.Cell = normalizePhone(strings.TrimPrefix(line, "TEL;TYPE=CELL:")) + } + } + + // Match items: find the TEL with "Google Voice" label + for prefix, label := range itemLabels { + if label == "Google Voice" { + if tel, ok := itemTels[prefix]; ok { + phones.GoogleVoice = normalizePhone(tel) + break + } + } + } + + if phones.GoogleVoice == "" { + return phones, fmt.Errorf("google Voice number not found in VCF") + } + + return phones, scanner.Err() +} + +// parseTextHTML parses a Google Voice text/SMS conversation HTML file. +// Returns the individual messages, group participant phones (if any), and any error. 
+func parseTextHTML(r io.Reader) ([]textMessage, []string, error) { + doc, err := html.Parse(r) + if err != nil { + return nil, nil, fmt.Errorf("parse HTML: %w", err) + } + + var messages []textMessage + var groupParticipants []string + + // Find participants div (group conversations) + walkNodes(doc, func(n *html.Node) bool { + if n.Type == html.ElementNode && n.Data == "div" && hasClass(n, "participants") { + // Extract phone numbers from participant links + walkNodes(n, func(link *html.Node) bool { + if link.Type == html.ElementNode && link.Data == "a" && hasClass(link, "tel") { + href := getAttr(link, "href") + if strings.HasPrefix(href, "tel:") { + phone := normalizePhone(strings.TrimPrefix(href, "tel:")) + groupParticipants = append(groupParticipants, phone) + } + } + return false + }) + } + return false + }) + + // Find message divs + walkNodes(doc, func(n *html.Node) bool { + if n.Type == html.ElementNode && n.Data == "div" && hasClass(n, "message") { + msg := parseMessageDiv(n) + if !msg.Timestamp.IsZero() { + messages = append(messages, msg) + } + return true // don't recurse into message divs + } + return false + }) + + return messages, groupParticipants, nil +} + +// parseMessageDiv extracts a single textMessage from a div.message node. 
+func parseMessageDiv(div *html.Node) textMessage { + var msg textMessage + + walkNodes(div, func(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + + switch { + case n.Data == "abbr" && hasClass(n, "dt"): + // Timestamp + title := getAttr(n, "title") + if t, err := time.Parse("2006-01-02T15:04:05.000-07:00", title); err == nil { + msg.Timestamp = t.UTC() + } else if t, err := time.Parse("2006-01-02T15:04:05.000Z", title); err == nil { + msg.Timestamp = t.UTC() + } else if t, err := time.Parse("2006-01-02T15:04:05-07:00", title); err == nil { + msg.Timestamp = t.UTC() + } + + case n.Data == "a" && hasClass(n, "tel"): + // Sender phone + href := getAttr(n, "href") + if strings.HasPrefix(href, "tel:") { + msg.SenderPhone = normalizePhone(strings.TrimPrefix(href, "tel:")) + } + // Sender name from child <span class="fn"> or <abbr class="fn"> + walkNodes(n, func(child *html.Node) bool { + if child.Type == html.ElementNode && (child.Data == "span" || child.Data == "abbr") && hasClass(child, "fn") { + name := textContent(child) + if name == "Me" { + msg.IsMe = true + msg.SenderName = "Me" + } else { + msg.SenderName = name + } + return true + } + return false + }) + + case n.Data == "q": + // Message body + msg.Body = extractQBody(n) + return true // don't recurse further + + case n.Data == "a" && hasClass(n, "video"): + // Video attachment + msg.Attachments = append(msg.Attachments, attachmentRef{ + HrefInHTML: getAttr(n, "href"), + MediaType: "video", + }) + + case n.Data == "img": + // Image attachment (MMS) + src := getAttr(n, "src") + if src != "" { + msg.Attachments = append(msg.Attachments, attachmentRef{ + HrefInHTML: src, + MediaType: "image", + }) + } + } + + return false + }) + + return msg +} + +// extractQBody extracts text content from a <q> element, converting <br> to newlines. 
+func extractQBody(q *html.Node) string { + var b strings.Builder + for c := q.FirstChild; c != nil; c = c.NextSibling { + switch { + case c.Type == html.TextNode: + b.WriteString(c.Data) + case c.Type == html.ElementNode && c.Data == "br": + // Trailing <br> in GV HTML — only add newline if there's more content after + if c.NextSibling != nil && (c.NextSibling.Type != html.TextNode || strings.TrimSpace(c.NextSibling.Data) != "") { + b.WriteString("\n") + } + case c.Type == html.ElementNode: + // Recurse for inline elements like <a> + b.WriteString(textContent(c)) + } + } + return strings.TrimRight(b.String(), "\n") +} + +// parseCallHTML parses a Google Voice call log HTML file. +func parseCallHTML(r io.Reader) (*callRecord, error) { + doc, err := html.Parse(r) + if err != nil { + return nil, fmt.Errorf("parse HTML: %w", err) + } + + record := &callRecord{} + + // Determine call type from title + walkNodes(doc, func(n *html.Node) bool { + if n.Type == html.ElementNode && n.Data == "title" { + title := strings.ToLower(textContent(n)) + switch { + case strings.Contains(title, "received"): + record.CallType = fileTypeReceived + case strings.Contains(title, "placed"): + record.CallType = fileTypePlaced + case strings.Contains(title, "missed"): + record.CallType = fileTypeMissed + case strings.Contains(title, "voicemail"): + record.CallType = fileTypeVoicemail + } + return true + } + return false + }) + + // Find the haudio div + walkNodes(doc, func(n *html.Node) bool { + if n.Type == html.ElementNode && n.Data == "div" && hasClass(n, "haudio") { + // Extract phone and name from contributor vcard + walkNodes(n, func(child *html.Node) bool { + if child.Type == html.ElementNode && child.Data == "div" && hasClass(child, "contributor") { + walkNodes(child, func(link *html.Node) bool { + if link.Type == html.ElementNode && link.Data == "a" && hasClass(link, "tel") { + href := getAttr(link, "href") + if strings.HasPrefix(href, "tel:") { + record.Phone = 
normalizePhone(strings.TrimPrefix(href, "tel:")) + } + walkNodes(link, func(fn *html.Node) bool { + if fn.Type == html.ElementNode && fn.Data == "span" && hasClass(fn, "fn") { + record.Name = textContent(fn) + return true + } + return false + }) + } + return false + }) + return true + } + return false + }) + + // Extract timestamp + walkNodes(n, func(child *html.Node) bool { + if child.Type == html.ElementNode && child.Data == "abbr" && hasClass(child, "published") { + title := getAttr(child, "title") + if t, err := time.Parse("2006-01-02T15:04:05.000-07:00", title); err == nil { + record.Timestamp = t.UTC() + } + return true + } + return false + }) + + // Extract duration + walkNodes(n, func(child *html.Node) bool { + if child.Type == html.ElementNode && child.Data == "abbr" && hasClass(child, "duration") { + record.Duration = getAttr(child, "title") + return true + } + return false + }) + + // Extract labels + walkNodes(n, func(child *html.Node) bool { + if child.Type == html.ElementNode && child.Data == "div" && hasClass(child, "tags") { + walkNodes(child, func(link *html.Node) bool { + if link.Type == html.ElementNode && link.Data == "a" { + label := strings.ToLower(textContent(link)) + record.Labels = append(record.Labels, label) + } + return false + }) + return true + } + return false + }) + + return true + } + return false + }) + + if record.Phone == "" && record.Timestamp.IsZero() { + return nil, fmt.Errorf("failed to parse call record") + } + + return record, nil +} + +// normalizePhone strips non-digit characters from a phone number and attempts +// to produce a consistent E.164-like format. 
//
// An input without any ASCII digit is returned unchanged. Otherwise the
// result is "+" followed by the digits, except that exactly ten digits with
// no leading "+" in the input get a "+1" prefix.
func normalizePhone(phone string) string {
	digits := strings.Map(func(r rune) rune {
		if r >= '0' && r <= '9' {
			return r
		}
		return -1 // drop everything that is not an ASCII digit
	}, phone)

	if digits == "" {
		return phone
	}
	if len(digits) == 10 && !strings.HasPrefix(phone, "+") {
		return "+1" + digits
	}
	return "+" + digits
}

// normalizeIdentifier converts a phone number into an email-like identifier
// using the @phone.gvoice domain. The second result is always that domain.
func normalizeIdentifier(phone string) (email, domain string) {
	const gvDomain = "phone.gvoice"
	return normalizePhone(phone) + "@" + gvDomain, gvDomain
}

// buildMIME constructs a minimal RFC 2822 message from Google Voice data.
//
// Only the first element of from is used. The From, To, Date and Message-ID
// headers are omitted when their input is empty or zero; the Subject header
// is always present and empty. The body is appended verbatim after the blank
// line that ends the headers.
func buildMIME(from, to []string, date time.Time, messageID, body string) []byte {
	var msg strings.Builder
	header := func(name, value string) {
		msg.WriteString(name)
		msg.WriteString(": ")
		msg.WriteString(value)
		msg.WriteString("\r\n")
	}

	if len(from) > 0 {
		header("From", formatMIMEAddress(from[0]))
	}
	if len(to) > 0 {
		rcpts := make([]string, len(to))
		for i, addr := range to {
			rcpts[i] = formatMIMEAddress(addr)
		}
		header("To", strings.Join(rcpts, ", "))
	}
	if !date.IsZero() {
		header("Date", date.Format(time.RFC1123Z))
	}
	header("Subject", "")
	if messageID != "" {
		header("Message-ID", fmt.Sprintf("<%s@gvoice.local>", messageID))
	}
	header("MIME-Version", "1.0")
	header("Content-Type", "text/plain; charset=utf-8")

	msg.WriteString("\r\n")
	msg.WriteString(body)

	return []byte(msg.String())
}

// formatMIMEAddress formats an email address for MIME headers.
func formatMIMEAddress(addr string) string {
	a := mail.Address{Address: addr}
	return a.String()
}

// snippet returns the first n characters of s, suitable for message preview.
+func snippet(s string, maxLen int) string { + s = strings.Join(strings.Fields(s), " ") + runes := []rune(s) + if len(runes) > maxLen { + return string(runes[:maxLen]) + } + return s +} + +// computeMessageID computes a deterministic 16-char hex ID from the given parts. +func computeMessageID(parts ...string) string { + h := sha256.Sum256([]byte(strings.Join(parts, "|"))) + return fmt.Sprintf("%x", h[:8]) +} + +// computeThreadID computes the conversation thread ID for a set of participant phones. +// For 1:1 texts, uses the other party's normalized phone. +// For group texts, uses "group:" + sorted(all participant phones). +// For calls, uses "calls:" + normalizedPhone. +func computeThreadID(ownerCell string, ft fileType, contactPhone string, groupParticipants []string) string { + switch ft { + case fileTypeGroup: + phones := make([]string, len(groupParticipants)) + copy(phones, groupParticipants) + sort.Strings(phones) + return "group:" + strings.Join(phones, ",") + case fileTypeReceived, fileTypePlaced, fileTypeMissed, fileTypeVoicemail: + return "calls:" + contactPhone + default: + // 1:1 text — use the other party's phone + return contactPhone + } +} + +// HTML parsing helpers + +// walkNodes recursively walks the HTML node tree, calling fn for each node. +// If fn returns true, the children of that node are skipped. +func walkNodes(n *html.Node, fn func(*html.Node) bool) { + if fn(n) { + return + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + walkNodes(c, fn) + } +} + +// hasClass checks if an HTML element has the given class. +func hasClass(n *html.Node, class string) bool { + for _, a := range n.Attr { + if a.Key == "class" { + for _, c := range strings.Fields(a.Val) { + if c == class { + return true + } + } + } + } + return false +} + +// getAttr returns the value of the named attribute, or empty string. 
+func getAttr(n *html.Node, key string) string { + for _, a := range n.Attr { + if a.Key == key { + return a.Val + } + } + return "" +} + +// textContent returns the concatenated text content of a node and its children. +func textContent(n *html.Node) string { + if n.Type == html.TextNode { + return n.Data + } + var b strings.Builder + for c := n.FirstChild; c != nil; c = c.NextSibling { + b.WriteString(textContent(c)) + } + return strings.TrimSpace(b.String()) +} diff --git a/internal/gvoice/parser_test.go b/internal/gvoice/parser_test.go new file mode 100644 index 00000000..e03bb7ec --- /dev/null +++ b/internal/gvoice/parser_test.go @@ -0,0 +1,446 @@ +package gvoice + +import ( + "strings" + "testing" + "time" +) + +func TestParseVCF(t *testing.T) { + vcf := `BEGIN:VCARD +VERSION:3.0 +FN: +N:;;;; +item1.TEL:+17026083638 +item1.X-ABLabel:Google Voice +TEL;TYPE=CELL:+15753222266 +END:VCARD +` + phones, err := parseVCF([]byte(vcf)) + if err != nil { + t.Fatalf("parseVCF() error: %v", err) + } + if phones.GoogleVoice != "+17026083638" { + t.Errorf("GoogleVoice = %q, want +17026083638", phones.GoogleVoice) + } + if phones.Cell != "+15753222266" { + t.Errorf("Cell = %q, want +15753222266", phones.Cell) + } +} + +func TestParseVCF_MissingGV(t *testing.T) { + vcf := `BEGIN:VCARD +VERSION:3.0 +TEL;TYPE=CELL:+15551234567 +END:VCARD +` + _, err := parseVCF([]byte(vcf)) + if err == nil { + t.Fatal("expected error for missing GV number") + } +} + +func TestClassifyFile(t *testing.T) { + tests := []struct { + filename string + wantName string + wantType fileType + wantErr bool + }{ + { + filename: "Keith Stern - Text - 2020-02-03T17_37_45Z.html", + wantName: "Keith Stern", + wantType: fileTypeText, + }, + { + filename: "Keith Stern - Received - 2020-02-05T23_26_28Z.html", + wantName: "Keith Stern", + wantType: fileTypeReceived, + }, + { + filename: "Kicy Motley - Placed - 2020-02-03T20_05_20Z.html", + wantName: "Kicy Motley", + wantType: fileTypePlaced, + }, + { + filename: 
"John Doe - Missed - 2020-03-15T10_30_00Z.html", + wantName: "John Doe", + wantType: fileTypeMissed, + }, + { + filename: "Jane - Voicemail - 2020-04-01T12_00_00Z.html", + wantName: "Jane", + wantType: fileTypeVoicemail, + }, + { + filename: "Group Conversation - 2020-02-05T17_16_14Z.html", + wantName: "", + wantType: fileTypeGroup, + }, + { + // Filename without type keyword (some call files lack explicit type) + filename: "Kicy Motley - 2020-02-03T20_05_20Z.html", + wantName: "Kicy Motley", + wantType: fileTypePlaced, // defaults to placed, caller overrides from HTML + }, + { + // Timestamp without trailing Z + filename: "Someone - Text - 2020-01-15T08_30_00.html", + wantName: "Someone", + wantType: fileTypeText, + }, + { + // Phone number as contact name + filename: "+12025551234 - Text - 2020-06-01T09_00_00Z.html", + wantName: "+12025551234", + wantType: fileTypeText, + }, + { + filename: "photo.jpg", + wantErr: true, + }, + { + filename: "Bills.html", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.filename, func(t *testing.T) { + name, ft, err := classifyFile(tt.filename) + if tt.wantErr { + if err == nil { + t.Fatal("expected error") + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if name != tt.wantName { + t.Errorf("name = %q, want %q", name, tt.wantName) + } + if ft != tt.wantType { + t.Errorf("type = %v, want %v", ft, tt.wantType) + } + }) + } +} + +const sampleTextHTML = `<?xml version="1.0" ?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml"><head> +<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> +<title>Keith Stern +
+
Feb 3, 2020: +Keith Stern: +Cara says you're coming in tonight? Awesome. +
Feb 3, 2020: +Me: +I'm looking at a bus getting in 815ish. +
` + +func TestParseTextHTML(t *testing.T) { + messages, groupPar, err := parseTextHTML(strings.NewReader(sampleTextHTML)) + if err != nil { + t.Fatalf("parseTextHTML() error: %v", err) + } + + if len(groupPar) != 0 { + t.Errorf("expected no group participants, got %v", groupPar) + } + + if len(messages) != 2 { + t.Fatalf("expected 2 messages, got %d", len(messages)) + } + + // First message: from Keith + m0 := messages[0] + if m0.SenderPhone != "+12023065386" { + t.Errorf("m0.SenderPhone = %q, want +12023065386", m0.SenderPhone) + } + if m0.SenderName != "Keith Stern" { + t.Errorf("m0.SenderName = %q, want Keith Stern", m0.SenderName) + } + if m0.IsMe { + t.Error("m0.IsMe should be false") + } + if !strings.Contains(m0.Body, "Cara says") { + t.Errorf("m0.Body = %q, want to contain 'Cara says'", m0.Body) + } + // HTML entity should be decoded + if !strings.Contains(m0.Body, "you're") { + t.Errorf("m0.Body = %q, expected HTML entities to be decoded", m0.Body) + } + + // Timestamp + expectedTime := time.Date(2020, 2, 3, 17, 37, 45, 632000000, time.UTC) + if !m0.Timestamp.Equal(expectedTime) { + t.Errorf("m0.Timestamp = %v, want %v", m0.Timestamp, expectedTime) + } + + // Second message: from Me + m1 := messages[1] + if !m1.IsMe { + t.Error("m1.IsMe should be true") + } + if m1.SenderName != "Me" { + t.Errorf("m1.SenderName = %q, want Me", m1.SenderName) + } +} + +const sampleGroupHTML = ` + +Group Conversation +
Group conversation with: +Cara Morris Stern, Keith Stern
+
Feb 5, 2020: +Cara Morris Stern: +Check this out
+
Feb 5, 2020: +Keith Stern: +Cool
+
` + +func TestParseTextHTML_Group(t *testing.T) { + messages, groupPar, err := parseTextHTML(strings.NewReader(sampleGroupHTML)) + if err != nil { + t.Fatalf("parseTextHTML() error: %v", err) + } + + if len(groupPar) != 2 { + t.Fatalf("expected 2 group participants, got %d", len(groupPar)) + } + if groupPar[0] != "+12022712272" { + t.Errorf("groupPar[0] = %q, want +12022712272", groupPar[0]) + } + if groupPar[1] != "+12023065386" { + t.Errorf("groupPar[1] = %q, want +12023065386", groupPar[1]) + } + + if len(messages) != 2 { + t.Fatalf("expected 2 messages, got %d", len(messages)) + } + + // Trailing
should be stripped + if strings.HasSuffix(messages[0].Body, "\n") { + t.Errorf("body should not end with newline: %q", messages[0].Body) + } +} + +const sampleMMS = ` + +Test +
+
Feb 5, 2020: +Test User: + +
` + +func TestParseTextHTML_MMS(t *testing.T) { + messages, _, err := parseTextHTML(strings.NewReader(sampleMMS)) + if err != nil { + t.Fatalf("parseTextHTML() error: %v", err) + } + + if len(messages) != 1 { + t.Fatalf("expected 1 message, got %d", len(messages)) + } + + if len(messages[0].Attachments) != 1 { + t.Fatalf("expected 1 attachment, got %d", len(messages[0].Attachments)) + } + + att := messages[0].Attachments[0] + if att.MediaType != "video" { + t.Errorf("attachment MediaType = %q, want video", att.MediaType) + } + if att.HrefInHTML != "Group Conversation - 2020-02-05T17_16_14Z-7-1" { + t.Errorf("attachment HrefInHTML = %q", att.HrefInHTML) + } +} + +const sampleReceivedCallHTML = ` + +Received call from +Keith Stern +
Call Log for + +Received call from +Keith Stern +
Received call from +Keith Stern
+Feb 5, 2020 +
+(00:01:23) +
Labels: +
+
` + +func TestParseCallHTML_Received(t *testing.T) { + record, err := parseCallHTML(strings.NewReader(sampleReceivedCallHTML)) + if err != nil { + t.Fatalf("parseCallHTML() error: %v", err) + } + + if record.CallType != fileTypeReceived { + t.Errorf("CallType = %v, want received", record.CallType) + } + if record.Phone != "+12023065386" { + t.Errorf("Phone = %q, want +12023065386", record.Phone) + } + if record.Name != "Keith Stern" { + t.Errorf("Name = %q, want Keith Stern", record.Name) + } + if record.Duration != "PT1M23S" { + t.Errorf("Duration = %q, want PT1M23S", record.Duration) + } + + expectedTime := time.Date(2020, 2, 5, 23, 26, 28, 0, time.UTC) + if !record.Timestamp.Equal(expectedTime) { + t.Errorf("Timestamp = %v, want %v", record.Timestamp, expectedTime) + } +} + +const samplePlacedCallHTML = ` + +Placed call to +Kicy Motley +
Call Log for + +Placed call to +Kicy Motley +
Placed call to +Kicy Motley
+Feb 3, 2020 +
+(00:05:08) +
Labels: +
+
` + +func TestParseCallHTML_Placed(t *testing.T) { + record, err := parseCallHTML(strings.NewReader(samplePlacedCallHTML)) + if err != nil { + t.Fatalf("parseCallHTML() error: %v", err) + } + + if record.CallType != fileTypePlaced { + t.Errorf("CallType = %v, want placed", record.CallType) + } + if record.Phone != "+17188096446" { + t.Errorf("Phone = %q, want +17188096446", record.Phone) + } +} + +func TestNormalizePhone(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"+12023065386", "+12023065386"}, + {"(202) 306-5386", "+12023065386"}, + {"2023065386", "+12023065386"}, + {"+442071234567", "+442071234567"}, + {"12023065386", "+12023065386"}, + {"+1 (202) 306-5386", "+12023065386"}, + {"", ""}, + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + got := normalizePhone(tt.input) + if got != tt.want { + t.Errorf("normalizePhone(%q) = %q, want %q", tt.input, got, tt.want) + } + }) + } +} + +func TestComputeMessageID(t *testing.T) { + id1 := computeMessageID("+12023065386", "2020-02-03T11:37:45Z", "Hello") + id2 := computeMessageID("+12023065386", "2020-02-03T11:37:45Z", "Hello") + id3 := computeMessageID("+12023065386", "2020-02-03T11:37:45Z", "Goodbye") + + if id1 != id2 { + t.Error("same inputs should produce same ID") + } + if id1 == id3 { + t.Error("different inputs should produce different IDs") + } + if len(id1) != 16 { + t.Errorf("ID length = %d, want 16", len(id1)) + } +} + +func TestNormalizeIdentifier(t *testing.T) { + email, domain := normalizeIdentifier("+12023065386") + if email != "+12023065386@phone.gvoice" { + t.Errorf("email = %q, want +12023065386@phone.gvoice", email) + } + if domain != "phone.gvoice" { + t.Errorf("domain = %q, want phone.gvoice", domain) + } +} + +func TestFormatDuration(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"PT1M23S", "1m 23s"}, + {"PT5M8S", "5m 8s"}, + {"PT0S", "0s"}, + {"PT1H2M3S", "1h 2m 3s"}, + {"PT30S", "30s"}, + } + + for _, tt := range 
tests { + t.Run(tt.input, func(t *testing.T) { + got := formatDuration(tt.input) + if got != tt.want { + t.Errorf("formatDuration(%q) = %q, want %q", tt.input, got, tt.want) + } + }) + } +} + +func TestComputeThreadID(t *testing.T) { + // 1:1 text uses other party's phone + tid := computeThreadID("+15553334444", fileTypeText, "+12023065386", nil) + if tid != "+12023065386" { + t.Errorf("1:1 threadID = %q, want +12023065386", tid) + } + + // Group uses sorted participants + tid = computeThreadID("+15553334444", fileTypeGroup, "", []string{"+12023065386", "+12022712272"}) + if tid != "group:+12022712272,+12023065386" { + t.Errorf("group threadID = %q, want group:+12022712272,+12023065386", tid) + } + + // Call uses calls: prefix + tid = computeThreadID("+15553334444", fileTypeReceived, "+12023065386", nil) + if tid != "calls:+12023065386" { + t.Errorf("call threadID = %q, want calls:+12023065386", tid) + } +} + +func TestSnippet(t *testing.T) { + long := strings.Repeat("a", 200) + s := snippet(long, 100) + if len(s) != 100 { + t.Errorf("snippet length = %d, want 100", len(s)) + } + + s = snippet("short", 100) + if s != "short" { + t.Errorf("snippet = %q, want short", s) + } + + // Whitespace normalization + s = snippet(" hello world ", 100) + if s != "hello world" { + t.Errorf("snippet = %q, want 'hello world'", s) + } +} From 10ccb5e0962698a696d4d027d2969baacdfd7bad Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 19:10:35 -0500 Subject: [PATCH 04/65] Add unified text message import design spec Design for merging WhatsApp (#160), iMessage (#224), and Google Voice (#225) import implementations into a coherent system with shared phone-based participant model, proper schema usage, and dedicated TUI Texts mode. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- ...3-31-unified-text-message-import-design.md | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md diff --git a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md new file mode 100644 index 00000000..3735c999 --- /dev/null +++ b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md @@ -0,0 +1,227 @@ +# Unified Text Message Import + +Merge three independent text message import implementations (WhatsApp +#160, iMessage #224, Google Voice #225) into a coherent system with a +shared schema, unified participant model, and dedicated TUI experience. + +## Guiding Principles + +1. **Phone number is the unification key.** If you communicate with + someone through multiple channels (iMessage, WhatsApp, Google Voice), + all messages appear under one contact. +2. **Texts are not emails.** The TUI has a separate Texts mode with + conversation-centric navigation, not the sender-aggregate model used + for email. +3. **Consistent UX across modes.** Same keybindings, sort/filter + patterns, and visual language in both Email and Texts modes. Only the + available views and drill-down behavior differ. + +## Schema & Persistence + +All text message importers converge on the same storage pattern. + +### Participant Model + +- `participants.phone_number` stores E.164 normalized phone numbers. + No synthetic email addresses (`@phone.imessage`, `@phone.gvoice`). +- `EnsureParticipantByPhone` deduplicates across sources: the same + phone number from WhatsApp, iMessage, and Google Voice resolves to + one `participants` row. +- `participant_identifiers` tracks which platforms a contact is known + on (`identifier_type = 'whatsapp'`, `'imessage'`, `'google_voice'`). 
+- A shared `NormalizePhone()` utility ensures consistent E.164 + normalization across all importers. + +### Message Storage + +| Column | Value | +|---|---| +| `messages.message_type` | `'whatsapp'`, `'imessage'`, `'sms'`, `'google_voice'` (see note below) | +| `messages.sender_id` | FK to `participants.id` (direct link, not via `message_recipients`) | +| `messages.subject` | NULL for text messages | +| `conversations.conversation_type` | `'group_chat'` or `'direct_chat'` | +| `conversations.title` | Group name, or contact name for 1:1 chats | +| `sources.source_type` | `'whatsapp'`, `'apple_messages'`, `'google_voice'` | +| `message_bodies.body_text` | Message text stored directly | +| `message_raw.raw_format` | `'whatsapp_json'`, `'imessage_json'`, `'gvoice_html'` | + +No synthetic MIME wrapping for text messages. Body text goes directly +into `message_bodies`. Raw source data is stored in its native format. + +### Message Type Values + +- iMessage sets `'imessage'` or `'sms'` based on the service field in + `chat.db` (Apple distinguishes these natively). +- Google Voice uses `'google_voice'` for all record types (texts, calls, + voicemails). Call records and voicemails are differentiated via labels + (`sms`, `mms`, `call_received`, `call_placed`, `call_missed`, + `voicemail`) rather than separate `message_type` values. Call records + have `conversation_type = 'direct_chat'` and are grouped into + `calls:` threads. + +### `conversation_participants` + +All three importers populate this table to track who is in each +conversation, with roles where applicable (e.g., WhatsApp group admins). + +## Importer Architecture + +### Per-Source Packages + +Each importer is its own package with source-specific parsing: + +- `internal/whatsapp/` — reads decrypted WhatsApp `msgstore.db` +- `internal/imessage/` — reads macOS `chat.db` +- `internal/gvoice/` — parses Google Takeout HTML/VCF files + +No shared interface is forced — each source is too different. 
But all +converge on the same store methods for persistence: +`EnsureParticipantByPhone`, `EnsureConversationWithType`, message +insertion with proper `message_type`/`sender_id`/`conversation_type`. + +### Shared Utilities (`internal/textimport/`) + +- `NormalizePhone(raw string) string` — E.164 normalization +- Progress reporting (callback-based, like WhatsApp's `ImportCLIProgress`) + +### iMessage Refactoring + +Drop `gmail.API` interface implementation and synthetic MIME generation. +Instead: +- Read from `chat.db` directly (parsing stays the same) +- Call store methods for persistence with proper phone-based participants +- Set `message_type = 'imessage'` or `'sms'` (based on iMessage service field) +- Set `conversation_type` based on chat type (group vs 1:1) +- Populate `conversations.title` from `chat.display_name` + +### Google Voice Refactoring + +Drop `gmail.API` interface implementation and synthetic MIME generation. +Instead: +- Parse HTML/VCF files (parsing stays the same) +- Call store methods for persistence with proper phone-based participants +- Set `message_type = 'google_voice'` +- Set `conversation_type` based on participant count +- Store body text directly, raw HTML in `message_raw` + +### WhatsApp + +Mostly fine as-is — already follows the target pattern. Minor cleanup: +- Use shared `NormalizePhone()` instead of internal normalization +- Ensure consistent `raw_format` naming + +### CLI Commands + +Renamed for consistency (each stays separate since inputs differ): + +``` +msgvault import-whatsapp --phone +1... [--media-dir] [--contacts] +msgvault import-imessage [--me +1...] +msgvault import-gvoice +``` + +The `source_type` is `'whatsapp'` regardless of import method (backup +now, web sync API later). `raw_format` in `message_raw` can distinguish +import methods if needed. + +## TUI Texts Mode + +### Mode Switching + +A new key (`m`) toggles between Email mode and Texts mode. The status +bar shows the current mode. 
All existing email TUI behavior is +unchanged in Email mode. + +### Conversations View (Primary) + +The default view when entering Texts mode. Each row shows: + +| Name | Source | Messages | Participants | Last Message | +|------|--------|----------|-------------|--------------| +| Jane Smith | iMessage | 1,247 | 2 | 2026-03-28 | +| Family Group | WhatsApp | 8,432 | 6 | 2026-03-30 | + +- Default sort: last message date (newest first) +- Drill into a conversation: chronological message timeline +- Messages display in compact chat style (timestamp, sender, body snippet) + +### Aggregate Views (Tab to Cycle) + +- **Contacts** — aggregate by participant phone number/name, total + messages across all sources and conversations +- **Contact Names** — aggregate by display name +- **Sources** — aggregate by source type (WhatsApp / iMessage / GVoice) +- **Labels** — source-specific labels (GVoice: sms/voicemail/call) +- **Time** — message volume over time (year/month/day granularity) + +### Drill-Down + +- From Conversations: chronological message timeline +- From Contacts: all conversations with that person (across all + sources), then drill into a specific conversation +- From Time: conversations active in that period + +### Filters and Interaction + +All existing patterns carry over: +- Account filter (`a`) — doubles as source-type filter +- Date range, attachment filter +- Search (`/`) — queries FTS, results filtered to text messages +- Selection (`Space`/`A`), deletion staging (`d`/`D`) +- Sort cycling (`s`), reverse (`r`) + +## Parquet Analytics + +### Separate Cache for Texts + +``` +~/.msgvault/analytics/ + messages/year=*/ # Email (existing) + texts/year=*/ # Text messages (new) + _last_sync.json +``` + +### Text Parquet Schema (Denormalized) + +- `message_id`, `source_id`, `conversation_id` +- `phone_number`, `display_name` (sender) +- `message_type` (whatsapp/imessage/sms/google_voice) +- `source_type` (whatsapp/apple_messages/google_voice) +- 
`conversation_title`, `conversation_type` +- `sent_at`, `year` (partition key) +- `body_length`, `has_attachments`, `attachment_count` +- `to_phones[]` (recipient phone numbers) +- `labels[]` + +### Query Engine + +DuckDB query engine gets parallel methods for texts — same +aggregate/filter patterns as email but keyed on phone numbers and +conversations instead of email addresses and domains. + +## Search + +Text messages are indexed in `messages_fts` alongside emails. Search +in Texts mode filters results to text message types; search in Email +mode filters to email. The FTS table and indexing pipeline are shared. + +## Scope + +### In Scope + +- Refactor iMessage and Google Voice to phone-based persistence +- Shared `NormalizePhone()` utility +- Participant deduplication by phone number across all sources +- CLI command renaming +- TUI Texts mode (Conversations + aggregate views) +- Text message Parquet cache and DuckDB query methods +- FTS indexing of text messages +- `build-cache` builds both email and text Parquet files + +### Deferred + +- WhatsApp web sync API (future import method) +- MMS/iMessage attachment extraction +- Contact name resolution from macOS address book +- Cross-mode unified search (emails + texts together) +- Rich message detail view for texts (headers, raw data display) From a500cb0c0a196e6903c7a9c8b1f1097154647f87 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 19:20:57 -0500 Subject: [PATCH 05/65] Address review findings in text message import spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Support non-phone participants (iMessage email handles, short codes) with a resolution order: phone → email → raw handle - Add conversation title fallback chain for 1:1 chats - Generalize EnsureParticipantByPhone to accept identifierType param - Split Google Voice into distinct message_types (text/call/voicemail) so Texts mode can cleanly filter out call records Co-Authored-By: Claude 
Opus 4.6 (1M context) --- ...3-31-unified-text-message-import-design.md | 91 ++++++++++++++----- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md index 3735c999..4539e55d 100644 --- a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md +++ b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md @@ -22,25 +22,44 @@ All text message importers converge on the same storage pattern. ### Participant Model -- `participants.phone_number` stores E.164 normalized phone numbers. - No synthetic email addresses (`@phone.imessage`, `@phone.gvoice`). -- `EnsureParticipantByPhone` deduplicates across sources: the same - phone number from WhatsApp, iMessage, and Google Voice resolves to - one `participants` row. -- `participant_identifiers` tracks which platforms a contact is known - on (`identifier_type = 'whatsapp'`, `'imessage'`, `'google_voice'`). -- A shared `NormalizePhone()` utility ensures consistent E.164 - normalization across all importers. +Phone number is the preferred unification key, but not all participants +have one. iMessage handles can be email addresses, and some senders are +short codes or system identifiers. + +**Resolution order:** +1. If the handle normalizes to a valid E.164 phone number, use + `EnsureParticipantByPhone` — this deduplicates across sources so the + same phone from WhatsApp, iMessage, and Google Voice resolves to one + `participants` row. +2. If the handle is an email address (common in iMessage), use the + existing `EnsureParticipant` by email — the participant gets an + `email_address` but no `phone_number`. +3. If the handle is neither (short codes, system senders), create a + participant with the raw handle stored in `participant_identifiers` + and no canonical phone or email. 
+ +No synthetic email addresses (`@phone.imessage`, `@phone.gvoice`). + +**Platform identifier tracking:** `EnsureParticipantByPhone` (and the +email path) accept an `identifierType` parameter (`'whatsapp'`, +`'imessage'`, `'google_voice'`) so each importer registers its own +platform-specific identifier in `participant_identifiers`. The current +WhatsApp-hardcoded behavior is generalized. + +A shared `NormalizePhone()` utility ensures consistent E.164 +normalization across all importers. It returns an error for inputs that +cannot be normalized (email handles, short codes), signaling the caller +to fall through to path 2 or 3 above. ### Message Storage | Column | Value | |---|---| -| `messages.message_type` | `'whatsapp'`, `'imessage'`, `'sms'`, `'google_voice'` (see note below) | +| `messages.message_type` | `'whatsapp'`, `'imessage'`, `'sms'`, `'google_voice_text'`, `'google_voice_call'`, `'google_voice_voicemail'` | | `messages.sender_id` | FK to `participants.id` (direct link, not via `message_recipients`) | | `messages.subject` | NULL for text messages | | `conversations.conversation_type` | `'group_chat'` or `'direct_chat'` | -| `conversations.title` | Group name, or contact name for 1:1 chats | +| `conversations.title` | Group name, or resolved contact name for 1:1 (see fallback below) | | `sources.source_type` | `'whatsapp'`, `'apple_messages'`, `'google_voice'` | | `message_bodies.body_text` | Message text stored directly | | `message_raw.raw_format` | `'whatsapp_json'`, `'imessage_json'`, `'gvoice_html'` | @@ -48,16 +67,39 @@ All text message importers converge on the same storage pattern. No synthetic MIME wrapping for text messages. Body text goes directly into `message_bodies`. Raw source data is stored in its native format. +### Conversation Title Fallback + +Group chats use the group name from the source (WhatsApp subject, +iMessage `display_name`). For 1:1 chats, title is resolved with this +fallback chain: +1. 
`chat.display_name` (if set by the source) +2. Other participant's `display_name` from `participants` +3. Other participant's phone number or email handle + +The TUI Conversations view uses this title for display. If the title +is still empty at display time (e.g., participant not yet resolved), +the raw handle is shown. + ### Message Type Values - iMessage sets `'imessage'` or `'sms'` based on the service field in `chat.db` (Apple distinguishes these natively). -- Google Voice uses `'google_voice'` for all record types (texts, calls, - voicemails). Call records and voicemails are differentiated via labels +- Google Voice uses distinct `message_type` values per record kind: + `'google_voice_text'` for SMS/MMS, `'google_voice_call'` for call + records, and `'google_voice_voicemail'` for voicemails. Labels (`sms`, `mms`, `call_received`, `call_placed`, `call_missed`, - `voicemail`) rather than separate `message_type` values. Call records - have `conversation_type = 'direct_chat'` and are grouped into - `calls:` threads. + `voicemail`) provide finer-grained classification within each type. + Call records have `conversation_type = 'direct_chat'` and are + grouped into `calls:` threads. + +### Texts Mode Message Type Filtering + +Texts mode displays messages where `message_type` is one of: +`'whatsapp'`, `'imessage'`, `'sms'`, `'google_voice_text'`. Call +records (`'google_voice_call'`) and voicemails +(`'google_voice_voicemail'`) are excluded from the default Texts view. +They are accessible via the Labels aggregate view when filtered to the +relevant label. ### `conversation_participants` @@ -76,8 +118,10 @@ Each importer is its own package with source-specific parsing: No shared interface is forced — each source is too different. But all converge on the same store methods for persistence: -`EnsureParticipantByPhone`, `EnsureConversationWithType`, message -insertion with proper `message_type`/`sender_id`/`conversation_type`. 
+`EnsureParticipantByPhone(phone, identifierType)`, +`EnsureParticipant(email, identifierType)` (for email-based handles), +`EnsureConversationWithType`, and message insertion with proper +`message_type`/`sender_id`/`conversation_type`. ### Shared Utilities (`internal/textimport/`) @@ -89,10 +133,12 @@ insertion with proper `message_type`/`sender_id`/`conversation_type`. Drop `gmail.API` interface implementation and synthetic MIME generation. Instead: - Read from `chat.db` directly (parsing stays the same) -- Call store methods for persistence with proper phone-based participants +- Resolve participants via phone or email (iMessage handles can be + either); use `NormalizePhone` first, fall back to email path - Set `message_type = 'imessage'` or `'sms'` (based on iMessage service field) - Set `conversation_type` based on chat type (group vs 1:1) -- Populate `conversations.title` from `chat.display_name` +- Populate `conversations.title` using the fallback chain (see + Conversation Title Fallback section) ### Google Voice Refactoring @@ -100,7 +146,8 @@ Drop `gmail.API` interface implementation and synthetic MIME generation. 
Instead: - Parse HTML/VCF files (parsing stays the same) - Call store methods for persistence with proper phone-based participants -- Set `message_type = 'google_voice'` +- Set `message_type` per record kind: `'google_voice_text'`, + `'google_voice_call'`, or `'google_voice_voicemail'` - Set `conversation_type` based on participant count - Store body text directly, raw HTML in `message_raw` @@ -185,7 +232,7 @@ All existing patterns carry over: - `message_id`, `source_id`, `conversation_id` - `phone_number`, `display_name` (sender) -- `message_type` (whatsapp/imessage/sms/google_voice) +- `message_type` (whatsapp/imessage/sms/google_voice_text/google_voice_call/google_voice_voicemail) - `source_type` (whatsapp/apple_messages/google_voice) - `conversation_title`, `conversation_type` - `sent_at`, `year` (partition key) From f0c3c636072252caab250168f575c18fc10f8935 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 19:28:21 -0500 Subject: [PATCH 06/65] Address second round of review findings in spec - Texts mode is explicitly read-only (no deletion staging); imported archives have no live delete API - Scope the cross-channel unification principle honestly: phone-based dedup works, email-only iMessage handles remain separate until address book resolution - Conversation stats maintained by the store layer on insert, not left to each importer - Unified Parquet cache with mode filtering instead of separate texts/ directory - Label persistence is part of the shared importer contract - FTS backfill updated to populate sender from phone_number via sender_id for text messages Co-Authored-By: Claude Opus 4.6 (1M context) --- ...3-31-unified-text-message-import-design.md | 159 ++++++++++++++---- 1 file changed, 125 insertions(+), 34 deletions(-) diff --git a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md index 4539e55d..aac1597f 100644 --- 
a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md +++ b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md @@ -6,15 +6,25 @@ shared schema, unified participant model, and dedicated TUI experience. ## Guiding Principles -1. **Phone number is the unification key.** If you communicate with - someone through multiple channels (iMessage, WhatsApp, Google Voice), - all messages appear under one contact. +1. **Phone number is the primary unification key.** If you communicate + with someone through multiple channels (iMessage, WhatsApp, Google + Voice) using the same phone number, all messages appear under one + contact. Cross-channel unification where the only shared identifier + is an address book entry (e.g., alice@icloud.com in iMessage and + +1... in WhatsApp) requires address book resolution, which is + deferred. Phone-based dedup handles the common case; gaps are + acknowledged, not hidden. 2. **Texts are not emails.** The TUI has a separate Texts mode with conversation-centric navigation, not the sender-aggregate model used for email. 3. **Consistent UX across modes.** Same keybindings, sort/filter patterns, and visual language in both Email and Texts modes. Only the available views and drill-down behavior differ. +4. **Texts mode is read-only.** Imported text archives have no live + delete API (iMessage reads a local DB, WhatsApp reads a backup, + GVoice reads a Takeout export). Deletion staging (`d`/`D`) is + disabled in Texts mode. Selection keybindings (`Space`/`A`) are + reserved for future use (e.g., export) but do not stage deletions. ## Schema & Persistence @@ -51,6 +61,13 @@ normalization across all importers. It returns an error for inputs that cannot be normalized (email handles, short codes), signaling the caller to fall through to path 2 or 3 above. +**Cross-channel limitations:** Participants matched by phone number are +unified automatically. 
Participants only known by email (e.g., an +iMessage contact using their iCloud address) remain separate from the +same person's phone-based participant until address book resolution is +implemented. The Contacts aggregate view in Texts mode will show these +as separate entries. + ### Message Storage | Column | Value | @@ -101,6 +118,42 @@ records (`'google_voice_call'`) and voicemails They are accessible via the Labels aggregate view when filtered to the relevant label. +### Conversation Stats Maintenance + +The `conversations` table has denormalized stats columns: +`message_count`, `participant_count`, `last_message_at`, +`last_message_preview`. These are required for the Conversations +primary view. + +**Store-level maintenance:** The store layer maintains these stats as +part of message insertion — not left to each importer. When a message +is inserted for a text source (identified by `message_type`), the +store updates the parent conversation's stats atomically: +- `message_count` incremented +- `last_message_at` updated if the new message is newer +- `last_message_preview` set to the message snippet +- `participant_count` updated when new `conversation_participants` + rows are added + +This replaces the WhatsApp importer's current approach of bulk- +updating stats in a post-processing step. All three importers get +correct stats automatically. + +### Label Persistence + +All importers that produce labels must create `labels` rows and link +them via `message_labels`. This is part of the shared persistence +contract: +- **WhatsApp:** source-specific labels as needed +- **iMessage:** `'iMessage'`, `'SMS'` (from service field) +- **Google Voice:** `'sms'`, `'mms'`, `'call_received'`, + `'call_placed'`, `'call_missed'`, `'voicemail'` + +The store provides `EnsureLabel(name, sourceID)` and +`LinkMessageLabel(messageID, labelID)`. Google Voice call/voicemail +records depend on labels for discoverability in the Labels aggregate +view. 
+ ### `conversation_participants` All three importers populate this table to track who is in each @@ -120,13 +173,17 @@ No shared interface is forced — each source is too different. But all converge on the same store methods for persistence: `EnsureParticipantByPhone(phone, identifierType)`, `EnsureParticipant(email, identifierType)` (for email-based handles), -`EnsureConversationWithType`, and message insertion with proper -`message_type`/`sender_id`/`conversation_type`. +`EnsureConversationWithType`, `EnsureLabel`, `LinkMessageLabel`, and +message insertion with proper +`message_type`/`sender_id`/`conversation_type`. The store handles +conversation stats maintenance automatically on insert. ### Shared Utilities (`internal/textimport/`) -- `NormalizePhone(raw string) string` — E.164 normalization -- Progress reporting (callback-based, like WhatsApp's `ImportCLIProgress`) +- `NormalizePhone(raw string) (string, error)` — E.164 normalization; + returns error for non-phone inputs +- Progress reporting (callback-based, like WhatsApp's + `ImportCLIProgress`) ### iMessage Refactoring @@ -135,10 +192,12 @@ Instead: - Read from `chat.db` directly (parsing stays the same) - Resolve participants via phone or email (iMessage handles can be either); use `NormalizePhone` first, fall back to email path -- Set `message_type = 'imessage'` or `'sms'` (based on iMessage service field) +- Set `message_type = 'imessage'` or `'sms'` (based on iMessage + service field) - Set `conversation_type` based on chat type (group vs 1:1) - Populate `conversations.title` using the fallback chain (see Conversation Title Fallback section) +- Create labels (`'iMessage'`, `'SMS'`) and link to messages ### Google Voice Refactoring @@ -150,6 +209,8 @@ Instead: `'google_voice_call'`, or `'google_voice_voicemail'` - Set `conversation_type` based on participant count - Store body text directly, raw HTML in `message_raw` +- Create labels (`'sms'`, `'mms'`, `'call_received'`, etc.) 
and link + to messages ### WhatsApp @@ -190,7 +251,10 @@ The default view when entering Texts mode. Each row shows: - Default sort: last message date (newest first) - Drill into a conversation: chronological message timeline -- Messages display in compact chat style (timestamp, sender, body snippet) +- Messages display in compact chat style (timestamp, sender, body + snippet) +- Conversation stats (Messages, Participants, Last Message) come from + denormalized columns maintained by the store layer ### Aggregate Views (Tab to Cycle) @@ -214,43 +278,65 @@ All existing patterns carry over: - Account filter (`a`) — doubles as source-type filter - Date range, attachment filter - Search (`/`) — queries FTS, results filtered to text messages -- Selection (`Space`/`A`), deletion staging (`d`/`D`) - Sort cycling (`s`), reverse (`r`) +**Read-only:** Deletion staging (`d`/`D`) and selection (`Space`/`A`) +are disabled in Texts mode. Imported text archives have no live delete +API — iMessage reads a local DB snapshot, WhatsApp reads a decrypted +backup, GVoice reads a Takeout export. There is no server to delete +from. + ## Parquet Analytics -### Separate Cache for Texts +### Unified Cache with Mode Filtering + +Text messages are stored in the same Parquet cache as emails, with +additional columns to support mode-specific queries. This avoids +duplicating the entire cache/query/staleness infrastructure. 
``` ~/.msgvault/analytics/ - messages/year=*/ # Email (existing) - texts/year=*/ # Text messages (new) + messages/year=*/ # All messages (email + text) _last_sync.json ``` -### Text Parquet Schema (Denormalized) +### Additional Parquet Columns + +The existing denormalized Parquet schema is extended with: +- `phone_number` (sender, from `participants.phone_number`) +- `message_type` (whatsapp/imessage/sms/google_voice_*/email) +- `source_type` (whatsapp/apple_messages/google_voice/gmail) +- `conversation_title` (from `conversations.title`) +- `conversation_type` (group_chat/direct_chat/email_thread) +- `sender_id` (from `messages.sender_id`) -- `message_id`, `source_id`, `conversation_id` -- `phone_number`, `display_name` (sender) -- `message_type` (whatsapp/imessage/sms/google_voice_text/google_voice_call/google_voice_voicemail) -- `source_type` (whatsapp/apple_messages/google_voice) -- `conversation_title`, `conversation_type` -- `sent_at`, `year` (partition key) -- `body_length`, `has_attachments`, `attachment_count` -- `to_phones[]` (recipient phone numbers) -- `labels[]` +Email mode queries filter `WHERE message_type = 'email'` (or +`source_type = 'gmail'`). Texts mode queries filter on the text +message types. The DuckDB query engine branches on mode for aggregate +key columns (email uses `from_email`/`from_domain`; texts use +`phone_number`/`conversation_title`). ### Query Engine -DuckDB query engine gets parallel methods for texts — same -aggregate/filter patterns as email but keyed on phone numbers and -conversations instead of email addresses and domains. +The DuckDB query engine gains mode-aware aggregate methods. 
Same +function signatures as email aggregates, but the mode determines: +- Which `message_type` values are included +- Which columns are used for grouping (phone vs email, conversation + vs domain) +- Which views are available (Conversations is texts-only; Domains is + email-only) + +Existing email queries are unchanged — they gain an implicit +`message_type = 'email'` filter. ## Search -Text messages are indexed in `messages_fts` alongside emails. Search -in Texts mode filters results to text message types; search in Email -mode filters to email. The FTS table and indexing pipeline are shared. +Text messages are indexed in `messages_fts` alongside emails. The +FTS backfill pipeline is updated to populate the `from_addr` field +from `participants.phone_number` (via `messages.sender_id`) for text +messages, rather than only reading from `message_recipients` email +fields. Search in Texts mode filters results to text message types; +search in Email mode filters to email. ## Scope @@ -259,16 +345,21 @@ mode filters to email. The FTS table and indexing pipeline are shared. 
- Refactor iMessage and Google Voice to phone-based persistence - Shared `NormalizePhone()` utility - Participant deduplication by phone number across all sources +- Store-level conversation stats maintenance +- Label persistence contract for all importers - CLI command renaming -- TUI Texts mode (Conversations + aggregate views) -- Text message Parquet cache and DuckDB query methods -- FTS indexing of text messages -- `build-cache` builds both email and text Parquet files +- TUI Texts mode (Conversations + aggregate views), read-only +- Unified Parquet cache with mode-aware columns and queries +- FTS indexing of text messages (including phone-based sender lookup) +- `build-cache` exports text messages alongside emails ### Deferred - WhatsApp web sync API (future import method) - MMS/iMessage attachment extraction -- Contact name resolution from macOS address book +- Contact name resolution from macOS address book (needed for full + cross-channel unification of email-only iMessage handles with + phone-based contacts) - Cross-mode unified search (emails + texts together) - Rich message detail view for texts (headers, raw data display) +- Deletion support for text sources with live APIs From 645fd6248f56786fdeed058dd69c97b94c3ce4fc Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 19:35:15 -0500 Subject: [PATCH 07/65] Address third round of review findings in spec - Replace incremental stats with idempotent RecomputeConversationStats post-import step to avoid counter drift on re-imports - Define Texts mode as a parallel navigation tree with new types (TextViewType, ConversationRow) and separate query methods, not a parameterization of the existing email aggregate model - Explicitly disable message detail in Texts mode (Enter is no-op on timeline messages) since the detail model is email-shaped - Clarify source filter as per-account (same plumbing as email), not a source-type bucket; defer source-type grouping Co-Authored-By: Claude Opus 4.6 (1M 
context) --- ...3-31-unified-text-message-import-design.md | 166 +++++++++++++----- 1 file changed, 122 insertions(+), 44 deletions(-) diff --git a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md index aac1597f..b22423b6 100644 --- a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md +++ b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md @@ -118,26 +118,30 @@ records (`'google_voice_call'`) and voicemails They are accessible via the Labels aggregate view when filtered to the relevant label. -### Conversation Stats Maintenance +### Conversation Stats The `conversations` table has denormalized stats columns: `message_count`, `participant_count`, `last_message_at`, `last_message_preview`. These are required for the Conversations primary view. -**Store-level maintenance:** The store layer maintains these stats as -part of message insertion — not left to each importer. When a message -is inserted for a text source (identified by `message_type`), the -store updates the parent conversation's stats atomically: -- `message_count` incremented -- `last_message_at` updated if the new message is newer -- `last_message_preview` set to the message snippet -- `participant_count` updated when new `conversation_participants` - rows are added +**Recomputation, not incremental updates.** Message insertion is +idempotent (`INSERT ... ON CONFLICT DO UPDATE`) and imports are +expected to be re-runnable. Incrementing counters on each upsert +would cause drift on re-imports. Instead, conversation stats are +recomputed from table state: -This replaces the WhatsApp importer's current approach of bulk- -updating stats in a post-processing step. All three importers get -correct stats automatically. +- Each importer calls `RecomputeConversationStats(sourceID)` as a + post-import step (like WhatsApp already does today). 
+- This runs aggregate queries against `messages` and + `conversation_participants` to set `message_count`, + `participant_count`, `last_message_at`, and `last_message_preview` + for all conversations belonging to that source. +- The operation is idempotent — running it twice produces the same + result regardless of how many times the import ran. + +This keeps the existing upsert/ignore semantics untouched and gives +all three importers correct stats via one shared store method. ### Label Persistence @@ -173,10 +177,9 @@ No shared interface is forced — each source is too different. But all converge on the same store methods for persistence: `EnsureParticipantByPhone(phone, identifierType)`, `EnsureParticipant(email, identifierType)` (for email-based handles), -`EnsureConversationWithType`, `EnsureLabel`, `LinkMessageLabel`, and -message insertion with proper -`message_type`/`sender_id`/`conversation_type`. The store handles -conversation stats maintenance automatically on insert. +`EnsureConversationWithType`, `EnsureLabel`, `LinkMessageLabel`, +`RecomputeConversationStats`, and message insertion with proper +`message_type`/`sender_id`/`conversation_type`. ### Shared Utilities (`internal/textimport/`) @@ -198,6 +201,7 @@ Instead: - Populate `conversations.title` using the fallback chain (see Conversation Title Fallback section) - Create labels (`'iMessage'`, `'SMS'`) and link to messages +- Call `RecomputeConversationStats` after import completes ### Google Voice Refactoring @@ -211,11 +215,13 @@ Instead: - Store body text directly, raw HTML in `message_raw` - Create labels (`'sms'`, `'mms'`, `'call_received'`, etc.) and link to messages +- Call `RecomputeConversationStats` after import completes ### WhatsApp Mostly fine as-is — already follows the target pattern. 
Minor cleanup: - Use shared `NormalizePhone()` instead of internal normalization +- Migrate bulk stats update to shared `RecomputeConversationStats` - Ensure consistent `raw_format` naming ### CLI Commands @@ -234,11 +240,55 @@ import methods if needed. ## TUI Texts Mode -### Mode Switching +### New Navigation Model + +Texts mode requires a different navigation shape than Email mode. The +current TUI is built around a single-key aggregate model: `ViewType` +selects a grouping dimension (sender, domain, label, time), +`AggregateRow` holds one key plus counts/sizes, and drill-down goes +from aggregate → message list → message detail. This structure does +not accommodate conversation-first navigation. + +**Texts mode introduces a parallel navigation tree:** + +``` +Texts Mode +├── Conversations view (primary) +│ └── Drill: conversation → message timeline +├── Contacts view (aggregate) +│ └── Drill: contact → conversations with that contact → timeline +├── Contact Names view (aggregate) +│ └── Drill: name → conversations → timeline +├── Sources view (aggregate) +│ └── Drill: source → conversations from that source → timeline +├── Labels view (aggregate) +│ └── Drill: label → messages with that label +└── Time view (aggregate) + └── Drill: period → conversations active in that period → timeline +``` -A new key (`m`) toggles between Email mode and Texts mode. The status -bar shows the current mode. All existing email TUI behavior is -unchanged in Email mode. +**Implementation approach:** This is a new set of view types, query +methods, and TUI states — not a parameterization of the existing email +views. + +- New `TextViewType` enum: `TextViewConversations`, + `TextViewContacts`, `TextViewContactNames`, `TextViewSources`, + `TextViewLabels`, `TextViewTime`. +- New `ConversationRow` struct for the Conversations view: `Title`, + `SourceType`, `MessageCount`, `ParticipantCount`, `LastMessageAt`, + `ConversationID`. 
This is not an `AggregateRow` — it has different + fields and different drill-down semantics. +- New query engine methods: `ListConversations(filter)`, + `TextAggregate(viewType, opts)`, `ListConversationMessages(convID, + filter)`. These are separate from the email `Aggregate`/ + `ListMessages` methods. +- New TUI state machine entries for Texts mode navigation. The mode + key (`m`) switches between the two state machines. Keybindings that + overlap (Tab, Enter, Esc, `s`, `r`, `a`, `/`, `?`, `q`) behave + the same way within each mode's navigation tree. + +The email TUI code is untouched. Texts mode is additive — new files, +new types, new methods. ### Conversations View (Primary) @@ -253,8 +303,8 @@ The default view when entering Texts mode. Each row shows: - Drill into a conversation: chronological message timeline - Messages display in compact chat style (timestamp, sender, body snippet) -- Conversation stats (Messages, Participants, Last Message) come from - denormalized columns maintained by the store layer +- Conversation stats come from denormalized columns recomputed + post-import ### Aggregate Views (Tab to Cycle) @@ -272,19 +322,35 @@ The default view when entering Texts mode. Each row shows: sources), then drill into a specific conversation - From Time: conversations active in that period +### Message Detail + +Pressing Enter on a message in the timeline does not open the email- +style detail view. The current detail model is email-shaped: +participants are `Address{Email, Name}` only, participant loading +reads `message_recipients`, and fallback body extraction assumes MIME +raw format. None of this works for text messages. + +In Texts mode, Enter on a message in the timeline is a no-op (or +scrolls to show the full message body inline if truncated). A proper +text message detail view is deferred. 
+ ### Filters and Interaction All existing patterns carry over: -- Account filter (`a`) — doubles as source-type filter +- Source filter (`a`) — in Texts mode, this presents a list of text + sources (each `sources` row where `source_type` is a text type). + This is a per-account filter, same as Email mode — not a source- + type bucket. To filter by source type (e.g., "all WhatsApp"), the + user selects the specific WhatsApp account. If source-type grouping + is needed later, it would be a new filter dimension, not a reuse of + the account selector. - Date range, attachment filter - Search (`/`) — queries FTS, results filtered to text messages - Sort cycling (`s`), reverse (`r`) **Read-only:** Deletion staging (`d`/`D`) and selection (`Space`/`A`) are disabled in Texts mode. Imported text archives have no live delete -API — iMessage reads a local DB snapshot, WhatsApp reads a decrypted -backup, GVoice reads a Takeout export. There is no server to delete -from. +API. ## Parquet Analytics @@ -311,23 +377,31 @@ The existing denormalized Parquet schema is extended with: - `sender_id` (from `messages.sender_id`) Email mode queries filter `WHERE message_type = 'email'` (or -`source_type = 'gmail'`). Texts mode queries filter on the text -message types. The DuckDB query engine branches on mode for aggregate -key columns (email uses `from_email`/`from_domain`; texts use -`phone_number`/`conversation_title`). +`source_type IN ('gmail', 'imap')`). Texts mode queries filter on the +text message types. The DuckDB query engine branches on mode for +aggregate key columns (email uses `from_email`/`from_domain`; texts +use `phone_number`/`conversation_title`). ### Query Engine -The DuckDB query engine gains mode-aware aggregate methods. 
Same -function signatures as email aggregates, but the mode determines: -- Which `message_type` values are included -- Which columns are used for grouping (phone vs email, conversation - vs domain) -- Which views are available (Conversations is texts-only; Domains is - email-only) - -Existing email queries are unchanged — they gain an implicit -`message_type = 'email'` filter. +The DuckDB query engine gains new methods for Texts mode — these are +separate functions, not parameterizations of the existing email +methods: + +- `ListConversations(filter TextFilter) ([]ConversationRow, error)` — + queries denormalized conversation stats from Parquet, filtered and + sorted. +- `TextAggregate(viewType TextViewType, opts TextAggregateOptions) + ([]AggregateRow, error)` — aggregates text messages by contact, + source, label, or time. Reuses `AggregateRow` since the shape + (key + count + size) fits these views. +- `ListConversationMessages(convID int64, filter TextFilter) + ([]MessageSummary, error)` — messages within a single conversation, + chronological. + +Existing email query methods are unchanged — they gain an implicit +`message_type = 'email'` filter to exclude text messages from email +views. ## Search @@ -345,13 +419,16 @@ search in Email mode filters to email. 
- Refactor iMessage and Google Voice to phone-based persistence - Shared `NormalizePhone()` utility - Participant deduplication by phone number across all sources -- Store-level conversation stats maintenance +- `RecomputeConversationStats` shared store method - Label persistence contract for all importers - CLI command renaming -- TUI Texts mode (Conversations + aggregate views), read-only +- TUI Texts mode with new navigation model (Conversations + + aggregates + message timeline), read-only, detail view disabled +- New query engine methods for text conversations and aggregates - Unified Parquet cache with mode-aware columns and queries - FTS indexing of text messages (including phone-based sender lookup) - `build-cache` exports text messages alongside emails +- Source filter in Texts mode (per-account, same plumbing as email) ### Deferred @@ -361,5 +438,6 @@ search in Email mode filters to email. cross-channel unification of email-only iMessage handles with phone-based contacts) - Cross-mode unified search (emails + texts together) -- Rich message detail view for texts (headers, raw data display) +- Rich message detail view for texts - Deletion support for text sources with live APIs +- Source-type bucket filter (filter by "all WhatsApp" vs per-account) From 73a0f0e1a8573fe24029555684dd9eac1dc2757f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 19:41:16 -0500 Subject: [PATCH 08/65] Address fourth round of review findings in spec - Remove all ambiguity from conversation stats: explicitly state stats are NOT maintained during insert, only recomputed post-import - Replace vague keybinding claims with explicit mapping table showing every key's behavior in both Email and Texts modes - Define Texts mode search as plain full-text only; Gmail-style operators (from:, subject:, etc.) 
are email-mode only Co-Authored-By: Claude Opus 4.6 (1M context) --- ...3-31-unified-text-message-import-design.md | 96 ++++++++++++------- 1 file changed, 60 insertions(+), 36 deletions(-) diff --git a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md index b22423b6..105002dc 100644 --- a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md +++ b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md @@ -125,23 +125,17 @@ The `conversations` table has denormalized stats columns: `last_message_preview`. These are required for the Conversations primary view. -**Recomputation, not incremental updates.** Message insertion is -idempotent (`INSERT ... ON CONFLICT DO UPDATE`) and imports are -expected to be re-runnable. Incrementing counters on each upsert -would cause drift on re-imports. Instead, conversation stats are -recomputed from table state: - -- Each importer calls `RecomputeConversationStats(sourceID)` as a - post-import step (like WhatsApp already does today). -- This runs aggregate queries against `messages` and - `conversation_participants` to set `message_count`, - `participant_count`, `last_message_at`, and `last_message_preview` - for all conversations belonging to that source. -- The operation is idempotent — running it twice produces the same - result regardless of how many times the import ran. - -This keeps the existing upsert/ignore semantics untouched and gives -all three importers correct stats via one shared store method. +**Stats are not maintained during message insertion.** Message +insertion is idempotent (`INSERT ... ON CONFLICT DO UPDATE`) and +imports are expected to be re-runnable. The store does not attempt to +detect insert-vs-update, and does not increment counters on upsert. + +Instead, each importer calls `RecomputeConversationStats(sourceID)` +as a post-import step (like WhatsApp already does today). 
This runs +aggregate queries against `messages` and `conversation_participants` +to set all stats columns for conversations belonging to that source. +The operation is idempotent — running it twice produces the same +result. ### Label Persistence @@ -334,23 +328,34 @@ In Texts mode, Enter on a message in the timeline is a no-op (or scrolls to show the full message body inline if truncated). A proper text message detail view is deferred. -### Filters and Interaction - -All existing patterns carry over: -- Source filter (`a`) — in Texts mode, this presents a list of text - sources (each `sources` row where `source_type` is a text type). - This is a per-account filter, same as Email mode — not a source- - type bucket. To filter by source type (e.g., "all WhatsApp"), the - user selects the specific WhatsApp account. If source-type grouping - is needed later, it would be a new filter dimension, not a reuse of - the account selector. -- Date range, attachment filter -- Search (`/`) — queries FTS, results filtered to text messages -- Sort cycling (`s`), reverse (`r`) - -**Read-only:** Deletion staging (`d`/`D`) and selection (`Space`/`A`) -are disabled in Texts mode. Imported text archives have no live delete -API. +### Keybindings + +Texts mode reuses the same key assignments as Email mode where the +action applies. Keys that map to email-only actions are disabled. + +| Key | Email mode | Texts mode | +|-----|-----------|------------| +| `Tab` | Cycle aggregate views | Cycle text views (Conversations → Contacts → ...) 
| +| `Enter` | Drill down | Drill down (conversation → timeline; no message detail) | +| `Esc`/`Backspace` | Go back | Go back | +| `j`/`k`/`↑`/`↓` | Navigate rows | Navigate rows | +| `s` | Cycle sort field | Cycle sort field | +| `r` | Reverse sort | Reverse sort | +| `t` | Jump to Time view | Jump to Time view | +| `A` | Account selector | Source selector (lists text source accounts) | +| `a` | Jump to all messages | Jump to all conversations (reset filters) | +| `f` | Filter by attachments | Filter by attachments | +| `/` | Search (email FTS) | Search (text FTS, plain text only) | +| `?` | Help | Help | +| `q` | Quit | Quit | +| `m` | Switch to Texts mode | Switch to Email mode | +| `Space` | Toggle selection | Disabled (no deletion staging) | +| `d`/`D` | Stage deletion | Disabled (read-only) | +| `x` | Clear selection | Disabled | + +The `A` key opens the same account selector UI but filtered to text +sources. This is a per-account filter (same `SourceID *int64` +plumbing), not a source-type bucket. ## Parquet Analytics @@ -405,12 +410,31 @@ views. ## Search +### FTS Indexing + Text messages are indexed in `messages_fts` alongside emails. The FTS backfill pipeline is updated to populate the `from_addr` field from `participants.phone_number` (via `messages.sender_id`) for text messages, rather than only reading from `message_recipients` email -fields. Search in Texts mode filters results to text message types; -search in Email mode filters to email. +fields. + +### Search Semantics by Mode + +**Email mode** retains the current Gmail-style search operators: +`from:`, `to:`, `cc:`, `bcc:`, `subject:`, `account:`, etc. These +resolve against `message_recipients` and email-specific fields. No +changes. + +**Texts mode uses plain full-text search only.** The `/` key opens +the same search input, but the query is treated as a plain text match +against `messages_fts` (body + sender phone/name), filtered to text +message types. 
The Gmail-style operators (`from:`, `subject:`, etc.) +are not supported in Texts mode — they map to email-specific fields +(`message_recipients`, `subject`) that don't apply to text messages. + +If structured text search is needed later (e.g., `from:+1555...`, +`in:groupname`), it would be a new parser for text-specific +operators. For now, plain FTS is sufficient. ## Scope From 04f3725f02ff580d9d53ec2c9e9fa3d1fec924a4 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 19:45:23 -0500 Subject: [PATCH 09/65] Address fifth round of review findings in spec - Fix A keybinding: principle 4 incorrectly listed A (account selector) as a disabled selection key; actual selection keys are Space and S - Introduce TextEngine as a separate interface from Engine to avoid rippling text query methods through remote/API/MCP/mock layers - DuckDBEngine implements both Engine and TextEngine; remote layers only implement TextEngine when remote Texts mode is added Co-Authored-By: Claude Opus 4.6 (1M context) --- ...3-31-unified-text-message-import-design.md | 57 ++++++++++++------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md index 105002dc..4218a96d 100644 --- a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md +++ b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md @@ -22,9 +22,8 @@ shared schema, unified participant model, and dedicated TUI experience. available views and drill-down behavior differ. 4. **Texts mode is read-only.** Imported text archives have no live delete API (iMessage reads a local DB, WhatsApp reads a backup, - GVoice reads a Takeout export). Deletion staging (`d`/`D`) is - disabled in Texts mode. Selection keybindings (`Space`/`A`) are - reserved for future use (e.g., export) but do not stage deletions. + GVoice reads a Takeout export). 
Deletion staging (`d`/`D`) and + selection (`Space`/`S`) are disabled in Texts mode. ## Schema & Persistence @@ -282,7 +281,35 @@ views. the same way within each mode's navigation tree. The email TUI code is untouched. Texts mode is additive — new files, -new types, new methods. +new types, new query interface. + +**Separate query interface.** Text query methods live in a new +`TextEngine` interface (in `internal/query/`), not on the existing +`Engine` interface. `Engine` is shared across the local TUI, remote +engine, API server, MCP server, and test mocks — adding methods to it +would force changes across all those layers. `TextEngine` is consumed +only by the Texts mode TUI, so it avoids that ripple. + +```go +type TextEngine interface { + ListConversations(ctx context.Context, + filter TextFilter) ([]ConversationRow, error) + TextAggregate(ctx context.Context, viewType TextViewType, + opts TextAggregateOptions) ([]AggregateRow, error) + ListConversationMessages(ctx context.Context, convID int64, + filter TextFilter) ([]MessageSummary, error) + TextSearch(ctx context.Context, query string, + limit, offset int) ([]MessageSummary, error) + GetTextStats(ctx context.Context, + opts TextStatsOptions) (*TotalStats, error) +} +``` + +`DuckDBEngine` implements both `Engine` and `TextEngine` (it already +has access to the Parquet data). `SQLiteEngine` can also implement +`TextEngine` as a fallback. The remote engine, API server, and MCP +server do not need to implement `TextEngine` until remote Texts mode +is added (deferred). ### Conversations View (Primary) @@ -389,22 +416,12 @@ use `phone_number`/`conversation_title`). ### Query Engine -The DuckDB query engine gains new methods for Texts mode — these are -separate functions, not parameterizations of the existing email -methods: - -- `ListConversations(filter TextFilter) ([]ConversationRow, error)` — - queries denormalized conversation stats from Parquet, filtered and - sorted. 
-- `TextAggregate(viewType TextViewType, opts TextAggregateOptions) - ([]AggregateRow, error)` — aggregates text messages by contact, - source, label, or time. Reuses `AggregateRow` since the shape - (key + count + size) fits these views. -- `ListConversationMessages(convID int64, filter TextFilter) - ([]MessageSummary, error)` — messages within a single conversation, - chronological. - -Existing email query methods are unchanged — they gain an implicit +`DuckDBEngine` implements the new `TextEngine` interface (see TUI +section) alongside the existing `Engine` interface. Text query +methods are separate functions on the same struct, not additions to +the `Engine` interface. + +Existing email query methods on `Engine` gain an implicit `message_type = 'email'` filter to exclude text messages from email views. From a580c288006d4fccde3b9341685dd0bbed14fd37 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 19:58:28 -0500 Subject: [PATCH 10/65] Add implementation plan for unified text message import 23 tasks across 5 phases: foundation store methods, importer refactoring (iMessage + GVoice + WhatsApp cleanup), Parquet cache + TextEngine query interface, TUI Texts mode, and integration testing. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../2026-03-31-unified-text-message-import.md | 1554 +++++++++++++++++ 1 file changed, 1554 insertions(+) create mode 100644 docs/superpowers/plans/2026-03-31-unified-text-message-import.md diff --git a/docs/superpowers/plans/2026-03-31-unified-text-message-import.md b/docs/superpowers/plans/2026-03-31-unified-text-message-import.md new file mode 100644 index 00000000..719d0540 --- /dev/null +++ b/docs/superpowers/plans/2026-03-31-unified-text-message-import.md @@ -0,0 +1,1554 @@ +# Unified Text Message Import Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. 
Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Merge WhatsApp, iMessage, and Google Voice import into a coherent system with shared phone-based participants, proper schema usage, and a dedicated TUI Texts mode. + +**Architecture:** Five sequential phases: (1) shared store/utility foundation, (2) importer refactoring to use store methods directly, (3) Parquet cache extension + TextEngine query interface, (4) TUI Texts mode, (5) CLI command renaming. Each phase builds on the previous. + +**Tech Stack:** Go, SQLite (mattn/go-sqlite3), DuckDB (go-duckdb), Bubble Tea TUI, Parquet/Arrow + +**Spec:** `docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md` + +--- + +## Phase 1: Foundation — Shared Utilities & Store Methods + +### Task 1: NormalizePhone Utility + +**Files:** +- Create: `internal/textimport/phone.go` +- Create: `internal/textimport/phone_test.go` + +- [ ] **Step 1: Write tests for NormalizePhone** + +```go +// internal/textimport/phone_test.go +package textimport + +import "testing" + +func TestNormalizePhone(t *testing.T) { + tests := []struct { + input string + want string + wantErr bool + }{ + // Valid E.164 + {"+15551234567", "+15551234567", false}, + // Strip formatting + {"+1 (555) 123-4567", "+15551234567", false}, + {"+1-555-123-4567", "+15551234567", false}, + {"1-555-123-4567", "+15551234567", false}, + // International + {"+447700900000", "+447700900000", false}, + {"+44 7700 900000", "+447700900000", false}, + // No country code — assume US + {"5551234567", "+15551234567", false}, + {"(555) 123-4567", "+15551234567", false}, + // Email — not a phone + {"alice@icloud.com", "", true}, + // Short code + {"12345", "", true}, + // Empty + {"", "", true}, + // System identifier + {"status@broadcast", "", true}, + } + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + got, err := NormalizePhone(tt.input) + if tt.wantErr { + if err == nil { + t.Errorf("NormalizePhone(%q) = %q, want error", tt.input, got) 
+ } + return + } + if err != nil { + t.Errorf("NormalizePhone(%q) error: %v", tt.input, err) + return + } + if got != tt.want { + t.Errorf("NormalizePhone(%q) = %q, want %q", tt.input, got, tt.want) + } + }) + } +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/textimport/ -run TestNormalizePhone -v` +Expected: FAIL — package does not exist + +- [ ] **Step 3: Implement NormalizePhone** + +```go +// internal/textimport/phone.go +package textimport + +import ( + "fmt" + "strings" + "unicode" +) + +// NormalizePhone normalizes a phone number to E.164 format. +// Returns an error for inputs that are not phone numbers (emails, +// short codes, system identifiers). +func NormalizePhone(raw string) (string, error) { + if raw == "" { + return "", fmt.Errorf("empty input") + } + // Reject email addresses + if strings.Contains(raw, "@") { + return "", fmt.Errorf("not a phone number: %q", raw) + } + + // Strip all non-digit and non-plus characters + var b strings.Builder + for _, r := range raw { + if r == '+' || unicode.IsDigit(r) { + b.WriteRune(r) + } + } + digits := b.String() + + // Must start with + or be all digits + if digits == "" { + return "", fmt.Errorf("no digits in input: %q", raw) + } + + // Strip leading + for length check + justDigits := strings.TrimPrefix(digits, "+") + if len(justDigits) < 7 { + return "", fmt.Errorf("too short for phone number: %q", raw) + } + + // Ensure + prefix + if !strings.HasPrefix(digits, "+") { + // Assume US country code if 10 digits + if len(justDigits) == 10 { + digits = "+1" + justDigits + } else if len(justDigits) == 11 && justDigits[0] == '1' { + digits = "+" + justDigits + } else { + digits = "+" + justDigits + } + } + + return digits, nil +} +``` + +- [ ] **Step 4: Run tests** + +Run: `go test ./internal/textimport/ -run TestNormalizePhone -v` +Expected: PASS + +- [ ] **Step 5: Run fmt/vet, commit** + +```bash +go fmt ./internal/textimport/... +go vet ./internal/textimport/... 
+git add internal/textimport/ +git commit -m "Add shared NormalizePhone utility for text importers" +``` + +### Task 2: Generalize EnsureParticipantByPhone + +**Files:** +- Modify: `internal/store/messages.go:910-960` (EnsureParticipantByPhone) +- Modify: `internal/whatsapp/importer.go` (callers) +- Create: `internal/store/messages_test.go` (test for new signature) + +The current `EnsureParticipantByPhone` hardcodes `identifier_type = 'whatsapp'` in its `participant_identifiers` INSERT. Generalize to accept `identifierType` as a parameter. + +- [ ] **Step 1: Write test for generalized EnsureParticipantByPhone** + +```go +// Add to internal/store/messages_test.go (create if needed) +func TestEnsureParticipantByPhone_IdentifierType(t *testing.T) { + s := setupTestStore(t) + defer func() { _ = s.Close() }() + + // Create participant via WhatsApp + id1, err := s.EnsureParticipantByPhone("+15551234567", "Alice", "whatsapp") + if err != nil { + t.Fatal(err) + } + + // Same phone via iMessage — should return same participant + id2, err := s.EnsureParticipantByPhone("+15551234567", "Alice", "imessage") + if err != nil { + t.Fatal(err) + } + if id1 != id2 { + t.Errorf("same phone different source got different IDs: %d vs %d", id1, id2) + } + + // Check both identifiers exist + var count int + err = s.DB().QueryRow( + "SELECT COUNT(*) FROM participant_identifiers WHERE participant_id = ?", id1, + ).Scan(&count) + if err != nil { + t.Fatal(err) + } + if count != 2 { + t.Errorf("expected 2 identifier rows, got %d", count) + } +} +``` + +This test needs a `setupTestStore` helper — use an in-memory SQLite DB with `InitSchema()`. 
Check if one already exists in the test file; if not, add: + +```go +func setupTestStore(t *testing.T) *Store { + t.Helper() + s, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + if err := s.InitSchema(); err != nil { + t.Fatal(err) + } + return s +} +``` + +- [ ] **Step 2: Run test, verify failure** + +Run: `go test -tags fts5 ./internal/store/ -run TestEnsureParticipantByPhone_IdentifierType -v` +Expected: FAIL — wrong number of arguments + +- [ ] **Step 3: Update EnsureParticipantByPhone signature** + +In `internal/store/messages.go:910`, change: + +```go +func (s *Store) EnsureParticipantByPhone(phone, displayName string) (int64, error) { +``` +to: +```go +func (s *Store) EnsureParticipantByPhone(phone, displayName, identifierType string) (int64, error) { +``` + +Find the hardcoded `'whatsapp'` in the INSERT into `participant_identifiers` (around line 945) and replace with the `identifierType` parameter. + +- [ ] **Step 4: Update all callers in whatsapp package** + +In `internal/whatsapp/importer.go`, find every call to `EnsureParticipantByPhone(phone, name)` and add `"whatsapp"` as the third argument. Use `grep -rn "EnsureParticipantByPhone"` to find all call sites. 
+ +- [ ] **Step 5: Run tests** + +Run: `go test -tags fts5 ./internal/store/ -run TestEnsureParticipantByPhone -v && go test ./internal/whatsapp/ -v` +Expected: PASS + +- [ ] **Step 6: Commit** + +```bash +git add internal/store/messages.go internal/store/messages_test.go internal/whatsapp/ +git commit -m "Generalize EnsureParticipantByPhone to accept identifierType" +``` + +### Task 3: RecomputeConversationStats Store Method + +**Files:** +- Modify: `internal/store/messages.go` (add method) +- Modify: `internal/whatsapp/importer.go:498-514` (replace inline SQL) + +- [ ] **Step 1: Write test** + +```go +func TestRecomputeConversationStats(t *testing.T) { + s := setupTestStore(t) + defer func() { _ = s.Close() }() + + // Create a source + sourceID, err := s.GetOrCreateSource("test_source", "whatsapp", "") + if err != nil { + t.Fatal(err) + } + + // Create a conversation + convID, err := s.EnsureConversationWithType(sourceID, "conv1", "direct_chat", "Test Chat") + if err != nil { + t.Fatal(err) + } + + // Insert two messages + for i, snippet := range []string{"hello", "world"} { + _, err := s.UpsertMessage(&Message{ + SourceID: sourceID, + SourceMessageID: fmt.Sprintf("msg%d", i), + ConversationID: convID, + Snippet: snippet, + SentAt: sql.NullTime{Time: time.Now().Add(time.Duration(i) * time.Hour), Valid: true}, + MessageType: "whatsapp", + }) + if err != nil { + t.Fatal(err) + } + } + + // Stats should be zero before recompute + var msgCount int64 + _ = s.DB().QueryRow("SELECT message_count FROM conversations WHERE id = ?", convID).Scan(&msgCount) + if msgCount != 0 { + t.Errorf("before recompute: message_count = %d, want 0", msgCount) + } + + // Recompute + if err := s.RecomputeConversationStats(sourceID); err != nil { + t.Fatal(err) + } + + // Verify + _ = s.DB().QueryRow("SELECT message_count FROM conversations WHERE id = ?", convID).Scan(&msgCount) + if msgCount != 2 { + t.Errorf("after recompute: message_count = %d, want 2", msgCount) + } + + // Running again 
should be idempotent + if err := s.RecomputeConversationStats(sourceID); err != nil { + t.Fatal(err) + } + _ = s.DB().QueryRow("SELECT message_count FROM conversations WHERE id = ?", convID).Scan(&msgCount) + if msgCount != 2 { + t.Errorf("after second recompute: message_count = %d, want 2", msgCount) + } +} +``` + +- [ ] **Step 2: Run test, verify failure** + +Run: `go test -tags fts5 ./internal/store/ -run TestRecomputeConversationStats -v` +Expected: FAIL — method not found + +- [ ] **Step 3: Implement RecomputeConversationStats** + +Add to `internal/store/messages.go`: + +```go +// RecomputeConversationStats recomputes denormalized stats +// (message_count, participant_count, last_message_at, +// last_message_preview) for all conversations belonging to sourceID. +// This is idempotent — safe to call after any import or re-import. +func (s *Store) RecomputeConversationStats(sourceID int64) error { + _, err := s.db.Exec(` + UPDATE conversations SET + message_count = ( + SELECT COUNT(*) FROM messages + WHERE conversation_id = conversations.id + ), + participant_count = ( + SELECT COUNT(*) FROM conversation_participants + WHERE conversation_id = conversations.id + ), + last_message_at = ( + SELECT MAX(COALESCE(sent_at, received_at, internal_date)) + FROM messages + WHERE conversation_id = conversations.id + ), + last_message_preview = ( + SELECT snippet FROM messages + WHERE conversation_id = conversations.id + ORDER BY COALESCE(sent_at, received_at, internal_date) DESC + LIMIT 1 + ) + WHERE source_id = ? 
+ `, sourceID) + if err != nil { + return fmt.Errorf("recompute conversation stats: %w", err) + } + return nil +} +``` + +- [ ] **Step 4: Run tests** + +Run: `go test -tags fts5 ./internal/store/ -run TestRecomputeConversationStats -v` +Expected: PASS + +- [ ] **Step 5: Replace WhatsApp inline SQL with shared method** + +In `internal/whatsapp/importer.go:498-514`, replace the inline `UPDATE conversations SET ...` with: + +```go +if err := imp.store.RecomputeConversationStats(source.ID); err != nil { + imp.log("Warning: failed to recompute conversation stats: %v", err) +} +``` + +- [ ] **Step 6: Run WhatsApp tests, commit** + +```bash +go test ./internal/whatsapp/ -v +go fmt ./... +git add internal/store/messages.go internal/store/messages_test.go internal/whatsapp/importer.go +git commit -m "Add shared RecomputeConversationStats store method" +``` + +### Task 4: Add LinkMessageLabel Store Method + +**Files:** +- Modify: `internal/store/messages.go` (add method) + +The spec calls for `LinkMessageLabel(messageID, labelID)`. The store has `AddMessageLabels(messageID int64, labelIDs []int64)` at line 570 which does `INSERT OR IGNORE` for a slice. Add a convenience single-label wrapper. + +- [ ] **Step 1: Add LinkMessageLabel** + +```go +// LinkMessageLabel links a single label to a message. +// Uses INSERT OR IGNORE — safe to call multiple times. +func (s *Store) LinkMessageLabel(messageID, labelID int64) error { + return s.AddMessageLabels(messageID, []int64{labelID}) +} +``` + +- [ ] **Step 2: Run fmt/vet, commit** + +```bash +go fmt ./internal/store/... +go vet ./internal/store/... 
+git add internal/store/messages.go +git commit -m "Add LinkMessageLabel convenience method" +``` + +--- + +## Phase 2: Importer Refactoring + +### Task 5: Refactor iMessage Importer + +**Files:** +- Rewrite: `internal/imessage/client.go` — drop gmail.API, use store methods +- Modify: `internal/imessage/parser.go` — use shared NormalizePhone +- Modify: `internal/imessage/models.go` — update types if needed +- Rewrite: `cmd/msgvault/cmd/sync_imessage.go` → `cmd/msgvault/cmd/import_imessage.go` +- Update: `internal/imessage/parser_test.go` + +This is the largest refactoring task. The key changes: + +1. `Client` no longer implements `gmail.API` +2. `Client` takes a `*store.Store` and writes directly +3. New `Import(ctx, store, opts)` method replaces the sync pipeline +4. `normalizeIdentifier` uses shared `textimport.NormalizePhone` with fallback to email path +5. No more synthetic MIME — body goes to `message_bodies`, raw to `message_raw` + +- [ ] **Step 1: Update parser.go to use shared NormalizePhone** + +Replace the `normalizeIdentifier` function in `internal/imessage/parser.go` to use `textimport.NormalizePhone`: + +```go +import "github.com/wesm/msgvault/internal/textimport" + +// resolveHandle categorizes an iMessage handle as phone or email. +// Returns (phone, email, displayName). Exactly one of phone/email +// will be non-empty. +func resolveHandle(handleID string) (phone, email, displayName string) { + if handleID == "" { + return "", "", "" + } + // Try phone normalization first + normalized, err := textimport.NormalizePhone(handleID) + if err == nil { + return normalized, "", normalized + } + // Fall back to email + if strings.Contains(handleID, "@") { + return "", strings.ToLower(handleID), "" + } + // Neither — raw handle + return "", "", handleID +} +``` + +Remove the old `normalizeIdentifier`, `normalizePhone`, `buildMIME`, `formatMIMEAddress` functions — they're no longer needed. 
+ +- [ ] **Step 2: Rewrite Client to use store methods directly** + +Replace the `gmail.API` implementation in `internal/imessage/client.go`. The new `Client` struct holds a `*sql.DB` (read-only handle to chat.db) and exposes an `Import` method: + +```go +type Client struct { + db *sql.DB + myHandle string // owner's phone or email + afterDate *time.Time + beforeDate *time.Time + limit int + useNanoseconds bool + logger *slog.Logger +} + +// Import reads iMessage history from chat.db and writes to the +// msgvault store. Returns a summary of what was imported. +func (c *Client) Import(ctx context.Context, s *store.Store, opts ImportOptions) (*ImportSummary, error) { + // 1. GetOrCreateSource with source_type="apple_messages" + // 2. Ensure labels ("iMessage", "SMS") + // 3. Query chat.db for conversations (chats) + // 4. For each chat: + // a. EnsureConversationWithType (group vs direct) + // b. Resolve participants via resolveHandle → EnsureParticipantByPhone or EnsureParticipant + // c. EnsureConversationParticipant for each + // d. Query messages for this chat + // e. For each message: UpsertMessage with message_type, sender_id + // f. UpsertMessageBody with body text + // g. LinkMessageLabel + // 5. RecomputeConversationStats + // 6. Return summary +} +``` + +Remove the `gmail.API` interface assertion and all gmail.API methods (`GetProfile`, `ListLabels`, `ListMessages`, `GetMessageRaw`, `GetMessagesRawBatch`, `ListHistory`, `TrashMessage`, `DeleteMessage`, `BatchDeleteMessages`). + +Keep the `chat.db` reading logic (SQL queries, timestamp handling, `detectTimestampFormat`). The SQL queries that read from chat.db stay the same — only the output path changes. + +- [ ] **Step 3: Rewrite CLI command** + +Move `cmd/msgvault/cmd/sync_imessage.go` → `cmd/msgvault/cmd/import_imessage.go`. Replace the `sync-imessage` cobra command with `import-imessage`. Remove all `sync.Syncer` usage — call `client.Import(ctx, store, opts)` directly. 
+ +Register the new command in `root.go`. + +- [ ] **Step 4: Update tests** + +Update `internal/imessage/parser_test.go`: +- Replace tests for `normalizeIdentifier` with tests for `resolveHandle` +- Remove tests for `buildMIME` / `formatMIMEAddress` +- Add tests for phone/email/raw-handle resolution + +- [ ] **Step 5: Run all tests** + +```bash +go test ./internal/imessage/ -v +go test ./internal/store/ -v +go vet ./... +``` + +- [ ] **Step 6: Commit** + +```bash +git add internal/imessage/ cmd/msgvault/cmd/ +git commit -m "Refactor iMessage to use store methods directly + +Drop gmail.API adapter and synthetic MIME. iMessage now writes to +the store using EnsureParticipantByPhone, EnsureConversationWithType, +and proper message_type/sender_id/conversation_type fields." +``` + +### Task 6: Refactor Google Voice Importer + +**Files:** +- Rewrite: `internal/gvoice/client.go` — drop gmail.API, use store methods +- Modify: `internal/gvoice/parser.go` — use shared NormalizePhone +- Modify: `internal/gvoice/models.go` — add message_type mapping +- Rewrite: `cmd/msgvault/cmd/sync_gvoice.go` → `cmd/msgvault/cmd/import_gvoice.go` +- Update: `internal/gvoice/parser_test.go` + +Same pattern as Task 5. Key differences: + +1. GVoice reads from a Takeout directory (HTML files), not a database +2. Three message_type values: `google_voice_text`, `google_voice_call`, `google_voice_voicemail` +3. All participants are phone-based (no email fallback needed) +4. `normalizeIdentifier` in parser.go replaced with `textimport.NormalizePhone` + +- [ ] **Step 1: Update parser.go to use shared NormalizePhone** + +Replace `normalizeIdentifier` and `normalizePhone` in `internal/gvoice/parser.go` with calls to `textimport.NormalizePhone`. Remove `buildMIME` and `formatMIMEAddress`. + +- [ ] **Step 2: Add message_type mapping to models.go** + +```go +// MessageTypeForFileType returns the messages.message_type value +// for a Google Voice file type. 
+func MessageTypeForFileType(ft fileType) string { + switch ft { + case fileTypeText, fileTypeGroup: + return "google_voice_text" + case fileTypeReceived, fileTypePlaced, fileTypeMissed: + return "google_voice_call" + case fileTypeVoicemail: + return "google_voice_voicemail" + default: + return "google_voice_text" + } +} +``` + +- [ ] **Step 3: Rewrite Client to use store methods directly** + +Same approach as iMessage — new `Import` method, remove all gmail.API methods and interface assertion. The HTML parsing stays the same; the output path changes to store methods. + +For each indexed entry: +1. Resolve phone via `textimport.NormalizePhone` +2. `EnsureParticipantByPhone(phone, name, "google_voice")` +3. `EnsureConversationWithType` with thread ID +4. `UpsertMessage` with `message_type = MessageTypeForFileType(entry.FileType)` +5. `UpsertMessageBody` with body text +6. `UpsertMessageRawWithFormat` with raw HTML as `gvoice_html` +7. `EnsureLabel` + `LinkMessageLabel` for each label +8. After all entries: `RecomputeConversationStats` + +- [ ] **Step 4: Rewrite CLI command** + +Move `sync_gvoice.go` → `import_gvoice.go`. Replace `sync-gvoice` with `import-gvoice`. Remove `sync.Syncer` usage. + +- [ ] **Step 5: Update tests, run all** + +Update parser_test.go to test `textimport.NormalizePhone` integration. Remove MIME-related test assertions. + +```bash +go test ./internal/gvoice/ -v +go vet ./... +``` + +- [ ] **Step 6: Commit** + +```bash +git add internal/gvoice/ cmd/msgvault/cmd/ +git commit -m "Refactor Google Voice to use store methods directly + +Drop gmail.API adapter and synthetic MIME. Google Voice now writes +to the store with proper message_type (google_voice_text/call/ +voicemail), phone-based participants, and labels." 
+``` + +### Task 7: WhatsApp Cleanup + +**Files:** +- Modify: `internal/whatsapp/importer.go` — use shared NormalizePhone +- Modify: `internal/whatsapp/contacts.go` — use shared NormalizePhone + +- [ ] **Step 1: Replace internal normalizePhone with shared utility** + +Find all calls to the internal `normalizePhone` in the whatsapp package and replace with `textimport.NormalizePhone`. The internal function is in `internal/whatsapp/mapping.go` or `contacts.go`. Since the shared version returns an error, callers need to handle it (skip participants that don't normalize). + +- [ ] **Step 2: Update EnsureParticipantByPhone calls** + +All calls in the whatsapp package already pass `"whatsapp"` after Task 2. Verify. + +- [ ] **Step 3: Run tests, commit** + +```bash +go test ./internal/whatsapp/ -v +go fmt ./... +git add internal/whatsapp/ +git commit -m "WhatsApp: use shared NormalizePhone and RecomputeConversationStats" +``` + +### Task 8: Rename CLI Commands and Register + +**Files:** +- Rename: `cmd/msgvault/cmd/import.go` → verify naming +- Modify: `cmd/msgvault/cmd/root.go` — register new commands, remove old + +- [ ] **Step 1: Ensure all three import commands are registered** + +The WhatsApp import command is currently `import --type whatsapp` (in `cmd/msgvault/cmd/import.go`). Rename to `import-whatsapp`. The iMessage and GVoice commands were already renamed in Tasks 5-6. + +Update `root.go` to register: `importWhatsappCmd`, `importImessageCmd`, `importGvoiceCmd`. Remove any old `syncImessageCmd`, `syncGvoiceCmd` references. 
+ +- [ ] **Step 2: Verify all commands work** + +```bash +go build -tags fts5 -o msgvault ./cmd/msgvault +./msgvault import-whatsapp --help +./msgvault import-imessage --help +./msgvault import-gvoice --help +``` + +- [ ] **Step 3: Commit** + +```bash +git add cmd/msgvault/ +git commit -m "Rename import CLI commands for consistency + +import-whatsapp, import-imessage, import-gvoice" +``` + +--- + +## Phase 3: Parquet Cache & TextEngine + +### Task 9: Extend Parquet Cache for Text Messages + +**Files:** +- Modify: `cmd/msgvault/cmd/build_cache.go` — add columns to export queries +- Modify: `internal/query/duckdb.go` — probe new columns + +The existing `build_cache.go` exports `messages`, `participants`, `conversations`, etc. to Parquet. We need to ensure the export includes the columns required for Texts mode queries. + +- [ ] **Step 1: Add conversation_type to conversations export** + +In `build_cache.go`, find the conversations export query (around line 460) and add `conversation_type` to the SELECT. The schema already has this column. + +- [ ] **Step 2: Add message_type and sender_id to messages export** + +The messages export (around line 300) already includes `message_type` and `sender_id` (added by the WhatsApp PR). Verify they're present. If not, add them. + +- [ ] **Step 3: Bump cache schema version** + +Change `cacheSchemaVersion` from 4 to 5. This forces a full rebuild when users upgrade, ensuring new columns are present. + +- [ ] **Step 4: Update DuckDB column probing** + +In `internal/query/duckdb.go`, the `probeParquetColumns` method checks for optional columns. Ensure `conversation_type` is probed for the conversations table. + +- [ ] **Step 5: Add email-only filter to existing Engine queries** + +In `DuckDBEngine.Aggregate`, `DuckDBEngine.ListMessages`, etc., add a `(message_type = 'email' OR message_type IS NULL)` filter condition so email-mode queries exclude text messages. Keep the parentheses: `AND` binds tighter than `OR`, so when this condition is combined with other filter conditions using `AND`, an unparenthesized `OR` would let some rows bypass those other filters. The `IS NULL` handles old data without the column.
+ +This is a targeted change in `buildFilterConditions` (line 803) — add it as a default condition when no explicit message_type filter is set. + +- [ ] **Step 6: Run tests, commit** + +```bash +go test -tags fts5 ./internal/query/ -v +go test -tags fts5 ./cmd/msgvault/cmd/ -v +git add cmd/msgvault/cmd/build_cache.go internal/query/duckdb.go +git commit -m "Extend Parquet cache with text message columns + +Add conversation_type to exports, bump cache schema to v5, +filter email queries to exclude text messages." +``` + +### Task 10: TextEngine Interface and Types + +**Files:** +- Create: `internal/query/text_engine.go` +- Create: `internal/query/text_models.go` + +- [ ] **Step 1: Define TextEngine types** + +```go +// internal/query/text_models.go +package query + +import "time" + +// TextViewType represents the type of view in Texts mode. +type TextViewType int + +const ( + TextViewConversations TextViewType = iota + TextViewContacts + TextViewContactNames + TextViewSources + TextViewLabels + TextViewTime + TextViewTypeCount +) + +func (v TextViewType) String() string { + switch v { + case TextViewConversations: + return "Conversations" + case TextViewContacts: + return "Contacts" + case TextViewContactNames: + return "Contact Names" + case TextViewSources: + return "Sources" + case TextViewLabels: + return "Labels" + case TextViewTime: + return "Time" + default: + return "Unknown" + } +} + +// ConversationRow represents a conversation in the Conversations view. +type ConversationRow struct { + ConversationID int64 + Title string + SourceType string + MessageCount int64 + ParticipantCount int64 + LastMessageAt time.Time + LastPreview string +} + +// TextFilter specifies which text messages to retrieve. 
+type TextFilter struct { + SourceID *int64 + ConversationID *int64 + ContactPhone string + ContactName string + SourceType string + Label string + TimeRange TimeRange + After *time.Time + Before *time.Time + Pagination Pagination + SortField SortField + SortDirection SortDirection +} + +// TextAggregateOptions configures a text aggregate query. +type TextAggregateOptions struct { + SourceID *int64 + After *time.Time + Before *time.Time + SortField SortField + SortDirection SortDirection + Limit int + TimeGranularity TimeGranularity + SearchQuery string +} + +// TextStatsOptions configures a text stats query. +type TextStatsOptions struct { + SourceID *int64 + SearchQuery string +} + +// TextMessageTypes lists the message_type values included in Texts mode. +var TextMessageTypes = []string{ + "whatsapp", "imessage", "sms", "google_voice_text", +} + +// IsTextMessageType returns true if the given type is a text message type. +func IsTextMessageType(mt string) bool { + for _, t := range TextMessageTypes { + if t == mt { + return true + } + } + return false +} +``` + +- [ ] **Step 2: Define TextEngine interface** + +```go +// internal/query/text_engine.go +package query + +import "context" + +// TextEngine provides query operations for text message data. +// This is a separate interface from Engine to avoid rippling text +// query methods through remote/API/MCP/mock layers. +// DuckDBEngine and SQLiteEngine implement both Engine and TextEngine. +type TextEngine interface { + // ListConversations returns conversations matching the filter. + ListConversations(ctx context.Context, + filter TextFilter) ([]ConversationRow, error) + + // TextAggregate aggregates text messages by the given view type. + TextAggregate(ctx context.Context, viewType TextViewType, + opts TextAggregateOptions) ([]AggregateRow, error) + + // ListConversationMessages returns messages within a conversation. 
+ ListConversationMessages(ctx context.Context, convID int64, + filter TextFilter) ([]MessageSummary, error) + + // TextSearch performs plain full-text search over text messages. + TextSearch(ctx context.Context, query string, + limit, offset int) ([]MessageSummary, error) + + // GetTextStats returns aggregate stats for text messages. + GetTextStats(ctx context.Context, + opts TextStatsOptions) (*TotalStats, error) +} +``` + +- [ ] **Step 3: Run fmt/vet, commit** + +```bash +go fmt ./internal/query/... +go vet ./internal/query/... +git add internal/query/text_engine.go internal/query/text_models.go +git commit -m "Add TextEngine interface and text query types" +``` + +### Task 11: DuckDB TextEngine Implementation + +**Files:** +- Create: `internal/query/duckdb_text.go` +- Create: `internal/query/duckdb_text_test.go` + +Implement `TextEngine` methods on `DuckDBEngine`. These query the same Parquet files as email queries but filter to text message types and use different grouping columns. + +- [ ] **Step 1: Implement ListConversations** + +```go +// internal/query/duckdb_text.go +package query + +// ... imports ... + +// textTypeFilter returns a SQL IN clause for text message types. +func textTypeFilter() string { + return "message_type IN ('whatsapp','imessage','sms','google_voice_text')" +} + +func (e *DuckDBEngine) ListConversations(ctx context.Context, + filter TextFilter) ([]ConversationRow, error) { + // Query conversations table joined with message stats + // from the Parquet messages, filtered to text message types. + // Uses denormalized stats from conversations table (via SQLite + // scanner or conversations Parquet). + // Sort by last_message_at DESC by default. + // Apply filter: SourceID, After/Before, Pagination. + // ... +} +``` + +The implementation queries the `conversations` Parquet table joined with `sources` to get `source_type`, filtered to text source types (`'whatsapp'`, `'apple_messages'`, `'google_voice'`). 
+ +- [ ] **Step 2: Implement TextAggregate** + +Aggregation by view type: +- `TextViewContacts`: GROUP BY `phone_number`, `display_name` +- `TextViewContactNames`: GROUP BY `display_name` +- `TextViewSources`: GROUP BY `source_type` +- `TextViewLabels`: GROUP BY label name (JOIN message_labels + labels) +- `TextViewTime`: GROUP BY time period + +All queries include `WHERE textTypeFilter()`. + +- [ ] **Step 3: Implement ListConversationMessages** + +Query messages from Parquet where `conversation_id = convID` and `textTypeFilter()`, ordered by `sent_at ASC` (chronological for chat timeline). + +- [ ] **Step 4: Implement TextSearch** + +Plain FTS query against `messages_fts` via the SQLite scanner, filtered to text message types. No Gmail-style operator parsing — pass the query string directly to FTS5 MATCH. + +- [ ] **Step 5: Implement GetTextStats** + +Aggregate stats (message count, total size, etc.) filtered to text message types. + +- [ ] **Step 6: Add interface assertion** + +```go +var _ TextEngine = (*DuckDBEngine)(nil) +``` + +- [ ] **Step 7: Write tests** + +Create `internal/query/duckdb_text_test.go` with test fixtures that include text message data. Test `ListConversations`, `TextAggregate` for each view type, `ListConversationMessages`, and `GetTextStats`. + +Use the existing test fixture pattern from `internal/query/testfixtures_test.go` — extend it to include text message data with proper `message_type`, `sender_id`, and `conversation_type` values. + +- [ ] **Step 8: Run tests, commit** + +```bash +go test -tags fts5 ./internal/query/ -run TestText -v +git add internal/query/duckdb_text.go internal/query/duckdb_text_test.go +git commit -m "Implement TextEngine on DuckDBEngine + +ListConversations, TextAggregate, ListConversationMessages, +TextSearch, GetTextStats — all querying Parquet with text +message type filters." 
+``` + +### Task 12: SQLite TextEngine Fallback + +**Files:** +- Create: `internal/query/sqlite_text.go` + +Implement `TextEngine` on `SQLiteEngine` as a fallback for when Parquet cache is not built. Same logic as DuckDB but querying SQLite directly. + +- [ ] **Step 1: Implement all five TextEngine methods** + +Same patterns as DuckDB but using SQLite SQL. Key difference: joins go to real tables instead of Parquet files. + +- [ ] **Step 2: Add interface assertion** + +```go +var _ TextEngine = (*SQLiteEngine)(nil) +``` + +- [ ] **Step 3: Run tests, commit** + +```bash +go test -tags fts5 ./internal/query/ -v +git add internal/query/sqlite_text.go +git commit -m "Implement TextEngine on SQLiteEngine as fallback" +``` + +### Task 13: Update FTS Backfill for Text Messages + +**Files:** +- Modify: `internal/store/messages.go` (FTS backfill query) + +The current FTS backfill populates `from_addr` from `message_recipients` where `recipient_type = 'from'`. Text messages use `sender_id` instead. Update the backfill to handle both paths. + +- [ ] **Step 1: Find the FTS backfill query** + +In `internal/store/messages.go`, find the `BackfillFTS` or similar method that populates `messages_fts`. Look for the INSERT INTO `messages_fts` query. + +- [ ] **Step 2: Update the from_addr population** + +Change the `from_addr` subquery to use COALESCE: + +```sql +COALESCE( + (SELECT COALESCE(p.phone_number, p.email_address) + FROM participants p WHERE p.id = m.sender_id), + (SELECT p.email_address FROM message_recipients mr + JOIN participants p ON p.id = mr.participant_id + WHERE mr.message_id = m.id AND mr.recipient_type = 'from' + LIMIT 1) +) as from_addr +``` + +This checks `sender_id` first (for text messages), falls back to `message_recipients` (for email). 
+ +- [ ] **Step 3: Run FTS tests, commit** + +```bash +go test -tags fts5 ./internal/store/ -v +git add internal/store/messages.go +git commit -m "Update FTS backfill to handle phone-based text senders" +``` + +--- + +## Phase 4: TUI Texts Mode + +### Task 14: TUI Model State for Texts Mode + +**Files:** +- Modify: `internal/tui/model.go` — add text mode state +- Create: `internal/tui/text_state.go` — text-specific state types + +- [ ] **Step 1: Add text mode types and state** + +```go +// internal/tui/text_state.go +package tui + +import "github.com/wesm/msgvault/internal/query" + +// tuiMode distinguishes Email mode from Texts mode. +type tuiMode int + +const ( + modeEmail tuiMode = iota + modeTexts +) + +// textViewLevel tracks navigation depth in Texts mode. +type textViewLevel int + +const ( + textLevelConversations textViewLevel = iota + textLevelAggregate + textLevelDrillConversations // conversations filtered by aggregate key + textLevelTimeline // messages within a conversation +) + +// textState holds all state specific to Texts mode. 
+type textState struct { + viewType query.TextViewType + level textViewLevel + conversations []query.ConversationRow + aggregateRows []query.AggregateRow + messages []query.MessageSummary + cursor int + scrollOffset int + selectedConvID int64 + + // Filter state + filter query.TextFilter + + // Stats + stats *query.TotalStats + + // Breadcrumbs for back navigation + breadcrumbs []textNavSnapshot +} + +type textNavSnapshot struct { + level textViewLevel + viewType query.TextViewType + cursor int + scrollOffset int + filter query.TextFilter + selectedConvID int64 +} +``` + +- [ ] **Step 2: Add mode and textState to Model** + +In `internal/tui/model.go`, add to the `Model` struct: + +```go +mode tuiMode +textEngine query.TextEngine // nil if not available +textState textState +``` + +In the `New` constructor, check if the engine implements `TextEngine`: + +```go +if te, ok := engine.(query.TextEngine); ok { + m.textEngine = te +} +``` + +- [ ] **Step 3: Commit** + +```bash +git add internal/tui/text_state.go internal/tui/model.go +git commit -m "Add Texts mode state types to TUI model" +``` + +### Task 15: Mode Switching (m key) + +**Files:** +- Modify: `internal/tui/keys.go` — add `m` key handler +- Modify: `internal/tui/model.go` — route Update based on mode + +- [ ] **Step 1: Add m key to handleGlobalKeys** + +In `internal/tui/keys.go`, in `handleGlobalKeys` (around line 86), add: + +```go +case "m": + if m.textEngine == nil { + return m, nil, true // no text engine, ignore + } + if m.mode == modeEmail { + m.mode = modeTexts + // Load text conversations + return m, m.loadTextConversations(), true + } + m.mode = modeEmail + return m, m.loadData(), true +``` + +- [ ] **Step 2: Route key handling by mode in Update** + +In `model.go`'s `Update` method, after global key handling, branch on `m.mode`: + +```go +if m.mode == modeTexts { + return m.handleTextKeyPress(msg) +} +// ... 
existing email key handling +``` + +- [ ] **Step 3: Commit** + +```bash +git add internal/tui/keys.go internal/tui/model.go +git commit -m "Add mode switching between Email and Texts (m key)" +``` + +### Task 16: Texts Mode Key Handling + +**Files:** +- Create: `internal/tui/text_keys.go` + +- [ ] **Step 1: Implement text mode key dispatch** + +```go +// internal/tui/text_keys.go +package tui + +import ( + tea "github.com/charmbracelet/bubbletea" + "github.com/wesm/msgvault/internal/query" +) + +func (m Model) handleTextKeyPress(msg tea.KeyMsg) (tea.Model, tea.Cmd) { + key := msg.String() + + // Disabled keys in Texts mode + switch key { + case " ", "S", "d", "D", "x": + return m, nil // read-only mode + } + + switch m.textState.level { + case textLevelConversations, textLevelAggregate, + textLevelDrillConversations: + return m.handleTextListKeys(msg) + case textLevelTimeline: + return m.handleTextTimelineKeys(msg) + } + return m, nil +} + +func (m Model) handleTextListKeys(msg tea.KeyMsg) (tea.Model, tea.Cmd) { + key := msg.String() + switch key { + case "tab", "Tab": + m.cycleTextViewType(true) + return m, m.loadTextData() + case "shift+tab": + m.cycleTextViewType(false) + return m, m.loadTextData() + case "enter": + return m.textDrillDown() + case "esc", "backspace": + return m.textGoBack() + case "j", "down": + m.textState.cursor++ + m.clampTextCursor() + return m, nil + case "k", "up": + m.textState.cursor-- + m.clampTextCursor() + return m, nil + case "s": + m.cycleTextSortField() + return m, m.loadTextData() + case "r": + m.toggleTextSortDirection() + return m, m.loadTextData() + case "t": + m.textState.viewType = query.TextViewTime + m.textState.level = textLevelAggregate + return m, m.loadTextData() + case "a": + // Reset to conversations + m.textState = textState{viewType: query.TextViewConversations} + return m, m.loadTextConversations() + case "A": + m.openAccountSelector() + return m, nil + } + return m, nil +} +``` + +- [ ] **Step 2: Implement helper methods** + +Add `cycleTextViewType`,
`clampTextCursor`, `textDrillDown`, `textGoBack`, `loadTextData`, `loadTextConversations` methods. These follow the same patterns as the email equivalents but operate on `textState`. + +- [ ] **Step 3: Commit** + +```bash +git add internal/tui/text_keys.go +git commit -m "Add Texts mode key handling" +``` + +### Task 17: Texts Mode Views + +**Files:** +- Create: `internal/tui/text_view.go` + +- [ ] **Step 1: Implement text conversations view** + +```go +// internal/tui/text_view.go +package tui + +// textConversationsView renders the Conversations list. +func (m Model) textConversationsView() string { + // Header: Name | Source | Messages | Participants | Last Message + // Rows from m.textState.conversations + // Same styling patterns as aggregateTableView +} +``` + +- [ ] **Step 2: Implement text aggregate view** + +```go +// textAggregateView renders aggregate views (Contacts, Sources, etc.) +func (m Model) textAggregateView() string { + // Same shape as email aggregate view + // Rows from m.textState.aggregateRows +} +``` + +- [ ] **Step 3: Implement text timeline view** + +```go +// textTimelineView renders a conversation's message timeline. +func (m Model) textTimelineView() string { + // Compact chat style: timestamp | sender | body snippet + // Rows from m.textState.messages + // Chronological order (oldest first) +} +``` + +- [ ] **Step 4: Wire into renderView** + +In the main `renderView()` switch (internal/tui/view.go), add a mode check: + +```go +if m.mode == modeTexts { + return m.renderTextView() +} +``` + +Implement `renderTextView()` in text_view.go, switching on `m.textState.level`. + +- [ ] **Step 5: Update footer for Texts mode** + +In `footerView()`, add a Texts mode branch that shows the correct keybindings for the current text view level. + +- [ ] **Step 6: Add mode indicator to header** + +In `buildTitleBar()`, show "Email" or "Texts" mode indicator. Show "m: switch mode" in the title bar. 
+ +- [ ] **Step 7: Commit** + +```bash +git add internal/tui/text_view.go internal/tui/view.go +git commit -m "Add Texts mode views: conversations, aggregates, timeline" +``` + +### Task 18: Texts Mode Search + +**Files:** +- Modify: `internal/tui/text_keys.go` — add `/` handler +- Create: `internal/tui/text_search.go` — text search state management + +- [ ] **Step 1: Add search handling** + +In `handleTextListKeys`, the `/` key enters search mode. In Texts mode, search uses plain FTS (no Gmail operators): + +```go +case "/": + m.searchMode = true + m.searchInput = "" + return m, nil +``` + +When search is submitted, call `m.textEngine.TextSearch(ctx, query, limit, 0)` instead of the email search path. + +- [ ] **Step 2: Display search results** + +Search results in Texts mode show as a message list (same as timeline view). Pressing Esc exits search. + +- [ ] **Step 3: Commit** + +```bash +git add internal/tui/text_keys.go internal/tui/text_search.go +git commit -m "Add plain full-text search in Texts mode" +``` + +### Task 19: Data Loading Commands for Texts Mode + +**Files:** +- Create: `internal/tui/text_commands.go` + +- [ ] **Step 1: Implement async data loading commands** + +Following the Bubble Tea pattern, each data load returns a `tea.Cmd` that runs asynchronously and sends a message when done: + +```go +// internal/tui/text_commands.go +package tui + +import ( + "context" + tea "github.com/charmbracelet/bubbletea" + "github.com/wesm/msgvault/internal/query" +) + +// Message types for async text data loading +type textConversationsLoadedMsg struct { + conversations []query.ConversationRow + err error +} + +type textAggregateLoadedMsg struct { + rows []query.AggregateRow + err error +} + +type textMessagesLoadedMsg struct { + messages []query.MessageSummary + err error +} + +type textStatsLoadedMsg struct { + stats *query.TotalStats + err error +} + +func (m Model) loadTextConversations() tea.Cmd { + return func() tea.Msg { + convs, err := 
m.textEngine.ListConversations( + context.Background(), m.textState.filter) + return textConversationsLoadedMsg{convs, err} + } +} + +func (m Model) loadTextAggregate() tea.Cmd { + return func() tea.Msg { + rows, err := m.textEngine.TextAggregate( + context.Background(), + m.textState.viewType, + query.TextAggregateOptions{ + SourceID: m.textState.filter.SourceID, + After: m.textState.filter.After, + Before: m.textState.filter.Before, + SortField: m.textState.filter.SortField, + SortDirection: m.textState.filter.SortDirection, + Limit: m.aggregateLimit, + }) + return textAggregateLoadedMsg{rows, err} + } +} + +func (m Model) loadTextMessages() tea.Cmd { + return func() tea.Msg { + msgs, err := m.textEngine.ListConversationMessages( + context.Background(), + m.textState.selectedConvID, + m.textState.filter) + return textMessagesLoadedMsg{msgs, err} + } +} + +func (m Model) loadTextData() tea.Cmd { + switch m.textState.viewType { + case query.TextViewConversations: + return m.loadTextConversations() + default: + return m.loadTextAggregate() + } +} +``` + +- [ ] **Step 2: Handle loaded messages in Update** + +In `model.go`'s `Update` method, add cases for the new message types: + +```go +case textConversationsLoadedMsg: + m.textState.conversations = msg.conversations + m.loading = false + // ... +case textAggregateLoadedMsg: + m.textState.aggregateRows = msg.rows + m.loading = false + // ... +case textMessagesLoadedMsg: + m.textState.messages = msg.messages + m.loading = false + // ... 
+``` + +- [ ] **Step 3: Commit** + +```bash +git add internal/tui/text_commands.go internal/tui/model.go +git commit -m "Add async data loading for Texts mode" +``` + +--- + +## Phase 5: Integration & Polish + +### Task 20: Wire TUI Init to Load Text Engine + +**Files:** +- Modify: `cmd/msgvault/cmd/tui.go` + +- [ ] **Step 1: Pass TextEngine to TUI** + +In `tui.go`, after creating the query engine, check if it implements `TextEngine` and pass it through `tui.Options`: + +```go +type Options struct { + DataDir string + Version string + IsRemote bool + TextEngine query.TextEngine // nil if not available +} +``` + +In the TUI command's `RunE`, after engine creation: + +```go +var textEngine query.TextEngine +if te, ok := engine.(query.TextEngine); ok { + textEngine = te +} +opts := tui.Options{ + DataDir: dataDir, + Version: version, + IsRemote: isRemote, + TextEngine: textEngine, +} +``` + +- [ ] **Step 2: Commit** + +```bash +git add cmd/msgvault/cmd/tui.go internal/tui/model.go +git commit -m "Wire TextEngine into TUI initialization" +``` + +### Task 21: End-to-End Integration Test + +**Files:** +- Create: `internal/textimport/integration_test.go` + +- [ ] **Step 1: Write integration test** + +Create a test that: +1. Creates an in-memory store +2. Simulates importing messages from two different sources using store methods directly (no actual chat.db/Takeout needed) +3. Verifies participant deduplication by phone number +4. Verifies conversation stats after RecomputeConversationStats +5. Verifies labels are linked +6. Creates a SQLiteEngine and verifies TextEngine methods return correct results + +This test exercises the full pipeline without needing real source data. 
+ +- [ ] **Step 2: Run tests, commit** + +```bash +go test -tags fts5 ./internal/textimport/ -run TestIntegration -v +git add internal/textimport/integration_test.go +git commit -m "Add end-to-end integration test for text message import" +``` + +### Task 22: Build and Smoke Test + +- [ ] **Step 1: Build** + +```bash +make build +``` + +- [ ] **Step 2: Run full test suite** + +```bash +make test +``` + +- [ ] **Step 3: Run linter** + +```bash +make lint +``` + +- [ ] **Step 4: Fix any issues and commit** + +### Task 23: Final Commit — Remove Dead Code + +- [ ] **Step 1: Remove old sync command registrations** + +Check `root.go` for any remaining references to `syncImessageCmd`, `syncGvoiceCmd`. Remove them. + +- [ ] **Step 2: Remove unused gmail.API imports from imessage/gvoice packages** + +After refactoring, `internal/imessage/` and `internal/gvoice/` should no longer import `gmail` package. Verify and clean up. + +- [ ] **Step 3: Remove the design plan doc from WhatsApp PR** + +`docs/plans/2026-02-17-multi-source-messaging.md` was included in the WhatsApp PR as a planning doc. It's superseded by the spec. Remove it. 
+ +- [ ] **Step 4: Run full test suite and linter one final time** + +```bash +make test && make lint +``` + +- [ ] **Step 5: Commit** + +```bash +git add -A +git commit -m "Clean up dead code and superseded planning docs" +``` From 0d15dfe4ba30c3f9d1999cd09d713b1c388a338b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:02:00 -0500 Subject: [PATCH 11/65] Add shared NormalizePhone utility for text importers --- internal/textimport/phone.go | 54 +++++++++++++++++++++++++++++++ internal/textimport/phone_test.go | 50 ++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 internal/textimport/phone.go create mode 100644 internal/textimport/phone_test.go diff --git a/internal/textimport/phone.go b/internal/textimport/phone.go new file mode 100644 index 00000000..d58108c8 --- /dev/null +++ b/internal/textimport/phone.go @@ -0,0 +1,54 @@ +package textimport + +import ( + "fmt" + "strings" + "unicode" +) + +// NormalizePhone normalizes a phone number to E.164 format. +// Returns an error for inputs that are not phone numbers (emails, +// short codes, system identifiers). 
func NormalizePhone(raw string) (string, error) {
	if raw == "" {
		return "", fmt.Errorf("empty input")
	}
	// Anything containing '@' is treated as an email address or a system
	// identifier (e.g. "status@broadcast"), never as a phone number.
	if strings.Contains(raw, "@") {
		return "", fmt.Errorf("not a phone number: %q", raw)
	}

	// Collect the ASCII digits and remember whether an international '+'
	// prefix was present. Every other rune (spaces, dashes, parentheses,
	// letters, non-ASCII digits) is treated as formatting and dropped.
	var b strings.Builder
	b.Grow(len(raw))
	hasPlus := false
	for _, r := range raw {
		switch {
		case r == '+':
			// A '+' is only meaningful once, ahead of the first digit.
			// Keeping one anywhere else would yield output such as
			// "+555+1234567", which is not E.164.
			if hasPlus || b.Len() > 0 {
				return "", fmt.Errorf("misplaced '+' in phone number: %q", raw)
			}
			hasPlus = true
		case r <= unicode.MaxASCII && unicode.IsDigit(r):
			b.WriteRune(r)
		}
	}
	digits := b.String()

	if digits == "" {
		return "", fmt.Errorf("no digits in input: %q", raw)
	}
	// Short codes and similar identifiers are rejected. digits is pure
	// ASCII here, so len counts digits, not bytes of multi-byte runes.
	if len(digits) < 7 {
		return "", fmt.Errorf("too short for phone number: %q", raw)
	}

	// A bare 10-digit number is assumed to be a US number without its
	// country code. Everything else is taken to already include one.
	if !hasPlus && len(digits) == 10 {
		return "+1" + digits, nil
	}
	return "+" + digits, nil
}
+ {"status@broadcast", "", true}, + } + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + got, err := NormalizePhone(tt.input) + if tt.wantErr { + if err == nil { + t.Errorf("NormalizePhone(%q) = %q, want error", tt.input, got) + } + return + } + if err != nil { + t.Errorf("NormalizePhone(%q) error: %v", tt.input, err) + return + } + if got != tt.want { + t.Errorf("NormalizePhone(%q) = %q, want %q", tt.input, got, tt.want) + } + }) + } +} From 541e0cb2eb2a6d6af17e0cd4d634efde6d889ecc Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:04:37 -0500 Subject: [PATCH 12/65] Generalize EnsureParticipantByPhone to accept identifierType Add identifierType as a third parameter so callers can pass "imessage", "google_voice", etc. The participant_identifiers INSERT now uses the parameter instead of the hardcoded literal 'whatsapp'. Move the INSERT OR IGNORE outside the new-only branch so that calling the method for an existing participant with a new identifierType (e.g. the same phone number seen on a second platform) still records the additional identifier row. All existing WhatsApp call sites updated to pass "whatsapp" explicitly. Co-Authored-By: Claude Sonnet 4.6 --- internal/store/messages.go | 42 ++++++++++++------------ internal/store/messages_test.go | 58 +++++++++++++++++++++++++++++++++ internal/whatsapp/importer.go | 16 ++++----- 3 files changed, 87 insertions(+), 29 deletions(-) create mode 100644 internal/store/messages_test.go diff --git a/internal/store/messages.go b/internal/store/messages.go index 6b25ae7d..9a6059a4 100644 --- a/internal/store/messages.go +++ b/internal/store/messages.go @@ -906,8 +906,9 @@ func (s *Store) EnsureConversationWithType(sourceID int64, sourceConversationID, // EnsureParticipantByPhone gets or creates a participant by phone number. // Phone must start with "+" (E.164 format). Returns an error for empty or // invalid phone numbers to prevent database pollution. 
-// Also creates a participant_identifiers row with identifier_type='whatsapp'. -func (s *Store) EnsureParticipantByPhone(phone, displayName string) (int64, error) { +// Also creates a participant_identifiers row with the given identifierType +// (e.g., "whatsapp", "imessage", "google_voice"). +func (s *Store) EnsureParticipantByPhone(phone, displayName, identifierType string) (int64, error) { if phone == "" { return 0, fmt.Errorf("phone number is required") } @@ -929,31 +930,30 @@ func (s *Store) EnsureParticipantByPhone(phone, displayName string) (int64, erro WHERE id = ? AND (display_name IS NULL OR display_name = '') `, displayName, id) // best-effort display name update, ignore error } - return id, nil - } - if err != sql.ErrNoRows { + } else if err != sql.ErrNoRows { return 0, err - } - - // Create new participant - result, err := s.db.Exec(` - INSERT INTO participants (phone_number, display_name, created_at, updated_at) - VALUES (?, ?, datetime('now'), datetime('now')) - `, phone, displayName) - if err != nil { - return 0, fmt.Errorf("insert participant: %w", err) - } + } else { + // Create new participant + result, err := s.db.Exec(` + INSERT INTO participants (phone_number, display_name, created_at, updated_at) + VALUES (?, ?, datetime('now'), datetime('now')) + `, phone, displayName) + if err != nil { + return 0, fmt.Errorf("insert participant: %w", err) + } - id, err = result.LastInsertId() - if err != nil { - return 0, err + id, err = result.LastInsertId() + if err != nil { + return 0, err + } } - // Also create a participant_identifiers row + // Ensure a participant_identifiers row exists for this identifierType. + // INSERT OR IGNORE is idempotent: a second call with the same type is a no-op. 
_, err = s.db.Exec(` INSERT OR IGNORE INTO participant_identifiers (participant_id, identifier_type, identifier_value, is_primary) - VALUES (?, 'whatsapp', ?, TRUE) - `, id, phone) + VALUES (?, ?, ?, TRUE) + `, id, identifierType, phone) if err != nil { return 0, fmt.Errorf("insert participant identifier: %w", err) } diff --git a/internal/store/messages_test.go b/internal/store/messages_test.go new file mode 100644 index 00000000..1ac43c4c --- /dev/null +++ b/internal/store/messages_test.go @@ -0,0 +1,58 @@ +package store_test + +import ( + "testing" + + "github.com/wesm/msgvault/internal/testutil" +) + +func TestEnsureParticipantByPhone_IdentifierType(t *testing.T) { + st := testutil.NewTestStore(t) + + // Create participant via WhatsApp + id1, err := st.EnsureParticipantByPhone("+15551234567", "Alice", "whatsapp") + if err != nil { + t.Fatalf("EnsureParticipantByPhone(whatsapp): %v", err) + } + if id1 == 0 { + t.Fatal("expected non-zero participant ID") + } + + // Same phone via iMessage — should return the same participant ID + id2, err := st.EnsureParticipantByPhone("+15551234567", "Alice", "imessage") + if err != nil { + t.Fatalf("EnsureParticipantByPhone(imessage): %v", err) + } + if id2 != id1 { + t.Errorf("imessage call returned participant ID %d, want %d (same as whatsapp)", id2, id1) + } + + // Both participant_identifiers rows should exist + var count int + err = st.DB().QueryRow( + `SELECT COUNT(*) FROM participant_identifiers WHERE participant_id = ?`, + id1, + ).Scan(&count) + if err != nil { + t.Fatalf("count participant_identifiers: %v", err) + } + if count != 2 { + t.Errorf("participant_identifiers count = %d, want 2", count) + } + + // Verify each identifier type is present + for _, identType := range []string{"whatsapp", "imessage"} { + var exists int + err = st.DB().QueryRow( + `SELECT COUNT(*) FROM participant_identifiers + WHERE participant_id = ? 
AND identifier_type = ?`, + id1, identType, + ).Scan(&exists) + if err != nil { + t.Fatalf("check identifier_type %q: %v", identType, err) + } + if exists != 1 { + t.Errorf("identifier_type %q count = %d, want 1", identType, exists) + } + } +} diff --git a/internal/whatsapp/importer.go b/internal/whatsapp/importer.go index 9fa005a8..7e43f3a6 100644 --- a/internal/whatsapp/importer.go +++ b/internal/whatsapp/importer.go @@ -87,7 +87,7 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt imp.progress.OnStart() // Create participant for the phone owner (self). - selfParticipantID, err := imp.store.EnsureParticipantByPhone(opts.Phone, opts.DisplayName) + selfParticipantID, err := imp.store.EnsureParticipantByPhone(opts.Phone, opts.DisplayName, "whatsapp") if err != nil { syncErr = err return nil, fmt.Errorf("ensure self participant: %w", err) @@ -152,7 +152,7 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt phone := normalizePhone(chat.User, chat.Server) if phone == "" { // Non-phone JID (e.g., lid:..., broadcast) — skip. - } else if participantID, err := imp.store.EnsureParticipantByPhone(phone, ""); err != nil { + } else if participantID, err := imp.store.EnsureParticipantByPhone(phone, "", "whatsapp"); err != nil { summary.Errors++ imp.progress.OnError(fmt.Errorf("ensure participant %s: %w", phone, err)) } else { @@ -174,7 +174,7 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt if phone == "" { continue // Non-phone JID — skip. } - participantID, err := imp.store.EnsureParticipantByPhone(phone, "") + participantID, err := imp.store.EnsureParticipantByPhone(phone, "", "whatsapp") if err != nil { summary.Errors++ continue @@ -279,7 +279,7 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt // validation despite not being real phone numbers. 
phone := resolveLidSender(waMsg.SenderJIDRowID, waMsg.SenderServer.String, lidMap) if phone != "" { - pid, err := imp.store.EnsureParticipantByPhone(phone, "") + pid, err := imp.store.EnsureParticipantByPhone(phone, "", "whatsapp") if err != nil { summary.Errors++ imp.progress.OnError(fmt.Errorf("ensure sender participant %s: %w", phone, err)) @@ -290,7 +290,7 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt } else if waMsg.SenderUser.Valid && waMsg.SenderUser.String != "" { phone := normalizePhone(waMsg.SenderUser.String, waMsg.SenderServer.String) if phone != "" { - pid, err := imp.store.EnsureParticipantByPhone(phone, "") + pid, err := imp.store.EnsureParticipantByPhone(phone, "", "whatsapp") if err != nil { summary.Errors++ imp.progress.OnError(fmt.Errorf("ensure sender participant %s: %w", phone, err)) @@ -302,7 +302,7 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt // In a direct chat, the other person is the sender. phone := normalizePhone(chat.User, chat.Server) if phone != "" { - pid, err := imp.store.EnsureParticipantByPhone(phone, "") + pid, err := imp.store.EnsureParticipantByPhone(phone, "", "whatsapp") if err == nil { senderID = sql.NullInt64{Int64: pid, Valid: true} } @@ -421,7 +421,7 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt if phone == "" { continue } - pid, err := imp.store.EnsureParticipantByPhone(phone, "") + pid, err := imp.store.EnsureParticipantByPhone(phone, "", "whatsapp") if err != nil { summary.Errors++ continue @@ -432,7 +432,7 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt if phone == "" { continue // Non-phone JID — skip reaction. 
} - pid, err := imp.store.EnsureParticipantByPhone(phone, "") + pid, err := imp.store.EnsureParticipantByPhone(phone, "", "whatsapp") if err != nil { summary.Errors++ continue From 9608176e546f096d2cbf79828a4d423ed020fe23 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:07:04 -0500 Subject: [PATCH 13/65] Add shared RecomputeConversationStats store method Extract inline SQL from WhatsApp importer into a reusable Store.RecomputeConversationStats(sourceID) method that updates message_count, participant_count, last_message_at, and last_message_preview for all conversations belonging to a source. Co-Authored-By: Claude Sonnet 4.6 --- internal/store/messages.go | 34 +++++++++++++ internal/store/messages_test.go | 86 +++++++++++++++++++++++++++++++++ internal/whatsapp/importer.go | 20 ++------ 3 files changed, 123 insertions(+), 17 deletions(-) diff --git a/internal/store/messages.go b/internal/store/messages.go index 9a6059a4..5d0f325a 100644 --- a/internal/store/messages.go +++ b/internal/store/messages.go @@ -860,6 +860,40 @@ func (s *Store) backfillFTSBatch(fromID, toID int64) (int64, error) { return result.RowsAffected() } +// RecomputeConversationStats updates the denormalized stats columns on all conversations +// belonging to the given source. It recomputes message_count, participant_count, +// last_message_at, and last_message_preview from the current table state. +// Safe to call multiple times — always produces the same result (idempotent). 
+func (s *Store) RecomputeConversationStats(sourceID int64) error { + _, err := s.db.Exec(` + UPDATE conversations SET + message_count = ( + SELECT COUNT(*) FROM messages + WHERE conversation_id = conversations.id + ), + participant_count = ( + SELECT COUNT(*) FROM conversation_participants + WHERE conversation_id = conversations.id + ), + last_message_at = ( + SELECT MAX(COALESCE(sent_at, received_at, internal_date)) + FROM messages + WHERE conversation_id = conversations.id + ), + last_message_preview = ( + SELECT snippet FROM messages + WHERE conversation_id = conversations.id + ORDER BY COALESCE(sent_at, received_at, internal_date) DESC + LIMIT 1 + ) + WHERE source_id = ? + `, sourceID) + if err != nil { + return fmt.Errorf("recompute conversation stats: %w", err) + } + return nil +} + // EnsureConversationWithType gets or creates a conversation with an explicit conversation_type. // Unlike EnsureConversation (which hardcodes 'email_thread'), this accepts the type as a parameter, // making it suitable for WhatsApp and other messaging platforms. diff --git a/internal/store/messages_test.go b/internal/store/messages_test.go index 1ac43c4c..647c8c08 100644 --- a/internal/store/messages_test.go +++ b/internal/store/messages_test.go @@ -1,11 +1,97 @@ package store_test import ( + "database/sql" "testing" + "time" + "github.com/wesm/msgvault/internal/store" "github.com/wesm/msgvault/internal/testutil" ) +func TestRecomputeConversationStats(t *testing.T) { + st := testutil.NewTestStore(t) + + source, err := st.GetOrCreateSource("whatsapp", "+15550000001") + if err != nil { + t.Fatalf("GetOrCreateSource: %v", err) + } + + convID, err := st.EnsureConversationWithType(source.ID, "conv-1", "whatsapp_dm", "Test Chat") + if err != nil { + t.Fatalf("EnsureConversationWithType: %v", err) + } + + // Verify initial message_count is 0 (stats not maintained on insert). 
+ var initialCount int + if err := st.DB().QueryRow( + `SELECT message_count FROM conversations WHERE id = ?`, convID, + ).Scan(&initialCount); err != nil { + t.Fatalf("initial message_count scan: %v", err) + } + if initialCount != 0 { + t.Errorf("initial message_count = %d, want 0", initialCount) + } + + sentAt := time.Date(2024, 1, 15, 10, 0, 0, 0, time.UTC) + msg1 := &store.Message{ + SourceID: source.ID, + SourceMessageID: "msg-1", + ConversationID: convID, + MessageType: "whatsapp", + SentAt: sql.NullTime{Time: sentAt, Valid: true}, + Snippet: sql.NullString{String: "hello", Valid: true}, + } + if _, err := st.UpsertMessage(msg1); err != nil { + t.Fatalf("UpsertMessage msg1: %v", err) + } + + sentAt2 := sentAt.Add(time.Hour) + msg2 := &store.Message{ + SourceID: source.ID, + SourceMessageID: "msg-2", + ConversationID: convID, + MessageType: "whatsapp", + SentAt: sql.NullTime{Time: sentAt2, Valid: true}, + Snippet: sql.NullString{String: "world", Valid: true}, + } + if _, err := st.UpsertMessage(msg2); err != nil { + t.Fatalf("UpsertMessage msg2: %v", err) + } + + // Recompute and verify counts. + if err := st.RecomputeConversationStats(source.ID); err != nil { + t.Fatalf("RecomputeConversationStats: %v", err) + } + + var count int + var lastMsgAt sql.NullTime + if err := st.DB().QueryRow( + `SELECT message_count, last_message_at FROM conversations WHERE id = ?`, convID, + ).Scan(&count, &lastMsgAt); err != nil { + t.Fatalf("post-recompute scan: %v", err) + } + if count != 2 { + t.Errorf("message_count = %d, want 2", count) + } + if !lastMsgAt.Valid { + t.Error("last_message_at is NULL, want a timestamp") + } + + // Idempotency: calling again should produce the same result. 
+ if err := st.RecomputeConversationStats(source.ID); err != nil { + t.Fatalf("RecomputeConversationStats (second call): %v", err) + } + if err := st.DB().QueryRow( + `SELECT message_count FROM conversations WHERE id = ?`, convID, + ).Scan(&count); err != nil { + t.Fatalf("idempotency scan: %v", err) + } + if count != 2 { + t.Errorf("idempotency: message_count = %d, want 2", count) + } +} + func TestEnsureParticipantByPhone_IdentifierType(t *testing.T) { st := testutil.NewTestStore(t) diff --git a/internal/whatsapp/importer.go b/internal/whatsapp/importer.go index 7e43f3a6..17406c22 100644 --- a/internal/whatsapp/importer.go +++ b/internal/whatsapp/importer.go @@ -495,23 +495,9 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt } // Update denormalised conversation counts for the WhatsApp source. - _, _ = imp.store.DB().Exec(` - UPDATE conversations SET - message_count = ( - SELECT COUNT(*) FROM messages - WHERE conversation_id = conversations.id - ), - participant_count = ( - SELECT COUNT(*) FROM conversation_participants - WHERE conversation_id = conversations.id - ), - last_message_at = ( - SELECT MAX(COALESCE(sent_at, received_at, internal_date)) - FROM messages - WHERE conversation_id = conversations.id - ) - WHERE source_id = ? 
- `, source.ID) + if err := imp.store.RecomputeConversationStats(source.ID); err != nil { + imp.progress.OnError(fmt.Errorf("recompute conversation stats: %w", err)) + } summary.Duration = time.Since(startTime) imp.progress.OnComplete(summary) From 5dbecd773da7cb87fdaa3b834ec199af506b18b8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:07:41 -0500 Subject: [PATCH 14/65] Add LinkMessageLabel convenience method --- internal/store/messages.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal/store/messages.go b/internal/store/messages.go index 5d0f325a..b414e436 100644 --- a/internal/store/messages.go +++ b/internal/store/messages.go @@ -586,6 +586,12 @@ func (s *Store) AddMessageLabels(messageID int64, labelIDs []int64) error { }) } +// LinkMessageLabel links a single label to a message. +// Uses INSERT OR IGNORE — safe to call multiple times. +func (s *Store) LinkMessageLabel(messageID, labelID int64) error { + return s.AddMessageLabels(messageID, []int64{labelID}) +} + // RemoveMessageLabels removes specific labels from a message. func (s *Store) RemoveMessageLabels(messageID int64, labelIDs []int64) error { if len(labelIDs) == 0 { From eade26c8e851692df4e8213a151a3c5facff9574 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:13:21 -0500 Subject: [PATCH 15/65] Refactor iMessage to use store methods directly Drop the gmail.API adapter pattern. Instead of building synthetic MIME messages and flowing through the email sync pipeline, the client now calls store methods directly with proper message_type, sender_id, conversation_type, and phone-based participants. 
- parser.go: Replace normalizeIdentifier with resolveHandle using shared textimport.NormalizePhone; remove buildMIME, formatMIMEAddress, normalizePhone - client.go: Replace gmail.API interface with Import method that writes to store directly; keep all chat.db reading logic - models.go: Add ImportSummary type - CLI: New import-imessage command replaces sync-imessage - parser_test.go: Replace normalizeIdentifier/MIME tests with resolveHandle tests covering phone/email/raw-handle cases Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/msgvault/cmd/import_imessage.go | 243 +++++++++++ cmd/msgvault/cmd/sync_imessage.go | 210 --------- internal/imessage/client.go | 638 ++++++++++++++++------------ internal/imessage/models.go | 12 +- internal/imessage/parser.go | 127 +----- internal/imessage/parser_test.go | 154 +++---- 6 files changed, 676 insertions(+), 708 deletions(-) create mode 100644 cmd/msgvault/cmd/import_imessage.go delete mode 100644 cmd/msgvault/cmd/sync_imessage.go diff --git a/cmd/msgvault/cmd/import_imessage.go b/cmd/msgvault/cmd/import_imessage.go new file mode 100644 index 00000000..c51692ae --- /dev/null +++ b/cmd/msgvault/cmd/import_imessage.go @@ -0,0 +1,243 @@ +package cmd + +import ( + "context" + "fmt" + "os" + "os/signal" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/spf13/cobra" + "github.com/wesm/msgvault/internal/imessage" + "github.com/wesm/msgvault/internal/store" +) + +var ( + importImessageDBPath string + importImessageBefore string + importImessageAfter string + importImessageLimit int +) + +var importImessageCmd = &cobra.Command{ + Use: "import-imessage", + Short: "Import iMessages from local database", + Long: `Import iMessages from macOS's local Messages database (chat.db). + +Reads messages from ~/Library/Messages/chat.db and stores them in the +msgvault archive. This is a read-only operation that does not modify +the iMessage database. 
+ +Requires Full Disk Access permission in System Settings > Privacy & Security. + +Date filters: + --after 2024-01-01 Only messages on or after this date + --before 2024-12-31 Only messages before this date + +Examples: + msgvault import-imessage + msgvault import-imessage --after 2024-01-01 + msgvault import-imessage --limit 100 + msgvault import-imessage --db-path /path/to/chat.db`, + RunE: runImportImessage, +} + +func runImportImessage(cmd *cobra.Command, _ []string) error { + s, err := openStoreAndInit() + if err != nil { + return err + } + defer func() { _ = s.Close() }() + + chatDBPath, err := resolveChatDBPath() + if err != nil { + return err + } + + clientOpts, err := buildImessageOpts() + if err != nil { + return err + } + + client, err := imessage.NewClient(chatDBPath, clientOpts...) + if err != nil { + return fmt.Errorf("open iMessage database: %w", err) + } + defer func() { _ = client.Close() }() + + // Get or create the source + src, err := s.GetOrCreateSource("apple_messages", "imessage") + if err != nil { + return fmt.Errorf("get or create source: %w", err) + } + + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-sigChan + fmt.Println("\nInterrupted.") + cancel() + }() + + startTime := time.Now() + totalEstimate := client.CountFilteredMessages(ctx) + fmt.Printf("Importing iMessages from %s\n", chatDBPath) + if totalEstimate > 0 { + fmt.Printf("Messages to import: ~%d\n", totalEstimate) + } + printImessageDateFilter() + if importImessageLimit > 0 { + fmt.Printf("Limit: %d messages\n", importImessageLimit) + } + fmt.Println() + + summary, err := client.Import(ctx, s, src.ID) + if err != nil { + if ctx.Err() != nil { + fmt.Println("\nImport interrupted.") + printImessageSummary(summary, startTime) + return nil + } + return fmt.Errorf("import failed: %w", err) + } + + printImessageSummary(summary, startTime) + return nil 
+} + +func openStoreAndInit() (*store.Store, error) { + dbPath := cfg.DatabaseDSN() + s, err := store.Open(dbPath) + if err != nil { + return nil, fmt.Errorf("open database: %w", err) + } + if err := s.InitSchema(); err != nil { + _ = s.Close() + return nil, fmt.Errorf("init schema: %w", err) + } + return s, nil +} + +func resolveChatDBPath() (string, error) { + if importImessageDBPath != "" { + if _, err := os.Stat(importImessageDBPath); os.IsNotExist(err) { + return "", fmt.Errorf( + "iMessage database not found at %s", + importImessageDBPath, + ) + } + return importImessageDBPath, nil + } + + home, err := os.UserHomeDir() + if err != nil { + return "", fmt.Errorf("get home directory: %w", err) + } + path := filepath.Join(home, "Library", "Messages", "chat.db") + if _, err := os.Stat(path); os.IsNotExist(err) { + return "", fmt.Errorf( + "iMessage database not found at %s\n\n"+ + "Make sure you're running on macOS with Messages enabled", + path, + ) + } + return path, nil +} + +func buildImessageOpts() ([]imessage.ClientOption, error) { + var opts []imessage.ClientOption + opts = append(opts, imessage.WithImessageLogger(logger)) + + if importImessageAfter != "" { + t, err := time.ParseInLocation( + "2006-01-02", importImessageAfter, time.Local, + ) + if err != nil { + return nil, fmt.Errorf( + "invalid --after date: %w (use YYYY-MM-DD format)", err, + ) + } + opts = append(opts, imessage.WithAfterDate(t)) + } + + if importImessageBefore != "" { + t, err := time.ParseInLocation( + "2006-01-02", importImessageBefore, time.Local, + ) + if err != nil { + return nil, fmt.Errorf( + "invalid --before date: %w (use YYYY-MM-DD format)", err, + ) + } + opts = append(opts, imessage.WithBeforeDate(t)) + } + + if importImessageLimit > 0 { + opts = append(opts, imessage.WithLimit(importImessageLimit)) + } + + return opts, nil +} + +func printImessageDateFilter() { + if importImessageAfter == "" && importImessageBefore == "" { + return + } + parts := []string{} + if 
importImessageAfter != "" { + parts = append(parts, "after "+importImessageAfter) + } + if importImessageBefore != "" { + parts = append(parts, "before "+importImessageBefore) + } + fmt.Printf("Date filter: %s\n", strings.Join(parts, ", ")) +} + +func printImessageSummary( + summary *imessage.ImportSummary, + startTime time.Time, +) { + if summary == nil { + return + } + elapsed := time.Since(startTime) + fmt.Println() + fmt.Println("iMessage import complete!") + fmt.Printf(" Duration: %s\n", elapsed.Round(time.Second)) + fmt.Printf(" Messages: %d imported\n", summary.MessagesImported) + fmt.Printf(" Conversations: %d\n", summary.ConversationsImported) + fmt.Printf(" Participants: %d resolved\n", summary.ParticipantsResolved) + if summary.Skipped > 0 { + fmt.Printf(" Skipped: %d\n", summary.Skipped) + } + if summary.MessagesImported > 0 && elapsed.Seconds() > 0 { + rate := float64(summary.MessagesImported) / elapsed.Seconds() + fmt.Printf(" Rate: %.1f messages/sec\n", rate) + } +} + +func init() { + importImessageCmd.Flags().StringVar( + &importImessageDBPath, "db-path", "", + "path to chat.db (default: ~/Library/Messages/chat.db)", + ) + importImessageCmd.Flags().StringVar( + &importImessageBefore, "before", "", + "only messages before this date (YYYY-MM-DD)", + ) + importImessageCmd.Flags().StringVar( + &importImessageAfter, "after", "", + "only messages after this date (YYYY-MM-DD)", + ) + importImessageCmd.Flags().IntVar( + &importImessageLimit, "limit", 0, + "limit number of messages (for testing)", + ) + rootCmd.AddCommand(importImessageCmd) +} diff --git a/cmd/msgvault/cmd/sync_imessage.go b/cmd/msgvault/cmd/sync_imessage.go deleted file mode 100644 index 425453fc..00000000 --- a/cmd/msgvault/cmd/sync_imessage.go +++ /dev/null @@ -1,210 +0,0 @@ -package cmd - -import ( - "context" - "fmt" - "os" - "os/signal" - "path/filepath" - "syscall" - "time" - - "github.com/spf13/cobra" - "github.com/wesm/msgvault/internal/imessage" - 
"github.com/wesm/msgvault/internal/store" - "github.com/wesm/msgvault/internal/sync" -) - -var ( - imessageDBPath string - imessageBefore string - imessageAfter string - imessageLimit int - imessageMe string - imessageNoResume bool -) - -var syncImessageCmd = &cobra.Command{ - Use: "sync-imessage", - Short: "Import iMessages from local database", - Long: `Import iMessages from macOS's local Messages database (chat.db). - -Reads messages from ~/Library/Messages/chat.db and stores them in the -msgvault archive alongside Gmail messages. This is a read-only operation -that does not modify the iMessage database. - -Requires Full Disk Access permission in System Settings > Privacy & Security. - -Date filters: - --after 2024-01-01 Only messages on or after this date - --before 2024-12-31 Only messages before this date - -Examples: - msgvault sync-imessage - msgvault sync-imessage --after 2024-01-01 - msgvault sync-imessage --limit 100 - msgvault sync-imessage --me "+15551234567" - msgvault sync-imessage --db-path /path/to/chat.db`, - RunE: func(cmd *cobra.Command, args []string) error { - // Open msgvault database - dbPath := cfg.DatabaseDSN() - s, err := store.Open(dbPath) - if err != nil { - return fmt.Errorf("open database: %w", err) - } - defer func() { _ = s.Close() }() - - if err := s.InitSchema(); err != nil { - return fmt.Errorf("init schema: %w", err) - } - - // Resolve chat.db path - chatDBPath := imessageDBPath - if chatDBPath == "" { - home, err := os.UserHomeDir() - if err != nil { - return fmt.Errorf("get home directory: %w", err) - } - chatDBPath = filepath.Join(home, "Library", "Messages", "chat.db") - } - - // Check chat.db exists - if _, err := os.Stat(chatDBPath); os.IsNotExist(err) { - return fmt.Errorf("iMessage database not found at %s\n\nMake sure you're running on macOS with Messages enabled", chatDBPath) - } - - // Build client options - var clientOpts []imessage.ClientOption - clientOpts = append(clientOpts, imessage.WithImessageLogger(logger)) - 
- if imessageMe != "" { - clientOpts = append(clientOpts, imessage.WithMyAddress(imessageMe)) - } - - if imessageAfter != "" { - t, err := time.ParseInLocation("2006-01-02", imessageAfter, time.Local) - if err != nil { - return fmt.Errorf("invalid --after date: %w (use YYYY-MM-DD format)", err) - } - clientOpts = append(clientOpts, imessage.WithAfterDate(t)) - } - - if imessageBefore != "" { - t, err := time.ParseInLocation("2006-01-02", imessageBefore, time.Local) - if err != nil { - return fmt.Errorf("invalid --before date: %w (use YYYY-MM-DD format)", err) - } - clientOpts = append(clientOpts, imessage.WithBeforeDate(t)) - } - - if imessageLimit > 0 { - clientOpts = append(clientOpts, imessage.WithLimit(imessageLimit)) - } - - // Determine source identifier - identifier := "local" - if imessageMe != "" { - identifier = imessageMe - } - - // Create iMessage client - imsgClient, err := imessage.NewClient(chatDBPath, identifier, clientOpts...) - if err != nil { - return fmt.Errorf("open iMessage database: %w", err) - } - defer func() { _ = imsgClient.Close() }() - - // Set up context with cancellation - ctx, cancel := context.WithCancel(cmd.Context()) - defer cancel() - - // Handle Ctrl+C gracefully - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) - go func() { - <-sigChan - fmt.Println("\nInterrupted. Saving checkpoint...") - cancel() - }() - - // Set up sync options - opts := sync.DefaultOptions() - opts.NoResume = imessageNoResume - opts.SourceType = "apple_messages" - opts.AttachmentsDir = cfg.AttachmentsDir() - - // Create syncer with progress reporter - syncer := sync.New(imsgClient, s, opts). - WithLogger(logger). 
- WithProgress(&CLIProgress{}) - - // Run sync - startTime := time.Now() - fmt.Printf("Starting iMessage sync from %s\n", chatDBPath) - if imessageAfter != "" || imessageBefore != "" { - parts := []string{} - if imessageAfter != "" { - parts = append(parts, "after "+imessageAfter) - } - if imessageBefore != "" { - parts = append(parts, "before "+imessageBefore) - } - fmt.Printf("Date filter: %s\n", joinParts(parts)) - } - if imessageLimit > 0 { - fmt.Printf("Limit: %d messages\n", imessageLimit) - } - fmt.Println() - - summary, err := syncer.Full(ctx, identifier) - if err != nil { - if ctx.Err() != nil { - fmt.Println("\nSync interrupted. Run again to resume.") - return nil - } - return fmt.Errorf("sync failed: %w", err) - } - - // Print summary - fmt.Println() - fmt.Println("iMessage sync complete!") - fmt.Printf(" Duration: %s\n", summary.Duration.Round(time.Second)) - fmt.Printf(" Messages: %d found, %d added, %d skipped\n", - summary.MessagesFound, summary.MessagesAdded, summary.MessagesSkipped) - if summary.Errors > 0 { - fmt.Printf(" Errors: %d\n", summary.Errors) - } - if summary.WasResumed { - fmt.Printf(" (Resumed from checkpoint)\n") - } - - if summary.MessagesAdded > 0 { - elapsed := time.Since(startTime) - messagesPerSec := float64(summary.MessagesAdded) / elapsed.Seconds() - fmt.Printf(" Rate: %.1f messages/sec\n", messagesPerSec) - } - - return nil - }, -} - -func joinParts(parts []string) string { - result := "" - for i, p := range parts { - if i > 0 { - result += ", " - } - result += p - } - return result -} - -func init() { - syncImessageCmd.Flags().StringVar(&imessageDBPath, "db-path", "", "path to chat.db (default: ~/Library/Messages/chat.db)") - syncImessageCmd.Flags().StringVar(&imessageBefore, "before", "", "only messages before this date (YYYY-MM-DD)") - syncImessageCmd.Flags().StringVar(&imessageAfter, "after", "", "only messages after this date (YYYY-MM-DD)") - syncImessageCmd.Flags().IntVar(&imessageLimit, "limit", 0, "limit number of 
messages (for testing)") - syncImessageCmd.Flags().StringVar(&imessageMe, "me", "", "your phone number or email (e.g., +15551234567)") - syncImessageCmd.Flags().BoolVar(&imessageNoResume, "noresume", false, "force fresh sync (don't resume)") - rootCmd.AddCommand(syncImessageCmd) -} diff --git a/internal/imessage/client.go b/internal/imessage/client.go index a70131e1..33edbea1 100644 --- a/internal/imessage/client.go +++ b/internal/imessage/client.go @@ -6,24 +6,23 @@ import ( "fmt" "log/slog" "strconv" + "strings" "time" _ "github.com/mattn/go-sqlite3" - "github.com/wesm/msgvault/internal/gmail" + "github.com/wesm/msgvault/internal/mime" + "github.com/wesm/msgvault/internal/store" ) const defaultPageSize = 500 -// Client reads from macOS's iMessage chat.db and implements the gmail.API -// interface so it can be used with the existing sync infrastructure. +// Client reads from macOS's iMessage chat.db and imports messages +// directly into the msgvault store. type Client struct { db *sql.DB - identifier string // source identifier for GetProfile (e.g., "local") - myAddress string // normalized email-like address for the device owner - afterDate time.Time // only sync messages after this date - beforeDate time.Time // only sync messages before this date - limit int // max total messages to return (0 = unlimited) - returned int // messages returned so far (for limit tracking) + afterDate time.Time // only import messages after this date + beforeDate time.Time // only import messages before this date + limit int // max total messages to import (0 = unlimited) useNanoseconds bool // whether chat.db uses nanosecond timestamps logger *slog.Logger pageSize int @@ -42,27 +41,22 @@ func WithBeforeDate(t time.Time) ClientOption { return func(c *Client) { c.beforeDate = t } } -// WithLimit sets the maximum number of messages to return across all pages. +// WithLimit sets the maximum number of messages to import. 
func WithLimit(n int) ClientOption { return func(c *Client) { c.limit = n } } -// WithMyAddress sets the owner's email-like address for MIME From headers -// on is_from_me messages. -func WithMyAddress(addr string) ClientOption { - return func(c *Client) { c.myAddress = addr } -} - // WithImessageLogger sets the logger for the client. func WithImessageLogger(l *slog.Logger) ClientOption { return func(c *Client) { c.logger = l } } -// NewClient opens a read-only connection to an iMessage chat.db file -// and returns a Client that implements gmail.API. -func NewClient(dbPath string, identifier string, opts ...ClientOption) (*Client, error) { - // Open chat.db read-only - dsn := fmt.Sprintf("file:%s?mode=ro&_journal_mode=WAL&_busy_timeout=5000", dbPath) +// NewClient opens a read-only connection to an iMessage chat.db file. +func NewClient(dbPath string, opts ...ClientOption) (*Client, error) { + dsn := fmt.Sprintf( + "file:%s?mode=ro&_journal_mode=WAL&_busy_timeout=5000", + dbPath, + ) db, err := sql.Open("sqlite3", dsn) if err != nil { return nil, fmt.Errorf("open chat.db: %w", err) @@ -70,22 +64,22 @@ func NewClient(dbPath string, identifier string, opts ...ClientOption) (*Client, if err := db.Ping(); err != nil { _ = db.Close() - return nil, fmt.Errorf("connect to chat.db: %w (check Full Disk Access permissions)", err) + return nil, fmt.Errorf( + "connect to chat.db: %w (check Full Disk Access permissions)", + err, + ) } c := &Client{ - db: db, - identifier: identifier, - myAddress: "me@imessage.local", - logger: slog.Default(), - pageSize: defaultPageSize, + db: db, + logger: slog.Default(), + pageSize: defaultPageSize, } for _, opt := range opts { opt(c) } - // Detect timestamp format (nanoseconds vs seconds) if err := c.detectTimestampFormat(); err != nil { _ = db.Close() return nil, fmt.Errorf("detect timestamp format: %w", err) @@ -103,7 +97,9 @@ func (c *Client) Close() error { // (macOS High Sierra+) or second timestamps (older macOS). 
func (c *Client) detectTimestampFormat() error { var maxDate sql.NullInt64 - err := c.db.QueryRow("SELECT MAX(date) FROM message WHERE date > 0").Scan(&maxDate) + err := c.db.QueryRow( + "SELECT MAX(date) FROM message WHERE date > 0", + ).Scan(&maxDate) if err != nil { return fmt.Errorf("query max date: %w", err) } @@ -113,64 +109,132 @@ func (c *Client) detectTimestampFormat() error { return nil } -// GetProfile returns a profile with the message count and max ROWID as history ID. -func (c *Client) GetProfile(ctx context.Context) (*gmail.Profile, error) { - var count int64 - if err := c.db.QueryRowContext(ctx, "SELECT COUNT(*) FROM message").Scan(&count); err != nil { - return nil, fmt.Errorf("count messages: %w", err) - } +// CountFilteredMessages returns the total count of messages matching +// the date filters, for progress reporting. +func (c *Client) CountFilteredMessages(ctx context.Context) int64 { + sqlQuery := "SELECT COUNT(*) FROM message WHERE 1=1" + var args []interface{} - var maxROWID sql.NullInt64 - if err := c.db.QueryRowContext(ctx, "SELECT MAX(ROWID) FROM message").Scan(&maxROWID); err != nil { - return nil, fmt.Errorf("get max rowid: %w", err) + if !c.afterDate.IsZero() { + appleTS := timeToAppleTimestamp(c.afterDate, c.useNanoseconds) + sqlQuery += " AND date >= ?" + args = append(args, appleTS) } - - historyID := uint64(0) - if maxROWID.Valid { - historyID = uint64(maxROWID.Int64) + if !c.beforeDate.IsZero() { + appleTS := timeToAppleTimestamp(c.beforeDate, c.useNanoseconds) + sqlQuery += " AND date < ?" + args = append(args, appleTS) } - return &gmail.Profile{ - EmailAddress: c.identifier, - MessagesTotal: count, - HistoryID: historyID, - }, nil -} - -// ListLabels returns iMessage and SMS as labels. 
-func (c *Client) ListLabels(ctx context.Context) ([]*gmail.Label, error) { - return []*gmail.Label{ - {ID: "iMessage", Name: "iMessage", Type: "user"}, - {ID: "SMS", Name: "SMS", Type: "user"}, - }, nil + var count int64 + if err := c.db.QueryRowContext(ctx, sqlQuery, args...).Scan(&count); err != nil { + return 0 + } + return count } -// ListMessages returns a page of message IDs from chat.db, ordered by ROWID. -// The pageToken is the string representation of the last seen ROWID. -// The query parameter is ignored (date filtering is done via client options). -func (c *Client) ListMessages(ctx context.Context, query string, pageToken string) (*gmail.MessageListResponse, error) { - // Check limit - if c.limit > 0 && c.returned >= c.limit { - return &gmail.MessageListResponse{}, nil +// Import reads all matching messages from chat.db and writes them +// into the msgvault store using direct store methods. +func (c *Client) Import( + ctx context.Context, + s *store.Store, + sourceID int64, +) (*ImportSummary, error) { + summary := &ImportSummary{} + + // Pre-create iMessage and SMS labels + imessageLabelID, err := s.EnsureLabel( + sourceID, "iMessage", "iMessage", "user", + ) + if err != nil { + return nil, fmt.Errorf("ensure iMessage label: %w", err) + } + smsLabelID, err := s.EnsureLabel(sourceID, "SMS", "SMS", "user") + if err != nil { + return nil, fmt.Errorf("ensure SMS label: %w", err) } + // Track resolved participants to avoid repeated DB calls + phoneCache := map[string]int64{} // phone -> participantID + emailCache := map[string]int64{} // email -> participantID + convCache := map[string]int64{} // chatGUID -> conversationID + imported := 0 lastROWID := int64(0) - if pageToken != "" { - var err error - lastROWID, err = strconv.ParseInt(pageToken, 10, 64) + + for { + if err := ctx.Err(); err != nil { + return summary, err + } + + if c.limit > 0 && imported >= c.limit { + break + } + + rows, pageCount, err := c.fetchPage(ctx, lastROWID) if err != nil { - 
return nil, fmt.Errorf("invalid page token: %w", err) + return summary, fmt.Errorf("fetch page: %w", err) + } + if pageCount == 0 { + break + } + + for _, msg := range rows { + if err := ctx.Err(); err != nil { + return summary, err + } + if c.limit > 0 && imported >= c.limit { + break + } + + if err := c.importMessage( + ctx, s, sourceID, &msg, + imessageLabelID, smsLabelID, + phoneCache, emailCache, convCache, + summary, + ); err != nil { + c.logger.Warn( + "failed to import message", + "rowid", msg.ROWID, + "error", err, + ) + summary.Skipped++ + continue + } + + imported++ + lastROWID = msg.ROWID } + + if pageCount < c.pageSize { + break + } + } + + if err := s.RecomputeConversationStats(sourceID); err != nil { + return summary, fmt.Errorf("recompute stats: %w", err) } - // Build query + return summary, nil +} + +// fetchPage queries the next batch of messages from chat.db. +func (c *Client) fetchPage( + ctx context.Context, + afterROWID int64, +) ([]messageRow, int, error) { sqlQuery := ` - SELECT m.ROWID, COALESCE(c.guid, 'no-chat-' || CAST(m.ROWID AS TEXT)) as chat_guid + SELECT + m.ROWID, m.guid, m.text, m.attributedBody, + m.date, m.is_from_me, m.service, + m.cache_has_attachments, + h.id, + c.ROWID, c.guid, c.display_name, c.chat_identifier FROM message m + LEFT JOIN handle h ON h.ROWID = m.handle_id LEFT JOIN chat_message_join cmj ON cmj.message_id = m.ROWID LEFT JOIN chat c ON c.ROWID = cmj.chat_id WHERE m.ROWID > ?` - args := []interface{}{lastROWID} + args := []interface{}{afterROWID} if !c.afterDate.IsZero() { appleTS := timeToAppleTimestamp(c.afterDate, c.useNanoseconds) @@ -183,132 +247,95 @@ func (c *Client) ListMessages(ctx context.Context, query string, pageToken strin args = append(args, appleTS) } - sqlQuery += " ORDER BY m.ROWID ASC LIMIT ?" 
- - // Calculate page size respecting limit pageSize := c.pageSize if c.limit > 0 { - remaining := c.limit - c.returned + remaining := c.limit if remaining < pageSize { pageSize = remaining } } + sqlQuery += " ORDER BY m.ROWID ASC LIMIT ?" args = append(args, pageSize) - rows, err := c.db.QueryContext(ctx, sqlQuery, args...) + dbRows, err := c.db.QueryContext(ctx, sqlQuery, args...) if err != nil { - return nil, fmt.Errorf("list messages: %w", err) - } - defer func() { _ = rows.Close() }() - - var messages []gmail.MessageID - var maxRowID int64 - for rows.Next() { - var rowID int64 - var chatGUID string - if err := rows.Scan(&rowID, &chatGUID); err != nil { - return nil, fmt.Errorf("scan message: %w", err) + return nil, 0, fmt.Errorf("query messages: %w", err) + } + defer func() { _ = dbRows.Close() }() + + var result []messageRow + for dbRows.Next() { + var msg messageRow + if err := dbRows.Scan( + &msg.ROWID, &msg.GUID, &msg.Text, &msg.AttributedBody, + &msg.Date, &msg.IsFromMe, &msg.Service, + &msg.HasAttachments, + &msg.HandleID, + &msg.ChatROWID, &msg.ChatGUID, &msg.ChatDisplayName, + &msg.ChatIdentifier, + ); err != nil { + return nil, 0, fmt.Errorf("scan message: %w", err) } - messages = append(messages, gmail.MessageID{ - ID: strconv.FormatInt(rowID, 10), - ThreadID: chatGUID, - }) - maxRowID = rowID - } - if err := rows.Err(); err != nil { - return nil, fmt.Errorf("iterate messages: %w", err) - } - - c.returned += len(messages) - - // Determine next page token - var nextPageToken string - if len(messages) == pageSize { - nextPageToken = strconv.FormatInt(maxRowID, 10) - } - - // Get total estimate for progress reporting - totalEstimate := int64(len(messages)) - if pageToken == "" { - totalEstimate = c.countFilteredMessages(ctx) - } - - return &gmail.MessageListResponse{ - Messages: messages, - NextPageToken: nextPageToken, - ResultSizeEstimate: totalEstimate, - }, nil -} - -// countFilteredMessages returns the total count of messages matching the date 
filters. -func (c *Client) countFilteredMessages(ctx context.Context) int64 { - sqlQuery := "SELECT COUNT(*) FROM message WHERE 1=1" - var args []interface{} - - if !c.afterDate.IsZero() { - appleTS := timeToAppleTimestamp(c.afterDate, c.useNanoseconds) - sqlQuery += " AND date >= ?" - args = append(args, appleTS) + result = append(result, msg) } - if !c.beforeDate.IsZero() { - appleTS := timeToAppleTimestamp(c.beforeDate, c.useNanoseconds) - sqlQuery += " AND date < ?" - args = append(args, appleTS) + if err := dbRows.Err(); err != nil { + return nil, 0, fmt.Errorf("iterate messages: %w", err) } - var count int64 - if err := c.db.QueryRowContext(ctx, sqlQuery, args...).Scan(&count); err != nil { - return 0 - } - return count + return result, len(result), nil } -// GetMessageRaw fetches a single message and builds synthetic MIME data. -func (c *Client) GetMessageRaw(ctx context.Context, messageID string) (*gmail.RawMessage, error) { - rowID, err := strconv.ParseInt(messageID, 10, 64) - if err != nil { - return nil, fmt.Errorf("invalid message ID: %w", err) +// importMessage processes a single chat.db message row and writes it +// to the msgvault store. 
+func (c *Client) importMessage( + ctx context.Context, + s *store.Store, + sourceID int64, + msg *messageRow, + imessageLabelID, smsLabelID int64, + phoneCache map[string]int64, + emailCache map[string]int64, + convCache map[string]int64, + summary *ImportSummary, +) error { + // Determine conversation + chatGUID := "no-chat-" + strconv.FormatInt(msg.ROWID, 10) + if msg.ChatGUID != nil { + chatGUID = *msg.ChatGUID } - // Query message with handle and chat info - var msg messageRow - err = c.db.QueryRowContext(ctx, ` - SELECT - m.ROWID, m.guid, m.text, m.attributedBody, m.date, m.is_from_me, m.service, - m.cache_has_attachments, - h.id, - c.ROWID, c.guid, c.display_name, c.chat_identifier - FROM message m - LEFT JOIN handle h ON h.ROWID = m.handle_id - LEFT JOIN chat_message_join cmj ON cmj.message_id = m.ROWID - LEFT JOIN chat c ON c.ROWID = cmj.chat_id - WHERE m.ROWID = ? - `, rowID).Scan( - &msg.ROWID, &msg.GUID, &msg.Text, &msg.AttributedBody, &msg.Date, &msg.IsFromMe, &msg.Service, - &msg.HasAttachments, - &msg.HandleID, - &msg.ChatROWID, &msg.ChatGUID, &msg.ChatDisplayName, &msg.ChatIdentifier, + convID, isNewConv, err := c.ensureConversation( + ctx, s, sourceID, msg, chatGUID, convCache, + phoneCache, emailCache, summary, ) - if err == sql.ErrNoRows { - return nil, &gmail.NotFoundError{Path: "/messages/" + messageID} - } if err != nil { - return nil, fmt.Errorf("get message %s: %w", messageID, err) + return fmt.Errorf("ensure conversation: %w", err) } - - // Warn about attachments that won't be archived - if msg.HasAttachments != 0 { - c.logger.Warn("message has attachments that will not be archived (attachment extraction not yet implemented)", "id", messageID, "guid", msg.GUID) + if isNewConv { + summary.ConversationsImported++ } - // Determine sender and recipients - fromAddr, toAddrs := c.resolveParticipants(ctx, &msg) + // Resolve sender + var senderID sql.NullInt64 + if msg.IsFromMe != 0 { + // is_from_me messages: sender is the device owner, no 
external handle + } else if msg.HandleID != nil { + pid, err := c.resolveParticipant( + s, *msg.HandleID, phoneCache, emailCache, summary, + ) + if err == nil && pid > 0 { + senderID = sql.NullInt64{Int64: pid, Valid: true} + _ = s.EnsureConversationParticipant(convID, pid, "member") + } + } - // Convert Apple timestamp to time - msgDate := appleTimestampToTime(msg.Date) + // Determine message type from service field + msgType := "imessage" + if msg.Service != nil && strings.EqualFold(*msg.Service, "SMS") { + msgType = "sms" + } - // Get message body: prefer plain-text column, fall back to attributedBody blob - // (macOS Ventura+ / iOS 16+ stopped populating m.text for many message types). + // Extract body text body := "" if msg.Text != nil { body = *msg.Text @@ -316,83 +343,133 @@ func (c *Client) GetMessageRaw(ctx context.Context, messageID string) (*gmail.Ra body = extractAttributedBodyText(msg.AttributedBody) } - // Build MIME - mimeData := buildMIME(fromAddr, toAddrs, msgDate, msg.GUID, body) + msgDate := appleTimestampToTime(msg.Date) + var sentAt sql.NullTime + if !msgDate.IsZero() { + sentAt = sql.NullTime{Time: msgDate, Valid: true} + } + + // Upsert the message + msgID, err := s.UpsertMessage(&store.Message{ + SourceID: sourceID, + SourceMessageID: strconv.FormatInt(msg.ROWID, 10), + ConversationID: convID, + MessageType: msgType, + SentAt: sentAt, + InternalDate: sentAt, + SenderID: senderID, + IsFromMe: msg.IsFromMe != 0, + Snippet: sql.NullString{ + String: snippet(body, 100), + Valid: body != "", + }, + SizeEstimate: int64(len(body)), + HasAttachments: msg.HasAttachments != 0, + }) + if err != nil { + return fmt.Errorf("upsert message: %w", err) + } - // Determine thread ID - threadID := "no-chat-" + messageID - if msg.ChatGUID != nil { - threadID = *msg.ChatGUID + // Store body text directly (no MIME) + if body != "" { + if err := s.UpsertMessageBody( + msgID, + sql.NullString{String: body, Valid: true}, + sql.NullString{}, + ); err != nil { + 
return fmt.Errorf("upsert body: %w", err) + } } - // Build label based on service - var labelIDs []string - if msg.Service != nil && *msg.Service != "" { - labelIDs = []string{*msg.Service} + // Label: iMessage or SMS + labelID := imessageLabelID + if msgType == "sms" { + labelID = smsLabelID + } + if err := s.LinkMessageLabel(msgID, labelID); err != nil { + return fmt.Errorf("link label: %w", err) } - // InternalDate as Unix milliseconds - internalDate := int64(0) - if !msgDate.IsZero() { - internalDate = msgDate.UnixMilli() - } - - return &gmail.RawMessage{ - ID: messageID, - ThreadID: threadID, - LabelIDs: labelIDs, - Snippet: snippet(body, 100), - HistoryID: uint64(msg.ROWID), - InternalDate: internalDate, - SizeEstimate: int64(len(mimeData)), - Raw: mimeData, - }, nil + // Warn about attachments + if msg.HasAttachments != 0 { + c.logger.Debug( + "message has attachments (extraction not yet implemented)", + "rowid", msg.ROWID, + ) + } + + summary.MessagesImported++ + return nil } -// resolveParticipants determines the From and To addresses for a message. 
-func (c *Client) resolveParticipants(ctx context.Context, msg *messageRow) (from []string, to []string) { - if msg.IsFromMe != 0 { - // Sender is the device owner - from = []string{c.myAddress} - // Recipients are the chat participants - if msg.ChatROWID != nil { - to = c.getChatParticipants(ctx, *msg.ChatROWID) - } else if msg.HandleID != nil { - email, _, _ := normalizeIdentifier(*msg.HandleID) - if email != "" { - to = []string{email} - } - } - } else { - // Sender is from the handle table - if msg.HandleID != nil { - email, _, _ := normalizeIdentifier(*msg.HandleID) - if email != "" { - from = []string{email} - } - } - // Recipient is the device owner (and possibly other participants in group chats) - to = []string{c.myAddress} - if msg.ChatROWID != nil { - others := c.getChatParticipants(ctx, *msg.ChatROWID) - // Add other participants (exclude the sender) - senderAddr := "" - if len(from) > 0 { - senderAddr = from[0] - } - for _, addr := range others { - if addr != senderAddr && addr != c.myAddress { - to = append(to, addr) - } - } +// ensureConversation gets or creates a conversation for the chat, +// resolving participants and setting the title. 
+func (c *Client) ensureConversation( + ctx context.Context, + s *store.Store, + sourceID int64, + msg *messageRow, + chatGUID string, + convCache map[string]int64, + phoneCache map[string]int64, + emailCache map[string]int64, + summary *ImportSummary, +) (int64, bool, error) { + if id, ok := convCache[chatGUID]; ok { + return id, false, nil + } + + // Determine conversation type and title + convType := "direct_chat" + title := "" + if msg.ChatIdentifier != nil && + strings.Contains(*msg.ChatIdentifier, "chat;+;") { + convType = "group_chat" + } + + if msg.ChatDisplayName != nil && *msg.ChatDisplayName != "" { + title = *msg.ChatDisplayName + } else if convType == "direct_chat" && msg.HandleID != nil { + // For 1:1 chats, use the participant's phone/email as title + phone, email, name := resolveHandle(*msg.HandleID) + if name != "" { + title = name + } else if phone != "" { + title = phone + } else if email != "" { + title = email } } - return from, to + + convID, err := s.EnsureConversationWithType( + sourceID, chatGUID, convType, title, + ) + if err != nil { + return 0, false, err + } + convCache[chatGUID] = convID + + // Link chat participants to the conversation + if msg.ChatROWID != nil { + c.linkChatParticipants( + ctx, s, *msg.ChatROWID, convID, + phoneCache, emailCache, summary, + ) + } + + return convID, true, nil } -// getChatParticipants returns the normalized email addresses of all participants -// in a chat (excluding the device owner). -func (c *Client) getChatParticipants(ctx context.Context, chatROWID int64) []string { +// linkChatParticipants resolves all handles in a chat and links them +// as conversation participants. 
+func (c *Client) linkChatParticipants( + ctx context.Context, + s *store.Store, + chatROWID, convID int64, + phoneCache map[string]int64, + emailCache map[string]int64, + summary *ImportSummary, +) { rows, err := c.db.QueryContext(ctx, ` SELECT h.id FROM chat_handle_join chj @@ -400,61 +477,68 @@ func (c *Client) getChatParticipants(ctx context.Context, chatROWID int64) []str WHERE chj.chat_id = ? `, chatROWID) if err != nil { - c.logger.Warn("failed to get chat participants", "chat_id", chatROWID, "error", err) - return nil + c.logger.Warn( + "failed to get chat participants", + "chat_id", chatROWID, "error", err, + ) + return } defer func() { _ = rows.Close() }() - var addrs []string for rows.Next() { var handleID string if err := rows.Scan(&handleID); err != nil { continue } - email, _, _ := normalizeIdentifier(handleID) - if email != "" { - addrs = append(addrs, email) + pid, err := c.resolveParticipant( + s, handleID, phoneCache, emailCache, summary, + ) + if err != nil || pid == 0 { + continue } + _ = s.EnsureConversationParticipant(convID, pid, "member") } - return addrs } -// GetMessagesRawBatch fetches multiple messages sequentially. -// Since we're reading from a local database, parallelism adds no benefit. -func (c *Client) GetMessagesRawBatch(ctx context.Context, messageIDs []string) ([]*gmail.RawMessage, error) { - results := make([]*gmail.RawMessage, 0, len(messageIDs)) - for _, id := range messageIDs { - msg, err := c.GetMessageRaw(ctx, id) +// resolveParticipant resolves a handle ID to a participant ID in the +// store, creating the participant if needed. 
+func (c *Client) resolveParticipant( + s *store.Store, + handleID string, + phoneCache map[string]int64, + emailCache map[string]int64, + summary *ImportSummary, +) (int64, error) { + phone, email, _ := resolveHandle(handleID) + + if phone != "" { + if pid, ok := phoneCache[phone]; ok { + return pid, nil + } + pid, err := s.EnsureParticipantByPhone(phone, phone, "imessage") if err != nil { - c.logger.Warn("failed to fetch message", "id", id, "error", err) - continue + return 0, fmt.Errorf("ensure participant by phone %s: %w", phone, err) } - results = append(results, msg) + phoneCache[phone] = pid + summary.ParticipantsResolved++ + return pid, nil } - return results, nil -} - -// ListHistory is not supported for iMessage (no incremental sync yet). -func (c *Client) ListHistory(ctx context.Context, startHistoryID uint64, pageToken string) (*gmail.HistoryResponse, error) { - return &gmail.HistoryResponse{ - HistoryID: startHistoryID, - }, nil -} -// TrashMessage is not supported for iMessage. -func (c *Client) TrashMessage(ctx context.Context, messageID string) error { - return fmt.Errorf("trash not supported for iMessage") -} - -// DeleteMessage is not supported for iMessage. -func (c *Client) DeleteMessage(ctx context.Context, messageID string) error { - return fmt.Errorf("delete not supported for iMessage") -} + if email != "" { + if pid, ok := emailCache[email]; ok { + return pid, nil + } + result, err := s.EnsureParticipantsBatch([]mime.Address{ + {Email: email}, + }) + if err != nil { + return 0, fmt.Errorf("ensure participant by email %s: %w", email, err) + } + pid := result[email] + emailCache[email] = pid + summary.ParticipantsResolved++ + return pid, nil + } -// BatchDeleteMessages is not supported for iMessage. -func (c *Client) BatchDeleteMessages(ctx context.Context, messageIDs []string) error { - return fmt.Errorf("batch delete not supported for iMessage") + return 0, nil } - -// Ensure Client implements gmail.API. 
-var _ gmail.API = (*Client)(nil) diff --git a/internal/imessage/models.go b/internal/imessage/models.go index 3a62a688..d6741050 100644 --- a/internal/imessage/models.go +++ b/internal/imessage/models.go @@ -1,5 +1,5 @@ -// Package imessage provides an iMessage client that reads from macOS's chat.db -// and implements the gmail.API interface for use with the existing sync infrastructure. +// Package imessage reads from macOS's iMessage chat.db and imports +// messages into the msgvault store. package imessage // messageRow holds a row from the iMessage chat.db message table @@ -19,3 +19,11 @@ type messageRow struct { ChatDisplayName *string // chat.display_name (set for group chats) ChatIdentifier *string // chat.chat_identifier } + +// ImportSummary holds statistics from a completed import run. +type ImportSummary struct { + MessagesImported int + ConversationsImported int + ParticipantsResolved int + Skipped int +} diff --git a/internal/imessage/parser.go b/internal/imessage/parser.go index 0d2f9f96..1b99fd62 100644 --- a/internal/imessage/parser.go +++ b/internal/imessage/parser.go @@ -1,12 +1,10 @@ package imessage import ( - "crypto/sha256" - "encoding/hex" - "net/mail" "strings" "time" + "github.com/wesm/msgvault/internal/textimport" "howett.net/plist" ) @@ -41,125 +39,20 @@ func timeToAppleTimestamp(t time.Time, useNano bool) int64 { return appleSec } -// normalizeIdentifier converts a phone number or email address from iMessage's -// handle table into a normalized email-like identifier for the participants table. -// Returns the normalized email, domain, and display name. -func normalizeIdentifier(handleID string) (email, domain, displayName string) { - handleID = strings.TrimSpace(handleID) +// resolveHandle classifies an iMessage handle ID as a phone number, +// email address, or raw identifier (e.g. system handles). 
+func resolveHandle(handleID string) (phone, email, displayName string) { if handleID == "" { return "", "", "" } - - // Email addresses: use as-is (lowercased) - if strings.Contains(handleID, "@") { - email = strings.ToLower(handleID) - if idx := strings.LastIndex(email, "@"); idx >= 0 { - domain = email[idx+1:] - } - return email, domain, "" - } - - // Phone numbers: normalize and use a synthetic domain - phone := normalizePhone(handleID) - return phone + "@phone.imessage", "phone.imessage", phone -} - -// normalizePhone strips non-digit characters from a phone number and attempts -// to produce a consistent E.164-like format. -func normalizePhone(phone string) string { - // Preserve leading + - hasPlus := strings.HasPrefix(phone, "+") - - // Extract digits only - var digits strings.Builder - for _, r := range phone { - if r >= '0' && r <= '9' { - digits.WriteRune(r) - } - } - d := digits.String() - if d == "" { - return phone // Return original if no digits found - } - - // Try to normalize to E.164 - if hasPlus { - return "+" + d - } - // 10-digit US number - if len(d) == 10 { - return "+1" + d - } - // 11-digit number starting with 1 (US with country code) - if len(d) == 11 && d[0] == '1' { - return "+" + d - } - // Other: prefix with + - return "+" + d -} - -// buildMIME constructs a minimal RFC 2822 message from iMessage data. -// The resulting bytes can be parsed by enmime for the sync pipeline. 
-func buildMIME(fromAddr, toAddrs []string, date time.Time, messageID, body string) []byte { - var b strings.Builder - - // From header - if len(fromAddr) > 0 { - b.WriteString("From: ") - b.WriteString(formatMIMEAddress(fromAddr[0])) - b.WriteString("\r\n") - } - - // To header - if len(toAddrs) > 0 { - b.WriteString("To: ") - for i, addr := range toAddrs { - if i > 0 { - b.WriteString(", ") - } - b.WriteString(formatMIMEAddress(addr)) - } - b.WriteString("\r\n") - } - - // Date header - if !date.IsZero() { - b.WriteString("Date: ") - b.WriteString(date.Format(time.RFC1123Z)) - b.WriteString("\r\n") - } - - // Subject (empty for iMessage - messages don't have subjects) - b.WriteString("Subject: \r\n") - - // Message-ID — hash the GUID since iMessage GUIDs contain characters - // like ':' and '/' that are invalid in RFC 5322 msg-id local-part. - if messageID != "" { - h := sha256.Sum256([]byte(messageID)) - safeID := hex.EncodeToString(h[:12]) // 24 hex chars, unique enough - b.WriteString("Message-ID: <") - b.WriteString(safeID) - b.WriteString("@imessage.local>\r\n") + normalized, err := textimport.NormalizePhone(handleID) + if err == nil { + return normalized, "", normalized } - - // MIME version and content type - b.WriteString("MIME-Version: 1.0\r\n") - b.WriteString("Content-Type: text/plain; charset=utf-8\r\n") - - // Header/body separator - b.WriteString("\r\n") - - // Body - if body != "" { - b.WriteString(body) + if strings.Contains(handleID, "@") { + return "", strings.ToLower(handleID), "" } - - return []byte(b.String()) -} - -// formatMIMEAddress formats an email address for MIME headers. 
-func formatMIMEAddress(addr string) string { - return (&mail.Address{Address: addr}).String() + return "", "", handleID } // extractAttributedBodyText decodes an NSKeyedArchiver binary plist blob from diff --git a/internal/imessage/parser_test.go b/internal/imessage/parser_test.go index 8c4b3beb..d0959a0e 100644 --- a/internal/imessage/parser_test.go +++ b/internal/imessage/parser_test.go @@ -1,7 +1,6 @@ package imessage import ( - "strings" "testing" "time" @@ -91,135 +90,86 @@ func TestRoundTripTimestamp(t *testing.T) { } } -func TestNormalizeIdentifier(t *testing.T) { +func TestResolveHandle(t *testing.T) { tests := []struct { - name string - handleID string - wantEmail string - wantDomain string + name string + handleID string + wantPhone string + wantEmail string + wantDisplayName string }{ { - name: "email address", - handleID: "John@Example.com", - wantEmail: "john@example.com", - wantDomain: "example.com", + name: "email address", + handleID: "John@Example.com", + wantEmail: "john@example.com", }, { - name: "US phone with +1", - handleID: "+15551234567", - wantEmail: "+15551234567@phone.imessage", - wantDomain: "phone.imessage", + name: "US phone with +1", + handleID: "+15551234567", + wantPhone: "+15551234567", + wantDisplayName: "+15551234567", }, { - name: "US phone 10 digits", - handleID: "5551234567", - wantEmail: "+15551234567@phone.imessage", - wantDomain: "phone.imessage", + name: "US phone 10 digits", + handleID: "5551234567", + wantPhone: "+15551234567", + wantDisplayName: "+15551234567", }, { - name: "US phone with formatting", - handleID: "(555) 123-4567", - wantEmail: "+15551234567@phone.imessage", - wantDomain: "phone.imessage", + name: "US phone with formatting", + handleID: "(555) 123-4567", + wantPhone: "+15551234567", + wantDisplayName: "+15551234567", }, { - name: "US phone 11 digits with 1", - handleID: "15551234567", - wantEmail: "+15551234567@phone.imessage", - wantDomain: "phone.imessage", + name: "US phone 11 digits with 1", + 
handleID: "15551234567", + wantPhone: "+15551234567", + wantDisplayName: "+15551234567", }, { - name: "international phone", - handleID: "+447911123456", - wantEmail: "+447911123456@phone.imessage", - wantDomain: "phone.imessage", + name: "international phone", + handleID: "+447911123456", + wantPhone: "+447911123456", + wantDisplayName: "+447911123456", }, { - name: "empty string", - handleID: "", - wantEmail: "", - wantDomain: "", + name: "empty string", + }, + { + name: "short code (not a phone)", + handleID: "12345", + wantDisplayName: "12345", + }, + { + name: "handle with digits parses as phone", + handleID: "p:+1555123", + wantPhone: "+1555123", + wantDisplayName: "+1555123", + }, + { + name: "system handle without digits", + handleID: "system", + wantDisplayName: "system", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - gotEmail, gotDomain, _ := normalizeIdentifier(tt.handleID) + gotPhone, gotEmail, gotDisplay := resolveHandle(tt.handleID) + if gotPhone != tt.wantPhone { + t.Errorf("phone: got %q, want %q", gotPhone, tt.wantPhone) + } if gotEmail != tt.wantEmail { t.Errorf("email: got %q, want %q", gotEmail, tt.wantEmail) } - if gotDomain != tt.wantDomain { - t.Errorf("domain: got %q, want %q", gotDomain, tt.wantDomain) + if gotDisplay != tt.wantDisplayName { + t.Errorf("displayName: got %q, want %q", gotDisplay, tt.wantDisplayName) } }) } } -func TestBuildMIME(t *testing.T) { - date := time.Date(2024, 6, 15, 14, 30, 0, 0, time.UTC) - mime := buildMIME( - []string{"sender@example.com"}, - []string{"recipient@example.com", "other@example.com"}, - date, - "p:0/ABC123", - "Hello, world!", - ) - - mimeStr := string(mime) - - // Check required headers - if !strings.Contains(mimeStr, "From: ") { - t.Error("missing or incorrect From header") - } - if !strings.Contains(mimeStr, "To: , ") { - t.Error("missing or incorrect To header") - } - if !strings.Contains(mimeStr, "Date: ") { - t.Error("missing Date header") - } - // Message-ID is a hash 
of the GUID (RFC 5322 safe) - if !strings.Contains(mimeStr, "Message-ID: <") || !strings.Contains(mimeStr, "@imessage.local>") { - t.Error("missing Message-ID header") - } - // Verify the raw GUID with invalid chars is NOT present - if strings.Contains(mimeStr, "p:0/ABC123@imessage.local") { - t.Error("Message-ID should not contain raw GUID with invalid chars") - } - if !strings.Contains(mimeStr, "Content-Type: text/plain; charset=utf-8") { - t.Error("missing Content-Type header") - } - if !strings.Contains(mimeStr, "MIME-Version: 1.0") { - t.Error("missing MIME-Version header") - } - // Check body is after blank line - if !strings.Contains(mimeStr, "\r\n\r\nHello, world!") { - t.Error("body not found after header separator") - } -} - -func TestBuildMIME_EmptyBody(t *testing.T) { - date := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC) - mime := buildMIME( - []string{"sender@example.com"}, - []string{"recipient@example.com"}, - date, - "test-guid", - "", - ) - - mimeStr := string(mime) - - // Should still have headers and separator - if !strings.Contains(mimeStr, "\r\n\r\n") { - t.Error("missing header/body separator") - } - // Body should be empty - parts := strings.SplitN(mimeStr, "\r\n\r\n", 2) - if len(parts) != 2 || parts[1] != "" { - t.Errorf("expected empty body, got %q", parts[1]) - } -} - // makeAttributedBodyBlob builds a minimal NSKeyedArchiver binary plist blob // equivalent to an NSAttributedString with the given text. func makeAttributedBodyBlob(text string) []byte { From 263b239d6a4951652289356475c872439ddbd390 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:29:25 -0500 Subject: [PATCH 16/65] Refactor Google Voice to use store methods directly Drop gmail.API adapter and synthetic MIME. Google Voice now writes to the store with proper message_type (google_voice_text/call/ voicemail), phone-based participants via EnsureParticipantByPhone, and labels linked via EnsureLabel + LinkMessageLabel. 
--- cmd/msgvault/cmd/import_gvoice.go | 198 +++++++ cmd/msgvault/cmd/sync_gvoice.go | 179 ------- internal/gvoice/client.go | 830 ++++++++++++++++++------------ internal/gvoice/models.go | 23 + internal/gvoice/parser.go | 111 +--- internal/gvoice/parser_test.go | 34 -- 6 files changed, 732 insertions(+), 643 deletions(-) create mode 100644 cmd/msgvault/cmd/import_gvoice.go delete mode 100644 cmd/msgvault/cmd/sync_gvoice.go diff --git a/cmd/msgvault/cmd/import_gvoice.go b/cmd/msgvault/cmd/import_gvoice.go new file mode 100644 index 00000000..238efa23 --- /dev/null +++ b/cmd/msgvault/cmd/import_gvoice.go @@ -0,0 +1,198 @@ +package cmd + +import ( + "context" + "fmt" + "os" + "os/signal" + "strings" + "syscall" + "time" + + "github.com/spf13/cobra" + "github.com/wesm/msgvault/internal/gvoice" +) + +var ( + importGvoiceBefore string + importGvoiceAfter string + importGvoiceLimit int +) + +var importGvoiceCmd = &cobra.Command{ + Use: "import-gvoice ", + Short: "Import Google Voice history from Takeout export", + Long: `Import Google Voice texts, calls, and voicemails from a +Google Takeout export. + +The directory should be the "Voice" folder inside the Takeout archive, +containing "Calls/" and "Phones.vcf". + +Examples: + msgvault import-gvoice /path/to/Takeout/Voice + msgvault import-gvoice /path/to/Takeout/Voice --after 2020-01-01 + msgvault import-gvoice /path/to/Takeout/Voice --limit 100`, + Args: cobra.ExactArgs(1), + RunE: runImportGvoice, +} + +func runImportGvoice(cmd *cobra.Command, args []string) error { + takeoutDir := args[0] + + s, err := openStoreAndInit() + if err != nil { + return err + } + defer func() { _ = s.Close() }() + + clientOpts, err := buildGvoiceOpts() + if err != nil { + return err + } + + client, err := gvoice.NewClient(takeoutDir, clientOpts...) 
+ if err != nil { + return fmt.Errorf("open Google Voice takeout: %w", err) + } + defer func() { _ = client.Close() }() + + src, err := s.GetOrCreateSource( + "google_voice", client.Identifier(), + ) + if err != nil { + return fmt.Errorf("get or create source: %w", err) + } + + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-sigChan + fmt.Println("\nInterrupted.") + cancel() + }() + + startTime := time.Now() + fmt.Printf( + "Importing Google Voice from %s\n", takeoutDir, + ) + printGvoiceDateFilter() + if importGvoiceLimit > 0 { + fmt.Printf("Limit: %d messages\n", importGvoiceLimit) + } + fmt.Println() + + summary, err := client.Import(ctx, s, src.ID) + if err != nil { + if ctx.Err() != nil { + fmt.Println("\nImport interrupted.") + printGvoiceSummary(summary, startTime) + return nil + } + return fmt.Errorf("import failed: %w", err) + } + + printGvoiceSummary(summary, startTime) + return nil +} + +func buildGvoiceOpts() ([]gvoice.ClientOption, error) { + var opts []gvoice.ClientOption + opts = append(opts, gvoice.WithLogger(logger)) + + if importGvoiceAfter != "" { + t, err := time.ParseInLocation( + "2006-01-02", importGvoiceAfter, time.Local, + ) + if err != nil { + return nil, fmt.Errorf( + "invalid --after date: %w (use YYYY-MM-DD format)", + err, + ) + } + opts = append(opts, gvoice.WithAfterDate(t)) + } + + if importGvoiceBefore != "" { + t, err := time.ParseInLocation( + "2006-01-02", importGvoiceBefore, time.Local, + ) + if err != nil { + return nil, fmt.Errorf( + "invalid --before date: %w (use YYYY-MM-DD format)", + err, + ) + } + opts = append(opts, gvoice.WithBeforeDate(t)) + } + + if importGvoiceLimit > 0 { + opts = append(opts, gvoice.WithLimit(importGvoiceLimit)) + } + + return opts, nil +} + +func printGvoiceDateFilter() { + if importGvoiceAfter == "" && importGvoiceBefore == "" { + return + } + parts := []string{} + 
if importGvoiceAfter != "" { + parts = append(parts, "after "+importGvoiceAfter) + } + if importGvoiceBefore != "" { + parts = append(parts, "before "+importGvoiceBefore) + } + fmt.Printf("Date filter: %s\n", strings.Join(parts, ", ")) +} + +func printGvoiceSummary( + summary *gvoice.ImportSummary, + startTime time.Time, +) { + if summary == nil { + return + } + elapsed := time.Since(startTime) + fmt.Println() + fmt.Println("Google Voice import complete!") + fmt.Printf(" Duration: %s\n", elapsed.Round(time.Second)) + fmt.Printf( + " Messages: %d imported\n", + summary.MessagesImported, + ) + fmt.Printf( + " Conversations: %d\n", + summary.ConversationsImported, + ) + fmt.Printf( + " Participants: %d resolved\n", + summary.ParticipantsResolved, + ) + if summary.Skipped > 0 { + fmt.Printf(" Skipped: %d\n", summary.Skipped) + } + if summary.MessagesImported > 0 && elapsed.Seconds() > 0 { + rate := float64(summary.MessagesImported) / elapsed.Seconds() + fmt.Printf(" Rate: %.1f messages/sec\n", rate) + } +} + +func init() { + importGvoiceCmd.Flags().StringVar( + &importGvoiceBefore, "before", "", + "only messages before this date (YYYY-MM-DD)", + ) + importGvoiceCmd.Flags().StringVar( + &importGvoiceAfter, "after", "", + "only messages after this date (YYYY-MM-DD)", + ) + importGvoiceCmd.Flags().IntVar( + &importGvoiceLimit, "limit", 0, + "limit number of messages (for testing)", + ) + rootCmd.AddCommand(importGvoiceCmd) +} diff --git a/cmd/msgvault/cmd/sync_gvoice.go b/cmd/msgvault/cmd/sync_gvoice.go deleted file mode 100644 index de1cf744..00000000 --- a/cmd/msgvault/cmd/sync_gvoice.go +++ /dev/null @@ -1,179 +0,0 @@ -package cmd - -import ( - "context" - "fmt" - "os" - "os/signal" - "strings" - "syscall" - "time" - - "github.com/spf13/cobra" - "github.com/wesm/msgvault/internal/gvoice" - "github.com/wesm/msgvault/internal/store" - "github.com/wesm/msgvault/internal/sync" -) - -var ( - gvoiceBefore string - gvoiceAfter string - gvoiceLimit int - gvoiceNoResume bool -) 
- -var syncGvoiceCmd = &cobra.Command{ - Use: "sync-gvoice ", - Short: "Import Google Voice messages from Takeout export", - Long: `Import Google Voice SMS, MMS, and call records from a Google Takeout export. - -Reads HTML files from the Voice/Calls/ directory in a Takeout archive and -stores them in the msgvault archive alongside Gmail and iMessage data. - -The takeout-voice-dir argument should point to the "Voice" directory inside -the extracted Takeout archive, which contains "Calls/" and "Phones.vcf". - -Date filters: - --after 2020-01-01 Only messages on or after this date - --before 2024-12-31 Only messages before this date - -Examples: - msgvault sync-gvoice /path/to/Takeout/Voice - msgvault sync-gvoice /path/to/Takeout/Voice --after 2020-01-01 - msgvault sync-gvoice /path/to/Takeout/Voice --limit 100`, - Args: cobra.ExactArgs(1), - RunE: func(cmd *cobra.Command, args []string) error { - takeoutDir := args[0] - - // Open msgvault database - dbPath := cfg.DatabaseDSN() - s, err := store.Open(dbPath) - if err != nil { - return fmt.Errorf("open database: %w", err) - } - defer func() { _ = s.Close() }() - - if err := s.InitSchema(); err != nil { - return fmt.Errorf("init schema: %w", err) - } - - // Check takeout directory exists - if _, err := os.Stat(takeoutDir); os.IsNotExist(err) { - return fmt.Errorf("takeout directory not found: %s", takeoutDir) - } - - // Build client options - var clientOpts []gvoice.ClientOption - clientOpts = append(clientOpts, gvoice.WithLogger(logger)) - - if gvoiceAfter != "" { - t, err := time.Parse("2006-01-02", gvoiceAfter) - if err != nil { - return fmt.Errorf("invalid --after date: %w (use YYYY-MM-DD format)", err) - } - clientOpts = append(clientOpts, gvoice.WithAfterDate(t)) - } - - if gvoiceBefore != "" { - t, err := time.Parse("2006-01-02", gvoiceBefore) - if err != nil { - return fmt.Errorf("invalid --before date: %w (use YYYY-MM-DD format)", err) - } - clientOpts = append(clientOpts, gvoice.WithBeforeDate(t)) - } - - if 
gvoiceLimit > 0 { - clientOpts = append(clientOpts, gvoice.WithLimit(gvoiceLimit)) - } - - // Create Google Voice client - gvClient, err := gvoice.NewClient(takeoutDir, clientOpts...) - if err != nil { - return fmt.Errorf("open Google Voice takeout: %w", err) - } - defer func() { _ = gvClient.Close() }() - - identifier := gvClient.Identifier() - - // Set up context with cancellation - ctx, cancel := context.WithCancel(cmd.Context()) - defer cancel() - - // Handle Ctrl+C gracefully - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) - go func() { - <-sigChan - fmt.Println("\nInterrupted. Saving checkpoint...") - cancel() - }() - - // Set up sync options - opts := sync.DefaultOptions() - opts.NoResume = gvoiceNoResume - opts.SourceType = "google_voice" - opts.AttachmentsDir = cfg.AttachmentsDir() - - // Create syncer with progress reporter - syncer := sync.New(gvClient, s, opts). - WithLogger(logger). - WithProgress(&CLIProgress{}) - - // Run sync - startTime := time.Now() - fmt.Printf("Starting Google Voice import from %s\n", takeoutDir) - fmt.Printf("Google Voice number: %s\n", identifier) - if gvoiceAfter != "" || gvoiceBefore != "" { - parts := []string{} - if gvoiceAfter != "" { - parts = append(parts, "after "+gvoiceAfter) - } - if gvoiceBefore != "" { - parts = append(parts, "before "+gvoiceBefore) - } - fmt.Printf("Date filter: %s\n", strings.Join(parts, ", ")) - } - if gvoiceLimit > 0 { - fmt.Printf("Limit: %d messages\n", gvoiceLimit) - } - fmt.Println() - - summary, err := syncer.Full(ctx, identifier) - if err != nil { - if ctx.Err() != nil { - fmt.Println("\nSync interrupted. 
Run again to resume.") - return nil - } - return fmt.Errorf("sync failed: %w", err) - } - - // Print summary - fmt.Println() - fmt.Println("Google Voice import complete!") - fmt.Printf(" Duration: %s\n", summary.Duration.Round(time.Second)) - fmt.Printf(" Messages: %d found, %d added, %d skipped\n", - summary.MessagesFound, summary.MessagesAdded, summary.MessagesSkipped) - if summary.Errors > 0 { - fmt.Printf(" Errors: %d\n", summary.Errors) - } - if summary.WasResumed { - fmt.Printf(" (Resumed from checkpoint)\n") - } - - if summary.MessagesAdded > 0 { - elapsed := time.Since(startTime) - messagesPerSec := float64(summary.MessagesAdded) / elapsed.Seconds() - fmt.Printf(" Rate: %.1f messages/sec\n", messagesPerSec) - } - - return nil - }, -} - -func init() { - syncGvoiceCmd.Flags().StringVar(&gvoiceBefore, "before", "", "only messages before this date (YYYY-MM-DD)") - syncGvoiceCmd.Flags().StringVar(&gvoiceAfter, "after", "", "only messages after this date (YYYY-MM-DD)") - syncGvoiceCmd.Flags().IntVar(&gvoiceLimit, "limit", 0, "limit number of messages (for testing)") - syncGvoiceCmd.Flags().BoolVar(&gvoiceNoResume, "noresume", false, "force fresh sync (don't resume)") - rootCmd.AddCommand(syncGvoiceCmd) -} diff --git a/internal/gvoice/client.go b/internal/gvoice/client.go index 4a7dd032..f6395511 100644 --- a/internal/gvoice/client.go +++ b/internal/gvoice/client.go @@ -2,22 +2,23 @@ package gvoice import ( "context" + "database/sql" "fmt" "log/slog" "os" "path/filepath" "sort" - "strconv" "strings" "time" - "github.com/wesm/msgvault/internal/gmail" + "github.com/wesm/msgvault/internal/store" + "github.com/wesm/msgvault/internal/textimport" ) const defaultPageSize = 500 -// Client reads from a Google Voice Takeout export and implements the gmail.API -// interface so it can be used with the existing sync infrastructure. +// Client reads from a Google Voice Takeout export and imports messages +// into the msgvault store. 
type Client struct { takeoutDir string owner ownerPhones @@ -25,14 +26,13 @@ type Client struct { afterDate time.Time beforeDate time.Time limit int - returned int index []indexEntry indexBuilt bool logger *slog.Logger pageSize int - // LRU cache for parsed HTML files (avoid re-parsing when consecutive - // messages come from the same file) + // LRU cache for parsed HTML files (avoid re-parsing when + // consecutive messages come from the same file) lastFilePath string lastMessages []textMessage lastGroupPar []string @@ -51,7 +51,7 @@ func WithBeforeDate(t time.Time) ClientOption { return func(c *Client) { c.beforeDate = t } } -// WithLimit sets the maximum number of messages to return across all pages. +// WithLimit sets the maximum number of messages to import. func WithLimit(n int) ClientOption { return func(c *Client) { c.limit = n } } @@ -62,9 +62,11 @@ func WithLogger(l *slog.Logger) ClientOption { } // NewClient creates a Client from a Google Voice Takeout directory. -// The directory should be the "Voice" folder containing "Calls/" and "Phones.vcf". -func NewClient(takeoutDir string, opts ...ClientOption) (*Client, error) { - // Validate directory exists +// The directory should be the "Voice" folder containing "Calls/" +// and "Phones.vcf". 
+func NewClient( + takeoutDir string, opts ...ClientOption, +) (*Client, error) { info, err := os.Stat(takeoutDir) if err != nil { return nil, fmt.Errorf("takeout directory: %w", err) @@ -73,13 +75,14 @@ func NewClient(takeoutDir string, opts ...ClientOption) (*Client, error) { return nil, fmt.Errorf("not a directory: %s", takeoutDir) } - // Check for Calls subdirectory callsDir := filepath.Join(takeoutDir, "Calls") if _, err := os.Stat(callsDir); err != nil { - return nil, fmt.Errorf("calls directory not found in %s: %w", takeoutDir, err) + return nil, fmt.Errorf( + "calls directory not found in %s: %w", + takeoutDir, err, + ) } - // Parse Phones.vcf vcfPath := filepath.Join(takeoutDir, "Phones.vcf") vcfData, err := os.ReadFile(vcfPath) if err != nil { @@ -106,7 +109,7 @@ func NewClient(takeoutDir string, opts ...ClientOption) (*Client, error) { return c, nil } -// Identifier returns the Google Voice phone number used as source identifier. +// Identifier returns the Google Voice phone number. func (c *Client) Identifier() string { return c.identifier } @@ -116,9 +119,423 @@ func (c *Client) Close() error { return nil } -// buildIndex walks the Calls directory, parses each HTML file, and builds -// a sorted index of all messages and call records. This is done lazily -// on the first call to ListMessages. +// Import reads all matching messages from the Takeout export and +// writes them into the msgvault store. 
+func (c *Client) Import( + ctx context.Context, + s *store.Store, + sourceID int64, +) (*ImportSummary, error) { + if err := c.buildIndex(); err != nil { + return nil, fmt.Errorf("build index: %w", err) + } + + summary := &ImportSummary{} + + // Ensure labels + labelIDs, err := c.ensureLabels(s, sourceID) + if err != nil { + return nil, err + } + + // Caches + phoneCache := map[string]int64{} // phone → participantID + convCache := map[string]int64{} // threadID → conversationID + imported := 0 + + for _, entry := range c.index { + if err := ctx.Err(); err != nil { + return summary, err + } + if c.limit > 0 && imported >= c.limit { + break + } + + switch entry.FileType { + case fileTypeText, fileTypeGroup: + n, err := c.importTextEntry( + ctx, s, sourceID, &entry, + labelIDs, phoneCache, convCache, summary, + ) + if err != nil { + c.logger.Warn( + "failed to import text entry", + "id", entry.ID, + "error", err, + ) + summary.Skipped++ + continue + } + imported += n + + default: + if err := c.importCallEntry( + ctx, s, sourceID, &entry, + labelIDs, phoneCache, convCache, summary, + ); err != nil { + c.logger.Warn( + "failed to import call entry", + "id", entry.ID, + "error", err, + ) + summary.Skipped++ + continue + } + imported++ + } + } + + if err := s.RecomputeConversationStats(sourceID); err != nil { + return summary, fmt.Errorf("recompute stats: %w", err) + } + + return summary, nil +} + +func (c *Client) ensureLabels( + s *store.Store, sourceID int64, +) (map[string]int64, error) { + labels := map[string]int64{} + for _, name := range []string{ + "sms", "mms", "call_received", + "call_placed", "call_missed", "voicemail", + } { + id, err := s.EnsureLabel(sourceID, name, name, "user") + if err != nil { + return nil, fmt.Errorf("ensure label %q: %w", name, err) + } + labels[name] = id + } + return labels, nil +} + +func (c *Client) importTextEntry( + ctx context.Context, + s *store.Store, + sourceID int64, + entry *indexEntry, + labelIDs map[string]int64, + 
phoneCache map[string]int64, + convCache map[string]int64, + summary *ImportSummary, +) (int, error) { + messages, groupParticipants, err := c.getCachedMessages( + entry.FilePath, + ) + if err != nil { + return 0, err + } + + if entry.MessageIndex >= len(messages) { + return 0, fmt.Errorf( + "message index %d out of range (file has %d messages)", + entry.MessageIndex, len(messages), + ) + } + + msg := messages[entry.MessageIndex] + + // Resolve conversation + convType := "direct_chat" + title := "" + if entry.FileType == fileTypeGroup { + convType = "group_chat" + title = "Group" + } + + convID, err := c.ensureConv( + s, sourceID, entry.ThreadID, convType, title, + convCache, summary, + ) + if err != nil { + return 0, fmt.Errorf("ensure conversation: %w", err) + } + + // Resolve sender + senderID, err := c.resolveParticipant( + s, msg.SenderPhone, msg.SenderName, + phoneCache, summary, + ) + if err != nil { + return 0, fmt.Errorf("resolve sender: %w", err) + } + + // Ensure conversation participant + if senderID > 0 { + _ = s.EnsureConversationParticipant(convID, senderID, "member") + } + + // Ensure owner as participant + ownerID, err := c.resolveParticipant( + s, c.owner.GoogleVoice, "", + phoneCache, summary, + ) + if err == nil && ownerID > 0 { + _ = s.EnsureConversationParticipant(convID, ownerID, "member") + } + + // Ensure group participants + for _, phone := range groupParticipants { + pid, pErr := c.resolveParticipant( + s, phone, "", phoneCache, summary, + ) + if pErr == nil && pid > 0 { + _ = s.EnsureConversationParticipant( + convID, pid, "member", + ) + } + } + + // Build message + msgType := MessageTypeForFileType(entry.FileType) + isFromMe := msg.IsMe + + senderIDNull := sql.NullInt64{} + if senderID > 0 { + senderIDNull = sql.NullInt64{Int64: senderID, Valid: true} + } + + sentAt := sql.NullTime{} + if !msg.Timestamp.IsZero() { + sentAt = sql.NullTime{Time: msg.Timestamp, Valid: true} + } + + hasAttachments := len(msg.Attachments) > 0 + + msgID, err 
:= s.UpsertMessage(&store.Message{ + SourceID: sourceID, + SourceMessageID: entry.ID, + ConversationID: convID, + Snippet: nullStr(snippet(msg.Body, 100)), + SentAt: sentAt, + MessageType: msgType, + SenderID: senderIDNull, + IsFromMe: isFromMe, + HasAttachments: hasAttachments, + SizeEstimate: int64(len(msg.Body)), + }) + if err != nil { + return 0, fmt.Errorf("upsert message: %w", err) + } + + // Store body + if err := s.UpsertMessageBody( + msgID, + sql.NullString{String: msg.Body, Valid: msg.Body != ""}, + sql.NullString{}, + ); err != nil { + return 0, fmt.Errorf("upsert message body: %w", err) + } + + // Link labels + for _, labelName := range entry.Labels { + if lid, ok := labelIDs[labelName]; ok { + _ = s.LinkMessageLabel(msgID, lid) + } + } + // Add mms label if has attachments + if hasAttachments { + if lid, ok := labelIDs["mms"]; ok { + _ = s.LinkMessageLabel(msgID, lid) + } + } + + summary.MessagesImported++ + return 1, nil +} + +func (c *Client) importCallEntry( + ctx context.Context, + s *store.Store, + sourceID int64, + entry *indexEntry, + labelIDs map[string]int64, + phoneCache map[string]int64, + convCache map[string]int64, + summary *ImportSummary, +) error { + f, err := os.Open(entry.FilePath) + if err != nil { + return err + } + defer func() { _ = f.Close() }() + + record, err := parseCallHTML(f) + if err != nil { + return err + } + + // Resolve conversation + convID, err := c.ensureConv( + s, sourceID, entry.ThreadID, "direct_chat", "", + convCache, summary, + ) + if err != nil { + return fmt.Errorf("ensure conversation: %w", err) + } + + // Resolve contact + contactID, err := c.resolveParticipant( + s, record.Phone, record.Name, + phoneCache, summary, + ) + if err != nil { + return fmt.Errorf("resolve contact: %w", err) + } + if contactID > 0 { + _ = s.EnsureConversationParticipant( + convID, contactID, "member", + ) + } + + // Ensure owner as participant + ownerID, err := c.resolveParticipant( + s, c.owner.GoogleVoice, "", + phoneCache, 
summary, + ) + if err == nil && ownerID > 0 { + _ = s.EnsureConversationParticipant( + convID, ownerID, "member", + ) + } + + // Determine sender + var senderID int64 + isFromMe := false + switch record.CallType { + case fileTypePlaced: + senderID = ownerID + isFromMe = true + default: + senderID = contactID + } + + // Build body + var body strings.Builder + switch record.CallType { + case fileTypeReceived: + fmt.Fprintf(&body, "Received call from %s", record.Name) + case fileTypePlaced: + fmt.Fprintf(&body, "Placed call to %s", record.Name) + case fileTypeMissed: + fmt.Fprintf(&body, "Missed call from %s", record.Name) + case fileTypeVoicemail: + fmt.Fprintf(&body, "Voicemail from %s", record.Name) + } + if record.Duration != "" { + fmt.Fprintf(&body, " (%s)", formatDuration(record.Duration)) + } + + msgType := MessageTypeForFileType(entry.FileType) + + senderIDNull := sql.NullInt64{} + if senderID > 0 { + senderIDNull = sql.NullInt64{ + Int64: senderID, Valid: true, + } + } + + sentAt := sql.NullTime{} + if !record.Timestamp.IsZero() { + sentAt = sql.NullTime{ + Time: record.Timestamp, Valid: true, + } + } + + bodyStr := body.String() + msgID, err := s.UpsertMessage(&store.Message{ + SourceID: sourceID, + SourceMessageID: entry.ID, + ConversationID: convID, + Snippet: nullStr(snippet(bodyStr, 100)), + SentAt: sentAt, + MessageType: msgType, + SenderID: senderIDNull, + IsFromMe: isFromMe, + SizeEstimate: int64(len(bodyStr)), + }) + if err != nil { + return fmt.Errorf("upsert message: %w", err) + } + + // Store body + if err := s.UpsertMessageBody( + msgID, + sql.NullString{String: bodyStr, Valid: bodyStr != ""}, + sql.NullString{}, + ); err != nil { + return fmt.Errorf("upsert message body: %w", err) + } + + // Store raw HTML + rawData, rErr := os.ReadFile(entry.FilePath) + if rErr == nil { + _ = s.UpsertMessageRawWithFormat( + msgID, rawData, "gvoice_html", + ) + } + + // Link labels + for _, labelName := range entry.Labels { + if lid, ok := labelIDs[labelName]; 
ok { + _ = s.LinkMessageLabel(msgID, lid) + } + } + + summary.MessagesImported++ + return nil +} + +func (c *Client) ensureConv( + s *store.Store, + sourceID int64, + threadID, convType, title string, + cache map[string]int64, + summary *ImportSummary, +) (int64, error) { + if id, ok := cache[threadID]; ok { + return id, nil + } + id, err := s.EnsureConversationWithType( + sourceID, threadID, convType, title, + ) + if err != nil { + return 0, err + } + cache[threadID] = id + summary.ConversationsImported++ + return id, nil +} + +func (c *Client) resolveParticipant( + s *store.Store, + phone, displayName string, + cache map[string]int64, + summary *ImportSummary, +) (int64, error) { + if phone == "" { + return 0, nil + } + normalized, err := textimport.NormalizePhone(phone) + if err != nil { + return 0, nil // skip non-normalizable + } + if id, ok := cache[normalized]; ok { + return id, nil + } + id, err := s.EnsureParticipantByPhone( + normalized, displayName, "google_voice", + ) + if err != nil { + return 0, err + } + cache[normalized] = id + summary.ParticipantsResolved++ + return id, nil +} + +// buildIndex walks the Calls directory, parses each HTML file, and +// builds a sorted index of all messages and call records. func (c *Client) buildIndex() error { if c.indexBuilt { return nil @@ -150,20 +567,28 @@ func (c *Client) buildIndex() error { switch ft { case fileTypeText, fileTypeGroup: - entries, err := c.indexTextFile(filePath, name, ft) + ents, err := c.indexTextFile(filePath, name, ft) if err != nil { - c.logger.Warn("failed to index text file", "file", entry.Name(), "error", err) + c.logger.Warn( + "failed to index text file", + "file", entry.Name(), + "error", err, + ) continue } - index = append(index, entries...) + index = append(index, ents...) 
- case fileTypeReceived, fileTypePlaced, fileTypeMissed, fileTypeVoicemail: - entry, err := c.indexCallFile(filePath, name, ft) + default: + ent, err := c.indexCallFile(filePath, name, ft) if err != nil { - c.logger.Warn("failed to index call file", "file", entry.ID, "error", err) + c.logger.Warn( + "failed to index call file", + "file", entry.Name(), + "error", err, + ) continue } - index = append(index, *entry) + index = append(index, *ent) } } @@ -173,13 +598,13 @@ func (c *Client) buildIndex() error { if !c.afterDate.IsZero() && e.Timestamp.Before(c.afterDate) { continue } - if !c.beforeDate.IsZero() && !e.Timestamp.Before(c.beforeDate) { + if !c.beforeDate.IsZero() && + !e.Timestamp.Before(c.beforeDate) { continue } filtered = append(filtered, e) } - // Sort by timestamp sort.Slice(filtered, func(i, j int) bool { return filtered[i].Timestamp.Before(filtered[j].Timestamp) }) @@ -196,9 +621,9 @@ func (c *Client) buildIndex() error { return nil } -// indexTextFile parses a text/group conversation HTML and returns index entries -// for each individual message within it. 
-func (c *Client) indexTextFile(filePath, contactName string, ft fileType) ([]indexEntry, error) { +func (c *Client) indexTextFile( + filePath, contactName string, ft fileType, +) ([]indexEntry, error) { f, err := os.Open(filePath) if err != nil { return nil, err @@ -210,24 +635,27 @@ func (c *Client) indexTextFile(filePath, contactName string, ft fileType) ([]ind return nil, err } - var entries []indexEntry + var result []indexEntry for i, msg := range messages { - // Compute deterministic message ID bodyPrefix := msg.Body if len(bodyPrefix) > 50 { bodyPrefix = bodyPrefix[:50] } - id := computeMessageID(msg.SenderPhone, msg.Timestamp.Format(time.RFC3339Nano), bodyPrefix) + id := computeMessageID( + msg.SenderPhone, + msg.Timestamp.Format(time.RFC3339Nano), + bodyPrefix, + ) - // Compute thread ID var threadID string if ft == fileTypeGroup { - threadID = computeThreadID(c.owner.Cell, fileTypeGroup, "", groupParticipants) + threadID = computeThreadID( + c.owner.Cell, fileTypeGroup, + "", groupParticipants, + ) } else { - // For 1:1 texts, use the non-owner phone otherPhone := msg.SenderPhone if msg.IsMe { - // Need to find the other party — look through all messages for _, m := range messages { if !m.IsMe { otherPhone = m.SenderPhone @@ -235,12 +663,14 @@ func (c *Client) indexTextFile(filePath, contactName string, ft fileType) ([]ind } } } - threadID = computeThreadID(c.owner.Cell, fileTypeText, otherPhone, nil) + threadID = computeThreadID( + c.owner.Cell, fileTypeText, + otherPhone, nil, + ) } label := labelForFileType(ft) - - entries = append(entries, indexEntry{ + result = append(result, indexEntry{ ID: id, ThreadID: threadID, FilePath: filePath, @@ -251,11 +681,12 @@ func (c *Client) indexTextFile(filePath, contactName string, ft fileType) ([]ind }) } - return entries, nil + return result, nil } -// indexCallFile parses a call log HTML and returns a single index entry. 
-func (c *Client) indexCallFile(filePath, contactName string, ft fileType) (*indexEntry, error) { +func (c *Client) indexCallFile( + filePath, contactName string, ft fileType, +) (*indexEntry, error) { f, err := os.Open(filePath) if err != nil { return nil, err @@ -267,13 +698,17 @@ func (c *Client) indexCallFile(filePath, contactName string, ft fileType) (*inde return nil, err } - // Override file type from HTML if the filename didn't include it if record.CallType != 0 { ft = record.CallType } - id := computeMessageID(ft.String(), record.Phone, record.Timestamp.Format(time.RFC3339Nano)) - threadID := computeThreadID(c.owner.Cell, ft, record.Phone, nil) + id := computeMessageID( + ft.String(), record.Phone, + record.Timestamp.Format(time.RFC3339Nano), + ) + threadID := computeThreadID( + c.owner.Cell, ft, record.Phone, nil, + ) label := labelForFileType(ft) return &indexEntry{ @@ -286,261 +721,11 @@ func (c *Client) indexCallFile(filePath, contactName string, ft fileType) (*inde }, nil } -// GetProfile returns a profile with the GV phone as identifier and index size as total. -func (c *Client) GetProfile(ctx context.Context) (*gmail.Profile, error) { - if err := c.buildIndex(); err != nil { - return nil, err - } - - return &gmail.Profile{ - EmailAddress: c.identifier, - MessagesTotal: int64(len(c.index)), - HistoryID: uint64(len(c.index)), - }, nil -} - -// ListLabels returns the set of labels used for Google Voice messages. -func (c *Client) ListLabels(ctx context.Context) ([]*gmail.Label, error) { - return []*gmail.Label{ - {ID: "sms", Name: "SMS", Type: "user"}, - {ID: "call_received", Name: "Call Received", Type: "user"}, - {ID: "call_placed", Name: "Call Placed", Type: "user"}, - {ID: "call_missed", Name: "Call Missed", Type: "user"}, - {ID: "voicemail", Name: "Voicemail", Type: "user"}, - {ID: "mms", Name: "MMS", Type: "user"}, - }, nil -} - -// ListMessages returns a page of message IDs from the sorted index. 
-// The pageToken is the string representation of the offset into the index. -func (c *Client) ListMessages(ctx context.Context, query string, pageToken string) (*gmail.MessageListResponse, error) { - if err := c.buildIndex(); err != nil { - return nil, fmt.Errorf("build index: %w", err) - } - - // Check limit - if c.limit > 0 && c.returned >= c.limit { - return &gmail.MessageListResponse{}, nil - } - - offset := 0 - if pageToken != "" { - var err error - offset, err = strconv.Atoi(pageToken) - if err != nil { - return nil, fmt.Errorf("invalid page token: %w", err) - } - } - - if offset >= len(c.index) { - return &gmail.MessageListResponse{}, nil - } - - // Calculate page size respecting limit - pageSize := c.pageSize - if c.limit > 0 { - remaining := c.limit - c.returned - if remaining < pageSize { - pageSize = remaining - } - } - - end := offset + pageSize - if end > len(c.index) { - end = len(c.index) - } - - page := c.index[offset:end] - messages := make([]gmail.MessageID, len(page)) - for i, entry := range page { - messages[i] = gmail.MessageID{ - ID: entry.ID, - ThreadID: entry.ThreadID, - } - } - - c.returned += len(messages) - - var nextPageToken string - if end < len(c.index) && (c.limit <= 0 || c.returned < c.limit) { - nextPageToken = strconv.Itoa(end) - } - - totalEstimate := int64(len(c.index)) - - return &gmail.MessageListResponse{ - Messages: messages, - NextPageToken: nextPageToken, - ResultSizeEstimate: totalEstimate, - }, nil -} - -// GetMessageRaw fetches a single message by ID and builds synthetic MIME data. 
-func (c *Client) GetMessageRaw(ctx context.Context, messageID string) (*gmail.RawMessage, error) { - if err := c.buildIndex(); err != nil { - return nil, fmt.Errorf("build index: %w", err) - } - - // Linear scan for the entry (index is typically <300k entries) - var entry *indexEntry - for i := range c.index { - if c.index[i].ID == messageID { - entry = &c.index[i] - break - } - } - if entry == nil { - return nil, &gmail.NotFoundError{Path: "/messages/" + messageID} - } - - switch entry.FileType { - case fileTypeText, fileTypeGroup: - return c.buildTextMessage(entry) - case fileTypeReceived, fileTypePlaced, fileTypeMissed, fileTypeVoicemail: - return c.buildCallMessage(entry) - default: - return nil, fmt.Errorf("unknown file type for message %s", messageID) - } -} - -// buildTextMessage constructs a RawMessage from a text/group conversation entry. -func (c *Client) buildTextMessage(entry *indexEntry) (*gmail.RawMessage, error) { - messages, groupParticipants, err := c.getCachedMessages(entry.FilePath) - if err != nil { - return nil, err - } - - if entry.MessageIndex >= len(messages) { - return nil, fmt.Errorf("message index %d out of range (file has %d messages)", entry.MessageIndex, len(messages)) - } - - msg := messages[entry.MessageIndex] - - // Determine from and to addresses - var fromAddrs, toAddrs []string - - ownerEmail, _ := normalizeIdentifier(c.owner.GoogleVoice) - - if msg.IsMe { - fromAddrs = []string{ownerEmail} - if entry.FileType == fileTypeGroup { - for _, phone := range groupParticipants { - email, _ := normalizeIdentifier(phone) - toAddrs = append(toAddrs, email) - } - } else { - // 1:1 text — find the other party - for _, m := range messages { - if !m.IsMe { - email, _ := normalizeIdentifier(m.SenderPhone) - toAddrs = []string{email} - break - } - } - } - } else { - senderEmail, _ := normalizeIdentifier(msg.SenderPhone) - fromAddrs = []string{senderEmail} - toAddrs = []string{ownerEmail} - // In group conversations, add other participants - if 
entry.FileType == fileTypeGroup { - for _, phone := range groupParticipants { - email, _ := normalizeIdentifier(phone) - if email != senderEmail { - toAddrs = append(toAddrs, email) - } - } - } - } - - mimeData := buildMIME(fromAddrs, toAddrs, msg.Timestamp, entry.ID, msg.Body) - - internalDate := int64(0) - if !msg.Timestamp.IsZero() { - internalDate = msg.Timestamp.UnixMilli() - } - - // Check for MMS attachments - labels := entry.Labels - if len(msg.Attachments) > 0 { - labels = append(labels, "mms") - } - - return &gmail.RawMessage{ - ID: entry.ID, - ThreadID: entry.ThreadID, - LabelIDs: labels, - Snippet: snippet(msg.Body, 100), - HistoryID: uint64(entry.Timestamp.UnixNano()), - InternalDate: internalDate, - SizeEstimate: int64(len(mimeData)), - Raw: mimeData, - }, nil -} - -// buildCallMessage constructs a RawMessage from a call record entry. -func (c *Client) buildCallMessage(entry *indexEntry) (*gmail.RawMessage, error) { - f, err := os.Open(entry.FilePath) - if err != nil { - return nil, err - } - defer func() { _ = f.Close() }() - - record, err := parseCallHTML(f) - if err != nil { - return nil, err - } - - // Build a descriptive body for the call - var body strings.Builder - switch record.CallType { - case fileTypeReceived: - fmt.Fprintf(&body, "Received call from %s", record.Name) - case fileTypePlaced: - fmt.Fprintf(&body, "Placed call to %s", record.Name) - case fileTypeMissed: - fmt.Fprintf(&body, "Missed call from %s", record.Name) - case fileTypeVoicemail: - fmt.Fprintf(&body, "Voicemail from %s", record.Name) - } - if record.Duration != "" { - fmt.Fprintf(&body, " (%s)", formatDuration(record.Duration)) - } - - ownerEmail, _ := normalizeIdentifier(c.owner.GoogleVoice) - contactEmail, _ := normalizeIdentifier(record.Phone) - - var fromAddrs, toAddrs []string - switch record.CallType { - case fileTypeReceived, fileTypeMissed, fileTypeVoicemail: - fromAddrs = []string{contactEmail} - toAddrs = []string{ownerEmail} - case fileTypePlaced: - fromAddrs = 
[]string{ownerEmail} - toAddrs = []string{contactEmail} - } - - mimeData := buildMIME(fromAddrs, toAddrs, record.Timestamp, entry.ID, body.String()) - - internalDate := int64(0) - if !record.Timestamp.IsZero() { - internalDate = record.Timestamp.UnixMilli() - } - - return &gmail.RawMessage{ - ID: entry.ID, - ThreadID: entry.ThreadID, - LabelIDs: entry.Labels, - Snippet: snippet(body.String(), 100), - HistoryID: uint64(record.Timestamp.UnixNano()), - InternalDate: internalDate, - SizeEstimate: int64(len(mimeData)), - Raw: mimeData, - }, nil -} - -// getCachedMessages returns parsed messages for a file, using a simple cache. -func (c *Client) getCachedMessages(filePath string) ([]textMessage, []string, error) { +// getCachedMessages returns parsed messages for a file, using a +// simple LRU cache. +func (c *Client) getCachedMessages( + filePath string, +) ([]textMessage, []string, error) { if c.lastFilePath == filePath { return c.lastMessages, c.lastGroupPar, nil } @@ -563,45 +748,15 @@ func (c *Client) getCachedMessages(filePath string) ([]textMessage, []string, er return messages, groupParticipants, nil } -// GetMessagesRawBatch fetches multiple messages sequentially. -func (c *Client) GetMessagesRawBatch(ctx context.Context, messageIDs []string) ([]*gmail.RawMessage, error) { - results := make([]*gmail.RawMessage, len(messageIDs)) - for i, id := range messageIDs { - msg, err := c.GetMessageRaw(ctx, id) - if err != nil { - c.logger.Warn("failed to fetch message", "id", id, "error", err) - continue - } - results[i] = msg +func nullStr(s string) sql.NullString { + if s == "" { + return sql.NullString{} } - return results, nil + return sql.NullString{String: s, Valid: true} } -// ListHistory is not supported for Google Voice Takeout (static export). 
-func (c *Client) ListHistory(ctx context.Context, startHistoryID uint64, pageToken string) (*gmail.HistoryResponse, error) { - return &gmail.HistoryResponse{ - HistoryID: startHistoryID, - }, nil -} - -// TrashMessage is not supported for Google Voice Takeout. -func (c *Client) TrashMessage(ctx context.Context, messageID string) error { - return fmt.Errorf("trash not supported for Google Voice Takeout") -} - -// DeleteMessage is not supported for Google Voice Takeout. -func (c *Client) DeleteMessage(ctx context.Context, messageID string) error { - return fmt.Errorf("delete not supported for Google Voice Takeout") -} - -// BatchDeleteMessages is not supported for Google Voice Takeout. -func (c *Client) BatchDeleteMessages(ctx context.Context, messageIDs []string) error { - return fmt.Errorf("batch delete not supported for Google Voice Takeout") -} - -// formatDuration converts ISO 8601 duration (PT1M23S) to human-readable format. +// formatDuration converts ISO 8601 duration to human-readable format. func formatDuration(iso string) string { - // Parse PT{hours}H{minutes}M{seconds}S iso = strings.TrimPrefix(iso, "PT") var parts []string @@ -622,6 +777,3 @@ func formatDuration(iso string) string { } return strings.Join(parts, " ") } - -// Ensure Client implements gmail.API. -var _ gmail.API = (*Client)(nil) diff --git a/internal/gvoice/models.go b/internal/gvoice/models.go index 91e7ecae..3bed1b13 100644 --- a/internal/gvoice/models.go +++ b/internal/gvoice/models.go @@ -96,3 +96,26 @@ type callRecord struct { Duration string // ISO 8601 duration (e.g., "PT1M23S") Labels []string // from the HTML tags section } + +// ImportSummary holds statistics from a completed import run. +type ImportSummary struct { + MessagesImported int + ConversationsImported int + ParticipantsResolved int + Skipped int +} + +// MessageTypeForFileType maps a Google Voice file type to the +// message_type string stored in the database. 
+func MessageTypeForFileType(ft fileType) string { + switch ft { + case fileTypeText, fileTypeGroup: + return "google_voice_text" + case fileTypeReceived, fileTypePlaced, fileTypeMissed: + return "google_voice_call" + case fileTypeVoicemail: + return "google_voice_voicemail" + default: + return "google_voice_text" + } +} diff --git a/internal/gvoice/parser.go b/internal/gvoice/parser.go index 65f4974c..d54cceac 100644 --- a/internal/gvoice/parser.go +++ b/internal/gvoice/parser.go @@ -6,12 +6,12 @@ import ( "crypto/sha256" "fmt" "io" - "net/mail" "regexp" "sort" "strings" "time" + "github.com/wesm/msgvault/internal/textimport" "golang.org/x/net/html" ) @@ -105,7 +105,10 @@ func parseVCF(data []byte) (ownerPhones, error) { } if strings.HasPrefix(line, "TEL;TYPE=CELL:") { - phones.Cell = normalizePhone(strings.TrimPrefix(line, "TEL;TYPE=CELL:")) + raw := strings.TrimPrefix(line, "TEL;TYPE=CELL:") + if p, err := textimport.NormalizePhone(raw); err == nil { + phones.Cell = p + } } } @@ -113,7 +116,9 @@ func parseVCF(data []byte) (ownerPhones, error) { for prefix, label := range itemLabels { if label == "Google Voice" { if tel, ok := itemTels[prefix]; ok { - phones.GoogleVoice = normalizePhone(tel) + if p, err := textimport.NormalizePhone(tel); err == nil { + phones.GoogleVoice = p + } break } } @@ -145,8 +150,10 @@ func parseTextHTML(r io.Reader) ([]textMessage, []string, error) { if link.Type == html.ElementNode && link.Data == "a" && hasClass(link, "tel") { href := getAttr(link, "href") if strings.HasPrefix(href, "tel:") { - phone := normalizePhone(strings.TrimPrefix(href, "tel:")) - groupParticipants = append(groupParticipants, phone) + raw := strings.TrimPrefix(href, "tel:") + if p, err := textimport.NormalizePhone(raw); err == nil { + groupParticipants = append(groupParticipants, p) + } } } return false @@ -195,7 +202,10 @@ func parseMessageDiv(div *html.Node) textMessage { // Sender phone href := getAttr(n, "href") if strings.HasPrefix(href, "tel:") { - 
msg.SenderPhone = normalizePhone(strings.TrimPrefix(href, "tel:")) + raw := strings.TrimPrefix(href, "tel:") + if p, err := textimport.NormalizePhone(raw); err == nil { + msg.SenderPhone = p + } } // Sender name from child or walkNodes(n, func(child *html.Node) bool { @@ -299,7 +309,10 @@ func parseCallHTML(r io.Reader) (*callRecord, error) { if link.Type == html.ElementNode && link.Data == "a" && hasClass(link, "tel") { href := getAttr(link, "href") if strings.HasPrefix(href, "tel:") { - record.Phone = normalizePhone(strings.TrimPrefix(href, "tel:")) + raw := strings.TrimPrefix(href, "tel:") + if p, err := textimport.NormalizePhone(raw); err == nil { + record.Phone = p + } } walkNodes(link, func(fn *html.Node) bool { if fn.Type == html.ElementNode && fn.Data == "span" && hasClass(fn, "fn") { @@ -364,90 +377,6 @@ func parseCallHTML(r io.Reader) (*callRecord, error) { return record, nil } -// normalizePhone strips non-digit characters from a phone number and attempts -// to produce a consistent E.164-like format. -func normalizePhone(phone string) string { - hasPlus := strings.HasPrefix(phone, "+") - - var digits strings.Builder - for _, r := range phone { - if r >= '0' && r <= '9' { - digits.WriteRune(r) - } - } - d := digits.String() - if d == "" { - return phone - } - - if hasPlus { - return "+" + d - } - if len(d) == 10 { - return "+1" + d - } - if len(d) == 11 && d[0] == '1' { - return "+" + d - } - return "+" + d -} - -// normalizeIdentifier converts a phone number into an email-like identifier -// using the @phone.gvoice domain. -func normalizeIdentifier(phone string) (email, domain string) { - phone = normalizePhone(phone) - return phone + "@phone.gvoice", "phone.gvoice" -} - -// buildMIME constructs a minimal RFC 2822 message from Google Voice data. 
-func buildMIME(from, to []string, date time.Time, messageID, body string) []byte { - var b strings.Builder - - if len(from) > 0 { - b.WriteString("From: ") - b.WriteString(formatMIMEAddress(from[0])) - b.WriteString("\r\n") - } - - if len(to) > 0 { - b.WriteString("To: ") - for i, addr := range to { - if i > 0 { - b.WriteString(", ") - } - b.WriteString(formatMIMEAddress(addr)) - } - b.WriteString("\r\n") - } - - if !date.IsZero() { - b.WriteString("Date: ") - b.WriteString(date.Format(time.RFC1123Z)) - b.WriteString("\r\n") - } - - b.WriteString("Subject: \r\n") - - if messageID != "" { - fmt.Fprintf(&b, "Message-ID: <%s@gvoice.local>\r\n", messageID) - } - - b.WriteString("MIME-Version: 1.0\r\n") - b.WriteString("Content-Type: text/plain; charset=utf-8\r\n") - b.WriteString("\r\n") - - if body != "" { - b.WriteString(body) - } - - return []byte(b.String()) -} - -// formatMIMEAddress formats an email address for MIME headers. -func formatMIMEAddress(addr string) string { - return (&mail.Address{Address: addr}).String() -} - // snippet returns the first n characters of s, suitable for message preview. 
func snippet(s string, maxLen int) string { s = strings.Join(strings.Fields(s), " ") diff --git a/internal/gvoice/parser_test.go b/internal/gvoice/parser_test.go index e03bb7ec..2fa937c4 100644 --- a/internal/gvoice/parser_test.go +++ b/internal/gvoice/parser_test.go @@ -334,30 +334,6 @@ func TestParseCallHTML_Placed(t *testing.T) { } } -func TestNormalizePhone(t *testing.T) { - tests := []struct { - input string - want string - }{ - {"+12023065386", "+12023065386"}, - {"(202) 306-5386", "+12023065386"}, - {"2023065386", "+12023065386"}, - {"+442071234567", "+442071234567"}, - {"12023065386", "+12023065386"}, - {"+1 (202) 306-5386", "+12023065386"}, - {"", ""}, - } - - for _, tt := range tests { - t.Run(tt.input, func(t *testing.T) { - got := normalizePhone(tt.input) - if got != tt.want { - t.Errorf("normalizePhone(%q) = %q, want %q", tt.input, got, tt.want) - } - }) - } -} - func TestComputeMessageID(t *testing.T) { id1 := computeMessageID("+12023065386", "2020-02-03T11:37:45Z", "Hello") id2 := computeMessageID("+12023065386", "2020-02-03T11:37:45Z", "Hello") @@ -374,16 +350,6 @@ func TestComputeMessageID(t *testing.T) { } } -func TestNormalizeIdentifier(t *testing.T) { - email, domain := normalizeIdentifier("+12023065386") - if email != "+12023065386@phone.gvoice" { - t.Errorf("email = %q, want +12023065386@phone.gvoice", email) - } - if domain != "phone.gvoice" { - t.Errorf("domain = %q, want phone.gvoice", domain) - } -} - func TestFormatDuration(t *testing.T) { tests := []struct { input string From 2ebe5acd68feed9891e4cff65f3fdbfcaa6f6b77 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:32:13 -0500 Subject: [PATCH 17/65] Rename WhatsApp import CLI to import-whatsapp Consistent naming with import-imessage and import-gvoice. The --type flag is removed since each source has its own subcommand. --phone is now a required flag instead of validated at runtime. 
--- cmd/msgvault/cmd/import.go | 55 +++++++++++++++----------------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/cmd/msgvault/cmd/import.go b/cmd/msgvault/cmd/import.go index 3ba45d30..cf16a0fc 100644 --- a/cmd/msgvault/cmd/import.go +++ b/cmd/msgvault/cmd/import.go @@ -16,7 +16,6 @@ import ( ) var ( - importType string importPhone string importMediaDir string importContacts string @@ -24,41 +23,30 @@ var ( importDisplayName string ) -var importCmd = &cobra.Command{ - Use: "import [path]", - Short: "Import messages from external sources", - Long: `Import messages from external message databases. - -Currently supported types: - whatsapp Import from a decrypted WhatsApp msgstore.db +var importWhatsappCmd = &cobra.Command{ + Use: "import-whatsapp ", + Short: "Import WhatsApp messages from decrypted backup", + Long: `Import messages from a decrypted WhatsApp msgstore.db backup. Examples: - msgvault import --type whatsapp --phone "+447700900000" /path/to/msgstore.db - msgvault import --type whatsapp --phone "+447700900000" --contacts ~/contacts.vcf /path/to/msgstore.db - msgvault import --type whatsapp --phone "+447700900000" --media-dir /path/to/Media /path/to/msgstore.db`, + msgvault import-whatsapp --phone "+447700900000" /path/to/msgstore.db + msgvault import-whatsapp --phone "+447700900000" --contacts ~/contacts.vcf /path/to/msgstore.db + msgvault import-whatsapp --phone "+447700900000" --media-dir /path/to/Media /path/to/msgstore.db`, Args: cobra.ExactArgs(1), RunE: func(cmd *cobra.Command, args []string) error { - if err := MustBeLocal("import"); err != nil { + if err := MustBeLocal("import-whatsapp"); err != nil { return err } - - sourcePath := args[0] - - // Validate source file exists. 
- if _, err := os.Stat(sourcePath); err != nil { - return fmt.Errorf("source file not found: %w", err) - } - - switch strings.ToLower(importType) { - case "whatsapp": - return runWhatsAppImport(cmd, sourcePath) - default: - return fmt.Errorf("unsupported import type %q (supported: whatsapp)", importType) - } + return runWhatsAppImport(cmd, args[0]) }, } func runWhatsAppImport(cmd *cobra.Command, sourcePath string) error { + // Validate source file exists. + if _, err := os.Stat(sourcePath); err != nil { + return fmt.Errorf("source file not found: %w", err) + } + // Validate phone number. if importPhone == "" { return fmt.Errorf("--phone is required for WhatsApp import (E.164 format, e.g., +447700900000)") @@ -226,12 +214,11 @@ func (p *ImportCLIProgress) OnError(err error) { } func init() { - importCmd.Flags().StringVar(&importType, "type", "", "import source type (required: whatsapp)") - importCmd.Flags().StringVar(&importPhone, "phone", "", "your phone number in E.164 format (required for whatsapp)") - importCmd.Flags().StringVar(&importMediaDir, "media-dir", "", "path to decrypted Media folder (optional)") - importCmd.Flags().StringVar(&importContacts, "contacts", "", "path to contacts .vcf file for name resolution (optional)") - importCmd.Flags().IntVar(&importLimit, "limit", 0, "limit number of messages (for testing)") - importCmd.Flags().StringVar(&importDisplayName, "display-name", "", "display name for the phone owner") - _ = importCmd.MarkFlagRequired("type") - rootCmd.AddCommand(importCmd) + importWhatsappCmd.Flags().StringVar(&importPhone, "phone", "", "your phone number in E.164 format (required)") + importWhatsappCmd.Flags().StringVar(&importMediaDir, "media-dir", "", "path to decrypted Media folder (optional)") + importWhatsappCmd.Flags().StringVar(&importContacts, "contacts", "", "path to contacts .vcf file for name resolution (optional)") + importWhatsappCmd.Flags().IntVar(&importLimit, "limit", 0, "limit number of messages (for testing)") + 
importWhatsappCmd.Flags().StringVar(&importDisplayName, "display-name", "", "display name for the phone owner") + _ = importWhatsappCmd.MarkFlagRequired("phone") + rootCmd.AddCommand(importWhatsappCmd) } From c7ce63b830f8dd1e7670cd33918e43590fb9ce6d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:37:26 -0500 Subject: [PATCH 18/65] Extend Parquet cache with text message columns - Add conversation_type to conversations Parquet export - Add conversation_type to DuckDB optional-column probe and parquetCTEs (falls back to 'email' for old caches) - Bump cacheSchemaVersion to 5 to force full rebuild on upgrade - Add email-only filter to DuckDB buildFilterConditions and buildWhereClause so existing email TUI views exclude WhatsApp/iMessage/GVoice messages - Apply same email-only filter to SQLite optsToFilterConditions and buildFilterJoinsAndConditions - Update test schemas to include conversation_type column Co-Authored-By: Claude Sonnet 4.6 --- cmd/msgvault/cmd/build_cache.go | 5 +++-- cmd/msgvault/cmd/build_cache_test.go | 12 +++++++----- internal/query/duckdb.go | 14 ++++++++++++++ internal/query/sqlite.go | 8 ++++++++ 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/cmd/msgvault/cmd/build_cache.go b/cmd/msgvault/cmd/build_cache.go index 7cd6c952..649ecc5e 100644 --- a/cmd/msgvault/cmd/build_cache.go +++ b/cmd/msgvault/cmd/build_cache.go @@ -32,7 +32,7 @@ var buildCacheMu sync.Mutex // columns are added/removed/renamed in the COPY queries below so that // incremental builds automatically trigger a full rebuild instead of // producing Parquet files with mismatched schemas. -const cacheSchemaVersion = 4 // v4: add source_type to sources Parquet; strip \r\n in SanitizeTerminal +const cacheSchemaVersion = 5 // v5: add conversation_type to conversations Parquet // syncState tracks the message and sync-run watermarks covered by the cache. 
type syncState struct { @@ -441,7 +441,8 @@ func buildCache(dbPath, analyticsDir string, fullRebuild bool) (*buildResult, er SELECT id, COALESCE(TRY_CAST(source_conversation_id AS VARCHAR), '') as source_conversation_id, - COALESCE(TRY_CAST(title AS VARCHAR), '') as title + COALESCE(TRY_CAST(title AS VARCHAR), '') as title, + COALESCE(TRY_CAST(conversation_type AS VARCHAR), 'email') as conversation_type FROM sqlite_db.conversations ) TO '%s/conversations.parquet' ( FORMAT PARQUET, diff --git a/cmd/msgvault/cmd/build_cache_test.go b/cmd/msgvault/cmd/build_cache_test.go index 71f82aa7..b8e858b2 100644 --- a/cmd/msgvault/cmd/build_cache_test.go +++ b/cmd/msgvault/cmd/build_cache_test.go @@ -101,7 +101,8 @@ func setupTestSQLite(t *testing.T) (string, func()) { id INTEGER PRIMARY KEY, source_id INTEGER NOT NULL REFERENCES sources(id), source_conversation_id TEXT, - title TEXT + title TEXT, + conversation_type TEXT NOT NULL DEFAULT 'email' ); ` @@ -1138,7 +1139,7 @@ func TestBuildCache_EmptyDatabase(t *testing.T) { CREATE TABLE labels (id INTEGER PRIMARY KEY, name TEXT); CREATE TABLE message_labels (message_id INTEGER, label_id INTEGER); CREATE TABLE attachments (message_id INTEGER, size INTEGER, filename TEXT); - CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT, title TEXT); + CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT, title TEXT, conversation_type TEXT NOT NULL DEFAULT 'email'); `) _ = db.Close() @@ -1338,7 +1339,7 @@ func BenchmarkBuildCache(b *testing.B) { CREATE TABLE labels (id INTEGER PRIMARY KEY, name TEXT); CREATE TABLE message_labels (message_id INTEGER, label_id INTEGER); CREATE TABLE attachments (message_id INTEGER, size INTEGER, filename TEXT); - CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT, title TEXT); + CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT, title TEXT, conversation_type TEXT NOT NULL DEFAULT 'email'); INSERT 
INTO sources VALUES (1, 'test@gmail.com'); INSERT INTO labels VALUES (1, 'INBOX'), (2, 'Work'); `) @@ -1466,7 +1467,8 @@ func setupTestSQLiteEmpty(t *testing.T) (string, func()) { id INTEGER PRIMARY KEY, source_id INTEGER NOT NULL REFERENCES sources(id), source_conversation_id TEXT, - title TEXT + title TEXT, + conversation_type TEXT NOT NULL DEFAULT 'email' ); ` if _, err := db.Exec(schema); err != nil { @@ -1969,7 +1971,7 @@ func BenchmarkBuildCacheIncremental(b *testing.B) { CREATE TABLE labels (id INTEGER PRIMARY KEY, name TEXT); CREATE TABLE message_labels (message_id INTEGER, label_id INTEGER); CREATE TABLE attachments (message_id INTEGER, size INTEGER, filename TEXT); - CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT, title TEXT); + CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_conversation_id TEXT, title TEXT, conversation_type TEXT NOT NULL DEFAULT 'email'); INSERT INTO sources VALUES (1, 'test@gmail.com'); INSERT INTO labels VALUES (1, 'INBOX'); INSERT INTO participants VALUES (1, 'alice@example.com', 'example.com', 'Alice', NULL); diff --git a/internal/query/duckdb.go b/internal/query/duckdb.go index 375f1c08..b8872ed4 100644 --- a/internal/query/duckdb.go +++ b/internal/query/duckdb.go @@ -152,6 +152,7 @@ func NewDuckDBEngine(analyticsDir string, sqlitePath string, sqliteDB *sql.DB, o {"messages", "sender_id"}, {"messages", "message_type"}, {"conversations", "title"}, + {"conversations", "conversation_type"}, {"sources", "source_type"}, } { if !engine.optionalCols[col.table][col.col] { @@ -306,6 +307,11 @@ func (e *DuckDBEngine) parquetCTEs() string { } else { convExtra = append(convExtra, "'' AS title") } + if e.hasCol("conversations", "conversation_type") { + convReplace = append(convReplace, "COALESCE(CAST(conversation_type AS VARCHAR), 'email') AS conversation_type") + } else { + convExtra = append(convExtra, "'email' AS conversation_type") + } convCTE := fmt.Sprintf("SELECT * REPLACE 
(\n\t\t\t\t%s\n\t\t\t)", strings.Join(convReplace, ",\n\t\t\t\t")) if len(convExtra) > 0 { convCTE += ", " + strings.Join(convExtra, ", ") @@ -602,6 +608,10 @@ func (e *DuckDBEngine) buildWhereClause(opts AggregateOptions, keyColumns ...str var conditions []string var args []interface{} + // Exclude text messages from email-mode queries. + // message_type IS NULL and '' handle old data without the column. + conditions = append(conditions, "(msg.message_type = 'email' OR msg.message_type IS NULL OR msg.message_type = '')") + if opts.SourceID != nil { conditions = append(conditions, "msg.source_id = ?") args = append(args, *opts.SourceID) @@ -804,6 +814,10 @@ func (e *DuckDBEngine) buildFilterConditions(filter MessageFilter) (string, []in var conditions []string var args []interface{} + // Exclude text messages from email-mode queries. + // message_type IS NULL and '' handle old data without the column. + conditions = append(conditions, "(msg.message_type = 'email' OR msg.message_type IS NULL OR msg.message_type = '')") + if filter.SourceID != nil { conditions = append(conditions, "msg.source_id = ?") args = append(args, *filter.SourceID) diff --git a/internal/query/sqlite.go b/internal/query/sqlite.go index e47548e3..400b0eca 100644 --- a/internal/query/sqlite.go +++ b/internal/query/sqlite.go @@ -185,6 +185,10 @@ func optsToFilterConditions(opts AggregateOptions, prefix string) ([]string, []i var conditions []string var args []interface{} + // Exclude text messages from email-mode queries. + // message_type IS NULL and '' handle old data without the column. 
+ conditions = append(conditions, "("+prefix+"message_type = 'email' OR "+prefix+"message_type IS NULL OR "+prefix+"message_type = '')") + if opts.SourceID != nil { conditions = append(conditions, prefix+"source_id = ?") args = append(args, *opts.SourceID) @@ -253,6 +257,10 @@ func buildFilterJoinsAndConditions(filter MessageFilter, tableAlias string) (str // Include all messages (deleted messages shown with indicator in TUI) + // Exclude text messages from email-mode queries. + // message_type IS NULL and '' handle old data without the column. + conditions = append(conditions, "("+prefix+"message_type = 'email' OR "+prefix+"message_type IS NULL OR "+prefix+"message_type = '')") + if filter.SourceID != nil { conditions = append(conditions, prefix+"source_id = ?") args = append(args, *filter.SourceID) From 4725be3c3684ad1cabbaf5500db1b4f8dbb1030f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:39:17 -0500 Subject: [PATCH 19/65] Add TextEngine interface and text query types --- internal/query/text_engine.go | 29 +++++++++++ internal/query/text_models.go | 95 +++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 internal/query/text_engine.go create mode 100644 internal/query/text_models.go diff --git a/internal/query/text_engine.go b/internal/query/text_engine.go new file mode 100644 index 00000000..1fe6987f --- /dev/null +++ b/internal/query/text_engine.go @@ -0,0 +1,29 @@ +package query + +import "context" + +// TextEngine provides query operations for text message data. +// This is a separate interface from Engine to avoid rippling text +// query methods through remote/API/MCP/mock layers. +// DuckDBEngine and SQLiteEngine implement both Engine and TextEngine. +type TextEngine interface { + // ListConversations returns conversations matching the filter. 
+ ListConversations(ctx context.Context, + filter TextFilter) ([]ConversationRow, error) + + // TextAggregate aggregates text messages by the given view type. + TextAggregate(ctx context.Context, viewType TextViewType, + opts TextAggregateOptions) ([]AggregateRow, error) + + // ListConversationMessages returns messages within a conversation. + ListConversationMessages(ctx context.Context, convID int64, + filter TextFilter) ([]MessageSummary, error) + + // TextSearch performs plain full-text search over text messages. + TextSearch(ctx context.Context, query string, + limit, offset int) ([]MessageSummary, error) + + // GetTextStats returns aggregate stats for text messages. + GetTextStats(ctx context.Context, + opts TextStatsOptions) (*TotalStats, error) +} diff --git a/internal/query/text_models.go b/internal/query/text_models.go new file mode 100644 index 00000000..db6bf521 --- /dev/null +++ b/internal/query/text_models.go @@ -0,0 +1,95 @@ +package query + +import "time" + +// TextViewType represents the type of view in Texts mode. +type TextViewType int + +const ( + TextViewConversations TextViewType = iota + TextViewContacts + TextViewContactNames + TextViewSources + TextViewLabels + TextViewTime + TextViewTypeCount +) + +func (v TextViewType) String() string { + switch v { + case TextViewConversations: + return "Conversations" + case TextViewContacts: + return "Contacts" + case TextViewContactNames: + return "Contact Names" + case TextViewSources: + return "Sources" + case TextViewLabels: + return "Labels" + case TextViewTime: + return "Time" + default: + return "Unknown" + } +} + +// ConversationRow represents a conversation in the Conversations view. +type ConversationRow struct { + ConversationID int64 + Title string + SourceType string + MessageCount int64 + ParticipantCount int64 + LastMessageAt time.Time + LastPreview string +} + +// TextFilter specifies which text messages to retrieve. 
+type TextFilter struct { + SourceID *int64 + ConversationID *int64 + ContactPhone string + ContactName string + SourceType string + Label string + TimeRange TimeRange + After *time.Time + Before *time.Time + Pagination Pagination + SortField SortField + SortDirection SortDirection +} + +// TextAggregateOptions configures a text aggregate query. +type TextAggregateOptions struct { + SourceID *int64 + After *time.Time + Before *time.Time + SortField SortField + SortDirection SortDirection + Limit int + TimeGranularity TimeGranularity + SearchQuery string +} + +// TextStatsOptions configures a text stats query. +type TextStatsOptions struct { + SourceID *int64 + SearchQuery string +} + +// TextMessageTypes lists the message_type values included in Texts mode. +var TextMessageTypes = []string{ + "whatsapp", "imessage", "sms", "google_voice_text", +} + +// IsTextMessageType returns true if the given type is a text message type. +func IsTextMessageType(mt string) bool { + for _, t := range TextMessageTypes { + if t == mt { + return true + } + } + return false +} From d72836ed3fb05d54b265165d562f9bf92d239aa7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:43:11 -0500 Subject: [PATCH 20/65] Implement TextEngine on DuckDBEngine Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/query/duckdb_text.go | 524 ++++++++++++++++++++++++++++++++++ 1 file changed, 524 insertions(+) create mode 100644 internal/query/duckdb_text.go diff --git a/internal/query/duckdb_text.go b/internal/query/duckdb_text.go new file mode 100644 index 00000000..8834da06 --- /dev/null +++ b/internal/query/duckdb_text.go @@ -0,0 +1,524 @@ +package query + +import ( + "context" + "database/sql" + "fmt" + "strings" +) + +// Compile-time interface assertion. +var _ TextEngine = (*DuckDBEngine)(nil) + +// textTypeFilter returns a SQL condition restricting to text message types. 
+func textTypeFilter() string { + return "msg.message_type IN ('whatsapp','imessage','sms','google_voice_text')" +} + +// buildTextFilterConditions builds WHERE conditions from a TextFilter. +// All conditions use the msg. prefix and assume the standard parquetCTEs. +func (e *DuckDBEngine) buildTextFilterConditions( + filter TextFilter, +) (string, []interface{}) { + conditions := []string{textTypeFilter()} + var args []interface{} + + if filter.SourceID != nil { + conditions = append(conditions, "msg.source_id = ?") + args = append(args, *filter.SourceID) + } + if filter.ConversationID != nil { + conditions = append(conditions, "msg.conversation_id = ?") + args = append(args, *filter.ConversationID) + } + if filter.ContactPhone != "" { + conditions = append(conditions, `EXISTS ( + SELECT 1 FROM mr + JOIN p ON p.id = mr.participant_id + WHERE mr.message_id = msg.id + AND p.phone_number = ? + )`) + args = append(args, filter.ContactPhone) + } + if filter.ContactName != "" { + conditions = append(conditions, `EXISTS ( + SELECT 1 FROM mr + JOIN p ON p.id = mr.participant_id + WHERE mr.message_id = msg.id + AND COALESCE(NULLIF(TRIM(p.display_name), ''), p.email_address) = ? + )`) + args = append(args, filter.ContactName) + } + if filter.SourceType != "" { + conditions = append(conditions, `EXISTS ( + SELECT 1 FROM src + WHERE src.id = msg.source_id AND src.source_type = ? + )`) + args = append(args, filter.SourceType) + } + if filter.Label != "" { + conditions = append(conditions, `EXISTS ( + SELECT 1 FROM ml + JOIN lbl ON lbl.id = ml.label_id + WHERE ml.message_id = msg.id + AND lbl.name ILIKE ? 
ESCAPE '\' + )`) + args = append(args, escapeILIKE(filter.Label)) + } + if filter.TimeRange.Period != "" { + g := inferTimeGranularity( + filter.TimeRange.Granularity, filter.TimeRange.Period, + ) + conditions = append(conditions, + fmt.Sprintf("%s = ?", timeExpr(g))) + args = append(args, filter.TimeRange.Period) + } + if filter.After != nil { + conditions = append(conditions, + "msg.sent_at >= CAST(? AS TIMESTAMP)") + args = append(args, + filter.After.Format("2006-01-02 15:04:05")) + } + if filter.Before != nil { + conditions = append(conditions, + "msg.sent_at < CAST(? AS TIMESTAMP)") + args = append(args, + filter.Before.Format("2006-01-02 15:04:05")) + } + + return strings.Join(conditions, " AND "), args +} + +// ListConversations returns conversations matching the filter, +// aggregating stats from the messages Parquet table. +func (e *DuckDBEngine) ListConversations( + ctx context.Context, filter TextFilter, +) ([]ConversationRow, error) { + where, args := e.buildTextFilterConditions(filter) + + // Sort clause + orderBy := "last_message_at DESC" + switch filter.SortField { + case SortByCount: + orderBy = "message_count" + case SortBySize: + orderBy = "total_size" + case SortByName: + orderBy = "title" + } + if filter.SortField != 0 { + if filter.SortDirection == SortAsc { + orderBy += " ASC" + } else { + orderBy += " DESC" + } + } + + limit := filter.Pagination.Limit + if limit == 0 { + limit = 100 + } + + query := fmt.Sprintf(` + WITH %s, + conv_stats AS ( + SELECT + msg.conversation_id, + COUNT(*) AS message_count, + COUNT(DISTINCT COALESCE(msg.sender_id, 0)) AS participant_count, + MAX(msg.sent_at) AS last_message_at, + COALESCE(SUM(CAST(msg.size_estimate AS BIGINT)), 0) AS total_size, + FIRST(msg.snippet ORDER BY msg.sent_at DESC) AS last_preview, + FIRST(msg.source_id) AS source_id + FROM msg + WHERE %s + GROUP BY msg.conversation_id + ) + SELECT + conv.id, + COALESCE(conv.title, '') AS title, + COALESCE(src.source_type, '') AS source_type, + 
cs.message_count, + cs.participant_count, + cs.last_message_at, + COALESCE(cs.last_preview, '') AS last_preview + FROM conv_stats cs + JOIN conv ON conv.id = cs.conversation_id + LEFT JOIN src ON src.id = cs.source_id + ORDER BY %s + LIMIT ? OFFSET ? + `, e.parquetCTEs(), where, orderBy) + + args = append(args, limit, filter.Pagination.Offset) + + rows, err := e.db.QueryContext(ctx, query, args...) + if err != nil { + return nil, fmt.Errorf("list conversations: %w", err) + } + defer func() { _ = rows.Close() }() + + var results []ConversationRow + for rows.Next() { + var row ConversationRow + var lastAt sql.NullTime + if err := rows.Scan( + &row.ConversationID, + &row.Title, + &row.SourceType, + &row.MessageCount, + &row.ParticipantCount, + &lastAt, + &row.LastPreview, + ); err != nil { + return nil, fmt.Errorf("scan conversation: %w", err) + } + if lastAt.Valid { + row.LastMessageAt = lastAt.Time + } + results = append(results, row) + } + return results, rows.Err() +} + +// textAggViewDef returns the aggregate query definition for a text view type. 
+func textAggViewDef( + view TextViewType, granularity TimeGranularity, +) (aggViewDef, error) { + switch view { + case TextViewContacts: + return aggViewDef{ + keyExpr: "COALESCE(NULLIF(p.phone_number, ''), p.email_address)", + joinClause: `JOIN mr ON mr.message_id = msg.id + JOIN p ON p.id = mr.participant_id`, + nullGuard: "COALESCE(NULLIF(p.phone_number, ''), p.email_address) IS NOT NULL", + }, nil + case TextViewContactNames: + nameExpr := "COALESCE(NULLIF(TRIM(p.display_name), ''), " + + "NULLIF(p.phone_number, ''), p.email_address)" + return aggViewDef{ + keyExpr: nameExpr, + joinClause: `JOIN mr ON mr.message_id = msg.id + JOIN p ON p.id = mr.participant_id`, + nullGuard: nameExpr + " IS NOT NULL", + }, nil + case TextViewSources: + return aggViewDef{ + keyExpr: "src.source_type", + joinClause: "JOIN src ON src.id = msg.source_id", + nullGuard: "src.source_type IS NOT NULL", + }, nil + case TextViewLabels: + return aggViewDef{ + keyExpr: "lbl.name", + joinClause: `JOIN ml ON ml.message_id = msg.id + JOIN lbl ON lbl.id = ml.label_id`, + nullGuard: "lbl.name IS NOT NULL", + keyColumns: []string{"lbl.name"}, + }, nil + case TextViewTime: + return aggViewDef{ + keyExpr: timeExpr(granularity), + nullGuard: "msg.sent_at IS NOT NULL", + }, nil + default: + return aggViewDef{}, + fmt.Errorf("unsupported text view type: %v", view) + } +} + +// TextAggregate aggregates text messages by the given view type. +func (e *DuckDBEngine) TextAggregate( + ctx context.Context, + viewType TextViewType, + opts TextAggregateOptions, +) ([]AggregateRow, error) { + def, err := textAggViewDef(viewType, opts.TimeGranularity) + if err != nil { + return nil, err + } + + // Build WHERE clause with text type filter. 
+ conditions := []string{textTypeFilter()} + var args []interface{} + + if opts.SourceID != nil { + conditions = append(conditions, "msg.source_id = ?") + args = append(args, *opts.SourceID) + } + if opts.After != nil { + conditions = append(conditions, + "msg.sent_at >= CAST(? AS TIMESTAMP)") + args = append(args, + opts.After.Format("2006-01-02 15:04:05")) + } + if opts.Before != nil { + conditions = append(conditions, + "msg.sent_at < CAST(? AS TIMESTAMP)") + args = append(args, + opts.Before.Format("2006-01-02 15:04:05")) + } + + // Search filter on key columns. + if opts.SearchQuery != "" { + searchConds, searchArgs := e.buildAggregateSearchConditions( + opts.SearchQuery, def.keyColumns...) + conditions = append(conditions, searchConds...) + args = append(args, searchArgs...) + } + + whereClause := strings.Join(conditions, " AND ") + + aggOpts := AggregateOptions{ + SortField: opts.SortField, + SortDirection: opts.SortDirection, + Limit: opts.Limit, + TimeGranularity: opts.TimeGranularity, + } + + return e.runAggregation(ctx, def, whereClause, args, aggOpts) +} + +// ListConversationMessages returns messages within a conversation, +// ordered chronologically (ASC) for timeline display. +func (e *DuckDBEngine) ListConversationMessages( + ctx context.Context, convID int64, filter TextFilter, +) ([]MessageSummary, error) { + filter.ConversationID = &convID + where, args := e.buildTextFilterConditions(filter) + + limit := filter.Pagination.Limit + if limit == 0 { + limit = 500 + } + + query := fmt.Sprintf(` + WITH %s, + filtered_msgs AS ( + SELECT msg.id + FROM msg + WHERE %s + ORDER BY msg.sent_at ASC + LIMIT ? OFFSET ? 
+ ), + msg_sender AS ( + SELECT mr.message_id, + FIRST(p.email_address) AS from_email, + FIRST(COALESCE(mr.display_name, p.display_name, '')) AS from_name, + FIRST(COALESCE(p.phone_number, '')) AS from_phone + FROM mr + JOIN p ON p.id = mr.participant_id + WHERE mr.recipient_type = 'from' + AND mr.message_id IN (SELECT id FROM filtered_msgs) + GROUP BY mr.message_id + ), + direct_sender AS ( + SELECT msg.id AS message_id, + COALESCE(p.email_address, '') AS from_email, + COALESCE(p.display_name, '') AS from_name, + COALESCE(p.phone_number, '') AS from_phone + FROM msg + JOIN filtered_msgs fm ON fm.id = msg.id + JOIN p ON p.id = msg.sender_id + WHERE msg.sender_id IS NOT NULL + AND msg.id NOT IN (SELECT message_id FROM msg_sender) + ) + SELECT + msg.id, + COALESCE(msg.source_message_id, '') AS source_message_id, + COALESCE(msg.conversation_id, 0) AS conversation_id, + COALESCE(c.source_conversation_id, '') AS source_conversation_id, + COALESCE(msg.subject, '') AS subject, + COALESCE(msg.snippet, '') AS snippet, + COALESCE(ms.from_email, ds.from_email, '') AS from_email, + COALESCE(ms.from_name, ds.from_name, '') AS from_name, + COALESCE(ms.from_phone, ds.from_phone, '') AS from_phone, + msg.sent_at, + COALESCE(msg.size_estimate, 0) AS size_estimate, + COALESCE(msg.has_attachments, false) AS has_attachments, + COALESCE(msg.attachment_count, 0) AS attachment_count, + msg.deleted_from_source_at, + COALESCE(msg.message_type, '') AS message_type, + COALESCE(c.title, '') AS conv_title + FROM msg + JOIN filtered_msgs fm ON fm.id = msg.id + LEFT JOIN msg_sender ms ON ms.message_id = msg.id + LEFT JOIN direct_sender ds ON ds.message_id = msg.id + LEFT JOIN conv c ON c.id = msg.conversation_id + ORDER BY msg.sent_at ASC + `, e.parquetCTEs(), where) + + args = append(args, limit, filter.Pagination.Offset) + + rows, err := e.db.QueryContext(ctx, query, args...) 
+ if err != nil { + return nil, fmt.Errorf("list conversation messages: %w", err) + } + defer func() { _ = rows.Close() }() + + return scanMessageSummaries(rows) +} + +// TextSearch performs plain full-text search over text messages via FTS5. +// Returns empty results if SQLite is not available. +func (e *DuckDBEngine) TextSearch( + ctx context.Context, query string, limit, offset int, +) ([]MessageSummary, error) { + if e.sqliteDB == nil { + return nil, nil + } + if query == "" { + return nil, nil + } + if limit == 0 { + limit = 50 + } + + // Use FTS5 MATCH on messages_fts, filtered to text message types. + sqlQuery := ` + SELECT + m.id, + COALESCE(m.source_message_id, '') AS source_message_id, + COALESCE(m.conversation_id, 0) AS conversation_id, + '' AS source_conversation_id, + COALESCE(m.subject, '') AS subject, + COALESCE(m.snippet, '') AS snippet, + COALESCE(p.email_address, '') AS from_email, + COALESCE(p.display_name, '') AS from_name, + COALESCE(p.phone_number, '') AS from_phone, + m.sent_at, + COALESCE(m.size_estimate, 0) AS size_estimate, + COALESCE(m.has_attachments, 0) AS has_attachments, + 0 AS attachment_count, + m.deleted_from_source_at, + COALESCE(m.message_type, '') AS message_type, + COALESCE(c.title, '') AS conv_title + FROM messages_fts fts + JOIN messages m ON m.id = fts.rowid + LEFT JOIN participants p ON p.id = m.sender_id + LEFT JOIN conversations c ON c.id = m.conversation_id + WHERE fts.messages_fts MATCH ? + AND m.message_type IN ('whatsapp','imessage','sms','google_voice_text') + ORDER BY m.sent_at DESC + LIMIT ? OFFSET ? + ` + + rows, err := e.sqliteDB.QueryContext(ctx, sqlQuery, + query, limit, offset) + if err != nil { + return nil, fmt.Errorf("text search: %w", err) + } + defer func() { _ = rows.Close() }() + + return scanMessageSummaries(rows) +} + +// GetTextStats returns aggregate stats for text messages. 
+func (e *DuckDBEngine) GetTextStats( + ctx context.Context, opts TextStatsOptions, +) (*TotalStats, error) { + stats := &TotalStats{} + + conditions := []string{textTypeFilter()} + var args []interface{} + + if opts.SourceID != nil { + conditions = append(conditions, "msg.source_id = ?") + args = append(args, *opts.SourceID) + } + if opts.SearchQuery != "" { + termPattern := "%" + escapeILIKE(opts.SearchQuery) + "%" + conditions = append(conditions, + "(msg.subject ILIKE ? ESCAPE '\\' OR msg.snippet ILIKE ? ESCAPE '\\')") + args = append(args, termPattern, termPattern) + } + + whereClause := strings.Join(conditions, " AND ") + + msgQuery := fmt.Sprintf(` + WITH %s + SELECT + COUNT(*) AS message_count, + COALESCE(SUM(CAST(msg.size_estimate AS BIGINT)), 0) AS total_size, + CAST(COALESCE(SUM(att.attachment_count), 0) AS BIGINT) AS attachment_count, + CAST(COALESCE(SUM(att.attachment_size), 0) AS BIGINT) AS attachment_size, + COUNT(DISTINCT msg.source_id) AS account_count + FROM msg + LEFT JOIN att ON att.message_id = msg.id + WHERE %s + `, e.parquetCTEs(), whereClause) + + var attachmentSize sql.NullFloat64 + err := e.db.QueryRowContext(ctx, msgQuery, args...).Scan( + &stats.MessageCount, + &stats.TotalSize, + &stats.AttachmentCount, + &attachmentSize, + &stats.AccountCount, + ) + if err != nil { + return nil, fmt.Errorf("text stats query: %w", err) + } + if attachmentSize.Valid { + stats.AttachmentSize = int64(attachmentSize.Float64) + } + + // Label count for text messages. + labelQuery := fmt.Sprintf(` + WITH %s + SELECT COUNT(DISTINCT lbl.name) + FROM msg + JOIN ml ON ml.message_id = msg.id + JOIN lbl ON lbl.id = ml.label_id + WHERE %s + `, e.parquetCTEs(), whereClause) + + if err := e.db.QueryRowContext(ctx, labelQuery, args...).Scan( + &stats.LabelCount, + ); err != nil { + stats.LabelCount = 0 + } + + return stats, nil +} + +// scanMessageSummaries scans rows into MessageSummary slices. +// Shared by ListConversationMessages and TextSearch. 
+func scanMessageSummaries(rows *sql.Rows) ([]MessageSummary, error) { + var results []MessageSummary + for rows.Next() { + var msg MessageSummary + var sentAt sql.NullTime + var deletedAt sql.NullTime + if err := rows.Scan( + &msg.ID, + &msg.SourceMessageID, + &msg.ConversationID, + &msg.SourceConversationID, + &msg.Subject, + &msg.Snippet, + &msg.FromEmail, + &msg.FromName, + &msg.FromPhone, + &sentAt, + &msg.SizeEstimate, + &msg.HasAttachments, + &msg.AttachmentCount, + &deletedAt, + &msg.MessageType, + &msg.ConversationTitle, + ); err != nil { + return nil, fmt.Errorf("scan message: %w", err) + } + if sentAt.Valid { + msg.SentAt = sentAt.Time + } + if deletedAt.Valid { + msg.DeletedAt = &deletedAt.Time + } + results = append(results, msg) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("iterate messages: %w", err) + } + return results, nil +} From 2962c6554d971e1ff6d7ea2a1c4f59eff74223e1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:45:38 -0500 Subject: [PATCH 21/65] Implement TextEngine on SQLiteEngine as fallback Co-Authored-By: Claude Sonnet 4.6 --- internal/query/sqlite_text.go | 470 ++++++++++++++++++++++++++++++++++ 1 file changed, 470 insertions(+) create mode 100644 internal/query/sqlite_text.go diff --git a/internal/query/sqlite_text.go b/internal/query/sqlite_text.go new file mode 100644 index 00000000..baff2c2a --- /dev/null +++ b/internal/query/sqlite_text.go @@ -0,0 +1,470 @@ +package query + +import ( + "context" + "database/sql" + "fmt" + "strings" +) + +// Compile-time interface assertion. +var _ TextEngine = (*SQLiteEngine)(nil) + +// textMsgTypeFilter returns a SQL condition restricting to text message types. +// Uses the m. table alias used in text query methods. +func textMsgTypeFilter() string { + return "m.message_type IN ('whatsapp','imessage','sms','google_voice_text')" +} + +// buildSQLiteTextFilterConditions builds WHERE conditions from a TextFilter. +// All conditions use the m. 
prefix for the messages table. +func buildSQLiteTextFilterConditions(filter TextFilter) (string, []interface{}) { + conditions := []string{textMsgTypeFilter()} + var args []interface{} + + if filter.SourceID != nil { + conditions = append(conditions, "m.source_id = ?") + args = append(args, *filter.SourceID) + } + if filter.ConversationID != nil { + conditions = append(conditions, "m.conversation_id = ?") + args = append(args, *filter.ConversationID) + } + if filter.ContactPhone != "" { + conditions = append(conditions, `EXISTS ( + SELECT 1 FROM message_recipients mr_cp + JOIN participants p_cp ON p_cp.id = mr_cp.participant_id + WHERE mr_cp.message_id = m.id + AND p_cp.phone_number = ? + )`) + args = append(args, filter.ContactPhone) + } + if filter.ContactName != "" { + conditions = append(conditions, `EXISTS ( + SELECT 1 FROM message_recipients mr_cn + JOIN participants p_cn ON p_cn.id = mr_cn.participant_id + WHERE mr_cn.message_id = m.id + AND COALESCE(NULLIF(TRIM(p_cn.display_name), ''), p_cn.email_address) = ? + )`) + args = append(args, filter.ContactName) + } + if filter.SourceType != "" { + conditions = append(conditions, `EXISTS ( + SELECT 1 FROM sources s_st + WHERE s_st.id = m.source_id AND s_st.source_type = ? + )`) + args = append(args, filter.SourceType) + } + if filter.Label != "" { + conditions = append(conditions, `EXISTS ( + SELECT 1 FROM message_labels ml_lf + JOIN labels lbl_lf ON lbl_lf.id = ml_lf.label_id + WHERE ml_lf.message_id = m.id + AND LOWER(lbl_lf.name) = LOWER(?) 
+ )`) + args = append(args, filter.Label) + } + if filter.TimeRange.Period != "" { + granularity := filter.TimeRange.Granularity + if granularity == TimeYear && len(filter.TimeRange.Period) > 4 { + switch len(filter.TimeRange.Period) { + case 7: + granularity = TimeMonth + case 10: + granularity = TimeDay + } + } + var timeExprStr string + switch granularity { + case TimeYear: + timeExprStr = "strftime('%Y', m.sent_at)" + case TimeMonth: + timeExprStr = "strftime('%Y-%m', m.sent_at)" + case TimeDay: + timeExprStr = "strftime('%Y-%m-%d', m.sent_at)" + default: + timeExprStr = "strftime('%Y-%m', m.sent_at)" + } + conditions = append(conditions, fmt.Sprintf("%s = ?", timeExprStr)) + args = append(args, filter.TimeRange.Period) + } + if filter.After != nil { + conditions = append(conditions, "m.sent_at >= ?") + args = append(args, filter.After.Format("2006-01-02 15:04:05")) + } + if filter.Before != nil { + conditions = append(conditions, "m.sent_at < ?") + args = append(args, filter.Before.Format("2006-01-02 15:04:05")) + } + + return strings.Join(conditions, " AND "), args +} + +// ListConversations returns conversations matching the filter. 
+func (e *SQLiteEngine) ListConversations( + ctx context.Context, filter TextFilter, +) ([]ConversationRow, error) { + where, args := buildSQLiteTextFilterConditions(filter) + + orderBy := "last_message_at DESC" + if filter.SortField != 0 { + switch filter.SortField { + case SortByCount: + orderBy = "message_count" + case SortBySize: + orderBy = "total_size" + case SortByName: + orderBy = "title" + } + if filter.SortDirection == SortAsc { + orderBy += " ASC" + } else { + orderBy += " DESC" + } + } + + limit := filter.Pagination.Limit + if limit == 0 { + limit = 100 + } + + query := fmt.Sprintf(` + SELECT + c.id, + COALESCE(c.title, '') AS title, + COALESCE(s.source_type, '') AS source_type, + COUNT(*) AS message_count, + COUNT(DISTINCT COALESCE(m.sender_id, 0)) AS participant_count, + MAX(m.sent_at) AS last_message_at, + COALESCE( + (SELECT m2.snippet FROM messages m2 + WHERE m2.conversation_id = c.id + AND %s + ORDER BY m2.sent_at DESC LIMIT 1), + '' + ) AS last_preview, + COALESCE(SUM(m.size_estimate), 0) AS total_size + FROM conversations c + JOIN messages m ON m.conversation_id = c.id + LEFT JOIN sources s ON s.id = m.source_id + WHERE %s + GROUP BY c.id, c.title, s.source_type + ORDER BY %s + LIMIT ? OFFSET ? + `, textMsgTypeFilter(), where, orderBy) + + args = append(args, limit, filter.Pagination.Offset) + + rows, err := e.db.QueryContext(ctx, query, args...) 
+ if err != nil { + return nil, fmt.Errorf("list conversations: %w", err) + } + defer func() { _ = rows.Close() }() + + var results []ConversationRow + for rows.Next() { + var row ConversationRow + var lastAt sql.NullTime + var totalSize int64 + if err := rows.Scan( + &row.ConversationID, + &row.Title, + &row.SourceType, + &row.MessageCount, + &row.ParticipantCount, + &lastAt, + &row.LastPreview, + &totalSize, + ); err != nil { + return nil, fmt.Errorf("scan conversation: %w", err) + } + if lastAt.Valid { + row.LastMessageAt = lastAt.Time + } + results = append(results, row) + } + return results, rows.Err() +} + +// textAggSQLiteDimension returns the dimension definition for a text aggregate view. +func textAggSQLiteDimension( + view TextViewType, granularity TimeGranularity, +) (aggDimension, error) { + switch view { + case TextViewContacts: + return aggDimension{ + keyExpr: "COALESCE(NULLIF(p_agg.phone_number, ''), p_agg.email_address)", + joins: `JOIN message_recipients mr_agg ON mr_agg.message_id = m.id + JOIN participants p_agg ON p_agg.id = mr_agg.participant_id`, + whereExpr: "COALESCE(NULLIF(p_agg.phone_number, ''), p_agg.email_address) IS NOT NULL", + }, nil + case TextViewContactNames: + nameExpr := "COALESCE(NULLIF(TRIM(p_agg.display_name), ''), NULLIF(p_agg.phone_number, ''), p_agg.email_address)" + return aggDimension{ + keyExpr: nameExpr, + joins: `JOIN message_recipients mr_agg ON mr_agg.message_id = m.id + JOIN participants p_agg ON p_agg.id = mr_agg.participant_id`, + whereExpr: nameExpr + " IS NOT NULL", + }, nil + case TextViewSources: + return aggDimension{ + keyExpr: "s_agg.source_type", + joins: "JOIN sources s_agg ON s_agg.id = m.source_id", + whereExpr: "s_agg.source_type IS NOT NULL", + }, nil + case TextViewLabels: + return aggDimension{ + keyExpr: "lbl_agg.name", + joins: `JOIN message_labels ml_agg ON ml_agg.message_id = m.id + JOIN labels lbl_agg ON lbl_agg.id = ml_agg.label_id`, + whereExpr: "lbl_agg.name IS NOT NULL", + }, nil + case 
TextViewTime: + var timeExprStr string + switch granularity { + case TimeYear: + timeExprStr = "strftime('%Y', m.sent_at)" + case TimeMonth: + timeExprStr = "strftime('%Y-%m', m.sent_at)" + case TimeDay: + timeExprStr = "strftime('%Y-%m-%d', m.sent_at)" + default: + timeExprStr = "strftime('%Y-%m', m.sent_at)" + } + return aggDimension{ + keyExpr: timeExprStr, + joins: "", + whereExpr: "m.sent_at IS NOT NULL", + }, nil + default: + return aggDimension{}, fmt.Errorf("unsupported text view type: %v", view) + } +} + +// TextAggregate aggregates text messages by the given view type. +func (e *SQLiteEngine) TextAggregate( + ctx context.Context, + viewType TextViewType, + opts TextAggregateOptions, +) ([]AggregateRow, error) { + dim, err := textAggSQLiteDimension(viewType, opts.TimeGranularity) + if err != nil { + return nil, err + } + + conditions := []string{textMsgTypeFilter()} + var args []interface{} + + if opts.SourceID != nil { + conditions = append(conditions, "m.source_id = ?") + args = append(args, *opts.SourceID) + } + if opts.After != nil { + conditions = append(conditions, "m.sent_at >= ?") + args = append(args, opts.After.Format("2006-01-02 15:04:05")) + } + if opts.Before != nil { + conditions = append(conditions, "m.sent_at < ?") + args = append(args, opts.Before.Format("2006-01-02 15:04:05")) + } + if opts.SearchQuery != "" { + likeTerm := "%" + escapeSQLiteLike(opts.SearchQuery) + "%" + conditions = append(conditions, "(m.subject LIKE ? 
OR m.snippet LIKE ?)") + args = append(args, likeTerm, likeTerm) + } + + aggOpts := AggregateOptions{ + SortField: opts.SortField, + SortDirection: opts.SortDirection, + Limit: opts.Limit, + TimeGranularity: opts.TimeGranularity, + } + + sort, err := sortClause(aggOpts) + if err != nil { + return nil, err + } + + limit := aggOpts.Limit + if limit == 0 { + limit = 100 + } + + filterWhere := strings.Join(conditions, " AND ") + query := buildAggregateSQL(dim, "", filterWhere, sort) + args = append(args, limit) + return e.executeAggregateQuery(ctx, query, args) +} + +// ListConversationMessages returns messages within a conversation, +// ordered chronologically (ASC) for timeline display. +func (e *SQLiteEngine) ListConversationMessages( + ctx context.Context, convID int64, filter TextFilter, +) ([]MessageSummary, error) { + filter.ConversationID = &convID + where, args := buildSQLiteTextFilterConditions(filter) + + limit := filter.Pagination.Limit + if limit == 0 { + limit = 500 + } + + query := fmt.Sprintf(` + SELECT + m.id, + COALESCE(m.source_message_id, '') AS source_message_id, + COALESCE(m.conversation_id, 0) AS conversation_id, + COALESCE(c.source_conversation_id, '') AS source_conversation_id, + COALESCE(m.subject, '') AS subject, + COALESCE(m.snippet, '') AS snippet, + COALESCE(p_sender.email_address, '') AS from_email, + COALESCE(p_sender.display_name, '') AS from_name, + COALESCE(p_sender.phone_number, '') AS from_phone, + m.sent_at, + COALESCE(m.size_estimate, 0) AS size_estimate, + m.has_attachments, + m.attachment_count, + m.deleted_from_source_at, + COALESCE(m.message_type, '') AS message_type, + COALESCE(c.title, '') AS conv_title + FROM messages m + LEFT JOIN participants p_sender ON p_sender.id = m.sender_id + LEFT JOIN conversations c ON c.id = m.conversation_id + WHERE %s + ORDER BY m.sent_at ASC + LIMIT ? OFFSET ? + `, where) + + args = append(args, limit, filter.Pagination.Offset) + + rows, err := e.db.QueryContext(ctx, query, args...) 
+ if err != nil { + return nil, fmt.Errorf("list conversation messages: %w", err) + } + defer func() { _ = rows.Close() }() + + return scanMessageSummaries(rows) +} + +// TextSearch performs plain full-text search over text messages. +// Uses FTS5 if available; otherwise returns empty results. +func (e *SQLiteEngine) TextSearch( + ctx context.Context, query string, limit, offset int, +) ([]MessageSummary, error) { + if query == "" { + return nil, nil + } + if !e.hasFTSTable(ctx) { + return nil, nil + } + if limit == 0 { + limit = 50 + } + + sqlQuery := ` + SELECT + m.id, + COALESCE(m.source_message_id, '') AS source_message_id, + COALESCE(m.conversation_id, 0) AS conversation_id, + '' AS source_conversation_id, + COALESCE(m.subject, '') AS subject, + COALESCE(m.snippet, '') AS snippet, + COALESCE(p.email_address, '') AS from_email, + COALESCE(p.display_name, '') AS from_name, + COALESCE(p.phone_number, '') AS from_phone, + m.sent_at, + COALESCE(m.size_estimate, 0) AS size_estimate, + COALESCE(m.has_attachments, 0) AS has_attachments, + 0 AS attachment_count, + m.deleted_from_source_at, + COALESCE(m.message_type, '') AS message_type, + COALESCE(c.title, '') AS conv_title + FROM messages_fts fts + JOIN messages m ON m.id = fts.rowid + LEFT JOIN participants p ON p.id = m.sender_id + LEFT JOIN conversations c ON c.id = m.conversation_id + WHERE fts.messages_fts MATCH ? + AND m.message_type IN ('whatsapp','imessage','sms','google_voice_text') + ORDER BY m.sent_at DESC + LIMIT ? OFFSET ? + ` + + rows, err := e.db.QueryContext(ctx, sqlQuery, query, limit, offset) + if err != nil { + return nil, fmt.Errorf("text search: %w", err) + } + defer func() { _ = rows.Close() }() + + return scanMessageSummaries(rows) +} + +// GetTextStats returns aggregate stats for text messages. 
+func (e *SQLiteEngine) GetTextStats( + ctx context.Context, opts TextStatsOptions, +) (*TotalStats, error) { + stats := &TotalStats{} + + conditions := []string{textMsgTypeFilter()} + var args []interface{} + + if opts.SourceID != nil { + conditions = append(conditions, "m.source_id = ?") + args = append(args, *opts.SourceID) + } + if opts.SearchQuery != "" { + likeTerm := "%" + escapeSQLiteLike(opts.SearchQuery) + "%" + conditions = append(conditions, "(m.subject LIKE ? OR m.snippet LIKE ?)") + args = append(args, likeTerm, likeTerm) + } + + whereClause := strings.Join(conditions, " AND ") + + msgQuery := fmt.Sprintf(` + SELECT + COUNT(*) AS message_count, + COALESCE(SUM(m.size_estimate), 0) AS total_size, + COUNT(DISTINCT m.source_id) AS account_count + FROM messages m + WHERE %s + `, whereClause) + + if err := e.db.QueryRowContext(ctx, msgQuery, args...).Scan( + &stats.MessageCount, + &stats.TotalSize, + &stats.AccountCount, + ); err != nil { + return nil, fmt.Errorf("text stats query: %w", err) + } + + attQuery := fmt.Sprintf(` + SELECT + COUNT(*) AS attachment_count, + COALESCE(SUM(a.size), 0) AS attachment_size + FROM attachments a + JOIN messages m ON m.id = a.message_id + WHERE %s + `, whereClause) + + if err := e.db.QueryRowContext(ctx, attQuery, args...).Scan( + &stats.AttachmentCount, + &stats.AttachmentSize, + ); err != nil { + return nil, fmt.Errorf("text attachment stats query: %w", err) + } + + labelQuery := fmt.Sprintf(` + SELECT COUNT(DISTINCT lbl.name) + FROM messages m + JOIN message_labels ml ON ml.message_id = m.id + JOIN labels lbl ON lbl.id = ml.label_id + WHERE %s + `, whereClause) + + if err := e.db.QueryRowContext(ctx, labelQuery, args...).Scan( + &stats.LabelCount, + ); err != nil { + stats.LabelCount = 0 + } + + return stats, nil +} From 98c0f9b9a6d1eeaddb6c69bd671c70d141b24b73 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:46:39 -0500 Subject: [PATCH 22/65] Update FTS backfill to handle phone-based text senders 
--- internal/store/messages.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/internal/store/messages.go b/internal/store/messages.go index b414e436..0fa81d3e 100644 --- a/internal/store/messages.go +++ b/internal/store/messages.go @@ -853,7 +853,11 @@ func (s *Store) backfillFTSBatch(fromID, toID int64) (int64, error) { result, err := s.db.Exec(` INSERT OR REPLACE INTO messages_fts (rowid, message_id, subject, body, from_addr, to_addr, cc_addr) SELECT m.id, m.id, COALESCE(m.subject, ''), COALESCE(mb.body_text, ''), - COALESCE((SELECT GROUP_CONCAT(p.email_address, ' ') FROM message_recipients mr JOIN participants p ON p.id = mr.participant_id WHERE mr.message_id = m.id AND mr.recipient_type = 'from'), ''), + COALESCE( + (SELECT COALESCE(p.phone_number, p.email_address) FROM participants p WHERE p.id = m.sender_id), + (SELECT GROUP_CONCAT(p.email_address, ' ') FROM message_recipients mr JOIN participants p ON p.id = mr.participant_id WHERE mr.message_id = m.id AND mr.recipient_type = 'from'), + '' + ), COALESCE((SELECT GROUP_CONCAT(p.email_address, ' ') FROM message_recipients mr JOIN participants p ON p.id = mr.participant_id WHERE mr.message_id = m.id AND mr.recipient_type = 'to'), ''), COALESCE((SELECT GROUP_CONCAT(p.email_address, ' ') FROM message_recipients mr JOIN participants p ON p.id = mr.participant_id WHERE mr.message_id = m.id AND mr.recipient_type = 'cc'), '') FROM messages m From 38a82561b51490ebd51778a4da9faf65bd58333a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:52:45 -0500 Subject: [PATCH 23/65] Add TUI Texts mode with conversations, aggregates, and timeline Implement the Texts mode toggle (m key) for browsing text message data (WhatsApp, iMessage, SMS, Google Voice) alongside the existing Email mode. 
New files: - text_state.go: tuiMode, textViewLevel, textState, textNavSnapshot types - text_keys.go: key handling for all text view levels - text_view.go: rendering for conversations, aggregates, and timeline - text_commands.go: async data loading via TextEngine Modified files: - model.go: mode/textEngine/textState fields, Options.TextEngine, Update routing for text message types, handleKeyPress mode dispatch - keys.go: m key in handleGlobalKeys to toggle modes - view.go: mode indicator in title bar, m key in help modal Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/tui/keys.go | 17 +- internal/tui/model.go | 114 +++++++ internal/tui/text_commands.go | 146 +++++++++ internal/tui/text_keys.go | 324 ++++++++++++++++++++ internal/tui/text_state.go | 46 +++ internal/tui/text_view.go | 545 ++++++++++++++++++++++++++++++++++ internal/tui/view.go | 11 +- 7 files changed, 1200 insertions(+), 3 deletions(-) create mode 100644 internal/tui/text_commands.go create mode 100644 internal/tui/text_keys.go create mode 100644 internal/tui/text_state.go create mode 100644 internal/tui/text_view.go diff --git a/internal/tui/keys.go b/internal/tui/keys.go index 92a1d1e7..ccceb12d 100644 --- a/internal/tui/keys.go +++ b/internal/tui/keys.go @@ -81,7 +81,7 @@ func (m Model) handleInlineSearchKeys(msg tea.KeyMsg) (tea.Model, tea.Cmd) { } } -// handleGlobalKeys handles keys common to all views (quit, help). +// handleGlobalKeys handles keys common to all views (quit, help, mode toggle). // Returns (model, cmd, true) if the key was handled, or (model, nil, false) otherwise. 
func (m Model) handleGlobalKeys(msg tea.KeyMsg) (Model, tea.Cmd, bool) { switch msg.String() { @@ -94,6 +94,21 @@ func (m Model) handleGlobalKeys(msg tea.KeyMsg) (Model, tea.Cmd, bool) { case "?": m.modal = modalHelp return m, nil, true + case "m": + if m.textEngine == nil { + return m, nil, true + } + if m.mode == modeEmail { + m.mode = modeTexts + m.loading = true + spinCmd := m.startSpinner() + return m, tea.Batch(spinCmd, m.loadTextConversations()), true + } + m.mode = modeEmail + m.loading = true + m.aggregateRequestID++ + spinCmd := m.startSpinner() + return m, tea.Batch(spinCmd, m.loadData(), m.loadStats()), true } return m, nil, false } diff --git a/internal/tui/model.go b/internal/tui/model.go index 5031e0a2..fc98ef32 100644 --- a/internal/tui/model.go +++ b/internal/tui/model.go @@ -54,6 +54,10 @@ type Options struct { // IsRemote indicates the TUI is connected to a remote server. // Some features (deletion staging, attachment export) are disabled in remote mode. IsRemote bool + + // TextEngine provides text message query operations. + // When non-nil, the 'm' key toggles between Email and Texts mode. + TextEngine query.TextEngine } // modalType represents the type of modal dialog. 
@@ -97,9 +101,18 @@ type selectionState struct { type Model struct { viewState // Embedded state + // Top-level mode: Email or Texts + mode tuiMode + // Query engine for data access engine query.Engine + // Text message query engine (nil if no text data available) + textEngine query.TextEngine + + // Texts mode state (separate from email viewState) + textState textState + // Version info for title bar version string @@ -217,8 +230,17 @@ func New(engine query.Engine, opts Options) Model { threadLimit = defaultThreadMessageLimit } + textEngine := opts.TextEngine + if textEngine == nil { + // Try type assertion as fallback + if te, ok := engine.(query.TextEngine); ok { + textEngine = te + } + } + return Model{ engine: engine, + textEngine: textEngine, actions: NewActionController(engine, opts.DataDir, nil), version: opts.Version, aggregateLimit: aggLimit, @@ -734,6 +756,89 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { return m.handleSearchDebounce(msg) case spinnerTickMsg: return m.handleSpinnerTick() + // Text mode messages + case textConversationsLoadedMsg: + return m.handleTextConversationsLoaded(msg) + case textAggregateLoadedMsg: + return m.handleTextAggregateLoaded(msg) + case textMessagesLoadedMsg: + return m.handleTextMessagesLoaded(msg) + case textSearchResultMsg: + return m.handleTextSearchResult(msg) + case textStatsLoadedMsg: + return m.handleTextStatsLoaded(msg) + } + return m, nil +} + +// handleTextConversationsLoaded processes text conversations load completion. +func (m Model) handleTextConversationsLoaded(msg textConversationsLoadedMsg) (tea.Model, tea.Cmd) { + m.transitionBuffer = "" + m.loading = false + if msg.err != nil { + m.err = msg.err + m.modal = modalError + m.modalResult = msg.err.Error() + return m, nil + } + m.textState.conversations = msg.conversations + m.textState.stats = msg.stats + return m, nil +} + +// handleTextAggregateLoaded processes text aggregate load completion. 
+func (m Model) handleTextAggregateLoaded(msg textAggregateLoadedMsg) (tea.Model, tea.Cmd) { + m.transitionBuffer = "" + m.loading = false + if msg.err != nil { + m.err = msg.err + m.modal = modalError + m.modalResult = msg.err.Error() + return m, nil + } + m.textState.aggregateRows = msg.rows + m.textState.stats = msg.stats + return m, nil +} + +// handleTextMessagesLoaded processes text messages load completion. +func (m Model) handleTextMessagesLoaded(msg textMessagesLoadedMsg) (tea.Model, tea.Cmd) { + m.transitionBuffer = "" + m.loading = false + if msg.err != nil { + m.err = msg.err + m.modal = modalError + m.modalResult = msg.err.Error() + return m, nil + } + m.textState.messages = msg.messages + m.textState.cursor = 0 + m.textState.scrollOffset = 0 + return m, nil +} + +// handleTextSearchResult processes text search results. +func (m Model) handleTextSearchResult(msg textSearchResultMsg) (tea.Model, tea.Cmd) { + m.transitionBuffer = "" + m.loading = false + if msg.err != nil { + m.err = msg.err + m.modal = modalError + m.modalResult = msg.err.Error() + return m, nil + } + // Show search results as a timeline + m.textState.messages = msg.messages + m.textState.level = textLevelTimeline + m.textState.cursor = 0 + m.textState.scrollOffset = 0 + return m, nil +} + +// handleTextStatsLoaded processes text stats load completion. +func (m Model) handleTextStatsLoaded(msg textStatsLoadedMsg) (tea.Model, tea.Cmd) { + if msg.err == nil { + m.textState.stats = msg.stats } return m, nil } @@ -1103,6 +1208,11 @@ func (m Model) handleSpinnerTick() (tea.Model, tea.Cmd) { // handleKeyPress processes keyboard input. 
func (m Model) handleKeyPress(msg tea.KeyMsg) (tea.Model, tea.Cmd) { + // Route to Texts mode handler when active + if m.mode == modeTexts { + return m.handleTextKeyPress(msg) + } + // Handle modal first (error modals must dismiss even during search) if m.modal != modalNone { return m.handleModalKeys(msg) @@ -1312,6 +1422,10 @@ func (m Model) View() string { // Separated from View() so transitions can capture the current output // before changing state (for the transitionBuffer pattern). func (m Model) renderView() string { + if m.mode == modeTexts { + return m.renderTextView() + } + switch m.level { case levelAggregates, levelDrillDown: return fmt.Sprintf("%s\n%s\n%s", diff --git a/internal/tui/text_commands.go b/internal/tui/text_commands.go new file mode 100644 index 00000000..8c67d601 --- /dev/null +++ b/internal/tui/text_commands.go @@ -0,0 +1,146 @@ +package tui + +import ( + "context" + "fmt" + + tea "github.com/charmbracelet/bubbletea" + "github.com/wesm/msgvault/internal/query" +) + +// textConversationsLoadedMsg is sent when text conversations are loaded. +type textConversationsLoadedMsg struct { + conversations []query.ConversationRow + stats *query.TotalStats + err error +} + +// textAggregateLoadedMsg is sent when text aggregate data is loaded. +type textAggregateLoadedMsg struct { + rows []query.AggregateRow + stats *query.TotalStats + err error +} + +// textMessagesLoadedMsg is sent when text conversation messages are loaded. +type textMessagesLoadedMsg struct { + messages []query.MessageSummary + err error +} + +// textSearchResultMsg is sent when text search results are loaded. +type textSearchResultMsg struct { + messages []query.MessageSummary + err error +} + +// textStatsLoadedMsg is sent when text stats are loaded. +type textStatsLoadedMsg struct { + stats *query.TotalStats + err error +} + +// loadTextConversations fetches text conversations matching the current filter. 
+func (m Model) loadTextConversations() tea.Cmd { + te := m.textEngine + filter := m.textState.filter + return safeCmdWithPanic( + func() tea.Msg { + ctx := context.Background() + convs, err := te.ListConversations(ctx, filter) + if err != nil { + return textConversationsLoadedMsg{err: err} + } + stats, _ := te.GetTextStats(ctx, query.TextStatsOptions{ + SourceID: filter.SourceID, + }) + return textConversationsLoadedMsg{ + conversations: convs, stats: stats, + } + }, + func(r any) tea.Msg { + return textConversationsLoadedMsg{ + err: fmt.Errorf("text conversations panic: %v", r), + } + }, + ) +} + +// loadTextAggregate fetches text aggregate data for the current view type. +func (m Model) loadTextAggregate() tea.Cmd { + te := m.textEngine + vt := m.textState.viewType + filter := m.textState.filter + return safeCmdWithPanic( + func() tea.Msg { + ctx := context.Background() + opts := query.TextAggregateOptions{ + SourceID: filter.SourceID, + After: filter.After, + Before: filter.Before, + SortField: filter.SortField, + SortDirection: filter.SortDirection, + Limit: defaultAggregateLimit, + } + rows, err := te.TextAggregate(ctx, vt, opts) + if err != nil { + return textAggregateLoadedMsg{err: err} + } + stats, _ := te.GetTextStats(ctx, query.TextStatsOptions{ + SourceID: filter.SourceID, + }) + return textAggregateLoadedMsg{rows: rows, stats: stats} + }, + func(r any) tea.Msg { + return textAggregateLoadedMsg{ + err: fmt.Errorf("text aggregate panic: %v", r), + } + }, + ) +} + +// loadTextMessages fetches messages for the selected conversation. 
+func (m Model) loadTextMessages() tea.Cmd { + te := m.textEngine + convID := m.textState.selectedConvID + filter := m.textState.filter + return safeCmdWithPanic( + func() tea.Msg { + msgs, err := te.ListConversationMessages( + context.Background(), convID, filter, + ) + return textMessagesLoadedMsg{messages: msgs, err: err} + }, + func(r any) tea.Msg { + return textMessagesLoadedMsg{ + err: fmt.Errorf("text messages panic: %v", r), + } + }, + ) +} + +// loadTextSearch executes a text message search. +func (m Model) loadTextSearch(searchQuery string) tea.Cmd { + te := m.textEngine + return safeCmdWithPanic( + func() tea.Msg { + msgs, err := te.TextSearch( + context.Background(), searchQuery, 100, 0, + ) + return textSearchResultMsg{messages: msgs, err: err} + }, + func(r any) tea.Msg { + return textSearchResultMsg{ + err: fmt.Errorf("text search panic: %v", r), + } + }, + ) +} + +// loadTextData dispatches the appropriate load command based on current view type. +func (m Model) loadTextData() tea.Cmd { + if m.textState.viewType == query.TextViewConversations { + return m.loadTextConversations() + } + return m.loadTextAggregate() +} diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go new file mode 100644 index 00000000..6981e053 --- /dev/null +++ b/internal/tui/text_keys.go @@ -0,0 +1,324 @@ +package tui + +import ( + tea "github.com/charmbracelet/bubbletea" + "github.com/wesm/msgvault/internal/query" +) + +// handleTextKeyPress dispatches key events when in Texts mode. 
+func (m Model) handleTextKeyPress(msg tea.KeyMsg) (tea.Model, tea.Cmd) { + // Modal takes priority + if m.modal != modalNone { + return m.handleModalKeys(msg) + } + + // Inline search takes priority over view keys + if m.inlineSearchActive { + return m.handleTextInlineSearchKeys(msg) + } + + // Check global keys first (q, ?, m) + newM, cmd, handled := m.handleGlobalKeys(msg) + if handled { + return newM, cmd + } + + // Disable selection/deletion keys in Texts mode (read-only) + switch msg.String() { + case " ", "S", "d", "D", "x": + return m, nil + } + + switch m.textState.level { + case textLevelConversations, textLevelAggregate, + textLevelDrillConversations: + return m.handleTextListKeys(msg) + case textLevelTimeline: + return m.handleTextTimelineKeys(msg) + } + return m, nil +} + +// handleTextListKeys handles keys in text list views +// (conversations, aggregates, drill-down conversations). +func (m Model) handleTextListKeys( + msg tea.KeyMsg, +) (tea.Model, tea.Cmd) { + // Handle list navigation + if m.navigateList(msg.String(), m.textRowCount()) { + return m, nil + } + + switch msg.String() { + case "tab", "Tab": + m.cycleTextViewType(true) + m.loading = true + return m, m.loadTextData() + + case "shift+tab": + m.cycleTextViewType(false) + m.loading = true + return m, m.loadTextData() + + case "enter": + return m.textDrillDown() + + case "esc", "backspace": + return m.textGoBack() + + case "s": + m.cycleTextSortField() + m.loading = true + return m, m.loadTextData() + + case "r", "v": + if m.textState.filter.SortDirection == query.SortDesc { + m.textState.filter.SortDirection = query.SortAsc + } else { + m.textState.filter.SortDirection = query.SortDesc + } + m.loading = true + return m, m.loadTextData() + + case "t": + m.textState.viewType = query.TextViewTime + m.textState.level = textLevelAggregate + m.textState.cursor = 0 + m.textState.scrollOffset = 0 + m.loading = true + return m, m.loadTextData() + + case "a": + // Reset to conversations view (clear 
filters) + m.textState = textState{ + viewType: query.TextViewConversations, + } + m.loading = true + return m, m.loadTextConversations() + + case "A": + m.openAccountSelector() + return m, nil + + case "/": + return m, m.activateInlineSearch("search texts") + } + + return m, nil +} + +// handleTextTimelineKeys handles keys in the text timeline view. +func (m Model) handleTextTimelineKeys( + msg tea.KeyMsg, +) (tea.Model, tea.Cmd) { + switch msg.String() { + case "esc", "backspace": + return m.textGoBack() + + case "j", "down": + m.textMoveCursor(1) + return m, nil + + case "k", "up": + m.textMoveCursor(-1) + return m, nil + + case "pgup", "ctrl+u": + m.textMoveCursor(-m.visibleRows()) + return m, nil + + case "pgdown", "ctrl+d": + m.textMoveCursor(m.visibleRows()) + return m, nil + + case "home": + m.textState.cursor = 0 + m.textState.scrollOffset = 0 + return m, nil + + case "end", "G": + maxIdx := m.textRowCount() - 1 + if maxIdx < 0 { + maxIdx = 0 + } + m.textState.cursor = maxIdx + return m, nil + } + return m, nil +} + +// handleTextInlineSearchKeys handles keys when inline search is +// active in Texts mode. Enter commits the search; Esc cancels. +func (m Model) handleTextInlineSearchKeys( + msg tea.KeyMsg, +) (tea.Model, tea.Cmd) { + switch msg.String() { + case "enter": + m.exitInlineSearchMode() + queryStr := m.searchInput.Value() + if queryStr == "" { + return m, nil + } + m.loading = true + return m, m.loadTextSearch(queryStr) + + case "esc": + m.exitInlineSearchMode() + m.searchInput.SetValue("") + return m, nil + + case "ctrl+c": + m.quitting = true + return m, tea.Quit + + default: + var cmd tea.Cmd + m.searchInput, cmd = m.searchInput.Update(msg) + return m, cmd + } +} + +// cycleTextViewType cycles through text view types. 
+func (m *Model) cycleTextViewType(forward bool) { + if forward { + m.textState.viewType++ + if m.textState.viewType >= query.TextViewTypeCount { + m.textState.viewType = 0 + } + } else { + if m.textState.viewType == 0 { + m.textState.viewType = query.TextViewTypeCount - 1 + } else { + m.textState.viewType-- + } + } + if m.textState.viewType == query.TextViewConversations { + m.textState.level = textLevelConversations + } else { + m.textState.level = textLevelAggregate + } + m.textState.cursor = 0 + m.textState.scrollOffset = 0 +} + +// textMoveCursor moves the cursor by delta and adjusts scroll offset. +func (m *Model) textMoveCursor(delta int) { + m.textState.cursor += delta + maxIdx := m.textRowCount() - 1 + if maxIdx < 0 { + maxIdx = 0 + } + if m.textState.cursor < 0 { + m.textState.cursor = 0 + } + if m.textState.cursor > maxIdx { + m.textState.cursor = maxIdx + } + m.textState.scrollOffset = calculateScrollOffset( + m.textState.cursor, + m.textState.scrollOffset, + m.visibleRows(), + ) +} + +// textRowCount returns the number of rows in the current text view. +func (m Model) textRowCount() int { + switch m.textState.level { + case textLevelConversations, textLevelDrillConversations: + return len(m.textState.conversations) + case textLevelAggregate: + return len(m.textState.aggregateRows) + case textLevelTimeline: + return len(m.textState.messages) + } + return 0 +} + +// cycleTextSortField cycles between sort fields for text views. +func (m *Model) cycleTextSortField() { + switch m.textState.filter.SortField { + case query.SortByCount: + m.textState.filter.SortField = query.SortByName + default: + m.textState.filter.SortField = query.SortByCount + } +} + +// textDrillDown enters the selected item in text mode. 
+func (m Model) textDrillDown() (tea.Model, tea.Cmd) { + switch m.textState.level { + case textLevelConversations, textLevelDrillConversations: + if m.textState.cursor >= len(m.textState.conversations) { + return m, nil + } + conv := m.textState.conversations[m.textState.cursor] + m.textState.breadcrumbs = append( + m.textState.breadcrumbs, + textNavSnapshot{ + level: m.textState.level, + viewType: m.textState.viewType, + cursor: m.textState.cursor, + scrollOffset: m.textState.scrollOffset, + filter: m.textState.filter, + selectedConvID: m.textState.selectedConvID, + }, + ) + m.textState.selectedConvID = conv.ConversationID + m.textState.level = textLevelTimeline + m.textState.cursor = 0 + m.textState.scrollOffset = 0 + m.loading = true + return m, m.loadTextMessages() + + case textLevelAggregate: + if m.textState.cursor >= len(m.textState.aggregateRows) { + return m, nil + } + row := m.textState.aggregateRows[m.textState.cursor] + m.textState.breadcrumbs = append( + m.textState.breadcrumbs, + textNavSnapshot{ + level: m.textState.level, + viewType: m.textState.viewType, + cursor: m.textState.cursor, + scrollOffset: m.textState.scrollOffset, + filter: m.textState.filter, + selectedConvID: m.textState.selectedConvID, + }, + ) + // Apply aggregate filter and drill to conversations + switch m.textState.viewType { + case query.TextViewContacts: + m.textState.filter.ContactPhone = row.Key + case query.TextViewContactNames: + m.textState.filter.ContactName = row.Key + case query.TextViewSources: + m.textState.filter.SourceType = row.Key + case query.TextViewLabels: + m.textState.filter.Label = row.Key + } + m.textState.level = textLevelDrillConversations + m.textState.cursor = 0 + m.textState.scrollOffset = 0 + m.loading = true + return m, m.loadTextConversations() + } + return m, nil +} + +// textGoBack returns to the previous text navigation state. 
+func (m Model) textGoBack() (tea.Model, tea.Cmd) { + if len(m.textState.breadcrumbs) == 0 { + return m, nil + } + snap := m.textState.breadcrumbs[len(m.textState.breadcrumbs)-1] + m.textState.breadcrumbs = m.textState.breadcrumbs[:len(m.textState.breadcrumbs)-1] + m.textState.level = snap.level + m.textState.viewType = snap.viewType + m.textState.cursor = snap.cursor + m.textState.scrollOffset = snap.scrollOffset + m.textState.filter = snap.filter + m.textState.selectedConvID = snap.selectedConvID + m.loading = true + return m, m.loadTextData() +} diff --git a/internal/tui/text_state.go b/internal/tui/text_state.go new file mode 100644 index 00000000..34213964 --- /dev/null +++ b/internal/tui/text_state.go @@ -0,0 +1,46 @@ +package tui + +import "github.com/wesm/msgvault/internal/query" + +// tuiMode represents the top-level mode: Email or Texts. +type tuiMode int + +const ( + modeEmail tuiMode = iota + modeTexts +) + +// textViewLevel represents the navigation depth within Texts mode. +type textViewLevel int + +const ( + textLevelConversations textViewLevel = iota // Top-level conversation list + textLevelAggregate // Aggregate view (contacts, sources, etc.) + textLevelDrillConversations // Conversations filtered by aggregate drill-down + textLevelTimeline // Message timeline within a conversation +) + +// textState holds all state for the Texts mode TUI. +type textState struct { + viewType query.TextViewType + level textViewLevel + conversations []query.ConversationRow + aggregateRows []query.AggregateRow + messages []query.MessageSummary + cursor int + scrollOffset int + selectedConvID int64 + filter query.TextFilter + stats *query.TotalStats + breadcrumbs []textNavSnapshot +} + +// textNavSnapshot stores state for text mode navigation history. 
+type textNavSnapshot struct { + level textViewLevel + viewType query.TextViewType + cursor int + scrollOffset int + filter query.TextFilter + selectedConvID int64 +} diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go new file mode 100644 index 00000000..7a04a24f --- /dev/null +++ b/internal/tui/text_view.go @@ -0,0 +1,545 @@ +package tui + +import ( + "fmt" + "strings" + + "github.com/charmbracelet/lipgloss" + "github.com/wesm/msgvault/internal/query" + "github.com/wesm/msgvault/internal/textutil" +) + +// renderTextView renders the current Texts mode view. +func (m Model) renderTextView() string { + header := m.textHeaderView() + var body string + switch m.textState.level { + case textLevelConversations, textLevelDrillConversations: + body = m.textConversationsView() + case textLevelAggregate: + body = m.textAggregateView() + case textLevelTimeline: + body = m.textTimelineView() + default: + body = m.textConversationsView() + } + footer := m.textFooterView() + return fmt.Sprintf("%s\n%s\n%s", header, body, footer) +} + +// textHeaderView renders the Texts mode header (title bar + breadcrumb). +func (m Model) textHeaderView() string { + line1 := m.textTitleBar() + + breadcrumb := m.textBreadcrumb() + statsStr := m.textStatsString() + + breadcrumbStyled := statsStyle.Render(" " + breadcrumb + " ") + statsStyled := statsStyle.Render(statsStr + " ") + gap := m.width - + lipgloss.Width(breadcrumbStyled) - + lipgloss.Width(statsStyled) + if gap < 0 { + gap = 0 + } + line2 := breadcrumbStyled + + strings.Repeat(" ", gap) + statsStyled + + return line1 + "\n" + line2 +} + +// textTitleBar builds the title bar for Texts mode. 
+func (m Model) textTitleBar() string { + titleText := "msgvault" + if m.version != "" && m.version != "dev" && + m.version != "unknown" { + titleText = fmt.Sprintf("msgvault [%s]", m.version) + } + + // Mode indicator + accountStr := "Texts" + if m.accountFilter != nil { + for _, acc := range m.accounts { + if acc.ID == *m.accountFilter { + accountStr = "Texts - " + acc.Identifier + break + } + } + } + + content := fmt.Sprintf("%s - %s", titleText, accountStr) + return titleBarStyle.Render(padRight(content, m.width-2)) +} + +// textBreadcrumb builds the breadcrumb for the current text view. +func (m Model) textBreadcrumb() string { + switch m.textState.level { + case textLevelConversations: + return "Conversations" + case textLevelAggregate: + return m.textState.viewType.String() + case textLevelDrillConversations: + return fmt.Sprintf( + "%s: %s", + textViewTypePrefix(m.textState.viewType), + textDrillKey(m), + ) + case textLevelTimeline: + return fmt.Sprintf( + "Timeline (conv %d)", m.textState.selectedConvID, + ) + } + return "" +} + +// textDrillKey returns the active drill filter key for breadcrumbs. +func textDrillKey(m Model) string { + f := m.textState.filter + if f.ContactPhone != "" { + return truncateRunes(f.ContactPhone, 30) + } + if f.ContactName != "" { + return truncateRunes(f.ContactName, 30) + } + if f.SourceType != "" { + return f.SourceType + } + if f.Label != "" { + return truncateRunes(f.Label, 30) + } + return "?" +} + +// textViewTypePrefix returns a short prefix for text view breadcrumbs. +func textViewTypePrefix(vt query.TextViewType) string { + switch vt { + case query.TextViewContacts: + return "Contact" + case query.TextViewContactNames: + return "Name" + case query.TextViewSources: + return "Source" + case query.TextViewLabels: + return "Label" + case query.TextViewTime: + return "Time" + default: + return "?" + } +} + +// textStatsString builds the stats summary for the Texts header. 
+func (m Model) textStatsString() string { + if m.textState.stats != nil { + return fmt.Sprintf( + "%d msgs | %s", + m.textState.stats.MessageCount, + formatBytes(m.textState.stats.TotalSize), + ) + } + return "" +} + +// textConversationsView renders the conversations list table. +func (m Model) textConversationsView() string { + if len(m.textState.conversations) == 0 && !m.loading { + return m.fillScreen( + normalRowStyle.Render( + padRight("No conversations", m.width), + ), 1, + ) + } + + var sb strings.Builder + + // Column widths + nameWidth := m.width - 48 + if nameWidth < 15 { + nameWidth = 15 + } + if nameWidth > 50 { + nameWidth = 50 + } + + // Header + headerRow := fmt.Sprintf( + " %-*s %10s %10s %-16s", + nameWidth, "Conversation", + "Source", "Messages", "Last Message", + ) + sb.WriteString( + tableHeaderStyle.Render(padRight(headerRow, m.width)), + ) + sb.WriteString("\n") + sb.WriteString( + separatorStyle.Render(strings.Repeat("\u2500", m.width)), + ) + sb.WriteString("\n") + + // Data rows + endRow := m.textState.scrollOffset + m.pageSize - 1 + if endRow > len(m.textState.conversations) { + endRow = len(m.textState.conversations) + } + + for i := m.textState.scrollOffset; i < endRow; i++ { + conv := m.textState.conversations[i] + isCursor := i == m.textState.cursor + + indicator := " " + if isCursor { + indicator = cursorRowStyle.Render("\u25b6 ") + } + + title := textutil.SanitizeTerminal(conv.Title) + if title == "" { + title = fmt.Sprintf("(conv %d)", conv.ConversationID) + } + title = truncateRunes(title, nameWidth) + title = fmt.Sprintf("%-*s", nameWidth, title) + + source := truncateRunes(conv.SourceType, 10) + msgs := formatCount(conv.MessageCount) + lastMsg := conv.LastMessageAt.Format("2006-01-02 15:04") + + line := fmt.Sprintf( + "%s %10s %10s %-16s", + title, source, msgs, lastMsg, + ) + + var style lipgloss.Style + if isCursor { + style = cursorRowStyle + } else if i%2 == 0 { + style = normalRowStyle + } else { + style = altRowStyle + } 
+ + sb.WriteString(indicator) + sb.WriteString( + style.Render(padRight(line, m.width-3)), + ) + sb.WriteString("\n") + } + + // Fill remaining space + dataRows := endRow - m.textState.scrollOffset + for i := dataRows; i < m.pageSize-1; i++ { + sb.WriteString( + normalRowStyle.Render(strings.Repeat(" ", m.width)), + ) + sb.WriteString("\n") + } + + // Info line + var infoContent string + if m.inlineSearchActive { + infoContent = "/" + m.searchInput.View() + } + sb.WriteString(m.renderInfoLine(infoContent, m.loading)) + + if m.modal != modalNone { + return m.overlayModal(sb.String()) + } + return sb.String() +} + +// textAggregateView renders the text aggregate table (contacts, sources, etc.). +func (m Model) textAggregateView() string { + if len(m.textState.aggregateRows) == 0 && !m.loading { + return m.fillScreen( + normalRowStyle.Render( + padRight("No data", m.width), + ), 1, + ) + } + + var sb strings.Builder + + // Column widths + keyWidth := m.width - 43 + if keyWidth < 20 { + keyWidth = 20 + } + if keyWidth > 57 { + keyWidth = 57 + } + + // Sort indicators + sortInd := func(field query.SortField) string { + if m.textState.filter.SortField == field { + if m.textState.filter.SortDirection == query.SortDesc { + return "\u2193" + } + return "\u2191" + } + return "" + } + + viewLabel := m.textState.viewType.String() + if si := sortInd(query.SortByName); si != "" { + viewLabel += si + } + countLabel := "Count" + if si := sortInd(query.SortByCount); si != "" { + countLabel += si + } + sizeLabel := "Size" + if si := sortInd(query.SortBySize); si != "" { + sizeLabel += si + } + attachLabel := "Attchs" + + headerRow := fmt.Sprintf( + " %-*s %10s %12s %12s", + keyWidth, viewLabel, + countLabel, sizeLabel, attachLabel, + ) + sb.WriteString( + tableHeaderStyle.Render(padRight(headerRow, m.width)), + ) + sb.WriteString("\n") + sb.WriteString( + separatorStyle.Render(strings.Repeat("\u2500", m.width)), + ) + sb.WriteString("\n") + + endRow := m.textState.scrollOffset + 
m.pageSize - 1 + if endRow > len(m.textState.aggregateRows) { + endRow = len(m.textState.aggregateRows) + } + + for i := m.textState.scrollOffset; i < endRow; i++ { + row := m.textState.aggregateRows[i] + isCursor := i == m.textState.cursor + + indicator := " " + if isCursor { + indicator = cursorRowStyle.Render("\u25b6 ") + } + + key := truncateRunes(row.Key, keyWidth) + key = fmt.Sprintf("%-*s", keyWidth, key) + + line := fmt.Sprintf( + "%s %10s %12s %12s", + key, + formatCount(row.Count), + formatBytes(row.TotalSize), + formatBytes(row.AttachmentSize), + ) + + var style lipgloss.Style + if isCursor { + style = cursorRowStyle + } else if i%2 == 0 { + style = normalRowStyle + } else { + style = altRowStyle + } + + sb.WriteString(indicator) + sb.WriteString( + style.Render(padRight(line, m.width-3)), + ) + sb.WriteString("\n") + } + + // Fill remaining space + dataRows := endRow - m.textState.scrollOffset + if len(m.textState.aggregateRows) == 0 && !m.loading { + dataRows = 1 + } + for i := dataRows; i < m.pageSize-1; i++ { + sb.WriteString( + normalRowStyle.Render(strings.Repeat(" ", m.width)), + ) + sb.WriteString("\n") + } + + // Info line + var infoContent string + if m.inlineSearchActive { + infoContent = "/" + m.searchInput.View() + } + sb.WriteString(m.renderInfoLine(infoContent, m.loading)) + + if m.modal != modalNone { + return m.overlayModal(sb.String()) + } + return sb.String() +} + +// textTimelineView renders a chronological message timeline. 
+func (m Model) textTimelineView() string { + if len(m.textState.messages) == 0 && !m.loading { + return m.fillScreen( + normalRowStyle.Render( + padRight("No messages", m.width), + ), 1, + ) + } + + var sb strings.Builder + + // Column widths + dateWidth := 16 + fromWidth := 20 + bodyWidth := m.width - dateWidth - fromWidth - 9 + if bodyWidth < 10 { + bodyWidth = 10 + } + + // Header + headerRow := fmt.Sprintf( + " %-*s %-*s %-*s", + dateWidth, "Time", + fromWidth, "Sender", + bodyWidth, "Message", + ) + sb.WriteString( + tableHeaderStyle.Render(padRight(headerRow, m.width)), + ) + sb.WriteString("\n") + sb.WriteString( + separatorStyle.Render(strings.Repeat("\u2500", m.width)), + ) + sb.WriteString("\n") + + endRow := m.textState.scrollOffset + m.pageSize - 1 + if endRow > len(m.textState.messages) { + endRow = len(m.textState.messages) + } + + for i := m.textState.scrollOffset; i < endRow; i++ { + msg := m.textState.messages[i] + isCursor := i == m.textState.cursor + + indicator := " " + if isCursor { + indicator = cursorRowStyle.Render("\u25b6 ") + } + + dateStr := msg.SentAt.Format("2006-01-02 15:04") + + // Sender: prefer name, then phone, then email + from := textutil.SanitizeTerminal(msg.FromName) + if from == "" && msg.FromPhone != "" { + from = textutil.SanitizeTerminal(msg.FromPhone) + } + if from == "" { + from = textutil.SanitizeTerminal(msg.FromEmail) + } + from = truncateRunes(from, fromWidth) + from = fmt.Sprintf("%-*s", fromWidth, from) + + // Message body: use snippet + body := textutil.SanitizeTerminal(msg.Snippet) + if body == "" { + body = textutil.SanitizeTerminal(msg.Subject) + } + body = truncateRunes(body, bodyWidth) + body = fmt.Sprintf("%-*s", bodyWidth, body) + + line := fmt.Sprintf( + "%-*s %s %s", + dateWidth, dateStr, from, body, + ) + + var style lipgloss.Style + if isCursor { + style = cursorRowStyle + } else if i%2 == 0 { + style = normalRowStyle + } else { + style = altRowStyle + } + + sb.WriteString(indicator) + sb.WriteString( + 
style.Render(padRight(line, m.width-3)), + ) + sb.WriteString("\n") + } + + // Fill remaining space + dataRows := endRow - m.textState.scrollOffset + for i := dataRows; i < m.pageSize-1; i++ { + sb.WriteString( + normalRowStyle.Render(strings.Repeat(" ", m.width)), + ) + sb.WriteString("\n") + } + + // Info line + sb.WriteString(m.renderNotificationLine()) + + if m.modal != modalNone { + return m.overlayModal(sb.String()) + } + return sb.String() +} + +// textFooterView renders the footer with keybindings for Texts mode. +func (m Model) textFooterView() string { + var keys []string + var posStr string + + switch m.textState.level { + case textLevelConversations, textLevelDrillConversations: + keys = []string{ + "\u2191/k", "\u2193/j", "Enter", + "Tab group", "s sort", "A acct", + "m email", "? help", + } + if m.textState.level == textLevelDrillConversations { + keys = append([]string{"\u2191/k", "\u2193/j", "Enter", + "Esc back", "Tab group", "s sort", + "m email", "? help"}, []string{}...) + } + n := len(m.textState.conversations) + if n > 0 { + posStr = fmt.Sprintf( + " %d/%d ", m.textState.cursor+1, n, + ) + } + + case textLevelAggregate: + keys = []string{ + "\u2191/k", "\u2193/j", "Enter", + "Esc back", "Tab group", "s sort", + "m email", "? help", + } + n := len(m.textState.aggregateRows) + if n > 0 { + posStr = fmt.Sprintf( + " %d/%d ", m.textState.cursor+1, n, + ) + } + + case textLevelTimeline: + keys = []string{ + "\u2191/\u2193 navigate", "Esc back", + "m email", "? 
help", + } + n := len(m.textState.messages) + if n > 0 { + posStr = fmt.Sprintf( + " %d/%d ", m.textState.cursor+1, n, + ) + } + } + + keysStr := strings.Join(keys, " \u2502 ") + gap := m.width - + lipgloss.Width(keysStr) - + lipgloss.Width(posStr) - 2 + if gap < 0 { + gap = 0 + } + + return footerStyle.Render( + keysStr + strings.Repeat(" ", gap) + posStr, + ) +} diff --git a/internal/tui/view.go b/internal/tui/view.go index 57f3a536..7fca53ae 100644 --- a/internal/tui/view.go +++ b/internal/tui/view.go @@ -182,8 +182,14 @@ func (m Model) buildTitleBar() string { } } - // Build line content: "msgvault [hash] - Account update: vX.Y.Z" - line1Content := fmt.Sprintf("%s - %s", titleText, accountStr) + // Mode indicator when text engine is available + modeStr := "" + if m.textEngine != nil { + modeStr = " [Email]" + } + + // Build line content: "msgvault [hash] [Email] - Account update: vX.Y.Z" + line1Content := fmt.Sprintf("%s%s - %s", titleText, modeStr, accountStr) if updateNotice != "" { gap := m.width - 2 - lipgloss.Width(line1Content) - lipgloss.Width(updateNotice) if gap > 1 { @@ -1198,6 +1204,7 @@ var rawHelpLines = []string{ " A Select account", " f Filter (attachments, deleted)", " e Export attachments (in message view)", + " m Toggle Email/Texts mode", " q Quit", "", "[↑/↓] Scroll [Any other key] Close", From 84e53a01f92d2e1de5560a7923cac3ee25150b01 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:53:55 -0500 Subject: [PATCH 24/65] Wire TextEngine into TUI initialization --- cmd/msgvault/cmd/tui.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cmd/msgvault/cmd/tui.go b/cmd/msgvault/cmd/tui.go index ecefa764..fb98fa74 100644 --- a/cmd/msgvault/cmd/tui.go +++ b/cmd/msgvault/cmd/tui.go @@ -142,11 +142,18 @@ Remote Mode: } } + // Check if engine supports text queries + var textEngine query.TextEngine + if te, ok := engine.(query.TextEngine); ok { + textEngine = te + } + // Create and run TUI model := 
tui.New(engine, tui.Options{ - DataDir: cfg.Data.DataDir, - Version: Version, - IsRemote: isRemote, + DataDir: cfg.Data.DataDir, + Version: Version, + IsRemote: isRemote, + TextEngine: textEngine, }) p := tea.NewProgram(model, tea.WithAltScreen()) From 6ae46fe3fb47660b4520fd411152c47dd20fec17 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 20:58:47 -0500 Subject: [PATCH 25/65] Add end-to-end integration test for text message import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests the full pipeline: store methods, participant deduplication across sources, conversation stats recomputation, and all four TextEngine methods (ListConversations, TextAggregate, ListConversation Messages, GetTextStats). Also fixes a bug in ListConversations where MAX(sent_at) — returned as a plain string by SQLite's aggregate — was scanned into sql.NullTime (which fails). Now scans into sql.NullString and parses with explicit timestamp format list. Co-Authored-By: Claude Sonnet 4.6 --- internal/query/sqlite_text.go | 41 +++- internal/textimport/integration_test.go | 296 ++++++++++++++++++++++++ 2 files changed, 333 insertions(+), 4 deletions(-) create mode 100644 internal/textimport/integration_test.go diff --git a/internal/query/sqlite_text.go b/internal/query/sqlite_text.go index baff2c2a..4dd270ea 100644 --- a/internal/query/sqlite_text.go +++ b/internal/query/sqlite_text.go @@ -5,11 +5,40 @@ import ( "database/sql" "fmt" "strings" + "time" ) // Compile-time interface assertion. var _ TextEngine = (*SQLiteEngine)(nil) +// sqliteTimestampLayouts lists the datetime string formats emitted by SQLite +// and the go-sqlite3 driver. More specific formats come first. 
+var sqliteTimestampLayouts = []string{ + "2006-01-02 15:04:05.999999999-07:00", + "2006-01-02T15:04:05.999999999-07:00", + "2006-01-02 15:04:05.999999999", + "2006-01-02T15:04:05.999999999", + "2006-01-02 15:04:05", + "2006-01-02T15:04:05", + "2006-01-02 15:04", + "2006-01-02T15:04", + "2006-01-02", + time.RFC3339, + time.RFC3339Nano, +} + +// parseSQLiteTimestamp parses a datetime string from a SQLite aggregate +// (e.g., MAX(sent_at)) that the driver returns as a plain string rather +// than a time.Time value. +func parseSQLiteTimestamp(s string) (time.Time, error) { + for _, layout := range sqliteTimestampLayouts { + if t, err := time.Parse(layout, s); err == nil { + return t, nil + } + } + return time.Time{}, fmt.Errorf("unrecognized SQLite timestamp %q", s) +} + // textMsgTypeFilter returns a SQL condition restricting to text message types. // Uses the m. table alias used in text query methods. func textMsgTypeFilter() string { @@ -164,7 +193,7 @@ func (e *SQLiteEngine) ListConversations( var results []ConversationRow for rows.Next() { var row ConversationRow - var lastAt sql.NullTime + var lastAtStr sql.NullString var totalSize int64 if err := rows.Scan( &row.ConversationID, @@ -172,14 +201,18 @@ func (e *SQLiteEngine) ListConversations( &row.SourceType, &row.MessageCount, &row.ParticipantCount, - &lastAt, + &lastAtStr, &row.LastPreview, &totalSize, ); err != nil { return nil, fmt.Errorf("scan conversation: %w", err) } - if lastAt.Valid { - row.LastMessageAt = lastAt.Time + // MAX(sent_at) returns a string from SQLite; parse it manually so that + // the scan works regardless of column affinity on the aggregated value. 
+ if lastAtStr.Valid && lastAtStr.String != "" { + if t, err := parseSQLiteTimestamp(lastAtStr.String); err == nil { + row.LastMessageAt = t + } } results = append(results, row) } diff --git a/internal/textimport/integration_test.go b/internal/textimport/integration_test.go new file mode 100644 index 00000000..2cf5ec57 --- /dev/null +++ b/internal/textimport/integration_test.go @@ -0,0 +1,296 @@ +package textimport_test + +import ( + "context" + "database/sql" + "path/filepath" + "testing" + "time" + + _ "github.com/mattn/go-sqlite3" + "github.com/wesm/msgvault/internal/query" + "github.com/wesm/msgvault/internal/store" +) + +// TestIntegration exercises the full text message import pipeline: +// store methods, participant deduplication across sources, +// conversation stats recomputation, and TextEngine queries. +func TestIntegration(t *testing.T) { + ctx := context.Background() + + // Create a temporary on-disk DB (store.Open does MkdirAll, WAL, etc.) + dbPath := filepath.Join(t.TempDir(), "integration.db") + s, err := store.Open(dbPath) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + if err := s.InitSchema(); err != nil { + t.Fatalf("init schema: %v", err) + } + + // --- Sources --- + src1, err := s.GetOrCreateSource("whatsapp", "whatsapp:+15550000001") + if err != nil { + t.Fatalf("GetOrCreateSource(whatsapp): %v", err) + } + src2, err := s.GetOrCreateSource("apple_messages", "apple_messages:+15550000001") + if err != nil { + t.Fatalf("GetOrCreateSource(apple_messages): %v", err) + } + + // --- Participant deduplication across sources --- + // Both sources reference the same phone +15551234567. + // EnsureParticipantByPhone deduplicates by phone, so both calls should + // return the same participant ID. 
+ participantID1, err := s.EnsureParticipantByPhone("+15551234567", "Alice", "whatsapp") + if err != nil { + t.Fatalf("EnsureParticipantByPhone(src1): %v", err) + } + participantID2, err := s.EnsureParticipantByPhone("+15551234567", "Alice", "imessage") + if err != nil { + t.Fatalf("EnsureParticipantByPhone(src2): %v", err) + } + if participantID1 != participantID2 { + t.Errorf("same phone across sources: participant IDs differ: %d != %d", participantID1, participantID2) + } + phoneParticipantID := participantID1 + + // --- Conversations --- + conv1ID, err := s.EnsureConversationWithType(src1.ID, "wa-conv-1", "whatsapp", "WhatsApp Chat") + if err != nil { + t.Fatalf("EnsureConversationWithType(src1): %v", err) + } + conv2ID, err := s.EnsureConversationWithType(src2.ID, "am-conv-1", "imessage", "iMessage Chat") + if err != nil { + t.Fatalf("EnsureConversationWithType(src2): %v", err) + } + + // Link participant to both conversations. + if err := s.EnsureConversationParticipant(conv1ID, phoneParticipantID, "member"); err != nil { + t.Fatalf("EnsureConversationParticipant(conv1): %v", err) + } + if err := s.EnsureConversationParticipant(conv2ID, phoneParticipantID, "member"); err != nil { + t.Fatalf("EnsureConversationParticipant(conv2): %v", err) + } + + // --- Messages for source 1 (whatsapp) --- + baseTime := time.Date(2024, 6, 1, 10, 0, 0, 0, time.UTC) + whatsappMsgs := []struct { + srcMsgID string + snippet string + sentAt time.Time + fromMe bool + }{ + {"wa-1", "Hello from WhatsApp", baseTime, false}, + {"wa-2", "Reply on WhatsApp", baseTime.Add(time.Minute), true}, + {"wa-3", "Third WhatsApp message", baseTime.Add(2 * time.Minute), false}, + } + for _, m := range whatsappMsgs { + msg := &store.Message{ + SourceID: src1.ID, + SourceMessageID: m.srcMsgID, + ConversationID: conv1ID, + MessageType: "whatsapp", + Snippet: sql.NullString{String: m.snippet, Valid: true}, + SentAt: sql.NullTime{Time: m.sentAt, Valid: true}, + IsFromMe: m.fromMe, + SizeEstimate: 
int64(len(m.snippet)), + SenderID: sql.NullInt64{Int64: phoneParticipantID, Valid: !m.fromMe}, + } + msgID, err := s.UpsertMessage(msg) + if err != nil { + t.Fatalf("UpsertMessage(%s): %v", m.srcMsgID, err) + } + bodyText := sql.NullString{String: m.snippet, Valid: true} + if err := s.UpsertMessageBody(msgID, bodyText, sql.NullString{}); err != nil { + t.Fatalf("UpsertMessageBody(%s): %v", m.srcMsgID, err) + } + // Add participant as message recipient for TextAggregate to pick up + if err := s.ReplaceMessageRecipients( + msgID, "from", + []int64{phoneParticipantID}, []string{"Alice"}, + ); err != nil { + t.Fatalf("ReplaceMessageRecipients(%s): %v", m.srcMsgID, err) + } + } + + // --- Messages for source 2 (apple_messages) --- + imessageMsgs := []struct { + srcMsgID string + snippet string + sentAt time.Time + }{ + {"am-1", "Hi from iMessage", baseTime.Add(time.Hour)}, + {"am-2", "iMessage follow-up", baseTime.Add(time.Hour + time.Minute)}, + } + for _, m := range imessageMsgs { + msg := &store.Message{ + SourceID: src2.ID, + SourceMessageID: m.srcMsgID, + ConversationID: conv2ID, + MessageType: "imessage", + Snippet: sql.NullString{String: m.snippet, Valid: true}, + SentAt: sql.NullTime{Time: m.sentAt, Valid: true}, + SizeEstimate: int64(len(m.snippet)), + SenderID: sql.NullInt64{Int64: phoneParticipantID, Valid: true}, + } + msgID, err := s.UpsertMessage(msg) + if err != nil { + t.Fatalf("UpsertMessage(%s): %v", m.srcMsgID, err) + } + bodyText := sql.NullString{String: m.snippet, Valid: true} + if err := s.UpsertMessageBody(msgID, bodyText, sql.NullString{}); err != nil { + t.Fatalf("UpsertMessageBody(%s): %v", m.srcMsgID, err) + } + if err := s.ReplaceMessageRecipients( + msgID, "from", + []int64{phoneParticipantID}, []string{"Alice"}, + ); err != nil { + t.Fatalf("ReplaceMessageRecipients(%s): %v", m.srcMsgID, err) + } + } + + // --- Labels --- + labelID, err := s.EnsureLabel(src1.ID, "important", "Important", "user") + if err != nil { + t.Fatalf("EnsureLabel: 
%v", err) + } + // Fetch the first WhatsApp message ID to link a label. + var wa1MsgID int64 + if err := s.DB().QueryRow( + `SELECT id FROM messages WHERE source_message_id = ?`, "wa-1", + ).Scan(&wa1MsgID); err != nil { + t.Fatalf("lookup wa-1 message: %v", err) + } + if err := s.LinkMessageLabel(wa1MsgID, labelID); err != nil { + t.Fatalf("LinkMessageLabel: %v", err) + } + + // Verify label is linked. + var labelCount int + if err := s.DB().QueryRow( + `SELECT COUNT(*) FROM message_labels WHERE message_id = ?`, wa1MsgID, + ).Scan(&labelCount); err != nil { + t.Fatalf("count labels: %v", err) + } + if labelCount != 1 { + t.Errorf("label count for wa-1: got %d, want 1", labelCount) + } + + // --- Recompute conversation stats --- + if err := s.RecomputeConversationStats(src1.ID); err != nil { + t.Fatalf("RecomputeConversationStats(src1): %v", err) + } + if err := s.RecomputeConversationStats(src2.ID); err != nil { + t.Fatalf("RecomputeConversationStats(src2): %v", err) + } + + // Verify conversation stats for conv1. + var msgCount int64 + if err := s.DB().QueryRow( + `SELECT message_count FROM conversations WHERE id = ?`, conv1ID, + ).Scan(&msgCount); err != nil { + t.Fatalf("read conv1 stats: %v", err) + } + if msgCount != 3 { + t.Errorf("conv1 message_count: got %d, want 3", msgCount) + } + + // --- TextEngine queries --- + eng := query.NewSQLiteEngine(s.DB()) + var te query.TextEngine = eng + + // ListConversations — should return both conversations. 
+ convRows, err := te.ListConversations(ctx, query.TextFilter{}) + if err != nil { + t.Fatalf("ListConversations: %v", err) + } + if len(convRows) != 2 { + t.Errorf("ListConversations: got %d rows, want 2", len(convRows)) + } + convByID := make(map[int64]query.ConversationRow) + for _, row := range convRows { + convByID[row.ConversationID] = row + } + if row, ok := convByID[conv1ID]; !ok { + t.Errorf("conv1 not found in ListConversations results") + } else if row.MessageCount != 3 { + t.Errorf("conv1 MessageCount: got %d, want 3", row.MessageCount) + } + if row, ok := convByID[conv2ID]; !ok { + t.Errorf("conv2 not found in ListConversations results") + } else if row.MessageCount != 2 { + t.Errorf("conv2 MessageCount: got %d, want 2", row.MessageCount) + } + + // TextAggregate by contacts — groups by phone number. + // All 5 messages have +15551234567 as the from participant. + aggRows, err := te.TextAggregate(ctx, query.TextViewContacts, query.TextAggregateOptions{Limit: 100}) + if err != nil { + t.Fatalf("TextAggregate(TextViewContacts): %v", err) + } + if len(aggRows) == 0 { + t.Fatal("TextAggregate(TextViewContacts): got 0 rows, want at least 1") + } + foundPhone := false + for _, row := range aggRows { + if row.Key == "+15551234567" { + foundPhone = true + if row.Count != 5 { + t.Errorf("contact +15551234567: got count %d, want 5", row.Count) + } + } + } + if !foundPhone { + t.Errorf("TextAggregate: phone +15551234567 not found in results") + } + + // ListConversationMessages — returns messages for conv1 in chronological order. + messages, err := te.ListConversationMessages(ctx, conv1ID, query.TextFilter{}) + if err != nil { + t.Fatalf("ListConversationMessages(conv1): %v", err) + } + if len(messages) != 3 { + t.Errorf("ListConversationMessages(conv1): got %d messages, want 3", len(messages)) + } + // Verify chronological order (ascending by sent_at). 
+ for i := 1; i < len(messages); i++ { + if messages[i].SentAt.Before(messages[i-1].SentAt) { + t.Errorf("messages not in chronological order at index %d", i) + } + } + // Verify message type is correct. + for _, msg := range messages { + if msg.MessageType != "whatsapp" { + t.Errorf("expected message_type=whatsapp, got %q", msg.MessageType) + } + } + + // GetTextStats — should count all 5 text messages. + stats, err := te.GetTextStats(ctx, query.TextStatsOptions{}) + if err != nil { + t.Fatalf("GetTextStats: %v", err) + } + if stats.MessageCount != 5 { + t.Errorf("GetTextStats.MessageCount: got %d, want 5", stats.MessageCount) + } + // Should see 2 accounts (sources). + if stats.AccountCount != 2 { + t.Errorf("GetTextStats.AccountCount: got %d, want 2", stats.AccountCount) + } + // LabelCount: 1 label linked to at least one text message. + if stats.LabelCount != 1 { + t.Errorf("GetTextStats.LabelCount: got %d, want 1", stats.LabelCount) + } + + // GetTextStats filtered by source 1 only. 
+ statsS1, err := te.GetTextStats(ctx, query.TextStatsOptions{SourceID: &src1.ID}) + if err != nil { + t.Fatalf("GetTextStats(src1): %v", err) + } + if statsS1.MessageCount != 3 { + t.Errorf("GetTextStats(src1).MessageCount: got %d, want 3", statsS1.MessageCount) + } +} From d17b9a1f309bed0d80cab9c1e01ab6bfc4eb9d2e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 21:02:28 -0500 Subject: [PATCH 26/65] Remove superseded multi-source-messaging planning doc --- .../2026-02-17-multi-source-messaging.md | 164 ------------------ 1 file changed, 164 deletions(-) delete mode 100644 docs/plans/2026-02-17-multi-source-messaging.md diff --git a/docs/plans/2026-02-17-multi-source-messaging.md b/docs/plans/2026-02-17-multi-source-messaging.md deleted file mode 100644 index 48b7b415..00000000 --- a/docs/plans/2026-02-17-multi-source-messaging.md +++ /dev/null @@ -1,164 +0,0 @@ -# Multi-Source Messaging Support - -**Issue:** [wesm/msgvault#136](https://github.com/wesm/msgvault/issues/136) -**Author:** Ed Dowding -**Date:** 2026-02-17 -**Status:** Draft for review - -## Goal - -Make msgvault a universal message archive — not just Gmail. Starting with WhatsApp, but ensuring the design works for iMessage, Telegram, SMS, and other chat platforms. - -## Good News: The Schema Is Already Ready - -The existing schema was designed for this. 
Key fields already in place: - -| Table | Multi-source fields | -|-------|-------------------| -| `sources` | `source_type` ('gmail', 'whatsapp', 'apple_messages', 'google_messages'), `identifier` (email or phone), `sync_cursor` (platform-agnostic) | -| `messages` | `message_type` ('email', 'imessage', 'sms', 'mms', 'rcs', 'whatsapp'), `is_edited`, `is_forwarded`, `delivered_at`, `read_at` | -| `conversations` | `conversation_type` ('email_thread', 'group_chat', 'direct_chat', 'channel') | -| `participants` | `phone_number` (E.164), `canonical_id` (cross-platform dedup) | -| `participant_identifiers` | `identifier_type` ('email', 'phone', 'apple_id', 'whatsapp') | -| `attachments` | `media_type` ('image', 'video', 'audio', 'sticker', 'gif', 'voice_note') | -| `reactions` | `reaction_type` ('tapback', 'emoji', 'like') | -| `message_raw` | `raw_format` ('mime', 'imessage_archive', 'whatsapp_json', 'rcs_json') | - -**No schema migrations needed.** The store layer (`UpsertMessage`, `GetOrCreateSource`, etc.) is already generic — it accepts any `source_type` and `message_type`. The tight coupling to Gmail is only in the sync pipeline and CLI commands. - -## CLI Design - -Per Wes's feedback, use `--type` not `--whatsapp`: - -```bash -# Add accounts -msgvault add-account user@gmail.com # default: --type gmail -msgvault add-account --type whatsapp "+447700900000" # WhatsApp via phone -msgvault add-account --type imessage # no identifier needed (local DB) - -# Sync -msgvault sync-full # all sources -msgvault sync-full user@gmail.com # specific account -msgvault sync-full "+447700900000" # auto-detects type from sources table -msgvault sync-full --type whatsapp # all WhatsApp accounts -msgvault sync-incremental # incremental for all sources -``` - -**Account identifiers** use E.164 phone numbers for phone-based sources (`+447700900000`), email addresses for email-based sources. 
The existing `UNIQUE(source_type, identifier)` constraint means the same phone number can be both a WhatsApp and iMessage account. - -## How Each Platform Syncs - -The fundamental difference: Gmail is pull-based (fetch any message anytime), most chat platforms are push-based (stream messages in real time). Each platform gets its own package under `internal/` that knows how to sync into the shared store. - -| Platform | Sync model | History access | Auth | Identifier | -|----------|-----------|---------------|------|------------| -| **Gmail** | Pull via API | Full random access | OAuth2 (browser or device flow) | Email address | -| **WhatsApp** | Connect + stream | One-time dump at pairing, then forward-only | QR code or phone pairing code | E.164 phone | -| **iMessage** | Read local SQLite | Full (reads `~/Library/Messages/chat.db`) | macOS Full Disk Access | None (local) | -| **Telegram** | Pull via TDLib | Full history via API | Phone + code | E.164 phone | -| **SMS/Android** | Read local SQLite | Full (reads `mmssms.db` from backup) | File access | E.164 phone | - -No abstract `Provider` interface up front — just build each platform's sync as a standalone package, and extract common patterns once we have two working. YAGNI. - -## WhatsApp Specifics (Phase 1) - -### Library: whatsmeow - -[whatsmeow](https://github.com/tulir/whatsmeow) is a pure Go implementation of the WhatsApp Web multi-device protocol. Production-grade — it powers the [mautrix-whatsapp](https://github.com/mautrix/whatsapp) Matrix bridge (2,200+ stars). Actively maintained (last commit: Feb 2026). - -### Auth Flow - -1. User runs `msgvault add-account --type whatsapp "+447700900000"` -2. Terminal displays QR code (or pairing code with `--headless`) -3. User scans with WhatsApp on their phone -4. Session credentials stored in SQLite (alongside msgvault's main DB) -5. 
Session persists across restarts — no re-scanning needed - -Session expires if the primary phone doesn't connect to internet for 14 days, or after ~30 days of inactivity. - -### Sync Model - -**Critical constraint:** WhatsApp history is a one-time dump, not an on-demand API. - -``` -First sync: - connect → receive history dump (HistorySync event) → stream until caught up → disconnect - -Subsequent syncs: - connect → stream new messages since last cursor → disconnect -``` - -On-demand historical backfill exists (`BuildHistorySyncRequest`) but is documented as unreliable, especially for groups. Design accordingly: treat initial history as best-effort, then reliably capture everything going forward. - -### Media Must Be Downloaded Immediately - -WhatsApp media URLs expire after ~30 days. Unlike Gmail where you can fetch any attachment anytime, WhatsApp media must be downloaded and stored locally at sync time. The existing content-addressed attachment storage (SHA-256 dedup) works perfectly for this. 
- -### Message Type Mapping - -| WhatsApp | msgvault field | Value | -|----------|---------------|-------| -| Text message | `messages.message_type` | `'whatsapp'` | -| Image/Video/Audio | `attachments.media_type` | `'image'`, `'video'`, `'audio'` | -| Voice note | `attachments.media_type` | `'voice_note'` | -| Sticker | `attachments.media_type` | `'sticker'` | -| Document | `attachments.media_type` | `'document'` | -| Reaction (emoji) | `reactions.reaction_type` | `'emoji'` | -| Reply/Quote | `messages.reply_to_message_id` | FK to parent message | -| Forwarded | `messages.is_forwarded` | `true` | -| Edited | `messages.is_edited` | `true` | -| Read receipt | `messages.read_at` | Timestamp | -| Delivery receipt | `messages.delivered_at` | Timestamp | -| Group chat | `conversations.conversation_type` | `'group_chat'` | -| 1:1 chat | `conversations.conversation_type` | `'direct_chat'` | -| Sender JID | `participant_identifiers.identifier_type` | `'whatsapp'`, value = `447700900000@s.whatsapp.net` | -| Sender phone | `participants.phone_number` | `+447700900000` (E.164) | -| Raw protobuf | `message_raw.raw_format` | `'whatsapp_protobuf'` | - -### What Changes in Existing Code - -**New package:** `internal/whatsapp/` — self-contained, no changes to existing Gmail code. - -**Small changes needed:** -- `cmd/msgvault/cmd/addaccount.go`: Add `--type` flag, dispatch to WhatsApp auth when type is `"whatsapp"` -- `cmd/msgvault/cmd/syncfull.go`: Currently hardcodes `ListSources("gmail")` — change to `ListSources("")` (all types) with a type-based dispatcher -- `internal/store/`: Add `EnsureParticipantByPhone()` method (currently only handles email-based participants) -- `internal/store/`: Add `'member'` as a valid `recipient_type` for group chat participants - -**No changes to:** schema, query engine, TUI, MCP server, HTTP API, or any consumer. 
Messages from WhatsApp will appear in search, aggregation, and all views automatically because consumers operate on the generic `messages` table. - -## Risks - -| Risk | Severity | Mitigation | -|------|----------|------------| -| **Account ban/warning** | High | WhatsApp TOS prohibits unofficial clients. Read-only archival is lower risk than bots, but not zero. Document prominently. Recommend a dedicated/secondary number for testing. | -| **History dump is incomplete** | Medium | WhatsApp server decides how much history to send at pairing. Design as "best effort snapshot + reliable stream forward." | -| **whatsmeow protocol breakage** | Medium | WhatsApp changes their protocol regularly. Pin whatsmeow version, expect occasional breakage, track upstream releases. | -| **Media URL expiration** | Low | Download everything at sync time. Already mitigated by design. | -| **Phone must be online every 14 days** | Low | Document requirement. Could add a warning in `sync` output if session is stale. | - -## How Other Platforms Would Plug In Later - -Each gets its own `internal/<platform>/` package that syncs into the store. Brief notes on feasibility: - -**iMessage** (macOS only): Read `~/Library/Messages/chat.db` directly. Full history available. Timestamps use Apple epoch (nanoseconds since 2001-01-01). Tapbacks stored as separate messages referencing parent via `associated_message_guid` — would map to `reactions` table. Requires Full Disk Access permission. No network needed. - -**Telegram**: TDLib (official C++ library with Go bindings) or import from Desktop export JSON. Full history available via API. Unique features: channels, supergroups, forums, scheduled messages, silent messages. User IDs are numeric (not phone-based) but phone is the auth method. - -**SMS/Android**: Import from `mmssms.db` backup. Simple data model (phone, timestamp, body). MMS attachments in `part` table. No reactions, no threading, no edits. - -**Signal**: Hardest. Desktop DB is SQLCipher-encrypted.
Schema changes frequently (215+ migration versions). No official export API. Feasible but fragile. - -## Implementation Phases - -**Phase 1 — CLI + dispatcher (no new platforms):** -Add `--type` flag. Change sync dispatch from Gmail-only to type-based. All existing behavior unchanged. - -**Phase 2 — WhatsApp sync:** -`internal/whatsapp/` package. QR pairing. History dump. Forward streaming. Media download. Phone participant handling. - -**Phase 3 — WhatsApp features:** -Reactions, replies, groups with metadata, voice notes, stickers, read receipts. - -**Phase 4 — Next platform (iMessage or Telegram):** -By this point we'll have two implementations and can extract common patterns if they emerge naturally. Not before. From 7ddb22479d07043fafa3b3754e72c29a12538d10 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 21:32:14 -0500 Subject: [PATCH 27/65] Fix NormalizePhone for international formats and validation Co-Authored-By: Claude Sonnet 4.6 --- internal/textimport/phone.go | 51 +++++++++++++++++++------------ internal/textimport/phone_test.go | 8 +++++ 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/internal/textimport/phone.go b/internal/textimport/phone.go index d58108c8..1f4fa497 100644 --- a/internal/textimport/phone.go +++ b/internal/textimport/phone.go @@ -18,36 +18,49 @@ func NormalizePhone(raw string) (string, error) { return "", fmt.Errorf("not a phone number: %q", raw) } - // Strip all non-digit and non-plus characters + // Strip trunk prefix (0) before collecting digits, e.g. "+44 (0)7700" → "+44 7700" + cleaned := strings.ReplaceAll(raw, "(0)", "") + + // Collect digits and any leading '+'; reject embedded '+' (e.g. 
"1+555...") var b strings.Builder - for _, r := range raw { - if r == '+' || unicode.IsDigit(r) { + leadingPlus := false + for i, r := range cleaned { + switch { + case r == '+' && i == 0: + leadingPlus = true + case r == '+': + return "", fmt.Errorf("embedded '+' in phone number: %q", raw) + case unicode.IsDigit(r): b.WriteRune(r) } } - digits := b.String() + justDigits := b.String() - // Must start with + or be all digits - if digits == "" { + if justDigits == "" { return "", fmt.Errorf("no digits in input: %q", raw) } - - // Strip leading + for length check - justDigits := strings.TrimPrefix(digits, "+") if len(justDigits) < 7 { return "", fmt.Errorf("too short for phone number: %q", raw) } - // Ensure + prefix - if !strings.HasPrefix(digits, "+") { - // Assume US country code if 10 digits - if len(justDigits) == 10 { - digits = "+1" + justDigits - } else if len(justDigits) == 11 && justDigits[0] == '1' { - digits = "+" + justDigits - } else { - digits = "+" + justDigits - } + var digits string + if leadingPlus { + digits = "+" + justDigits + } else if strings.HasPrefix(justDigits, "00") { + // International 00-prefix → replace with + + digits = "+" + justDigits[2:] + } else if len(justDigits) == 10 { + // Assume US country code + digits = "+1" + justDigits + } else if len(justDigits) == 11 && justDigits[0] == '1' { + digits = "+" + justDigits + } else { + digits = "+" + justDigits + } + + // E.164 max is 15 digits (country code + subscriber) + if len(digits)-1 > 15 { + return "", fmt.Errorf("too long for E.164 (max 15 digits): %q", raw) } return digits, nil diff --git a/internal/textimport/phone_test.go b/internal/textimport/phone_test.go index 85ca215c..9d30e7d9 100644 --- a/internal/textimport/phone_test.go +++ b/internal/textimport/phone_test.go @@ -28,6 +28,14 @@ func TestNormalizePhone(t *testing.T) { {"", "", true}, // System identifier {"status@broadcast", "", true}, + // International 00-prefix + {"0033624921221", "+33624921221", false}, + // Trunk 
prefix (0) + {"+44 (0)7700 900000", "+447700900000", false}, + // Embedded + (invalid) + {"1+5551234567", "", true}, + // Too long (>15 digits) + {"+1234567890123456", "", true}, } for _, tt := range tests { t.Run(tt.input, func(t *testing.T) { From 035a7b8dc6071100ad85ad2cb339053aa208f39e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 21:33:28 -0500 Subject: [PATCH 28/65] Fix deterministic preview ordering in RecomputeConversationStats Add `id DESC` as a tie-breaker in the last_message_preview subquery so messages with identical timestamps always resolve to the same row. Strengthen TestRecomputeConversationStats to assert participant_count (via EnsureParticipantByPhone + EnsureConversationParticipant) and last_message_preview (latest message snippet by sent_at). Co-Authored-By: Claude Sonnet 4.6 --- internal/store/messages.go | 2 +- internal/store/messages_test.go | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/internal/store/messages.go b/internal/store/messages.go index 0fa81d3e..0c89f3dc 100644 --- a/internal/store/messages.go +++ b/internal/store/messages.go @@ -893,7 +893,7 @@ func (s *Store) RecomputeConversationStats(sourceID int64) error { last_message_preview = ( SELECT snippet FROM messages WHERE conversation_id = conversations.id - ORDER BY COALESCE(sent_at, received_at, internal_date) DESC + ORDER BY COALESCE(sent_at, received_at, internal_date) DESC, id DESC LIMIT 1 ) WHERE source_id = ? diff --git a/internal/store/messages_test.go b/internal/store/messages_test.go index 647c8c08..4361f1c0 100644 --- a/internal/store/messages_test.go +++ b/internal/store/messages_test.go @@ -59,24 +59,43 @@ func TestRecomputeConversationStats(t *testing.T) { t.Fatalf("UpsertMessage msg2: %v", err) } + // Add a conversation participant so participant_count is non-zero. 
+ participantID, err := st.EnsureParticipantByPhone("+15559876543", "Bob", "whatsapp") + if err != nil { + t.Fatalf("EnsureParticipantByPhone: %v", err) + } + if err := st.EnsureConversationParticipant(convID, participantID, "member"); err != nil { + t.Fatalf("EnsureConversationParticipant: %v", err) + } + // Recompute and verify counts. if err := st.RecomputeConversationStats(source.ID); err != nil { t.Fatalf("RecomputeConversationStats: %v", err) } var count int + var participantCount int var lastMsgAt sql.NullTime + var preview sql.NullString if err := st.DB().QueryRow( - `SELECT message_count, last_message_at FROM conversations WHERE id = ?`, convID, - ).Scan(&count, &lastMsgAt); err != nil { + `SELECT message_count, participant_count, last_message_at, last_message_preview + FROM conversations WHERE id = ?`, convID, + ).Scan(&count, &participantCount, &lastMsgAt, &preview); err != nil { t.Fatalf("post-recompute scan: %v", err) } if count != 2 { t.Errorf("message_count = %d, want 2", count) } + if participantCount != 1 { + t.Errorf("participant_count = %d, want 1", participantCount) + } if !lastMsgAt.Valid { t.Error("last_message_at is NULL, want a timestamp") } + // msg2 has the later sent_at, so its snippet ("world") should be the preview. + if !preview.Valid || preview.String != "world" { + t.Errorf("last_message_preview = %q, want %q", preview.String, "world") + } // Idempotency: calling again should produce the same result. 
if err := st.RecomputeConversationStats(source.ID); err != nil { From 5c05d0fc2e700a33a564f8544a5afa22aabafdf2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 21:34:38 -0500 Subject: [PATCH 29/65] Remove specs --- .../2026-03-31-unified-text-message-import.md | 1554 ----------------- ...3-31-unified-text-message-import-design.md | 484 ----- 2 files changed, 2038 deletions(-) delete mode 100644 docs/superpowers/plans/2026-03-31-unified-text-message-import.md delete mode 100644 docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md diff --git a/docs/superpowers/plans/2026-03-31-unified-text-message-import.md b/docs/superpowers/plans/2026-03-31-unified-text-message-import.md deleted file mode 100644 index 719d0540..00000000 --- a/docs/superpowers/plans/2026-03-31-unified-text-message-import.md +++ /dev/null @@ -1,1554 +0,0 @@ -# Unified Text Message Import Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Merge WhatsApp, iMessage, and Google Voice import into a coherent system with shared phone-based participants, proper schema usage, and a dedicated TUI Texts mode. - -**Architecture:** Five sequential phases: (1) shared store/utility foundation, (2) importer refactoring to use store methods directly, (3) Parquet cache extension + TextEngine query interface, (4) TUI Texts mode, (5) CLI command renaming. Each phase builds on the previous. 
- -**Tech Stack:** Go, SQLite (mattn/go-sqlite3), DuckDB (go-duckdb), Bubble Tea TUI, Parquet/Arrow - -**Spec:** `docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md` - ---- - -## Phase 1: Foundation — Shared Utilities & Store Methods - -### Task 1: NormalizePhone Utility - -**Files:** -- Create: `internal/textimport/phone.go` -- Create: `internal/textimport/phone_test.go` - -- [ ] **Step 1: Write tests for NormalizePhone** - -```go -// internal/textimport/phone_test.go -package textimport - -import "testing" - -func TestNormalizePhone(t *testing.T) { - tests := []struct { - input string - want string - wantErr bool - }{ - // Valid E.164 - {"+15551234567", "+15551234567", false}, - // Strip formatting - {"+1 (555) 123-4567", "+15551234567", false}, - {"+1-555-123-4567", "+15551234567", false}, - {"1-555-123-4567", "+15551234567", false}, - // International - {"+447700900000", "+447700900000", false}, - {"+44 7700 900000", "+447700900000", false}, - // No country code — assume US - {"5551234567", "+15551234567", false}, - {"(555) 123-4567", "+15551234567", false}, - // Email — not a phone - {"alice@icloud.com", "", true}, - // Short code - {"12345", "", true}, - // Empty - {"", "", true}, - // System identifier - {"status@broadcast", "", true}, - } - for _, tt := range tests { - t.Run(tt.input, func(t *testing.T) { - got, err := NormalizePhone(tt.input) - if tt.wantErr { - if err == nil { - t.Errorf("NormalizePhone(%q) = %q, want error", tt.input, got) - } - return - } - if err != nil { - t.Errorf("NormalizePhone(%q) error: %v", tt.input, err) - return - } - if got != tt.want { - t.Errorf("NormalizePhone(%q) = %q, want %q", tt.input, got, tt.want) - } - }) - } -} -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `go test ./internal/textimport/ -run TestNormalizePhone -v` -Expected: FAIL — package does not exist - -- [ ] **Step 3: Implement NormalizePhone** - -```go -// internal/textimport/phone.go -package textimport - -import ( - 
"fmt" - "strings" - "unicode" -) - -// NormalizePhone normalizes a phone number to E.164 format. -// Returns an error for inputs that are not phone numbers (emails, -// short codes, system identifiers). -func NormalizePhone(raw string) (string, error) { - if raw == "" { - return "", fmt.Errorf("empty input") - } - // Reject email addresses - if strings.Contains(raw, "@") { - return "", fmt.Errorf("not a phone number: %q", raw) - } - - // Strip all non-digit and non-plus characters - var b strings.Builder - for _, r := range raw { - if r == '+' || unicode.IsDigit(r) { - b.WriteRune(r) - } - } - digits := b.String() - - // Must start with + or be all digits - if digits == "" { - return "", fmt.Errorf("no digits in input: %q", raw) - } - - // Strip leading + for length check - justDigits := strings.TrimPrefix(digits, "+") - if len(justDigits) < 7 { - return "", fmt.Errorf("too short for phone number: %q", raw) - } - - // Ensure + prefix - if !strings.HasPrefix(digits, "+") { - // Assume US country code if 10 digits - if len(justDigits) == 10 { - digits = "+1" + justDigits - } else if len(justDigits) == 11 && justDigits[0] == '1' { - digits = "+" + justDigits - } else { - digits = "+" + justDigits - } - } - - return digits, nil -} -``` - -- [ ] **Step 4: Run tests** - -Run: `go test ./internal/textimport/ -run TestNormalizePhone -v` -Expected: PASS - -- [ ] **Step 5: Run fmt/vet, commit** - -```bash -go fmt ./internal/textimport/... -go vet ./internal/textimport/... -git add internal/textimport/ -git commit -m "Add shared NormalizePhone utility for text importers" -``` - -### Task 2: Generalize EnsureParticipantByPhone - -**Files:** -- Modify: `internal/store/messages.go:910-960` (EnsureParticipantByPhone) -- Modify: `internal/whatsapp/importer.go` (callers) -- Create: `internal/store/messages_test.go` (test for new signature) - -The current `EnsureParticipantByPhone` hardcodes `identifier_type = 'whatsapp'` in its `participant_identifiers` INSERT. 
Generalize to accept `identifierType` as a parameter. - -- [ ] **Step 1: Write test for generalized EnsureParticipantByPhone** - -```go -// Add to internal/store/messages_test.go (create if needed) -func TestEnsureParticipantByPhone_IdentifierType(t *testing.T) { - s := setupTestStore(t) - defer func() { _ = s.Close() }() - - // Create participant via WhatsApp - id1, err := s.EnsureParticipantByPhone("+15551234567", "Alice", "whatsapp") - if err != nil { - t.Fatal(err) - } - - // Same phone via iMessage — should return same participant - id2, err := s.EnsureParticipantByPhone("+15551234567", "Alice", "imessage") - if err != nil { - t.Fatal(err) - } - if id1 != id2 { - t.Errorf("same phone different source got different IDs: %d vs %d", id1, id2) - } - - // Check both identifiers exist - var count int - err = s.DB().QueryRow( - "SELECT COUNT(*) FROM participant_identifiers WHERE participant_id = ?", id1, - ).Scan(&count) - if err != nil { - t.Fatal(err) - } - if count != 2 { - t.Errorf("expected 2 identifier rows, got %d", count) - } -} -``` - -This test needs a `setupTestStore` helper — use an in-memory SQLite DB with `InitSchema()`. 
Check if one already exists in the test file; if not, add: - -```go -func setupTestStore(t *testing.T) *Store { - t.Helper() - s, err := Open(":memory:") - if err != nil { - t.Fatal(err) - } - if err := s.InitSchema(); err != nil { - t.Fatal(err) - } - return s -} -``` - -- [ ] **Step 2: Run test, verify failure** - -Run: `go test -tags fts5 ./internal/store/ -run TestEnsureParticipantByPhone_IdentifierType -v` -Expected: FAIL — wrong number of arguments - -- [ ] **Step 3: Update EnsureParticipantByPhone signature** - -In `internal/store/messages.go:910`, change: - -```go -func (s *Store) EnsureParticipantByPhone(phone, displayName string) (int64, error) { -``` -to: -```go -func (s *Store) EnsureParticipantByPhone(phone, displayName, identifierType string) (int64, error) { -``` - -Find the hardcoded `'whatsapp'` in the INSERT into `participant_identifiers` (around line 945) and replace with the `identifierType` parameter. - -- [ ] **Step 4: Update all callers in whatsapp package** - -In `internal/whatsapp/importer.go`, find every call to `EnsureParticipantByPhone(phone, name)` and add `"whatsapp"` as the third argument. Use `grep -rn "EnsureParticipantByPhone"` to find all call sites. 
- -- [ ] **Step 5: Run tests** - -Run: `go test -tags fts5 ./internal/store/ -run TestEnsureParticipantByPhone -v && go test ./internal/whatsapp/ -v` -Expected: PASS - -- [ ] **Step 6: Commit** - -```bash -git add internal/store/messages.go internal/store/messages_test.go internal/whatsapp/ -git commit -m "Generalize EnsureParticipantByPhone to accept identifierType" -``` - -### Task 3: RecomputeConversationStats Store Method - -**Files:** -- Modify: `internal/store/messages.go` (add method) -- Modify: `internal/whatsapp/importer.go:498-514` (replace inline SQL) - -- [ ] **Step 1: Write test** - -```go -func TestRecomputeConversationStats(t *testing.T) { - s := setupTestStore(t) - defer func() { _ = s.Close() }() - - // Create a source - sourceID, err := s.GetOrCreateSource("test_source", "whatsapp", "") - if err != nil { - t.Fatal(err) - } - - // Create a conversation - convID, err := s.EnsureConversationWithType(sourceID, "conv1", "direct_chat", "Test Chat") - if err != nil { - t.Fatal(err) - } - - // Insert two messages - for i, snippet := range []string{"hello", "world"} { - _, err := s.UpsertMessage(&Message{ - SourceID: sourceID, - SourceMessageID: fmt.Sprintf("msg%d", i), - ConversationID: convID, - Snippet: snippet, - SentAt: sql.NullTime{Time: time.Now().Add(time.Duration(i) * time.Hour), Valid: true}, - MessageType: "whatsapp", - }) - if err != nil { - t.Fatal(err) - } - } - - // Stats should be zero before recompute - var msgCount int64 - _ = s.DB().QueryRow("SELECT message_count FROM conversations WHERE id = ?", convID).Scan(&msgCount) - if msgCount != 0 { - t.Errorf("before recompute: message_count = %d, want 0", msgCount) - } - - // Recompute - if err := s.RecomputeConversationStats(sourceID); err != nil { - t.Fatal(err) - } - - // Verify - _ = s.DB().QueryRow("SELECT message_count FROM conversations WHERE id = ?", convID).Scan(&msgCount) - if msgCount != 2 { - t.Errorf("after recompute: message_count = %d, want 2", msgCount) - } - - // Running again 
should be idempotent - if err := s.RecomputeConversationStats(sourceID); err != nil { - t.Fatal(err) - } - _ = s.DB().QueryRow("SELECT message_count FROM conversations WHERE id = ?", convID).Scan(&msgCount) - if msgCount != 2 { - t.Errorf("after second recompute: message_count = %d, want 2", msgCount) - } -} -``` - -- [ ] **Step 2: Run test, verify failure** - -Run: `go test -tags fts5 ./internal/store/ -run TestRecomputeConversationStats -v` -Expected: FAIL — method not found - -- [ ] **Step 3: Implement RecomputeConversationStats** - -Add to `internal/store/messages.go`: - -```go -// RecomputeConversationStats recomputes denormalized stats -// (message_count, participant_count, last_message_at, -// last_message_preview) for all conversations belonging to sourceID. -// This is idempotent — safe to call after any import or re-import. -func (s *Store) RecomputeConversationStats(sourceID int64) error { - _, err := s.db.Exec(` - UPDATE conversations SET - message_count = ( - SELECT COUNT(*) FROM messages - WHERE conversation_id = conversations.id - ), - participant_count = ( - SELECT COUNT(*) FROM conversation_participants - WHERE conversation_id = conversations.id - ), - last_message_at = ( - SELECT MAX(COALESCE(sent_at, received_at, internal_date)) - FROM messages - WHERE conversation_id = conversations.id - ), - last_message_preview = ( - SELECT snippet FROM messages - WHERE conversation_id = conversations.id - ORDER BY COALESCE(sent_at, received_at, internal_date) DESC - LIMIT 1 - ) - WHERE source_id = ? 
- `, sourceID) - if err != nil { - return fmt.Errorf("recompute conversation stats: %w", err) - } - return nil -} -``` - -- [ ] **Step 4: Run tests** - -Run: `go test -tags fts5 ./internal/store/ -run TestRecomputeConversationStats -v` -Expected: PASS - -- [ ] **Step 5: Replace WhatsApp inline SQL with shared method** - -In `internal/whatsapp/importer.go:498-514`, replace the inline `UPDATE conversations SET ...` with: - -```go -if err := imp.store.RecomputeConversationStats(source.ID); err != nil { - imp.log("Warning: failed to recompute conversation stats: %v", err) -} -``` - -- [ ] **Step 6: Run WhatsApp tests, commit** - -```bash -go test ./internal/whatsapp/ -v -go fmt ./... -git add internal/store/messages.go internal/store/messages_test.go internal/whatsapp/importer.go -git commit -m "Add shared RecomputeConversationStats store method" -``` - -### Task 4: Add LinkMessageLabel Store Method - -**Files:** -- Modify: `internal/store/messages.go` (add method) - -The spec calls for `LinkMessageLabel(messageID, labelID)`. The store has `AddMessageLabels(messageID int64, labelIDs []int64)` at line 570 which does `INSERT OR IGNORE` for a slice. Add a convenience single-label wrapper. - -- [ ] **Step 1: Add LinkMessageLabel** - -```go -// LinkMessageLabel links a single label to a message. -// Uses INSERT OR IGNORE — safe to call multiple times. -func (s *Store) LinkMessageLabel(messageID, labelID int64) error { - return s.AddMessageLabels(messageID, []int64{labelID}) -} -``` - -- [ ] **Step 2: Run fmt/vet, commit** - -```bash -go fmt ./internal/store/... -go vet ./internal/store/... 
-git add internal/store/messages.go -git commit -m "Add LinkMessageLabel convenience method" -``` - ---- - -## Phase 2: Importer Refactoring - -### Task 5: Refactor iMessage Importer - -**Files:** -- Rewrite: `internal/imessage/client.go` — drop gmail.API, use store methods -- Modify: `internal/imessage/parser.go` — use shared NormalizePhone -- Modify: `internal/imessage/models.go` — update types if needed -- Rewrite: `cmd/msgvault/cmd/sync_imessage.go` → `cmd/msgvault/cmd/import_imessage.go` -- Update: `internal/imessage/parser_test.go` - -This is the largest refactoring task. The key changes: - -1. `Client` no longer implements `gmail.API` -2. `Client` takes a `*store.Store` and writes directly -3. New `Import(ctx, store, opts)` method replaces the sync pipeline -4. `normalizeIdentifier` uses shared `textimport.NormalizePhone` with fallback to email path -5. No more synthetic MIME — body goes to `message_bodies`, raw to `message_raw` - -- [ ] **Step 1: Update parser.go to use shared NormalizePhone** - -Replace the `normalizeIdentifier` function in `internal/imessage/parser.go` to use `textimport.NormalizePhone`: - -```go -import "github.com/wesm/msgvault/internal/textimport" - -// resolveHandle categorizes an iMessage handle as phone or email. -// Returns (phone, email, displayName). Exactly one of phone/email -// will be non-empty. -func resolveHandle(handleID string) (phone, email, displayName string) { - if handleID == "" { - return "", "", "" - } - // Try phone normalization first - normalized, err := textimport.NormalizePhone(handleID) - if err == nil { - return normalized, "", normalized - } - // Fall back to email - if strings.Contains(handleID, "@") { - return "", strings.ToLower(handleID), "" - } - // Neither — raw handle - return "", "", handleID -} -``` - -Remove the old `normalizeIdentifier`, `normalizePhone`, `buildMIME`, `formatMIMEAddress` functions — they're no longer needed. 
- -- [ ] **Step 2: Rewrite Client to use store methods directly** - -Replace the `gmail.API` implementation in `internal/imessage/client.go`. The new `Client` struct holds a `*sql.DB` (read-only handle to chat.db) and exposes an `Import` method: - -```go -type Client struct { - db *sql.DB - myHandle string // owner's phone or email - afterDate *time.Time - beforeDate *time.Time - limit int - useNanoseconds bool - logger *slog.Logger -} - -// Import reads iMessage history from chat.db and writes to the -// msgvault store. Returns a summary of what was imported. -func (c *Client) Import(ctx context.Context, s *store.Store, opts ImportOptions) (*ImportSummary, error) { - // 1. GetOrCreateSource with source_type="apple_messages" - // 2. Ensure labels ("iMessage", "SMS") - // 3. Query chat.db for conversations (chats) - // 4. For each chat: - // a. EnsureConversationWithType (group vs direct) - // b. Resolve participants via resolveHandle → EnsureParticipantByPhone or EnsureParticipant - // c. EnsureConversationParticipant for each - // d. Query messages for this chat - // e. For each message: UpsertMessage with message_type, sender_id - // f. UpsertMessageBody with body text - // g. LinkMessageLabel - // 5. RecomputeConversationStats - // 6. Return summary -} -``` - -Remove the `gmail.API` interface assertion and all gmail.API methods (`GetProfile`, `ListLabels`, `ListMessages`, `GetMessageRaw`, `GetMessagesRawBatch`, `ListHistory`, `TrashMessage`, `DeleteMessage`, `BatchDeleteMessages`). - -Keep the `chat.db` reading logic (SQL queries, timestamp handling, `detectTimestampFormat`). The SQL queries that read from chat.db stay the same — only the output path changes. - -- [ ] **Step 3: Rewrite CLI command** - -Move `cmd/msgvault/cmd/sync_imessage.go` → `cmd/msgvault/cmd/import_imessage.go`. Replace the `sync-imessage` cobra command with `import-imessage`. Remove all `sync.Syncer` usage — call `client.Import(ctx, store, opts)` directly. 
- -Register the new command in `root.go`. - -- [ ] **Step 4: Update tests** - -Update `internal/imessage/parser_test.go`: -- Replace tests for `normalizeIdentifier` with tests for `resolveHandle` -- Remove tests for `buildMIME` / `formatMIMEAddress` -- Add tests for phone/email/raw-handle resolution - -- [ ] **Step 5: Run all tests** - -```bash -go test ./internal/imessage/ -v -go test ./internal/store/ -v -go vet ./... -``` - -- [ ] **Step 6: Commit** - -```bash -git add internal/imessage/ cmd/msgvault/cmd/ -git commit -m "Refactor iMessage to use store methods directly - -Drop gmail.API adapter and synthetic MIME. iMessage now writes to -the store using EnsureParticipantByPhone, EnsureConversationWithType, -and proper message_type/sender_id/conversation_type fields." -``` - -### Task 6: Refactor Google Voice Importer - -**Files:** -- Rewrite: `internal/gvoice/client.go` — drop gmail.API, use store methods -- Modify: `internal/gvoice/parser.go` — use shared NormalizePhone -- Modify: `internal/gvoice/models.go` — add message_type mapping -- Rewrite: `cmd/msgvault/cmd/sync_gvoice.go` → `cmd/msgvault/cmd/import_gvoice.go` -- Update: `internal/gvoice/parser_test.go` - -Same pattern as Task 5. Key differences: - -1. GVoice reads from a Takeout directory (HTML files), not a database -2. Three message_type values: `google_voice_text`, `google_voice_call`, `google_voice_voicemail` -3. All participants are phone-based (no email fallback needed) -4. `normalizeIdentifier` in parser.go replaced with `textimport.NormalizePhone` - -- [ ] **Step 1: Update parser.go to use shared NormalizePhone** - -Replace `normalizeIdentifier` and `normalizePhone` in `internal/gvoice/parser.go` with calls to `textimport.NormalizePhone`. Remove `buildMIME` and `formatMIMEAddress`. - -- [ ] **Step 2: Add message_type mapping to models.go** - -```go -// MessageTypeForFileType returns the messages.message_type value -// for a Google Voice file type. 
-func MessageTypeForFileType(ft fileType) string { - switch ft { - case fileTypeText, fileTypeGroup: - return "google_voice_text" - case fileTypeReceived, fileTypePlaced, fileTypeMissed: - return "google_voice_call" - case fileTypeVoicemail: - return "google_voice_voicemail" - default: - return "google_voice_text" - } -} -``` - -- [ ] **Step 3: Rewrite Client to use store methods directly** - -Same approach as iMessage — new `Import` method, remove all gmail.API methods and interface assertion. The HTML parsing stays the same; the output path changes to store methods. - -For each indexed entry: -1. Resolve phone via `textimport.NormalizePhone` -2. `EnsureParticipantByPhone(phone, name, "google_voice")` -3. `EnsureConversationWithType` with thread ID -4. `UpsertMessage` with `message_type = MessageTypeForFileType(entry.FileType)` -5. `UpsertMessageBody` with body text -6. `UpsertMessageRawWithFormat` with raw HTML as `gvoice_html` -7. `EnsureLabel` + `LinkMessageLabel` for each label -8. After all entries: `RecomputeConversationStats` - -- [ ] **Step 4: Rewrite CLI command** - -Move `sync_gvoice.go` → `import_gvoice.go`. Replace `sync-gvoice` with `import-gvoice`. Remove `sync.Syncer` usage. - -- [ ] **Step 5: Update tests, run all** - -Update parser_test.go to test `textimport.NormalizePhone` integration. Remove MIME-related test assertions. - -```bash -go test ./internal/gvoice/ -v -go vet ./... -``` - -- [ ] **Step 6: Commit** - -```bash -git add internal/gvoice/ cmd/msgvault/cmd/ -git commit -m "Refactor Google Voice to use store methods directly - -Drop gmail.API adapter and synthetic MIME. Google Voice now writes -to the store with proper message_type (google_voice_text/call/ -voicemail), phone-based participants, and labels." 
-``` - -### Task 7: WhatsApp Cleanup - -**Files:** -- Modify: `internal/whatsapp/importer.go` — use shared NormalizePhone -- Modify: `internal/whatsapp/contacts.go` — use shared NormalizePhone - -- [ ] **Step 1: Replace internal normalizePhone with shared utility** - -Find all calls to the internal `normalizePhone` in the whatsapp package and replace with `textimport.NormalizePhone`. The internal function is in `internal/whatsapp/mapping.go` or `contacts.go`. Since the shared version returns an error, callers need to handle it (skip participants that don't normalize). - -- [ ] **Step 2: Update EnsureParticipantByPhone calls** - -All calls in the whatsapp package already pass `"whatsapp"` after Task 2. Verify. - -- [ ] **Step 3: Run tests, commit** - -```bash -go test ./internal/whatsapp/ -v -go fmt ./... -git add internal/whatsapp/ -git commit -m "WhatsApp: use shared NormalizePhone and RecomputeConversationStats" -``` - -### Task 8: Rename CLI Commands and Register - -**Files:** -- Rename: `cmd/msgvault/cmd/import.go` → verify naming -- Modify: `cmd/msgvault/cmd/root.go` — register new commands, remove old - -- [ ] **Step 1: Ensure all three import commands are registered** - -The WhatsApp import command is currently `import --type whatsapp` (in `cmd/msgvault/cmd/import.go`). Rename to `import-whatsapp`. The iMessage and GVoice commands were already renamed in Tasks 5-6. - -Update `root.go` to register: `importWhatsappCmd`, `importImessageCmd`, `importGvoiceCmd`. Remove any old `syncImessageCmd`, `syncGvoiceCmd` references. 
- -- [ ] **Step 2: Verify all commands work** - -```bash -go build -tags fts5 -o msgvault ./cmd/msgvault -./msgvault import-whatsapp --help -./msgvault import-imessage --help -./msgvault import-gvoice --help -``` - -- [ ] **Step 3: Commit** - -```bash -git add cmd/msgvault/ -git commit -m "Rename import CLI commands for consistency - -import-whatsapp, import-imessage, import-gvoice" -``` - ---- - -## Phase 3: Parquet Cache & TextEngine - -### Task 9: Extend Parquet Cache for Text Messages - -**Files:** -- Modify: `cmd/msgvault/cmd/build_cache.go` — add columns to export queries -- Modify: `internal/query/duckdb.go` — probe new columns - -The existing `build_cache.go` exports `messages`, `participants`, `conversations`, etc. to Parquet. We need to ensure the export includes the columns required for Texts mode queries. - -- [ ] **Step 1: Add conversation_type to conversations export** - -In `build_cache.go`, find the conversations export query (around line 460) and add `conversation_type` to the SELECT. The schema already has this column. - -- [ ] **Step 2: Add message_type and sender_id to messages export** - -The messages export (around line 300) already includes `message_type` and `sender_id` (added by the WhatsApp PR). Verify they're present. If not, add them. - -- [ ] **Step 3: Bump cache schema version** - -Change `cacheSchemaVersion` from 4 to 5. This forces a full rebuild when users upgrade, ensuring new columns are present. - -- [ ] **Step 4: Update DuckDB column probing** - -In `internal/query/duckdb.go`, the `probeParquetColumns` method checks for optional columns. Ensure `conversation_type` is probed for the conversations table. - -- [ ] **Step 5: Add email-only filter to existing Engine queries** - -In `DuckDBEngine.Aggregate`, `DuckDBEngine.ListMessages`, etc., add a `WHERE message_type = 'email' OR message_type IS NULL` filter so email-mode queries exclude text messages. The `IS NULL` handles old data without the column. 
- -This is a targeted change in `buildFilterConditions` (line 803) — add it as a default condition when no explicit message_type filter is set. - -- [ ] **Step 6: Run tests, commit** - -```bash -go test -tags fts5 ./internal/query/ -v -go test -tags fts5 ./cmd/msgvault/cmd/ -v -git add cmd/msgvault/cmd/build_cache.go internal/query/duckdb.go -git commit -m "Extend Parquet cache with text message columns - -Add conversation_type to exports, bump cache schema to v5, -filter email queries to exclude text messages." -``` - -### Task 10: TextEngine Interface and Types - -**Files:** -- Create: `internal/query/text_engine.go` -- Create: `internal/query/text_models.go` - -- [ ] **Step 1: Define TextEngine types** - -```go -// internal/query/text_models.go -package query - -import "time" - -// TextViewType represents the type of view in Texts mode. -type TextViewType int - -const ( - TextViewConversations TextViewType = iota - TextViewContacts - TextViewContactNames - TextViewSources - TextViewLabels - TextViewTime - TextViewTypeCount -) - -func (v TextViewType) String() string { - switch v { - case TextViewConversations: - return "Conversations" - case TextViewContacts: - return "Contacts" - case TextViewContactNames: - return "Contact Names" - case TextViewSources: - return "Sources" - case TextViewLabels: - return "Labels" - case TextViewTime: - return "Time" - default: - return "Unknown" - } -} - -// ConversationRow represents a conversation in the Conversations view. -type ConversationRow struct { - ConversationID int64 - Title string - SourceType string - MessageCount int64 - ParticipantCount int64 - LastMessageAt time.Time - LastPreview string -} - -// TextFilter specifies which text messages to retrieve. 
-type TextFilter struct { - SourceID *int64 - ConversationID *int64 - ContactPhone string - ContactName string - SourceType string - Label string - TimeRange TimeRange - After *time.Time - Before *time.Time - Pagination Pagination - SortField SortField - SortDirection SortDirection -} - -// TextAggregateOptions configures a text aggregate query. -type TextAggregateOptions struct { - SourceID *int64 - After *time.Time - Before *time.Time - SortField SortField - SortDirection SortDirection - Limit int - TimeGranularity TimeGranularity - SearchQuery string -} - -// TextStatsOptions configures a text stats query. -type TextStatsOptions struct { - SourceID *int64 - SearchQuery string -} - -// TextMessageTypes lists the message_type values included in Texts mode. -var TextMessageTypes = []string{ - "whatsapp", "imessage", "sms", "google_voice_text", -} - -// IsTextMessageType returns true if the given type is a text message type. -func IsTextMessageType(mt string) bool { - for _, t := range TextMessageTypes { - if t == mt { - return true - } - } - return false -} -``` - -- [ ] **Step 2: Define TextEngine interface** - -```go -// internal/query/text_engine.go -package query - -import "context" - -// TextEngine provides query operations for text message data. -// This is a separate interface from Engine to avoid rippling text -// query methods through remote/API/MCP/mock layers. -// DuckDBEngine and SQLiteEngine implement both Engine and TextEngine. -type TextEngine interface { - // ListConversations returns conversations matching the filter. - ListConversations(ctx context.Context, - filter TextFilter) ([]ConversationRow, error) - - // TextAggregate aggregates text messages by the given view type. - TextAggregate(ctx context.Context, viewType TextViewType, - opts TextAggregateOptions) ([]AggregateRow, error) - - // ListConversationMessages returns messages within a conversation. 
- ListConversationMessages(ctx context.Context, convID int64, - filter TextFilter) ([]MessageSummary, error) - - // TextSearch performs plain full-text search over text messages. - TextSearch(ctx context.Context, query string, - limit, offset int) ([]MessageSummary, error) - - // GetTextStats returns aggregate stats for text messages. - GetTextStats(ctx context.Context, - opts TextStatsOptions) (*TotalStats, error) -} -``` - -- [ ] **Step 3: Run fmt/vet, commit** - -```bash -go fmt ./internal/query/... -go vet ./internal/query/... -git add internal/query/text_engine.go internal/query/text_models.go -git commit -m "Add TextEngine interface and text query types" -``` - -### Task 11: DuckDB TextEngine Implementation - -**Files:** -- Create: `internal/query/duckdb_text.go` -- Create: `internal/query/duckdb_text_test.go` - -Implement `TextEngine` methods on `DuckDBEngine`. These query the same Parquet files as email queries but filter to text message types and use different grouping columns. - -- [ ] **Step 1: Implement ListConversations** - -```go -// internal/query/duckdb_text.go -package query - -// ... imports ... - -// textTypeFilter returns a SQL IN clause for text message types. -func textTypeFilter() string { - return "message_type IN ('whatsapp','imessage','sms','google_voice_text')" -} - -func (e *DuckDBEngine) ListConversations(ctx context.Context, - filter TextFilter) ([]ConversationRow, error) { - // Query conversations table joined with message stats - // from the Parquet messages, filtered to text message types. - // Uses denormalized stats from conversations table (via SQLite - // scanner or conversations Parquet). - // Sort by last_message_at DESC by default. - // Apply filter: SourceID, After/Before, Pagination. - // ... -} -``` - -The implementation queries the `conversations` Parquet table joined with `sources` to get `source_type`, filtered to text source types (`'whatsapp'`, `'apple_messages'`, `'google_voice'`). 
- -- [ ] **Step 2: Implement TextAggregate** - -Aggregation by view type: -- `TextViewContacts`: GROUP BY `phone_number`, `display_name` -- `TextViewContactNames`: GROUP BY `display_name` -- `TextViewSources`: GROUP BY `source_type` -- `TextViewLabels`: GROUP BY label name (JOIN message_labels + labels) -- `TextViewTime`: GROUP BY time period - -All queries include `WHERE textTypeFilter()`. - -- [ ] **Step 3: Implement ListConversationMessages** - -Query messages from Parquet where `conversation_id = convID` and `textTypeFilter()`, ordered by `sent_at ASC` (chronological for chat timeline). - -- [ ] **Step 4: Implement TextSearch** - -Plain FTS query against `messages_fts` via the SQLite scanner, filtered to text message types. No Gmail-style operator parsing — pass the query string directly to FTS5 MATCH. - -- [ ] **Step 5: Implement GetTextStats** - -Aggregate stats (message count, total size, etc.) filtered to text message types. - -- [ ] **Step 6: Add interface assertion** - -```go -var _ TextEngine = (*DuckDBEngine)(nil) -``` - -- [ ] **Step 7: Write tests** - -Create `internal/query/duckdb_text_test.go` with test fixtures that include text message data. Test `ListConversations`, `TextAggregate` for each view type, `ListConversationMessages`, and `GetTextStats`. - -Use the existing test fixture pattern from `internal/query/testfixtures_test.go` — extend it to include text message data with proper `message_type`, `sender_id`, and `conversation_type` values. - -- [ ] **Step 8: Run tests, commit** - -```bash -go test -tags fts5 ./internal/query/ -run TestText -v -git add internal/query/duckdb_text.go internal/query/duckdb_text_test.go -git commit -m "Implement TextEngine on DuckDBEngine - -ListConversations, TextAggregate, ListConversationMessages, -TextSearch, GetTextStats — all querying Parquet with text -message type filters." 
-``` - -### Task 12: SQLite TextEngine Fallback - -**Files:** -- Create: `internal/query/sqlite_text.go` - -Implement `TextEngine` on `SQLiteEngine` as a fallback for when Parquet cache is not built. Same logic as DuckDB but querying SQLite directly. - -- [ ] **Step 1: Implement all five TextEngine methods** - -Same patterns as DuckDB but using SQLite SQL. Key difference: joins go to real tables instead of Parquet files. - -- [ ] **Step 2: Add interface assertion** - -```go -var _ TextEngine = (*SQLiteEngine)(nil) -``` - -- [ ] **Step 3: Run tests, commit** - -```bash -go test -tags fts5 ./internal/query/ -v -git add internal/query/sqlite_text.go -git commit -m "Implement TextEngine on SQLiteEngine as fallback" -``` - -### Task 13: Update FTS Backfill for Text Messages - -**Files:** -- Modify: `internal/store/messages.go` (FTS backfill query) - -The current FTS backfill populates `from_addr` from `message_recipients` where `recipient_type = 'from'`. Text messages use `sender_id` instead. Update the backfill to handle both paths. - -- [ ] **Step 1: Find the FTS backfill query** - -In `internal/store/messages.go`, find the `BackfillFTS` or similar method that populates `messages_fts`. Look for the INSERT INTO `messages_fts` query. - -- [ ] **Step 2: Update the from_addr population** - -Change the `from_addr` subquery to use COALESCE: - -```sql -COALESCE( - (SELECT COALESCE(p.phone_number, p.email_address) - FROM participants p WHERE p.id = m.sender_id), - (SELECT p.email_address FROM message_recipients mr - JOIN participants p ON p.id = mr.participant_id - WHERE mr.message_id = m.id AND mr.recipient_type = 'from' - LIMIT 1) -) as from_addr -``` - -This checks `sender_id` first (for text messages), falls back to `message_recipients` (for email). 
- -- [ ] **Step 3: Run FTS tests, commit** - -```bash -go test -tags fts5 ./internal/store/ -v -git add internal/store/messages.go -git commit -m "Update FTS backfill to handle phone-based text senders" -``` - ---- - -## Phase 4: TUI Texts Mode - -### Task 14: TUI Model State for Texts Mode - -**Files:** -- Modify: `internal/tui/model.go` — add text mode state -- Create: `internal/tui/text_state.go` — text-specific state types - -- [ ] **Step 1: Add text mode types and state** - -```go -// internal/tui/text_state.go -package tui - -import "github.com/wesm/msgvault/internal/query" - -// tuiMode distinguishes Email mode from Texts mode. -type tuiMode int - -const ( - modeEmail tuiMode = iota - modeTexts -) - -// textViewLevel tracks navigation depth in Texts mode. -type textViewLevel int - -const ( - textLevelConversations textViewLevel = iota - textLevelAggregate - textLevelDrillConversations // conversations filtered by aggregate key - textLevelTimeline // messages within a conversation -) - -// textState holds all state specific to Texts mode. 
-type textState struct { - viewType query.TextViewType - level textViewLevel - conversations []query.ConversationRow - aggregateRows []query.AggregateRow - messages []query.MessageSummary - cursor int - scrollOffset int - selectedConvID int64 - - // Filter state - filter query.TextFilter - - // Stats - stats *query.TotalStats - - // Breadcrumbs for back navigation - breadcrumbs []textNavSnapshot -} - -type textNavSnapshot struct { - level textViewLevel - viewType query.TextViewType - cursor int - scrollOffset int - filter query.TextFilter - selectedConvID int64 -} -``` - -- [ ] **Step 2: Add mode and textState to Model** - -In `internal/tui/model.go`, add to the `Model` struct: - -```go -mode tuiMode -textEngine query.TextEngine // nil if not available -textState textState -``` - -In the `New` constructor, check if the engine implements `TextEngine`: - -```go -if te, ok := engine.(query.TextEngine); ok { - m.textEngine = te -} -``` - -- [ ] **Step 3: Commit** - -```bash -git add internal/tui/text_state.go internal/tui/model.go -git commit -m "Add Texts mode state types to TUI model" -``` - -### Task 15: Mode Switching (m key) - -**Files:** -- Modify: `internal/tui/keys.go` — add `m` key handler -- Modify: `internal/tui/model.go` — route Update based on mode - -- [ ] **Step 1: Add m key to handleGlobalKeys** - -In `internal/tui/keys.go`, in `handleGlobalKeys` (around line 86), add: - -```go -case "m": - if m.textEngine == nil { - return m, nil, true // no text engine, ignore - } - if m.mode == modeEmail { - m.mode = modeTexts - // Load text conversations - return m, m.loadTextConversations(), true - } - m.mode = modeEmail - return m, m.loadData(), true -``` - -- [ ] **Step 2: Route key handling by mode in Update** - -In `model.go`'s `Update` method, after global key handling, branch on `m.mode`: - -```go -if m.mode == modeTexts { - return m.handleTextKeyPress(msg) -} -// ... 
existing email key handling -``` - -- [ ] **Step 3: Commit** - -```bash -git add internal/tui/keys.go internal/tui/model.go -git commit -m "Add mode switching between Email and Texts (m key)" -``` - -### Task 16: Texts Mode Key Handling - -**Files:** -- Create: `internal/tui/text_keys.go` - -- [ ] **Step 1: Implement text mode key dispatch** - -```go -// internal/tui/text_keys.go -package tui - -import tea "github.com/charmbracelet/bubbletea" - -func (m Model) handleTextKeyPress(msg tea.KeyMsg) (tea.Model, tea.Cmd) { - key := msg.String() - - // Disabled keys in Texts mode - switch key { - case " ", "S", "d", "D", "x": - return m, nil // read-only mode - } - - switch m.textState.level { - case textLevelConversations, textLevelAggregate, - textLevelDrillConversations: - return m.handleTextListKeys(msg) - case textLevelTimeline: - return m.handleTextTimelineKeys(msg) - } - return m, nil -} - -func (m Model) handleTextListKeys(msg tea.KeyMsg) (tea.Model, tea.Cmd) { - key := msg.String() - switch key { - case "tab", "Tab": - m.cycleTextViewType(true) - return m, m.loadTextData() - case "shift+tab": - m.cycleTextViewType(false) - return m, m.loadTextData() - case "enter": - return m.textDrillDown() - case "esc", "backspace": - return m.textGoBack() - case "j", "down": - m.textState.cursor++ - m.clampTextCursor() - return m, nil - case "k", "up": - m.textState.cursor-- - m.clampTextCursor() - return m, nil - case "s": - m.cycleTextSortField() - return m, m.loadTextData() - case "r": - m.toggleTextSortDirection() - return m, m.loadTextData() - case "t": - m.textState.viewType = query.TextViewTime - m.textState.level = textLevelAggregate - return m, m.loadTextData() - case "a": - // Reset to conversations - m.textState = textState{viewType: query.TextViewConversations} - return m, m.loadTextConversations() - case "A": - m.openAccountSelector() - return m, nil - } - return m, nil -} -``` - -- [ ] **Step 2: Implement helper methods** - -Add `cycleTextViewType`, 
`clampTextCursor`, `textDrillDown`, `textGoBack`, `loadTextData`, `loadTextConversations` methods. These follow the same patterns as the email equivalents but operate on `textState`. - -- [ ] **Step 3: Commit** - -```bash -git add internal/tui/text_keys.go -git commit -m "Add Texts mode key handling" -``` - -### Task 17: Texts Mode Views - -**Files:** -- Create: `internal/tui/text_view.go` - -- [ ] **Step 1: Implement text conversations view** - -```go -// internal/tui/text_view.go -package tui - -// textConversationsView renders the Conversations list. -func (m Model) textConversationsView() string { - // Header: Name | Source | Messages | Participants | Last Message - // Rows from m.textState.conversations - // Same styling patterns as aggregateTableView -} -``` - -- [ ] **Step 2: Implement text aggregate view** - -```go -// textAggregateView renders aggregate views (Contacts, Sources, etc.) -func (m Model) textAggregateView() string { - // Same shape as email aggregate view - // Rows from m.textState.aggregateRows -} -``` - -- [ ] **Step 3: Implement text timeline view** - -```go -// textTimelineView renders a conversation's message timeline. -func (m Model) textTimelineView() string { - // Compact chat style: timestamp | sender | body snippet - // Rows from m.textState.messages - // Chronological order (oldest first) -} -``` - -- [ ] **Step 4: Wire into renderView** - -In the main `renderView()` switch (internal/tui/view.go), add a mode check: - -```go -if m.mode == modeTexts { - return m.renderTextView() -} -``` - -Implement `renderTextView()` in text_view.go, switching on `m.textState.level`. - -- [ ] **Step 5: Update footer for Texts mode** - -In `footerView()`, add a Texts mode branch that shows the correct keybindings for the current text view level. - -- [ ] **Step 6: Add mode indicator to header** - -In `buildTitleBar()`, show "Email" or "Texts" mode indicator. Show "m: switch mode" in the title bar. 
- -- [ ] **Step 7: Commit** - -```bash -git add internal/tui/text_view.go internal/tui/view.go -git commit -m "Add Texts mode views: conversations, aggregates, timeline" -``` - -### Task 18: Texts Mode Search - -**Files:** -- Modify: `internal/tui/text_keys.go` — add `/` handler -- Create: `internal/tui/text_search.go` — text search state management - -- [ ] **Step 1: Add search handling** - -In `handleTextListKeys`, the `/` key enters search mode. In Texts mode, search uses plain FTS (no Gmail operators): - -```go -case "/": - m.searchMode = true - m.searchInput = "" - return m, nil -``` - -When search is submitted, call `m.textEngine.TextSearch(ctx, query, limit, 0)` instead of the email search path. - -- [ ] **Step 2: Display search results** - -Search results in Texts mode show as a message list (same as timeline view). Pressing Esc exits search. - -- [ ] **Step 3: Commit** - -```bash -git add internal/tui/text_keys.go internal/tui/text_search.go -git commit -m "Add plain full-text search in Texts mode" -``` - -### Task 19: Data Loading Commands for Texts Mode - -**Files:** -- Create: `internal/tui/text_commands.go` - -- [ ] **Step 1: Implement async data loading commands** - -Following the Bubble Tea pattern, each data load returns a `tea.Cmd` that runs asynchronously and sends a message when done: - -```go -// internal/tui/text_commands.go -package tui - -import ( - "context" - tea "github.com/charmbracelet/bubbletea" - "github.com/wesm/msgvault/internal/query" -) - -// Message types for async text data loading -type textConversationsLoadedMsg struct { - conversations []query.ConversationRow - err error -} - -type textAggregateLoadedMsg struct { - rows []query.AggregateRow - err error -} - -type textMessagesLoadedMsg struct { - messages []query.MessageSummary - err error -} - -type textStatsLoadedMsg struct { - stats *query.TotalStats - err error -} - -func (m Model) loadTextConversations() tea.Cmd { - return func() tea.Msg { - convs, err := 
m.textEngine.ListConversations( - context.Background(), m.textState.filter) - return textConversationsLoadedMsg{convs, err} - } -} - -func (m Model) loadTextAggregate() tea.Cmd { - return func() tea.Msg { - rows, err := m.textEngine.TextAggregate( - context.Background(), - m.textState.viewType, - query.TextAggregateOptions{ - SourceID: m.textState.filter.SourceID, - After: m.textState.filter.After, - Before: m.textState.filter.Before, - SortField: m.textState.filter.SortField, - SortDirection: m.textState.filter.SortDirection, - Limit: m.aggregateLimit, - }) - return textAggregateLoadedMsg{rows, err} - } -} - -func (m Model) loadTextMessages() tea.Cmd { - return func() tea.Msg { - msgs, err := m.textEngine.ListConversationMessages( - context.Background(), - m.textState.selectedConvID, - m.textState.filter) - return textMessagesLoadedMsg{msgs, err} - } -} - -func (m Model) loadTextData() tea.Cmd { - switch m.textState.viewType { - case query.TextViewConversations: - return m.loadTextConversations() - default: - return m.loadTextAggregate() - } -} -``` - -- [ ] **Step 2: Handle loaded messages in Update** - -In `model.go`'s `Update` method, add cases for the new message types: - -```go -case textConversationsLoadedMsg: - m.textState.conversations = msg.conversations - m.loading = false - // ... -case textAggregateLoadedMsg: - m.textState.aggregateRows = msg.rows - m.loading = false - // ... -case textMessagesLoadedMsg: - m.textState.messages = msg.messages - m.loading = false - // ... 
-``` - -- [ ] **Step 3: Commit** - -```bash -git add internal/tui/text_commands.go internal/tui/model.go -git commit -m "Add async data loading for Texts mode" -``` - ---- - -## Phase 5: Integration & Polish - -### Task 20: Wire TUI Init to Load Text Engine - -**Files:** -- Modify: `cmd/msgvault/cmd/tui.go` - -- [ ] **Step 1: Pass TextEngine to TUI** - -In `tui.go`, after creating the query engine, check if it implements `TextEngine` and pass it through `tui.Options`: - -```go -type Options struct { - DataDir string - Version string - IsRemote bool - TextEngine query.TextEngine // nil if not available -} -``` - -In the TUI command's `RunE`, after engine creation: - -```go -var textEngine query.TextEngine -if te, ok := engine.(query.TextEngine); ok { - textEngine = te -} -opts := tui.Options{ - DataDir: dataDir, - Version: version, - IsRemote: isRemote, - TextEngine: textEngine, -} -``` - -- [ ] **Step 2: Commit** - -```bash -git add cmd/msgvault/cmd/tui.go internal/tui/model.go -git commit -m "Wire TextEngine into TUI initialization" -``` - -### Task 21: End-to-End Integration Test - -**Files:** -- Create: `internal/textimport/integration_test.go` - -- [ ] **Step 1: Write integration test** - -Create a test that: -1. Creates an in-memory store -2. Simulates importing messages from two different sources using store methods directly (no actual chat.db/Takeout needed) -3. Verifies participant deduplication by phone number -4. Verifies conversation stats after RecomputeConversationStats -5. Verifies labels are linked -6. Creates a SQLiteEngine and verifies TextEngine methods return correct results - -This test exercises the full pipeline without needing real source data. 
- -- [ ] **Step 2: Run tests, commit** - -```bash -go test -tags fts5 ./internal/textimport/ -run TestIntegration -v -git add internal/textimport/integration_test.go -git commit -m "Add end-to-end integration test for text message import" -``` - -### Task 22: Build and Smoke Test - -- [ ] **Step 1: Build** - -```bash -make build -``` - -- [ ] **Step 2: Run full test suite** - -```bash -make test -``` - -- [ ] **Step 3: Run linter** - -```bash -make lint -``` - -- [ ] **Step 4: Fix any issues and commit** - -### Task 23: Final Commit — Remove Dead Code - -- [ ] **Step 1: Remove old sync command registrations** - -Check `root.go` for any remaining references to `syncImessageCmd`, `syncGvoiceCmd`. Remove them. - -- [ ] **Step 2: Remove unused gmail.API imports from imessage/gvoice packages** - -After refactoring, `internal/imessage/` and `internal/gvoice/` should no longer import `gmail` package. Verify and clean up. - -- [ ] **Step 3: Remove the design plan doc from WhatsApp PR** - -`docs/plans/2026-02-17-multi-source-messaging.md` was included in the WhatsApp PR as a planning doc. It's superseded by the spec. Remove it. - -- [ ] **Step 4: Run full test suite and linter one final time** - -```bash -make test && make lint -``` - -- [ ] **Step 5: Commit** - -```bash -git add -A -git commit -m "Clean up dead code and superseded planning docs" -``` diff --git a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md b/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md deleted file mode 100644 index 4218a96d..00000000 --- a/docs/superpowers/specs/2026-03-31-unified-text-message-import-design.md +++ /dev/null @@ -1,484 +0,0 @@ -# Unified Text Message Import - -Merge three independent text message import implementations (WhatsApp -#160, iMessage #224, Google Voice #225) into a coherent system with a -shared schema, unified participant model, and dedicated TUI experience. - -## Guiding Principles - -1. 
**Phone number is the primary unification key.** If you communicate - with someone through multiple channels (iMessage, WhatsApp, Google - Voice) using the same phone number, all messages appear under one - contact. Cross-channel unification where the only shared identifier - is an address book entry (e.g., alice@icloud.com in iMessage and - +1... in WhatsApp) requires address book resolution, which is - deferred. Phone-based dedup handles the common case; gaps are - acknowledged, not hidden. -2. **Texts are not emails.** The TUI has a separate Texts mode with - conversation-centric navigation, not the sender-aggregate model used - for email. -3. **Consistent UX across modes.** Same keybindings, sort/filter - patterns, and visual language in both Email and Texts modes. Only the - available views and drill-down behavior differ. -4. **Texts mode is read-only.** Imported text archives have no live - delete API (iMessage reads a local DB, WhatsApp reads a backup, - GVoice reads a Takeout export). Deletion staging (`d`/`D`) and - selection (`Space`/`S`) are disabled in Texts mode. - -## Schema & Persistence - -All text message importers converge on the same storage pattern. - -### Participant Model - -Phone number is the preferred unification key, but not all participants -have one. iMessage handles can be email addresses, and some senders are -short codes or system identifiers. - -**Resolution order:** -1. If the handle normalizes to a valid E.164 phone number, use - `EnsureParticipantByPhone` — this deduplicates across sources so the - same phone from WhatsApp, iMessage, and Google Voice resolves to one - `participants` row. -2. If the handle is an email address (common in iMessage), use the - existing `EnsureParticipant` by email — the participant gets an - `email_address` but no `phone_number`. -3. If the handle is neither (short codes, system senders), create a - participant with the raw handle stored in `participant_identifiers` - and no canonical phone or email. 
- -No synthetic email addresses (`@phone.imessage`, `@phone.gvoice`). - -**Platform identifier tracking:** `EnsureParticipantByPhone` (and the -email path) accept an `identifierType` parameter (`'whatsapp'`, -`'imessage'`, `'google_voice'`) so each importer registers its own -platform-specific identifier in `participant_identifiers`. The current -WhatsApp-hardcoded behavior is generalized. - -A shared `NormalizePhone()` utility ensures consistent E.164 -normalization across all importers. It returns an error for inputs that -cannot be normalized (email handles, short codes), signaling the caller -to fall through to path 2 or 3 above. - -**Cross-channel limitations:** Participants matched by phone number are -unified automatically. Participants only known by email (e.g., an -iMessage contact using their iCloud address) remain separate from the -same person's phone-based participant until address book resolution is -implemented. The Contacts aggregate view in Texts mode will show these -as separate entries. - -### Message Storage - -| Column | Value | -|---|---| -| `messages.message_type` | `'whatsapp'`, `'imessage'`, `'sms'`, `'google_voice_text'`, `'google_voice_call'`, `'google_voice_voicemail'` | -| `messages.sender_id` | FK to `participants.id` (direct link, not via `message_recipients`) | -| `messages.subject` | NULL for text messages | -| `conversations.conversation_type` | `'group_chat'` or `'direct_chat'` | -| `conversations.title` | Group name, or resolved contact name for 1:1 (see fallback below) | -| `sources.source_type` | `'whatsapp'`, `'apple_messages'`, `'google_voice'` | -| `message_bodies.body_text` | Message text stored directly | -| `message_raw.raw_format` | `'whatsapp_json'`, `'imessage_json'`, `'gvoice_html'` | - -No synthetic MIME wrapping for text messages. Body text goes directly -into `message_bodies`. Raw source data is stored in its native format. 
- -### Conversation Title Fallback - -Group chats use the group name from the source (WhatsApp subject, -iMessage `display_name`). For 1:1 chats, title is resolved with this -fallback chain: -1. `chat.display_name` (if set by the source) -2. Other participant's `display_name` from `participants` -3. Other participant's phone number or email handle - -The TUI Conversations view uses this title for display. If the title -is still empty at display time (e.g., participant not yet resolved), -the raw handle is shown. - -### Message Type Values - -- iMessage sets `'imessage'` or `'sms'` based on the service field in - `chat.db` (Apple distinguishes these natively). -- Google Voice uses distinct `message_type` values per record kind: - `'google_voice_text'` for SMS/MMS, `'google_voice_call'` for call - records, and `'google_voice_voicemail'` for voicemails. Labels - (`sms`, `mms`, `call_received`, `call_placed`, `call_missed`, - `voicemail`) provide finer-grained classification within each type. - Call records have `conversation_type = 'direct_chat'` and are - grouped into `calls:` threads. - -### Texts Mode Message Type Filtering - -Texts mode displays messages where `message_type` is one of: -`'whatsapp'`, `'imessage'`, `'sms'`, `'google_voice_text'`. Call -records (`'google_voice_call'`) and voicemails -(`'google_voice_voicemail'`) are excluded from the default Texts view. -They are accessible via the Labels aggregate view when filtered to the -relevant label. - -### Conversation Stats - -The `conversations` table has denormalized stats columns: -`message_count`, `participant_count`, `last_message_at`, -`last_message_preview`. These are required for the Conversations -primary view. - -**Stats are not maintained during message insertion.** Message -insertion is idempotent (`INSERT ... ON CONFLICT DO UPDATE`) and -imports are expected to be re-runnable. The store does not attempt to -detect insert-vs-update, and does not increment counters on upsert. 
- -Instead, each importer calls `RecomputeConversationStats(sourceID)` -as a post-import step (like WhatsApp already does today). This runs -aggregate queries against `messages` and `conversation_participants` -to set all stats columns for conversations belonging to that source. -The operation is idempotent — running it twice produces the same -result. - -### Label Persistence - -All importers that produce labels must create `labels` rows and link -them via `message_labels`. This is part of the shared persistence -contract: -- **WhatsApp:** source-specific labels as needed -- **iMessage:** `'iMessage'`, `'SMS'` (from service field) -- **Google Voice:** `'sms'`, `'mms'`, `'call_received'`, - `'call_placed'`, `'call_missed'`, `'voicemail'` - -The store provides `EnsureLabel(name, sourceID)` and -`LinkMessageLabel(messageID, labelID)`. Google Voice call/voicemail -records depend on labels for discoverability in the Labels aggregate -view. - -### `conversation_participants` - -All three importers populate this table to track who is in each -conversation, with roles where applicable (e.g., WhatsApp group admins). - -## Importer Architecture - -### Per-Source Packages - -Each importer is its own package with source-specific parsing: - -- `internal/whatsapp/` — reads decrypted WhatsApp `msgstore.db` -- `internal/imessage/` — reads macOS `chat.db` -- `internal/gvoice/` — parses Google Takeout HTML/VCF files - -No shared interface is forced — each source is too different. But all -converge on the same store methods for persistence: -`EnsureParticipantByPhone(phone, identifierType)`, -`EnsureParticipant(email, identifierType)` (for email-based handles), -`EnsureConversationWithType`, `EnsureLabel`, `LinkMessageLabel`, -`RecomputeConversationStats`, and message insertion with proper -`message_type`/`sender_id`/`conversation_type`. 
- -### Shared Utilities (`internal/textimport/`) - -- `NormalizePhone(raw string) (string, error)` — E.164 normalization; - returns error for non-phone inputs -- Progress reporting (callback-based, like WhatsApp's - `ImportCLIProgress`) - -### iMessage Refactoring - -Drop `gmail.API` interface implementation and synthetic MIME generation. -Instead: -- Read from `chat.db` directly (parsing stays the same) -- Resolve participants via phone or email (iMessage handles can be - either); use `NormalizePhone` first, fall back to email path -- Set `message_type = 'imessage'` or `'sms'` (based on iMessage - service field) -- Set `conversation_type` based on chat type (group vs 1:1) -- Populate `conversations.title` using the fallback chain (see - Conversation Title Fallback section) -- Create labels (`'iMessage'`, `'SMS'`) and link to messages -- Call `RecomputeConversationStats` after import completes - -### Google Voice Refactoring - -Drop `gmail.API` interface implementation and synthetic MIME generation. -Instead: -- Parse HTML/VCF files (parsing stays the same) -- Call store methods for persistence with proper phone-based participants -- Set `message_type` per record kind: `'google_voice_text'`, - `'google_voice_call'`, or `'google_voice_voicemail'` -- Set `conversation_type` based on participant count -- Store body text directly, raw HTML in `message_raw` -- Create labels (`'sms'`, `'mms'`, `'call_received'`, etc.) and link - to messages -- Call `RecomputeConversationStats` after import completes - -### WhatsApp - -Mostly fine as-is — already follows the target pattern. Minor cleanup: -- Use shared `NormalizePhone()` instead of internal normalization -- Migrate bulk stats update to shared `RecomputeConversationStats` -- Ensure consistent `raw_format` naming - -### CLI Commands - -Renamed for consistency (each stays separate since inputs differ): - -``` -msgvault import-whatsapp --phone +1... [--media-dir] [--contacts] -msgvault import-imessage [--me +1...] 
-msgvault import-gvoice -``` - -The `source_type` is `'whatsapp'` regardless of import method (backup -now, web sync API later). `raw_format` in `message_raw` can distinguish -import methods if needed. - -## TUI Texts Mode - -### New Navigation Model - -Texts mode requires a different navigation shape than Email mode. The -current TUI is built around a single-key aggregate model: `ViewType` -selects a grouping dimension (sender, domain, label, time), -`AggregateRow` holds one key plus counts/sizes, and drill-down goes -from aggregate → message list → message detail. This structure does -not accommodate conversation-first navigation. - -**Texts mode introduces a parallel navigation tree:** - -``` -Texts Mode -├── Conversations view (primary) -│ └── Drill: conversation → message timeline -├── Contacts view (aggregate) -│ └── Drill: contact → conversations with that contact → timeline -├── Contact Names view (aggregate) -│ └── Drill: name → conversations → timeline -├── Sources view (aggregate) -│ └── Drill: source → conversations from that source → timeline -├── Labels view (aggregate) -│ └── Drill: label → messages with that label -└── Time view (aggregate) - └── Drill: period → conversations active in that period → timeline -``` - -**Implementation approach:** This is a new set of view types, query -methods, and TUI states — not a parameterization of the existing email -views. - -- New `TextViewType` enum: `TextViewConversations`, - `TextViewContacts`, `TextViewContactNames`, `TextViewSources`, - `TextViewLabels`, `TextViewTime`. -- New `ConversationRow` struct for the Conversations view: `Title`, - `SourceType`, `MessageCount`, `ParticipantCount`, `LastMessageAt`, - `ConversationID`. This is not an `AggregateRow` — it has different - fields and different drill-down semantics. -- New query engine methods: `ListConversations(filter)`, - `TextAggregate(viewType, opts)`, `ListConversationMessages(convID, - filter)`. 
These are separate from the email `Aggregate`/ - `ListMessages` methods. -- New TUI state machine entries for Texts mode navigation. The mode - key (`m`) switches between the two state machines. Keybindings that - overlap (Tab, Enter, Esc, `s`, `r`, `a`, `/`, `?`, `q`) behave - the same way within each mode's navigation tree. - -The email TUI code is untouched. Texts mode is additive — new files, -new types, new query interface. - -**Separate query interface.** Text query methods live in a new -`TextEngine` interface (in `internal/query/`), not on the existing -`Engine` interface. `Engine` is shared across the local TUI, remote -engine, API server, MCP server, and test mocks — adding methods to it -would force changes across all those layers. `TextEngine` is consumed -only by the Texts mode TUI, so it avoids that ripple. - -```go -type TextEngine interface { - ListConversations(ctx context.Context, - filter TextFilter) ([]ConversationRow, error) - TextAggregate(ctx context.Context, viewType TextViewType, - opts TextAggregateOptions) ([]AggregateRow, error) - ListConversationMessages(ctx context.Context, convID int64, - filter TextFilter) ([]MessageSummary, error) - TextSearch(ctx context.Context, query string, - limit, offset int) ([]MessageSummary, error) - GetTextStats(ctx context.Context, - opts TextStatsOptions) (*TotalStats, error) -} -``` - -`DuckDBEngine` implements both `Engine` and `TextEngine` (it already -has access to the Parquet data). `SQLiteEngine` can also implement -`TextEngine` as a fallback. The remote engine, API server, and MCP -server do not need to implement `TextEngine` until remote Texts mode -is added (deferred). - -### Conversations View (Primary) - -The default view when entering Texts mode. 
Each row shows: - -| Name | Source | Messages | Participants | Last Message | -|------|--------|----------|-------------|--------------| -| Jane Smith | iMessage | 1,247 | 2 | 2026-03-28 | -| Family Group | WhatsApp | 8,432 | 6 | 2026-03-30 | - -- Default sort: last message date (newest first) -- Drill into a conversation: chronological message timeline -- Messages display in compact chat style (timestamp, sender, body - snippet) -- Conversation stats come from denormalized columns recomputed - post-import - -### Aggregate Views (Tab to Cycle) - -- **Contacts** — aggregate by participant phone number/name, total - messages across all sources and conversations -- **Contact Names** — aggregate by display name -- **Sources** — aggregate by source type (WhatsApp / iMessage / GVoice) -- **Labels** — source-specific labels (GVoice: sms/voicemail/call) -- **Time** — message volume over time (year/month/day granularity) - -### Drill-Down - -- From Conversations: chronological message timeline -- From Contacts: all conversations with that person (across all - sources), then drill into a specific conversation -- From Time: conversations active in that period - -### Message Detail - -Pressing Enter on a message in the timeline does not open the email- -style detail view. The current detail model is email-shaped: -participants are `Address{Email, Name}` only, participant loading -reads `message_recipients`, and fallback body extraction assumes MIME -raw format. None of this works for text messages. - -In Texts mode, Enter on a message in the timeline is a no-op (or -scrolls to show the full message body inline if truncated). A proper -text message detail view is deferred. - -### Keybindings - -Texts mode reuses the same key assignments as Email mode where the -action applies. Keys that map to email-only actions are disabled. 
- -| Key | Email mode | Texts mode | -|-----|-----------|------------| -| `Tab` | Cycle aggregate views | Cycle text views (Conversations → Contacts → ...) | -| `Enter` | Drill down | Drill down (conversation → timeline; no message detail) | -| `Esc`/`Backspace` | Go back | Go back | -| `j`/`k`/`↑`/`↓` | Navigate rows | Navigate rows | -| `s` | Cycle sort field | Cycle sort field | -| `r` | Reverse sort | Reverse sort | -| `t` | Jump to Time view | Jump to Time view | -| `A` | Account selector | Source selector (lists text source accounts) | -| `a` | Jump to all messages | Jump to all conversations (reset filters) | -| `f` | Filter by attachments | Filter by attachments | -| `/` | Search (email FTS) | Search (text FTS, plain text only) | -| `?` | Help | Help | -| `q` | Quit | Quit | -| `m` | Switch to Texts mode | Switch to Email mode | -| `Space` | Toggle selection | Disabled (no deletion staging) | -| `d`/`D` | Stage deletion | Disabled (read-only) | -| `x` | Clear selection | Disabled | - -The `A` key opens the same account selector UI but filtered to text -sources. This is a per-account filter (same `SourceID *int64` -plumbing), not a source-type bucket. - -## Parquet Analytics - -### Unified Cache with Mode Filtering - -Text messages are stored in the same Parquet cache as emails, with -additional columns to support mode-specific queries. This avoids -duplicating the entire cache/query/staleness infrastructure. 
- -``` -~/.msgvault/analytics/ - messages/year=*/ # All messages (email + text) - _last_sync.json -``` - -### Additional Parquet Columns - -The existing denormalized Parquet schema is extended with: -- `phone_number` (sender, from `participants.phone_number`) -- `message_type` (whatsapp/imessage/sms/google_voice_*/email) -- `source_type` (whatsapp/apple_messages/google_voice/gmail) -- `conversation_title` (from `conversations.title`) -- `conversation_type` (group_chat/direct_chat/email_thread) -- `sender_id` (from `messages.sender_id`) - -Email mode queries filter `WHERE message_type = 'email'` (or -`source_type IN ('gmail', 'imap')`). Texts mode queries filter on the -text message types. The DuckDB query engine branches on mode for -aggregate key columns (email uses `from_email`/`from_domain`; texts -use `phone_number`/`conversation_title`). - -### Query Engine - -`DuckDBEngine` implements the new `TextEngine` interface (see TUI -section) alongside the existing `Engine` interface. Text query -methods are separate functions on the same struct, not additions to -the `Engine` interface. - -Existing email query methods on `Engine` gain an implicit -`message_type = 'email'` filter to exclude text messages from email -views. - -## Search - -### FTS Indexing - -Text messages are indexed in `messages_fts` alongside emails. The -FTS backfill pipeline is updated to populate the `from_addr` field -from `participants.phone_number` (via `messages.sender_id`) for text -messages, rather than only reading from `message_recipients` email -fields. - -### Search Semantics by Mode - -**Email mode** retains the current Gmail-style search operators: -`from:`, `to:`, `cc:`, `bcc:`, `subject:`, `account:`, etc. These -resolve against `message_recipients` and email-specific fields. No -changes. 
- -**Texts mode uses plain full-text search only.** The `/` key opens -the same search input, but the query is treated as a plain text match -against `messages_fts` (body + sender phone/name), filtered to text -message types. The Gmail-style operators (`from:`, `subject:`, etc.) -are not supported in Texts mode — they map to email-specific fields -(`message_recipients`, `subject`) that don't apply to text messages. - -If structured text search is needed later (e.g., `from:+1555...`, -`in:groupname`), it would be a new parser for text-specific -operators. For now, plain FTS is sufficient. - -## Scope - -### In Scope - -- Refactor iMessage and Google Voice to phone-based persistence -- Shared `NormalizePhone()` utility -- Participant deduplication by phone number across all sources -- `RecomputeConversationStats` shared store method -- Label persistence contract for all importers -- CLI command renaming -- TUI Texts mode with new navigation model (Conversations + - aggregates + message timeline), read-only, detail view disabled -- New query engine methods for text conversations and aggregates -- Unified Parquet cache with mode-aware columns and queries -- FTS indexing of text messages (including phone-based sender lookup) -- `build-cache` exports text messages alongside emails -- Source filter in Texts mode (per-account, same plumbing as email) - -### Deferred - -- WhatsApp web sync API (future import method) -- MMS/iMessage attachment extraction -- Contact name resolution from macOS address book (needed for full - cross-channel unification of email-only iMessage handles with - phone-based contacts) -- Cross-mode unified search (emails + texts together) -- Rich message detail view for texts -- Deletion support for text sources with live APIs -- Source-type bucket filter (filter by "all WhatsApp" vs per-account) From e1712addf78dc3d6dc4cdcef4e57c6a23a8c9af5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 21:38:54 -0500 Subject: [PATCH 30/65] Fix 
iMessage: source identifier, message_recipients, raw data - Use "local" as default source identifier instead of meaningless "imessage"; add --me flag for phone/email override - Write message_recipients rows (from/to) based on is_from_me: outgoing messages set owner as from and chat participants as to; incoming messages set sender handle as from and owner as to - Set sender_id on is_from_me messages when owner is resolved - Store message data as JSON in message_raw via UpsertMessageRawWithFormat with "imessage_json" format Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/msgvault/cmd/import_imessage.go | 16 ++- internal/imessage/client.go | 185 +++++++++++++++++++++++++++- 2 files changed, 198 insertions(+), 3 deletions(-) diff --git a/cmd/msgvault/cmd/import_imessage.go b/cmd/msgvault/cmd/import_imessage.go index c51692ae..b3687992 100644 --- a/cmd/msgvault/cmd/import_imessage.go +++ b/cmd/msgvault/cmd/import_imessage.go @@ -20,6 +20,7 @@ var ( importImessageBefore string importImessageAfter string importImessageLimit int + importImessageMe string ) var importImessageCmd = &cobra.Command{ @@ -61,6 +62,11 @@ func runImportImessage(cmd *cobra.Command, _ []string) error { if err != nil { return err } + if importImessageMe != "" { + clientOpts = append( + clientOpts, imessage.WithOwnerHandle(importImessageMe), + ) + } client, err := imessage.NewClient(chatDBPath, clientOpts...) 
if err != nil { @@ -69,7 +75,11 @@ func runImportImessage(cmd *cobra.Command, _ []string) error { defer func() { _ = client.Close() }() // Get or create the source - src, err := s.GetOrCreateSource("apple_messages", "imessage") + identifier := "local" + if importImessageMe != "" { + identifier = importImessageMe + } + src, err := s.GetOrCreateSource("apple_messages", identifier) if err != nil { return fmt.Errorf("get or create source: %w", err) } @@ -239,5 +249,9 @@ func init() { &importImessageLimit, "limit", 0, "limit number of messages (for testing)", ) + importImessageCmd.Flags().StringVar( + &importImessageMe, "me", "", + "your phone/email for recipient tracking (default: source identifier 'local')", + ) rootCmd.AddCommand(importImessageCmd) } diff --git a/internal/imessage/client.go b/internal/imessage/client.go index 33edbea1..339a8f94 100644 --- a/internal/imessage/client.go +++ b/internal/imessage/client.go @@ -3,6 +3,7 @@ package imessage import ( "context" "database/sql" + "encoding/json" "fmt" "log/slog" "strconv" @@ -24,6 +25,7 @@ type Client struct { beforeDate time.Time // only import messages before this date limit int // max total messages to import (0 = unlimited) useNanoseconds bool // whether chat.db uses nanosecond timestamps + ownerHandle string // phone/email of device owner (from --me flag) logger *slog.Logger pageSize int } @@ -46,6 +48,12 @@ func WithLimit(n int) ClientOption { return func(c *Client) { c.limit = n } } +// WithOwnerHandle sets the device owner's phone or email for +// recipient tracking. Used to create message_recipients rows. +func WithOwnerHandle(handle string) ClientOption { + return func(c *Client) { c.ownerHandle = handle } +} + // WithImessageLogger sets the logger for the client. 
func WithImessageLogger(l *slog.Logger) ClientOption { return func(c *Client) { c.logger = l } @@ -154,6 +162,21 @@ func (c *Client) Import( return nil, fmt.Errorf("ensure SMS label: %w", err) } + // Resolve owner participant from --me flag for message_recipients + var ownerPID int64 + if c.ownerHandle != "" { + pid, err := c.resolveParticipant( + s, c.ownerHandle, + map[string]int64{}, map[string]int64{}, + summary, + ) + if err != nil { + return nil, fmt.Errorf("resolve owner handle %q: %w", + c.ownerHandle, err) + } + ownerPID = pid + } + // Track resolved participants to avoid repeated DB calls phoneCache := map[string]int64{} // phone -> participantID emailCache := map[string]int64{} // email -> participantID @@ -190,7 +213,7 @@ func (c *Client) Import( ctx, s, sourceID, &msg, imessageLabelID, smsLabelID, phoneCache, emailCache, convCache, - summary, + ownerPID, summary, ); err != nil { c.logger.Warn( "failed to import message", @@ -296,6 +319,7 @@ func (c *Client) importMessage( phoneCache map[string]int64, emailCache map[string]int64, convCache map[string]int64, + ownerPID int64, summary *ImportSummary, ) error { // Determine conversation @@ -318,7 +342,10 @@ func (c *Client) importMessage( // Resolve sender var senderID sql.NullInt64 if msg.IsFromMe != 0 { - // is_from_me messages: sender is the device owner, no external handle + // is_from_me: sender is the device owner + if ownerPID > 0 { + senderID = sql.NullInt64{Int64: ownerPID, Valid: true} + } } else if msg.HandleID != nil { pid, err := c.resolveParticipant( s, *msg.HandleID, phoneCache, emailCache, summary, @@ -381,6 +408,19 @@ func (c *Client) importMessage( } } + // Write message_recipients rows + if err := c.writeMessageRecipients( + s, msgID, msg, senderID, ownerPID, + phoneCache, emailCache, summary, + ); err != nil { + return fmt.Errorf("write message recipients: %w", err) + } + + // Store raw data as JSON for completeness + if err := c.writeMessageRaw(s, msgID, msg, body); err != nil { + 
return fmt.Errorf("write message raw: %w", err) + } + // Label: iMessage or SMS labelID := imessageLabelID if msgType == "sms" { @@ -402,6 +442,147 @@ func (c *Client) importMessage( return nil } +// writeMessageRecipients creates from/to rows in message_recipients. +// For is_from_me: from=owner, to=other chat participants. +// For !is_from_me: from=sender handle, to=owner. +func (c *Client) writeMessageRecipients( + s *store.Store, + msgID int64, + msg *messageRow, + senderID sql.NullInt64, + ownerPID int64, + phoneCache map[string]int64, + emailCache map[string]int64, + summary *ImportSummary, +) error { + if msg.IsFromMe != 0 { + // Sender is the device owner + if ownerPID > 0 { + if err := s.ReplaceMessageRecipients( + msgID, "from", []int64{ownerPID}, nil, + ); err != nil { + return err + } + } + // Recipients are the other chat participants + toPIDs := c.getChatParticipantIDs( + s, msg, phoneCache, emailCache, summary, + ) + if len(toPIDs) > 0 { + if err := s.ReplaceMessageRecipients( + msgID, "to", toPIDs, nil, + ); err != nil { + return err + } + } + } else { + // Sender is the external handle + if senderID.Valid { + if err := s.ReplaceMessageRecipients( + msgID, "from", + []int64{senderID.Int64}, nil, + ); err != nil { + return err + } + } + // Recipient is the device owner + if ownerPID > 0 { + if err := s.ReplaceMessageRecipients( + msgID, "to", []int64{ownerPID}, nil, + ); err != nil { + return err + } + } + } + return nil +} + +// getChatParticipantIDs returns participant IDs for the chat members +// (excluding the owner). Used for "to" recipients on is_from_me messages. 
+func (c *Client) getChatParticipantIDs( + s *store.Store, + msg *messageRow, + phoneCache map[string]int64, + emailCache map[string]int64, + summary *ImportSummary, +) []int64 { + if msg.ChatROWID == nil { + // No chat info; fall back to handle if available + if msg.HandleID != nil { + pid, err := c.resolveParticipant( + s, *msg.HandleID, + phoneCache, emailCache, summary, + ) + if err == nil && pid > 0 { + return []int64{pid} + } + } + return nil + } + + rows, err := c.db.Query(` + SELECT h.id + FROM chat_handle_join chj + JOIN handle h ON h.ROWID = chj.handle_id + WHERE chj.chat_id = ? + `, *msg.ChatROWID) + if err != nil { + return nil + } + defer func() { _ = rows.Close() }() + + var pids []int64 + for rows.Next() { + var handleID string + if err := rows.Scan(&handleID); err != nil { + continue + } + pid, err := c.resolveParticipant( + s, handleID, phoneCache, emailCache, summary, + ) + if err == nil && pid > 0 { + pids = append(pids, pid) + } + } + return pids +} + +// writeMessageRaw serializes the message data as JSON and stores it. +func (c *Client) writeMessageRaw( + s *store.Store, + msgID int64, + msg *messageRow, + body string, +) error { + raw := map[string]interface{}{ + "rowid": msg.ROWID, + "guid": msg.GUID, + "date": msg.Date, + "is_from_me": msg.IsFromMe, + "body": body, + } + if msg.Service != nil { + raw["service"] = *msg.Service + } + if msg.HandleID != nil { + raw["handle_id"] = *msg.HandleID + } + if msg.ChatGUID != nil { + raw["chat_guid"] = *msg.ChatGUID + } + if msg.ChatDisplayName != nil { + raw["chat_display_name"] = *msg.ChatDisplayName + } + if msg.ChatIdentifier != nil { + raw["chat_identifier"] = *msg.ChatIdentifier + } + rawJSON, err := json.Marshal(raw) + if err != nil { + return fmt.Errorf("marshal raw JSON: %w", err) + } + return s.UpsertMessageRawWithFormat(msgID, rawJSON, "imessage_json") +} + // ensureConversation gets or creates a conversation for the chat, // resolving participants and setting the title. 
func (c *Client) ensureConversation( From cb24095054259885d0c2083703f6ec6031579331 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 21:53:12 -0500 Subject: [PATCH 31/65] Fix GVoice: add message_recipients and raw data for text entries Resolve two issues in the Google Voice importer: 1. Write message_recipients rows after upserting each message in both importTextEntry and importCallEntry. Text messages use IsMe to determine from/to direction; call records use call type (placed vs received/missed/voicemail). 2. Store raw HTML for text entries via UpsertMessageRawWithFormat, matching what call entries already do. Also cache the owner participant ID at the start of Import() instead of re-resolving it in every importTextEntry/importCallEntry call. Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/gvoice/client.go | 171 ++++++++++++++++++++++++++++++++++---- 1 file changed, 157 insertions(+), 14 deletions(-) diff --git a/internal/gvoice/client.go b/internal/gvoice/client.go index f6395511..e47eaef2 100644 --- a/internal/gvoice/client.go +++ b/internal/gvoice/client.go @@ -143,6 +143,15 @@ func (c *Client) Import( convCache := map[string]int64{} // threadID → conversationID imported := 0 + // Resolve owner participant once for all messages + ownerID, err := c.resolveParticipant( + s, c.owner.GoogleVoice, "", + phoneCache, summary, + ) + if err != nil { + return nil, fmt.Errorf("resolve owner: %w", err) + } + for _, entry := range c.index { if err := ctx.Err(); err != nil { return summary, err @@ -154,7 +163,7 @@ func (c *Client) Import( switch entry.FileType { case fileTypeText, fileTypeGroup: n, err := c.importTextEntry( - ctx, s, sourceID, &entry, + ctx, s, sourceID, &entry, ownerID, labelIDs, phoneCache, convCache, summary, ) if err != nil { @@ -170,7 +179,7 @@ func (c *Client) Import( default: if err := c.importCallEntry( - ctx, s, sourceID, &entry, + ctx, s, sourceID, &entry, ownerID, labelIDs, phoneCache, convCache, summary, ); err != nil { 
c.logger.Warn( @@ -214,6 +223,7 @@ func (c *Client) importTextEntry( s *store.Store, sourceID int64, entry *indexEntry, + ownerID int64, labelIDs map[string]int64, phoneCache map[string]int64, convCache map[string]int64, @@ -265,12 +275,8 @@ func (c *Client) importTextEntry( _ = s.EnsureConversationParticipant(convID, senderID, "member") } - // Ensure owner as participant - ownerID, err := c.resolveParticipant( - s, c.owner.GoogleVoice, "", - phoneCache, summary, - ) - if err == nil && ownerID > 0 { + // Ensure owner as conversation participant + if ownerID > 0 { _ = s.EnsureConversationParticipant(convID, ownerID, "member") } @@ -327,6 +333,22 @@ func (c *Client) importTextEntry( return 0, fmt.Errorf("upsert message body: %w", err) } + // Store raw HTML + rawData, rErr := os.ReadFile(entry.FilePath) + if rErr == nil { + _ = s.UpsertMessageRawWithFormat( + msgID, rawData, "gvoice_html", + ) + } + + // Write message_recipients + if err := c.writeTextRecipients( + s, msgID, msg, ownerID, senderID, entry.FileType, + groupParticipants, phoneCache, summary, + ); err != nil { + return 0, fmt.Errorf("write message recipients: %w", err) + } + // Link labels for _, labelName := range entry.Labels { if lid, ok := labelIDs[labelName]; ok { @@ -349,6 +371,7 @@ func (c *Client) importCallEntry( s *store.Store, sourceID int64, entry *indexEntry, + ownerID int64, labelIDs map[string]int64, phoneCache map[string]int64, convCache map[string]int64, @@ -388,12 +411,8 @@ func (c *Client) importCallEntry( ) } - // Ensure owner as participant - ownerID, err := c.resolveParticipant( - s, c.owner.GoogleVoice, "", - phoneCache, summary, - ) - if err == nil && ownerID > 0 { + // Ensure owner as conversation participant + if ownerID > 0 { _ = s.EnsureConversationParticipant( convID, ownerID, "member", ) @@ -475,6 +494,13 @@ func (c *Client) importCallEntry( ) } + // Write message_recipients + if err := writeCallRecipients( + s, msgID, ownerID, contactID, record.CallType, + ); err != nil { + 
return fmt.Errorf("write message recipients: %w", err) + } + // Link labels for _, labelName := range entry.Labels { if lid, ok := labelIDs[labelName]; ok { @@ -486,6 +512,123 @@ func (c *Client) importCallEntry( return nil } +// writeTextRecipients writes from/to rows for a text message. +// IsMe=true: from=owner, to=contact(s). IsMe=false: from=sender, to=owner. +func (c *Client) writeTextRecipients( + s *store.Store, + msgID int64, + msg textMessage, + ownerID, senderID int64, + ft fileType, + groupParticipants []string, + phoneCache map[string]int64, + summary *ImportSummary, +) error { + if msg.IsMe { + // From: owner + if ownerID > 0 { + if err := s.ReplaceMessageRecipients( + msgID, "from", []int64{ownerID}, nil, + ); err != nil { + return err + } + } + // To: group participants or the contact + toIDs := c.collectRecipientIDs( + s, ft, senderID, groupParticipants, + phoneCache, summary, + ) + if len(toIDs) > 0 { + if err := s.ReplaceMessageRecipients( + msgID, "to", toIDs, nil, + ); err != nil { + return err + } + } + } else { + // From: external sender + if senderID > 0 { + if err := s.ReplaceMessageRecipients( + msgID, "from", []int64{senderID}, nil, + ); err != nil { + return err + } + } + // To: owner + if ownerID > 0 { + if err := s.ReplaceMessageRecipients( + msgID, "to", []int64{ownerID}, nil, + ); err != nil { + return err + } + } + } + return nil +} + +// collectRecipientIDs returns the "to" participant IDs for an +// outgoing text message. For group chats, all group participants +// (excluding the owner). For direct chats, the contact sender ID. 
+func (c *Client) collectRecipientIDs( + s *store.Store, + ft fileType, + contactID int64, + groupParticipants []string, + phoneCache map[string]int64, + summary *ImportSummary, +) []int64 { + if ft == fileTypeGroup && len(groupParticipants) > 0 { + var ids []int64 + for _, phone := range groupParticipants { + pid, err := c.resolveParticipant( + s, phone, "", phoneCache, summary, + ) + if err == nil && pid > 0 { + ids = append(ids, pid) + } + } + return ids + } + if contactID > 0 { + return []int64{contactID} + } + return nil +} + +// writeCallRecipients writes from/to rows for a call record. +// Placed calls: from=owner, to=contact. +// Received/missed/voicemail: from=contact, to=owner. +func writeCallRecipients( + s *store.Store, + msgID, ownerID, contactID int64, + callType fileType, +) error { + var fromID, toID int64 + switch callType { + case fileTypePlaced: + fromID = ownerID + toID = contactID + default: + fromID = contactID + toID = ownerID + } + if fromID > 0 { + if err := s.ReplaceMessageRecipients( + msgID, "from", []int64{fromID}, nil, + ); err != nil { + return err + } + } + if toID > 0 { + if err := s.ReplaceMessageRecipients( + msgID, "to", []int64{toID}, nil, + ); err != nil { + return err + } + } + return nil +} + func (c *Client) ensureConv( s *store.Store, sourceID int64, From 6d459de95ed37fdd0707d6f23e0bc6a6cd3ab189 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 21:56:16 -0500 Subject: [PATCH 32/65] Fix review findings: attachments, fast search, test - WhatsApp: skip attachment row when no media stored (no --media-dir) - DuckDB: include snippet in fast search text matching - iMessage: fix test for non-phone handle prefix (p:+1555123) --- internal/imessage/parser_test.go | 6 +++--- internal/query/duckdb.go | 7 +++++-- internal/whatsapp/importer.go | 14 +++++++++----- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/internal/imessage/parser_test.go b/internal/imessage/parser_test.go index d0959a0e..74ea08f6 
100644 --- a/internal/imessage/parser_test.go +++ b/internal/imessage/parser_test.go @@ -142,10 +142,10 @@ func TestResolveHandle(t *testing.T) { wantDisplayName: "12345", }, { - name: "handle with digits parses as phone", + name: "handle with prefix falls to raw handle", handleID: "p:+1555123", - wantPhone: "+1555123", - wantDisplayName: "+1555123", + wantPhone: "", + wantDisplayName: "p:+1555123", }, { name: "system handle without digits", diff --git a/internal/query/duckdb.go b/internal/query/duckdb.go index b8872ed4..b3eedf46 100644 --- a/internal/query/duckdb.go +++ b/internal/query/duckdb.go @@ -417,6 +417,8 @@ func (e *DuckDBEngine) buildAggregateSearchConditions(searchQuery string, keyCol var parts []string parts = append(parts, `msg.subject ILIKE ? ESCAPE '\'`) args = append(args, termPattern) + parts = append(parts, `COALESCE(msg.snippet, '') ILIKE ? ESCAPE '\'`) + args = append(args, termPattern) parts = append(parts, `EXISTS ( SELECT 1 FROM mr mr_search JOIN p p_search ON p_search.id = mr_search.participant_id @@ -2330,17 +2332,18 @@ func (e *DuckDBEngine) buildSearchConditions(q *search.Query, filter MessageFilt args = append(args, filter.TimeRange.Period) } - // Text search terms - search subject and from fields only (fast path) + // Text search terms - search subject, snippet, and sender fields (fast path) if len(q.TextTerms) > 0 { for _, term := range q.TextTerms { termPattern := "%" + escapeILIKE(term) + "%" conditions = append(conditions, `( msg.subject ILIKE ? ESCAPE '\' OR + COALESCE(msg.snippet, '') ILIKE ? ESCAPE '\' OR COALESCE(ms.from_email, ds.from_email, '') ILIKE ? ESCAPE '\' OR COALESCE(ms.from_name, ds.from_name, '') ILIKE ? ESCAPE '\' OR COALESCE(ms.from_phone, ds.from_phone, '') ILIKE ? 
ESCAPE '\' )`) - args = append(args, termPattern, termPattern, termPattern, termPattern) + args = append(args, termPattern, termPattern, termPattern, termPattern, termPattern) } } diff --git a/internal/whatsapp/importer.go b/internal/whatsapp/importer.go index 17406c22..4c50253a 100644 --- a/internal/whatsapp/importer.go +++ b/internal/whatsapp/importer.go @@ -381,11 +381,15 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt size = int(media.FileSize.Int64) } - // Use UpsertAttachment — it handles dedup by content_hash. - err := imp.store.UpsertAttachment(messageID, filename, mimeType, storagePath, contentHash, size) - if err != nil { - summary.Errors++ - imp.progress.OnError(fmt.Errorf("upsert attachment for message %s: %w", waMsg.KeyID, err)) + // Only insert attachment row when we have actual content. + // Without --media-dir, storagePath and contentHash are both + // empty; inserting would create broken records. + if storagePath != "" || contentHash != "" { + err := imp.store.UpsertAttachment(messageID, filename, mimeType, storagePath, contentHash, size) + if err != nil { + summary.Errors++ + imp.progress.OnError(fmt.Errorf("upsert attachment for message %s: %w", waMsg.KeyID, err)) + } } // Store media metadata in the attachments table is done above. 
From b65fd1e40588e4d07d8fb219b945227702c70aac Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 22:01:45 -0500 Subject: [PATCH 33/65] Fix TextEngine: FTS syntax, contact views, default sort - Fix FTS5 MATCH syntax: use `messages_fts MATCH ?` instead of `fts.messages_fts MATCH ?` (FTS5 requires table name, not alias.table) - Fix contact filters and aggregates in both DuckDB and SQLite engines to use sender_id joined to participants instead of message_recipients, matching how text importers populate sender data - Fix ListConversations default sort: only override the last_message_at DESC default when SortField is explicitly set (zero value SortByCount was incorrectly applied as default) - Add TODO for participant_count approximation (uses distinct sender_id count until conversation_participants is exported to Parquet) Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/query/duckdb_text.go | 63 +++++++++++++++++++---------------- internal/query/sqlite_text.go | 39 ++++++++++++---------- 2 files changed, 57 insertions(+), 45 deletions(-) diff --git a/internal/query/duckdb_text.go b/internal/query/duckdb_text.go index 8834da06..88bd4f8d 100644 --- a/internal/query/duckdb_text.go +++ b/internal/query/duckdb_text.go @@ -33,19 +33,24 @@ func (e *DuckDBEngine) buildTextFilterConditions( } if filter.ContactPhone != "" { conditions = append(conditions, `EXISTS ( - SELECT 1 FROM mr - JOIN p ON p.id = mr.participant_id - WHERE mr.message_id = msg.id - AND p.phone_number = ? + SELECT 1 FROM p p_filter + WHERE p_filter.id = msg.sender_id + AND COALESCE( + NULLIF(p_filter.phone_number, ''), + p_filter.email_address + ) = ? )`) args = append(args, filter.ContactPhone) } if filter.ContactName != "" { conditions = append(conditions, `EXISTS ( - SELECT 1 FROM mr - JOIN p ON p.id = mr.participant_id - WHERE mr.message_id = msg.id - AND COALESCE(NULLIF(TRIM(p.display_name), ''), p.email_address) = ? 
+ SELECT 1 FROM p p_filter + WHERE p_filter.id = msg.sender_id + AND COALESCE( + NULLIF(TRIM(p_filter.display_name), ''), + NULLIF(p_filter.phone_number, ''), + p_filter.email_address + ) = ? )`) args = append(args, filter.ContactName) } @@ -96,17 +101,18 @@ func (e *DuckDBEngine) ListConversations( ) ([]ConversationRow, error) { where, args := e.buildTextFilterConditions(filter) - // Sort clause + // Sort clause: default to last_message_at DESC for conversations, + // since the zero value of SortField (SortByCount) is not meaningful here. orderBy := "last_message_at DESC" - switch filter.SortField { - case SortByCount: - orderBy = "message_count" - case SortBySize: - orderBy = "total_size" - case SortByName: - orderBy = "title" - } if filter.SortField != 0 { + switch filter.SortField { + case SortByCount: + orderBy = "message_count" + case SortBySize: + orderBy = "total_size" + case SortByName: + orderBy = "title" + } if filter.SortDirection == SortAsc { orderBy += " ASC" } else { @@ -125,6 +131,7 @@ func (e *DuckDBEngine) ListConversations( SELECT msg.conversation_id, COUNT(*) AS message_count, + -- TODO: use conversation_participants table once exported to Parquet COUNT(DISTINCT COALESCE(msg.sender_id, 0)) AS participant_count, MAX(msg.sent_at) AS last_message_at, COALESCE(SUM(CAST(msg.size_estimate AS BIGINT)), 0) AS total_size, @@ -186,20 +193,20 @@ func textAggViewDef( ) (aggViewDef, error) { switch view { case TextViewContacts: + keyExpr := "COALESCE(NULLIF(p_sender.phone_number, ''), " + + "p_sender.email_address)" return aggViewDef{ - keyExpr: "COALESCE(NULLIF(p.phone_number, ''), p.email_address)", - joinClause: `JOIN mr ON mr.message_id = msg.id - JOIN p ON p.id = mr.participant_id`, - nullGuard: "COALESCE(NULLIF(p.phone_number, ''), p.email_address) IS NOT NULL", + keyExpr: keyExpr, + joinClause: "JOIN p p_sender ON p_sender.id = msg.sender_id", + nullGuard: keyExpr + " IS NOT NULL AND msg.sender_id IS NOT NULL", }, nil case TextViewContactNames: - 
nameExpr := "COALESCE(NULLIF(TRIM(p.display_name), ''), " + - "NULLIF(p.phone_number, ''), p.email_address)" + nameExpr := "COALESCE(NULLIF(TRIM(p_sender.display_name), ''), " + + "NULLIF(p_sender.phone_number, ''), p_sender.email_address)" return aggViewDef{ - keyExpr: nameExpr, - joinClause: `JOIN mr ON mr.message_id = msg.id - JOIN p ON p.id = mr.participant_id`, - nullGuard: nameExpr + " IS NOT NULL", + keyExpr: nameExpr, + joinClause: "JOIN p p_sender ON p_sender.id = msg.sender_id", + nullGuard: nameExpr + " IS NOT NULL AND msg.sender_id IS NOT NULL", }, nil case TextViewSources: return aggViewDef{ @@ -396,7 +403,7 @@ func (e *DuckDBEngine) TextSearch( JOIN messages m ON m.id = fts.rowid LEFT JOIN participants p ON p.id = m.sender_id LEFT JOIN conversations c ON c.id = m.conversation_id - WHERE fts.messages_fts MATCH ? + WHERE messages_fts MATCH ? AND m.message_type IN ('whatsapp','imessage','sms','google_voice_text') ORDER BY m.sent_at DESC LIMIT ? OFFSET ? diff --git a/internal/query/sqlite_text.go b/internal/query/sqlite_text.go index 4dd270ea..971e4e2a 100644 --- a/internal/query/sqlite_text.go +++ b/internal/query/sqlite_text.go @@ -61,19 +61,24 @@ func buildSQLiteTextFilterConditions(filter TextFilter) (string, []interface{}) } if filter.ContactPhone != "" { conditions = append(conditions, `EXISTS ( - SELECT 1 FROM message_recipients mr_cp - JOIN participants p_cp ON p_cp.id = mr_cp.participant_id - WHERE mr_cp.message_id = m.id - AND p_cp.phone_number = ? + SELECT 1 FROM participants p_cp + WHERE p_cp.id = m.sender_id + AND COALESCE( + NULLIF(p_cp.phone_number, ''), + p_cp.email_address + ) = ? )`) args = append(args, filter.ContactPhone) } if filter.ContactName != "" { conditions = append(conditions, `EXISTS ( - SELECT 1 FROM message_recipients mr_cn - JOIN participants p_cn ON p_cn.id = mr_cn.participant_id - WHERE mr_cn.message_id = m.id - AND COALESCE(NULLIF(TRIM(p_cn.display_name), ''), p_cn.email_address) = ? 
+ SELECT 1 FROM participants p_cn + WHERE p_cn.id = m.sender_id + AND COALESCE( + NULLIF(TRIM(p_cn.display_name), ''), + NULLIF(p_cn.phone_number, ''), + p_cn.email_address + ) = ? )`) args = append(args, filter.ContactName) } @@ -225,19 +230,19 @@ func textAggSQLiteDimension( ) (aggDimension, error) { switch view { case TextViewContacts: + keyExpr := "COALESCE(NULLIF(p_agg.phone_number, ''), p_agg.email_address)" return aggDimension{ - keyExpr: "COALESCE(NULLIF(p_agg.phone_number, ''), p_agg.email_address)", - joins: `JOIN message_recipients mr_agg ON mr_agg.message_id = m.id - JOIN participants p_agg ON p_agg.id = mr_agg.participant_id`, - whereExpr: "COALESCE(NULLIF(p_agg.phone_number, ''), p_agg.email_address) IS NOT NULL", + keyExpr: keyExpr, + joins: "JOIN participants p_agg ON p_agg.id = m.sender_id", + whereExpr: keyExpr + " IS NOT NULL AND m.sender_id IS NOT NULL", }, nil case TextViewContactNames: - nameExpr := "COALESCE(NULLIF(TRIM(p_agg.display_name), ''), NULLIF(p_agg.phone_number, ''), p_agg.email_address)" + nameExpr := "COALESCE(NULLIF(TRIM(p_agg.display_name), ''), " + + "NULLIF(p_agg.phone_number, ''), p_agg.email_address)" return aggDimension{ - keyExpr: nameExpr, - joins: `JOIN message_recipients mr_agg ON mr_agg.message_id = m.id - JOIN participants p_agg ON p_agg.id = mr_agg.participant_id`, - whereExpr: nameExpr + " IS NOT NULL", + keyExpr: nameExpr, + joins: "JOIN participants p_agg ON p_agg.id = m.sender_id", + whereExpr: nameExpr + " IS NOT NULL AND m.sender_id IS NOT NULL", }, nil case TextViewSources: return aggDimension{ From f331ac35e4e0810e580b88a35ccb27b045434515 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 22:07:46 -0500 Subject: [PATCH 34/65] Fix review findings: CSV fallback, search filters, SQLite text, FTS - build_cache.go: add conversation_type to CSV fallback conversations query with COALESCE default to 'email_thread' for older databases - query/shared.go: add emailOnlyFilterMsg and emailOnlyFilterM 
constants - query/duckdb.go: apply email-only filter in buildSearchConditions and GetTotalStats (was missing from SearchFast/SearchFastCount/SearchFastWithStats and stats path) - query/sqlite.go: apply email-only filter in GetTotalStats and buildSearchQueryParts (search path was unfiltered) - query/sqlite_text.go: fix last_preview subquery to use m2. alias via textMsgTypeFilterAlias instead of textMsgTypeFilter (which hardcodes m.) - store/messages.go: only prefer sender_id for non-email messages in backfillFTSBatch; email messages use message_recipients for from_addr - store/subset.go: update populateFTS to match backfillFTSBatch logic Co-Authored-By: Claude Sonnet 4.6 --- cmd/msgvault/cmd/build_cache.go | 2 +- internal/query/duckdb.go | 6 ++++++ internal/query/shared.go | 8 ++++++++ internal/query/sqlite.go | 5 ++++- internal/query/sqlite_text.go | 8 +++++++- internal/store/messages.go | 4 +++- internal/store/subset.go | 18 +++++++++++------- 7 files changed, 40 insertions(+), 11 deletions(-) diff --git a/cmd/msgvault/cmd/build_cache.go b/cmd/msgvault/cmd/build_cache.go index 649ecc5e..f034807b 100644 --- a/cmd/msgvault/cmd/build_cache.go +++ b/cmd/msgvault/cmd/build_cache.go @@ -682,7 +682,7 @@ func setupSQLiteSource(duckDB *sql.DB, dbPath string) (cleanup func(), err error {"participants", "SELECT id, email_address, domain, display_name, phone_number FROM participants", ""}, {"labels", "SELECT id, name FROM labels", ""}, {"sources", "SELECT id, identifier, source_type FROM sources", ""}, - {"conversations", "SELECT id, source_conversation_id, title FROM conversations", ""}, + {"conversations", "SELECT id, source_conversation_id, title, COALESCE(conversation_type, 'email_thread') AS conversation_type FROM conversations", ""}, } for _, t := range tables { diff --git a/internal/query/duckdb.go b/internal/query/duckdb.go index b3eedf46..61067d0f 100644 --- a/internal/query/duckdb.go +++ b/internal/query/duckdb.go @@ -1080,6 +1080,9 @@ func (e *DuckDBEngine) 
GetTotalStats(ctx context.Context, opts StatsOptions) (*T var conditions []string var args []interface{} + // Restrict to email messages only; NULL and '' handle pre-message_type data. + conditions = append(conditions, emailOnlyFilterMsg) + if opts.SourceID != nil { conditions = append(conditions, "msg.source_id = ?") args = append(args, *opts.SourceID) @@ -2267,6 +2270,9 @@ func (e *DuckDBEngine) buildSearchConditions(q *search.Query, filter MessageFilt var conditions []string var args []interface{} + // Restrict to email messages only; NULL and '' handle pre-message_type data. + conditions = append(conditions, emailOnlyFilterMsg) + // Apply basic filter conditions (ignoring join flags for search - we handle those differently) if filter.SourceID != nil { conditions = append(conditions, "msg.source_id = ?") diff --git a/internal/query/shared.go b/internal/query/shared.go index eb149787..9b22bdde 100644 --- a/internal/query/shared.go +++ b/internal/query/shared.go @@ -12,6 +12,14 @@ import ( "github.com/wesm/msgvault/internal/mime" ) +// emailOnlyFilterMsg is the SQL condition restricting to email messages with "msg." alias (DuckDB). +// NULL and empty string handle old data where message_type was not yet populated. +const emailOnlyFilterMsg = "(msg.message_type = 'email' OR msg.message_type IS NULL OR msg.message_type = '')" + +// emailOnlyFilterM is the SQL condition restricting to email messages with "m." alias (SQLite). +// NULL and empty string handle old data where message_type was not yet populated. +const emailOnlyFilterM = "(m.message_type = 'email' OR m.message_type IS NULL OR m.message_type = '')" + // fetchLabelsForMessageList adds labels to message summaries using a batch query. // tablePrefix is "" for direct SQLite or "sqlite_db." for DuckDB's sqlite_scan. 
func fetchLabelsForMessageList(ctx context.Context, db *sql.DB, tablePrefix string, messages []MessageSummary) error { diff --git a/internal/query/sqlite.go b/internal/query/sqlite.go index 400b0eca..132255e6 100644 --- a/internal/query/sqlite.go +++ b/internal/query/sqlite.go @@ -780,6 +780,8 @@ func (e *SQLiteEngine) GetTotalStats(ctx context.Context, opts StatsOptions) (*T // the messages table for compatibility with search joins. var conditions []string var args []interface{} + // Restrict to email messages only; NULL and '' handle pre-message_type data. + conditions = append(conditions, emailOnlyFilterM) // Include all messages (deleted messages shown with indicator in TUI) if opts.SourceID != nil { conditions = append(conditions, "m.source_id = ?") @@ -1036,7 +1038,8 @@ func (e *SQLiteEngine) GetGmailIDsByFilter(ctx context.Context, filter MessageFi // buildSearchQueryParts builds the WHERE conditions, args, joins, and FTS join // for a search query. This is shared between Search and SearchFastCount. func (e *SQLiteEngine) buildSearchQueryParts(ctx context.Context, q *search.Query) (conditions []string, args []interface{}, joins []string, ftsJoin string) { - // Include all messages (deleted messages shown with indicator in TUI) + // Restrict to email messages only; NULL and '' handle pre-message_type data. + conditions = append(conditions, emailOnlyFilterM) // From filter - uses EXISTS to avoid join multiplication in aggregates. // Handles both exact addresses and @domain patterns. diff --git a/internal/query/sqlite_text.go b/internal/query/sqlite_text.go index 971e4e2a..8aed97fc 100644 --- a/internal/query/sqlite_text.go +++ b/internal/query/sqlite_text.go @@ -45,6 +45,12 @@ func textMsgTypeFilter() string { return "m.message_type IN ('whatsapp','imessage','sms','google_voice_text')" } +// textMsgTypeFilterAlias returns a SQL condition restricting to text message types +// using the given table alias. 
+func textMsgTypeFilterAlias(alias string) string { + return alias + ".message_type IN ('whatsapp','imessage','sms','google_voice_text')" +} + // buildSQLiteTextFilterConditions builds WHERE conditions from a TextFilter. // All conditions use the m. prefix for the messages table. func buildSQLiteTextFilterConditions(filter TextFilter) (string, []interface{}) { @@ -185,7 +191,7 @@ func (e *SQLiteEngine) ListConversations( GROUP BY c.id, c.title, s.source_type ORDER BY %s LIMIT ? OFFSET ? - `, textMsgTypeFilter(), where, orderBy) + `, textMsgTypeFilterAlias("m2"), where, orderBy) args = append(args, limit, filter.Pagination.Offset) diff --git a/internal/store/messages.go b/internal/store/messages.go index 0c89f3dc..45cfb502 100644 --- a/internal/store/messages.go +++ b/internal/store/messages.go @@ -854,7 +854,9 @@ func (s *Store) backfillFTSBatch(fromID, toID int64) (int64, error) { INSERT OR REPLACE INTO messages_fts (rowid, message_id, subject, body, from_addr, to_addr, cc_addr) SELECT m.id, m.id, COALESCE(m.subject, ''), COALESCE(mb.body_text, ''), COALESCE( - (SELECT COALESCE(p.phone_number, p.email_address) FROM participants p WHERE p.id = m.sender_id), + CASE WHEN m.message_type != 'email' AND m.message_type IS NOT NULL AND m.message_type != '' + THEN (SELECT COALESCE(p.phone_number, p.email_address) FROM participants p WHERE p.id = m.sender_id) + END, (SELECT GROUP_CONCAT(p.email_address, ' ') FROM message_recipients mr JOIN participants p ON p.id = mr.participant_id WHERE mr.message_id = m.id AND mr.recipient_type = 'from'), '' ), diff --git a/internal/store/subset.go b/internal/store/subset.go index 0f095efa..9cf6c854 100644 --- a/internal/store/subset.go +++ b/internal/store/subset.go @@ -438,13 +438,17 @@ func populateFTS(db *sql.DB) error { ) SELECT m.id, m.id, COALESCE(m.subject, ''), COALESCE(mb.body_text, ''), - COALESCE(( - SELECT GROUP_CONCAT(p.email_address, ' ') - FROM message_recipients mr - JOIN participants p ON p.id = mr.participant_id - WHERE 
mr.message_id = m.id - AND mr.recipient_type = 'from' - ), ''), + COALESCE( + CASE WHEN m.message_type != 'email' AND m.message_type IS NOT NULL AND m.message_type != '' + THEN (SELECT COALESCE(p.phone_number, p.email_address) FROM participants p WHERE p.id = m.sender_id) + END, + (SELECT GROUP_CONCAT(p.email_address, ' ') + FROM message_recipients mr + JOIN participants p ON p.id = mr.participant_id + WHERE mr.message_id = m.id + AND mr.recipient_type = 'from'), + '' + ), COALESCE(( SELECT GROUP_CONCAT(p.email_address, ' ') FROM message_recipients mr From a401c7a0b050a4969a5beaebd2fb6f656101b5be Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 22:14:45 -0500 Subject: [PATCH 35/65] Address second batch of code review findings - Add deprecated 'import --type whatsapp' alias forwarding to import-whatsapp for backward compatibility - Fix DuckDB TextSearch FTS MATCH syntax (table name, not alias) - Fix Contact views to use sender_id instead of message_recipients - Fix default Conversations sort to last_message_at DESC - Introduce TextSortField enum with LastMessage/Count/Name - Remove duplicate ConversationID from TextFilter - Add conversation_type to CSV fallback export - Add email-only filter to search/stats paths (DuckDB + SQLite) - Fix SQLite last_preview alias mismatch (m vs m2) - Scope FTS sender_id shortcut to non-email message types - Update subset FTS rebuild to match backfill logic - Fix integration test sender_id setup Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/msgvault/cmd/import.go | 37 +++++++++++++++++ internal/query/duckdb_text.go | 40 ++++++++----------- internal/query/sqlite_text.go | 38 ++++++++---------- internal/query/text_models.go | 53 ++++++++++++++++++------- internal/textimport/integration_test.go | 2 +- internal/tui/text_keys.go | 8 ++-- internal/tui/text_view.go | 10 ++--- 7 files changed, 120 insertions(+), 68 deletions(-) diff --git a/cmd/msgvault/cmd/import.go b/cmd/msgvault/cmd/import.go index 
cf16a0fc..05d0c268 100644 --- a/cmd/msgvault/cmd/import.go +++ b/cmd/msgvault/cmd/import.go @@ -213,7 +213,35 @@ func (p *ImportCLIProgress) OnError(err error) { fmt.Printf("\nWarning: %s\n", textutil.SanitizeTerminal(err.Error())) } +// Deprecated: "import --type whatsapp" forwards to "import-whatsapp". +// Remove after one release cycle. +var importType string + +var importCmd = &cobra.Command{ + Use: "import [path]", + Short: "Import messages (deprecated: use import-whatsapp)", + Deprecated: "use import-whatsapp instead", + Hidden: true, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := MustBeLocal("import"); err != nil { + return err + } + if strings.ToLower(importType) != "whatsapp" { + return fmt.Errorf( + "unsupported import type %q; use import-whatsapp", + importType, + ) + } + fmt.Fprintln(os.Stderr, + "Warning: 'import --type whatsapp' is deprecated."+ + " Use 'import-whatsapp' instead.") + return runWhatsAppImport(cmd, args[0]) + }, +} + func init() { + // import-whatsapp (canonical) importWhatsappCmd.Flags().StringVar(&importPhone, "phone", "", "your phone number in E.164 format (required)") importWhatsappCmd.Flags().StringVar(&importMediaDir, "media-dir", "", "path to decrypted Media folder (optional)") importWhatsappCmd.Flags().StringVar(&importContacts, "contacts", "", "path to contacts .vcf file for name resolution (optional)") @@ -221,4 +249,13 @@ func init() { importWhatsappCmd.Flags().StringVar(&importDisplayName, "display-name", "", "display name for the phone owner") _ = importWhatsappCmd.MarkFlagRequired("phone") rootCmd.AddCommand(importWhatsappCmd) + + // Deprecated "import --type whatsapp" alias + importCmd.Flags().StringVar(&importType, "type", "", "import source type") + importCmd.Flags().StringVar(&importPhone, "phone", "", "your phone number in E.164 format") + importCmd.Flags().StringVar(&importMediaDir, "media-dir", "", "path to decrypted Media folder") + 
importCmd.Flags().StringVar(&importContacts, "contacts", "", "path to contacts .vcf file") + importCmd.Flags().IntVar(&importLimit, "limit", 0, "limit number of messages") + importCmd.Flags().StringVar(&importDisplayName, "display-name", "", "display name for the phone owner") + rootCmd.AddCommand(importCmd) } diff --git a/internal/query/duckdb_text.go b/internal/query/duckdb_text.go index 88bd4f8d..22bf87ef 100644 --- a/internal/query/duckdb_text.go +++ b/internal/query/duckdb_text.go @@ -27,10 +27,6 @@ func (e *DuckDBEngine) buildTextFilterConditions( conditions = append(conditions, "msg.source_id = ?") args = append(args, *filter.SourceID) } - if filter.ConversationID != nil { - conditions = append(conditions, "msg.conversation_id = ?") - args = append(args, *filter.ConversationID) - } if filter.ContactPhone != "" { conditions = append(conditions, `EXISTS ( SELECT 1 FROM p p_filter @@ -101,23 +97,20 @@ func (e *DuckDBEngine) ListConversations( ) ([]ConversationRow, error) { where, args := e.buildTextFilterConditions(filter) - // Sort clause: default to last_message_at DESC for conversations, - // since the zero value of SortField (SortByCount) is not meaningful here. - orderBy := "last_message_at DESC" - if filter.SortField != 0 { - switch filter.SortField { - case SortByCount: - orderBy = "message_count" - case SortBySize: - orderBy = "total_size" - case SortByName: - orderBy = "title" - } - if filter.SortDirection == SortAsc { - orderBy += " ASC" - } else { - orderBy += " DESC" - } + // Sort clause. 
+ var orderBy string + switch filter.SortField { + case TextSortByCount: + orderBy = "message_count" + case TextSortByName: + orderBy = "title" + default: // TextSortByLastMessage + orderBy = "last_message_at" + } + if filter.SortDirection == SortAsc { + orderBy += " ASC" + } else { + orderBy += " DESC" } limit := filter.Pagination.Limit @@ -276,7 +269,7 @@ func (e *DuckDBEngine) TextAggregate( whereClause := strings.Join(conditions, " AND ") aggOpts := AggregateOptions{ - SortField: opts.SortField, + SortField: textSortFieldToSortField(opts.SortField), SortDirection: opts.SortDirection, Limit: opts.Limit, TimeGranularity: opts.TimeGranularity, @@ -290,8 +283,9 @@ func (e *DuckDBEngine) TextAggregate( func (e *DuckDBEngine) ListConversationMessages( ctx context.Context, convID int64, filter TextFilter, ) ([]MessageSummary, error) { - filter.ConversationID = &convID where, args := e.buildTextFilterConditions(filter) + where += " AND msg.conversation_id = ?" + args = append(args, convID) limit := filter.Pagination.Limit if limit == 0 { diff --git a/internal/query/sqlite_text.go b/internal/query/sqlite_text.go index 8aed97fc..292e4a16 100644 --- a/internal/query/sqlite_text.go +++ b/internal/query/sqlite_text.go @@ -61,10 +61,6 @@ func buildSQLiteTextFilterConditions(filter TextFilter) (string, []interface{}) conditions = append(conditions, "m.source_id = ?") args = append(args, *filter.SourceID) } - if filter.ConversationID != nil { - conditions = append(conditions, "m.conversation_id = ?") - args = append(args, *filter.ConversationID) - } if filter.ContactPhone != "" { conditions = append(conditions, `EXISTS ( SELECT 1 FROM participants p_cp @@ -146,21 +142,20 @@ func (e *SQLiteEngine) ListConversations( ) ([]ConversationRow, error) { where, args := buildSQLiteTextFilterConditions(filter) - orderBy := "last_message_at DESC" - if filter.SortField != 0 { - switch filter.SortField { - case SortByCount: - orderBy = "message_count" - case SortBySize: - orderBy = 
"total_size" - case SortByName: - orderBy = "title" - } - if filter.SortDirection == SortAsc { - orderBy += " ASC" - } else { - orderBy += " DESC" - } + // Sort clause. + var orderBy string + switch filter.SortField { + case TextSortByCount: + orderBy = "message_count" + case TextSortByName: + orderBy = "title" + default: // TextSortByLastMessage + orderBy = "last_message_at" + } + if filter.SortDirection == SortAsc { + orderBy += " ASC" + } else { + orderBy += " DESC" } limit := filter.Pagination.Limit @@ -318,7 +313,7 @@ func (e *SQLiteEngine) TextAggregate( } aggOpts := AggregateOptions{ - SortField: opts.SortField, + SortField: textSortFieldToSortField(opts.SortField), SortDirection: opts.SortDirection, Limit: opts.Limit, TimeGranularity: opts.TimeGranularity, @@ -345,8 +340,9 @@ func (e *SQLiteEngine) TextAggregate( func (e *SQLiteEngine) ListConversationMessages( ctx context.Context, convID int64, filter TextFilter, ) ([]MessageSummary, error) { - filter.ConversationID = &convID where, args := buildSQLiteTextFilterConditions(filter) + where += " AND m.conversation_id = ?" + args = append(args, convID) limit := filter.Pagination.Limit if limit == 0 { diff --git a/internal/query/text_models.go b/internal/query/text_models.go index db6bf521..084da6f6 100644 --- a/internal/query/text_models.go +++ b/internal/query/text_models.go @@ -45,20 +45,31 @@ type ConversationRow struct { LastPreview string } -// TextFilter specifies which text messages to retrieve. +// TextSortField represents fields available for sorting in Texts mode. +type TextSortField int + +const ( + // TextSortByLastMessage sorts by last message timestamp (default). + TextSortByLastMessage TextSortField = iota + TextSortByCount + TextSortByName +) + +// TextFilter specifies which text messages/conversations to retrieve. +// Note: conversation scope for ListConversationMessages is passed as +// an explicit parameter, not through this filter. 
type TextFilter struct { - SourceID *int64 - ConversationID *int64 - ContactPhone string - ContactName string - SourceType string - Label string - TimeRange TimeRange - After *time.Time - Before *time.Time - Pagination Pagination - SortField SortField - SortDirection SortDirection + SourceID *int64 + ContactPhone string + ContactName string + SourceType string + Label string + TimeRange TimeRange + After *time.Time + Before *time.Time + Pagination Pagination + SortField TextSortField + SortDirection SortDirection } // TextAggregateOptions configures a text aggregate query. @@ -66,7 +77,7 @@ type TextAggregateOptions struct { SourceID *int64 After *time.Time Before *time.Time - SortField SortField + SortField TextSortField SortDirection SortDirection Limit int TimeGranularity TimeGranularity @@ -84,6 +95,20 @@ var TextMessageTypes = []string{ "whatsapp", "imessage", "sms", "google_voice_text", } +// textSortFieldToSortField converts a TextSortField to the generic SortField +// used by aggregate queries. TextSortByLastMessage has no direct equivalent +// in SortField so it falls back to SortByCount. +func textSortFieldToSortField(f TextSortField) SortField { + switch f { + case TextSortByCount: + return SortByCount + case TextSortByName: + return SortByName + default: // TextSortByLastMessage + return SortByCount + } +} + // IsTextMessageType returns true if the given type is a text message type. 
func IsTextMessageType(mt string) bool { for _, t := range TextMessageTypes { diff --git a/internal/textimport/integration_test.go b/internal/textimport/integration_test.go index 2cf5ec57..a4b593b4 100644 --- a/internal/textimport/integration_test.go +++ b/internal/textimport/integration_test.go @@ -97,7 +97,7 @@ func TestIntegration(t *testing.T) { SentAt: sql.NullTime{Time: m.sentAt, Valid: true}, IsFromMe: m.fromMe, SizeEstimate: int64(len(m.snippet)), - SenderID: sql.NullInt64{Int64: phoneParticipantID, Valid: !m.fromMe}, + SenderID: sql.NullInt64{Int64: phoneParticipantID, Valid: true}, } msgID, err := s.UpsertMessage(msg) if err != nil { diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go index 6981e053..7e7782fe 100644 --- a/internal/tui/text_keys.go +++ b/internal/tui/text_keys.go @@ -237,10 +237,12 @@ func (m Model) textRowCount() int { // cycleTextSortField cycles between sort fields for text views. func (m *Model) cycleTextSortField() { switch m.textState.filter.SortField { - case query.SortByCount: - m.textState.filter.SortField = query.SortByName + case query.TextSortByLastMessage: + m.textState.filter.SortField = query.TextSortByCount + case query.TextSortByCount: + m.textState.filter.SortField = query.TextSortByName default: - m.textState.filter.SortField = query.SortByCount + m.textState.filter.SortField = query.TextSortByLastMessage } } diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index 7a04a24f..c13361b1 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -267,7 +267,7 @@ func (m Model) textAggregateView() string { } // Sort indicators - sortInd := func(field query.SortField) string { + sortInd := func(field query.TextSortField) string { if m.textState.filter.SortField == field { if m.textState.filter.SortDirection == query.SortDesc { return "\u2193" @@ -278,17 +278,15 @@ func (m Model) textAggregateView() string { } viewLabel := m.textState.viewType.String() - if si := 
sortInd(query.SortByName); si != "" { + if si := sortInd(query.TextSortByName); si != "" { viewLabel += si } countLabel := "Count" - if si := sortInd(query.SortByCount); si != "" { + if si := sortInd(query.TextSortByCount); si != "" { countLabel += si } sizeLabel := "Size" - if si := sortInd(query.SortBySize); si != "" { - sizeLabel += si - } + _ = sizeLabel // TextSortField has no size variant; label kept for column layout attachLabel := "Attchs" headerRow := fmt.Sprintf( From 5bc9ae05e3d74472d3e1f35879d7b05574606efc Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 22:18:58 -0500 Subject: [PATCH 36/65] Fix TUI Texts mode: navigation cursor, account filter, search back 1. Text list navigation was mutating email-mode viewState cursor/scroll fields instead of textState fields. Add navigateTextList helper that operates on textState.cursor/textState.scrollOffset. 2. Account selector modal only reloaded email data after selection. Now checks for modeTexts and reloads text data with the selected source. Also syncs textState.filter.SourceID when entering Texts mode. 3. Search submission replaced the view with timeline results but never saved a breadcrumb, making Esc a no-op. Push a breadcrumb snapshot before loading search results. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/tui/keys.go | 5 +++ internal/tui/text_keys.go | 69 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/internal/tui/keys.go b/internal/tui/keys.go index ccceb12d..a0cc8bbe 100644 --- a/internal/tui/keys.go +++ b/internal/tui/keys.go @@ -100,6 +100,7 @@ func (m Model) handleGlobalKeys(msg tea.KeyMsg) (Model, tea.Cmd, bool) { } if m.mode == modeEmail { m.mode = modeTexts + m.textState.filter.SourceID = m.accountFilter m.loading = true spinCmd := m.startSpinner() return m, tea.Batch(spinCmd, m.loadTextConversations()), true @@ -973,6 +974,10 @@ func (m Model) handleAccountSelectorKeys(msg tea.KeyMsg) (tea.Model, tea.Cmd) { } m.modal = modalNone m.loading = true + if m.mode == modeTexts { + m.textState.filter.SourceID = m.accountFilter + return m, m.loadTextData() + } m.aggregateRequestID++ return m, tea.Batch(m.loadData(), m.loadStats()) case "esc": diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go index 7e7782fe..e1c3df56 100644 --- a/internal/tui/text_keys.go +++ b/internal/tui/text_keys.go @@ -44,8 +44,8 @@ func (m Model) handleTextKeyPress(msg tea.KeyMsg) (tea.Model, tea.Cmd) { func (m Model) handleTextListKeys( msg tea.KeyMsg, ) (tea.Model, tea.Cmd) { - // Handle list navigation - if m.navigateList(msg.String(), m.textRowCount()) { + // Handle list navigation (text-specific: operates on textState) + if m.navigateTextList(msg.String(), m.textRowCount()) { return m, nil } @@ -159,6 +159,18 @@ func (m Model) handleTextInlineSearchKeys( if queryStr == "" { return m, nil } + // Save current state so Esc can return from search results + m.textState.breadcrumbs = append( + m.textState.breadcrumbs, + textNavSnapshot{ + level: m.textState.level, + viewType: m.textState.viewType, + cursor: m.textState.cursor, + scrollOffset: m.textState.scrollOffset, + filter: m.textState.filter, + selectedConvID: m.textState.selectedConvID, + }, + ) m.loading = 
true return m, m.loadTextSearch(queryStr) @@ -221,6 +233,59 @@ func (m *Model) textMoveCursor(delta int) { ) } +// navigateTextList handles list navigation keys for text mode, +// operating on textState.cursor and textState.scrollOffset instead +// of the email-mode viewState fields. +// Returns true if the key was handled. +func (m *Model) navigateTextList(key string, itemCount int) bool { + switch key { + case "up", "k": + if m.textState.cursor > 0 { + m.textState.cursor-- + m.textState.scrollOffset = calculateScrollOffset( + m.textState.cursor, + m.textState.scrollOffset, + m.visibleRows(), + ) + } + return true + case "down", "j": + if m.textState.cursor < itemCount-1 { + m.textState.cursor++ + m.textState.scrollOffset = calculateScrollOffset( + m.textState.cursor, + m.textState.scrollOffset, + m.visibleRows(), + ) + } + return true + case "pgup", "ctrl+u": + m.textMoveCursor(-m.visibleRows()) + return true + case "pgdown", "ctrl+d": + m.textMoveCursor(m.visibleRows()) + return true + case "home": + m.textState.cursor = 0 + m.textState.scrollOffset = 0 + return true + case "end", "G": + maxIdx := itemCount - 1 + if maxIdx < 0 { + maxIdx = 0 + } + m.textState.cursor = maxIdx + m.textState.scrollOffset = calculateScrollOffset( + m.textState.cursor, + m.textState.scrollOffset, + m.visibleRows(), + ) + return true + default: + return false + } +} + // textRowCount returns the number of rows in the current text view. 
func (m Model) textRowCount() int { switch m.textState.level { From 981d323c1bd3e7d23fa186806c681978ba4f562a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 22:24:13 -0500 Subject: [PATCH 37/65] Fix DuckDB stats search args, contact fallback, CSV migration, attachment flags - buildStatsSearchConditions: add snippet matching to default branch and fix countArgsForTextTerms (3 -> 4) to match buildAggregateSearchConditions - Contact views: COALESCE(sender_id, message_recipients fallback) for archives with NULL sender_id, in both DuckDB and SQLite engines - Add conversation_type migration for legacy DBs missing the column, fixing CSV fallback and Parquet export - WhatsApp importer: clear has_attachments/attachment_count when no media is stored (no --media-dir), preventing phantom attachment indicators Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/query/duckdb.go | 10 ++++++---- internal/query/duckdb_text.go | 26 ++++++++++++++++++++------ internal/query/sqlite_text.go | 26 ++++++++++++++++++++------ internal/store/store.go | 1 + internal/whatsapp/importer.go | 6 ++++++ 5 files changed, 53 insertions(+), 16 deletions(-) diff --git a/internal/query/duckdb.go b/internal/query/duckdb.go index 61067d0f..42a6b473 100644 --- a/internal/query/duckdb.go +++ b/internal/query/duckdb.go @@ -567,9 +567,10 @@ func (e *DuckDBEngine) buildStatsSearchConditions(searchQuery string, groupBy Vi )`) args = append(args, termPattern) default: - // Default: search subject and sender + // Default: search subject, snippet, and sender conditions = append(conditions, `( msg.subject ILIKE ? ESCAPE '\' OR + COALESCE(msg.snippet, '') ILIKE ? ESCAPE '\' OR EXISTS ( SELECT 1 FROM mr mr_search JOIN p p_search ON p_search.id = mr_search.participant_id @@ -578,7 +579,7 @@ func (e *DuckDBEngine) buildStatsSearchConditions(searchQuery string, groupBy Vi AND (p_search.email_address ILIKE ? ESCAPE '\' OR p_search.display_name ILIKE ? 
ESCAPE '\') ) )`) - args = append(args, termPattern, termPattern, termPattern) + args = append(args, termPattern, termPattern, termPattern, termPattern) } } @@ -599,9 +600,10 @@ func (e *DuckDBEngine) buildStatsSearchConditions(searchQuery string, groupBy Vi } // countArgsForTextTerms returns the number of args used by N text terms in -// buildAggregateSearchConditions with no keyColumns (3 args per term: subject + 2 sender). +// buildAggregateSearchConditions with no keyColumns (4 args per term: +// subject + snippet + 2 sender). func countArgsForTextTerms(n int) int { - return n * 3 + return n * 4 } // keyColumns are passed through to buildAggregateSearchConditions to control diff --git a/internal/query/duckdb_text.go b/internal/query/duckdb_text.go index 22bf87ef..2eafb5fc 100644 --- a/internal/query/duckdb_text.go +++ b/internal/query/duckdb_text.go @@ -30,7 +30,10 @@ func (e *DuckDBEngine) buildTextFilterConditions( if filter.ContactPhone != "" { conditions = append(conditions, `EXISTS ( SELECT 1 FROM p p_filter - WHERE p_filter.id = msg.sender_id + WHERE p_filter.id = COALESCE(msg.sender_id, + (SELECT mr_fb.participant_id FROM mr mr_fb + WHERE mr_fb.message_id = msg.id AND mr_fb.recipient_type = 'from' + LIMIT 1)) AND COALESCE( NULLIF(p_filter.phone_number, ''), p_filter.email_address @@ -41,7 +44,10 @@ func (e *DuckDBEngine) buildTextFilterConditions( if filter.ContactName != "" { conditions = append(conditions, `EXISTS ( SELECT 1 FROM p p_filter - WHERE p_filter.id = msg.sender_id + WHERE p_filter.id = COALESCE(msg.sender_id, + (SELECT mr_fb.participant_id FROM mr mr_fb + WHERE mr_fb.message_id = msg.id AND mr_fb.recipient_type = 'from' + LIMIT 1)) AND COALESCE( NULLIF(TRIM(p_filter.display_name), ''), NULLIF(p_filter.phone_number, ''), @@ -188,18 +194,26 @@ func textAggViewDef( case TextViewContacts: keyExpr := "COALESCE(NULLIF(p_sender.phone_number, ''), " + "p_sender.email_address)" + senderJoin := `JOIN p p_sender ON p_sender.id = 
COALESCE(msg.sender_id, + (SELECT mr_fb.participant_id FROM mr mr_fb + WHERE mr_fb.message_id = msg.id AND mr_fb.recipient_type = 'from' + LIMIT 1))` return aggViewDef{ keyExpr: keyExpr, - joinClause: "JOIN p p_sender ON p_sender.id = msg.sender_id", - nullGuard: keyExpr + " IS NOT NULL AND msg.sender_id IS NOT NULL", + joinClause: senderJoin, + nullGuard: keyExpr + " IS NOT NULL", }, nil case TextViewContactNames: nameExpr := "COALESCE(NULLIF(TRIM(p_sender.display_name), ''), " + "NULLIF(p_sender.phone_number, ''), p_sender.email_address)" + senderJoin := `JOIN p p_sender ON p_sender.id = COALESCE(msg.sender_id, + (SELECT mr_fb.participant_id FROM mr mr_fb + WHERE mr_fb.message_id = msg.id AND mr_fb.recipient_type = 'from' + LIMIT 1))` return aggViewDef{ keyExpr: nameExpr, - joinClause: "JOIN p p_sender ON p_sender.id = msg.sender_id", - nullGuard: nameExpr + " IS NOT NULL AND msg.sender_id IS NOT NULL", + joinClause: senderJoin, + nullGuard: nameExpr + " IS NOT NULL", }, nil case TextViewSources: return aggViewDef{ diff --git a/internal/query/sqlite_text.go b/internal/query/sqlite_text.go index 292e4a16..d0b5a6dd 100644 --- a/internal/query/sqlite_text.go +++ b/internal/query/sqlite_text.go @@ -64,7 +64,10 @@ func buildSQLiteTextFilterConditions(filter TextFilter) (string, []interface{}) if filter.ContactPhone != "" { conditions = append(conditions, `EXISTS ( SELECT 1 FROM participants p_cp - WHERE p_cp.id = m.sender_id + WHERE p_cp.id = COALESCE(m.sender_id, + (SELECT mr_fb.participant_id FROM message_recipients mr_fb + WHERE mr_fb.message_id = m.id AND mr_fb.recipient_type = 'from' + LIMIT 1)) AND COALESCE( NULLIF(p_cp.phone_number, ''), p_cp.email_address @@ -75,7 +78,10 @@ func buildSQLiteTextFilterConditions(filter TextFilter) (string, []interface{}) if filter.ContactName != "" { conditions = append(conditions, `EXISTS ( SELECT 1 FROM participants p_cn - WHERE p_cn.id = m.sender_id + WHERE p_cn.id = COALESCE(m.sender_id, + (SELECT mr_fb.participant_id FROM 
message_recipients mr_fb + WHERE mr_fb.message_id = m.id AND mr_fb.recipient_type = 'from' + LIMIT 1)) AND COALESCE( NULLIF(TRIM(p_cn.display_name), ''), NULLIF(p_cn.phone_number, ''), @@ -232,18 +238,26 @@ func textAggSQLiteDimension( switch view { case TextViewContacts: keyExpr := "COALESCE(NULLIF(p_agg.phone_number, ''), p_agg.email_address)" + senderJoin := `JOIN participants p_agg ON p_agg.id = COALESCE(m.sender_id, + (SELECT mr_fb.participant_id FROM message_recipients mr_fb + WHERE mr_fb.message_id = m.id AND mr_fb.recipient_type = 'from' + LIMIT 1))` return aggDimension{ keyExpr: keyExpr, - joins: "JOIN participants p_agg ON p_agg.id = m.sender_id", - whereExpr: keyExpr + " IS NOT NULL AND m.sender_id IS NOT NULL", + joins: senderJoin, + whereExpr: keyExpr + " IS NOT NULL", }, nil case TextViewContactNames: nameExpr := "COALESCE(NULLIF(TRIM(p_agg.display_name), ''), " + "NULLIF(p_agg.phone_number, ''), p_agg.email_address)" + senderJoin := `JOIN participants p_agg ON p_agg.id = COALESCE(m.sender_id, + (SELECT mr_fb.participant_id FROM message_recipients mr_fb + WHERE mr_fb.message_id = m.id AND mr_fb.recipient_type = 'from' + LIMIT 1))` return aggDimension{ keyExpr: nameExpr, - joins: "JOIN participants p_agg ON p_agg.id = m.sender_id", - whereExpr: nameExpr + " IS NOT NULL AND m.sender_id IS NOT NULL", + joins: senderJoin, + whereExpr: nameExpr + " IS NOT NULL", }, nil case TextViewSources: return aggDimension{ diff --git a/internal/store/store.go b/internal/store/store.go index f55b164a..ea15eb72 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -268,6 +268,7 @@ func (s *Store) InitSchema() error { {`ALTER TABLE messages ADD COLUMN deleted_from_source_at DATETIME`, "deleted_from_source_at"}, {`ALTER TABLE messages ADD COLUMN delete_batch_id TEXT`, "delete_batch_id"}, {`ALTER TABLE conversations ADD COLUMN title TEXT`, "title"}, + {`ALTER TABLE conversations ADD COLUMN conversation_type TEXT NOT NULL DEFAULT 'email_thread'`, 
"conversation_type"}, } { if _, err := s.db.Exec(m.sql); err != nil { if !isSQLiteError(err, "duplicate column name") { diff --git a/internal/whatsapp/importer.go b/internal/whatsapp/importer.go index 4c50253a..d82d2111 100644 --- a/internal/whatsapp/importer.go +++ b/internal/whatsapp/importer.go @@ -390,6 +390,12 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt summary.Errors++ imp.progress.OnError(fmt.Errorf("upsert attachment for message %s: %w", waMsg.KeyID, err)) } + } else { + // No media stored — clear attachment flags set by mapMessage + // so Parquet/TUI queries don't show phantom attachments. + _, _ = imp.store.DB().Exec( + "UPDATE messages SET has_attachments = 0, attachment_count = 0 WHERE id = ?", + messageID) } // Store media metadata in the attachments table is done above. From a66d1677911de4b619b424e43d66be29312e3a56 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 22:25:35 -0500 Subject: [PATCH 38/65] Fix NormalizePhone: leading whitespace, 00-prefix length validation Co-Authored-By: Claude Sonnet 4.6 --- internal/textimport/phone.go | 11 +++++++---- internal/textimport/phone_test.go | 5 +++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/internal/textimport/phone.go b/internal/textimport/phone.go index 1f4fa497..62dd4e4a 100644 --- a/internal/textimport/phone.go +++ b/internal/textimport/phone.go @@ -10,6 +10,7 @@ import ( // Returns an error for inputs that are not phone numbers (emails, // short codes, system identifiers). 
func NormalizePhone(raw string) (string, error) { + raw = strings.TrimSpace(raw) if raw == "" { return "", fmt.Errorf("empty input") } @@ -39,9 +40,6 @@ func NormalizePhone(raw string) (string, error) { if justDigits == "" { return "", fmt.Errorf("no digits in input: %q", raw) } - if len(justDigits) < 7 { - return "", fmt.Errorf("too short for phone number: %q", raw) - } var digits string if leadingPlus { @@ -58,8 +56,13 @@ func NormalizePhone(raw string) (string, error) { digits = "+" + justDigits } + // Validate length against the final normalized digit string. + finalDigits := digits[1:] // strip leading '+' + if len(finalDigits) < 7 { + return "", fmt.Errorf("too short for phone number: %q", raw) + } // E.164 max is 15 digits (country code + subscriber) - if len(digits)-1 > 15 { + if len(finalDigits) > 15 { return "", fmt.Errorf("too long for E.164 (max 15 digits): %q", raw) } diff --git a/internal/textimport/phone_test.go b/internal/textimport/phone_test.go index 9d30e7d9..7fd1da62 100644 --- a/internal/textimport/phone_test.go +++ b/internal/textimport/phone_test.go @@ -30,6 +30,11 @@ func TestNormalizePhone(t *testing.T) { {"status@broadcast", "", true}, // International 00-prefix {"0033624921221", "+33624921221", false}, + // Leading whitespace + {" +15551234567", "+15551234567", false}, + {"\t+44 7700 900000", "+447700900000", false}, + // 00-prefix too short after conversion + {"0012345", "", true}, // Trunk prefix (0) {"+44 (0)7700 900000", "+447700900000", false}, // Embedded + (invalid) From 9ba855dd7f53b6e19fd887eff5b9ef6d1c710554 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 22:26:29 -0500 Subject: [PATCH 39/65] Fix iMessage: stable source identifier, include AttributedBody in raw Always use "local" as the iMessage source identifier regardless of --me flag, preventing duplicate sources when the flag value changes between runs. The --me value still flows to WithOwnerHandle for participant resolution. 
Include AttributedBody ([]byte) in the JSON raw store. json.Marshal encodes it as base64 automatically, preserving the blob for Ventura+ messages where Text is empty. Co-Authored-By: Claude Sonnet 4.6 --- cmd/msgvault/cmd/import_imessage.go | 10 ++++------ internal/imessage/client.go | 11 ++++++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cmd/msgvault/cmd/import_imessage.go b/cmd/msgvault/cmd/import_imessage.go index b3687992..9ec4f867 100644 --- a/cmd/msgvault/cmd/import_imessage.go +++ b/cmd/msgvault/cmd/import_imessage.go @@ -74,12 +74,10 @@ func runImportImessage(cmd *cobra.Command, _ []string) error { } defer func() { _ = client.Close() }() - // Get or create the source - identifier := "local" - if importImessageMe != "" { - identifier = importImessageMe - } - src, err := s.GetOrCreateSource("apple_messages", identifier) + // Get or create the source. Always use "local" — there is only one + // iMessage database per machine. The --me flag affects participant + // resolution only, not the source identifier. 
+ src, err := s.GetOrCreateSource("apple_messages", "local") if err != nil { return fmt.Errorf("get or create source: %w", err) } diff --git a/internal/imessage/client.go b/internal/imessage/client.go index 339a8f94..1dfded37 100644 --- a/internal/imessage/client.go +++ b/internal/imessage/client.go @@ -555,11 +555,12 @@ func (c *Client) writeMessageRaw( body string, ) error { raw := map[string]interface{}{ - "rowid": msg.ROWID, - "guid": msg.GUID, - "date": msg.Date, - "is_from_me": msg.IsFromMe, - "body": body, + "rowid": msg.ROWID, + "guid": msg.GUID, + "date": msg.Date, + "is_from_me": msg.IsFromMe, + "body": body, + "attributed_body": msg.AttributedBody, // base64-encoded by json.Marshal; nil omitted } if msg.Service != nil { raw["service"] = *msg.Service From 085caa9ed9d7f15537442c570733e415c443f97c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 22:28:02 -0500 Subject: [PATCH 40/65] Fix GVoice: correct recipient for outbound direct texts For IsMe direct-chat messages, msg.SenderPhone is the owner's number, so senderID == ownerID. Passing senderID as the "to" recipient made sent texts appear self-addressed. Resolve the contact (non-Me participant) by scanning the file's message list for the first non-Me sender. Pass this contactID separately to writeTextRecipients so the "to" field points to the actual other party. Co-Authored-By: Claude Sonnet 4.6 --- internal/gvoice/client.go | 44 +++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/internal/gvoice/client.go b/internal/gvoice/client.go index e47eaef2..83a624c3 100644 --- a/internal/gvoice/client.go +++ b/internal/gvoice/client.go @@ -270,6 +270,16 @@ func (c *Client) importTextEntry( return 0, fmt.Errorf("resolve sender: %w", err) } + // For direct chats, resolve the other party (the non-Me participant). + // For outbound messages msg.SenderPhone is the owner's number, so + // senderID == ownerID and cannot be used as the "to" recipient. 
+ contactID := senderID + if entry.FileType == fileTypeText && msg.IsMe { + contactID = c.resolveContactID( + s, messages, phoneCache, summary, + ) + } + // Ensure conversation participant if senderID > 0 { _ = s.EnsureConversationParticipant(convID, senderID, "member") @@ -343,7 +353,7 @@ func (c *Client) importTextEntry( // Write message_recipients if err := c.writeTextRecipients( - s, msgID, msg, ownerID, senderID, entry.FileType, + s, msgID, msg, ownerID, senderID, contactID, entry.FileType, groupParticipants, phoneCache, summary, ); err != nil { return 0, fmt.Errorf("write message recipients: %w", err) @@ -514,11 +524,13 @@ func (c *Client) importCallEntry( // writeTextRecipients writes from/to rows for a text message. // IsMe=true: from=owner, to=contact(s). IsMe=false: from=sender, to=owner. +// contactID is the other party in a direct chat (may differ from senderID +// for outbound messages where senderID == ownerID). func (c *Client) writeTextRecipients( s *store.Store, msgID int64, msg textMessage, - ownerID, senderID int64, + ownerID, senderID, contactID int64, ft fileType, groupParticipants []string, phoneCache map[string]int64, @@ -533,9 +545,10 @@ func (c *Client) writeTextRecipients( return err } } - // To: group participants or the contact + // To: group participants or the contact (not the sender, who is + // the owner for outbound messages) toIDs := c.collectRecipientIDs( - s, ft, senderID, groupParticipants, + s, ft, contactID, groupParticipants, phoneCache, summary, ) if len(toIDs) > 0 { @@ -595,6 +608,29 @@ func (c *Client) collectRecipientIDs( return nil } +// resolveContactID finds the non-Me participant in a direct chat's message +// list and returns their participant ID. Returns 0 if no non-Me sender is +// found (e.g., a file containing only outbound messages with no replies). 
+func (c *Client) resolveContactID( + s *store.Store, + messages []textMessage, + phoneCache map[string]int64, + summary *ImportSummary, +) int64 { + for _, m := range messages { + if !m.IsMe && m.SenderPhone != "" { + pid, err := c.resolveParticipant( + s, m.SenderPhone, m.SenderName, + phoneCache, summary, + ) + if err == nil && pid > 0 { + return pid + } + } + } + return 0 +} + // writeCallRecipients writes from/to rows for a call record. // Placed calls: from=owner, to=contact. // Received/missed/voicemail: from=contact, to=owner. From 519e0503fcf63f00bb5043fbdd51840913b28848 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 22:29:26 -0500 Subject: [PATCH 41/65] Strengthen test coverage: same-timestamp tie-breaker, LastMessageAt - Add msg3 in TestRecomputeConversationStats with the same sent_at as msg2 but a distinct snippet; assert the higher-id message's snippet wins, exercising the `id DESC` tie-breaker in RecomputeConversationStats - Assert LastMessageAt is non-zero and equals the expected latest message time for each conversation in TestIntegration Co-Authored-By: Claude Sonnet 4.6 --- internal/store/messages_test.go | 30 +++++++++++++++++++------ internal/textimport/integration_test.go | 24 ++++++++++++++++---- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/internal/store/messages_test.go b/internal/store/messages_test.go index 4361f1c0..44b61c8b 100644 --- a/internal/store/messages_test.go +++ b/internal/store/messages_test.go @@ -59,6 +59,21 @@ func TestRecomputeConversationStats(t *testing.T) { t.Fatalf("UpsertMessage msg2: %v", err) } + // msg3 has the SAME sent_at as msg2 but a different snippet. + // After recompute, last_message_preview must come from msg3 (higher id), + // exercising the `id DESC` tie-breaker in the SQL. 
+ msg3 := &store.Message{ + SourceID: source.ID, + SourceMessageID: "msg-3", + ConversationID: convID, + MessageType: "whatsapp", + SentAt: sql.NullTime{Time: sentAt2, Valid: true}, + Snippet: sql.NullString{String: "tie-breaker", Valid: true}, + } + if _, err := st.UpsertMessage(msg3); err != nil { + t.Fatalf("UpsertMessage msg3: %v", err) + } + // Add a conversation participant so participant_count is non-zero. participantID, err := st.EnsureParticipantByPhone("+15559876543", "Bob", "whatsapp") if err != nil { @@ -83,8 +98,8 @@ func TestRecomputeConversationStats(t *testing.T) { ).Scan(&count, &participantCount, &lastMsgAt, &preview); err != nil { t.Fatalf("post-recompute scan: %v", err) } - if count != 2 { - t.Errorf("message_count = %d, want 2", count) + if count != 3 { + t.Errorf("message_count = %d, want 3", count) } if participantCount != 1 { t.Errorf("participant_count = %d, want 1", participantCount) @@ -92,9 +107,10 @@ func TestRecomputeConversationStats(t *testing.T) { if !lastMsgAt.Valid { t.Error("last_message_at is NULL, want a timestamp") } - // msg2 has the later sent_at, so its snippet ("world") should be the preview. - if !preview.Valid || preview.String != "world" { - t.Errorf("last_message_preview = %q, want %q", preview.String, "world") + // msg2 and msg3 share the same sent_at; msg3 has the higher id, so its + // snippet ("tie-breaker") must win via the `id DESC` tie-breaker. + if !preview.Valid || preview.String != "tie-breaker" { + t.Errorf("last_message_preview = %q, want %q", preview.String, "tie-breaker") } // Idempotency: calling again should produce the same result. 
@@ -106,8 +122,8 @@ func TestRecomputeConversationStats(t *testing.T) { ).Scan(&count); err != nil { t.Fatalf("idempotency scan: %v", err) } - if count != 2 { - t.Errorf("idempotency: message_count = %d, want 2", count) + if count != 3 { + t.Errorf("idempotency: message_count = %d, want 3", count) } } diff --git a/internal/textimport/integration_test.go b/internal/textimport/integration_test.go index a4b593b4..ea1f44a0 100644 --- a/internal/textimport/integration_test.go +++ b/internal/textimport/integration_test.go @@ -214,15 +214,31 @@ func TestIntegration(t *testing.T) { for _, row := range convRows { convByID[row.ConversationID] = row } + wantConv1LastAt := baseTime.Add(2 * time.Minute) + wantConv2LastAt := baseTime.Add(time.Hour + time.Minute) if row, ok := convByID[conv1ID]; !ok { t.Errorf("conv1 not found in ListConversations results") - } else if row.MessageCount != 3 { - t.Errorf("conv1 MessageCount: got %d, want 3", row.MessageCount) + } else { + if row.MessageCount != 3 { + t.Errorf("conv1 MessageCount: got %d, want 3", row.MessageCount) + } + if row.LastMessageAt.IsZero() { + t.Error("conv1 LastMessageAt is zero") + } else if !row.LastMessageAt.Equal(wantConv1LastAt) { + t.Errorf("conv1 LastMessageAt: got %v, want %v", row.LastMessageAt, wantConv1LastAt) + } } if row, ok := convByID[conv2ID]; !ok { t.Errorf("conv2 not found in ListConversations results") - } else if row.MessageCount != 2 { - t.Errorf("conv2 MessageCount: got %d, want 2", row.MessageCount) + } else { + if row.MessageCount != 2 { + t.Errorf("conv2 MessageCount: got %d, want 2", row.MessageCount) + } + if row.LastMessageAt.IsZero() { + t.Error("conv2 LastMessageAt is zero") + } else if !row.LastMessageAt.Equal(wantConv2LastAt) { + t.Errorf("conv2 LastMessageAt: got %v, want %v", row.LastMessageAt, wantConv2LastAt) + } } // TextAggregate by contacts — groups by phone number. 
From b0be64c6c896bd37cee441a7977dbd3404042c2c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Mar 2026 22:40:48 -0500 Subject: [PATCH 42/65] Address review findings: tests, sort reset, load dispatch, pagination, attachments, source migration, recipients, preview ordering 1. Integration test: add NULL sender_id message to verify COALESCE fallback in TextAggregate, update expected counts from 5 to 6 2. Reset sort field when cycling text views: LastMessage for conversations, Count for aggregate views 3. Fix loadTextData to dispatch by navigation level, not viewType, so drill-down from aggregates loads conversations correctly 4. Fix page-up/page-down in text mode to move both cursor and scrollOffset by a full page, matching email-mode navigateList 5. WhatsApp: only clear attachment flags when no existing attachment rows, preventing data loss on re-import without --media-dir 6. iMessage: reuse existing apple_messages source to preserve dedup keys from imports that used --me as the identifier 7. GVoice: always call ReplaceMessageRecipients for "to" even with empty slice, clearing stale rows on re-import 8. GVoice: add resolved contactID to conversation_participants 9. 
Add id DESC tie-breaker to last_preview ordering in both SQLite and DuckDB ListConversations queries Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/msgvault/cmd/import_imessage.go | 17 +++++--- internal/gvoice/client.go | 32 ++++++++------ internal/query/duckdb_text.go | 2 +- internal/query/sqlite_text.go | 2 +- internal/textimport/integration_test.go | 49 +++++++++++++++++---- internal/tui/text_commands.go | 13 ++++-- internal/tui/text_keys.go | 57 +++++++++++++++++++++++-- internal/whatsapp/importer.go | 15 +++++-- 8 files changed, 147 insertions(+), 40 deletions(-) diff --git a/cmd/msgvault/cmd/import_imessage.go b/cmd/msgvault/cmd/import_imessage.go index 9ec4f867..a91fbf5a 100644 --- a/cmd/msgvault/cmd/import_imessage.go +++ b/cmd/msgvault/cmd/import_imessage.go @@ -74,12 +74,17 @@ func runImportImessage(cmd *cobra.Command, _ []string) error { } defer func() { _ = client.Close() }() - // Get or create the source. Always use "local" — there is only one - // iMessage database per machine. The --me flag affects participant - // resolution only, not the source identifier. - src, err := s.GetOrCreateSource("apple_messages", "local") - if err != nil { - return fmt.Errorf("get or create source: %w", err) + // Reuse any existing apple_messages source to preserve dedup keys + // from previous imports (which may have used --me as the identifier). 
+ var src *store.Source + existingSources, listErr := s.ListSources("apple_messages") + if listErr == nil && len(existingSources) > 0 { + src = existingSources[0] + } else { + src, err = s.GetOrCreateSource("apple_messages", "local") + if err != nil { + return fmt.Errorf("get or create source: %w", err) + } } ctx, cancel := context.WithCancel(cmd.Context()) diff --git a/internal/gvoice/client.go b/internal/gvoice/client.go index 83a624c3..b4901916 100644 --- a/internal/gvoice/client.go +++ b/internal/gvoice/client.go @@ -285,6 +285,11 @@ func (c *Client) importTextEntry( _ = s.EnsureConversationParticipant(convID, senderID, "member") } + // Ensure the resolved contact is a conversation participant. + if contactID > 0 && contactID != senderID { + _ = s.EnsureConversationParticipant(convID, contactID, "member") + } + // Ensure owner as conversation participant if ownerID > 0 { _ = s.EnsureConversationParticipant(convID, ownerID, "member") @@ -546,17 +551,16 @@ func (c *Client) writeTextRecipients( } } // To: group participants or the contact (not the sender, who is - // the owner for outbound messages) + // the owner for outbound messages). + // Always call Replace even with an empty slice to clear stale rows. toIDs := c.collectRecipientIDs( s, ft, contactID, groupParticipants, phoneCache, summary, ) - if len(toIDs) > 0 { - if err := s.ReplaceMessageRecipients( - msgID, "to", toIDs, nil, - ); err != nil { - return err - } + if err := s.ReplaceMessageRecipients( + msgID, "to", toIDs, nil, + ); err != nil { + return err } } else { // From: external sender @@ -567,13 +571,15 @@ func (c *Client) writeTextRecipients( return err } } - // To: owner + // To: owner. Always call Replace to clear stale rows on re-import. 
+ var toIDs []int64 if ownerID > 0 { - if err := s.ReplaceMessageRecipients( - msgID, "to", []int64{ownerID}, nil, - ); err != nil { - return err - } + toIDs = []int64{ownerID} + } + if err := s.ReplaceMessageRecipients( + msgID, "to", toIDs, nil, + ); err != nil { + return err } } return nil diff --git a/internal/query/duckdb_text.go b/internal/query/duckdb_text.go index 2eafb5fc..b27d5e5a 100644 --- a/internal/query/duckdb_text.go +++ b/internal/query/duckdb_text.go @@ -134,7 +134,7 @@ func (e *DuckDBEngine) ListConversations( COUNT(DISTINCT COALESCE(msg.sender_id, 0)) AS participant_count, MAX(msg.sent_at) AS last_message_at, COALESCE(SUM(CAST(msg.size_estimate AS BIGINT)), 0) AS total_size, - FIRST(msg.snippet ORDER BY msg.sent_at DESC) AS last_preview, + FIRST(msg.snippet ORDER BY msg.sent_at DESC, msg.id DESC) AS last_preview, FIRST(msg.source_id) AS source_id FROM msg WHERE %s diff --git a/internal/query/sqlite_text.go b/internal/query/sqlite_text.go index d0b5a6dd..853dd0e5 100644 --- a/internal/query/sqlite_text.go +++ b/internal/query/sqlite_text.go @@ -181,7 +181,7 @@ func (e *SQLiteEngine) ListConversations( (SELECT m2.snippet FROM messages m2 WHERE m2.conversation_id = c.id AND %s - ORDER BY m2.sent_at DESC LIMIT 1), + ORDER BY m2.sent_at DESC, m2.id DESC LIMIT 1), '' ) AS last_preview, COALESCE(SUM(m.size_estimate), 0) AS total_size diff --git a/internal/textimport/integration_test.go b/internal/textimport/integration_test.go index ea1f44a0..d820f5c1 100644 --- a/internal/textimport/integration_test.go +++ b/internal/textimport/integration_test.go @@ -152,6 +152,36 @@ func TestIntegration(t *testing.T) { } } + // --- Message with NULL sender_id (backward-compatibility) --- + // Some older imports only have message_recipients "from" rows, not sender_id. + // Verify that TextAggregate still picks these up via the COALESCE fallback. 
+ { + msg := &store.Message{ + SourceID: src2.ID, + SourceMessageID: "am-null-sender", + ConversationID: conv2ID, + MessageType: "imessage", + Snippet: sql.NullString{String: "Null sender msg", Valid: true}, + SentAt: sql.NullTime{Time: baseTime.Add(2 * time.Hour), Valid: true}, + SizeEstimate: 15, + SenderID: sql.NullInt64{}, // NULL + } + msgID, err := s.UpsertMessage(msg) + if err != nil { + t.Fatalf("UpsertMessage(am-null-sender): %v", err) + } + bodyText := sql.NullString{String: "Null sender msg", Valid: true} + if err := s.UpsertMessageBody(msgID, bodyText, sql.NullString{}); err != nil { + t.Fatalf("UpsertMessageBody(am-null-sender): %v", err) + } + if err := s.ReplaceMessageRecipients( + msgID, "from", + []int64{phoneParticipantID}, []string{"Alice"}, + ); err != nil { + t.Fatalf("ReplaceMessageRecipients(am-null-sender): %v", err) + } + } + // --- Labels --- labelID, err := s.EnsureLabel(src1.ID, "important", "Important", "user") if err != nil { @@ -215,7 +245,7 @@ func TestIntegration(t *testing.T) { convByID[row.ConversationID] = row } wantConv1LastAt := baseTime.Add(2 * time.Minute) - wantConv2LastAt := baseTime.Add(time.Hour + time.Minute) + wantConv2LastAt := baseTime.Add(2 * time.Hour) if row, ok := convByID[conv1ID]; !ok { t.Errorf("conv1 not found in ListConversations results") } else { @@ -231,8 +261,8 @@ func TestIntegration(t *testing.T) { if row, ok := convByID[conv2ID]; !ok { t.Errorf("conv2 not found in ListConversations results") } else { - if row.MessageCount != 2 { - t.Errorf("conv2 MessageCount: got %d, want 2", row.MessageCount) + if row.MessageCount != 3 { + t.Errorf("conv2 MessageCount: got %d, want 3", row.MessageCount) } if row.LastMessageAt.IsZero() { t.Error("conv2 LastMessageAt is zero") @@ -242,7 +272,8 @@ func TestIntegration(t *testing.T) { } // TextAggregate by contacts — groups by phone number. - // All 5 messages have +15551234567 as the from participant. 
+ // All 6 messages have +15551234567 as the from participant + // (5 via sender_id, 1 via message_recipients fallback with NULL sender_id). aggRows, err := te.TextAggregate(ctx, query.TextViewContacts, query.TextAggregateOptions{Limit: 100}) if err != nil { t.Fatalf("TextAggregate(TextViewContacts): %v", err) @@ -254,8 +285,8 @@ func TestIntegration(t *testing.T) { for _, row := range aggRows { if row.Key == "+15551234567" { foundPhone = true - if row.Count != 5 { - t.Errorf("contact +15551234567: got count %d, want 5", row.Count) + if row.Count != 6 { + t.Errorf("contact +15551234567: got count %d, want 6", row.Count) } } } @@ -284,13 +315,13 @@ func TestIntegration(t *testing.T) { } } - // GetTextStats — should count all 5 text messages. + // GetTextStats — should count all 6 text messages. stats, err := te.GetTextStats(ctx, query.TextStatsOptions{}) if err != nil { t.Fatalf("GetTextStats: %v", err) } - if stats.MessageCount != 5 { - t.Errorf("GetTextStats.MessageCount: got %d, want 5", stats.MessageCount) + if stats.MessageCount != 6 { + t.Errorf("GetTextStats.MessageCount: got %d, want 6", stats.MessageCount) } // Should see 2 accounts (sources). if stats.AccountCount != 2 { diff --git a/internal/tui/text_commands.go b/internal/tui/text_commands.go index 8c67d601..ab1bc106 100644 --- a/internal/tui/text_commands.go +++ b/internal/tui/text_commands.go @@ -137,10 +137,17 @@ func (m Model) loadTextSearch(searchQuery string) tea.Cmd { ) } -// loadTextData dispatches the appropriate load command based on current view type. +// loadTextData dispatches the appropriate load command based on the current +// navigation level. Checking level (not just viewType) is necessary because +// drill-down from an aggregate keeps the aggregate viewType but should load +// conversations. 
func (m Model) loadTextData() tea.Cmd { - if m.textState.viewType == query.TextViewConversations { + switch m.textState.level { + case textLevelDrillConversations, textLevelConversations: return m.loadTextConversations() + case textLevelTimeline: + return m.loadTextMessages() + default: + return m.loadTextAggregate() } - return m.loadTextAggregate() } diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go index e1c3df56..015c4423 100644 --- a/internal/tui/text_keys.go +++ b/internal/tui/text_keys.go @@ -124,11 +124,35 @@ func (m Model) handleTextTimelineKeys( return m, nil case "pgup", "ctrl+u": - m.textMoveCursor(-m.visibleRows()) + step := m.visibleRows() + m.textState.cursor -= step + m.textState.scrollOffset -= step + if m.textState.cursor < 0 { + m.textState.cursor = 0 + } + if m.textState.scrollOffset < 0 { + m.textState.scrollOffset = 0 + } return m, nil case "pgdown", "ctrl+d": - m.textMoveCursor(m.visibleRows()) + itemCount := m.textRowCount() + step := m.visibleRows() + m.textState.cursor += step + m.textState.scrollOffset += step + if m.textState.cursor >= itemCount { + m.textState.cursor = itemCount - 1 + } + if m.textState.cursor < 0 { + m.textState.cursor = 0 + } + maxScroll := itemCount - m.visibleRows() + if maxScroll < 0 { + maxScroll = 0 + } + if m.textState.scrollOffset > maxScroll { + m.textState.scrollOffset = maxScroll + } return m, nil case "home": @@ -206,8 +230,10 @@ func (m *Model) cycleTextViewType(forward bool) { } if m.textState.viewType == query.TextViewConversations { m.textState.level = textLevelConversations + m.textState.filter.SortField = query.TextSortByLastMessage } else { m.textState.level = textLevelAggregate + m.textState.filter.SortField = query.TextSortByCount } m.textState.cursor = 0 m.textState.scrollOffset = 0 @@ -260,10 +286,33 @@ func (m *Model) navigateTextList(key string, itemCount int) bool { } return true case "pgup", "ctrl+u": - m.textMoveCursor(-m.visibleRows()) + step := m.visibleRows() + 
m.textState.cursor -= step + m.textState.scrollOffset -= step + if m.textState.cursor < 0 { + m.textState.cursor = 0 + } + if m.textState.scrollOffset < 0 { + m.textState.scrollOffset = 0 + } return true case "pgdown", "ctrl+d": - m.textMoveCursor(m.visibleRows()) + step := m.visibleRows() + m.textState.cursor += step + m.textState.scrollOffset += step + if m.textState.cursor >= itemCount { + m.textState.cursor = itemCount - 1 + } + if m.textState.cursor < 0 { + m.textState.cursor = 0 + } + maxScroll := itemCount - m.visibleRows() + if maxScroll < 0 { + maxScroll = 0 + } + if m.textState.scrollOffset > maxScroll { + m.textState.scrollOffset = maxScroll + } return true case "home": m.textState.cursor = 0 diff --git a/internal/whatsapp/importer.go b/internal/whatsapp/importer.go index d82d2111..7eea2998 100644 --- a/internal/whatsapp/importer.go +++ b/internal/whatsapp/importer.go @@ -393,9 +393,18 @@ func (imp *Importer) Import(ctx context.Context, waDBPath string, opts ImportOpt } else { // No media stored — clear attachment flags set by mapMessage // so Parquet/TUI queries don't show phantom attachments. - _, _ = imp.store.DB().Exec( - "UPDATE messages SET has_attachments = 0, attachment_count = 0 WHERE id = ?", - messageID) + // Only clear if the message has no pre-existing attachment rows + // (a previous import with --media-dir may have created them). + var existingCount int + _ = imp.store.DB().QueryRow( + "SELECT COUNT(*) FROM attachments WHERE message_id = ?", + messageID, + ).Scan(&existingCount) + if existingCount == 0 { + _, _ = imp.store.DB().Exec( + "UPDATE messages SET has_attachments = 0, attachment_count = 0 WHERE id = ?", + messageID) + } } // Store media metadata in the attachments table is done above. 
From 482d40fe2510ceb0310860b765109931f6f16122 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 07:54:28 -0500 Subject: [PATCH 43/65] Fix iMessage source migration, preview tie-breaker test - Prefer legacy (non-"local") apple_messages source when both exist to prevent duplicate imports after upgrading from --me identifier - Add same-timestamp message to integration test and assert ListConversations returns the correct LastPreview (highest ID wins) - Update test counts for the additional fixture message Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/msgvault/cmd/import_imessage.go | 8 +++ internal/textimport/integration_test.go | 65 +++++++++++++++++++------ 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/cmd/msgvault/cmd/import_imessage.go b/cmd/msgvault/cmd/import_imessage.go index a91fbf5a..7c71f01f 100644 --- a/cmd/msgvault/cmd/import_imessage.go +++ b/cmd/msgvault/cmd/import_imessage.go @@ -76,10 +76,18 @@ func runImportImessage(cmd *cobra.Command, _ []string) error { // Reuse any existing apple_messages source to preserve dedup keys // from previous imports (which may have used --me as the identifier). + // Prefer a legacy (non-"local") source when both exist. 
var src *store.Source existingSources, listErr := s.ListSources("apple_messages") if listErr == nil && len(existingSources) > 0 { + // Prefer legacy source (non-"local") for backward compat src = existingSources[0] + for _, s := range existingSources { + if s.Identifier != "local" { + src = s + break + } + } } else { src, err = s.GetOrCreateSource("apple_messages", "local") if err != nil { diff --git a/internal/textimport/integration_test.go b/internal/textimport/integration_test.go index d820f5c1..1ea12802 100644 --- a/internal/textimport/integration_test.go +++ b/internal/textimport/integration_test.go @@ -152,6 +152,36 @@ func TestIntegration(t *testing.T) { } } + // --- Same-timestamp message for preview tie-breaker test --- + // Inserted after wa-3 with the SAME sent_at; should have higher ID. + // ListConversations should pick this as last_preview (highest ID wins). + { + sameTimestamp := baseTime.Add(2 * time.Minute) // same as wa-3 + msg := &store.Message{ + SourceID: src1.ID, + SourceMessageID: "wa-4-tiebreaker", + ConversationID: conv1ID, + MessageType: "whatsapp", + Snippet: sql.NullString{String: "tiebreaker preview", Valid: true}, + SentAt: sql.NullTime{Time: sameTimestamp, Valid: true}, + SizeEstimate: 18, + SenderID: sql.NullInt64{Int64: phoneParticipantID, Valid: true}, + } + msgID, err := s.UpsertMessage(msg) + if err != nil { + t.Fatalf("UpsertMessage(wa-4-tiebreaker): %v", err) + } + if err := s.UpsertMessageBody(msgID, + sql.NullString{String: "tiebreaker preview", Valid: true}, + sql.NullString{}); err != nil { + t.Fatalf("UpsertMessageBody(wa-4-tiebreaker): %v", err) + } + if err := s.ReplaceMessageRecipients(msgID, "from", + []int64{phoneParticipantID}, []string{"Alice"}); err != nil { + t.Fatalf("ReplaceMessageRecipients(wa-4-tiebreaker): %v", err) + } + } + // --- Message with NULL sender_id (backward-compatibility) --- // Some older imports only have message_recipients "from" rows, not sender_id. 
// Verify that TextAggregate still picks these up via the COALESCE fallback. @@ -224,8 +254,8 @@ func TestIntegration(t *testing.T) { ).Scan(&msgCount); err != nil { t.Fatalf("read conv1 stats: %v", err) } - if msgCount != 3 { - t.Errorf("conv1 message_count: got %d, want 3", msgCount) + if msgCount != 4 { + t.Errorf("conv1 message_count: got %d, want 4", msgCount) } // --- TextEngine queries --- @@ -249,14 +279,19 @@ func TestIntegration(t *testing.T) { if row, ok := convByID[conv1ID]; !ok { t.Errorf("conv1 not found in ListConversations results") } else { - if row.MessageCount != 3 { - t.Errorf("conv1 MessageCount: got %d, want 3", row.MessageCount) + if row.MessageCount != 4 { + t.Errorf("conv1 MessageCount: got %d, want 4", row.MessageCount) } if row.LastMessageAt.IsZero() { t.Error("conv1 LastMessageAt is zero") } else if !row.LastMessageAt.Equal(wantConv1LastAt) { t.Errorf("conv1 LastMessageAt: got %v, want %v", row.LastMessageAt, wantConv1LastAt) } + // Preview tie-breaker: wa-3 and wa-4-tiebreaker share the same + // timestamp; the higher-ID message should win. + if row.LastPreview != "tiebreaker preview" { + t.Errorf("conv1 LastPreview: got %q, want %q", row.LastPreview, "tiebreaker preview") + } } if row, ok := convByID[conv2ID]; !ok { t.Errorf("conv2 not found in ListConversations results") @@ -272,8 +307,8 @@ func TestIntegration(t *testing.T) { } // TextAggregate by contacts — groups by phone number. - // All 6 messages have +15551234567 as the from participant - // (5 via sender_id, 1 via message_recipients fallback with NULL sender_id). + // All 7 messages have +15551234567 as the from participant + // (6 via sender_id, 1 via message_recipients fallback with NULL sender_id). 
aggRows, err := te.TextAggregate(ctx, query.TextViewContacts, query.TextAggregateOptions{Limit: 100}) if err != nil { t.Fatalf("TextAggregate(TextViewContacts): %v", err) @@ -285,8 +320,8 @@ func TestIntegration(t *testing.T) { for _, row := range aggRows { if row.Key == "+15551234567" { foundPhone = true - if row.Count != 6 { - t.Errorf("contact +15551234567: got count %d, want 6", row.Count) + if row.Count != 7 { + t.Errorf("contact +15551234567: got count %d, want 7", row.Count) } } } @@ -299,8 +334,8 @@ func TestIntegration(t *testing.T) { if err != nil { t.Fatalf("ListConversationMessages(conv1): %v", err) } - if len(messages) != 3 { - t.Errorf("ListConversationMessages(conv1): got %d messages, want 3", len(messages)) + if len(messages) != 4 { + t.Errorf("ListConversationMessages(conv1): got %d messages, want 4", len(messages)) } // Verify chronological order (ascending by sent_at). for i := 1; i < len(messages); i++ { @@ -315,13 +350,13 @@ func TestIntegration(t *testing.T) { } } - // GetTextStats — should count all 6 text messages. + // GetTextStats — should count all 7 text messages. stats, err := te.GetTextStats(ctx, query.TextStatsOptions{}) if err != nil { t.Fatalf("GetTextStats: %v", err) } - if stats.MessageCount != 6 { - t.Errorf("GetTextStats.MessageCount: got %d, want 6", stats.MessageCount) + if stats.MessageCount != 7 { + t.Errorf("GetTextStats.MessageCount: got %d, want 7", stats.MessageCount) } // Should see 2 accounts (sources). 
if stats.AccountCount != 2 { @@ -337,7 +372,7 @@ func TestIntegration(t *testing.T) { if err != nil { t.Fatalf("GetTextStats(src1): %v", err) } - if statsS1.MessageCount != 3 { - t.Errorf("GetTextStats(src1).MessageCount: got %d, want 3", statsS1.MessageCount) + if statsS1.MessageCount != 4 { + t.Errorf("GetTextStats(src1).MessageCount: got %d, want 4", statsS1.MessageCount) } } From 6062d5a5f3701c59f3b840c36559ad00f596b380 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 09:18:33 -0500 Subject: [PATCH 44/65] Add regression test for iMessage source selection Extract resolveImessageSource into a testable function and cover: - No existing sources (creates "local") - Only "local" exists (reuses it) - Only legacy exists (reuses it) - Both exist (prefers legacy for dedup stability) - Multiple legacy (picks first non-local) Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/msgvault/cmd/import_imessage.go | 38 +++++----- cmd/msgvault/cmd/import_imessage_test.go | 88 ++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 19 deletions(-) create mode 100644 cmd/msgvault/cmd/import_imessage_test.go diff --git a/cmd/msgvault/cmd/import_imessage.go b/cmd/msgvault/cmd/import_imessage.go index 7c71f01f..b6164073 100644 --- a/cmd/msgvault/cmd/import_imessage.go +++ b/cmd/msgvault/cmd/import_imessage.go @@ -74,25 +74,9 @@ func runImportImessage(cmd *cobra.Command, _ []string) error { } defer func() { _ = client.Close() }() - // Reuse any existing apple_messages source to preserve dedup keys - // from previous imports (which may have used --me as the identifier). - // Prefer a legacy (non-"local") source when both exist. 
- var src *store.Source - existingSources, listErr := s.ListSources("apple_messages") - if listErr == nil && len(existingSources) > 0 { - // Prefer legacy source (non-"local") for backward compat - src = existingSources[0] - for _, s := range existingSources { - if s.Identifier != "local" { - src = s - break - } - } - } else { - src, err = s.GetOrCreateSource("apple_messages", "local") - if err != nil { - return fmt.Errorf("get or create source: %w", err) - } + src, err := resolveImessageSource(s) + if err != nil { + return fmt.Errorf("get or create source: %w", err) } ctx, cancel := context.WithCancel(cmd.Context()) @@ -243,6 +227,22 @@ func printImessageSummary( } } +// resolveImessageSource finds or creates the apple_messages source. +// Prefers a legacy (non-"local") source when both exist, to preserve +// dedup keys from imports that used --me as the identifier. +func resolveImessageSource(s *store.Store) (*store.Source, error) { + sources, err := s.ListSources("apple_messages") + if err == nil && len(sources) > 0 { + for _, src := range sources { + if src.Identifier != "local" { + return src, nil + } + } + return sources[0], nil + } + return s.GetOrCreateSource("apple_messages", "local") +} + func init() { importImessageCmd.Flags().StringVar( &importImessageDBPath, "db-path", "", diff --git a/cmd/msgvault/cmd/import_imessage_test.go b/cmd/msgvault/cmd/import_imessage_test.go new file mode 100644 index 00000000..b6107dd7 --- /dev/null +++ b/cmd/msgvault/cmd/import_imessage_test.go @@ -0,0 +1,88 @@ +package cmd + +import ( + "testing" + + "github.com/wesm/msgvault/internal/store" +) + +func TestResolveImessageSource(t *testing.T) { + tests := []struct { + name string + seedSources []struct{ sourceType, identifier string } + wantIdentifier string + }{ + { + name: "no existing sources — creates local", + seedSources: nil, + wantIdentifier: "local", + }, + { + name: "only local exists — reuses local", + seedSources: []struct{ sourceType, identifier string }{ + 
{"apple_messages", "local"}, + }, + wantIdentifier: "local", + }, + { + name: "only legacy exists — reuses legacy", + seedSources: []struct{ sourceType, identifier string }{ + {"apple_messages", "+15551234567"}, + }, + wantIdentifier: "+15551234567", + }, + { + name: "both legacy and local — prefers legacy", + seedSources: []struct{ sourceType, identifier string }{ + {"apple_messages", "local"}, + {"apple_messages", "+15551234567"}, + }, + wantIdentifier: "+15551234567", + }, + { + name: "multiple legacy — picks first non-local", + seedSources: []struct{ sourceType, identifier string }{ + {"apple_messages", "local"}, + {"apple_messages", "alice@icloud.com"}, + {"apple_messages", "+15551234567"}, + }, + // ListSources returns sorted by identifier; + // +15551234567 sorts before alice@icloud.com + wantIdentifier: "+15551234567", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s, err := store.Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer func() { _ = s.Close() }() + + if err := s.InitSchema(); err != nil { + t.Fatal(err) + } + + for _, seed := range tt.seedSources { + if _, err := s.GetOrCreateSource( + seed.sourceType, seed.identifier, + ); err != nil { + t.Fatalf("seed source %q: %v", + seed.identifier, err) + } + } + + src, err := resolveImessageSource(s) + if err != nil { + t.Fatalf("resolveImessageSource: %v", err) + } + if src.Identifier != tt.wantIdentifier { + t.Errorf( + "got identifier %q, want %q", + src.Identifier, tt.wantIdentifier, + ) + } + }) + } +} From e11490589fcd12da0b0315b6f8d140be8b5cba6b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 09:43:04 -0500 Subject: [PATCH 45/65] Use content-aware column widths in text TUI views Replace hardcoded column widths with a two-phase layout: fixed columns get their natural width (measured from visible data, capped), flex columns absorb remaining space. 
This fixes the Source column being truncated at 10 chars (too narrow for "apple_messages") and the Conversation column consuming too much space via `m.width - 48`. Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/tui/text_view.go | 164 ++++++++++++++++++++++++++++---------- 1 file changed, 121 insertions(+), 43 deletions(-) diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index c13361b1..fe1b8e72 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -140,6 +140,18 @@ func (m Model) textStatsString() string { return "" } +// measureMaxWidth returns the widest string length in values, +// using headerWidth as the minimum. +func measureMaxWidth(values []string, headerWidth int) int { + w := headerWidth + for _, v := range values { + if len(v) > w { + w = len(v) + } + } + return w +} + // textConversationsView renders the conversations list table. func (m Model) textConversationsView() string { if len(m.textState.conversations) == 0 && !m.loading { @@ -152,20 +164,48 @@ func (m Model) textConversationsView() string { var sb strings.Builder - // Column widths - nameWidth := m.width - 48 + // Visible row range + endRow := m.textState.scrollOffset + m.pageSize - 1 + if endRow > len(m.textState.conversations) { + endRow = len(m.textState.conversations) + } + + // Measure source column from visible data + sourceVals := make( + []string, 0, endRow-m.textState.scrollOffset, + ) + for i := m.textState.scrollOffset; i < endRow; i++ { + sourceVals = append( + sourceVals, + m.textState.conversations[i].SourceType, + ) + } + sourceWidth := measureMaxWidth(sourceVals, len("Source")) + if sourceWidth > 16 { + sourceWidth = 16 + } + + // Fixed column widths + const ( + indicatorWidth = 3 + msgsWidth = 10 + lastMsgWidth = 16 + colSpacing = 6 // gaps between columns + ) + fixedTotal := indicatorWidth + sourceWidth + + msgsWidth + lastMsgWidth + colSpacing + nameWidth := m.width - fixedTotal if nameWidth < 15 { nameWidth = 15 } - if 
nameWidth > 50 { - nameWidth = 50 - } // Header headerRow := fmt.Sprintf( - " %-*s %10s %10s %-16s", + " %-*s %-*s %*s %-*s", nameWidth, "Conversation", - "Source", "Messages", "Last Message", + sourceWidth, "Source", + msgsWidth, "Messages", + lastMsgWidth, "Last Message", ) sb.WriteString( tableHeaderStyle.Render(padRight(headerRow, m.width)), @@ -177,11 +217,6 @@ func (m Model) textConversationsView() string { sb.WriteString("\n") // Data rows - endRow := m.textState.scrollOffset + m.pageSize - 1 - if endRow > len(m.textState.conversations) { - endRow = len(m.textState.conversations) - } - for i := m.textState.scrollOffset; i < endRow; i++ { conv := m.textState.conversations[i] isCursor := i == m.textState.cursor @@ -198,13 +233,18 @@ func (m Model) textConversationsView() string { title = truncateRunes(title, nameWidth) title = fmt.Sprintf("%-*s", nameWidth, title) - source := truncateRunes(conv.SourceType, 10) + source := truncateRunes(conv.SourceType, sourceWidth) + source = fmt.Sprintf("%-*s", sourceWidth, source) msgs := formatCount(conv.MessageCount) - lastMsg := conv.LastMessageAt.Format("2006-01-02 15:04") + lastMsg := conv.LastMessageAt.Format( + "2006-01-02 15:04", + ) line := fmt.Sprintf( - "%s %10s %10s %-16s", - title, source, msgs, lastMsg, + "%s %s %*s %-*s", + title, source, + msgsWidth, msgs, + lastMsgWidth, lastMsg, ) var style lipgloss.Style @@ -245,7 +285,8 @@ func (m Model) textConversationsView() string { return sb.String() } -// textAggregateView renders the text aggregate table (contacts, sources, etc.). +// textAggregateView renders the text aggregate table +// (contacts, sources, etc.). 
func (m Model) textAggregateView() string { if len(m.textState.aggregateRows) == 0 && !m.loading { return m.fillScreen( @@ -257,14 +298,26 @@ func (m Model) textAggregateView() string { var sb strings.Builder - // Column widths - keyWidth := m.width - 43 + // Visible row range + endRow := m.textState.scrollOffset + m.pageSize - 1 + if endRow > len(m.textState.aggregateRows) { + endRow = len(m.textState.aggregateRows) + } + + // Fixed column widths + const ( + indicatorWidth = 3 + countWidth = 10 + sizeWidth = 12 + attachWidth = 12 + colSpacing = 6 // gaps between columns + ) + fixedTotal := indicatorWidth + countWidth + + sizeWidth + attachWidth + colSpacing + keyWidth := m.width - fixedTotal if keyWidth < 20 { keyWidth = 20 } - if keyWidth > 57 { - keyWidth = 57 - } // Sort indicators sortInd := func(field query.TextSortField) string { @@ -286,13 +339,14 @@ func (m Model) textAggregateView() string { countLabel += si } sizeLabel := "Size" - _ = sizeLabel // TextSortField has no size variant; label kept for column layout attachLabel := "Attchs" headerRow := fmt.Sprintf( - " %-*s %10s %12s %12s", + " %-*s %*s %*s %*s", keyWidth, viewLabel, - countLabel, sizeLabel, attachLabel, + countWidth, countLabel, + sizeWidth, sizeLabel, + attachWidth, attachLabel, ) sb.WriteString( tableHeaderStyle.Render(padRight(headerRow, m.width)), @@ -303,11 +357,6 @@ func (m Model) textAggregateView() string { ) sb.WriteString("\n") - endRow := m.textState.scrollOffset + m.pageSize - 1 - if endRow > len(m.textState.aggregateRows) { - endRow = len(m.textState.aggregateRows) - } - for i := m.textState.scrollOffset; i < endRow; i++ { row := m.textState.aggregateRows[i] isCursor := i == m.textState.cursor @@ -321,11 +370,11 @@ func (m Model) textAggregateView() string { key = fmt.Sprintf("%-*s", keyWidth, key) line := fmt.Sprintf( - "%s %10s %12s %12s", + "%s %*s %*s %*s", key, - formatCount(row.Count), - formatBytes(row.TotalSize), - formatBytes(row.AttachmentSize), + countWidth, 
formatCount(row.Count), + sizeWidth, formatBytes(row.TotalSize), + attachWidth, formatBytes(row.AttachmentSize), ) var style lipgloss.Style @@ -381,10 +430,44 @@ func (m Model) textTimelineView() string { var sb strings.Builder - // Column widths - dateWidth := 16 - fromWidth := 20 - bodyWidth := m.width - dateWidth - fromWidth - 9 + // Visible row range + endRow := m.textState.scrollOffset + m.pageSize - 1 + if endRow > len(m.textState.messages) { + endRow = len(m.textState.messages) + } + + // Measure sender column from visible data + senderVals := make( + []string, 0, endRow-m.textState.scrollOffset, + ) + for i := m.textState.scrollOffset; i < endRow; i++ { + msg := m.textState.messages[i] + from := msg.FromName + if from == "" && msg.FromPhone != "" { + from = msg.FromPhone + } + if from == "" { + from = msg.FromEmail + } + senderVals = append(senderVals, from) + } + fromWidth := measureMaxWidth(senderVals, len("Sender")) + if fromWidth > 25 { + fromWidth = 25 + } + if fromWidth < 10 { + fromWidth = 10 + } + + // Fixed column widths + const ( + indicatorWidth = 3 + dateWidth = 16 + colSpacing = 7 // gaps between columns + ) + fixedTotal := indicatorWidth + dateWidth + + fromWidth + colSpacing + bodyWidth := m.width - fixedTotal if bodyWidth < 10 { bodyWidth = 10 } @@ -405,11 +488,6 @@ func (m Model) textTimelineView() string { ) sb.WriteString("\n") - endRow := m.textState.scrollOffset + m.pageSize - 1 - if endRow > len(m.textState.messages) { - endRow = len(m.textState.messages) - } - for i := m.textState.scrollOffset; i < endRow; i++ { msg := m.textState.messages[i] isCursor := i == m.textState.cursor From 43b53fefdee4928d607a780d852b955b20059219 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 11:32:23 -0500 Subject: [PATCH 46/65] Fix iMessage body extraction for streamtyped format macOS Ventura+/Sequoia uses NSArchiver "streamtyped" serialization for attributedBody, not NSKeyedArchiver binary plist. 
The parser now handles both formats: - streamtyped: scan for \x84\x01+ NSString marker, read length- prefixed UTF-8 text - bplist: existing NSKeyedArchiver path (unchanged) This was causing all message snippets and bodies to be empty for iMessage imports on modern macOS. --- internal/imessage/parser.go | 76 +++++++++++++++++++++++++++++--- internal/imessage/parser_test.go | 44 ++++++++++++++++++ 2 files changed, 115 insertions(+), 5 deletions(-) diff --git a/internal/imessage/parser.go b/internal/imessage/parser.go index 1b99fd62..657083db 100644 --- a/internal/imessage/parser.go +++ b/internal/imessage/parser.go @@ -1,6 +1,7 @@ package imessage import ( + "bytes" "strings" "time" @@ -55,17 +56,82 @@ func resolveHandle(handleID string) (phone, email, displayName string) { return "", "", handleID } -// extractAttributedBodyText decodes an NSKeyedArchiver binary plist blob from -// chat.db's attributedBody column and returns the plain text string. +// extractAttributedBodyText extracts the plain text string from chat.db's +// attributedBody column. This column uses one of two serialization formats: // -// macOS Ventura+ / iOS 16+ stopped populating the plain-text "text" column for -// most iMessages; the content lives exclusively in attributedBody as an -// NSAttributedString archived via NSKeyedArchiver. +// - NSArchiver "streamtyped" — legacy format, header starts with +// \x04\x0bstreamtyped. The text is embedded as an NSString with a +// length prefix after the class hierarchy. +// - NSKeyedArchiver binary plist — starts with "bplist". The text lives +// at $objects[rootObj["NS.string"]]. +// +// macOS Ventura+ / iOS 16+ stopped populating the plain-text "text" +// column for most iMessages; the content lives exclusively here. 
func extractAttributedBodyText(data []byte) string { if len(data) == 0 { return "" } + // Try streamtyped format first (most common on modern macOS) + if bytes.HasPrefix(data, []byte("\x04\x0bstreamtyped")) { + return extractStreamtypedText(data) + } + + // Try NSKeyedArchiver binary plist + if bytes.HasPrefix(data, []byte("bplist")) { + return extractKeyedArchiverText(data) + } + + return "" +} + +// extractStreamtypedText extracts text from NSArchiver streamtyped format. +// The format embeds an NSString with the text content. We scan for the +// NSString class marker followed by a length-prefixed UTF-8 string. +func extractStreamtypedText(data []byte) string { + // Look for the NSString class marker followed by the text. + // The pattern is: \x84\x01+ followed by a length byte/word + // then the UTF-8 text content. + marker := []byte("\x84\x01+") + idx := bytes.Index(data, marker) + if idx < 0 { + return "" + } + pos := idx + len(marker) + if pos >= len(data) { + return "" + } + + // Read the length. First byte: if high bit set, it's a multi-byte + // length. Otherwise it's a single-byte length. + length := 0 + b := data[pos] + pos++ + if b&0x80 == 0 { + // Single byte length + length = int(b) + } else { + // Multi-byte: low 4 bits tell how many length bytes follow + nBytes := int(b & 0x0f) + if nBytes == 0 || pos+nBytes > len(data) { + return "" + } + // Little-endian length + for i := 0; i < nBytes; i++ { + length |= int(data[pos+i]) << (8 * i) + } + pos += nBytes + } + + if length <= 0 || pos+length > len(data) { + return "" + } + + return string(data[pos : pos+length]) +} + +// extractKeyedArchiverText extracts text from NSKeyedArchiver binary plist. 
+func extractKeyedArchiverText(data []byte) string { var archive struct { Top map[string]plist.UID `plist:"$top"` Objects []interface{} `plist:"$objects"` diff --git a/internal/imessage/parser_test.go b/internal/imessage/parser_test.go index 74ea08f6..4ee4114b 100644 --- a/internal/imessage/parser_test.go +++ b/internal/imessage/parser_test.go @@ -202,6 +202,48 @@ func makeAttributedBodyBlob(text string) []byte { return data } +// makeStreamtypedBlob builds an NSArchiver "streamtyped" blob containing +// an NSAttributedString with the given text. Mirrors the format produced +// by macOS Ventura+/Sequoia chat.db. +func makeStreamtypedBlob(text string) []byte { + // Header + header := []byte("\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84\x84\x84") + // Class name "NSAttributedString" + className := []byte("\x12NSAttributedString") + // Parent class + parent := []byte("\x00\x84\x84\x08NSObject\x00\x85\x92\x84\x84\x84\x08NSString\x01\x94") + // Text length prefix + marker + marker := []byte("\x84\x01+") + + var buf []byte + buf = append(buf, header...) + buf = append(buf, className...) + buf = append(buf, parent...) + buf = append(buf, marker...) + + // Length encoding + textBytes := []byte(text) + n := len(textBytes) + if n < 128 { + buf = append(buf, byte(n)) + } else { + // Multi-byte little-endian length + nBytes := 0 + tmp := n + for tmp > 0 { + nBytes++ + tmp >>= 8 + } + buf = append(buf, 0x80|byte(nBytes)) + for i := 0; i < nBytes; i++ { + buf = append(buf, byte(n>>(8*i))) + } + } + + buf = append(buf, textBytes...) + return buf +} + func TestExtractAttributedBodyText(t *testing.T) { tests := []struct { name string @@ -214,6 +256,8 @@ func TestExtractAttributedBodyText(t *testing.T) { {"plain ASCII message", makeAttributedBodyBlob("Hello from iMessage"), "Hello from iMessage"}, {"unicode and emoji", makeAttributedBodyBlob("Hey! \xf0\x9f\x98\x8a"), "Hey! 
\xf0\x9f\x98\x8a"}, {"multiline", makeAttributedBodyBlob("Line one\nLine two"), "Line one\nLine two"}, + {"streamtyped short", makeStreamtypedBlob("Hello world"), "Hello world"}, + {"streamtyped long", makeStreamtypedBlob("This is a longer message that tests multi-byte length encoding and should work correctly"), "This is a longer message that tests multi-byte length encoding and should work correctly"}, } for _, tt := range tests { From 38884c9bf4a01899d3c84d55e9fb2924d7f7f2ea Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 12:21:13 -0500 Subject: [PATCH 47/65] Fix TUI deadlock: increase SQLite MaxOpenConns for concurrent reads MaxOpenConns(1) caused the TUI to deadlock when the FTS backfill goroutine held the single connection while async text queries tried to use it. SQLite WAL supports concurrent readers, so allow 4 connections to unblock parallel read operations. Also fix iMessage body extraction for macOS streamtyped format. --- internal/store/store.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/store/store.go b/internal/store/store.go index ea15eb72..8f8d5114 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -67,9 +67,10 @@ func Open(dbPath string) (*Store, error) { return nil, fmt.Errorf("ping database: %w", err) } - // SQLite is single-writer; one connection eliminates - // cross-connection visibility issues with FK checks. - db.SetMaxOpenConns(1) + // SQLite with WAL supports one writer + multiple readers. + // Allow enough connections for concurrent reads (TUI async + // queries, FTS backfill) while SQLite handles write serialization. 
+ db.SetMaxOpenConns(4) return &Store{ db: db, From 2180ad9c98d91b1744f91b7805ca06a23d5fba0c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 12:27:39 -0500 Subject: [PATCH 48/65] Fix streamtyped parser for real macOS data and invalid UTF-8 The streamtyped length encoding had extra framing bytes between the length prefix and text content, causing the parser to include garbage bytes at the start of extracted text. These non-UTF-8 bytes then caused DuckDB to reject the Parquet file. Fix: skip non-printable bytes after the length prefix to find the actual text start, and trim any trailing incomplete UTF-8 sequences before returning. Also add a test case matching the real macOS Sequoia attributedBody format. --- internal/imessage/parser.go | 66 +++++++++++++++++++++----------- internal/imessage/parser_test.go | 20 ++++++++++ 2 files changed, 64 insertions(+), 22 deletions(-) diff --git a/internal/imessage/parser.go b/internal/imessage/parser.go index 657083db..6ed234df 100644 --- a/internal/imessage/parser.go +++ b/internal/imessage/parser.go @@ -4,6 +4,7 @@ import ( "bytes" "strings" "time" + "unicode/utf8" "github.com/wesm/msgvault/internal/textimport" "howett.net/plist" @@ -87,11 +88,11 @@ func extractAttributedBodyText(data []byte) string { // extractStreamtypedText extracts text from NSArchiver streamtyped format. // The format embeds an NSString with the text content. We scan for the -// NSString class marker followed by a length-prefixed UTF-8 string. +// NSString class marker, skip past the variable-length encoding prefix, +// and extract the UTF-8 text that follows. func extractStreamtypedText(data []byte) string { - // Look for the NSString class marker followed by the text. - // The pattern is: \x84\x01+ followed by a length byte/word - // then the UTF-8 text content. + // The NSString payload appears after \x84\x01+ followed by a + // variable-length size prefix, then the raw UTF-8 text. 
marker := []byte("\x84\x01+") idx := bytes.Index(data, marker) if idx < 0 { @@ -102,32 +103,53 @@ func extractStreamtypedText(data []byte) string { return "" } - // Read the length. First byte: if high bit set, it's a multi-byte - // length. Otherwise it's a single-byte length. - length := 0 + // Skip the length prefix. Single-byte lengths have bit 7 clear. + // Multi-byte: bit 7 is set, and the remaining bytes encode the + // length. Rather than trying to decode the exact format, skip + // all bytes that are clearly part of the prefix (non-printable + // bytes before the text starts). b := data[pos] - pos++ - if b&0x80 == 0 { - // Single byte length - length = int(b) - } else { - // Multi-byte: low 4 bits tell how many length bytes follow - nBytes := int(b & 0x0f) - if nBytes == 0 || pos+nBytes > len(data) { - return "" + if b&0x80 != 0 { + // Multi-byte length — skip the flag byte and any following + // bytes that look like length encoding (values < 0x20 or + // continuation bytes). + pos++ + for pos < len(data) && data[pos] < 0x20 { + pos++ } - // Little-endian length - for i := 0; i < nBytes; i++ { - length |= int(data[pos+i]) << (8 * i) + } else { + // Single-byte length — just skip it + pos++ + } + + if pos >= len(data) { + return "" + } + + // Extract valid UTF-8 text from pos to the next non-text control + // sequence. The text ends at a \x00 null, or at a \x84/\x85/\x86 + // archiver control byte. + end := pos + for end < len(data) { + ch := data[end] + if ch == 0x00 || ch == 0x84 || ch == 0x85 || ch == 0x86 { + break } - pos += nBytes + end++ } - if length <= 0 || pos+length > len(data) { + if end <= pos { return "" } - return string(data[pos : pos+length]) + text := string(data[pos:end]) + + // Validate UTF-8 and trim any trailing incomplete sequences + for len(text) > 0 && !utf8.ValidString(text) { + text = text[:len(text)-1] + } + + return text } // extractKeyedArchiverText extracts text from NSKeyedArchiver binary plist. 
diff --git a/internal/imessage/parser_test.go b/internal/imessage/parser_test.go index 4ee4114b..a2f71c08 100644 --- a/internal/imessage/parser_test.go +++ b/internal/imessage/parser_test.go @@ -244,6 +244,25 @@ func makeStreamtypedBlob(text string) []byte { return buf } +// makeRealStreamtypedBlob builds a blob matching the actual macOS Sequoia +// format where the multi-byte length has extra framing bytes (\x81\x92\x00) +// between the marker and the text. +func makeRealStreamtypedBlob(text string) []byte { + // This matches the real format seen in chat.db: + // \x84\x01+ \x81 \x92 \x00 \x86 ... + var buf []byte + buf = append(buf, "\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84\x84\x84"...) + buf = append(buf, "\x12NSAttributedString"...) + buf = append(buf, "\x00\x84\x84\x08NSObject\x00\x85\x92\x84\x84\x84\x08NSString\x01\x94"...) + buf = append(buf, "\x84\x01+"...) // marker + // Multi-byte length prefix with real format: \x81 + length bytes + \x00 + n := len(text) + buf = append(buf, 0x81, byte(n), 0x00) + buf = append(buf, text...) 
+ buf = append(buf, 0x86) // terminator + return buf +} + func TestExtractAttributedBodyText(t *testing.T) { tests := []struct { name string @@ -258,6 +277,7 @@ func TestExtractAttributedBodyText(t *testing.T) { {"multiline", makeAttributedBodyBlob("Line one\nLine two"), "Line one\nLine two"}, {"streamtyped short", makeStreamtypedBlob("Hello world"), "Hello world"}, {"streamtyped long", makeStreamtypedBlob("This is a longer message that tests multi-byte length encoding and should work correctly"), "This is a longer message that tests multi-byte length encoding and should work correctly"}, + {"streamtyped real format", makeRealStreamtypedBlob("I am learning Go"), "I am learning Go"}, } for _, tt := range tests { From dc158c9be77a6abc4ec7665cd645abea61b4df92 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 12:48:45 -0500 Subject: [PATCH 49/65] Add inline message expansion in Texts timeline view Press Enter on a message in the timeline to expand it and show the full body text with word wrapping. Press Enter again or Esc to collapse. The full body is fetched from message_bodies via Engine.GetMessage. 
--- internal/tui/model.go | 8 ++++++++ internal/tui/text_commands.go | 31 +++++++++++++++++++++++++++++++ internal/tui/text_keys.go | 26 ++++++++++++++++++++++++++ internal/tui/text_state.go | 4 ++++ internal/tui/text_view.go | 34 ++++++++++++++++++++++++++++++++-- 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/internal/tui/model.go b/internal/tui/model.go index fc98ef32..563936c4 100644 --- a/internal/tui/model.go +++ b/internal/tui/model.go @@ -767,6 +767,14 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { return m.handleTextSearchResult(msg) case textStatsLoadedMsg: return m.handleTextStatsLoaded(msg) + case textMessageBodyMsg: + if msg.err != nil { + return m, nil // silently ignore body fetch errors + } + if msg.idx == m.textState.expandedIdx { + m.textState.expandedBody = msg.body + } + return m, nil } return m, nil } diff --git a/internal/tui/text_commands.go b/internal/tui/text_commands.go index ab1bc106..9512ba75 100644 --- a/internal/tui/text_commands.go +++ b/internal/tui/text_commands.go @@ -34,6 +34,13 @@ type textSearchResultMsg struct { err error } +// textMessageBodyMsg is sent when a message body is fetched for expansion. +type textMessageBodyMsg struct { + idx int // index in m.textState.messages + body string // full body text + err error +} + // textStatsLoadedMsg is sent when text stats are loaded. type textStatsLoadedMsg struct { stats *query.TotalStats @@ -151,3 +158,27 @@ func (m Model) loadTextData() tea.Cmd { return m.loadTextAggregate() } } + +// loadTextMessageBody fetches the full body of a message for inline expansion. 
+func (m Model) loadTextMessageBody(msgID int64, idx int) tea.Cmd { + eng := m.engine + return safeCmdWithPanic( + func() tea.Msg { + detail, err := eng.GetMessage( + context.Background(), msgID, + ) + if err != nil { + return textMessageBodyMsg{idx: idx, err: err} + } + return textMessageBodyMsg{ + idx: idx, body: detail.BodyText, + } + }, + func(r any) tea.Msg { + return textMessageBodyMsg{ + idx: idx, + err: fmt.Errorf("fetch body panic: %v", r), + } + }, + ) +} diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go index 015c4423..5e519881 100644 --- a/internal/tui/text_keys.go +++ b/internal/tui/text_keys.go @@ -112,7 +112,31 @@ func (m Model) handleTextTimelineKeys( msg tea.KeyMsg, ) (tea.Model, tea.Cmd) { switch msg.String() { + case "enter": + // Toggle inline expansion of the current message + idx := m.textState.cursor + if idx < 0 || idx >= len(m.textState.messages) { + return m, nil + } + if m.textState.expandedIdx == idx { + // Collapse + m.textState.expandedIdx = -1 + m.textState.expandedBody = "" + return m, nil + } + // Expand — fetch full body + m.textState.expandedIdx = idx + m.textState.expandedBody = "" // loading + msgID := m.textState.messages[idx].ID + return m, m.loadTextMessageBody(msgID, idx) + case "esc", "backspace": + if m.textState.expandedIdx >= 0 { + // Close expansion first + m.textState.expandedIdx = -1 + m.textState.expandedBody = "" + return m, nil + } return m.textGoBack() case "j", "down": @@ -383,6 +407,8 @@ func (m Model) textDrillDown() (tea.Model, tea.Cmd) { m.textState.level = textLevelTimeline m.textState.cursor = 0 m.textState.scrollOffset = 0 + m.textState.expandedIdx = -1 + m.textState.expandedBody = "" m.loading = true return m, m.loadTextMessages() diff --git a/internal/tui/text_state.go b/internal/tui/text_state.go index 34213964..0a409fbd 100644 --- a/internal/tui/text_state.go +++ b/internal/tui/text_state.go @@ -33,6 +33,10 @@ type textState struct { filter query.TextFilter stats *query.TotalStats 
breadcrumbs []textNavSnapshot + + // Inline message expansion + expandedIdx int // index of expanded message (-1 = none) + expandedBody string // full body text of expanded message } // textNavSnapshot stores state for text mode navigation history. diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index fe1b8e72..1818b9cc 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -537,6 +537,36 @@ func (m Model) textTimelineView() string { style.Render(padRight(line, m.width-3)), ) sb.WriteString("\n") + + // Inline expansion: show full wrapped body below this row + if i == m.textState.expandedIdx { + bodyText := m.textState.expandedBody + if bodyText == "" { + bodyText = "(loading...)" + } + bodyText = textutil.SanitizeTerminal(bodyText) + indent := strings.Repeat(" ", indicatorWidth) + wrapWidth := m.width - indicatorWidth - 2 + if wrapWidth < 20 { + wrapWidth = 20 + } + for _, wline := range wrapText(bodyText, wrapWidth) { + sb.WriteString(indent) + sb.WriteString( + style.Render( + padRight(wline, m.width-indicatorWidth), + ), + ) + sb.WriteString("\n") + } + // Blank separator after expanded body + sb.WriteString( + normalRowStyle.Render( + strings.Repeat(" ", m.width), + ), + ) + sb.WriteString("\n") + } } // Fill remaining space @@ -596,8 +626,8 @@ func (m Model) textFooterView() string { case textLevelTimeline: keys = []string{ - "\u2191/\u2193 navigate", "Esc back", - "m email", "? help", + "\u2191/\u2193 navigate", "Enter expand", + "Esc back", "m email", "? help", } n := len(m.textState.messages) if n > 0 { From 3ce07957f1744855acb9bd7e1049dc538fd36555 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 13:10:57 -0500 Subject: [PATCH 50/65] Chat-style timeline view with full message bodies Replace the table-style one-line-per-message timeline with a chat layout: each message shows a sender+time header followed by the full body text with word wrapping. 
- ListConversationMessages now joins message_bodies for full text - DuckDB delegates to SQLite for timeline (Parquet has no bodies) - Added BodyText field to MessageSummary (populated only for timelines) - Removed the broken Enter-to-expand mechanism - Messages display inline with alternating background per message --- internal/query/duckdb_text.go | 54 ++++++++- internal/query/models.go | 1 + internal/query/sqlite_text.go | 6 +- internal/tui/model.go | 8 -- internal/tui/text_commands.go | 31 ----- internal/tui/text_keys.go | 26 ---- internal/tui/text_state.go | 4 - internal/tui/text_view.go | 215 ++++++++++++++++------------------ 8 files changed, 160 insertions(+), 185 deletions(-) diff --git a/internal/query/duckdb_text.go b/internal/query/duckdb_text.go index b27d5e5a..c7ad3bbc 100644 --- a/internal/query/duckdb_text.go +++ b/internal/query/duckdb_text.go @@ -297,6 +297,15 @@ func (e *DuckDBEngine) TextAggregate( func (e *DuckDBEngine) ListConversationMessages( ctx context.Context, convID int64, filter TextFilter, ) ([]MessageSummary, error) { + // Use SQLite directly for timeline messages — Parquet doesn't + // include message_bodies, and timelines need the full body text. + if e.sqliteEngine != nil { + return e.sqliteEngine.ListConversationMessages( + ctx, convID, filter, + ) + } + + // Fallback to Parquet (snippet only, no body text) where, args := e.buildTextFilterConditions(filter) where += " AND msg.conversation_id = ?" args = append(args, convID) @@ -496,8 +505,51 @@ func (e *DuckDBEngine) GetTextStats( return stats, nil } +// scanMessageSummariesWithBody scans rows that include a body_text column +// as the 17th field. Used by ListConversationMessages for chat timelines. 
+func scanMessageSummariesWithBody(rows *sql.Rows) ([]MessageSummary, error) { + var results []MessageSummary + for rows.Next() { + var msg MessageSummary + var sentAt sql.NullTime + var deletedAt sql.NullTime + if err := rows.Scan( + &msg.ID, + &msg.SourceMessageID, + &msg.ConversationID, + &msg.SourceConversationID, + &msg.Subject, + &msg.Snippet, + &msg.FromEmail, + &msg.FromName, + &msg.FromPhone, + &sentAt, + &msg.SizeEstimate, + &msg.HasAttachments, + &msg.AttachmentCount, + &deletedAt, + &msg.MessageType, + &msg.ConversationTitle, + &msg.BodyText, + ); err != nil { + return nil, fmt.Errorf("scan message: %w", err) + } + if sentAt.Valid { + msg.SentAt = sentAt.Time + } + if deletedAt.Valid { + msg.DeletedAt = &deletedAt.Time + } + results = append(results, msg) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("iterate messages: %w", err) + } + return results, nil +} + // scanMessageSummaries scans rows into MessageSummary slices. -// Shared by ListConversationMessages and TextSearch. +// Shared by TextSearch and Parquet-based timeline fallback. func scanMessageSummaries(rows *sql.Rows) ([]MessageSummary, error) { var results []MessageSummary for rows.Next() { diff --git a/internal/query/models.go b/internal/query/models.go index 26b29879..06d07691 100644 --- a/internal/query/models.go +++ b/internal/query/models.go @@ -37,6 +37,7 @@ type MessageSummary struct { DeletedAt *time.Time `json:"deleted_at,omitempty"` // When message was deleted from server (nil if not deleted) MessageType string `json:"message_type,omitempty"` // e.g., "email", "whatsapp" — from messages.message_type ConversationTitle string `json:"conversation_title,omitempty"` // Group/chat name from conversations.title + BodyText string `json:"body_text,omitempty"` // Full body text (only populated for timeline views) } // MessageDetail represents a full message with body and attachments. 
diff --git a/internal/query/sqlite_text.go b/internal/query/sqlite_text.go index 853dd0e5..b6d06114 100644 --- a/internal/query/sqlite_text.go +++ b/internal/query/sqlite_text.go @@ -380,10 +380,12 @@ func (e *SQLiteEngine) ListConversationMessages( m.attachment_count, m.deleted_from_source_at, COALESCE(m.message_type, '') AS message_type, - COALESCE(c.title, '') AS conv_title + COALESCE(c.title, '') AS conv_title, + COALESCE(mb.body_text, m.snippet, '') AS body_text FROM messages m LEFT JOIN participants p_sender ON p_sender.id = m.sender_id LEFT JOIN conversations c ON c.id = m.conversation_id + LEFT JOIN message_bodies mb ON mb.message_id = m.id WHERE %s ORDER BY m.sent_at ASC LIMIT ? OFFSET ? @@ -397,7 +399,7 @@ func (e *SQLiteEngine) ListConversationMessages( } defer func() { _ = rows.Close() }() - return scanMessageSummaries(rows) + return scanMessageSummariesWithBody(rows) } // TextSearch performs plain full-text search over text messages. diff --git a/internal/tui/model.go b/internal/tui/model.go index 563936c4..fc98ef32 100644 --- a/internal/tui/model.go +++ b/internal/tui/model.go @@ -767,14 +767,6 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { return m.handleTextSearchResult(msg) case textStatsLoadedMsg: return m.handleTextStatsLoaded(msg) - case textMessageBodyMsg: - if msg.err != nil { - return m, nil // silently ignore body fetch errors - } - if msg.idx == m.textState.expandedIdx { - m.textState.expandedBody = msg.body - } - return m, nil } return m, nil } diff --git a/internal/tui/text_commands.go b/internal/tui/text_commands.go index 9512ba75..ab1bc106 100644 --- a/internal/tui/text_commands.go +++ b/internal/tui/text_commands.go @@ -34,13 +34,6 @@ type textSearchResultMsg struct { err error } -// textMessageBodyMsg is sent when a message body is fetched for expansion. 
-type textMessageBodyMsg struct { - idx int // index in m.textState.messages - body string // full body text - err error -} - // textStatsLoadedMsg is sent when text stats are loaded. type textStatsLoadedMsg struct { stats *query.TotalStats @@ -158,27 +151,3 @@ func (m Model) loadTextData() tea.Cmd { return m.loadTextAggregate() } } - -// loadTextMessageBody fetches the full body of a message for inline expansion. -func (m Model) loadTextMessageBody(msgID int64, idx int) tea.Cmd { - eng := m.engine - return safeCmdWithPanic( - func() tea.Msg { - detail, err := eng.GetMessage( - context.Background(), msgID, - ) - if err != nil { - return textMessageBodyMsg{idx: idx, err: err} - } - return textMessageBodyMsg{ - idx: idx, body: detail.BodyText, - } - }, - func(r any) tea.Msg { - return textMessageBodyMsg{ - idx: idx, - err: fmt.Errorf("fetch body panic: %v", r), - } - }, - ) -} diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go index 5e519881..015c4423 100644 --- a/internal/tui/text_keys.go +++ b/internal/tui/text_keys.go @@ -112,31 +112,7 @@ func (m Model) handleTextTimelineKeys( msg tea.KeyMsg, ) (tea.Model, tea.Cmd) { switch msg.String() { - case "enter": - // Toggle inline expansion of the current message - idx := m.textState.cursor - if idx < 0 || idx >= len(m.textState.messages) { - return m, nil - } - if m.textState.expandedIdx == idx { - // Collapse - m.textState.expandedIdx = -1 - m.textState.expandedBody = "" - return m, nil - } - // Expand — fetch full body - m.textState.expandedIdx = idx - m.textState.expandedBody = "" // loading - msgID := m.textState.messages[idx].ID - return m, m.loadTextMessageBody(msgID, idx) - case "esc", "backspace": - if m.textState.expandedIdx >= 0 { - // Close expansion first - m.textState.expandedIdx = -1 - m.textState.expandedBody = "" - return m, nil - } return m.textGoBack() case "j", "down": @@ -407,8 +383,6 @@ func (m Model) textDrillDown() (tea.Model, tea.Cmd) { m.textState.level = textLevelTimeline 
m.textState.cursor = 0 m.textState.scrollOffset = 0 - m.textState.expandedIdx = -1 - m.textState.expandedBody = "" m.loading = true return m, m.loadTextMessages() diff --git a/internal/tui/text_state.go b/internal/tui/text_state.go index 0a409fbd..34213964 100644 --- a/internal/tui/text_state.go +++ b/internal/tui/text_state.go @@ -33,10 +33,6 @@ type textState struct { filter query.TextFilter stats *query.TotalStats breadcrumbs []textNavSnapshot - - // Inline message expansion - expandedIdx int // index of expanded message (-1 = none) - expandedBody string // full body text of expanded message } // textNavSnapshot stores state for text mode navigation history. diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index 1818b9cc..23681c2b 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -418,7 +418,9 @@ func (m Model) textAggregateView() string { return sb.String() } -// textTimelineView renders a chronological message timeline. +// textTimelineView renders a chat-style message timeline. +// Each message shows a sender/time header line followed by the full +// body text with word wrapping — like reading a chat app. 
func (m Model) textTimelineView() string { if len(m.textState.messages) == 0 && !m.loading { return m.fillScreen( @@ -430,76 +432,24 @@ func (m Model) textTimelineView() string { var sb strings.Builder - // Visible row range - endRow := m.textState.scrollOffset + m.pageSize - 1 - if endRow > len(m.textState.messages) { - endRow = len(m.textState.messages) - } - - // Measure sender column from visible data - senderVals := make( - []string, 0, endRow-m.textState.scrollOffset, - ) - for i := m.textState.scrollOffset; i < endRow; i++ { - msg := m.textState.messages[i] - from := msg.FromName - if from == "" && msg.FromPhone != "" { - from = msg.FromPhone - } - if from == "" { - from = msg.FromEmail - } - senderVals = append(senderVals, from) - } - fromWidth := measureMaxWidth(senderVals, len("Sender")) - if fromWidth > 25 { - fromWidth = 25 - } - if fromWidth < 10 { - fromWidth = 10 + // Build rendered lines for visible messages. Each message + // produces multiple screen lines: a header + wrapped body. 
+ type chatLine struct { + text string + msgIdx int + isFirst bool // first line of this message (shows cursor) } - // Fixed column widths - const ( - indicatorWidth = 3 - dateWidth = 16 - colSpacing = 7 // gaps between columns - ) - fixedTotal := indicatorWidth + dateWidth + - fromWidth + colSpacing - bodyWidth := m.width - fixedTotal - if bodyWidth < 10 { - bodyWidth = 10 + bodyWidth := m.width - 6 // indent + margin + if bodyWidth < 20 { + bodyWidth = 20 } - // Header - headerRow := fmt.Sprintf( - " %-*s %-*s %-*s", - dateWidth, "Time", - fromWidth, "Sender", - bodyWidth, "Message", - ) - sb.WriteString( - tableHeaderStyle.Render(padRight(headerRow, m.width)), - ) - sb.WriteString("\n") - sb.WriteString( - separatorStyle.Render(strings.Repeat("\u2500", m.width)), - ) - sb.WriteString("\n") - - for i := m.textState.scrollOffset; i < endRow; i++ { + var allLines []chatLine + for i := 0; i < len(m.textState.messages); i++ { msg := m.textState.messages[i] - isCursor := i == m.textState.cursor - - indicator := " " - if isCursor { - indicator = cursorRowStyle.Render("\u25b6 ") - } - - dateStr := msg.SentAt.Format("2006-01-02 15:04") - // Sender: prefer name, then phone, then email + // Sender line: "Name 12:34 PM" or "Name 2026-03-05 12:34" from := textutil.SanitizeTerminal(msg.FromName) if from == "" && msg.FromPhone != "" { from = textutil.SanitizeTerminal(msg.FromPhone) @@ -507,77 +457,116 @@ func (m Model) textTimelineView() string { if from == "" { from = textutil.SanitizeTerminal(msg.FromEmail) } - from = truncateRunes(from, fromWidth) - from = fmt.Sprintf("%-*s", fromWidth, from) + if from == "" { + from = "Unknown" + } + timeStr := msg.SentAt.Format("2006-01-02 15:04") + headerLine := fmt.Sprintf("%s %s", from, timeStr) - // Message body: use snippet - body := textutil.SanitizeTerminal(msg.Snippet) + allLines = append(allLines, chatLine{ + text: headerLine, msgIdx: i, isFirst: true, + }) + + // Body lines — use BodyText if available, fall back to Snippet + 
body := textutil.SanitizeTerminal(msg.BodyText) + if body == "" { + body = textutil.SanitizeTerminal(msg.Snippet) + } if body == "" { - body = textutil.SanitizeTerminal(msg.Subject) + body = "(no text)" + } + for _, wline := range wrapText(body, bodyWidth) { + allLines = append(allLines, chatLine{ + text: wline, msgIdx: i, + }) } - body = truncateRunes(body, bodyWidth) - body = fmt.Sprintf("%-*s", bodyWidth, body) - line := fmt.Sprintf( - "%-*s %s %s", - dateWidth, dateStr, from, body, - ) + // Blank line between messages + allLines = append(allLines, chatLine{ + text: "", msgIdx: i, + }) + } + + // Scroll offset is in screen lines, not message indices. + // Map cursor (message index) to screen line offset. + cursorLine := 0 + for _, cl := range allLines { + if cl.msgIdx == m.textState.cursor && cl.isFirst { + break + } + cursorLine++ + } + + // Ensure cursor is visible + visibleLines := m.pageSize - 1 + scrollLine := m.textState.scrollOffset + if cursorLine < scrollLine { + scrollLine = cursorLine + } + if cursorLine >= scrollLine+visibleLines { + scrollLine = cursorLine - visibleLines + 3 + } + if scrollLine < 0 { + scrollLine = 0 + } + + // Render visible lines + linesWritten := 0 + for li := scrollLine; li < len(allLines) && + linesWritten < visibleLines; li++ { + cl := allLines[li] + isCursorMsg := cl.msgIdx == m.textState.cursor var style lipgloss.Style - if isCursor { + if isCursorMsg { style = cursorRowStyle - } else if i%2 == 0 { + } else if cl.msgIdx%2 == 0 { style = normalRowStyle } else { style = altRowStyle } - sb.WriteString(indicator) - sb.WriteString( - style.Render(padRight(line, m.width-3)), - ) - sb.WriteString("\n") + indicator := " " + if cl.isFirst && isCursorMsg { + indicator = cursorRowStyle.Render("\u25b6 ") + } - // Inline expansion: show full wrapped body below this row - if i == m.textState.expandedIdx { - bodyText := m.textState.expandedBody - if bodyText == "" { - bodyText = "(loading...)" - } - bodyText = 
textutil.SanitizeTerminal(bodyText) - indent := strings.Repeat(" ", indicatorWidth) - wrapWidth := m.width - indicatorWidth - 2 - if wrapWidth < 20 { - wrapWidth = 20 - } - for _, wline := range wrapText(bodyText, wrapWidth) { - sb.WriteString(indent) - sb.WriteString( - style.Render( - padRight(wline, m.width-indicatorWidth), - ), - ) - sb.WriteString("\n") - } - // Blank separator after expanded body + if cl.isFirst { + // Header line: bold-ish via the style + sb.WriteString(indicator) sb.WriteString( - normalRowStyle.Render( - strings.Repeat(" ", m.width), + style.Bold(true).Render( + padRight(cl.text, m.width-3), + ), + ) + } else { + // Body or blank line + sb.WriteString(" ") + sb.WriteString( + style.Render( + padRight(" "+cl.text, m.width-3), ), ) - sb.WriteString("\n") } + sb.WriteString("\n") + linesWritten++ } // Fill remaining space - dataRows := endRow - m.textState.scrollOffset - for i := dataRows; i < m.pageSize-1; i++ { + for linesWritten < visibleLines { sb.WriteString( - normalRowStyle.Render(strings.Repeat(" ", m.width)), + normalRowStyle.Render( + strings.Repeat(" ", m.width), + ), ) sb.WriteString("\n") + linesWritten++ } + // Store scroll offset back (in screen lines) + // Note: can't mutate m here since View is read-only; + // the scrollOffset is maintained by the key handler. + // Info line sb.WriteString(m.renderNotificationLine()) @@ -626,8 +615,8 @@ func (m Model) textFooterView() string { case textLevelTimeline: keys = []string{ - "\u2191/\u2193 navigate", "Enter expand", - "Esc back", "m email", "? help", + "\u2191/\u2193 navigate", "Esc back", + "m email", "? 
help", } n := len(m.textState.messages) if n > 0 { From b29bc6a0f7e7dc86bc2115a928cb6cbc17ca2f55 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 14:36:22 -0500 Subject: [PATCH 51/65] Timeline UX: reverse sort, search, right-justified timestamps - r key reverses chronological order (newest/oldest first) with breadcrumb indicator - / key opens search in timeline view - Timestamps right-justified on the header line (sender left, time right) - ListConversationMessages now respects TextFilter.SortDirection - Updated footer keybindings to show new keys --- internal/query/sqlite_text.go | 11 +++++++++-- internal/textimport/integration_test.go | 4 +++- internal/tui/text_keys.go | 18 ++++++++++++++++++ internal/tui/text_view.go | 18 +++++++++++++----- 4 files changed, 43 insertions(+), 8 deletions(-) diff --git a/internal/query/sqlite_text.go b/internal/query/sqlite_text.go index b6d06114..8b82e1fb 100644 --- a/internal/query/sqlite_text.go +++ b/internal/query/sqlite_text.go @@ -51,6 +51,13 @@ func textMsgTypeFilterAlias(alias string) string { return alias + ".message_type IN ('whatsapp','imessage','sms','google_voice_text')" } +func sqliteDirection(d SortDirection) string { + if d == SortAsc { + return "ASC" + } + return "DESC" +} + // buildSQLiteTextFilterConditions builds WHERE conditions from a TextFilter. // All conditions use the m. prefix for the messages table. func buildSQLiteTextFilterConditions(filter TextFilter) (string, []interface{}) { @@ -387,9 +394,9 @@ func (e *SQLiteEngine) ListConversationMessages( LEFT JOIN conversations c ON c.id = m.conversation_id LEFT JOIN message_bodies mb ON mb.message_id = m.id WHERE %s - ORDER BY m.sent_at ASC + ORDER BY m.sent_at %s LIMIT ? OFFSET ? 
- `, where) + `, where, sqliteDirection(filter.SortDirection)) args = append(args, limit, filter.Pagination.Offset) diff --git a/internal/textimport/integration_test.go b/internal/textimport/integration_test.go index 1ea12802..29d27b24 100644 --- a/internal/textimport/integration_test.go +++ b/internal/textimport/integration_test.go @@ -330,7 +330,9 @@ func TestIntegration(t *testing.T) { } // ListConversationMessages — returns messages for conv1 in chronological order. - messages, err := te.ListConversationMessages(ctx, conv1ID, query.TextFilter{}) + messages, err := te.ListConversationMessages(ctx, conv1ID, query.TextFilter{ + SortDirection: query.SortAsc, + }) if err != nil { t.Fatalf("ListConversationMessages(conv1): %v", err) } diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go index 015c4423..59cc631c 100644 --- a/internal/tui/text_keys.go +++ b/internal/tui/text_keys.go @@ -112,6 +112,24 @@ func (m Model) handleTextTimelineKeys( msg tea.KeyMsg, ) (tea.Model, tea.Cmd) { switch msg.String() { + case "r": + // Reverse chronological order + if m.textState.filter.SortDirection == query.SortAsc { + m.textState.filter.SortDirection = query.SortDesc + } else { + m.textState.filter.SortDirection = query.SortAsc + } + m.textState.cursor = 0 + m.textState.scrollOffset = 0 + m.loading = true + return m, m.loadTextMessages() + + case "/": + m.inlineSearchActive = true + m.searchInput.Reset() + m.searchInput.Focus() + return m, nil + case "esc", "backspace": return m.textGoBack() diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index 23681c2b..e960f0f6 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -85,9 +85,11 @@ func (m Model) textBreadcrumb() string { textDrillKey(m), ) case textLevelTimeline: - return fmt.Sprintf( - "Timeline (conv %d)", m.textState.selectedConvID, - ) + order := "\u2191 oldest first" + if m.textState.filter.SortDirection == query.SortDesc { + order = "\u2193 newest first" + } + return 
fmt.Sprintf("Timeline %s", order) } return "" } @@ -461,7 +463,12 @@ func (m Model) textTimelineView() string { from = "Unknown" } timeStr := msg.SentAt.Format("2006-01-02 15:04") - headerLine := fmt.Sprintf("%s %s", from, timeStr) + // Right-justify timestamp: sender on left, time on right + gap := bodyWidth - len(from) - len(timeStr) + if gap < 2 { + gap = 2 + } + headerLine := from + strings.Repeat(" ", gap) + timeStr allLines = append(allLines, chatLine{ text: headerLine, msgIdx: i, isFirst: true, @@ -615,7 +622,8 @@ func (m Model) textFooterView() string { case textLevelTimeline: keys = []string{ - "\u2191/\u2193 navigate", "Esc back", + "\u2191/\u2193 navigate", "r reverse", + "/ search", "Esc back", "m email", "? help", } n := len(m.textState.messages) From 6c37857c5be8fc84b6ce37dab1745e02d6690d7d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 19:49:49 -0500 Subject: [PATCH 52/65] Fix Unknown sender for is_from_me iMessages When --me flag isn't provided, create a fallback "Me" participant so outbound messages show "Me" instead of "Unknown" in the timeline. --- internal/imessage/client.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/internal/imessage/client.go b/internal/imessage/client.go index 1dfded37..42c347a5 100644 --- a/internal/imessage/client.go +++ b/internal/imessage/client.go @@ -162,7 +162,10 @@ func (c *Client) Import( return nil, fmt.Errorf("ensure SMS label: %w", err) } - // Resolve owner participant from --me flag for message_recipients + // Resolve owner participant for sender attribution on is_from_me messages. + // When --me is provided, resolve that handle (phone/email). + // Otherwise, create a generic "Me" participant so outbound messages + // aren't shown as "Unknown". 
var ownerPID int64 if c.ownerHandle != "" { pid, err := c.resolveParticipant( @@ -175,6 +178,16 @@ func (c *Client) Import( c.ownerHandle, err) } ownerPID = pid + } else { + // No --me flag: create a "Me" participant by email convention + pidMap, err := s.EnsureParticipantsBatch( + []mime.Address{{Email: "me@imessage.local", Name: "Me"}}, + ) + if err == nil { + if id, ok := pidMap["me@imessage.local"]; ok { + ownerPID = id + } + } } // Track resolved participants to avoid repeated DB calls From b2cef0a3ffe81d99aece24f38f65c91f8c6f4c55 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 19:58:13 -0500 Subject: [PATCH 53/65] Fix 2 blank lines at bottom of text views --- internal/tui/text_view.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index e960f0f6..ae437b14 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -167,7 +167,12 @@ func (m Model) textConversationsView() string { var sb strings.Builder // Visible row range - endRow := m.textState.scrollOffset + m.pageSize - 1 + // Available data rows = pageSize - header(1) - separator(1) - info(1) + availRows := m.pageSize - 3 + if availRows < 1 { + availRows = 1 + } + endRow := m.textState.scrollOffset + availRows if endRow > len(m.textState.conversations) { endRow = len(m.textState.conversations) } @@ -267,7 +272,7 @@ func (m Model) textConversationsView() string { // Fill remaining space dataRows := endRow - m.textState.scrollOffset - for i := dataRows; i < m.pageSize-1; i++ { + for i := dataRows; i < availRows; i++ { sb.WriteString( normalRowStyle.Render(strings.Repeat(" ", m.width)), ) @@ -300,8 +305,12 @@ func (m Model) textAggregateView() string { var sb strings.Builder - // Visible row range - endRow := m.textState.scrollOffset + m.pageSize - 1 + // Available data rows = pageSize - header(1) - separator(1) - info(1) + aggAvailRows := m.pageSize - 3 + if aggAvailRows < 1 { + 
aggAvailRows = 1 + } + endRow := m.textState.scrollOffset + aggAvailRows if endRow > len(m.textState.aggregateRows) { endRow = len(m.textState.aggregateRows) } @@ -400,7 +409,7 @@ func (m Model) textAggregateView() string { if len(m.textState.aggregateRows) == 0 && !m.loading { dataRows = 1 } - for i := dataRows; i < m.pageSize-1; i++ { + for i := dataRows; i < aggAvailRows; i++ { sb.WriteString( normalRowStyle.Render(strings.Repeat(" ", m.width)), ) From 2d67cbc4123afb0432b66f00517bbe8cc8302fe5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 20:05:15 -0500 Subject: [PATCH 54/65] Fix blank lines, add sort indicators to conversation headers - Fix 2 blank lines at bottom: use pageSize-5 for views with header+separator (accounts for all chrome lines correctly) - Add sort arrows to Conversation/Messages/Last Message headers showing which column is actively sorted and in which direction --- internal/tui/text_view.go | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index ae437b14..6e26b42d 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -168,7 +168,7 @@ func (m Model) textConversationsView() string { // Visible row range // Available data rows = pageSize - header(1) - separator(1) - info(1) - availRows := m.pageSize - 3 + availRows := m.pageSize - 5 if availRows < 1 { availRows = 1 } @@ -206,13 +206,26 @@ func (m Model) textConversationsView() string { nameWidth = 15 } - // Header + // Header with sort indicators + sortArrow := func(field query.TextSortField) string { + if m.textState.filter.SortField == field { + if m.textState.filter.SortDirection == query.SortDesc { + return "\u2193" + } + return "\u2191" + } + return "" + } + convLabel := "Conversation" + sortArrow(query.TextSortByName) + msgsLabel := "Messages" + sortArrow(query.TextSortByCount) + lastLabel := "Last Message" + sortArrow(query.TextSortByLastMessage) + headerRow 
:= fmt.Sprintf( " %-*s %-*s %*s %-*s", - nameWidth, "Conversation", + nameWidth, convLabel, sourceWidth, "Source", - msgsWidth, "Messages", - lastMsgWidth, "Last Message", + msgsWidth, msgsLabel, + lastMsgWidth, lastLabel, ) sb.WriteString( tableHeaderStyle.Render(padRight(headerRow, m.width)), @@ -306,7 +319,7 @@ func (m Model) textAggregateView() string { var sb strings.Builder // Available data rows = pageSize - header(1) - separator(1) - info(1) - aggAvailRows := m.pageSize - 3 + aggAvailRows := m.pageSize - 5 if aggAvailRows < 1 { aggAvailRows = 1 } From b80134f64f8424fc32fddfc2e8a49601d74dd9b1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 20:15:15 -0500 Subject: [PATCH 55/65] Fix timeline search: filter locally within conversation When / search is used in the timeline view, filter the already-loaded messages client-side instead of calling the global FTS engine. This searches body text and sender name/phone within the current conversation. Press Esc to return to the full message list. --- internal/tui/text_keys.go | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go index 59cc631c..5e793111 100644 --- a/internal/tui/text_keys.go +++ b/internal/tui/text_keys.go @@ -1,6 +1,8 @@ package tui import ( + "strings" + tea "github.com/charmbracelet/bubbletea" "github.com/wesm/msgvault/internal/query" ) @@ -213,6 +215,29 @@ func (m Model) handleTextInlineSearchKeys( selectedConvID: m.textState.selectedConvID, }, ) + // In timeline view, filter locally (messages already loaded + // with full body text). In other views, use global FTS. 
+ if m.textState.level == textLevelTimeline { + needle := strings.ToLower(queryStr) + var filtered []query.MessageSummary + for _, msg := range m.textState.messages { + body := strings.ToLower(msg.BodyText) + if body == "" { + body = strings.ToLower(msg.Snippet) + } + sender := strings.ToLower( + msg.FromName + " " + msg.FromPhone, + ) + if strings.Contains(body, needle) || + strings.Contains(sender, needle) { + filtered = append(filtered, msg) + } + } + m.textState.messages = filtered + m.textState.cursor = 0 + m.textState.scrollOffset = 0 + return m, nil + } m.loading = true return m, m.loadTextSearch(queryStr) From cdb99fb1abefa28d63717a4087e378b5b7367c47 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 20:18:20 -0500 Subject: [PATCH 56/65] Fix conversation search and blank lines - Timeline search now filters locally within the conversation - Fix page size calculation to match email mode (pageSize - 1) --- internal/tui/text_view.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index 6e26b42d..e5d781db 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -168,7 +168,7 @@ func (m Model) textConversationsView() string { // Visible row range // Available data rows = pageSize - header(1) - separator(1) - info(1) - availRows := m.pageSize - 5 + availRows := m.pageSize - 1 if availRows < 1 { availRows = 1 } @@ -319,7 +319,7 @@ func (m Model) textAggregateView() string { var sb strings.Builder // Available data rows = pageSize - header(1) - separator(1) - info(1) - aggAvailRows := m.pageSize - 5 + aggAvailRows := m.pageSize - 1 if aggAvailRows < 1 { aggAvailRows = 1 } From 00e085f2d24110ffd6cf58b7883acf9a4da7a035 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 20:21:02 -0500 Subject: [PATCH 57/65] Fix timeline: add header, search bar, consistent layout - Add conversation title header + separator to timeline view so footer doesn't shift 
when drilling in from conversation list - Show search input bar (/) in timeline info line - All three text views now use pageSize-3 consistently (header + separator + info line) --- internal/tui/text_view.go | 41 ++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index e5d781db..900304e6 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -168,7 +168,7 @@ func (m Model) textConversationsView() string { // Visible row range // Available data rows = pageSize - header(1) - separator(1) - info(1) - availRows := m.pageSize - 1 + availRows := m.pageSize - 3 if availRows < 1 { availRows = 1 } @@ -319,7 +319,7 @@ func (m Model) textAggregateView() string { var sb strings.Builder // Available data rows = pageSize - header(1) - separator(1) - info(1) - aggAvailRows := m.pageSize - 1 + aggAvailRows := m.pageSize - 3 if aggAvailRows < 1 { aggAvailRows = 1 } @@ -456,6 +456,29 @@ func (m Model) textTimelineView() string { var sb strings.Builder + // Header + separator (matches conversations/aggregate views so + // the footer doesn't shift when drilling into a conversation) + convTitle := "" + for _, c := range m.textState.conversations { + if c.ConversationID == m.textState.selectedConvID { + convTitle = c.Title + break + } + } + if convTitle == "" { + convTitle = "Messages" + } + sb.WriteString( + tableHeaderStyle.Render( + padRight(" "+convTitle, m.width), + ), + ) + sb.WriteString("\n") + sb.WriteString( + separatorStyle.Render(strings.Repeat("\u2500", m.width)), + ) + sb.WriteString("\n") + // Build rendered lines for visible messages. Each message // produces multiple screen lines: a header + wrapped body. 
type chatLine struct { @@ -527,7 +550,11 @@ func (m Model) textTimelineView() string { } // Ensure cursor is visible - visibleLines := m.pageSize - 1 + // Available lines = pageSize - header(1) - separator(1) - info(1) + visibleLines := m.pageSize - 3 + if visibleLines < 1 { + visibleLines = 1 + } scrollLine := m.textState.scrollOffset if cursorLine < scrollLine { scrollLine = cursorLine @@ -596,8 +623,12 @@ func (m Model) textTimelineView() string { // Note: can't mutate m here since View is read-only; // the scrollOffset is maintained by the key handler. - // Info line - sb.WriteString(m.renderNotificationLine()) + // Info line (with search bar when active) + var infoContent string + if m.inlineSearchActive { + infoContent = "/" + m.searchInput.View() + } + sb.WriteString(m.renderInfoLine(infoContent, m.loading)) if m.modal != modalNone { return m.overlayModal(sb.String()) From 92f2c35b792e7b511ebe911f8a504cbca6276556 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 20:27:06 -0500 Subject: [PATCH 58/65] Revert pageSize to -1 (matches email mode; gap is pre-existing) --- internal/tui/text_view.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index 900304e6..b3465619 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -168,7 +168,7 @@ func (m Model) textConversationsView() string { // Visible row range // Available data rows = pageSize - header(1) - separator(1) - info(1) - availRows := m.pageSize - 3 + availRows := m.pageSize - 1 if availRows < 1 { availRows = 1 } @@ -319,7 +319,7 @@ func (m Model) textAggregateView() string { var sb strings.Builder // Available data rows = pageSize - header(1) - separator(1) - info(1) - aggAvailRows := m.pageSize - 3 + aggAvailRows := m.pageSize - 1 if aggAvailRows < 1 { aggAvailRows = 1 } @@ -551,7 +551,7 @@ func (m Model) textTimelineView() string { // Ensure cursor is visible // Available lines = pageSize - 
header(1) - separator(1) - info(1) - visibleLines := m.pageSize - 3 + visibleLines := m.pageSize - 1 if visibleLines < 1 { visibleLines = 1 } From a8cd880892c095eef776e0483a6e5dce959d0c5c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 20:33:56 -0500 Subject: [PATCH 59/65] Fix empty-state layout and clear-to-end-of-screen Empty views (no data, no conversations, no messages) now render header+separator+fill to match the data view height exactly, preventing blank lines at the bottom. Also adds \x1b[J (clear-to-end-of-screen) to text view output to prevent ghost text when switching between views of different heights. Fixes both email and text mode empty states. --- internal/tui/text_view.go | 95 ++++++++++++++++++++++++++++++++------- internal/tui/view.go | 18 +++++++- 2 files changed, 96 insertions(+), 17 deletions(-) diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index b3465619..556e8427 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -24,7 +24,9 @@ func (m Model) renderTextView() string { body = m.textConversationsView() } footer := m.textFooterView() - return fmt.Sprintf("%s\n%s\n%s", header, body, footer) + // \x1b[J clears from cursor to end of screen, preventing + // stale content when switching between views of different heights. + return fmt.Sprintf("%s\n%s\n%s\x1b[J", header, body, footer) } // textHeaderView renders the Texts mode header (title bar + breadcrumb). @@ -157,11 +159,32 @@ func measureMaxWidth(values []string, headerWidth int) int { // textConversationsView renders the conversations list table. 
func (m Model) textConversationsView() string { if len(m.textState.conversations) == 0 && !m.loading { - return m.fillScreen( - normalRowStyle.Render( - padRight("No conversations", m.width), - ), 1, - ) + var sb strings.Builder + // Still render header + separator for consistent height + sb.WriteString(tableHeaderStyle.Render( + padRight(" Conversations", m.width), + )) + sb.WriteString("\n") + sb.WriteString(separatorStyle.Render( + strings.Repeat("\u2500", m.width), + )) + sb.WriteString("\n") + sb.WriteString(normalRowStyle.Render( + padRight(" No conversations", m.width), + )) + sb.WriteString("\n") + for i := 1; i < m.pageSize-2; i++ { + sb.WriteString(normalRowStyle.Render( + strings.Repeat(" ", m.width), + )) + sb.WriteString("\n") + } + sb.WriteString(m.renderInfoLine("", m.loading)) + s := sb.String() + if m.modal != modalNone { + return m.overlayModal(s) + } + return s } var sb strings.Builder @@ -309,11 +332,31 @@ func (m Model) textConversationsView() string { // (contacts, sources, etc.). func (m Model) textAggregateView() string { if len(m.textState.aggregateRows) == 0 && !m.loading { - return m.fillScreen( - normalRowStyle.Render( - padRight("No data", m.width), - ), 1, - ) + var sb strings.Builder + sb.WriteString(tableHeaderStyle.Render( + padRight(" "+m.textState.viewType.String(), m.width), + )) + sb.WriteString("\n") + sb.WriteString(separatorStyle.Render( + strings.Repeat("\u2500", m.width), + )) + sb.WriteString("\n") + sb.WriteString(normalRowStyle.Render( + padRight(" No data", m.width), + )) + sb.WriteString("\n") + for i := 1; i < m.pageSize-2; i++ { + sb.WriteString(normalRowStyle.Render( + strings.Repeat(" ", m.width), + )) + sb.WriteString("\n") + } + sb.WriteString(m.renderInfoLine("", m.loading)) + s := sb.String() + if m.modal != modalNone { + return m.overlayModal(s) + } + return s } var sb strings.Builder @@ -447,11 +490,31 @@ func (m Model) textAggregateView() string { // body text with word wrapping — like reading a chat app. 
func (m Model) textTimelineView() string { if len(m.textState.messages) == 0 && !m.loading { - return m.fillScreen( - normalRowStyle.Render( - padRight("No messages", m.width), - ), 1, - ) + var sb strings.Builder + sb.WriteString(tableHeaderStyle.Render( + padRight(" Messages", m.width), + )) + sb.WriteString("\n") + sb.WriteString(separatorStyle.Render( + strings.Repeat("\u2500", m.width), + )) + sb.WriteString("\n") + sb.WriteString(normalRowStyle.Render( + padRight(" No messages", m.width), + )) + sb.WriteString("\n") + for i := 1; i < m.pageSize-2; i++ { + sb.WriteString(normalRowStyle.Render( + strings.Repeat(" ", m.width), + )) + sb.WriteString("\n") + } + sb.WriteString(m.renderInfoLine("", m.loading)) + s := sb.String() + if m.modal != modalNone { + return m.overlayModal(s) + } + return s } var sb strings.Builder diff --git a/internal/tui/view.go b/internal/tui/view.go index 7fca53ae..34817b73 100644 --- a/internal/tui/view.go +++ b/internal/tui/view.go @@ -296,7 +296,23 @@ func (m Model) headerView() string { // aggregateTableView renders the aggregate data table. 
func (m Model) aggregateTableView() string { if len(m.rows) == 0 && !m.loading && !m.inlineSearchActive && m.searchQuery == "" && m.err == nil { - return m.fillScreen(normalRowStyle.Render(padRight("No data", m.width)), 1) + var sb strings.Builder + sb.WriteString(tableHeaderStyle.Render(padRight(" "+viewTypeAbbrev(m.viewType), m.width))) + sb.WriteString("\n") + sb.WriteString(separatorStyle.Render(strings.Repeat("\u2500", m.width))) + sb.WriteString("\n") + sb.WriteString(normalRowStyle.Render(padRight(" No data", m.width))) + sb.WriteString("\n") + for i := 1; i < m.pageSize-2; i++ { + sb.WriteString(normalRowStyle.Render(strings.Repeat(" ", m.width))) + sb.WriteString("\n") + } + sb.WriteString(m.renderInfoLine("", m.loading)) + s := sb.String() + if m.modal != modalNone { + return m.overlayModal(s) + } + return s } var sb strings.Builder From 41ab8fe11c4dc64a84dd4fef7b9662540b73e792 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 20:45:14 -0500 Subject: [PATCH 60/65] =?UTF-8?q?Fix=20sort=20cycle=20to=20follow=20column?= =?UTF-8?q?=20order:=20Name=20=E2=86=92=20Count=20=E2=86=92=20LastMessage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/tui/text_keys.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go index 5e793111..80cb9008 100644 --- a/internal/tui/text_keys.go +++ b/internal/tui/text_keys.go @@ -393,13 +393,14 @@ func (m Model) textRowCount() int { // cycleTextSortField cycles between sort fields for text views. 
func (m *Model) cycleTextSortField() { + // Cycle follows column order: Name → Count → LastMessage switch m.textState.filter.SortField { - case query.TextSortByLastMessage: + case query.TextSortByName: m.textState.filter.SortField = query.TextSortByCount case query.TextSortByCount: - m.textState.filter.SortField = query.TextSortByName - default: m.textState.filter.SortField = query.TextSortByLastMessage + default: + m.textState.filter.SortField = query.TextSortByName } } From 3efad2e3be7bc2182e13d7755430fffa51cc2139 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 20:53:43 -0500 Subject: [PATCH 61/65] Fix roborev findings: scroll clamp, streamtyped parser, MaxOpenConns, empty states - Clamp cursor/scrollOffset when new text data arrives to prevent negative-capacity panic in textConversationsView after account change - Rewrite streamtyped parser to handle 0x92 framing bytes and use decoded length for single-byte prefix; add >127 byte test cases - Keep MaxOpenConns(1) for :memory: databases to avoid separate per-connection databases - Thread SortDirection into DuckDB fallback ORDER BY for ListConversationMessages - Fix empty-state fill to render pageSize-1 data rows (was 1 short) - Store unfilteredMessages to prevent repeated timeline searches from stacking breadcrumbs and narrowing results - Prefer highest-ID non-local source in resolveImessageSource - Ensure timeline scroll shows message header + body context lines - Add known-limitation comment for TextSearch snippet-only results Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/msgvault/cmd/import_imessage.go | 12 +++-- internal/imessage/parser.go | 69 ++++++++++++++++++++--------- internal/imessage/parser_test.go | 16 +++++-- internal/query/duckdb_text.go | 19 ++++++-- internal/store/store.go | 18 +++++--- internal/tui/model.go | 6 +++ internal/tui/text_keys.go | 62 ++++++++++++++++++++------ internal/tui/text_state.go | 29 ++++++++++++ internal/tui/text_view.go | 21 ++++++--- 
internal/tui/view.go | 4 +- 10 files changed, 198 insertions(+), 58 deletions(-) diff --git a/cmd/msgvault/cmd/import_imessage.go b/cmd/msgvault/cmd/import_imessage.go index b6164073..5c26a041 100644 --- a/cmd/msgvault/cmd/import_imessage.go +++ b/cmd/msgvault/cmd/import_imessage.go @@ -228,16 +228,22 @@ func printImessageSummary( } // resolveImessageSource finds or creates the apple_messages source. -// Prefers a legacy (non-"local") source when both exist, to preserve -// dedup keys from imports that used --me as the identifier. +// Among non-"local" sources, prefers the one with the highest ID +// (most recently created), which is more likely in active use. func resolveImessageSource(s *store.Store) (*store.Source, error) { sources, err := s.ListSources("apple_messages") if err == nil && len(sources) > 0 { + var best *store.Source for _, src := range sources { if src.Identifier != "local" { - return src, nil + if best == nil || src.ID > best.ID { + best = src + } } } + if best != nil { + return best, nil + } return sources[0], nil } return s.GetOrCreateSource("apple_messages", "local") diff --git a/internal/imessage/parser.go b/internal/imessage/parser.go index 6ed234df..e705f5fd 100644 --- a/internal/imessage/parser.go +++ b/internal/imessage/parser.go @@ -90,9 +90,14 @@ func extractAttributedBodyText(data []byte) string { // The format embeds an NSString with the text content. We scan for the // NSString class marker, skip past the variable-length encoding prefix, // and extract the UTF-8 text that follows. +// +// After the marker (\x84\x01+), there is a length prefix: +// - Single-byte: bit 7 clear (0x00-0x7F), value is the text length +// - Multi-byte: bit 7 set, followed by length bytes and framing bytes +// (including 0x92 and other high bytes) +// +// The text content is always clean UTF-8 surrounded by binary framing. 
func extractStreamtypedText(data []byte) string { - // The NSString payload appears after \x84\x01+ followed by a - // variable-length size prefix, then the raw UTF-8 text. marker := []byte("\x84\x01+") idx := bytes.Index(data, marker) if idx < 0 { @@ -103,38 +108,61 @@ func extractStreamtypedText(data []byte) string { return "" } - // Skip the length prefix. Single-byte lengths have bit 7 clear. - // Multi-byte: bit 7 is set, and the remaining bytes encode the - // length. Rather than trying to decode the exact format, skip - // all bytes that are clearly part of the prefix (non-printable - // bytes before the text starts). + // Decode the length prefix to know exactly how many text bytes follow. b := data[pos] - if b&0x80 != 0 { - // Multi-byte length — skip the flag byte and any following - // bytes that look like length encoding (values < 0x20 or - // continuation bytes). + textLen := -1 + if b&0x80 == 0 { + // Single-byte length (0x00-0x7F) + textLen = int(b) pos++ - for pos < len(data) && data[pos] < 0x20 { - pos++ - } } else { - // Single-byte length — just skip it + // Multi-byte length: flag byte has bit 7 set. Skip the flag + // and all subsequent non-text framing bytes (high bytes, nulls, + // and other control bytes) until we find text content. pos++ + for pos < len(data) { + fb := data[pos] + if (fb >= 0x20 && fb < 0x7F) || + (fb >= 0xC2 && fb <= 0xF4) { + break + } + pos++ + } } if pos >= len(data) { return "" } - // Extract valid UTF-8 text from pos to the next non-text control - // sequence. The text ends at a \x00 null, or at a \x84/\x85/\x86 - // archiver control byte. + // If we decoded an exact length, use it directly + if textLen > 0 { + end := pos + textLen + if end > len(data) { + end = len(data) + } + text := string(data[pos:end]) + for len(text) > 0 && !utf8.ValidString(text) { + text = text[:len(text)-1] + } + return text + } + + // Without a decoded length, extract until an archiver control + // sequence or null byte. 
The 0x86 byte reliably marks the end + // of text in multi-byte format. The 0x84/0x85 bytes can also + // appear as UTF-8 continuation bytes, so only treat them as + // terminators when the text so far is already valid UTF-8. end := pos for end < len(data) { ch := data[end] - if ch == 0x00 || ch == 0x84 || ch == 0x85 || ch == 0x86 { + if ch == 0x00 || ch == 0x86 { break } + if ch == 0x84 || ch == 0x85 { + if utf8.ValidString(string(data[pos:end])) { + break + } + } end++ } @@ -143,12 +171,9 @@ func extractStreamtypedText(data []byte) string { } text := string(data[pos:end]) - - // Validate UTF-8 and trim any trailing incomplete sequences for len(text) > 0 && !utf8.ValidString(text) { text = text[:len(text)-1] } - return text } diff --git a/internal/imessage/parser_test.go b/internal/imessage/parser_test.go index a2f71c08..1a7ab80e 100644 --- a/internal/imessage/parser_test.go +++ b/internal/imessage/parser_test.go @@ -245,8 +245,9 @@ func makeStreamtypedBlob(text string) []byte { } // makeRealStreamtypedBlob builds a blob matching the actual macOS Sequoia -// format where the multi-byte length has extra framing bytes (\x81\x92\x00) -// between the marker and the text. +// format where the multi-byte length has extra framing bytes (0x81, 0x92, 0x00) +// between the marker and the text. The 0x92 byte is > 0x20, which the +// original parser couldn't skip. func makeRealStreamtypedBlob(text string) []byte { // This matches the real format seen in chat.db: // \x84\x01+ \x81 \x92 \x00 \x86 ... @@ -255,15 +256,20 @@ func makeRealStreamtypedBlob(text string) []byte { buf = append(buf, "\x12NSAttributedString"...) buf = append(buf, "\x00\x84\x84\x08NSObject\x00\x85\x92\x84\x84\x84\x08NSString\x01\x94"...) buf = append(buf, "\x84\x01+"...) 
// marker - // Multi-byte length prefix with real format: \x81 + length bytes + \x00 + // Multi-byte length prefix with actual 0x92 framing byte (> 0x20) n := len(text) - buf = append(buf, 0x81, byte(n), 0x00) + buf = append(buf, 0x81, byte(n), 0x92, 0x00) buf = append(buf, text...) buf = append(buf, 0x86) // terminator return buf } func TestExtractAttributedBodyText(t *testing.T) { + // Build a test string > 127 bytes to exercise multi-byte length encoding. + longText := "This message is longer than one hundred and twenty-seven bytes " + + "to exercise the multi-byte length encoding path in streamtyped format parsing. " + + "Extra padding here." + tests := []struct { name string input []byte @@ -277,7 +283,9 @@ func TestExtractAttributedBodyText(t *testing.T) { {"multiline", makeAttributedBodyBlob("Line one\nLine two"), "Line one\nLine two"}, {"streamtyped short", makeStreamtypedBlob("Hello world"), "Hello world"}, {"streamtyped long", makeStreamtypedBlob("This is a longer message that tests multi-byte length encoding and should work correctly"), "This is a longer message that tests multi-byte length encoding and should work correctly"}, + {"streamtyped multi-byte length >127", makeStreamtypedBlob(longText), longText}, {"streamtyped real format", makeRealStreamtypedBlob("I am learning Go"), "I am learning Go"}, + {"streamtyped real format long", makeRealStreamtypedBlob(longText), longText}, } for _, tt := range tests { diff --git a/internal/query/duckdb_text.go b/internal/query/duckdb_text.go index c7ad3bbc..e8ef94e2 100644 --- a/internal/query/duckdb_text.go +++ b/internal/query/duckdb_text.go @@ -305,7 +305,9 @@ func (e *DuckDBEngine) ListConversationMessages( ) } - // Fallback to Parquet (snippet only, no body text) + // Fallback to Parquet (snippet only, no body text). + // NOTE: search results will only show snippets, not full body + // text, since Parquet files do not contain message bodies. 
where, args := e.buildTextFilterConditions(filter) where += " AND msg.conversation_id = ?" args = append(args, convID) @@ -315,13 +317,18 @@ func (e *DuckDBEngine) ListConversationMessages( limit = 500 } + direction := "ASC" + if filter.SortDirection == SortDesc { + direction = "DESC" + } + query := fmt.Sprintf(` WITH %s, filtered_msgs AS ( SELECT msg.id FROM msg WHERE %s - ORDER BY msg.sent_at ASC + ORDER BY msg.sent_at %s LIMIT ? OFFSET ? ), msg_sender AS ( @@ -368,8 +375,8 @@ func (e *DuckDBEngine) ListConversationMessages( LEFT JOIN msg_sender ms ON ms.message_id = msg.id LEFT JOIN direct_sender ds ON ds.message_id = msg.id LEFT JOIN conv c ON c.id = msg.conversation_id - ORDER BY msg.sent_at ASC - `, e.parquetCTEs(), where) + ORDER BY msg.sent_at %s + `, e.parquetCTEs(), where, direction, direction) args = append(args, limit, filter.Pagination.Offset) @@ -384,6 +391,10 @@ func (e *DuckDBEngine) ListConversationMessages( // TextSearch performs plain full-text search over text messages via FTS5. // Returns empty results if SQLite is not available. +// +// Known limitation: results contain snippets but not full BodyText, so the +// chat timeline will show truncated previews for search results rather than +// the complete message body. 
func (e *DuckDBEngine) TextSearch( ctx context.Context, query string, limit, offset int, ) ([]MessageSummary, error) { diff --git a/internal/store/store.go b/internal/store/store.go index 8f8d5114..f381d272 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -49,10 +49,12 @@ func Open(dbPath string) (*Store, error) { return nil, fmt.Errorf("PostgreSQL is not yet supported in the Go implementation; use SQLite path instead") } - // Ensure directory exists - dir := filepath.Dir(dbPath) - if err := os.MkdirAll(dir, 0755); err != nil { - return nil, fmt.Errorf("create db directory: %w", err) + // Ensure directory exists (skip for in-memory databases) + if dbPath != ":memory:" && !strings.Contains(dbPath, ":memory:") { + dir := filepath.Dir(dbPath) + if err := os.MkdirAll(dir, 0755); err != nil { + return nil, fmt.Errorf("create db directory: %w", err) + } } dsn := dbPath + defaultSQLiteParams @@ -70,7 +72,13 @@ func Open(dbPath string) (*Store, error) { // SQLite with WAL supports one writer + multiple readers. // Allow enough connections for concurrent reads (TUI async // queries, FTS backfill) while SQLite handles write serialization. - db.SetMaxOpenConns(4) + // Exception: :memory: databases are per-connection, so multiple + // connections would create separate databases. + if dbPath == ":memory:" || strings.Contains(dbPath, ":memory:") { + db.SetMaxOpenConns(1) + } else { + db.SetMaxOpenConns(4) + } return &Store{ db: db, diff --git a/internal/tui/model.go b/internal/tui/model.go index fc98ef32..ca78b425 100644 --- a/internal/tui/model.go +++ b/internal/tui/model.go @@ -783,6 +783,9 @@ func (m Model) handleTextConversationsLoaded(msg textConversationsLoadedMsg) (te } m.textState.conversations = msg.conversations m.textState.stats = msg.stats + // Clamp cursor/scrollOffset to new data bounds to prevent + // out-of-range panics after account change or filter. 
+ m.textState.clampCursorToConversations() return m, nil } @@ -798,6 +801,9 @@ func (m Model) handleTextAggregateLoaded(msg textAggregateLoadedMsg) (tea.Model, } m.textState.aggregateRows = msg.rows m.textState.stats = msg.stats + // Clamp cursor/scrollOffset to new data bounds to prevent + // out-of-range panics after account change or filter. + m.textState.clampCursorToAggregates() return m, nil } diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go index 80cb9008..bc557b19 100644 --- a/internal/tui/text_keys.go +++ b/internal/tui/text_keys.go @@ -203,24 +203,30 @@ func (m Model) handleTextInlineSearchKeys( if queryStr == "" { return m, nil } - // Save current state so Esc can return from search results - m.textState.breadcrumbs = append( - m.textState.breadcrumbs, - textNavSnapshot{ - level: m.textState.level, - viewType: m.textState.viewType, - cursor: m.textState.cursor, - scrollOffset: m.textState.scrollOffset, - filter: m.textState.filter, - selectedConvID: m.textState.selectedConvID, - }, - ) // In timeline view, filter locally (messages already loaded // with full body text). In other views, use global FTS. if m.textState.level == textLevelTimeline { + // Save unfiltered messages on first search so repeated + // searches filter from the original set, not stacked results. 
+ if m.textState.unfilteredMessages == nil { + m.textState.unfilteredMessages = m.textState.messages + // Push breadcrumb only on first search to avoid stacking + m.textState.breadcrumbs = append( + m.textState.breadcrumbs, + textNavSnapshot{ + level: m.textState.level, + viewType: m.textState.viewType, + cursor: m.textState.cursor, + scrollOffset: m.textState.scrollOffset, + filter: m.textState.filter, + selectedConvID: m.textState.selectedConvID, + }, + ) + } + source := m.textState.unfilteredMessages needle := strings.ToLower(queryStr) var filtered []query.MessageSummary - for _, msg := range m.textState.messages { + for _, msg := range source { body := strings.ToLower(msg.BodyText) if body == "" { body = strings.ToLower(msg.Snippet) @@ -238,6 +244,18 @@ func (m Model) handleTextInlineSearchKeys( m.textState.scrollOffset = 0 return m, nil } + // Save current state so Esc can return from search results + m.textState.breadcrumbs = append( + m.textState.breadcrumbs, + textNavSnapshot{ + level: m.textState.level, + viewType: m.textState.viewType, + cursor: m.textState.cursor, + scrollOffset: m.textState.scrollOffset, + filter: m.textState.filter, + selectedConvID: m.textState.selectedConvID, + }, + ) m.loading = true return m, m.loadTextSearch(queryStr) @@ -468,6 +486,24 @@ func (m Model) textDrillDown() (tea.Model, tea.Cmd) { // textGoBack returns to the previous text navigation state. func (m Model) textGoBack() (tea.Model, tea.Cmd) { + // If we have unfiltered messages (from a timeline search), restore + // them directly without reloading. This is instant and avoids + // re-querying the database. 
+ if m.textState.unfilteredMessages != nil { + m.textState.messages = m.textState.unfilteredMessages + m.textState.unfilteredMessages = nil + // Pop the search breadcrumb + if len(m.textState.breadcrumbs) > 0 { + snap := m.textState.breadcrumbs[len(m.textState.breadcrumbs)-1] + m.textState.breadcrumbs = m.textState.breadcrumbs[:len(m.textState.breadcrumbs)-1] + m.textState.cursor = snap.cursor + m.textState.scrollOffset = snap.scrollOffset + } else { + m.textState.cursor = 0 + m.textState.scrollOffset = 0 + } + return m, nil + } if len(m.textState.breadcrumbs) == 0 { return m, nil } diff --git a/internal/tui/text_state.go b/internal/tui/text_state.go index 34213964..251633ec 100644 --- a/internal/tui/text_state.go +++ b/internal/tui/text_state.go @@ -33,6 +33,11 @@ type textState struct { filter query.TextFilter stats *query.TotalStats breadcrumbs []textNavSnapshot + + // unfilteredMessages holds the original timeline messages before + // search filtering. Repeated searches always filter from this + // snapshot to prevent stacking breadcrumbs and narrowing results. + unfilteredMessages []query.MessageSummary } // textNavSnapshot stores state for text mode navigation history. @@ -44,3 +49,27 @@ type textNavSnapshot struct { filter query.TextFilter selectedConvID int64 } + +// clampCursorToConversations ensures cursor and scrollOffset +// are within valid bounds after conversation data changes. +func (ts *textState) clampCursorToConversations() { + n := len(ts.conversations) + if ts.cursor >= n { + ts.cursor = max(n-1, 0) + } + if ts.scrollOffset > ts.cursor { + ts.scrollOffset = ts.cursor + } +} + +// clampCursorToAggregates ensures cursor and scrollOffset +// are within valid bounds after aggregate data changes. 
+func (ts *textState) clampCursorToAggregates() { + n := len(ts.aggregateRows) + if ts.cursor >= n { + ts.cursor = max(n-1, 0) + } + if ts.scrollOffset > ts.cursor { + ts.scrollOffset = ts.cursor + } +} diff --git a/internal/tui/text_view.go b/internal/tui/text_view.go index 556e8427..970ba6bb 100644 --- a/internal/tui/text_view.go +++ b/internal/tui/text_view.go @@ -173,7 +173,9 @@ func (m Model) textConversationsView() string { padRight(" No conversations", m.width), )) sb.WriteString("\n") - for i := 1; i < m.pageSize-2; i++ { + // 1 "No data" + (pageSize-2) blanks = pageSize-1 data rows, + // then +1 info line = pageSize body rows total. + for i := 1; i < m.pageSize-1; i++ { sb.WriteString(normalRowStyle.Render( strings.Repeat(" ", m.width), )) @@ -345,7 +347,9 @@ func (m Model) textAggregateView() string { padRight(" No data", m.width), )) sb.WriteString("\n") - for i := 1; i < m.pageSize-2; i++ { + // 1 "No data" + (pageSize-2) blanks = pageSize-1 data rows, + // then +1 info line = pageSize body rows total. + for i := 1; i < m.pageSize-1; i++ { sb.WriteString(normalRowStyle.Render( strings.Repeat(" ", m.width), )) @@ -503,7 +507,9 @@ func (m Model) textTimelineView() string { padRight(" No messages", m.width), )) sb.WriteString("\n") - for i := 1; i < m.pageSize-2; i++ { + // 1 "No messages" + (pageSize-2) blanks = pageSize-1 data rows, + // then +1 info line = pageSize body rows total. + for i := 1; i < m.pageSize-1; i++ { sb.WriteString(normalRowStyle.Render( strings.Repeat(" ", m.width), )) @@ -612,7 +618,7 @@ func (m Model) textTimelineView() string { cursorLine++ } - // Ensure cursor is visible + // Ensure cursor is visible with some body context. 
// Available lines = pageSize - header(1) - separator(1) - info(1) visibleLines := m.pageSize - 1 if visibleLines < 1 { @@ -622,8 +628,11 @@ func (m Model) textTimelineView() string { if cursorLine < scrollLine { scrollLine = cursorLine } - if cursorLine >= scrollLine+visibleLines { - scrollLine = cursorLine - visibleLines + 3 + // Show the message header plus a few body lines (not just + // the header), so long messages don't appear cut off. + cursorEndLine := cursorLine + 3 + if cursorEndLine >= scrollLine+visibleLines { + scrollLine = cursorEndLine - visibleLines + 1 } if scrollLine < 0 { scrollLine = 0 diff --git a/internal/tui/view.go b/internal/tui/view.go index 34817b73..51910a44 100644 --- a/internal/tui/view.go +++ b/internal/tui/view.go @@ -303,7 +303,9 @@ func (m Model) aggregateTableView() string { sb.WriteString("\n") sb.WriteString(normalRowStyle.Render(padRight(" No data", m.width))) sb.WriteString("\n") - for i := 1; i < m.pageSize-2; i++ { + // 1 "No data" + (pageSize-2) blanks = pageSize-1 data rows, + // then +1 info line = pageSize body rows total. 
+ for i := 1; i < m.pageSize-1; i++ { sb.WriteString(normalRowStyle.Render(strings.Repeat(" ", m.width))) sb.WriteString("\n") } From 3723b62735d754b27ae369d2d521dbc9872551e3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 21:04:48 -0500 Subject: [PATCH 62/65] Fix sort cycle for aggregates, streamtyped parser, search state - Sort cycle: aggregates only cycle Name/Count (no LastMessage); conversations cycle Name/Count/LastMessage - Streamtyped parser: read actual length byte after 0x81 flag instead of scanning for first printable byte (fixes 32-126 byte messages) - Clear unfilteredMessages on timeline reload to prevent stale search state after sort/reverse operations --- internal/imessage/parser.go | 45 +++++++++++++++++--------------- internal/imessage/parser_test.go | 2 ++ internal/tui/model.go | 1 + internal/tui/text_keys.go | 12 +++++++-- 4 files changed, 37 insertions(+), 23 deletions(-) diff --git a/internal/imessage/parser.go b/internal/imessage/parser.go index e705f5fd..7ddf0538 100644 --- a/internal/imessage/parser.go +++ b/internal/imessage/parser.go @@ -108,25 +108,37 @@ func extractStreamtypedText(data []byte) string { return "" } - // Decode the length prefix to know exactly how many text bytes follow. + // Decode the length prefix. + // + // Format after the \x84\x01+ marker: + // Single-byte: 0x00-0x7F = length, then text immediately follows. + // Multi-byte: 0x81 [framing_bytes...] 0x86 + // The 0x81 flag means "1 length byte follows". The length byte + // can be any value (including printable ASCII). After the length + // byte, skip remaining framing bytes (0x00, 0x92, etc.) until + // valid text starts. b := data[pos] - textLen := -1 + var textLen int if b&0x80 == 0 { // Single-byte length (0x00-0x7F) textLen = int(b) pos++ } else { - // Multi-byte length: flag byte has bit 7 set. Skip the flag - // and all subsequent non-text framing bytes (high bytes, nulls, - // and other control bytes) until we find text content. 
- pos++ + // Multi-byte: 0x81 means 1 length byte follows + pos++ // skip the 0x81 flag + if pos >= len(data) { + return "" + } + textLen = int(data[pos]) + pos++ // skip the length byte + // Skip remaining framing bytes (nulls, high bytes) before text for pos < len(data) { fb := data[pos] - if (fb >= 0x20 && fb < 0x7F) || - (fb >= 0xC2 && fb <= 0xF4) { - break + if fb == 0x00 || (fb >= 0x80 && fb <= 0xBF) { + pos++ + continue } - pos++ + break } } @@ -134,7 +146,7 @@ func extractStreamtypedText(data []byte) string { return "" } - // If we decoded an exact length, use it directly + // Use the decoded length to extract exactly the right bytes if textLen > 0 { end := pos + textLen if end > len(data) { @@ -147,22 +159,13 @@ func extractStreamtypedText(data []byte) string { return text } - // Without a decoded length, extract until an archiver control - // sequence or null byte. The 0x86 byte reliably marks the end - // of text in multi-byte format. The 0x84/0x85 bytes can also - // appear as UTF-8 continuation bytes, so only treat them as - // terminators when the text so far is already valid UTF-8. 
+ // Fallback: extract until archiver control byte or end end := pos for end < len(data) { ch := data[end] if ch == 0x00 || ch == 0x86 { break } - if ch == 0x84 || ch == 0x85 { - if utf8.ValidString(string(data[pos:end])) { - break - } - } end++ } diff --git a/internal/imessage/parser_test.go b/internal/imessage/parser_test.go index 1a7ab80e..69f81c30 100644 --- a/internal/imessage/parser_test.go +++ b/internal/imessage/parser_test.go @@ -285,6 +285,8 @@ func TestExtractAttributedBodyText(t *testing.T) { {"streamtyped long", makeStreamtypedBlob("This is a longer message that tests multi-byte length encoding and should work correctly"), "This is a longer message that tests multi-byte length encoding and should work correctly"}, {"streamtyped multi-byte length >127", makeStreamtypedBlob(longText), longText}, {"streamtyped real format", makeRealStreamtypedBlob("I am learning Go"), "I am learning Go"}, + {"streamtyped real format 50 bytes", makeRealStreamtypedBlob("Yeah, we should catch up soon! How about Thursday?"), "Yeah, we should catch up soon! 
How about Thursday?"}, + {"streamtyped real format 100 bytes", makeRealStreamtypedBlob("This is exactly one hundred bytes of text for testing the mid-range length encoding in streamtyped!!"), "This is exactly one hundred bytes of text for testing the mid-range length encoding in streamtyped!!"}, {"streamtyped real format long", makeRealStreamtypedBlob(longText), longText}, } diff --git a/internal/tui/model.go b/internal/tui/model.go index ca78b425..987edfe8 100644 --- a/internal/tui/model.go +++ b/internal/tui/model.go @@ -818,6 +818,7 @@ func (m Model) handleTextMessagesLoaded(msg textMessagesLoadedMsg) (tea.Model, t return m, nil } m.textState.messages = msg.messages + m.textState.unfilteredMessages = nil // clear stale search snapshot m.textState.cursor = 0 m.textState.scrollOffset = 0 return m, nil diff --git a/internal/tui/text_keys.go b/internal/tui/text_keys.go index bc557b19..a9487802 100644 --- a/internal/tui/text_keys.go +++ b/internal/tui/text_keys.go @@ -410,13 +410,21 @@ func (m Model) textRowCount() int { } // cycleTextSortField cycles between sort fields for text views. +// Conversations: Name → Count → LastMessage (3 columns). +// Aggregates: Name → Count only (no LastMessage column). 
func (m *Model) cycleTextSortField() { - // Cycle follows column order: Name → Count → LastMessage + isConv := m.textState.level == textLevelConversations || + m.textState.level == textLevelDrillConversations + switch m.textState.filter.SortField { case query.TextSortByName: m.textState.filter.SortField = query.TextSortByCount case query.TextSortByCount: - m.textState.filter.SortField = query.TextSortByLastMessage + if isConv { + m.textState.filter.SortField = query.TextSortByLastMessage + } else { + m.textState.filter.SortField = query.TextSortByName + } default: m.textState.filter.SortField = query.TextSortByName } From f85e46a15175dfe3b81c1bd8645f5011c3af021b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 1 Apr 2026 21:10:43 -0500 Subject: [PATCH 63/65] Update nix vendorHash for new go.sum dependencies --- flake.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flake.nix b/flake.nix index 16ca3e60..3e017152 100644 --- a/flake.nix +++ b/flake.nix @@ -29,7 +29,7 @@ pname = "msgvault"; version = "0.11.0"; src = ./.; - vendorHash = "sha256-gvK36/Vd2eN7Fy315Y/OpbZwPLIXnRj+7C3YLfTa5a0="; + vendorHash = "sha256-QxvGn74coYm8g0/kj5ctxp+PbN8K98X9uyLcSojLRRg="; proxyVendor = true; subPackages = [ "cmd/msgvault" ]; tags = [ "fts5" ]; From 6e042313ea85a237b7e070d5d4583f17a2baf464 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 2 Apr 2026 07:01:49 -0500 Subject: [PATCH 64/65] Fix iMessage group chat detection and conversation titles - Group detection: check for ";+;" in chat GUID (was incorrectly checking for "chat;+;" which never matched iMessage format) - Group chats without display_name: build title from participant names/phones (e.g., "Alice, +15551234567, Bob") - Fixes duplicate-looking conversations in the TUI where multiple group chats with the same participant showed the same phone number --- internal/imessage/client.go | 69 ++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git 
a/internal/imessage/client.go b/internal/imessage/client.go index 42c347a5..83529b9e 100644 --- a/internal/imessage/client.go +++ b/internal/imessage/client.go @@ -614,18 +614,19 @@ func (c *Client) ensureConversation( return id, false, nil } - // Determine conversation type and title + // Determine conversation type from chat GUID format. + // iMessage uses "any;+;" for group chats, "any;-;" for 1:1 direct. convType := "direct_chat" - title := "" - if msg.ChatIdentifier != nil && - strings.Contains(*msg.ChatIdentifier, "chat;+;") { + if chatGUID != "" && strings.Contains(chatGUID, ";+;") { convType = "group_chat" } + // Build conversation title. + title := "" if msg.ChatDisplayName != nil && *msg.ChatDisplayName != "" { title = *msg.ChatDisplayName } else if convType == "direct_chat" && msg.HandleID != nil { - // For 1:1 chats, use the participant's phone/email as title + // For 1:1 chats, use the other party's phone/email phone, email, name := resolveHandle(*msg.HandleID) if name != "" { title = name @@ -635,6 +636,8 @@ func (c *Client) ensureConversation( title = email } } + // Group chats without a display_name: title will be set after + // participants are resolved (below). convID, err := s.EnsureConversationWithType( sourceID, chatGUID, convType, title, @@ -652,9 +655,65 @@ func (c *Client) ensureConversation( ) } + // For group chats without a title, build one from participants + if title == "" && convType == "group_chat" { + title = c.buildGroupTitle(ctx, s, convID) + if title != "" { + _, _ = s.DB().Exec( + "UPDATE conversations SET title = ? WHERE id = ?", + title, convID, + ) + } + } + return convID, true, nil } +// buildGroupTitle builds a group chat title from participant names/phones. +// Returns something like "Alice, +15551234567, Bob" (up to 4 names). 
+func (c *Client) buildGroupTitle( + ctx context.Context, s *store.Store, convID int64, +) string { + rows, err := s.DB().QueryContext(ctx, ` + SELECT COALESCE( + NULLIF(p.display_name, ''), + NULLIF(p.phone_number, ''), + NULLIF(p.email_address, ''), + '?' + ) + FROM conversation_participants cp + JOIN participants p ON p.id = cp.participant_id + WHERE cp.conversation_id = ? + ORDER BY p.id + LIMIT 5 + `, convID) + if err != nil { + return "" + } + defer func() { _ = rows.Close() }() + + var names []string + for rows.Next() { + var name string + if err := rows.Scan(&name); err != nil { + continue + } + // Skip "Me" placeholder + if name == "Me" || name == "me@imessage.local" { + continue + } + names = append(names, name) + } + if len(names) == 0 { + return "" + } + if len(names) > 4 { + return strings.Join(names[:3], ", ") + + fmt.Sprintf(" +%d more", len(names)-3) + } + return strings.Join(names, ", ") +} + // linkChatParticipants resolves all handles in a chat and links them // as conversation participants. func (c *Client) linkChatParticipants( From c21e6a41f02b0adbbedbc9b7b074ea98dc5ebc39 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 2 Apr 2026 07:30:11 -0500 Subject: [PATCH 65/65] Fix group title: separate count from display names, exclude Me --- internal/imessage/client.go | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/internal/imessage/client.go b/internal/imessage/client.go index 83529b9e..2e2bedf8 100644 --- a/internal/imessage/client.go +++ b/internal/imessage/client.go @@ -670,10 +670,21 @@ func (c *Client) ensureConversation( } // buildGroupTitle builds a group chat title from participant names/phones. -// Returns something like "Alice, +15551234567, Bob" (up to 4 names). +// Returns something like "Alice, +15551234567, Bob" or "Alice, Bob +3 more". 
func (c *Client) buildGroupTitle( ctx context.Context, s *store.Store, convID int64, ) string { + // Get total non-self participant count + var totalCount int + _ = s.DB().QueryRowContext(ctx, ` + SELECT COUNT(*) FROM conversation_participants cp + JOIN participants p ON p.id = cp.participant_id + WHERE cp.conversation_id = ? + AND COALESCE(p.email_address, '') != 'me@imessage.local' + AND COALESCE(p.display_name, '') != 'Me' + `, convID).Scan(&totalCount) + + // Get first few names for display rows, err := s.DB().QueryContext(ctx, ` SELECT COALESCE( NULLIF(p.display_name, ''), @@ -684,8 +695,10 @@ func (c *Client) buildGroupTitle( FROM conversation_participants cp JOIN participants p ON p.id = cp.participant_id WHERE cp.conversation_id = ? + AND COALESCE(p.email_address, '') != 'me@imessage.local' + AND COALESCE(p.display_name, '') != 'Me' ORDER BY p.id - LIMIT 5 + LIMIT 3 `, convID) if err != nil { return "" @@ -698,18 +711,14 @@ func (c *Client) buildGroupTitle( if err := rows.Scan(&name); err != nil { continue } - // Skip "Me" placeholder - if name == "Me" || name == "me@imessage.local" { - continue - } names = append(names, name) } if len(names) == 0 { return "" } - if len(names) > 4 { - return strings.Join(names[:3], ", ") + - fmt.Sprintf(" +%d more", len(names)-3) + if totalCount > len(names) { + return strings.Join(names, ", ") + + fmt.Sprintf(" +%d more", totalCount-len(names)) } return strings.Join(names, ", ") }