// Source: LocalHarness/app.zig (humanjesse/LocalHarness, main branch)
// Application logic - App struct and all related methods
const std = @import("std");
const fs = std.fs;
const mem = std.mem;
const json = std.json;
const process = std.process;
const ui = @import("ui");
const markdown = @import("markdown");
const ollama = @import("ollama");
const llm_provider_module = @import("llm_provider");
const permission = @import("permission");
const tools_module = @import("tools");
const types = @import("types");
const state_module = @import("state");
const context_module = @import("context");
const config_module = @import("config");
const render = @import("render");
const message_renderer = @import("message_renderer");
const tool_executor_module = @import("tool_executor");
const zvdb = @import("zvdb");
const embeddings_module = @import("embeddings");
const embedder_interface = @import("embedder_interface");
const lmstudio = @import("lmstudio");
pub const agents_module = @import("agents"); // Re-export for agent_loader and agent_executor
const config_editor_state = @import("config_editor_state");
const config_editor_renderer = @import("config_editor_renderer");
const config_editor_input = @import("config_editor_input");
const agent_loader = @import("agent_loader");
const agent_builder_state = @import("agent_builder_state");
const agent_builder_renderer = @import("agent_builder_renderer");
const agent_builder_input = @import("agent_builder_input");
const help_state = @import("help_state");
const help_renderer = @import("help_renderer");
const help_input = @import("help_input");
const profile_ui_state = @import("profile_ui_state");
const profile_ui_renderer = @import("profile_ui_renderer");
const profile_ui_input = @import("profile_ui_input");
const conversation_db_module = @import("conversation_db");

// Re-export types for convenience
pub const Message = types.Message;
pub const ClickableArea = types.ClickableArea;
pub const StreamChunk = types.StreamChunk;
pub const Config = config_module.Config;
pub const AppState = state_module.AppState;
pub const AppContext = context_module.AppContext;

/// Everything the background streaming thread needs for one chat request.
/// An instance is heap-allocated in `App.startStreaming` and handed to
/// `App.streamingThreadFn` as its only argument.
const StreamThreadContext = struct {
    /// Allocator used by the thread for queued chunk text and tool-call copies.
    allocator: mem.Allocator,
    /// Owning application; the thread appends to `app.stream_chunks` while
    /// holding `app.stream_mutex`.
    app: *App,
    /// Provider whose `chatStream` is invoked by the thread.
    llm_provider: *llm_provider_module.LLMProvider,
    /// Model name forwarded to `chatStream`.
    model: []const u8,
    /// Chat history forwarded to `chatStream` (slice built in `startStreaming`).
    messages: []ollama.ChatMessage,
    /// Optional response format forwarded to `chatStream`.
    format: ?[]const u8,
    /// Tool definitions; forwarded only when the slice is non-empty.
    tools: []const ollama.Tool,
    /// Keep-alive setting; forwarded only when the provider reports
    /// `supports_keep_alive`.
    keep_alive: []const u8,
    /// Forwarded to `chatStream` unchanged.
    num_ctx: usize,
    /// Forwarded to `chatStream` unchanged.
    num_predict: isize,
};

/// Context used to stream sub-agent progress into the UI.
/// Alias of the unified `ProgressDisplayContext` declared in the agents module.
const ProgressDisplayContext = agents_module.ProgressDisplayContext;

/// Apply the final formatting to an agent's progress message once the agent
/// has completed. Thin wrapper around the unified
/// `message_renderer.finalizeProgressMessage`; its error is propagated.
fn finalizeAgentMessage(ctx: *ProgressDisplayContext) !void {
    try message_renderer.finalizeProgressMessage(ctx);
}

/// Progress callback for sub-agents (e.g., file curator); streams progress to
/// the UI in real time.
///
/// - `user_data`: must point to a `ProgressDisplayContext`; `null` makes the
///   call a no-op.
/// - `update_type`: `.thinking` / `.content` text is accumulated in the
///   context buffers; `.complete` finalizes the message (once); every other
///   kind only triggers a refresh of the progress message.
/// - `message`: text fragment for `.thinking` / `.content` updates.
///
/// The callback cannot report errors, so any allocation or redraw failure
/// ends the call early. Replacement content is fully built *before* the old
/// content is released, so a failed allocation leaves the existing message
/// intact instead of pointing at freed memory.
fn agentProgressCallback(user_data: ?*anyopaque, update_type: agents_module.ProgressUpdateType, message: []const u8) void {
    const ctx = @as(*ProgressDisplayContext, @ptrCast(@alignCast(user_data orelse return)));
    const allocator = ctx.app.allocator;

    // Accumulate the message content based on type
    switch (update_type) {
        .thinking => {
            ctx.thinking_buffer.appendSlice(allocator, message) catch return;
        },
        .content => {
            ctx.content_buffer.appendSlice(allocator, message) catch return;
        },
        .complete => {
            // Agent finished - finalize the message with nice formatting
            if (!ctx.finalized and ctx.current_message_idx != null) {
                ctx.finalized = true;
                finalizeAgentMessage(ctx) catch return;
                return; // finalizeAgentMessage handles redraw
            }
            // NOTE(review): when already finalized (or no message exists yet)
            // this falls through to the refresh below, as before — confirm
            // that is intended.
        },
        .iteration, .tool_call => {
            // Status updates - nothing to accumulate, just refresh the display
        },
        .embedding, .storage => {
            // Embedding/storage updates are not used for agent callbacks
        },
    }

    if (ctx.current_message_idx) |idx| {
        // Update the existing progress message with the streamed content.
        // During streaming, show the raw accumulated text (thinking + content).
        const thinking = ctx.thinking_buffer.items;
        const content = ctx.content_buffer.items;

        const display_content = if (thinking.len == 0 and content.len == 0)
            allocator.dupe(u8, "🤔 Analyzing...") catch return
        else blk: {
            // A blank line separates the two sections only when both are present.
            const separator: []const u8 = if (thinking.len > 0 and content.len > 0) "\n\n" else "";
            break :blk std.mem.concat(allocator, u8, &[_][]const u8{ thinking, separator, content }) catch return;
        };
        const new_processed = markdown.processMarkdown(allocator, display_content) catch {
            allocator.free(display_content);
            return;
        };

        // Nothing below can fail, so it is now safe to release the old content.
        const msg = &ctx.app.messages.items[idx];

        allocator.free(msg.content);
        for (msg.processed_content.items) |*item| {
            item.deinit(allocator);
        }
        msg.processed_content.deinit(allocator);

        if (msg.thinking_content) |tc| allocator.free(tc);
        if (msg.processed_thinking_content) |*ptc| {
            for (ptc.items) |*item| {
                item.deinit(allocator);
            }
            ptc.deinit(allocator);
        }

        msg.content = display_content;
        msg.processed_content = new_processed;
        msg.thinking_content = null;
        msg.processed_thinking_content = null;
    } else {
        // First update for this agent: create the progress message.

        // Capture start time if not already set
        if (ctx.start_time == 0) {
            ctx.start_time = std.time.milliTimestamp();
        }

        // Reserve the list slot first so the append itself cannot fail after
        // the message's fields have been allocated.
        ctx.app.messages.ensureUnusedCapacity(allocator, 1) catch return;

        const task_name_copy = allocator.dupe(u8, ctx.task_name) catch return;
        // Start with a simple message; it is formatted nicely on completion.
        const display_content = allocator.dupe(u8, "🤔 Analyzing...") catch {
            allocator.free(task_name_copy);
            return;
        };
        const content_processed = markdown.processMarkdown(allocator, display_content) catch {
            allocator.free(display_content);
            allocator.free(task_name_copy);
            return;
        };

        ctx.app.messages.appendAssumeCapacity(.{
            .role = .display_only_data,
            .content = display_content,
            .processed_content = content_processed,
            .thinking_content = null,
            .processed_thinking_content = null,
            .thinking_expanded = false,
            .timestamp = std.time.milliTimestamp(),
            // Agent analysis metadata (for streaming + collapsible display)
            .agent_analysis_name = task_name_copy,
            .agent_analysis_expanded = true, // Expanded during streaming (shows content)
            .agent_analysis_completed = false, // Not done yet (no collapse button)
            // Keep tool_execution_time for display (but not using tool collapse)
            .tool_call_expanded = false,
            .tool_name = null,
            .tool_success = null,
            .tool_execution_time = null, // Will be set on completion
        });

        ctx.current_message_idx = ctx.app.messages.items.len - 1;
    }

    // Redraw screen to show progress
    _ = message_renderer.redrawScreen(ctx.app) catch return;
    ctx.app.updateCursorToBottom();
}

/// Build the list of tools advertised to the model.
/// Delegates to `tools_module.getOllamaTools`; its error is propagated.
fn createTools(allocator: mem.Allocator) ![]const ollama.Tool {
    const tool_defs = try tools_module.getOllamaTools(allocator);
    return tool_defs;
}

// Incremental rendering support structures

/// Layout record for one rendered message: where it sits vertically, how many
/// lines it takes, and a hash used to detect content changes.
pub const MessageRenderInfo = struct {
    // NOTE(review): presumably an index into `App.messages` — confirm against callers.
    message_index: usize,
    y_start: usize,           // Absolute Y position where message starts
    y_end: usize,             // Absolute Y position where message ends
    height: usize,            // Total lines this message occupies
    content_hash: u64,        // Hash of message content for change detection (includes expansion states)
};

/// Minimal render cache: remembers the last seen terminal dimensions so a
/// resize can be detected. Both dimensions start at zero.
pub const RenderCache = struct {
    last_terminal_width: u16 = 0,
    last_terminal_height: u16 = 0,

    /// Create a cache with both dimensions at their defaults (zero).
    pub fn init() RenderCache {
        return RenderCache{};
    }

    /// No-op: the cache owns no memory. Both arguments are ignored.
    pub fn deinit(_: *RenderCache, _: mem.Allocator) void {}
};


pub const App = struct {
    allocator: mem.Allocator,
    config: Config,
    messages: std.ArrayListUnmanaged(Message),
    llm_provider: llm_provider_module.LLMProvider,
    input_buffer: std.ArrayListUnmanaged(u8),
    clickable_areas: std.ArrayListUnmanaged(ClickableArea),
    scroll_y: usize = 0,
    cursor_y: usize = 1,
    terminal_size: ui.TerminalSize,
    valid_cursor_positions: std.ArrayListUnmanaged(usize),
    // Resize handling state
    resize_in_progress: bool = false,
    saved_expansion_states: std.ArrayListUnmanaged(bool),
    last_resize_time: i64 = 0,
    // Streaming state
    streaming_active: bool = false,
    stream_mutex: std.Thread.Mutex = .{},
    stream_chunks: std.ArrayListUnmanaged(StreamChunk) = .{},
    stream_thread: ?std.Thread = null,
    stream_thread_ctx: ?*StreamThreadContext = null,
    // Available tools for the model
    tools: []const ollama.Tool,
    // Tool execution state
    pending_tool_calls: ?[]ollama.ToolCall = null,
    tool_call_depth: usize = 0,
    max_tool_depth: usize = 15, // Max tools per iteration (increased for agentic tasks)
    // Permission system
    permission_manager: permission.PermissionManager,
    permission_pending: bool = false,
    permission_response: ?permission.PermissionMode = null, // Set by UI, consumed by tool_executor
    // Tool execution state machine
    tool_executor: tool_executor_module.ToolExecutor,
    // Phase 1: Task management state
    state: AppState,
    app_context: AppContext,
    max_iterations: usize = 10, // Master loop iteration limit
    // Auto-scroll state (receipt printer mode) - removed, now always auto-scrolls
    // Vector DB components (kept for future semantic search)
    vector_store: ?*zvdb.HNSW(f32) = null,
    embedder: ?*embedder_interface.Embedder = null, // Generic interface - works with both Ollama and LM Studio
    // Config editor state (modal mode)
    config_editor: ?config_editor_state.ConfigEditorState = null,
    // Agent system
    agent_registry: agents_module.AgentRegistry,
    agent_loader: agent_loader.AgentLoader,
    agent_builder: ?agent_builder_state.AgentBuilderState = null,
    // Help viewer state (modal mode)
    help_viewer: ?help_state.HelpState = null,
    // Profile manager state (modal mode)
    profile_ui: ?profile_ui_state.ProfileUIState = null,
    // Conversation persistence
    conversation_db: ?conversation_db_module.ConversationDB = null,
    current_conversation_id: ?i64 = null,

    // Incremental rendering state
    render_cache: RenderCache = RenderCache.init(),

    /// Build a fully initialized `App`.
    ///
    /// Steps, in order: create the tool list, set up the permission manager
    /// (registering tool metadata and loading saved policies, best-effort),
    /// create the LLM provider from `config`, load all agents, open the
    /// conversation database under `$HOME/.config/localharness` (falling back
    /// to the current directory when `HOME` is unset), create a new
    /// conversation, and append + persist the system prompt as message 0.
    ///
    /// `app_context` is left `undefined`; the caller MUST call
    /// `fixContextPointers` once the returned value is in its final location.
    ///
    /// Errors: any failure other than policy loading is propagated.
    pub fn init(allocator: mem.Allocator, config: Config) !App {
        const tools = try createTools(allocator);

        // Initialize permission manager
        var perm_manager = try permission.PermissionManager.init(allocator, ".", null); // No audit log by default
        const tool_metadata = try tools_module.getPermissionMetadata(allocator);
        defer allocator.free(tool_metadata);
        try perm_manager.registerTools(tool_metadata);

        // Load saved policies from disk
        config_module.loadPolicies(allocator, &perm_manager) catch |err| {
            // Log error but don't fail - just continue with default policies
            std.debug.print("Warning: Failed to load policies: {}\n", .{err});
        };

        // Vector database components reserved for future semantic search
        // Currently disabled - can be re-enabled for semantic code search
        const vector_store_opt: ?*zvdb.HNSW(f32) = null;
        const embedder_opt: ?*embedder_interface.Embedder = null;

        // Create LLM provider based on config
        const provider = try llm_provider_module.createProvider(config.provider, allocator, config);

        // Initialize agent system
        var agent_registry = agents_module.AgentRegistry.init(allocator);
        errdefer agent_registry.deinit();

        // NOTE(review): the loader receives the address of this stack-local
        // registry, and both are copied into the returned `App` below. If the
        // loader stores that pointer, it dangles once `init` returns — confirm
        // and re-point it after the App reaches its final location.
        var loader = agent_loader.AgentLoader.init(allocator, &agent_registry);
        errdefer loader.deinit();

        // Load all agents (native + markdown)
        try loader.loadAllAgents();

        var app = App{
            .allocator = allocator,
            .config = config,
            .messages = .{},
            .llm_provider = provider,
            .input_buffer = .{},
            .clickable_areas = .{},
            .terminal_size = try ui.Tui.getTerminalSize(),
            .valid_cursor_positions = .{},
            .saved_expansion_states = .{},
            .tools = tools,
            .permission_manager = perm_manager,
            .tool_executor = tool_executor_module.ToolExecutor.init(allocator),
            // Phase 1: Initialize state (session-ephemeral)
            .state = AppState.init(allocator),
            .app_context = undefined, // Will be fixed by caller after struct is in final location
            .vector_store = vector_store_opt,
            .embedder = embedder_opt,
            .agent_registry = agent_registry,
            .agent_loader = loader,
            .agent_builder = null,
        };

        // Initialize conversation database
        const home_dir = std.posix.getenv("HOME") orelse ".";
        const config_dir = try std.fmt.allocPrint(allocator, "{s}/.config/localharness", .{home_dir});
        defer allocator.free(config_dir);

        // Ensure the config directory exists. `makePath` creates missing
        // parents (e.g. a fresh account without `~/.config`), succeeds when
        // the directory is already there, and accepts the relative path
        // produced by the "." fallback above.
        try std.fs.cwd().makePath(config_dir);

        const db_path = try std.fmt.allocPrint(allocator, "{s}/conversations.db", .{config_dir});
        defer allocator.free(db_path);

        // Initialize database - fail fast if this fails
        const conv_db = try conversation_db_module.ConversationDB.init(allocator, db_path);
        app.conversation_db = conv_db;

        // Create conversation immediately on startup
        const profile_name = "default"; // TODO: Get from profile manager
        const conv_id = try app.conversation_db.?.createConversation(profile_name);
        app.current_conversation_id = conv_id;
        std.debug.print("Created new conversation: {}\n", .{conv_id});

        // Add system prompt (Position 0 - stable)
        const system_prompt = "You are a helpful coding assistant.";
        const system_processed = try markdown.processMarkdown(allocator, system_prompt);
        try app.messages.append(allocator, .{
            .role = .system,
            .content = try allocator.dupe(u8, system_prompt),
            .processed_content = system_processed,
            .thinking_expanded = true,
            .timestamp = std.time.milliTimestamp(),
        });

        // Persist system message immediately
        try app.persistMessage(app.messages.items.len - 1);

        return app;
    }

    /// Rebuild `app_context` so that its pointers refer to this `App`'s own
    /// fields. `init` leaves `app_context` undefined because the struct is
    /// returned by value; call this right after `init`, once the App is in
    /// its final location (main.zig).
    pub fn fixContextPointers(self: *App) void {
        const rebuilt: AppContext = .{
            .allocator = self.allocator,
            .config = &self.config,
            .state = &self.state,
            .llm_provider = &self.llm_provider,
            .vector_store = self.vector_store,
            .embedder = self.embedder,
            .agent_registry = &self.agent_registry,
        };
        self.app_context = rebuilt;
    }

    /// Save the message at `message_index` to the conversation database.
    ///
    /// Returns `error.NoConversation` when no conversation id is set and
    /// `error.DatabaseNotInitialized` when the database is absent; a failure
    /// from `saveMessage` is propagated unchanged.
    fn persistMessage(self: *App, message_index: usize) !void {
        // The conversation id is checked first, then the database handle.
        const conv_id = self.current_conversation_id orelse return error.NoConversation;
        const db = if (self.conversation_db) |*handle| handle else return error.DatabaseNotInitialized;

        const stored = &self.messages.items[message_index];
        _ = try db.saveMessage(conv_id, @intCast(message_index), stored);
    }

    /// Report whether the cursor sits on the last valid cursor position.
    /// When no valid positions are recorded the viewport counts as "at bottom".
    fn isViewportAtBottom(self: *App) bool {
        const positions = self.valid_cursor_positions.items;
        // `or` short-circuits, so the index below is only evaluated for a non-empty list.
        return positions.len == 0 or self.cursor_y == positions[positions.len - 1];
    }

    // Pre-calculate and apply scroll position to keep viewport anchored at bottom
    // This should be called BEFORE redrawScreen() to avoid flashing

    /// Move the cursor to the last valid cursor position (used after a redraw).
    /// Leaves the cursor untouched when no valid positions are recorded.
    pub fn updateCursorToBottom(self: *App) void {
        const positions = self.valid_cursor_positions.items;
        if (positions.len == 0) return;
        self.cursor_y = positions[positions.len - 1];
    }


    /// Background-thread entry point for one chat request.
    ///
    /// Calls the provider's `chatStream`, queueing every streamed fragment on
    /// `ctx.app.stream_chunks` (guarded by `ctx.app.stream_mutex`) and storing
    /// owned copies of any tool calls in `ctx.app.pending_tool_calls`.
    /// A first failure with `error.EndOfStream` or
    /// `error.ConnectionResetByPeer` is retried once after a 100 ms pause;
    /// failures are reported to the user as content chunks. A final chunk
    /// with `done = true` is queued whether or not the request succeeded.
    ///
    /// All text placed in a queued chunk is heap-allocated with
    /// `ctx.allocator`, matching the duplicated streaming fragments.
    fn streamingThreadFn(ctx: *StreamThreadContext) void {
        // Callback that adds chunks to the queue, plus its tool-call helpers
        const ChunkCallback = struct {
            /// Free the heap-owned fields of a single tool call.
            fn freeToolCall(allocator: std.mem.Allocator, call: ollama.ToolCall) void {
                if (call.id) |id| allocator.free(id);
                if (call.type) |t| allocator.free(t);
                allocator.free(call.function.name);
                allocator.free(call.function.arguments);
            }

            /// Deep-copy `calls`, filling in a generated id ("call_<index>")
            /// and the default type "function" where the model gave none.
            /// On failure every partial copy is released.
            fn cloneToolCalls(allocator: std.mem.Allocator, calls: []const ollama.ToolCall) ![]ollama.ToolCall {
                const owned = try allocator.alloc(ollama.ToolCall, calls.len);
                var filled: usize = 0;
                errdefer {
                    for (owned[0..filled]) |done_call| freeToolCall(allocator, done_call);
                    allocator.free(owned);
                }

                for (calls, 0..) |call, i| {
                    // Generate ID if not provided by model
                    const call_id = if (call.id) |given|
                        try allocator.dupe(u8, given)
                    else
                        try std.fmt.allocPrint(allocator, "call_{d}", .{i});
                    errdefer allocator.free(call_id);

                    // Use "function" as default type if not provided
                    const call_type = try allocator.dupe(u8, call.type orelse "function");
                    errdefer allocator.free(call_type);

                    const name = try allocator.dupe(u8, call.function.name);
                    errdefer allocator.free(name);

                    const arguments = try allocator.dupe(u8, call.function.arguments);

                    owned[i] = ollama.ToolCall{
                        .id = call_id,
                        .type = call_type,
                        .function = .{
                            .name = name,
                            .arguments = arguments,
                        },
                    };
                    filled = i + 1;
                }
                return owned;
            }

            fn callback(chunk_ctx: *StreamThreadContext, thinking_chunk: ?[]const u8, content_chunk: ?[]const u8, tool_calls_chunk: ?[]const ollama.ToolCall) void {
                const allocator = chunk_ctx.allocator;

                chunk_ctx.app.stream_mutex.lock();
                defer chunk_ctx.app.stream_mutex.unlock();

                // Free tool_calls_chunk after processing (we take ownership from ollama.zig)
                defer if (tool_calls_chunk) |calls| {
                    for (calls) |call| freeToolCall(allocator, call);
                    allocator.free(calls);
                };

                // Create a chunk and add to queue. A fragment whose copy
                // cannot be allocated is dropped (best effort).
                const chunk = StreamChunk{
                    .thinking = if (thinking_chunk) |t| allocator.dupe(u8, t) catch null else null,
                    .content = if (content_chunk) |c| allocator.dupe(u8, c) catch null else null,
                    .done = false,
                };
                chunk_ctx.app.stream_chunks.append(allocator, chunk) catch {
                    // The queue did not take ownership, so release the copies.
                    if (chunk.thinking) |t| allocator.free(t);
                    if (chunk.content) |c| allocator.free(c);
                    return;
                };

                // Store tool calls for execution after streaming completes
                if (tool_calls_chunk) |calls| {
                    // NOTE(review): a previously stored `pending_tool_calls`
                    // value is overwritten without being freed, as before —
                    // confirm only one tool-call chunk arrives per stream.
                    chunk_ctx.app.pending_tool_calls = cloneToolCalls(allocator, calls) catch return;
                }
            }
        };

        // Queues a user-visible status line built from an error name
        const StatusReporter = struct {
            /// Format `fmt` with the error's name and queue it as a content
            /// chunk. The text is always heap-allocated; if formatting or
            /// queueing fails the message is dropped rather than queueing a
            /// string literal in a field that otherwise holds owned memory.
            fn push(thread_ctx: *StreamThreadContext, comptime fmt: []const u8, err: anyerror) void {
                const text = std.fmt.allocPrint(thread_ctx.allocator, fmt, .{@errorName(err)}) catch return;

                thread_ctx.app.stream_mutex.lock();
                defer thread_ctx.app.stream_mutex.unlock();

                const status_chunk = StreamChunk{ .thinking = null, .content = text, .done = false };
                thread_ctx.app.stream_chunks.append(thread_ctx.allocator, status_chunk) catch {
                    thread_ctx.allocator.free(text);
                };
            }
        };

        // Get provider capabilities to check what's supported
        const caps = ctx.llm_provider.getCapabilities();

        // Only enable thinking if both config and provider support it
        const enable_thinking = ctx.app.config.enable_thinking and caps.supports_thinking;

        // Only pass keep_alive if provider supports it
        const keep_alive = if (caps.supports_keep_alive) ctx.keep_alive else null;

        // Run the streaming request; a stale connection on the first attempt
        // gets exactly one retry.
        // Note: Provider-level retry not implemented yet
        // Different providers may have different retry strategies
        var attempt: usize = 0;
        while (true) : (attempt += 1) {
            ctx.llm_provider.chatStream(
                ctx.model,
                ctx.messages,
                enable_thinking, // Capability-aware thinking mode
                ctx.format,
                if (ctx.tools.len > 0) ctx.tools else null, // Pass tools to model
                keep_alive, // Capability-aware keep_alive
                ctx.num_ctx,
                ctx.num_predict,
                null, // temperature - use model default for main chat
                null, // repeat_penalty - use model default for main chat
                ctx,
                ChunkCallback.callback,
            ) catch |err| {
                const first_attempt = attempt == 0;
                const stale_connection = err == error.EndOfStream or err == error.ConnectionResetByPeer;

                if (first_attempt and stale_connection) {
                    StatusReporter.push(ctx, "Connection failed: {s} - Retrying...", err);

                    // Small delay before retry
                    std.Thread.sleep(100 * std.time.ns_per_ms);
                    continue;
                }

                if (first_attempt) {
                    // Other errors - report directly to user
                    StatusReporter.push(ctx, "Connection error: {s}", err);
                } else {
                    // Second failure - report error to user
                    StatusReporter.push(ctx, "Failed to connect to Ollama: {s}", err);
                }
            };
            break;
        }

        // ALWAYS add a "done" chunk, even if chatStream failed
        // This ensures streaming_active gets set to false
        // NOTE(review): if this append itself fails the consumer never sees
        // `done` — unchanged from before; consider reserving capacity up front.
        ctx.app.stream_mutex.lock();
        defer ctx.app.stream_mutex.unlock();
        const done_chunk = StreamChunk{ .thinking = null, .content = null, .done = true };
        ctx.app.stream_chunks.append(ctx.allocator, done_chunk) catch return;
    }


    // Compress message history by replacing read_file results with Graph RAG summaries
    // REMOVED: GraphRAG compression no longer needed
    // Curator caching handles this better - instant cache hits for same conversation context

    /// Start a streaming request for the current message history.
    ///
    /// Copies every message except `display_only_data` ones into the request,
    /// appends an empty assistant placeholder message, redraws, and spawns
    /// the background thread running `streamingThreadFn`.
    ///
    /// - `format`: optional response format forwarded to the provider.
    ///
    /// On error `streaming_active` is reset to `false` and the request slice
    /// and thread context are released, so a failed start does not leave the
    /// UI stuck in the "responding" state.
    /// NOTE(review): if the redraw or thread spawn fails, the empty assistant
    /// placeholder that was already appended stays in `messages`.
    fn startStreaming(self: *App, format: ?[]const u8) !void {
        // Set streaming flag FIRST - before any redraws
        // This ensures the status bar shows "AI is responding..." immediately
        self.streaming_active = true;
        errdefer self.streaming_active = false;

        // Tool call depth is NOT reset here: sendMessage resets it for new
        // user messages, and tool-call continuations must keep their depth.

        // Copy messages to ollama_messages
        var ollama_messages = std.ArrayListUnmanaged(ollama.ChatMessage){};
        defer ollama_messages.deinit(self.allocator);

        for (self.messages.items) |msg| {
            // Skip display_only_data messages - they're UI-only notifications
            if (msg.role == .display_only_data) continue;

            const role_str = switch (msg.role) {
                .user => "user",
                .assistant => "assistant",
                .system => "system",
                .tool => "tool",
                .display_only_data => unreachable, // Already filtered above
            };
            try ollama_messages.append(self.allocator, .{
                .role = role_str,
                .content = msg.content,
                .tool_call_id = msg.tool_call_id,
                .tool_calls = msg.tool_calls,
            });
        }

        // DEBUG: Print what we're sending to the API
        if (std.posix.getenv("DEBUG_TOOLS")) |_| {
            std.debug.print("\n=== DEBUG: Sending {d} messages to API ===\n", .{ollama_messages.items.len});
            for (ollama_messages.items, 0..) |msg, i| {
                std.debug.print("[{d}] role={s}", .{ i, msg.role });
                if (msg.tool_calls) |_| std.debug.print(" [HAS_TOOL_CALLS]", .{});
                if (msg.tool_call_id) |id| std.debug.print(" [tool_call_id={s}]", .{id});
                std.debug.print("\n", .{});

                const preview_len = @min(msg.content.len, 80);
                std.debug.print("    content: {s}{s}\n", .{
                    msg.content[0..preview_len],
                    if (msg.content.len > 80) "..." else "",
                });
            }
            std.debug.print("=== END DEBUG ===\n\n", .{});
        }

        // Create placeholder for assistant response (empty initially).
        // The list slot is reserved first so the append cannot fail once the
        // placeholder's fields have been allocated.
        try self.messages.ensureUnusedCapacity(self.allocator, 1);
        const assistant_content = try self.allocator.dupe(u8, "");
        const assistant_processed = markdown.processMarkdown(self.allocator, assistant_content) catch |err| {
            self.allocator.free(assistant_content);
            return err;
        };
        self.messages.appendAssumeCapacity(.{
            .role = .assistant,
            .content = assistant_content,
            .processed_content = assistant_processed,
            .thinking_content = null,
            .processed_thinking_content = null,
            .thinking_expanded = true,
            .timestamp = std.time.milliTimestamp(),
        });

        // Redraw to show empty placeholder (receipt printer mode)
        _ = try message_renderer.redrawScreen(self);
        self.updateCursorToBottom();

        // Prepare thread context
        const messages_slice = try ollama_messages.toOwnedSlice(self.allocator);
        errdefer self.allocator.free(messages_slice);

        const thread_ctx = try self.allocator.create(StreamThreadContext);
        errdefer self.allocator.destroy(thread_ctx);
        thread_ctx.* = .{
            .allocator = self.allocator,
            .app = self,
            .llm_provider = &self.llm_provider,
            .model = self.config.model,
            .messages = messages_slice,
            .format = format,
            .tools = self.tools,
            .keep_alive = self.config.model_keep_alive,
            .num_ctx = self.config.num_ctx,
            .num_predict = self.config.num_predict,
        };

        // Start streaming in background thread
        self.stream_thread_ctx = thread_ctx;
        // Do not leave a pointer to the destroyed context if the spawn fails.
        errdefer self.stream_thread_ctx = null;
        self.stream_thread = try std.Thread.spawn(.{}, streamingThreadFn, .{thread_ctx});
    }

    // Send a user message and start streaming the model's response (non-blocking).
    //
    // Resets the per-turn counters, appends `user_text` to the conversation as a
    // `.user` message, persists and redraws it, then hands off to
    // `startStreaming`, which does the actual request.
    //
    // Parameters:
    //   user_text - text to send; it is copied, so the caller keeps ownership.
    //   format    - forwarded unchanged to `startStreaming`.
    //
    // Errors: propagates failures from allocation, markdown processing,
    // persistence, rendering and `startStreaming`. If the message cannot be
    // appended, the buffers created for it are released before returning.
    pub fn sendMessage(self: *App, user_text: []const u8, format: ?[]const u8) !void {
        // A new user message starts a fresh turn: reset the tool call depth and
        // the master-loop iteration count.
        self.tool_call_depth = 0;
        self.state.iteration_count = 0;

        // 1. Add user message.
        // The block limits the errdefers to the window before `self.messages`
        // takes ownership; after a successful append the buffers belong to the
        // message list and must not be freed here.
        {
            const user_content = try self.allocator.dupe(u8, user_text);
            errdefer self.allocator.free(user_content);

            var user_processed = try markdown.processMarkdown(self.allocator, user_content);
            errdefer {
                for (user_processed.items) |*item| item.deinit(self.allocator);
                user_processed.deinit(self.allocator);
            }

            try self.messages.append(self.allocator, .{
                .role = .user,
                .content = user_content,
                .processed_content = user_processed,
                .thinking_expanded = true,
                .timestamp = std.time.milliTimestamp(),
            });
        }

        // Persist user message immediately
        try self.persistMessage(self.messages.items.len - 1);

        // Show user message right away (receipt printer mode)
        _ = try message_renderer.redrawScreen(self);

        // 2. Start streaming
        try self.startStreaming(format);
    }

    // Queue a permission prompt for `tool_call` without blocking.
    //
    // Appends a `.display_only_data` message carrying a deep copy of the tool
    // call and of the policy evaluation, persists it, and then marks a
    // permission decision as pending. The decision itself is collected
    // elsewhere (the original note says the main loop handles the response).
    //
    // Parameters:
    //   tool_call   - the call awaiting approval; its strings are duplicated, so
    //                 the caller keeps ownership of the original.
    //   eval_result - policy verdict to attach; `reason` is duplicated.
    //
    // Errors: propagates allocation, markdown and persistence failures. Every
    // buffer allocated here is released again if the message cannot be appended.
    fn showPermissionPrompt(
        self: *App,
        tool_call: ollama.ToolCall,
        eval_result: permission.PolicyEngine.EvaluationResult,
    ) !void {
        // The block limits the errdefers to the window before `self.messages`
        // takes ownership of the allocations.
        {
            // Create permission request message
            const prompt_text = try std.fmt.allocPrint(
                self.allocator,
                "Permission requested for tool: {s}",
                .{tool_call.function.name},
            );
            errdefer self.allocator.free(prompt_text);

            var prompt_processed = try markdown.processMarkdown(self.allocator, prompt_text);
            errdefer {
                for (prompt_processed.items) |*item| item.deinit(self.allocator);
                prompt_processed.deinit(self.allocator);
            }

            // Duplicate the tool call piece by piece so that a failure part-way
            // through releases the pieces that were already copied.
            const stored_id = if (tool_call.id) |id| try self.allocator.dupe(u8, id) else null;
            errdefer if (stored_id) |id| self.allocator.free(id);

            const stored_type = if (tool_call.type) |t| try self.allocator.dupe(u8, t) else null;
            errdefer if (stored_type) |t| self.allocator.free(t);

            const stored_name = try self.allocator.dupe(u8, tool_call.function.name);
            errdefer self.allocator.free(stored_name);

            const stored_arguments = try self.allocator.dupe(u8, tool_call.function.arguments);
            errdefer self.allocator.free(stored_arguments);

            const stored_reason = try self.allocator.dupe(u8, eval_result.reason);
            errdefer self.allocator.free(stored_reason);

            // One clock reading so the message and its request agree exactly.
            const now = std.time.milliTimestamp();

            try self.messages.append(self.allocator, .{
                .role = .display_only_data,
                .content = prompt_text,
                .processed_content = prompt_processed,
                .thinking_expanded = false,
                .timestamp = now,
                .permission_request = .{
                    .tool_call = ollama.ToolCall{
                        .id = stored_id,
                        .type = stored_type,
                        .function = .{
                            .name = stored_name,
                            .arguments = stored_arguments,
                        },
                    },
                    .eval_result = .{
                        .allowed = eval_result.allowed,
                        .reason = stored_reason,
                        .ask_user = eval_result.ask_user,
                        .show_preview = eval_result.show_preview,
                    },
                    .timestamp = now,
                },
            });
        }

        // Persist permission request immediately
        try self.persistMessage(self.messages.items.len - 1);

        // Set permission pending state only once the prompt is stored
        // (non-blocking - the response is handled elsewhere).
        self.permission_pending = true;
        self.permission_response = null;
    }

    // Execute a tool call and return its result (Phase 1: passes AppContext).
    //
    // For the duration of the call, `self.app_context` is given:
    //   - `recent_messages`: a private copy of the last five messages (or all of
    //     them when the conversation is shorter), and
    //   - an agent progress callback plus its user data, so sub-agents can
    //     stream progress into the UI.
    // All three fields are reset to null on every exit path, including errors,
    // so the context never keeps a pointer to the freed copy or to this
    // function's stack frame.
    //
    // Errors: propagates allocation failures and whatever
    // `tools_module.executeToolCall` returns.
    fn executeTool(self: *App, tool_call: ollama.ToolCall) !tools_module.ToolResult {
        // Last 5 messages; saturating subtraction yields 0 for short conversations.
        const start_idx = self.messages.items.len -| 5;

        // IMPORTANT: hand out a COPY of the slice. During tool execution
        // self.messages may grow and reallocate its backing buffer, which would
        // invalidate a slice pointing into the old buffer. The copy is shallow:
        // only the array of Message values is duplicated.
        const messages_copy = try self.allocator.dupe(types.Message, self.messages.items[start_idx..]);
        defer self.allocator.free(messages_copy);

        self.app_context.recent_messages = messages_copy;
        // Declared after the free above, so it runs first: the context stops
        // referencing the copy before the copy is released.
        defer self.app_context.recent_messages = null;

        // Set up agent progress streaming for sub-agents (like file curator)
        var agent_progress_ctx = ProgressDisplayContext{
            .app = self,
            .task_name = try self.allocator.dupe(u8, "Agent Analysis"), // Generic default (will be updated by run_agent tool)
            .task_icon = "🤔", // Default icon for file analysis
            .start_time = std.time.milliTimestamp(), // Start tracking execution time
        };
        defer agent_progress_ctx.thinking_buffer.deinit(self.allocator);
        defer agent_progress_ctx.content_buffer.deinit(self.allocator);
        // Frees whatever task_name holds at exit, not necessarily the default
        // allocated above (the note there says the run_agent tool replaces it).
        defer self.allocator.free(agent_progress_ctx.task_name);

        self.app_context.agent_progress_callback = agentProgressCallback;
        self.app_context.agent_progress_user_data = &agent_progress_ctx;
        // Runs before the buffer cleanups above, so the callback is detached
        // before its user data is torn down.
        defer {
            self.app_context.agent_progress_callback = null;
            self.app_context.agent_progress_user_data = null;
        }

        // Execute tool with conversation context and progress streaming.
        // Note: the progress message is kept as a permanent "Agent Analysis"
        // message; per the original note it is finalized by the progress callback.
        return try tools_module.executeToolCall(self.allocator, tool_call, &self.app_context);
    }


    // Tear down the App and release everything it owns.
    //
    // Joins the streaming thread first, then frees stream state, every stored
    // message with its attachments, the tool definitions, any pending tool
    // calls, the subsystems (permissions, tool executor, state, Graph RAG,
    // modal UIs, conversation database, agents, render cache) and finally the
    // config. The App must not be used after this returns.
    pub fn deinit(self: *App) void {
        const alloc = self.allocator;

        // GraphRAG indexing queue removed - context queue handles async tasks now

        // Block until the streaming thread (if any) has finished.
        if (self.stream_thread) |worker| worker.join();

        // The thread context owns only the messages array; each entry's role and
        // content point at existing message data and are not freed here.
        if (self.stream_thread_ctx) |stream_ctx| {
            alloc.free(stream_ctx.messages);
            alloc.destroy(stream_ctx);
        }

        // Stream chunks own their optional text buffers.
        for (self.stream_chunks.items) |pending| {
            if (pending.thinking) |text| alloc.free(text);
            if (pending.content) |text| alloc.free(text);
        }
        self.stream_chunks.deinit(alloc);

        // Conversation messages and everything attached to them.
        for (self.messages.items) |*entry| {
            alloc.free(entry.content);
            for (entry.processed_content.items) |*block| block.deinit(alloc);
            entry.processed_content.deinit(alloc);

            // Thinking content, raw and processed, when present.
            if (entry.thinking_content) |raw_thinking| alloc.free(raw_thinking);
            if (entry.processed_thinking_content) |*rendered_thinking| {
                for (rendered_thinking.items) |*block| block.deinit(alloc);
                rendered_thinking.deinit(alloc);
            }

            // Tool calling fields.
            if (entry.tool_calls) |owned_calls| {
                for (owned_calls) |tc| {
                    if (tc.id) |call_id| alloc.free(call_id);
                    if (tc.type) |kind| alloc.free(kind);
                    alloc.free(tc.function.name);
                    alloc.free(tc.function.arguments);
                }
                alloc.free(owned_calls);
            }
            if (entry.tool_call_id) |call_id| alloc.free(call_id);

            // Permission request: stored tool call copy plus the policy reason.
            if (entry.permission_request) |request| {
                if (request.tool_call.id) |call_id| alloc.free(call_id);
                if (request.tool_call.type) |kind| alloc.free(kind);
                alloc.free(request.tool_call.function.name);
                alloc.free(request.tool_call.function.arguments);
                alloc.free(request.eval_result.reason);
            }

            // Tool execution and agent analysis metadata.
            if (entry.tool_name) |tool_name| alloc.free(tool_name);
            if (entry.agent_analysis_name) |analysis_name| alloc.free(analysis_name);
        }
        self.messages.deinit(alloc);

        self.llm_provider.deinit();
        self.input_buffer.deinit(alloc);
        self.clickable_areas.deinit(alloc);
        self.valid_cursor_positions.deinit(alloc);
        self.saved_expansion_states.deinit(alloc);

        // Tool definitions: three owned strings each, then the array itself.
        for (self.tools) |tool_def| {
            alloc.free(tool_def.function.name);
            alloc.free(tool_def.function.description);
            alloc.free(tool_def.function.parameters);
        }
        alloc.free(self.tools);

        // Tool calls that were still waiting when the app shut down.
        if (self.pending_tool_calls) |queued_calls| {
            for (queued_calls) |tc| {
                if (tc.id) |call_id| alloc.free(call_id);
                if (tc.type) |kind| alloc.free(kind);
                alloc.free(tc.function.name);
                alloc.free(tc.function.arguments);
            }
            alloc.free(queued_calls);
        }

        // Permission manager, tool executor, then Phase 1 state.
        self.permission_manager.deinit();
        self.tool_executor.deinit();
        self.state.deinit();

        // Graph RAG components (session-only, not persisted).
        if (self.vector_store) |store| {
            store.deinit();
            alloc.destroy(store);
        }

        if (self.embedder) |wrapper| {
            // Release the underlying client first; every variant is torn down
            // the same way, so one inlined prong covers them all.
            switch (wrapper.*) {
                inline else => |client| {
                    client.deinit();
                    alloc.destroy(client);
                },
            }
            // Then the embedder wrapper itself.
            alloc.destroy(wrapper);
        }

        // Modal UI components, each only if currently active.
        if (self.config_editor) |*editor| editor.deinit();
        if (self.agent_builder) |*builder| builder.deinit();
        if (self.help_viewer) |*viewer| viewer.deinit();
        if (self.profile_ui) |*profile_ui| profile_ui.deinit();

        // Conversation database.
        if (self.conversation_db) |*db| db.deinit();

        // Agent system.
        self.agent_loader.deinit();
        self.agent_registry.deinit();

        // Incremental rendering state.
        self.render_cache.deinit(alloc);

        // Config goes last (App owns it).
        self.config.deinit(alloc);
    }


    pub fn run(self: *App, app_tui: *ui.Tui) !void {
        _ = app_tui; // Will be used later for editor integration

        // Buffers for accumulating stream data
        var thinking_accumulator = std.ArrayListUnmanaged(u8){};
        defer thinking_accumulator.deinit(self.allocator);
        var content_accumulator = std.ArrayListUnmanaged(u8){};
        defer content_accumulator.deinit(self.allocator);

        while (true) {
            // CONFIG EDITOR MODE (modal - takes priority over normal app)
            if (self.config_editor) |*editor| {
                // Render editor (renderer will clear screen)
                var stdout_buffer: [8192]u8 = undefined;
                var buffered_writer = ui.BufferedStdoutWriter.init(&stdout_buffer);
                const writer = buffered_writer.writer();

                try config_editor_renderer.render(
                    editor,
                    writer,
                    self.terminal_size.width,
                    self.terminal_size.height,
                );
                try buffered_writer.flush();

                // Wait for input (blocking)
                var read_buffer: [128]u8 = undefined;
                const bytes_read = ui.c.read(ui.c.STDIN_FILENO, &read_buffer, read_buffer.len);

                if (bytes_read > 0) {
                    const input = read_buffer[0..@intCast(bytes_read)];
                    const result = try config_editor_input.handleInput(editor, input);