// Source: agentcore/loop.go at main · voocel/agentcore (GitHub)

package agentcore

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"maps"
	"math"
	"slices"
	"strconv"
	"strings"
	"time"
)

// Loop defaults.
//
//   - defaultMaxTurns: turn cap used by runLoop when LoopConfig.MaxTurns <= 0.
//   - defaultMaxLengthRecoveries: upper bound on automatic "output truncated"
//     recovery prompts injected by runLoop during one run.
//   - defaultMaxRetryDelay: ceiling applied by retryDelay to both Retry-After
//     hints and exponential backoff.
//   - defaultMaxRetries, defaultMaxToolErrors: NOTE(review): not referenced in
//     the visible part of this file — presumably applied where LoopConfig is
//     defaulted; confirm against the config code.
const (
	defaultMaxTurns            = 100
	defaultMaxRetries          = 3
	defaultMaxLengthRecoveries = 3
	defaultMaxToolErrors       = 3
	defaultMaxRetryDelay       = 60 * time.Second
)

// defaultLengthRecoveryPrompt is the user message runLoop injects after a
// length-truncated response when LoopConfig.LengthRecoveryPrompt is empty.
const defaultLengthRecoveryPrompt = "Output token limit hit. Resume directly - no apology, no recap of what you were doing. Pick up mid-thought if that is where the cut happened. Break remaining work into smaller pieces."

// Abort marker texts used by runLoop when the run is cancelled and the
// corresponding LoopConfig.AbortMarkerText / AbortMarkerToolText is empty.
// The "tool use" variant is chosen when cancellation is observed after tool
// execution has run in the current outer-loop pass.
const (
	defaultAbortMarkerText     = "[Request interrupted by user]"
	defaultAbortMarkerToolText = "[Request interrupted by user for tool use]"
)

// AgentLoop launches a run seeded with new prompt messages and returns the
// channel on which the run's events are delivered.
//
// Each prompt is announced with message start/end events and committed to the
// run's private copy of the context before the main loop takes over. The
// caller's agentCtx.Messages slice is copied, not shared.
//
// Consumption contract: read the returned channel until it closes. While the
// run is live the loop blocks on a full channel (backpressure, no event
// loss), so walking away from the channel without canceling ctx leaks the
// loop goroutine. To stop early, cancel ctx and keep draining — after
// cancellation delivery degrades to best-effort and the loop is guaranteed to
// exit and close the channel even if no one is reading.
func AgentLoop(ctx context.Context, prompts []AgentMessage, agentCtx AgentContext, config LoopConfig) <-chan Event {
	events := make(chan Event, 128)
	sink := eventSink{ctx: ctx, ch: events}

	go func() {
		// Deferred calls run last-in-first-out: the panic recovery emits its
		// terminal event first, then the channel is closed.
		defer close(events)
		defer recoverToEnd(sink)

		runCtx := AgentContext{
			SystemPrompt: agentCtx.SystemPrompt,
			SystemBlocks: agentCtx.SystemBlocks,
			Messages:     copyMessages(agentCtx.Messages),
			Tools:        agentCtx.Tools,
		}
		var produced []AgentMessage

		sink.emit(Event{Type: EventAgentStart})
		sink.emit(Event{Type: EventTurnStart})

		// Prompts are complete on arrival, so start and end are emitted
		// back-to-back before the message enters the context.
		for i := range prompts {
			prompt := prompts[i]
			sink.emit(Event{Type: EventMessageStart, Message: prompt})
			sink.emit(Event{Type: EventMessageEnd, Message: prompt})
			commitMessage(&runCtx, &produced, config, prompt)
		}

		runLoop(ctx, &runCtx, &produced, config, sink)
	}()

	return events
}

// recoverToEnd converts a panic in the loop goroutine into a terminal error
// event rather than letting the channel close silently. Observers rely on the
// end-of-run event being emitted on every termination path (for example to
// mark an agent as stopped), panics included.
//
// It must be installed with a plain `defer recoverToEnd(sink)` registered
// AFTER `defer close(ch)`, so that it runs before the close and its event can
// still be delivered. recover is called directly here because it only has an
// effect when invoked by the deferred function itself.
func recoverToEnd(sink eventSink) {
	panicked := recover()
	if panicked == nil {
		return
	}
	summary := &RunSummary{EndReason: EndReasonError}
	sink.emitError(fmt.Errorf("agent loop panicked: %v", panicked), summary)
}

// AgentLoopContinue resumes a run from an existing context without adding any
// new messages. The last message in the context must convert to a user or
// tool role via ConvertToLLM.
//
// An empty context cannot be continued: in that case the returned channel
// carries a single ErrNoMessages error report and is then closed.
//
// The returned channel follows the same consumption contract as AgentLoop:
// drain it until it closes, or cancel ctx and keep draining to stop early.
func AgentLoopContinue(ctx context.Context, agentCtx AgentContext, config LoopConfig) <-chan Event {
	events := make(chan Event, 128)
	sink := eventSink{ctx: ctx, ch: events}

	if len(agentCtx.Messages) == 0 {
		// Report from a goroutine so the caller always gets the channel back
		// before any event is sent.
		go func() {
			defer close(events)
			summary := &RunSummary{EndReason: EndReasonError}
			sink.emitError(ErrNoMessages, summary)
		}()
		return events
	}

	go func() {
		// LIFO defers: panic recovery emits first, then the channel closes.
		defer close(events)
		defer recoverToEnd(sink)

		runCtx := AgentContext{
			SystemPrompt: agentCtx.SystemPrompt,
			SystemBlocks: agentCtx.SystemBlocks,
			Messages:     copyMessages(agentCtx.Messages),
			Tools:        agentCtx.Tools,
		}
		var produced []AgentMessage

		sink.emit(Event{Type: EventAgentStart})
		sink.emit(Event{Type: EventTurnStart})

		runLoop(ctx, &runCtx, &produced, config, sink)
	}()

	return events
}

// commitMessage is the one place where a message enters the agent context.
// The message is appended to the runtime context and to the list of messages
// produced by this run, after which the optional OnMessage hook is notified.
func commitMessage(currentCtx *AgentContext, newMessages *[]AgentMessage, config LoopConfig, msg AgentMessage) {
	*newMessages, currentCtx.Messages = append(*newMessages, msg), append(currentCtx.Messages, msg)

	if notify := config.OnMessage; notify != nil {
		notify(msg)
	}
}

// runLoop is the main double-loop logic shared by AgentLoop and AgentLoopContinue.
//
// Structure: the inner loop performs one LLM turn per iteration (inject
// pending messages, call the model, run any requested tools, commit results)
// and keeps going while the last turn requested tools or messages are waiting
// to be injected. The outer loop restarts the inner loop when follow-up
// messages arrive or the StopGuard vetoes the stop.
//
// Every return path emits a terminal end-of-run event carrying the messages
// committed during this run and a RunSummary; error paths additionally emit
// an error event first (directly or through sink.emitError).
//
// Core loop contracts:
//   - Streamed tool-call lifecycle signals are authoritative; stop reasons are
//     only hints and must not be used as the sole source of tool state.
//   - Tool results are appended to context only after the assistant message
//     that requested them, even when execution started during streaming.
//   - Once a streamed tool call has completed, the turn is no longer retried
//     automatically; replay could duplicate side effects.
//   - Steering stops not-yet-started tools. Started tools follow their
//     InterruptBehavior and may continue or be cancelled.
func runLoop(ctx context.Context, currentCtx *AgentContext, newMessages *[]AgentMessage, config LoopConfig, sink eventSink) {
	// Run-wide counters reported in the RunSummary of the terminal event.
	type runSummaryState struct {
		toolCalls  int
		toolErrors int
	}
	summaryState := runSummaryState{}
	// buildSummary snapshots the counters at the moment the run terminates.
	buildSummary := func(turnCount int, reason EndReason) *RunSummary {
		return &RunSummary{
			TurnCount:  turnCount,
			ToolCalls:  summaryState.toolCalls,
			ToolErrors: summaryState.toolErrors,
			EndReason:  reason,
		}
	}

	// A non-positive MaxTurns means "use the default cap".
	maxTurns := config.MaxTurns
	if maxTurns <= 0 {
		maxTurns = defaultMaxTurns
	}

	// firstTurn suppresses the turn-start event on the first iteration: the
	// callers (AgentLoop / AgentLoopContinue) have already emitted it.
	firstTurn := true
	turnCount := 0
	lengthRecoveryCount := 0
	toolErrors := make(map[string]int) // consecutive failure count per tool

	// Check for steering messages at start
	var pendingMessages []AgentMessage
	if config.GetSteeringMessages != nil {
		pendingMessages = config.GetSteeringMessages()
	}

	// Track last assistant message so StopGuard can inspect what's stopping us.
	var lastAssistantMsg Message

	// Outer loop: continues when follow-up messages arrive after agent would stop
	for {
		// Start optimistic so the inner loop runs at least one turn.
		hasMoreToolCalls := true
		var steeringAfterTools []AgentMessage
		// afterToolExec selects the abort-marker wording/phase once tools
		// have run in this outer-loop pass.
		afterToolExec := false

		// Inner loop: process tool calls and steering messages
		for hasMoreToolCalls || len(pendingMessages) > 0 {
			// Check for context cancellation (Abort)
			if ctx.Err() != nil {
				if config.ShouldEmitAbortMarker != nil && config.ShouldEmitAbortMarker() {
					phase := "inference"
					text := config.AbortMarkerText
					if text == "" {
						text = defaultAbortMarkerText
					}
					if afterToolExec {
						phase = "tool_execution"
						text = config.AbortMarkerToolText
						if text == "" {
							text = defaultAbortMarkerToolText
						}
					}
					// Only a message-end event is emitted for the marker.
					abortMsg := AbortMsg(text, phase)
					sink.emit(Event{Type: EventMessageEnd, Message: abortMsg})
					commitMessage(currentCtx, newMessages, config, abortMsg)
				}
				sink.emit(Event{Type: EventError, Err: ctx.Err()})
				sink.emit(Event{Type: EventAgentEnd, NewMessages: *newMessages, Summary: buildSummary(turnCount, EndReasonAborted)})
				return
			}

			// Hard cap on the number of turns per run.
			if turnCount >= maxTurns {
				sink.emit(Event{Type: EventError, Err: &MaxTurnsError{Limit: maxTurns}})
				sink.emit(Event{Type: EventAgentEnd, NewMessages: *newMessages, Summary: buildSummary(turnCount, EndReasonMaxTurns)})
				return
			}

			if !firstTurn {
				sink.emit(Event{Type: EventTurnStart})
			} else {
				firstTurn = false
			}

			// Process pending messages (inject before next LLM call)
			if len(pendingMessages) > 0 {
				for _, msg := range pendingMessages {
					sink.emit(Event{Type: EventMessageStart, Message: msg})
					sink.emit(Event{Type: EventMessageEnd, Message: msg})
					commitMessage(currentCtx, newMessages, config, msg)
				}
				pendingMessages = nil
			}

			// streamedTools is created lazily by the hook the first time the
			// stream reports a completed tool call, so tool execution can
			// start while the model is still streaming.
			var streamedTools *turnToolExecutor
			hooks := llmCallHooks{
				OnToolCallComplete: func(call ToolCall) {
					if streamedTools == nil {
						streamedTools = newTurnToolExecutor(ctx, currentCtx.Tools, config, toolErrors, sink)
					}
					streamedTools.Add(call)
				},
			}

			// Reset hook for retries: abort any tool executions started by the
			// failed attempt and clear the executor so the next attempt sees
			// a fresh slate (the OnToolCallComplete closure above re-creates it).
			// Only used when ToolsAreIdempotent is set; without it, retries
			// after a streamed tool_call are skipped entirely.
			resetTurnState := func() {
				if streamedTools != nil {
					streamedTools.AbortAndWait()
					streamedTools = nil
				}
			}

			// Call LLM with retry (streaming: events emitted inside callLLM)
			assistantMsg, callInfo, err := callLLMWithRetry(ctx, currentCtx, config, sink, hooks, resetTurnState)
			if err != nil {
				// Tear down any executor started during the failed call
				// before leaving.
				if streamedTools != nil {
					streamedTools.AbortAndWait()
				}
				// A cancelled context takes precedence over the call error:
				// the run ends as aborted, not as failed.
				if ctx.Err() != nil {
					if config.ShouldEmitAbortMarker != nil && config.ShouldEmitAbortMarker() {
						text := config.AbortMarkerText
						if text == "" {
							text = defaultAbortMarkerText
						}
						abortMsg := AbortMsg(text, "inference")
						sink.emit(Event{Type: EventMessageEnd, Message: abortMsg})
						commitMessage(currentCtx, newMessages, config, abortMsg)
					}
					sink.emit(Event{Type: EventError, Err: ctx.Err()})
					sink.emit(Event{Type: EventAgentEnd, NewMessages: *newMessages, Summary: buildSummary(turnCount, EndReasonAborted)})
					return
				}
				sink.emitError(fmt.Errorf("llm call failed: %w", err), buildSummary(turnCount, EndReasonError))
				return
			}

			// Check stop reason — terminate early on error/aborted
			if assistantMsg.StopReason == StopReasonError || assistantMsg.StopReason == StopReasonAborted {
				// A custom ChatModel may report Error/Aborted on a message whose
				// stream already completed tool calls (the bundled litellm
				// adapter never does, but the kernel is model-agnostic). Drain the
				// executor so its child ctx and goroutines don't leak past this
				// early exit — every other exit that can hold streamedTools does.
				if streamedTools != nil {
					streamedTools.AbortAndWait()
				}
				// The message is still committed and counted as a turn so the
				// history reflects what the model returned.
				commitMessage(currentCtx, newMessages, config, assistantMsg)
				sink.emit(Event{Type: EventModelResponse, Message: assistantMsg})
				turnCount++
				reason := EndReasonError
				if assistantMsg.StopReason == StopReasonAborted {
					reason = EndReasonAborted
				}
				sink.emit(Event{Type: EventAgentEnd, NewMessages: *newMessages, Summary: buildSummary(turnCount, reason)})
				return
			}

			// When output was truncated (max_tokens hit), tool calls are likely
			// incomplete with malformed JSON args. Strip them to avoid validation
			// errors and API rejections.
			if assistantMsg.StopReason == StopReasonLength && !callInfo.HasCompletedToolCalls {
				assistantMsg.Content = stripToolCallBlocks(assistantMsg.Content)
			}

			lastAssistantMsg = assistantMsg
			commitMessage(currentCtx, newMessages, config, assistantMsg)

			// Check for tool calls
			toolCalls := assistantMsg.ToolCalls()
			summaryState.toolCalls += len(toolCalls)
			hasMoreToolCalls = len(toolCalls) > 0
			// Recover when output was truncated and no tool calls completed.
			// This includes the case where tool call blocks existed but were
			// stripped due to incomplete JSON — the tool was never executed,
			// so recovery is safe. The recovery prompt tells the model to
			// "break remaining work into smaller pieces."
			shouldRecoverLength := assistantMsg.StopReason == StopReasonLength &&
				len(toolCalls) == 0 &&
				!callInfo.HasCompletedToolCalls &&
				lengthRecoveryCount < defaultMaxLengthRecoveries

			var turnToolResults []ToolResult
			if hasMoreToolCalls {
				var steering []AgentMessage
				// Prefer results from the executor that was fed during
				// streaming; otherwise execute the calls now.
				if callInfo.HasCompletedToolCalls && streamedTools != nil {
					turnToolResults, steering = streamedTools.Wait()
				} else {
					turnToolResults, steering = executeToolCalls(ctx, currentCtx.Tools, toolCalls, config, toolErrors, sink)
				}
				afterToolExec = true

				// Tool results enter the context after the assistant message
				// that requested them (committed above).
				for _, tr := range turnToolResults {
					resultMsg := toolResultToMessage(tr)
					sink.emit(Event{Type: EventMessageStart, Message: resultMsg})
					sink.emit(Event{Type: EventMessageEnd, Message: resultMsg})
					commitMessage(currentCtx, newMessages, config, resultMsg)
				}

				steeringAfterTools = steering
			}
			// Tally failed tool results for the run summary.
			for _, tr := range turnToolResults {
				if tr.IsError {
					summaryState.toolErrors++
				}
			}

			sink.emit(Event{Type: EventModelResponse, Message: assistantMsg, ToolResults: turnToolResults})
			turnCount++

			// Early exit: a terminal tool completed successfully. This is a
			// normal stop, so it passes through the same StopGuard gate as
			// end_turn — guards stay the single stop arbiter and can veto a
			// premature terminal-tool exit (Trigger distinguishes the paths).
			if stopAfterToolHit(config, turnToolResults) {
				inject, escalate := consultStopGuard(ctx, config, StopInfo{
					TurnIndex: turnCount,
					Message:   lastAssistantMsg,
					Trigger:   StopTriggerAfterTool,
				})
				if escalate {
					sink.emit(Event{Type: EventError, Err: ErrStopGuard})
					sink.emit(Event{Type: EventAgentEnd, NewMessages: *newMessages, Summary: buildSummary(turnCount, EndReasonError)})
					return
				}
				if inject == "" {
					sink.emit(Event{Type: EventAgentEnd, NewMessages: *newMessages, Summary: buildSummary(turnCount, EndReasonStop)})
					return
				}
				// Guard vetoed the early exit: keep the loop alive with the
				// injected message, carrying any steering captured during this
				// terminal-tool turn so a follow-up tool turn can't drop the
				// already-dequeued steering.
				pendingMessages = append([]AgentMessage{UserMsg(inject)}, steeringAfterTools...)
				steeringAfterTools = nil
				continue
			}

			// Truncated output with nothing executed: nudge the model to
			// resume, bounded by defaultMaxLengthRecoveries per run.
			if shouldRecoverLength {
				lengthRecoveryCount++
				prompt := config.LengthRecoveryPrompt
				if prompt == "" {
					prompt = defaultLengthRecoveryPrompt
				}
				pendingMessages = []AgentMessage{UserMsg(prompt)}
				continue
			}

			// Get steering messages after turn completes
			if len(steeringAfterTools) > 0 {
				pendingMessages = steeringAfterTools
				steeringAfterTools = nil
			} else if config.GetSteeringMessages != nil {
				pendingMessages = config.GetSteeringMessages()
			}
		}

		// Agent would stop here. Check for follow-up messages.
		if config.GetFollowUpMessages != nil {
			followUp := config.GetFollowUpMessages()
			if len(followUp) > 0 {
				pendingMessages = followUp
				continue
			}
		}

		// StopGuard veto: give the application a chance to keep the loop alive.
		inject, escalate := consultStopGuard(ctx, config, StopInfo{
			TurnIndex: turnCount,
			Message:   lastAssistantMsg,
			Trigger:   StopTriggerEndTurn,
		})
		if escalate {
			sink.emit(Event{Type: EventError, Err: ErrStopGuard})
			sink.emit(Event{Type: EventAgentEnd, NewMessages: *newMessages, Summary: buildSummary(turnCount, EndReasonError)})
			return
		}
		if inject != "" {
			pendingMessages = []AgentMessage{UserMsg(inject)}
			continue
		}

		break
	}

	// Normal completion: nothing left to run and the stop was allowed.
	sink.emit(Event{Type: EventAgentEnd, NewMessages: *newMessages, Summary: buildSummary(turnCount, EndReasonStop)})
}

// stopAfterToolHit reports whether at least one successful tool result from
// this turn is matched by the StopAfterTool or StopAfterToolResult hook.
// Failed results never trigger a stop. For each result the name-only hook is
// consulted before the name+content hook, and scanning ends at the first
// match.
func stopAfterToolHit(config LoopConfig, results []ToolResult) bool {
	byName, byResult := config.StopAfterTool, config.StopAfterToolResult
	if byName == nil && byResult == nil {
		return false
	}

	for i := range results {
		res := &results[i]
		if res.IsError {
			continue
		}
		switch {
		case byName != nil && byName(res.ToolName):
			return true
		case byResult != nil && byResult(res.ToolName, res.Content):
			return true
		}
	}
	return false
}

// consultStopGuard asks the configured StopGuard what to do at a point where
// the run would otherwise stop.
//
// Outcomes:
//   - escalate == true: the run must end with ErrStopGuard.
//   - inject != "": the stop is vetoed; the loop continues with that message.
//   - both zero: the stop goes ahead. This covers the no-guard case, an
//     explicit allow, and a denial that carries no InjectMessage — a guard
//     can never stall the loop silently.
func consultStopGuard(ctx context.Context, config LoopConfig, info StopInfo) (inject string, escalate bool) {
	guard := config.StopGuard
	if guard == nil {
		return "", false
	}

	switch verdict := guard(ctx, info); {
	case verdict.Escalate:
		return "", true
	case verdict.Allow || verdict.InjectMessage == "":
		return "", false
	default:
		return verdict.InjectMessage, false
	}
}

// llmCallHooks bundles the callbacks that callLLMStream invokes while it
// consumes a model stream.
type llmCallHooks struct {
	// OnToolCallComplete, when non-nil, is invoked for every stream end
	// event that carries a completed tool call. runLoop uses it to hand the
	// call to the turn's tool executor while the stream is still running.
	OnToolCallComplete func(ToolCall)
}

// llmCallInfo reports facts about one LLM call back to the retry logic and
// the run loop.
type llmCallInfo struct {
	// HasCompletedToolCalls is set by callLLMStream as soon as the stream
	// delivers at least one completed tool call. callLLMWithRetry uses it to
	// decide whether a retry is safe; runLoop uses it to decide whether
	// truncated output may be stripped and recovered.
	HasCompletedToolCalls bool
}

// callLLMWithRetry wraps callLLM with retry logic for retryable errors.
// Context overflow errors trigger automatic compaction and a single retry.
//
// With config.MaxRetries <= 0 exactly one attempt is made (plus the overflow
// recovery, if applicable). Otherwise up to MaxRetries+1 attempts are made,
// separated by the delay computed by retryDelay; an EventRetry is emitted
// before each wait.
//
// resetTurnState, when non-nil, is invoked after a retryable failure and before
// the next attempt. Callers use it to abort and discard any tool executions
// already kicked off via the streaming hook in the failed attempt, so the
// retried attempt can re-execute the tool calls cleanly. Only invoked when
// the failure path actually decides to retry (i.e. the error is retryable
// and the attempt cap has not been reached).
//
// On failure the returned Message is the zero value; the llmCallInfo is the
// one reported by the last attempt.
func callLLMWithRetry(ctx context.Context, agentCtx *AgentContext, config LoopConfig, sink eventSink, hooks llmCallHooks, resetTurnState func()) (Message, llmCallInfo, error) {
	maxRetries := config.MaxRetries
	if maxRetries <= 0 {
		msg, info, err := callLLM(ctx, agentCtx, config, sink, hooks)
		if err != nil && IsContextOverflow(err) {
			return recoverOverflow(ctx, agentCtx, config, sink, err, hooks)
		}
		return msg, info, err
	}

	var lastErr error
	var lastInfo llmCallInfo
	for attempt := 0; attempt <= maxRetries; attempt++ {
		msg, info, err := callLLM(ctx, agentCtx, config, sink, hooks)
		if err == nil {
			return msg, info, nil
		}
		lastErr = err
		lastInfo = info

		// Context overflow: compact and retry once (not a normal retry)
		if IsContextOverflow(err) {
			return recoverOverflow(ctx, agentCtx, config, sink, err, hooks)
		}

		// Once streamed tool calls have already completed, retrying this turn
		// risks duplicating side effects from tools that already started — unless
		// the caller has declared its tools idempotent, in which case we abort
		// the in-flight executions and retry the whole turn cleanly.
		if info.HasCompletedToolCalls && !config.ToolsAreIdempotent {
			return Message{}, info, err
		}

		// User cancellation is never retryable: the next attempt would just
		// rediscover ctx.Done(), and emitting EventRetry in that window surfaces
		// confusing "retry (1/N)" messages to users who already aborted. Aligns
		// with IsFailoverEligible (errors.go), which also treats
		// context.Canceled as terminal.
		if errors.Is(err, context.Canceled) {
			return Message{}, info, err
		}

		// PartialStreamError (stream closed without done) is treated as retryable:
		// it most often signals a transient network/provider stream-format issue
		// that a fresh request can recover from. The HasCompletedToolCalls guard
		// above already prevents retrying after a tool side-effect has fired.
		var pse *PartialStreamError
		retryable := isRetryable(err) || errors.As(err, &pse)
		if !retryable || attempt == maxRetries {
			return Message{}, info, err
		}

		// Discard any tool executions started during the failed attempt before
		// the next retry — otherwise the streaming hook would Add a second copy
		// of the same tool_call onto the same executor on the next callLLM.
		if resetTurnState != nil {
			resetTurnState()
		}

		delay := retryDelay(err, attempt)

		sink.emit(Event{
			Type: EventRetry,
			Err:  err,
			RetryInfo: &RetryInfo{
				Attempt:    attempt + 1,
				MaxRetries: maxRetries,
				Delay:      delay,
				Err:        err,
			},
		})

		// Wait out the backoff with a stoppable timer so that a cancellation
		// releases the timer right away instead of leaving it pending for up
		// to the full delay (time.After cannot be stopped).
		timer := time.NewTimer(delay)
		select {
		case <-ctx.Done():
			timer.Stop()
			return Message{}, lastInfo, ctx.Err()
		case <-timer.C:
		}
	}
	return Message{}, lastInfo, lastErr
}

// recoverOverflow handles a context-overflow failure: it asks the
// ContextManager to compact the conversation, swaps the compacted view into
// agentCtx, optionally persists the compaction through CommitContext, and
// then issues the LLM call once more.
//
// Every failure along the way (no manager configured, compaction error, empty
// compacted view, commit error) is returned as a *ContextOverflowError
// wrapping the cause, together with zero-valued results. An EventRetry is
// emitted before compaction starts, but only when a manager is configured.
func recoverOverflow(ctx context.Context, agentCtx *AgentContext, config LoopConfig, sink eventSink, originalErr error, hooks llmCallHooks) (Message, llmCallInfo, error) {
	// overflow pairs a wrapped cause with the zero-valued results.
	overflow := func(cause error) (Message, llmCallInfo, error) {
		return Message{}, llmCallInfo{}, &ContextOverflowError{Cause: cause}
	}

	if config.ContextManager == nil {
		return overflow(fmt.Errorf("no compaction configured: %w", originalErr))
	}

	notice := &RetryInfo{
		Attempt:    1,
		MaxRetries: 1,
		Err:        fmt.Errorf("context overflow detected, compacting and retrying"),
	}
	sink.emit(Event{Type: EventRetry, Err: originalErr, RetryInfo: notice})

	recovery, err := config.ContextManager.RecoverOverflow(ctx, agentCtx.Messages, originalErr)
	switch {
	case err != nil:
		return overflow(fmt.Errorf("compaction failed: %w", err))
	case len(recovery.View) == 0:
		return overflow(errors.New("compaction returned empty prompt view"))
	}

	// From here on the run works against the compacted view.
	agentCtx.Messages = recovery.View

	hasCommit := recovery.ShouldCommit && len(recovery.CommitMessages) > 0
	if hasCommit && config.CommitContext != nil {
		if commitErr := config.CommitContext(recovery.CommitMessages, recovery.Usage); commitErr != nil {
			return overflow(fmt.Errorf("commit failed: %w", commitErr))
		}
	}

	return callLLM(ctx, agentCtx, config, sink, hooks)
}

// retryDelay returns how long to wait before retry number attempt (0-based).
//
// A positive Retry-After hint extracted from err takes precedence; otherwise
// exponential backoff of 2^attempt seconds is used (1s, 2s, 4s, 8s, ...).
// Either way the result never exceeds defaultMaxRetryDelay.
//
// The backoff is clamped while still a float64, before conversion to
// time.Duration. Converting first and multiplying by time.Second overflows
// int64 once attempt reaches 34, producing a negative duration that slips
// past a post-hoc cap and makes the retry fire immediately.
func retryDelay(err error, attempt int) time.Duration {
	maxDelay := defaultMaxRetryDelay
	if after := retryAfterHint(err); after > 0 {
		if after > maxDelay {
			return maxDelay
		}
		return after
	}

	// Exponential backoff: 1s, 2s, 4s, 8s...
	secs := math.Pow(2, float64(attempt))
	if secs >= maxDelay.Seconds() {
		return maxDelay
	}
	// secs is below the cap here, so the conversion and the multiplication
	// are both well within int64 range.
	return time.Duration(secs) * time.Second
}

// callLLM prepares one model request from the agent context and executes it
// as a streaming call.
//
// Pipeline:
//  1. ContextManager projection (optional): may commit a compacted history
//     into agentCtx and/or supply a different message view for this request.
//  2. ConvertToLLM (AgentMessage[] → Message[]) followed by repair of
//     tool-call / tool-result pairing for provider compatibility.
//  3. System prompt prefix, optional cache breakpoint, per-call options.
//
// Returns ErrNoModel when config.Model is nil; note that this check happens
// after the projection stage has run.
func callLLM(ctx context.Context, agentCtx *AgentContext, config LoopConfig, sink eventSink, hooks llmCallHooks) (Message, llmCallInfo, error) {
	view := agentCtx.Messages

	// Stage 1: ContextManager / TransformContext
	if config.ContextManager != nil {
		projection, err := config.ContextManager.Project(ctx, view)
		if err != nil {
			return Message{}, llmCallInfo{}, fmt.Errorf("project context: %w", err)
		}
		if projection.ShouldCommit && len(projection.CommitMessages) > 0 {
			if persist := config.CommitContext; persist != nil {
				if commitErr := persist(projection.CommitMessages, projection.Usage); commitErr != nil {
					return Message{}, llmCallInfo{}, fmt.Errorf("project context commit failed: %w", commitErr)
				}
			}
			// The context and the request view each get their own copy of
			// the committed messages.
			agentCtx.Messages = copyMessages(projection.CommitMessages)
			view = copyMessages(projection.CommitMessages)
		}
		// An explicit projected view wins over everything above.
		if projection.Messages != nil {
			view = projection.Messages
		}
	}

	// Stage 2: convert to provider messages and repair sequencing.
	toLLM := config.ConvertToLLM
	if toLLM == nil {
		toLLM = DefaultConvertToLLM
	}
	llmMessages := RepairMessageSequence(toLLM(view))

	toolSpecs := buildToolSpecs(agentCtx.Tools)

	// The static system prompt goes first. Keeping it at the head and
	// byte-stable across turns lets providers with prefix-based caching
	// (OpenAI) serve it from cache. Structured blocks take precedence over
	// the plain prompt string.
	var system []Message
	switch {
	case len(agentCtx.SystemBlocks) > 0:
		for _, block := range agentCtx.SystemBlocks {
			sysMsg := SystemMsg(block.Text)
			if block.CacheControl != "" {
				sysMsg.Metadata = map[string]any{"cache_control": block.CacheControl}
			}
			system = append(system, sysMsg)
		}
	case agentCtx.SystemPrompt != "":
		system = append(system, SystemMsg(agentCtx.SystemPrompt))
	}
	if len(system) > 0 {
		llmMessages = append(system, llmMessages...)
	}

	// With explicit cache orchestration enabled, put a single cache write
	// breakpoint on the last non-system message. The helper scans from the
	// tail and skips system reminders, so the breakpoint lands on the
	// freshest user input / tool_result / assistant turn — whichever is last
	// in this request. Inside a tool loop each LLM call therefore writes an
	// entry covering the latest tool_use+tool_result, and the next call reads
	// them from cache instead of re-uploading.
	if config.CacheLastMessage != "" {
		llmMessages = markLastMessageForCache(llmMessages, config.CacheLastMessage)
	}

	if config.Model == nil {
		return Message{}, llmCallInfo{}, ErrNoModel
	}

	// Per-call options. Any explicit thinking level is forwarded, including
	// ThinkingOff — the litellm adapter translates "off" into an explicit
	// disabled-thinking request so models that think by default can actually
	// be turned off. Empty means "unset": leave it to the provider/model
	// default.
	var callOpts []CallOption
	if level := config.ThinkingLevel; level != "" {
		callOpts = append(callOpts, WithThinking(level))
	}

	// Streaming gives subscribers real-time token deltas.
	return callLLMStream(ctx, config.Model, llmMessages, toolSpecs, callOpts, sink, hooks)
}

// markLastMessageForCache attaches cacheControl under the "cache_control"
// metadata key of the last non-system message and returns the result as a
// new slice. System messages are skipped so that trailing per-turn reminders
// (which change every turn) never carry the breakpoint.
//
// Neither the caller's slice nor the original Message metadata map is
// modified: both are cloned before the write. When every message is a system
// message (or the slice is empty) the input is returned unchanged.
func markLastMessageForCache(messages []Message, cacheControl string) []Message {
	for i := len(messages) - 1; i >= 0; i-- {
		if messages[i].Role == RoleSystem {
			continue
		}

		marked := slices.Clone(messages)
		meta := maps.Clone(marked[i].Metadata)
		if meta == nil {
			// maps.Clone keeps nil as nil; a writable map is needed.
			meta = map[string]any{}
		}
		meta["cache_control"] = cacheControl
		marked[i].Metadata = meta
		return marked
	}
	return messages
}

// callLLMStream uses GenerateStream and emits real-time events.
// The adapter builds partial Messages with ContentBlocks and emits fine-grained StreamEvents.
//
// Event mapping:
//   - the first start/delta event of the stream emits EventMessageStart;
//   - every delta event emits EventMessageUpdate with the delta and its kind;
//   - end events carrying a completed tool call set info.HasCompletedToolCalls
//     and invoke hooks.OnToolCallComplete;
//   - the done event stamps the final message with the current time, emits
//     EventMessageEnd (preceded by EventMessageStart if nothing was streamed)
//     and returns the message.
//
// Errors: a stream error event returns its error; an error event that carries
// no error is reported as an error as well, so it can never be mistaken for a
// successful call returning an empty message. A stream that closes without a
// done event yields a *PartialStreamError holding the last partial message.
// In all error cases the returned Message is the zero value, while info still
// reflects what was observed before the failure.
//
// Stream init failure is surfaced as an error — there is no silent fallback to
// non-streaming Generate, because callers (TUIs, event subscribers) typically
// depend on stream events for live rendering, tool-call deltas, and cancellation
// semantics. Switching execution model without notice changes the contract.
func callLLMStream(ctx context.Context, model ChatModel, messages []Message, tools []ToolSpec, opts []CallOption, sink eventSink, hooks llmCallHooks) (Message, llmCallInfo, error) {
	streamCh, err := model.GenerateStream(ctx, messages, tools, opts...)
	if err != nil {
		return Message{}, llmCallInfo{}, fmt.Errorf("stream init failed: %w", err)
	}

	var (
		started bool    // EventMessageStart already emitted for this message
		partial Message // most recent snapshot delivered by the adapter
		info    llmCallInfo
	)

	for ev := range streamCh {
		switch ev.Type {
		case StreamEventTextStart, StreamEventThinkingStart, StreamEventToolCallStart:
			partial = ev.Message
			if !started {
				started = true
				sink.emit(Event{Type: EventMessageStart, Message: partial})
			}

		case StreamEventTextDelta, StreamEventThinkingDelta, StreamEventToolCallDelta:
			partial = ev.Message
			if !started {
				started = true
				sink.emit(Event{Type: EventMessageStart, Message: partial})
			}
			// Text deltas keep the zero DeltaKind.
			var dk DeltaKind
			switch ev.Type {
			case StreamEventThinkingDelta:
				dk = DeltaThinking
			case StreamEventToolCallDelta:
				dk = DeltaToolCall
			}
			sink.emit(Event{Type: EventMessageUpdate, Message: partial, Delta: ev.Delta, DeltaKind: dk})

		case StreamEventTextEnd, StreamEventThinkingEnd, StreamEventToolCallEnd:
			partial = ev.Message
			if ev.CompletedToolCall != nil {
				info.HasCompletedToolCalls = true
				if hooks.OnToolCallComplete != nil {
					hooks.OnToolCallComplete(*ev.CompletedToolCall)
				}
			}

		case StreamEventDone:
			finalMsg := ev.Message
			finalMsg.Timestamp = time.Now()
			if !started {
				sink.emit(Event{Type: EventMessageStart, Message: finalMsg})
			}
			sink.emit(Event{Type: EventMessageEnd, Message: finalMsg})
			return finalMsg, info, nil

		case StreamEventError:
			// Returning a nil error here would make callers treat the zero
			// Message as a successful response, so a missing error is
			// replaced with an explicit one.
			if ev.Err == nil {
				return Message{}, info, errors.New("stream error event without error")
			}
			return Message{}, info, ev.Err
		}
	}

	// Stream closed without done event — surface as PartialStreamError instead
	// of pretending the message completed. Emitting EventMessageEnd here would
	// let callers persist a half-finished message (missing StopReason, possibly
	// truncated tool_call args, unclosed thinking blocks) into history — the
	// next LLM call would then see structurally invalid context.
	return Message{}, info, &PartialStreamError{Partial: partial}
}

// executeToolCalls dispatches every tool call of a single assistant turn
// through a freshly created turn tool executor and returns whatever that
// executor's Wait reports. Calls are handed to the executor in slice order.
// The executor type is shared with the streaming tool-execution path.
func executeToolCalls(ctx context.Context, tools []Tool, calls []ToolCall, config LoopConfig, toolErrors map[string]int, sink eventSink) ([]ToolResult, []AgentMessage) {
	executor := newTurnToolExecutor(ctx, tools, config, toolErrors, sink)

	// Queue the calls one by one, preserving the order the model produced.
	for i := range calls {
		executor.Add(calls[i])
	}

	// Wait hands back the collected results plus any accompanying messages.
	results, messages := executor.Wait()
	return results, messages
}

// executeSingleToolCall executes one tool call: emit events, validate, preview,
// run approval, then execute the tool. result.ToolName is set when the caller
// should update toolErrors; empty means skip (circuit-breaker hit, denial, or
// context cancellation).
func executeSingleToolCall(ctx context.Context, tools []Tool, call ToolCall, config LoopConfig, failCount int, sink eventSink) ToolResult {
	tool := findTool(tools, call.Name)
	label := toolLabel(tool)

	// Fast exit: context already cancelled — don't start any tool work.
	if ctx.Err() != nil {
		content, _ := json.Marshal("Tool execution cancelled.")
		result := ToolResult{ToolCallID: call.ID, Content: content, IsError: true}
		sink.emit(Event{
			Type:      EventToolExecStart,
			ToolID:    call.ID,
			Tool:      call.Name,
			ToolLabel: label,
			Args:      call.Args,
		})
		sink.emit(Event{
			Type:      EventToolExecEnd,
			ToolID:    call.ID,
			Tool:      call.Name,
			ToolLabel: label,
			Result:    result.Content,
			IsError:   true,
		})
		return result
	}

	// Circuit breaker: skip if tool has exceeded consecutive failure threshold
	if config.MaxToolErrors > 0 && failCount >= config.MaxToolErrors {
		sink.emit(Event{
			Type:      EventToolExecStart,
			ToolID:    call.ID,
			Tool:      call.Name,
			ToolLabel: label,
			Args:      call.Args,
		})
		errContent, _ := json.Marshal(fmt.Sprintf("tool %q disabled after %d consecutive errors", call.Name, config.MaxToolErrors))
		result := ToolResult{ToolCallID: call.ID, Content: errContent, IsError: true}
		sink.emit(Event{
			Type:    EventToolExecEnd,
			ToolID:  call.ID,
			Tool:    call.Name,
			Result:  result.Content,
			IsError: true,
		})
		return result // ToolName empty → don't count
	}

	sink.emit(Event{
		Type:      EventToolExecStart,
		ToolID:    call.ID,
		Tool:      call.Name,
		ToolLabel: label,
		Args:      call.Args,
	})

	var result ToolResult

	if tool == nil {
		errContent, _ := json.Marshal(fmt.Sprintf("tool %q not found", call.Name))
		result = ToolResult{
			ToolCallID: call.ID,
			Content:    errContent,
			IsError:    true,
		}
	} else if fixed, err := validateToolArgs(tool, call); err != nil {
		errContent, _ := json.Marshal(err.Error())
		result = ToolResult{
			ToolCallID: call.ID,
			ToolName:   call.Name,
			Content:    errContent,
			IsError:    true,
		}
	} else {
		// Schema validation may have coerced trivially mis-typed args (e.g. a
		// stringified number or JSON array) into clean values; run the tool with
		// the corrected payload so downstream Validators and the tool itself see
		// well-typed input.
		if fixed != nil {
			call.Args = fixed
		}
		// Stage 1: business-level input validation. Distinct from schema
		// validation above — Validators check semantics (write-before-read,
		// mtime drift, ...) using state the tool was constructed with.
		// Failures are surfaced as a normal tool_result with IsError=true so
		// the LLM can self-correct without prompting the user. Validators
		// MUST NOT prompt or mutate persistent state.
		if v, ok := tool.(Validator); ok {
			vr := v.Validate(ctx, call.Args)
			if !vr.OK {
				msg := vr.Message
				if msg == "" {
					msg = "tool input validation failed"
				}
				content, _ := json.Marshal(msg)
				result = ToolResult{
					ToolCallID: call.ID,
					ToolName:   call.Name,
					Content:    content,
					IsError:    true,
				}
				sink.emit(Event{
					Type:      EventToolExecEnd,
					ToolID:    call.ID,
					Tool:      call.Name,
					ToolLabel: label,
					Result:    content,
					IsError:   true,
				})
				return result
			}
		}

		var preview json.RawMessage

		// Preview: if tool supports it, compute and emit preview before execution.
		// Preview runs only after args are validated so approval UIs can use it.
		if p, ok := tool.(Previewer); ok {
			if data, err := p.Preview(ctx, call.Args); err == nil {
				preview = data
				sink.emit(Event{
					Type:       EventToolExecUpdate,
					ToolID:     call.ID,
					Tool:       call.Name,
					ToolLabel:  label,
					Args:       call.Args,
					Result:     data,
					UpdateKind: ToolExecUpdatePreview,
				})
			}
		}

		if config.ToolGate != nil {
			gateReq := GateRequest{
				Tool:      tool,
				Call:      call,
				ToolLabel: label,
				Preview:   preview,
			}
			decision, err := config.ToolGate(ctx, gateReq)
			if err != nil {
				decision = &GateDecision{Allowed: false, Reason: err.Error()}
			}
			if decision != nil && !decision.Allowed {
				reason := decision.Reason
				if reason == "" {
					reason = "tool execution denied"
				}
				errContent, _ := json.Marshal(reason)
				result = ToolResult{
					ToolCallID: call.ID,
					Content:    errContent,
					IsError:    true,
				}
				sink.emit(Event{
					Type:      EventToolExecEnd,
					ToolID:    call.ID,
					Tool:      call.Name,
					ToolLabel: label,
					Result:    result.Content,
					IsError:   true,
				})
				return result
			}
		}

		// Inject progress callback so tools can report partial results
		progressCtx := WithToolProgress(ctx, func(progress ProgressPayload) {
			p := progress
			sink.emit(Event{
				Type:       EventToolExecUpdate,
				ToolID:     call.ID,
				Tool:       call.Name,
				ToolLabel:  label,
				Args:       call.Args,
				Progress:   &p,
				UpdateKind: ToolExecUpdateProgress,
			})
		})

		// ContentTool: returns rich content blocks (e.g., images).
		// When middleware is configured, execute it through a shim so logging,
		// auditing, and short-circuit behavior still apply.
		if ct, ok := tool.(ContentTool); ok {