From ab1d227a1126a129d7a9fe16bd56a5228d1ae341 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 01:18:52 -0700 Subject: [PATCH 01/23] Persist post-turn provider wakeups and backgrounded task completions ClaudeAdapterV2 buffers post-turn SDK activity and emits turn.wakeup (task_notification / ScheduleWakeup origins); ProviderWakeupService waits for thread quiescence and dispatches attach_wakeup runs that adopt the in-flight turn and replay the buffer. Backgrounded Bash items stay running and complete cross-run via task notifications. Replay fixture claude_provider_wakeup covers the full loop. Co-Authored-By: Claude Fable 5 --- .../Adapters/ClaudeAdapterV2.testkit.ts | 5 + .../Adapters/ClaudeAdapterV2.ts | 479 +++++++++++++++++- .../src/orchestration-v2/EffectOutbox.ts | 5 + .../src/orchestration-v2/EffectWorker.ts | 8 +- .../src/orchestration-v2/Orchestrator.ts | 47 +- .../src/orchestration-v2/ProviderAdapter.ts | 34 ++ .../orchestration-v2/ProviderEventIngestor.ts | 4 + .../ProviderSessionManager.ts | 54 ++ .../ProviderTurnStartService.ts | 3 + .../orchestration-v2/ProviderWakeupService.ts | 190 +++++++ .../orchestration-v2/RunExecutionService.ts | 77 ++- .../src/orchestration-v2/runtimeLayer.ts | 15 + ...rchestratorReplayFixtures.contract.test.ts | 7 +- .../testkit/ProviderReplayHarness.ts | 17 +- .../claude_transcript.ndjson | 21 + .../fixtures/claude_provider_wakeup/input.ts | 22 + .../fixtures/claude_provider_wakeup/output.ts | 116 +++++ .../testkit/fixtures/index.ts | 17 + .../testkit/fixtures/shared.ts | 21 + .../chat/MessagesTimeline.logic.test.ts | 91 ++++ .../components/chat/MessagesTimeline.logic.ts | 18 +- .../components/chat/MessagesTimeline.test.tsx | 68 +++ .../src/components/chat/MessagesTimeline.tsx | 72 ++- apps/web/src/session-logic.ts | 19 + packages/contracts/src/orchestrationV2.ts | 24 + 25 files changed, 1391 insertions(+), 43 deletions(-) create mode 100644 apps/server/src/orchestration-v2/ProviderWakeupService.ts create 
mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/claude_transcript.ndjson create mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/input.ts create mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/output.ts diff --git a/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.testkit.ts b/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.testkit.ts index b7e5067d72f..76d840edafd 100644 --- a/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.testkit.ts +++ b/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.testkit.ts @@ -295,6 +295,11 @@ function isClaudeSdkReplayMessage(frame: unknown): frame is SDKMessage { type === "user" || type === "result" || type === "system" || + // Partial-assistant stream events are ignored by the adapter's message + // pipeline but announce provider-initiated turn wakeups, matching the + // recorded transcripts where message_start precedes the first complete + // assistant message by several seconds. 
+ type === "stream_event" || type === "rate_limit_event" ); } diff --git a/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts b/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts index 2740ceff6d2..77b6ec1c959 100644 --- a/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts +++ b/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts @@ -55,6 +55,7 @@ import * as Path from "effect/Path"; import * as Queue from "effect/Queue"; import * as Ref from "effect/Ref"; import * as Schema from "effect/Schema"; +import * as Semaphore from "effect/Semaphore"; import * as Stream from "effect/Stream"; import { resolveAttachmentPath } from "../../attachmentStore.ts"; @@ -94,6 +95,7 @@ import { type ProviderAdapterV2SessionRuntime, type ProviderAdapterV2SteerInput, type ProviderAdapterV2TurnInput, + type ProviderTurnWakeupOrigin, } from "../ProviderAdapter.ts"; import { ProviderAdapterDriverCreateError, @@ -1761,6 +1763,16 @@ interface ActiveClaudeTurnContext { readonly input: ProviderAdapterV2TurnInput; readonly nativeTurnId: string; nativeMessageCursor: string | null; + /** + * Set when a ScheduleWakeup tool call succeeds in this turn: the following + * `result` will carry stop_reason "tool_use" and the SDK re-invokes the + * agent at `scheduledFor`. The turn is then held open (provider turn + * `waiting`) instead of finalized, so the sleep/wake poll loop stays one + * run instead of a run per wakeup. + */ + pendingScheduledWakeup: { readonly scheduledFor?: number } | null; + /** The turn yielded on a scheduled wakeup and is waiting to be re-invoked. */ + waitingForWakeup: boolean; readonly providerTurnId: OrchestrationV2ProviderTurn["id"]; readonly providerTurnOrdinal: number; readonly startedAt: DateTime.Utc; @@ -1796,6 +1808,58 @@ interface ClaudeLiveQueryContext { readonly closed: Deferred.Deferred; } +/** + * SDK activity observed while no orchestrator-requested turn is active. 
The + * Claude Agent SDK resumes sessions on its own (background task notifications, + * scheduled wakeups re-invoke the agent after the previous turn's result), so + * the adapter buffers that activity, announces it once via a `turn.wakeup` + * event, and replays the buffer when the orchestrator attaches a + * provider-initiated run through `attachTurn`. + */ +interface PendingClaudeWakeup { + readonly query: ClaudeAgentSdkQuerySession; + origin: ProviderTurnWakeupOrigin; + announced: boolean; + readonly bufferedMessages: Array; + droppedMessageCount: number; +} + +const CLAUDE_WAKEUP_BUFFER_LIMIT = 4096; +const CLAUDE_WAKEUP_ORIGIN_DETAIL_LIMIT = 280; + +/** + * A Bash command the agent ran with `run_in_background`: its tool result + * returns immediately (carrying `backgroundTaskId`) while the command keeps + * running — often across turn boundaries. The emitted node/turn item stay + * `running` and are re-emitted with their terminal status when the matching + * `task_updated`/`task_notification` arrives (possibly during a later, + * provider-initiated wakeup run — routing accepts same-provider-thread + * updates for earlier runs). + */ +interface ClaudeBackgroundCommandRef { + readonly taskId: string; + readonly node: OrchestrationV2ExecutionNode; + readonly turnItem: OrchestrationV2TurnItem; + /** + * A terminal transition was already emitted (task_updated carries the + * status but no summary; the task_notification that follows enriches the + * item's output with the summary and consumes the ref). 
+ */ + readonly terminal: boolean; +} + +function claudeBackgroundTaskIdFromSdkMessage(message: SDKMessage): string | undefined { + if (message.type !== "user") { + return undefined; + } + const structuredResult = (message as { readonly tool_use_result?: unknown }).tool_use_result; + if (typeof structuredResult !== "object" || structuredResult === null) { + return undefined; + } + const taskId = Reflect.get(structuredResult, "backgroundTaskId"); + return typeof taskId === "string" && taskId.length > 0 ? taskId : undefined; +} + interface ActiveClaudeToolCall { readonly nativeItemId: string; readonly toolName: string; @@ -1851,6 +1915,15 @@ export function makeClaudeAdapterV2( const interruptedTurns = yield* Ref.make(new Set()); const steeredTurns = yield* Ref.make(new Set()); const queryContext = yield* Ref.make(null); + const pendingWakeup = yield* Ref.make(null); + const lastProviderThreadId = yield* Ref.make( + null, + ); + const backgroundCommands = yield* Ref.make(new Map()); + // Serializes the SDK message pump against attachTurn so replayed wakeup + // buffers cannot interleave with live messages. + const sdkMessageGate = yield* Semaphore.make(1); + const sessionThreadId = input.threadId; const openedNativeThreads = yield* Ref.make(new Set()); const itemOrdinals = yield* Ref.make(new Map()); const nextItemOrdinalsByTurn = yield* Ref.make(new Map()); @@ -2048,6 +2121,91 @@ export function makeClaudeAdapterV2( }); }); + /** + * Re-emit a backgrounded command's original node/turn item with its + * terminal status once its task lifecycle concludes. Idempotent: the + * ref is consumed on first completion (task_updated and + * task_notification both report the same terminal transition). + */ + const completeBackgroundCommand = Effect.fnUntraced(function* (input: { + readonly taskId: string; + readonly status: "completed" | "failed"; + readonly detail?: string; + /** task_notification carries the final summary and retires the ref. 
*/ + readonly consume: boolean; + }) { + const entry = (yield* Ref.get(backgroundCommands)).get(input.taskId); + if (entry === undefined) { + return; + } + const detail = input.detail?.trim() ?? ""; + if (entry.terminal && detail.length === 0) { + return; + } + const completedAt = yield* DateTime.now; + const node: OrchestrationV2ExecutionNode = { + ...entry.node, + status: input.status, + completedAt, + }; + const completedItem = { + ...entry.turnItem, + status: input.status, + completedAt, + updatedAt: completedAt, + } as OrchestrationV2TurnItem; + const turnItem: OrchestrationV2TurnItem = + completedItem.type === "command_execution" && + detail.length > 0 && + !(completedItem.output ?? "").includes(detail) + ? { + ...completedItem, + output: + completedItem.output === undefined || completedItem.output.length === 0 + ? detail + : `${completedItem.output}\n${detail}`, + } + : completedItem; + yield* emitToolCallArtifacts({ node, turnItem }); + yield* Ref.update(backgroundCommands, (current) => { + const next = new Map(current); + if (input.consume) { + next.delete(input.taskId); + } else { + next.set(input.taskId, { taskId: input.taskId, node, turnItem, terminal: true }); + } + return next; + }); + }); + + const completeBackgroundCommandFromTaskMessage = Effect.fnUntraced(function* ( + message: SDKMessage, + ) { + if (message.type !== "system") { + return; + } + if (message.subtype === "task_notification") { + yield* completeBackgroundCommand({ + taskId: message.task_id, + status: message.status === "completed" ? "completed" : "failed", + detail: message.summary, + consume: true, + }); + return; + } + if (message.subtype === "task_updated") { + const patchStatus = message.patch.status; + if (patchStatus === "completed" || patchStatus === "failed" || patchStatus === "killed") { + yield* completeBackgroundCommand({ + taskId: message.task_id, + status: patchStatus === "completed" ? "completed" : "failed", + ...(message.patch.error === undefined ? 
{} : { detail: message.patch.error }), + consume: false, + }); + } + } + }); + const updateClaudeSubagentNode = Effect.fnUntraced(function* (input: { readonly context: ActiveClaudeTurnContext; readonly taskId: string; @@ -2755,7 +2913,89 @@ export function makeClaudeAdapterV2( } }); - const handleSdkMessage = Effect.fnUntraced(function* (input: { + /** + * SDK activity with no active orchestrator turn: the SDK resumed the + * session on its own (e.g. a background task notification re-invoked + * the agent after the previous turn's result). Buffer the activity and + * announce it once via `turn.wakeup` so the wakeup watcher can mint a + * provider-initiated run and attach to it. + */ + const trackWakeupSdkMessage = Effect.fnUntraced(function* (wakeupInput: { + readonly query: ClaudeAgentSdkQuerySession; + readonly message: SDKMessage; + }) { + const providerThreadId = yield* Ref.get(lastProviderThreadId); + if (providerThreadId === null) { + // No turn has run in this session yet, so there is nothing for a + // wakeup run to bind to; this is pre-first-turn protocol noise. + return; + } + const message = wakeupInput.message; + const existing = yield* Ref.get(pendingWakeup); + const pending: PendingClaudeWakeup = + existing !== null && existing.query === wakeupInput.query + ? existing + : { + query: wakeupInput.query, + origin: { kind: "unknown" }, + announced: false, + bufferedMessages: [], + droppedMessageCount: 0, + }; + if ( + pending.origin.kind === "unknown" && + message.type === "system" && + message.subtype === "task_notification" + ) { + const detail = message.summary.trim().slice(0, CLAUDE_WAKEUP_ORIGIN_DETAIL_LIMIT); + pending.origin = { + kind: "task_notification", + nativeTaskId: message.task_id, + ...(detail.length === 0 ? 
{} : { detail }), + }; + } + // Partial-assistant stream events are ignored by processSdkMessage + // (the adapter consumes complete assistant snapshots), so they are + // not worth buffering — but they DO prove a turn is underway, so + // they still drive the announcement below. + if (message.type !== "stream_event") { + if (pending.bufferedMessages.length >= CLAUDE_WAKEUP_BUFFER_LIMIT) { + pending.droppedMessageCount += 1; + if (pending.droppedMessageCount === 1) { + yield* Effect.logWarning("orchestration-v2.claude-wakeup-buffer-overflow", { + providerSessionId: input.providerSessionId, + threadId: sessionThreadId, + providerThreadId, + }); + } + } else { + pending.bufferedMessages.push(message); + } + } + yield* Ref.set(pendingWakeup, pending); + // System messages alone (init, status, task bookkeeping) do not + // prove a turn is underway; announce on the first real activity — + // in recorded wakeups that is the message_start stream event, ~4s + // before the first complete assistant message. 
+ if (!pending.announced && message.type !== "system") { + pending.announced = true; + yield* Effect.logInfo("orchestration-v2.claude-turn-wakeup", { + providerSessionId: input.providerSessionId, + threadId: sessionThreadId, + providerThreadId, + origin: pending.origin, + }); + yield* emitProviderEvent({ + type: "turn.wakeup", + driver: CLAUDE_PROVIDER, + threadId: sessionThreadId, + providerThreadId, + origin: pending.origin, + }); + } + }); + + const processSdkMessage = Effect.fnUntraced(function* (input: { readonly query: ClaudeAgentSdkQuerySession; readonly message: SDKMessage; }) { @@ -2767,6 +3007,7 @@ export function makeClaudeAdapterV2( const message = input.message; const context = yield* Ref.get(activeTurn); if (context === null) { + yield* trackWakeupSdkMessage(input); return; } @@ -2774,6 +3015,12 @@ export function makeClaudeAdapterV2( context.nativeMessageCursor = message.uuid; } + if (context.waitingForWakeup && message.type === "assistant") { + // Scheduled wakeup fired: the continuation streams into this + // held-open turn (the provider turn stayed `running` throughout). + context.waitingForWakeup = false; + } + if (message.type === "system" && message.subtype === "task_started") { if (isClaudeNonSubagentTask(message)) { context.ignoredTaskIds.add(message.task_id); @@ -2802,6 +3049,12 @@ export function makeClaudeAdapterV2( } } + // Backgrounded commands (local_bash tasks spawned with + // run_in_background) are tracked session-level and complete via + // task lifecycle messages — independent of subagent bookkeeping and + // of which turn the lifecycle message arrives in. 
+ yield* completeBackgroundCommandFromTaskMessage(message); + if (message.type === "system" && message.subtype === "task_notification") { if (!context.ignoredTaskIds.has(message.task_id)) { yield* updateClaudeSubagentNode({ @@ -2819,6 +3072,31 @@ export function makeClaudeAdapterV2( } } + if (message.type === "system" && message.subtype === "task_updated") { + // Status patches carry terminal transitions (completed/failed/ + // killed) that task_notification does not always repeat — killed + // and failed tasks in particular must not be dropped silently. + const patchStatus = message.patch.status; + if ( + patchStatus !== undefined && + !context.ignoredTaskIds.has(message.task_id) && + context.subagentsByTaskId.has(message.task_id) && + (patchStatus === "completed" || patchStatus === "failed" || patchStatus === "killed") + ) { + yield* updateClaudeSubagentNode({ + context, + taskId: message.task_id, + ...(message.patch.error === undefined ? {} : { result: message.patch.error }), + status: + patchStatus === "completed" + ? "completed" + : patchStatus === "killed" + ? "cancelled" + : "failed", + }); + } + } + for (const toolUse of claudeToolUseBlocksFromAssistantMessage(message)) { if (toolUse.name === "Agent") { continue; @@ -2856,6 +3134,13 @@ export function makeClaudeAdapterV2( parentToolUseId, })); const completedAt = yield* DateTime.now; + // A tool result carrying backgroundTaskId means the command keeps + // running detached (Bash run_in_background): the item stays + // `running` and completes via its task lifecycle later — possibly + // in a different (wakeup) run. + const backgroundTaskId = isClaudeToolResultError(toolResult) + ? undefined + : claudeBackgroundTaskIdFromSdkMessage(message); const artifacts = buildToolCallArtifacts({ context, nativeItemId: toolCall.nativeItemId, @@ -2868,11 +3153,43 @@ export function makeClaudeAdapterV2( parentNodeId: toolCall.parentNodeId, ordinal: toolCall.ordinal, output, - status: isClaudeToolResultError(toolResult) ? 
"failed" : "completed", + status: isClaudeToolResultError(toolResult) + ? "failed" + : backgroundTaskId !== undefined + ? "running" + : "completed", startedAt: toolCall.startedAt, updatedAt: completedAt, }); yield* emitToolCallArtifacts(artifacts); + if (backgroundTaskId !== undefined) { + yield* Ref.update(backgroundCommands, (current) => { + const next = new Map(current); + next.set(backgroundTaskId, { + taskId: backgroundTaskId, + node: artifacts.node, + turnItem: artifacts.turnItem, + terminal: false, + }); + return next; + }); + } + if (toolCall.toolName === "ScheduleWakeup" && !isClaudeToolResultError(toolResult)) { + // The SDK re-invokes the agent when the wakeup fires; the + // upcoming result (stop_reason "tool_use") must not finalize + // the turn. scheduledFor comes from the tool's structured + // output: { scheduledFor, clampedDelaySeconds, wasClamped }. + const structuredResult = + message.type === "user" + ? (message as { readonly tool_use_result?: unknown }).tool_use_result + : undefined; + const scheduledFor = + typeof structuredResult === "object" && structuredResult !== null + ? Reflect.get(structuredResult, "scheduledFor") + : undefined; + context.pendingScheduledWakeup = + typeof scheduledFor === "number" ? { scheduledFor } : {}; + } context.toolCalls.delete(toolCall.nativeItemId); } @@ -2904,6 +3221,27 @@ export function makeClaudeAdapterV2( if (!interrupted && wasSteered && isClaudeActiveSteeringAbortResult(message)) { return; } + if ( + !interrupted && + message.subtype === "success" && + context.pendingScheduledWakeup !== null + ) { + // The turn yielded on ScheduleWakeup (result stop_reason + // "tool_use"): the SDK re-invokes this session at scheduledFor + // and the continuation streams into this same turn. Hold the + // run open as `waiting` instead of finalizing so a sleep/wake + // poll loop stays a single run. 
+ const scheduledFor = context.pendingScheduledWakeup.scheduledFor; + context.pendingScheduledWakeup = null; + context.waitingForWakeup = true; + yield* Effect.logInfo("orchestration-v2.claude-turn-waiting-for-wakeup", { + threadId: context.input.threadId, + runId: context.input.runId, + providerTurnId: context.providerTurnId, + ...(scheduledFor === undefined ? {} : { scheduledFor }), + }); + return; + } yield* Ref.update(steeredTurns, (current) => { const next = new Set(current); next.delete(context.providerTurnId); @@ -2926,6 +3264,11 @@ export function makeClaudeAdapterV2( } }); + const handleSdkMessage = (messageInput: { + readonly query: ClaudeAgentSdkQuerySession; + readonly message: SDKMessage; + }) => sdkMessageGate.withPermits(1)(processSdkMessage(messageInput)); + const canUseToolEffect = Effect.fn("ClaudeAdapterV2.canUseTool")(function* ( toolName: Parameters[0], toolInput: Parameters[1], @@ -3099,6 +3442,9 @@ export function makeClaudeAdapterV2( const ownsLiveQuery = yield* Ref.modify(queryContext, (current) => current?.query === querySession ? [true, null] : [false, current], ); + yield* Ref.update(pendingWakeup, (current) => + current?.query === querySession ? null : current, + ); if (ownsLiveQuery) { yield* finalizeActiveTurnAfterQueryExit( exit._tag === "Failure" ? exit.cause : undefined, @@ -3133,6 +3479,8 @@ export function makeClaudeAdapterV2( input: turnInput, nativeTurnId, nativeMessageCursor: null, + pendingScheduledWakeup: null, + waitingForWakeup: false, providerTurnId, providerTurnOrdinal, startedAt, @@ -3157,7 +3505,32 @@ export function makeClaudeAdapterV2( fileSystem, }); const querySession = yield* openQuery(turnInput, nativeThreadId); + const supersededWakeup = yield* Ref.getAndSet(pendingWakeup, null); + if (supersededWakeup !== null && supersededWakeup.announced) { + // A user-requested turn wins over an unclaimed wakeup: the + // in-flight provider activity folds into this turn's stream and + // the buffered prefix is dropped. 
+ yield* Effect.logWarning("orchestration-v2.claude-wakeup-superseded-by-turn", { + providerSessionId: input.providerSessionId, + threadId: turnInput.threadId, + runId: turnInput.runId, + bufferedMessageCount: supersededWakeup.bufferedMessages.length, + }); + } yield* Ref.set(activeTurn, context); + yield* Ref.set(lastProviderThreadId, turnInput.providerThread.id); + if (supersededWakeup !== null) { + // Background-command lifecycle transitions in the dropped + // buffer are session-scoped bookkeeping, not turn content — + // apply them so earlier runs' backgrounded commands still + // complete. Emitted after activeTurn is set so this run's + // event ingestion (already subscribed) persists them. + yield* Effect.forEach( + supersededWakeup.bufferedMessages, + (buffered) => completeBackgroundCommandFromTaskMessage(buffered), + { discard: true }, + ); + } yield* emitProviderEvent({ type: "provider_turn.updated", driver: CLAUDE_PROVIDER, @@ -3184,6 +3557,107 @@ export function makeClaudeAdapterV2( ), ); + /** + * Adopt a provider-initiated turn announced via `turn.wakeup`: no + * prompt is sent — the SDK is already mid-turn. The orchestrator-minted + * run context is installed and the buffered wakeup activity is replayed + * through the normal message pipeline (including, possibly, the + * `result` if the wakeup turn already finished while unclaimed). 
+ */ + const attachTurn = Effect.fn("ClaudeAdapterV2.attachTurn")( + function* (turnInput: ProviderAdapterV2TurnInput) { + yield* sdkMessageGate.withPermits(1)( + Effect.gen(function* () { + const startedAt = yield* DateTime.now; + const pending = yield* Ref.get(pendingWakeup); + const live = yield* Ref.get(queryContext); + if (pending === null || !pending.announced) { + return yield* new ProviderAdapterProtocolError({ + driver: CLAUDE_PROVIDER, + detail: `Claude provider thread ${turnInput.providerThread.id} has no announced wakeup turn to attach to.`, + }); + } + if (live === null || live.query !== pending.query) { + return yield* new ProviderAdapterProtocolError({ + driver: CLAUDE_PROVIDER, + detail: `Claude provider thread ${turnInput.providerThread.id} wakeup query is no longer live.`, + }); + } + const currentTurn = yield* Ref.get(activeTurn); + if (currentTurn !== null) { + return yield* new ProviderAdapterProtocolError({ + driver: CLAUDE_PROVIDER, + detail: `Claude provider turn ${currentTurn.providerTurnId} is still active.`, + }); + } + const nativeTurnId = `turn:${turnInput.attemptId}`; + const providerTurnId = idAllocator.derive.providerTurn({ + driver: CLAUDE_PROVIDER, + nativeTurnId, + }); + const context: ActiveClaudeTurnContext = { + input: turnInput, + nativeTurnId, + nativeMessageCursor: null, + pendingScheduledWakeup: null, + waitingForWakeup: false, + providerTurnId, + providerTurnOrdinal: turnInput.providerTurnOrdinal, + startedAt, + assistant: { + fallbackText: "", + fallbackNativeItemId: `assistant:${turnInput.runId}`, + emittedNativeItemIds: new Set(), + }, + toolCalls: new Map(), + ignoredTaskIds: new Set(), + subagentsByTaskId: new Map(), + subagentsByToolUseId: new Map(), + subagentNodesByTaskId: new Map(), + }; + yield* Ref.set(activeTurn, context); + yield* Ref.set(pendingWakeup, null); + yield* Ref.set(lastProviderThreadId, turnInput.providerThread.id); + yield* emitProviderEvent({ + type: "provider_turn.updated", + driver: 
CLAUDE_PROVIDER, + providerTurn: providerTurnPayload({ + context, + status: "running", + completedAt: null, + }), + }); + if (pending.droppedMessageCount > 0) { + yield* Effect.logWarning("orchestration-v2.claude-wakeup-replay-truncated", { + providerSessionId: input.providerSessionId, + threadId: turnInput.threadId, + runId: turnInput.runId, + droppedMessageCount: pending.droppedMessageCount, + }); + } + yield* Effect.forEach( + pending.bufferedMessages, + (message) => processSdkMessage({ query: pending.query, message }), + { discard: true }, + ); + }), + ); + }, + (effect, turnInput) => + effect.pipe( + Effect.mapError( + (cause) => + new ProviderAdapterTurnStartError({ + driver: CLAUDE_PROVIDER, + threadId: turnInput.threadId, + providerThreadId: turnInput.providerThread.id, + runId: turnInput.runId, + cause, + }), + ), + ), + ); + const interruptTurn = Effect.fn("ClaudeAdapterV2.interruptTurn")( function* (turnInput: ProviderAdapterV2InterruptInput) { const existing = yield* Ref.get(queryContext); @@ -3387,6 +3861,7 @@ export function makeClaudeAdapterV2( ), ), startTurn, + attachTurn, steerTurn, interruptTurn, respondToRuntimeRequest: Effect.fn("ClaudeAdapterV2.respondToRuntimeRequest")( diff --git a/apps/server/src/orchestration-v2/EffectOutbox.ts b/apps/server/src/orchestration-v2/EffectOutbox.ts index 75ead585167..0ee0045b82f 100644 --- a/apps/server/src/orchestration-v2/EffectOutbox.ts +++ b/apps/server/src/orchestration-v2/EffectOutbox.ts @@ -32,6 +32,11 @@ export const OrchestrationEffectRequestV2 = Schema.Union([ Schema.Struct({ type: Schema.Literal("provider-turn.start"), runId: RunId, + /** + * "attach" adopts a provider-initiated turn that is already in flight + * (wakeup) instead of sending the run's message to the provider. 
+ */ + turnDelivery: Schema.optional(Schema.Literal("attach")), }), Schema.Struct({ type: Schema.Literal("provider-turn.interrupt"), diff --git a/apps/server/src/orchestration-v2/EffectWorker.ts b/apps/server/src/orchestration-v2/EffectWorker.ts index 612c26c44cf..28e08af7647 100644 --- a/apps/server/src/orchestration-v2/EffectWorker.ts +++ b/apps/server/src/orchestration-v2/EffectWorker.ts @@ -77,7 +77,13 @@ export const executorLayer: Layer.Layer< ); case "provider-turn.start": return providerTurnStart - .start({ threadId: effect.threadId, runId: effect.request.runId }) + .start({ + threadId: effect.threadId, + runId: effect.request.runId, + ...(effect.request.turnDelivery === undefined + ? {} + : { turnDelivery: effect.request.turnDelivery }), + }) .pipe( Effect.mapError( (cause) => diff --git a/apps/server/src/orchestration-v2/Orchestrator.ts b/apps/server/src/orchestration-v2/Orchestrator.ts index b10e370cc5c..c4b03e79357 100644 --- a/apps/server/src/orchestration-v2/Orchestrator.ts +++ b/apps/server/src/orchestration-v2/Orchestrator.ts @@ -2049,6 +2049,45 @@ const makeOrchestrator = Effect.fn("orchestrationV2.Orchestrator.layer")(functio ); const activeRun = projection.runs.find(isBlockingRun); const pendingMergeBackTransfers = pendingMergeBackTransfersForThread(projection); + if (dispatchMode.type === "attach_wakeup") { + // A wakeup run adopts an in-flight provider turn; it can never queue, + // steer, or coexist with a blocking run (a user-requested turn that + // raced the wakeup supersedes it — the adapter drops its buffer). 
+ if (activeRun !== undefined) { + return yield* new OrchestratorDispatchError({ + commandId: command.commandId, + commandType: command.type, + cause: `Thread ${command.threadId} already has blocking run ${activeRun.id}; provider wakeup is superseded.`, + }); + } + if ( + activeProviderThread === undefined || + activeProviderThread.id !== dispatchMode.providerThreadId + ) { + return yield* new OrchestratorDispatchError({ + commandId: command.commandId, + commandType: command.type, + cause: `Provider wakeup for ${dispatchMode.providerThreadId} does not match the active provider thread of thread ${command.threadId}.`, + }); + } + if (modelSelection.instanceId !== activeProviderThread.providerInstanceId) { + return yield* new OrchestratorDispatchError({ + commandId: command.commandId, + commandType: command.type, + cause: `Provider wakeup cannot switch provider instances (thread ${command.threadId}).`, + }); + } + if ( + pendingMergeBackTransfers.length > 0 || + pendingForkTransferForThread(projection) !== undefined + ) { + return yield* new OrchestratorDispatchError({ + commandId: command.commandId, + commandType: command.type, + cause: `Thread ${command.threadId} has pending context transfers; provider wakeup is not attachable.`, + }); + } + } const shouldQueue = activeRun !== undefined && (dispatchMode.type === "defer_start" || @@ -2478,7 +2517,7 @@ const makeOrchestrator = Effect.fn("orchestrationV2.Orchestrator.layer")(functio updatedAt: now, type: "user_message", messageId: command.messageId, - inputIntent: "turn_start", + inputIntent: dispatchMode.type === "attach_wakeup" ? 
"provider_wakeup" : "turn_start", text: command.text, attachments: command.attachments, }; @@ -2587,7 +2626,11 @@ const makeOrchestrator = Effect.fn("orchestrationV2.Orchestrator.layer")(functio id: `effect:${command.commandId}:provider-turn.start:${runId}`, commandId: command.commandId, threadId: command.threadId, - request: { type: "provider-turn.start", runId }, + request: { + type: "provider-turn.start", + runId, + ...(dispatchMode.type === "attach_wakeup" ? { turnDelivery: "attach" as const } : {}), + }, } satisfies PendingOrchestrationEffectV2; if (dispatchMode.type !== "defer_start") { yield* Ref.update(effects, (existing) => [...existing, pendingEffect]); diff --git a/apps/server/src/orchestration-v2/ProviderAdapter.ts b/apps/server/src/orchestration-v2/ProviderAdapter.ts index d8cf1c9c82d..a347af96fdd 100644 --- a/apps/server/src/orchestration-v2/ProviderAdapter.ts +++ b/apps/server/src/orchestration-v2/ProviderAdapter.ts @@ -13,6 +13,7 @@ import { OrchestrationV2ProviderFailure, OrchestrationV2ProviderThread, OrchestrationV2ProviderTurn, + OrchestrationV2ProviderWakeupOrigin, OrchestrationV2RuntimeRequest, OrchestrationV2Subagent, OrchestrationV2TurnItem, @@ -71,6 +72,15 @@ export const ProviderAdapterV2SessionStatus = Schema.Literals([ ]); export type ProviderAdapterV2SessionStatus = typeof ProviderAdapterV2SessionStatus.Type; +/** + * Why a provider started a turn the orchestrator never requested. Providers + * (e.g. the Claude Agent SDK) can resume a session on their own after the + * previous turn completed — background task notifications and scheduled + * wakeups re-invoke the agent without a user prompt. 
+ */ +export const ProviderTurnWakeupOrigin = OrchestrationV2ProviderWakeupOrigin; +export type ProviderTurnWakeupOrigin = OrchestrationV2ProviderWakeupOrigin; + export const ProviderAdapterV2Event = Schema.Union([ Schema.Struct({ type: Schema.Literal("app_thread.created"), @@ -145,6 +155,20 @@ export const ProviderAdapterV2Event = Schema.Union([ failure: OrchestrationV2ProviderFailure, threadDisposition: Schema.Literals(["reusable", "broken"]), }), + /** + * The provider started producing turn activity with no orchestrator-requested + * turn active (provider-initiated wakeup). This is a control signal, not a + * projection event: the session-lifetime wakeup watcher reacts by minting a + * provider-initiated run and attaching to the in-flight turn via + * `attachTurn`. The adapter buffers the wakeup's SDK activity until then. + */ + Schema.Struct({ + type: Schema.Literal("turn.wakeup"), + driver: ProviderDriverKind, + threadId: ThreadId, + providerThreadId: ProviderThreadId, + origin: ProviderTurnWakeupOrigin, + }), ]); export type ProviderAdapterV2Event = typeof ProviderAdapterV2Event.Type; @@ -486,6 +510,16 @@ export interface ProviderAdapterV2SessionRuntime { readonly startTurn: ( input: ProviderAdapterV2TurnInput, ) => Effect.Effect; + /** + * Attach an orchestrator-minted run to a provider-initiated turn that is + * already in flight (announced via a `turn.wakeup` event). Behaves like + * `startTurn` except no prompt is sent to the provider: the adapter adopts + * the turn context and replays the wakeup activity it buffered since the + * wakeup was announced. Adapters that never self-start turns may omit this. 
+ */ + readonly attachTurn?: ( + input: ProviderAdapterV2TurnInput, + ) => Effect.Effect; readonly steerTurn: ( input: ProviderAdapterV2SteerInput, ) => Effect.Effect; diff --git a/apps/server/src/orchestration-v2/ProviderEventIngestor.ts b/apps/server/src/orchestration-v2/ProviderEventIngestor.ts index 91e4a974390..64774f46f03 100644 --- a/apps/server/src/orchestration-v2/ProviderEventIngestor.ts +++ b/apps/server/src/orchestration-v2/ProviderEventIngestor.ts @@ -252,6 +252,10 @@ export const layer: Layer.Layer()("t3/orchestration-v2/ProviderSessionManager/ProviderSessionManagerV2") {} +/** + * Reacts to `turn.wakeup` adapter events — a provider starting a turn the + * orchestrator never requested (e.g. the Claude SDK resuming after a + * background task notification). The session manager's event pump is the only + * session-lifetime consumer of adapter events (run-scoped ingestion stops at + * each run's terminal), so the observer is invoked from there. The live + * implementation dispatches an `attach_wakeup` message so the wakeup becomes + * a visible provider-initiated run; the default drops wakeups silently (test + * harnesses). 
+ */ +export class ProviderWakeupObserver extends Context.Reference<{ + readonly onWakeup: (input: { + readonly threadId: ThreadId; + readonly providerThreadId: ProviderThreadId; + readonly providerInstanceId: ProviderInstanceId; + readonly providerSessionId: ProviderSessionId; + readonly origin: OrchestrationV2ProviderWakeupOrigin; + }) => Effect.Effect; +}>("t3/orchestration-v2/ProviderWakeupObserver", { + defaultValue: () => ({ onWakeup: () => Effect.void }), +}) {} + interface LiveSessionEntry { readonly attachedThreadIds: ReadonlySet; readonly loadedProviderThreadKeyByThread: ReadonlyMap; @@ -247,6 +271,7 @@ export const layerWithOptions = ( const eventSink = yield* EventSinkV2; const idAllocator = yield* IdAllocatorV2; const projectionStore = yield* ProjectionStoreV2; + const wakeupObserver = yield* ProviderWakeupObserver; const layerScope = yield* Effect.scope; const sessions = yield* Ref.make(new Map()); const nextSubscriberId = yield* Ref.make(0); @@ -995,6 +1020,32 @@ export const layerWithOptions = ( ), ); + const notifyWakeupObserver = ( + entry: LiveSessionEntry, + event: Extract, + ) => + wakeupObserver + .onWakeup({ + threadId: event.threadId, + providerThreadId: event.providerThreadId, + providerInstanceId: entry.runtime.instanceId, + providerSessionId: entry.runtime.providerSessionId, + origin: event.origin, + }) + .pipe( + Effect.catchCause((cause) => + Effect.logWarning("orchestration-v2.driver-session.wakeup-observer-failed", { + providerSessionId: entry.runtime.providerSessionId, + threadId: event.threadId, + cause, + }), + ), + // Forked so a slow wakeup dispatch never stalls the event pump; + // the adapter keeps buffering the wakeup turn until attachTurn. + Effect.forkIn(layerScope), + Effect.asVoid, + ); + const startEventPump = (entry: LiveSessionEntry) => entry.runtime.events.pipe( Stream.runForEach((event) => @@ -1009,6 +1060,9 @@ export const layerWithOptions = ( ? 
persistProviderSessionUpdate(entry, event) : Effect.void, ), + Effect.andThen( + event.type === "turn.wakeup" ? notifyWakeupObserver(entry, event) : Effect.void, + ), Effect.andThen( publishToSubscribers(entry.eventSubscribers, { type: "event", event }), ), diff --git a/apps/server/src/orchestration-v2/ProviderTurnStartService.ts b/apps/server/src/orchestration-v2/ProviderTurnStartService.ts index b8bd2b2fe84..57fb87b78c0 100644 --- a/apps/server/src/orchestration-v2/ProviderTurnStartService.ts +++ b/apps/server/src/orchestration-v2/ProviderTurnStartService.ts @@ -39,6 +39,7 @@ export interface ProviderTurnStartServiceV2Shape { readonly start: (input: { readonly threadId: ThreadId; readonly runId: RunId; + readonly turnDelivery?: "prompt" | "attach"; }) => Effect.Effect; } @@ -71,6 +72,7 @@ export const layer: Layer.Layer< const start = Effect.fn("orchestrationV2.providerTurnStart.start")(function* (input: { readonly threadId: ThreadId; readonly runId: RunId; + readonly turnDelivery?: "prompt" | "attach"; }) { const { runId } = input; const projection = yield* projectionStore.getThreadProjection(input.threadId); @@ -408,6 +410,7 @@ export const layer: Layer.Layer< appThread: projection.thread, providerSessionId, session, + ...(input.turnDelivery === undefined ? 
{} : { turnDelivery: input.turnDelivery }), run: runningRun, rootNode: runningRootNode, checkpointScope, diff --git a/apps/server/src/orchestration-v2/ProviderWakeupService.ts b/apps/server/src/orchestration-v2/ProviderWakeupService.ts new file mode 100644 index 00000000000..1c0624afd16 --- /dev/null +++ b/apps/server/src/orchestration-v2/ProviderWakeupService.ts @@ -0,0 +1,190 @@ +import { + CommandId, + MessageId, + type OrchestrationV2ProviderWakeupOrigin, + type ProviderInstanceId, + type ProviderSessionId, + type ProviderThreadId, + type ThreadId, +} from "@t3tools/contracts"; +import * as Context from "effect/Context"; +import * as Effect from "effect/Effect"; +import * as Layer from "effect/Layer"; +import * as Queue from "effect/Queue"; + +import { OrchestratorV2, type OrchestratorV2Shape } from "./Orchestrator.ts"; +import { ProviderWakeupObserver } from "./ProviderSessionManager.ts"; +import { randomUuidV4 } from "./RandomUuid.ts"; + +/** + * Turns adapter `turn.wakeup` announcements (a provider starting a turn the + * orchestrator never requested — e.g. the Claude SDK resuming after a + * background task notification or a scheduled wakeup timer) into visible + * provider-initiated runs. + * + * The session manager consumes the observer at layer construction while the + * orchestrator (which dispatching requires) is itself built on top of the + * session manager, so the observer cannot depend on the orchestrator + * directly. The relay breaks that cycle: the observer enqueues wakeup + * requests, and a daemon that IS wired to the orchestrator drains the queue + * and dispatches `message.dispatch` commands with the `attach_wakeup` mode. 
+ */ +export interface ProviderWakeupRequest { + readonly threadId: ThreadId; + readonly providerThreadId: ProviderThreadId; + readonly providerInstanceId: ProviderInstanceId; + readonly providerSessionId: ProviderSessionId; + readonly origin: OrchestrationV2ProviderWakeupOrigin; +} + +export class ProviderWakeupRelay extends Context.Service< + ProviderWakeupRelay, + { + readonly offer: (input: ProviderWakeupRequest) => Effect.Effect; + readonly take: Effect.Effect; + } +>()("t3/orchestration-v2/ProviderWakeupService/ProviderWakeupRelay") {} + +export const relayLayer: Layer.Layer = Layer.effect( + ProviderWakeupRelay, + Effect.gen(function* () { + const queue = yield* Queue.unbounded(); + return ProviderWakeupRelay.of({ + offer: (input) => Queue.offer(queue, input).pipe(Effect.asVoid), + take: Queue.take(queue), + }); + }), +); + +export const wakeupObserverLive = Layer.effect( + ProviderWakeupObserver, + Effect.gen(function* () { + const relay = yield* ProviderWakeupRelay; + return { onWakeup: relay.offer }; + }), +); + +const BLOCKING_RUN_STATUSES: ReadonlySet = new Set([ + "preparing", + "queued", + "starting", + "running", + "waiting", +]); + +const QUIESCENCE_WAIT_ATTEMPTS = 10_000; + +const yieldToRuntime = Effect.yieldNow.pipe( + Effect.andThen( + Effect.promise( + () => + new Promise((resolve) => { + setImmediate(resolve); + }), + ), + ), +); + +/** + * A wakeup is announced the instant the provider starts streaming — typically + * milliseconds after the previous turn's result, before that run's terminal + * events have committed to the projection. Wait for the thread to quiesce so + * the attach dispatch does not lose the race against its own predecessor. A + * thread that stays busy (e.g. a user turn genuinely raced the wakeup) drops + * the wakeup — the adapter superseded its buffer anyway. 
+ */ +const waitForThreadQuiescence = (orchestrator: OrchestratorV2Shape, input: ProviderWakeupRequest) => + Effect.gen(function* () { + for (let attempt = 0; attempt < QUIESCENCE_WAIT_ATTEMPTS; attempt += 1) { + const projection = yield* orchestrator.getThreadProjection(input.threadId); + const blockingRun = projection.runs.find((run) => BLOCKING_RUN_STATUSES.has(run.status)); + if (blockingRun === undefined) { + return true; + } + yield* yieldToRuntime; + } + return false; + }); + +/** + * Dispatch failures are logged, never propagated: a wakeup that loses the + * race against a user-requested turn (or hits a non-attachable thread state, + * e.g. a pending context transfer) is intentionally superseded — the adapter + * drops its buffered activity when the next turn starts. + */ +const dispatchWakeup = (orchestrator: OrchestratorV2Shape, input: ProviderWakeupRequest) => + Effect.gen(function* () { + const quiesced = yield* waitForThreadQuiescence(orchestrator, input); + if (!quiesced) { + yield* Effect.logWarning("orchestration-v2.provider-wakeup.thread-stayed-busy", { + threadId: input.threadId, + providerThreadId: input.providerThreadId, + providerSessionId: input.providerSessionId, + }); + return; + } + const uuid = yield* randomUuidV4; + const dispatched = yield* orchestrator.dispatch({ + type: "message.dispatch", + commandId: CommandId.make(`command:provider-wakeup:${uuid}`), + threadId: input.threadId, + messageId: MessageId.make(`message:provider-wakeup:${uuid}`), + text: wakeupMessageText(input.origin), + attachments: [], + createdBy: "system", + creationSource: "provider", + dispatchMode: { + type: "attach_wakeup", + providerThreadId: input.providerThreadId, + origin: input.origin, + }, + }); + yield* Effect.logInfo("orchestration-v2.provider-wakeup.dispatched", { + threadId: input.threadId, + providerThreadId: input.providerThreadId, + providerSessionId: input.providerSessionId, + origin: input.origin, + sequence: dispatched.sequence, + }); + }).pipe( + 
Effect.catchCause((cause) => + Effect.logWarning("orchestration-v2.provider-wakeup.dispatch-failed", { + threadId: input.threadId, + providerThreadId: input.providerThreadId, + providerSessionId: input.providerSessionId, + origin: input.origin, + cause, + }), + ), + Effect.asVoid, + ); + +export const runWakeupDispatcher: Effect.Effect< + never, + never, + ProviderWakeupRelay | OrchestratorV2 +> = Effect.gen(function* () { + const relay = yield* ProviderWakeupRelay; + const orchestrator = yield* OrchestratorV2; + return yield* relay.take.pipe( + Effect.flatMap((input) => dispatchWakeup(orchestrator, input)), + Effect.forever, + ); +}); + +export const wakeupDispatcherDaemonLayer: Layer.Layer< + never, + never, + ProviderWakeupRelay | OrchestratorV2 +> = Layer.effectDiscard(runWakeupDispatcher.pipe(Effect.forkScoped)); + +function wakeupMessageText(origin: OrchestrationV2ProviderWakeupOrigin): string { + switch (origin.kind) { + case "task_notification": + return origin.detail === undefined + ? "Resumed by the provider: a background task finished." + : `Resumed by the provider: a background task finished — ${origin.detail}`; + case "unknown": + return "Resumed by the provider."; + } +} diff --git a/apps/server/src/orchestration-v2/RunExecutionService.ts b/apps/server/src/orchestration-v2/RunExecutionService.ts index c3da3c886fd..a1010daa699 100644 --- a/apps/server/src/orchestration-v2/RunExecutionService.ts +++ b/apps/server/src/orchestration-v2/RunExecutionService.ts @@ -33,6 +33,7 @@ import { CheckpointServiceV2 } from "./CheckpointService.ts"; import { EventSinkV2 } from "./EventSink.ts"; import { IdAllocatorV2, type IdAllocatorV2Shape } from "./IdAllocator.ts"; import type { + ProviderAdapterV2Error, ProviderAdapterV2Event, ProviderAdapterV2RuntimePolicy, ProviderAdapterV2SessionRuntime, @@ -162,18 +163,35 @@ export function routeProviderEvent( return belongs ? 
[true, addProviderTurn(event.providerTurn.id, isRoot)] : [false, state]; } case "node.updated": { - const belongs = ownsRun(event.node.runId) || ownsChildThread(event.node.threadId); - if (!belongs || event.node.providerThreadId === null) { + // Same-provider-thread updates for earlier runs are legitimate: a + // backgrounded command spawned by a previous turn completes during this + // run (often a wakeup run) and the adapter re-emits the original + // node/item with its terminal status. + const nodeProviderThreadId = event.node.providerThreadId ?? null; + const belongs = + ownsRun(event.node.runId) || + ownsChildThread(event.node.threadId) || + (nodeProviderThreadId !== null && + state.ownedProviderThreadIds.has(nodeProviderThreadId)); + if (!belongs || nodeProviderThreadId === null) { return [belongs, state]; } - return [true, addProviderThread(event.node.providerThreadId)]; + return [true, addProviderThread(nodeProviderThreadId)]; } case "subagent.updated": return [ownsRun(event.subagent.runId) || ownsChildThread(event.subagent.threadId), state]; case "message.updated": return [ownsRun(event.message.runId) || ownsChildThread(event.message.threadId), state]; - case "turn_item.updated": - return [ownsRun(event.turnItem.runId) || ownsChildThread(event.turnItem.threadId), state]; + case "turn_item.updated": { + const itemProviderThreadId = event.turnItem.providerThreadId ?? 
null; + return [ + ownsRun(event.turnItem.runId) || + ownsChildThread(event.turnItem.threadId) || + (itemProviderThreadId !== null && + state.ownedProviderThreadIds.has(itemProviderThreadId)), + state, + ]; + } case "plan.updated": return [ownsRun(event.plan.runId) || ownsChildThread(event.plan.threadId), state]; case "runtime_request.updated": @@ -185,6 +203,10 @@ export function routeProviderEvent( ]; case "turn.terminal": return [event.providerTurnId === state.rootProviderTurnId, state]; + case "turn.wakeup": + // Wakeups are handled by the session-lifetime wakeup watcher, never by + // an active run's ingestion pipeline. + return [false, state]; } } @@ -244,6 +266,12 @@ export interface RunExecutionServiceV2StartRootRunInput { readonly message: ProviderAdapterV2TurnMessage; readonly modelSelection: ModelSelection; readonly runtimePolicy: ProviderAdapterV2RuntimePolicy; + /** + * "prompt" (default) sends the message to the provider via `startTurn`. + * "attach" adopts a provider-initiated turn that is already in flight via + * `attachTurn` — used by the wakeup watcher; nothing is sent to the provider. 
+ */ + readonly turnDelivery?: "prompt" | "attach"; } export interface RunExecutionServiceV2Shape { @@ -755,21 +783,30 @@ export const layer: Layer.Layer< return; } - yield* input.session - .startTurn({ - appThread: input.appThread, - threadId: input.run.threadId, - runId: input.run.id, - runOrdinal: input.run.ordinal, - providerTurnOrdinal: input.providerTurnOrdinal, - attemptId: input.attemptId, - rootNodeId: input.rootNode.id, - providerThread: input.providerThread, - message: input.message, - modelSelection: input.modelSelection, - runtimePolicy: input.runtimePolicy, - }) - .pipe( + const turnInput = { + appThread: input.appThread, + threadId: input.run.threadId, + runId: input.run.id, + runOrdinal: input.run.ordinal, + providerTurnOrdinal: input.providerTurnOrdinal, + attemptId: input.attemptId, + rootNodeId: input.rootNode.id, + providerThread: input.providerThread, + message: input.message, + modelSelection: input.modelSelection, + runtimePolicy: input.runtimePolicy, + }; + const deliverTurn: Effect.Effect = + input.turnDelivery === "attach" + ? input.session.attachTurn === undefined + ? 
Effect.die( + new Error( + `Provider session ${input.providerSessionId} emitted a turn wakeup but its adapter does not implement attachTurn.`, + ), + ) + : input.session.attachTurn(turnInput) + : input.session.startTurn(turnInput); + yield* deliverTurn.pipe( Effect.catchCause((cause) => Effect.logError("orchestration V2 provider turn start failed", { runId: input.run.id, diff --git a/apps/server/src/orchestration-v2/runtimeLayer.ts b/apps/server/src/orchestration-v2/runtimeLayer.ts index 33a3b796a51..43b7dc23f21 100644 --- a/apps/server/src/orchestration-v2/runtimeLayer.ts +++ b/apps/server/src/orchestration-v2/runtimeLayer.ts @@ -26,6 +26,11 @@ import { layer as projectionMaintenanceLayer } from "./ProjectionMaintenance.ts" import { layerFromProviderInstanceRegistry as providerAdapterRegistryLayerFromProviderInstances } from "./ProviderAdapterRegistry.ts"; import { layer as providerEventIngestorLayer } from "./ProviderEventIngestor.ts"; import { layer as providerSessionManagerLayer } from "./ProviderSessionManager.ts"; +import { + relayLayer as providerWakeupRelayLayer, + wakeupDispatcherDaemonLayer, + wakeupObserverLive, +} from "./ProviderWakeupService.ts"; import { layer as providerRuntimeRecoveryLayer } from "./ProviderRuntimeRecoveryService.ts"; import { layer as providerSwitchServiceLayer } from "./ProviderSwitchService.ts"; import { layer as providerTurnControlServiceLayer } from "./ProviderTurnControlService.ts"; @@ -81,6 +86,10 @@ const providerSwitchServiceProvided = providerSwitchServiceLayer.pipe( Layer.provide(providerAdapterRegistryProvided), ); +const providerWakeupObserverProvided = wakeupObserverLive.pipe( + Layer.provide(providerWakeupRelayLayer), +); + const providerSessionManagerProvided = providerSessionManagerLayer.pipe( Layer.provide( Layer.mergeAll( @@ -88,6 +97,7 @@ const providerSessionManagerProvided = providerSessionManagerLayer.pipe( eventSinkProvided, idAllocatorLayer, projectionStoreLayer, + providerWakeupObserverProvided, ), ), ); 
@@ -222,6 +232,10 @@ const scheduledTaskProvided = scheduledTaskServiceLayer.pipe( Layer.provide(Layer.mergeAll(threadLaunchProvided, threadManagementProvided)), ); +const providerWakeupDispatcherProvided = wakeupDispatcherDaemonLayer.pipe( + Layer.provide(Layer.merge(providerWakeupRelayLayer, orchestratorProvided)), +); + export const OrchestrationV2LayerLive = Layer.mergeAll( orchestratorProvided, threadManagementProvided, @@ -229,6 +243,7 @@ export const OrchestrationV2LayerLive = Layer.mergeAll( providerSessionManagerProvided, providerRuntimeRecoveryProvided, projectionMaintenanceProvided, + providerWakeupDispatcherProvided, ); export const OrchestrationV2ProductionLayerLive = Layer.mergeAll( diff --git a/apps/server/src/orchestration-v2/testkit/OrchestratorReplayFixtures.contract.test.ts b/apps/server/src/orchestration-v2/testkit/OrchestratorReplayFixtures.contract.test.ts index a82947ba958..2916816a02b 100644 --- a/apps/server/src/orchestration-v2/testkit/OrchestratorReplayFixtures.contract.test.ts +++ b/apps/server/src/orchestration-v2/testkit/OrchestratorReplayFixtures.contract.test.ts @@ -69,7 +69,12 @@ describe("orchestrator replay fixture contract", () => { throw new Error(`${fixture.name}/${provider.driver} must start with thread.create`); } assert.equal(firstCommand.threadId, materialized.projectionThreadIds[0]); - assert.equal(materialized.commands.length, fixture.buildInput().steps.length + 1); + // await_provider_wakeup_run steps only await runs minted by the + // wakeup dispatcher; every other input step dispatches a command. 
+ const commandProducingSteps = fixture + .buildInput() + .steps.filter((step) => step.type !== "await_provider_wakeup_run"); + assert.equal(materialized.commands.length, commandProducingSteps.length + 1); assert.isAtLeast(materialized.steps.length, materialized.commands.length); assert.equal(typeof provider.assertOutput, "function"); diff --git a/apps/server/src/orchestration-v2/testkit/ProviderReplayHarness.ts b/apps/server/src/orchestration-v2/testkit/ProviderReplayHarness.ts index b2f5c2cb444..b20e304f68c 100644 --- a/apps/server/src/orchestration-v2/testkit/ProviderReplayHarness.ts +++ b/apps/server/src/orchestration-v2/testkit/ProviderReplayHarness.ts @@ -37,6 +37,11 @@ import { OrchestratorV2, type OrchestratorV2Error } from "../Orchestrator.ts"; import { ProviderAdapterRegistryV2 } from "../ProviderAdapterRegistry.ts"; import { layer as providerEventIngestorLayer } from "../ProviderEventIngestor.ts"; import { layerWithOptions as providerSessionManagerLayerWithOptions } from "../ProviderSessionManager.ts"; +import { + relayLayer as providerWakeupRelayLayer, + runWakeupDispatcher, + wakeupObserverLive, +} from "../ProviderWakeupService.ts"; import { layer as providerSwitchServiceLayer } from "../ProviderSwitchService.ts"; import { layer as providerTurnControlServiceLayer } from "../ProviderTurnControlService.ts"; import { layer as providerTurnStartServiceLayer } from "../ProviderTurnStartService.ts"; @@ -267,6 +272,9 @@ export function makeOrchestratorV2ReplayLayerWithRegistry( idAllocatorLayer, providerEventIngestorProvided, ); + const providerWakeupObserverProvided = wakeupObserverLive.pipe( + Layer.provide(providerWakeupRelayLayer), + ); const providerSessionManagerProvided = providerSessionManagerLayerWithOptions({ configureMcp: false, }).pipe( @@ -277,6 +285,7 @@ export function makeOrchestratorV2ReplayLayerWithRegistry( idAllocatorLayer, mcpSessionRegistryTestLayer, storesLayer, + providerWakeupObserverProvided, ), ), ); @@ -378,7 +387,13 @@ export 
function makeOrchestratorV2ReplayLayerWithRegistry( Effect.gen(function* () { const orchestrator = yield* OrchestratorV2; yield* runEffectWorkerDaemon.pipe(Effect.forkScoped); + // Provider-initiated turn wakeups (adapter `turn.wakeup` announcements) + // are dispatched by the same daemon wiring the production runtime uses. + yield* runWakeupDispatcher.pipe( + Effect.provideService(OrchestratorV2, orchestrator), + Effect.forkScoped, + ); return orchestrator; }), - ).pipe(Layer.provide(replayRuntime)); + ).pipe(Layer.provide(Layer.merge(replayRuntime, providerWakeupRelayLayer))); } diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/claude_transcript.ndjson b/apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/claude_transcript.ndjson new file mode 100644 index 00000000000..48312c112df --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/claude_transcript.ndjson @@ -0,0 +1,21 @@ +{"type":"transcript_start","provider":"claudeAgent","protocol":"claude-agent-sdk.query","version":"0.2.111","scenario":"claude_provider_wakeup","metadata":{"prompts":["Start a background watcher that polls for new bot reviews, then confirm you are waiting.","Wrap up with a summary."],"model":"claude-sonnet-4-6","nativeSessionId":"b896a082-a480-416f-8ffd-c42845e71993","queryMode":"streaming","tools":"claude_code","permissionMode":"bypassPermissions","generatedBy":"manual-replay-from-thread-47763f5e (post-turn task-notification wakeup, 2026-07-01T22:12:56Z)"}} +{"type":"expect_outbound","label":"query.open","frame":{"type":"query.open","options":{"model":"claude-sonnet-4-6","tools":{"type":"preset","preset":"claude_code"},"permissionMode":"bypassPermissions","allowDangerouslySkipPermissions":true,"sessionId":"b896a082-a480-416f-8ffd-c42845e71993"}}} +{"type":"expect_outbound","label":"prompt.offer:1","frame":{"type":"prompt.offer","message":{"type":"user","message":{"role":"user","content":"Start a 
background watcher that polls for new bot reviews, then confirm you are waiting."},"parent_tool_use_id":null}}} +{"type":"emit_inbound","label":"system:init:1","frame":{"type":"system","subtype":"init","agents":[],"apiKeySource":"none","claude_code_version":"2.1.183","cwd":"/tmp/claude-replay-claude_provider_wakeup","tools":[],"mcp_servers":[],"model":"claude-sonnet-4-6","permissionMode":"bypassPermissions","slash_commands":[],"output_style":"default","skills":[],"plugins":[],"fast_mode_state":"off","uuid":"9e0a54c1-0000-4000-8000-000000000001","session_id":"b896a082-a480-416f-8ffd-c42845e71993"}} +{"type":"emit_inbound","label":"assistant:watcher_bash","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01wakeupfixture1","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01Gd1WxjAEZD4S4LcAyzDktJ","name":"Bash","input":{"command":"while true; do gh pr view --json comments; sleep 60; done","run_in_background":true,"description":"Poll for new bot reviews on the PR"},"caller":{"type":"direct"}}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"b896a082-a480-416f-8ffd-c42845e71993","uuid":"9e0a54c1-0000-4000-8000-000000000002"}} +{"type":"emit_inbound","label":"task_started:watcher","frame":{"type":"system","subtype":"task_started","task_id":"bdknlk0tp","tool_use_id":"toolu_01Gd1WxjAEZD4S4LcAyzDktJ","description":"Poll for new bot reviews on the PR","task_type":"local_bash","uuid":"9e0a54c1-0000-4000-8000-000000000003","session_id":"b896a082-a480-416f-8ffd-c42845e71993"}} +{"type":"emit_inbound","label":"user:watcher_started","frame":{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01Gd1WxjAEZD4S4LcAyzDktJ","type":"tool_result","content":"Command running in 
background with ID: bdknlk0tp","is_error":false}]},"parent_tool_use_id":null,"session_id":"b896a082-a480-416f-8ffd-c42845e71993","uuid":"9e0a54c1-0000-4000-8000-000000000004","tool_use_result":{"stdout":"","stderr":"","interrupted":false,"isImage":false,"noOutputExpected":false,"backgroundTaskId":"bdknlk0tp"}}} +{"type":"emit_inbound","label":"assistant:waiting","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01wakeupfixture2","type":"message","role":"assistant","content":[{"type":"text","text":"Watcher is running - I'll pick it back up the moment the bots post."}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"b896a082-a480-416f-8ffd-c42845e71993","uuid":"9e0a54c1-0000-4000-8000-000000000005"}} +{"type":"emit_inbound","label":"result:1","frame":{"type":"result","subtype":"success","is_error":false,"api_error_status":null,"duration_ms":4000,"duration_api_ms":3500,"num_turns":3,"result":"Watcher is running - I'll pick it back up the moment the bots 
post.","stop_reason":"end_turn","session_id":"b896a082-a480-416f-8ffd-c42845e71993","total_cost_usd":0.001,"usage":{"input_tokens":2,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":2,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":0,"ephemeral_5m_input_tokens":0},"inference_geo":"","iterations":[{"input_tokens":1,"output_tokens":1,"cache_read_input_tokens":0,"cache_creation_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"type":"message"}],"speed":"standard"},"modelUsage":{"claude-sonnet-4-6":{"inputTokens":2,"outputTokens":2,"cacheReadInputTokens":0,"cacheCreationInputTokens":0,"webSearchRequests":0,"costUSD":0.001,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"terminal_reason":"completed","fast_mode_state":"off","uuid":"9e0a54c1-0000-4000-8000-000000000006"}} +{"type":"emit_inbound","label":"wakeup:task_updated","frame":{"type":"system","subtype":"task_updated","task_id":"bdknlk0tp","patch":{"status":"completed","end_time":1782943976449},"uuid":"9e0a54c1-0000-4000-8000-000000000007","session_id":"b896a082-a480-416f-8ffd-c42845e71993"}} +{"type":"emit_inbound","label":"wakeup:task_notification","frame":{"type":"system","subtype":"task_notification","task_id":"bdknlk0tp","tool_use_id":"toolu_01Gd1WxjAEZD4S4LcAyzDktJ","status":"completed","output_file":"/tmp/tasks/bdknlk0tp.output","summary":"Background command \"Poll for new bot reviews on the PR\" completed","uuid":"9e0a54c1-0000-4000-8000-000000000008","session_id":"b896a082-a480-416f-8ffd-c42845e71993"}} 
+{"type":"emit_inbound","label":"wakeup:init","frame":{"type":"system","subtype":"init","agents":[],"apiKeySource":"none","claude_code_version":"2.1.183","cwd":"/tmp/claude-replay-claude_provider_wakeup","tools":[],"mcp_servers":[],"model":"claude-sonnet-4-6","permissionMode":"bypassPermissions","slash_commands":[],"output_style":"default","skills":[],"plugins":[],"fast_mode_state":"off","uuid":"9e0a54c1-0000-4000-8000-000000000009","session_id":"b896a082-a480-416f-8ffd-c42845e71993"}} +{"type":"emit_inbound","label":"wakeup:message_start","frame":{"type":"stream_event","event":{"type":"message_start","message":{"model":"claude-sonnet-4-6","id":"msg_01wakeupfixture3","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"}}},"parent_tool_use_id":null,"session_id":"b896a082-a480-416f-8ffd-c42845e71993","uuid":"9e0a54c1-0000-4000-8000-00000000000a"}} +{"type":"emit_inbound","label":"wakeup:assistant_bash","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01wakeupfixture3","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01wakeupcheckreviews1","name":"Bash","input":{"command":"gh api repos/:owner/:repo/pulls/1/comments --jq 'length'"},"caller":{"type":"direct"}}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"b896a082-a480-416f-8ffd-c42845e71993","uuid":"9e0a54c1-0000-4000-8000-00000000000b"}} 
+{"type":"emit_inbound","label":"wakeup:user_result","frame":{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01wakeupcheckreviews1","type":"tool_result","content":"0","is_error":false}]},"parent_tool_use_id":null,"session_id":"b896a082-a480-416f-8ffd-c42845e71993","uuid":"9e0a54c1-0000-4000-8000-00000000000c","tool_use_result":{"stdout":"0","stderr":"","interrupted":false,"isImage":false,"noOutputExpected":false}}} +{"type":"emit_inbound","label":"wakeup:assistant_final","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01wakeupfixture4","type":"message","role":"assistant","content":[{"type":"text","text":"Watcher fired: the bot review round is clean, nothing new to fix."}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"b896a082-a480-416f-8ffd-c42845e71993","uuid":"9e0a54c1-0000-4000-8000-00000000000d"}} +{"type":"emit_inbound","label":"result:2","frame":{"type":"result","subtype":"success","is_error":false,"api_error_status":null,"duration_ms":4000,"duration_api_ms":3500,"num_turns":2,"result":"Watcher fired: the bot review round is clean, nothing new to 
fix.","stop_reason":"end_turn","session_id":"b896a082-a480-416f-8ffd-c42845e71993","total_cost_usd":0.001,"usage":{"input_tokens":2,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":2,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":0,"ephemeral_5m_input_tokens":0},"inference_geo":"","iterations":[{"input_tokens":1,"output_tokens":1,"cache_read_input_tokens":0,"cache_creation_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"type":"message"}],"speed":"standard"},"modelUsage":{"claude-sonnet-4-6":{"inputTokens":2,"outputTokens":2,"cacheReadInputTokens":0,"cacheCreationInputTokens":0,"webSearchRequests":0,"costUSD":0.001,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"terminal_reason":"completed","fast_mode_state":"off","uuid":"9e0a54c1-0000-4000-8000-00000000000e","origin":{"kind":"task-notification"}}} +{"type":"expect_outbound","label":"prompt.offer:2","frame":{"type":"prompt.offer","message":{"type":"user","message":{"role":"user","content":"Wrap up with a summary."},"parent_tool_use_id":null}}} +{"type":"emit_inbound","label":"assistant:wrap_up","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01wakeupfixture5","type":"message","role":"assistant","content":[{"type":"text","text":"claude provider wakeup fixture complete"}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"b896a082-a480-416f-8ffd-c42845e71993","uuid":"9e0a54c1-0000-4000-8000-00000000000f"}} 
+{"type":"emit_inbound","label":"result:3","frame":{"type":"result","subtype":"success","is_error":false,"api_error_status":null,"duration_ms":4000,"duration_api_ms":3500,"num_turns":1,"result":"claude provider wakeup fixture complete","stop_reason":"end_turn","session_id":"b896a082-a480-416f-8ffd-c42845e71993","total_cost_usd":0.001,"usage":{"input_tokens":2,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":2,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":0,"ephemeral_5m_input_tokens":0},"inference_geo":"","iterations":[{"input_tokens":1,"output_tokens":1,"cache_read_input_tokens":0,"cache_creation_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"type":"message"}],"speed":"standard"},"modelUsage":{"claude-sonnet-4-6":{"inputTokens":2,"outputTokens":2,"cacheReadInputTokens":0,"cacheCreationInputTokens":0,"webSearchRequests":0,"costUSD":0.001,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"terminal_reason":"completed","fast_mode_state":"off","uuid":"9e0a54c1-0000-4000-8000-000000000010"}} +{"type":"runtime_exit","status":"success"} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/input.ts b/apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/input.ts new file mode 100644 index 00000000000..1c8e6f44ca7 --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/input.ts @@ -0,0 +1,22 @@ +import { + CLAUDE_PROVIDER_WAKEUP_FOLLOW_UP, + CLAUDE_PROVIDER_WAKEUP_PROMPT, + type OrchestratorFixtureInput, +} from "../shared.ts"; + +/** + * Recorded from thread 47763f5e (2026-07-01): a turn ends with a background + * watcher still running; the Claude SDK later resumes the session on its own + * (task_updated + task_notification → init → new streaming turn). 
The + * orchestrator must mint a provider-initiated run for that wakeup turn — run + * ordinal 2 below is created by the wakeup dispatcher, not by a step. + */ +export function claudeProviderWakeupInput(): OrchestratorFixtureInput { + return { + steps: [ + { type: "message", text: CLAUDE_PROVIDER_WAKEUP_PROMPT }, + { type: "await_provider_wakeup_run", runOrdinal: 2 }, + { type: "message", text: CLAUDE_PROVIDER_WAKEUP_FOLLOW_UP }, + ], + }; +} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/output.ts b/apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/output.ts new file mode 100644 index 00000000000..fe0834ae526 --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_provider_wakeup/output.ts @@ -0,0 +1,116 @@ +import { assert } from "@effect/vitest"; +import type { ProviderReplayTranscript } from "@t3tools/contracts"; + +import type { OrchestratorV2ScenarioResult } from "../../OrchestratorScenario.ts"; +import { + assertBaseProjection, + assertSemanticProjectionIntegrity, + assertUserMessagesInclude, + CLAUDE_PROVIDER_WAKEUP_FOLLOW_UP, + CLAUDE_PROVIDER_WAKEUP_PROMPT, + projectionFor, +} from "../shared.ts"; + +export function assertClaudeProviderWakeupOutput( + result: OrchestratorV2ScenarioResult, + transcript: ProviderReplayTranscript, +) { + assertBaseProjection({ + result, + transcript, + runCount: 3, + runStatuses: ["completed", "completed", "completed"], + }); + + const projection = projectionFor(result, transcript.scenario); + assertSemanticProjectionIntegrity(projection); + assertUserMessagesInclude(projection, [ + CLAUDE_PROVIDER_WAKEUP_PROMPT, + CLAUDE_PROVIDER_WAKEUP_FOLLOW_UP, + ]); + + // Run 2 is the provider-initiated wakeup: minted by the wakeup dispatcher + // reacting to the adapter's `turn.wakeup` announcement, never by a user + // command. Its synthetic message records why the provider resumed. 
+ const wakeupRun = projection.runs.find((run) => run.ordinal === 2); + assert.isDefined(wakeupRun); + assert.equal(wakeupRun?.status, "completed"); + const wakeupMessage = projection.messages.find( + (message) => message.id === wakeupRun?.userMessageId, + ); + assert.isDefined(wakeupMessage); + assert.equal(wakeupMessage?.createdBy, "system"); + assert.equal(wakeupMessage?.creationSource, "provider"); + assert.include( + wakeupMessage?.text ?? "", + "Resumed by the provider: a background task finished", + ); + assert.include( + wakeupMessage?.text ?? "", + 'Background command "Poll for new bot reviews on the PR" completed', + ); + + const wakeupUserItem = projection.turnItems.find( + (item) => item.runId === wakeupRun?.id && item.type === "user_message", + ); + assert.isDefined(wakeupUserItem); + assert.equal( + wakeupUserItem?.type === "user_message" ? wakeupUserItem.inputIntent : undefined, + "provider_wakeup", + ); + + // The wakeup turn's buffered activity must be replayed into the attached + // run: the review-check command and the closing assistant message. + const wakeupItems = projection.turnItems.filter( + (item) => item.runId === wakeupRun?.id && item.type !== "checkpoint", + ); + assert.deepEqual( + wakeupItems.map((item) => item.type), + ["user_message", "command_execution", "assistant_message"], + ); + const wakeupCommand = wakeupItems.find((item) => item.type === "command_execution"); + assert.include(JSON.stringify(wakeupCommand ?? null), "gh api repos/:owner/:repo/pulls/1/comments"); + const wakeupAssistantTexts = wakeupItems.flatMap((item) => + item.type === "assistant_message" ? [item.text] : [], + ); + assert.deepEqual(wakeupAssistantTexts, [ + "Watcher fired: the bot review round is clean, nothing new to fix.", + ]); + + // The watcher was started with run_in_background: its command item belongs + // to run 1 but only completes when the task lifecycle concludes — here via + // the task_updated/task_notification replayed into the wakeup run. 
The + // terminal update must land on the ORIGINAL run-1 item (cross-run routing) + // and append the notification summary to its output. + const run1 = projection.runs.find((run) => run.ordinal === 1); + assert.isDefined(run1); + const watcherItem = projection.turnItems.find( + (item) => item.runId === run1?.id && item.type === "command_execution", + ); + assert.isDefined(watcherItem); + assert.equal(watcherItem?.status, "completed"); + assert.include( + watcherItem?.type === "command_execution" ? (watcherItem.output ?? "") : "", + 'Background command "Poll for new bot reviews on the PR" completed', + ); + const watcherNode = projection.nodes.find((node) => node.id === watcherItem?.nodeId); + assert.equal(watcherNode?.status, "completed"); + + // The watcher is a local_bash task: lifecycle bookkeeping (task_updated / + // task_notification replayed into the wakeup turn) must not project + // subagents or child threads. + assert.lengthOf(result.projections, 1); + assert.lengthOf(projection.subagents, 0); + assert.lengthOf( + projection.nodes.filter((node) => node.kind === "subagent"), + 0, + ); + + // Follow-up run 3 proves the session stays usable after an attached wakeup. + const followUpRun = projection.runs.find((run) => run.ordinal === 3); + assert.isDefined(followUpRun); + const followUpTexts = projection.turnItems.flatMap((item) => + item.runId === followUpRun?.id && item.type === "assistant_message" ? 
[item.text] : [], + ); + assert.deepEqual(followUpTexts, ["claude provider wakeup fixture complete"]); +} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/index.ts b/apps/server/src/orchestration-v2/testkit/fixtures/index.ts index 1d252be087b..0cc15d19410 100644 --- a/apps/server/src/orchestration-v2/testkit/fixtures/index.ts +++ b/apps/server/src/orchestration-v2/testkit/fixtures/index.ts @@ -2,6 +2,8 @@ import { ProviderDriverKind } from "@t3tools/contracts"; import { claudeLocalBashTaskInput } from "./claude_local_bash_task/input.ts"; import { assertClaudeLocalBashTaskOutput } from "./claude_local_bash_task/output.ts"; +import { claudeProviderWakeupInput } from "./claude_provider_wakeup/input.ts"; +import { assertClaudeProviderWakeupOutput } from "./claude_provider_wakeup/output.ts"; import { grokSubagentLineageInput } from "./grok_subagent_lineage/input.ts"; import { assertGrokSubagentLineageOutput } from "./grok_subagent_lineage/output.ts"; import { assertClaudeMessageSteeringOutput } from "./message_steering/claude_output.ts"; @@ -95,6 +97,21 @@ export const ORCHESTRATOR_REPLAY_FIXTURES = [ }, ], }, + { + name: "claude_provider_wakeup", + buildInput: claudeProviderWakeupInput, + providers: [ + { + driver: ProviderDriverKind.make("claudeAgent"), + transcriptFile: new URL( + "./claude_provider_wakeup/claude_transcript.ndjson", + import.meta.url, + ), + modelSelection: CLAUDE_MODEL_SELECTION, + assertOutput: assertClaudeProviderWakeupOutput, + }, + ], + }, { name: "grok_subagent_lineage", buildInput: grokSubagentLineageInput, diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/shared.ts b/apps/server/src/orchestration-v2/testkit/fixtures/shared.ts index 08ab813fdf8..b24b8364c53 100644 --- a/apps/server/src/orchestration-v2/testkit/fixtures/shared.ts +++ b/apps/server/src/orchestration-v2/testkit/fixtures/shared.ts @@ -38,6 +38,9 @@ export const TOOL_CALL_READ_ONLY_WORKSPACE_ROOT = "/tmp/claude-replay-tool_call_ export const 
TOOL_CALL_READ_ONLY_PROMPT = `Read ${TOOL_CALL_READ_ONLY_WORKSPACE_ROOT}/package.json and ${TOOL_CALL_READ_ONLY_WORKSPACE_ROOT}/tsconfig.json, then answer exactly: read only tool fixture complete`; export const CLAUDE_LOCAL_BASH_TASK_PROMPT = "Run a local Bash typecheck command, then answer exactly: claude local bash task fixture complete"; +export const CLAUDE_PROVIDER_WAKEUP_PROMPT = + "Start a background watcher that polls for new bot reviews, then confirm you are waiting."; +export const CLAUDE_PROVIDER_WAKEUP_FOLLOW_UP = "Wrap up with a summary."; export const TOOL_CALL_WRITE_PROMPT = "Create or overwrite .codex-probe-write-action.txt with exactly this text: codex app-server approval fixture. Use a local shell command or file edit only, then briefly report what happened. Do not read package metadata, use GitHub, use web, or use MCP."; export const MESSAGE_STEERING_INITIAL_PROMPT = @@ -192,6 +195,16 @@ export type OrchestratorFixtureInputStep = readonly type: "rollback"; readonly checkpointScopeSuffix: string; readonly checkpointSuffix: string; + } + | { + /** + * Wait for a provider-initiated run (minted asynchronously by the + * wakeup dispatcher reacting to an adapter `turn.wakeup` event, never + * by a dispatched command) to reach a status. + */ + readonly type: "await_provider_wakeup_run"; + readonly runOrdinal: number; + readonly status?: OrchestrationV2RunStatus; }; export interface OrchestratorFixtureInput { @@ -530,6 +543,14 @@ export function materializeFixtureInput(input: { steps.push({ type: "advance_clock", duration: "1 millis" }); steps.push({ type: "await_thread_idle", threadId: ids.threadId }); break; + case "await_provider_wakeup_run": + steps.push({ + type: "await_run_status", + threadId: ids.threadId, + runId: runIdFor(step.runOrdinal), + status: step.status ?? 
"completed", + }); + break; case "steer": messageIndex += 1; steps.push({ diff --git a/apps/web/src/components/chat/MessagesTimeline.logic.test.ts b/apps/web/src/components/chat/MessagesTimeline.logic.test.ts index f846e1affe1..6dea99dd478 100644 --- a/apps/web/src/components/chat/MessagesTimeline.logic.test.ts +++ b/apps/web/src/components/chat/MessagesTimeline.logic.test.ts @@ -626,6 +626,97 @@ describe("deriveMessagesTimelineRows", () => { ]); }); + it("keeps still-running background work visible when the turn settles", () => { + // A Bash command with run_in_background outlives its turn: the run settles + // while the command_execution item stays inProgress until the task's + // lifecycle completes it (possibly during a later provider wakeup). + const backgroundWorkEntry = (toolLifecycleStatus: "inProgress" | "completed") => ({ + id: "background-command-entry", + kind: "work" as const, + createdAt: "2026-01-01T00:00:08Z", + entry: { + id: "background-command", + createdAt: "2026-01-01T00:00:08Z", + runId: "turn-1" as never, + label: "sleep 75 && echo watcher-done", + tone: "tool" as const, + toolLifecycleStatus, + }, + }); + const timelineEntries = (toolLifecycleStatus: "inProgress" | "completed") => [ + { + id: "user-entry", + kind: "message" as const, + createdAt: "2026-01-01T00:00:00Z", + message: { + id: "user-1" as never, + role: "user" as const, + text: "Start a background watcher", + runId: null, + createdAt: "2026-01-01T00:00:00Z", + updatedAt: "2026-01-01T00:00:00Z", + streaming: false, + }, + }, + { + id: "assistant-thought-entry", + kind: "message" as const, + createdAt: "2026-01-01T00:00:05Z", + message: { + id: "assistant-thought" as never, + role: "assistant" as const, + text: "Starting the watcher.", + runId: "turn-1" as never, + createdAt: "2026-01-01T00:00:05Z", + updatedAt: "2026-01-01T00:00:06Z", + streaming: false, + }, + }, + backgroundWorkEntry(toolLifecycleStatus), + { + id: "assistant-final-entry", + kind: "message" as const, + 
createdAt: "2026-01-01T00:00:10Z", + message: { + id: "assistant-final" as never, + role: "assistant" as const, + text: "Waiting for the watcher to finish.", + runId: "turn-1" as never, + createdAt: "2026-01-01T00:00:10Z", + updatedAt: "2026-01-01T00:00:12Z", + streaming: false, + }, + }, + ]; + + const rowsWhileRunning = deriveMessagesTimelineRows({ + timelineEntries: timelineEntries("inProgress"), + isWorking: false, + activeTurnStartedAt: null, + turnDiffSummaryByAssistantMessageId: new Map(), + revertTurnCountByUserMessageId: new Map(), + }); + expect(rowsWhileRunning.map((row) => row.id)).toEqual([ + "user-entry", + "turn-fold:turn-1", + "background-command-entry", + "assistant-final-entry", + ]); + + const rowsAfterCompletion = deriveMessagesTimelineRows({ + timelineEntries: timelineEntries("completed"), + isWorking: false, + activeTurnStartedAt: null, + turnDiffSummaryByAssistantMessageId: new Map(), + revertTurnCountByUserMessageId: new Map(), + }); + expect(rowsAfterCompletion.map((row) => row.id)).toEqual([ + "user-entry", + "turn-fold:turn-1", + "assistant-final-entry", + ]); + }); + it("collapses only output from a superseded V2 attempt within the active logical run", () => { const runId = "run-steered" as never; const supersededAttemptId = "attempt-1" as never; diff --git a/apps/web/src/components/chat/MessagesTimeline.logic.ts b/apps/web/src/components/chat/MessagesTimeline.logic.ts index 81840db724f..3316e7d3641 100644 --- a/apps/web/src/components/chat/MessagesTimeline.logic.ts +++ b/apps/web/src/components/chat/MessagesTimeline.logic.ts @@ -281,7 +281,7 @@ function deriveSupersededAttemptFolds( * a message, the previous turn is still the "active" one until the server * creates the new turn, and folding must not flicker through that window. 
*/ -function deriveUnsettledRunId(latestRun: TimelineLatestRun | null): RunId | null { +export function deriveUnsettledRunId(latestRun: TimelineLatestRun | null): RunId | null { if (!latestRun) { return null; } @@ -293,6 +293,16 @@ function deriveUnsettledRunId(latestRun: TimelineLatestRun | null): RunId | null return isSettled ? null : latestRun.runId; } +/** + * Work that outlives its turn (e.g. a Bash command with run_in_background) + * stays `inProgress` after the run settles. Folding it away with the rest of + * the turn would make the thread look finished while the command is still + * running, so it stays visible until it reaches a terminal status. + */ +function timelineEntryIsInProgressWork(entry: TimelineEntry): boolean { + return entry.kind === "work" && entry.entry.toolLifecycleStatus === "inProgress"; +} + function timelineEntryFoldRunId(entry: TimelineEntry): RunId | null { if (entry.kind === "message" && entry.message.role === "assistant") { return entry.message.runId ?? null; @@ -388,7 +398,11 @@ function deriveTurnFolds(input: { } const hiddenEntryIds = new Set(); for (const entry of group.entries) { - if (entry.id !== group.terminalEntry?.id && !timelineEntryIsPersistentResourceCard(entry)) { + if ( + entry.id !== group.terminalEntry?.id && + !timelineEntryIsPersistentResourceCard(entry) && + !timelineEntryIsInProgressWork(entry) + ) { hiddenEntryIds.add(entry.id); } } diff --git a/apps/web/src/components/chat/MessagesTimeline.test.tsx b/apps/web/src/components/chat/MessagesTimeline.test.tsx index b784a1cebaf..6dad6666ece 100644 --- a/apps/web/src/components/chat/MessagesTimeline.test.tsx +++ b/apps/web/src/components/chat/MessagesTimeline.test.tsx @@ -398,6 +398,74 @@ describe("MessagesTimeline", () => { expect(markup).toContain(">steer<"); }); + it("labels provider wakeup messages as resumed rather than steer", async () => { + const { MessagesTimeline } = await import("./MessagesTimeline"); + const entry = buildUserTimelineEntry("Resumed by the 
provider: a background task finished"); + const markup = renderToStaticMarkup( + , + ); + + expect(markup).toContain(">resumed<"); + expect(markup).not.toContain(">steer<"); + }); + + it("keeps a still-running background command visible after its turn settles", async () => { + const { MessagesTimeline } = await import("./MessagesTimeline"); + const runId = RunId.make("run-1"); + const timelineEntries = [ + buildUserTimelineEntry("Start a background watcher"), + { + id: "background-command-entry", + kind: "work" as const, + createdAt: "2026-03-17T19:12:30.000Z", + entry: { + id: "background-command", + createdAt: "2026-03-17T19:12:30.000Z", + runId, + label: "Ran command", + command: "sleep 75 && echo watcher-done", + rawCommand: "sleep 75 && echo watcher-done", + tone: "tool" as const, + toolLifecycleStatus: "inProgress" as const, + }, + }, + { + id: "assistant-final-entry", + kind: "message" as const, + createdAt: "2026-03-17T19:12:40.000Z", + message: { + id: MessageId.make("assistant-final"), + role: "assistant" as const, + text: "Waiting for the watcher to finish.", + runId, + createdAt: "2026-03-17T19:12:40.000Z", + updatedAt: "2026-03-17T19:12:41.000Z", + streaming: false, + }, + }, + ]; + const markup = renderToStaticMarkup( + , + ); + + expect(markup).toContain("sleep 75 && echo watcher-done"); + expect(markup).toContain("Still running in the background"); + }); + it("shows a collapsed disclosure for superseded attempt output", async () => { const { MessagesTimeline } = await import("./MessagesTimeline"); const runId = RunId.make("run-steered"); diff --git a/apps/web/src/components/chat/MessagesTimeline.tsx b/apps/web/src/components/chat/MessagesTimeline.tsx index 1024ae37796..038cd150834 100644 --- a/apps/web/src/components/chat/MessagesTimeline.tsx +++ b/apps/web/src/components/chat/MessagesTimeline.tsx @@ -33,6 +33,7 @@ import { workEntryIndicatesToolFailure, workEntryIndicatesToolNeutralStatus, workEntryIndicatesToolSuccess, + 
workEntryIsLiveBackgroundWork, workLogEntryIsToolLike, } from "../../session-logic"; import { type TurnDiffSummary } from "../../types"; @@ -75,6 +76,7 @@ import { computeStableMessagesTimelineRows, MAX_VISIBLE_WORK_LOG_ENTRIES, deriveMessagesTimelineRows, + deriveUnsettledRunId, normalizeCompactToolLabel, resolveAssistantMessageCopyState, resolveTimelineIsAtEnd, @@ -160,6 +162,7 @@ interface TimelineRowActivityState { isWorking: boolean; isRevertingCheckpoint: boolean; activeTurnInProgress: boolean; + unsettledRunId: RunId | null; } const TimelineRowCtx = createContext(null!); @@ -468,8 +471,9 @@ export const MessagesTimeline = memo(function MessagesTimeline({ isWorking, isRevertingCheckpoint, activeTurnInProgress, + unsettledRunId: deriveUnsettledRunId(latestRun), }), - [activeTurnInProgress, isRevertingCheckpoint, isWorking], + [activeTurnInProgress, isRevertingCheckpoint, isWorking, latestRun], ); const listHeader = useMemo( () => @@ -1008,26 +1012,35 @@ function UserMessageIntentBadge({ }) { const presentation = intent === "queued_turn" - ? { label: "queued", className: "border-amber-500/25 bg-amber-500/8 text-amber-700" } + ? { + label: "queued", + className: "border-amber-500/25 bg-amber-500/8 text-amber-700", + title: "Queued behind the active turn", + } : intent === "promoted_queued_to_steer" ? { label: "queued → steer", className: "border-sky-500/25 bg-sky-500/8 text-sky-700", + title: "Originally queued, then promoted to steer the active turn", } - : { label: "steer", className: "border-sky-500/25 bg-sky-500/8 text-sky-700" }; + : intent === "provider_wakeup" + ? { + label: "resumed", + className: "border-violet-500/25 bg-violet-500/8 text-violet-700", + title: "The provider resumed the session on its own (e.g. 
a background task finished)", + } + : { + label: "steer", + className: "border-sky-500/25 bg-sky-500/8 text-sky-700", + title: "Steered the active turn", + }; return ( {presentation.label} @@ -1479,12 +1492,20 @@ const WorkGroupSection = memo(function WorkGroupSection({ groupedEntries: Extract["groupedEntries"]; }) { const { workspaceRoot } = use(TimelineRowCtx); + const activity = use(TimelineRowActivityCtx); const [isExpanded, setIsExpanded] = useState(false); const sectionRef = useRef(null); const anchorBottomBeforeToggleRef = useRef(null); + // Background work that outlived its (settled) run is neutral-status but + // must stay visible — it is the only trace of a still-running command. const nonEmptyEntries = useMemo( - () => groupedEntries.filter((entry) => !workEntryIndicatesToolNeutralStatus(entry)), - [groupedEntries], + () => + groupedEntries.filter( + (entry) => + !workEntryIndicatesToolNeutralStatus(entry) || + workEntryIsLiveBackgroundWork(entry, activity.unsettledRunId), + ), + [groupedEntries, activity.unsettledRunId], ); const hasOverflow = nonEmptyEntries.length > MAX_VISIBLE_WORK_LOG_ENTRIES; const visibleEntries = @@ -2347,10 +2368,16 @@ const SimpleWorkEntryRow = memo(function SimpleWorkEntryRow(props: { ? "font-medium text-destructive" : "font-medium text-foreground/82"; const turnSettled = !activity.activeTurnInProgress; - const showNeutralIndicator = !turnSettled && workEntryIndicatesToolNeutralStatus(workEntry); + // Explicitly-inProgress work from a settled run is still running in the + // background — it must not inherit the settled-turn "assume completed" + // checkmark. 
+ const isLiveBackgroundWork = workEntryIsLiveBackgroundWork(workEntry, activity.unsettledRunId); + const showRunningIndicator = isLiveBackgroundWork; + const showNeutralIndicator = + !turnSettled && !isLiveBackgroundWork && workEntryIndicatesToolNeutralStatus(workEntry); const showSuccessIndicator = workEntryIndicatesToolSuccess(workEntry) || - (turnSettled && workEntryIndicatesToolNeutralStatus(workEntry)); + (turnSettled && !isLiveBackgroundWork && workEntryIndicatesToolNeutralStatus(workEntry)); const rowToggleProps = canExpand ? { role: "button" as const, @@ -2444,6 +2471,23 @@ const SimpleWorkEntryRow = memo(function SimpleWorkEntryRow(props: { Completed + ) : showRunningIndicator ? ( + + + } + > + + + Running in the background + ) : showNeutralIndicator ? ( Date: Fri, 3 Jul 2026 01:18:53 -0700 Subject: [PATCH 02/23] Add orchestration v2 audit remediation plan and findings Co-Authored-By: Claude Fable 5 --- .../22-orchestration-v2-audit-findings.json | 958 ++++++++++++++++++ .../22-orchestration-v2-audit-remediation.md | 524 ++++++++++ 2 files changed, 1482 insertions(+) create mode 100644 .plans/22-orchestration-v2-audit-findings.json create mode 100644 .plans/22-orchestration-v2-audit-remediation.md diff --git a/.plans/22-orchestration-v2-audit-findings.json b/.plans/22-orchestration-v2-audit-findings.json new file mode 100644 index 00000000000..9a7b807a8a8 --- /dev/null +++ b/.plans/22-orchestration-v2-audit-findings.json @@ -0,0 +1,958 @@ +{ + "sessionSummaries": [ + { + "sessionKey": "GLOBAL", + "summary": "Global cross-thread audit of the v2 orchestrator DB (43,378 events across 63 streams, 6,286 turn items, 60 threads, 206 runs). 
Core invariants are healthy: event streams have zero version gaps/duplicates and no occurred_at regressions >5s; turn-item ordinals have zero collisions and are 100% consistent with orchestration_v2_turn_item_positions; no nativeItemRef duplicates; outbox (463 rows) and command receipts (580) are fully succeeded/accepted; all 12 failed runs surfaced exactly one user-visible error item; no items/runs stuck beyond the one currently-live run (started 17:03Z, audit ran 17:07Z). Findings are edge-case lineage danglers, generic error strings that discard the underlying cause, sessions left 'ready' after detach, and 13 old local_bash tasks projected as empty-prompt subagents.", + "stats": "43,378 events / 63 streams (0 version gaps, 0 dupes, 0 time regressions >5s); 6,286 turn items (6,042 completed, 243 failed, 2 interrupted, 1 running=live); 206 runs (188 completed, 12 failed, 4 cancelled, 1 interrupted, 1 running=live); 212 provider_turns; 19 provider_sessions (3 ready, 16 stopped); 40 subagents (38 completed, 2 failed); outbox 463 rows all 'succeeded'; receipts 580 all 'accepted'; 12 error items total. Loss sample: 258 native tool_use ids in latest 71e29ba5 log \u2192 all accounted for (254 direct items, 3 Agent spawns as subagent items, 1 in-flight run)." + }, + { + "sessionKey": "a5a643b2-codex-heya", + "summary": "Small 4-run codex smoke session (\"heya\") on 2026-06-28 04:52-05:22 UTC, plus one delegated Claude task child thread. Core pipeline is healthy: all 19 main-thread items + 3 child items are projected with verbatim text, ordinals match turn_item_positions exactly, lineage (run/node/provider_turn refs, subagent -> child thread -> context-transfer -> handoff) fully resolves, and all runs/turns/nodes/items are terminal. 
Two real problems: the codex-native collab subagent spawned in run 3 left zero trace in ingested events (its wait tool call, web search, and final \"Hello.\" message are lost), and the run-4 delegated Claude task that failed with a 401 auth error is recorded as status \"completed\" everywhere because the adapter ignores is_error on subtype \"success\" results.", + "stats": "Main thread: 138 events, 19 projected items (19 distinct item ids in turn-item.updated events), 4 runs / 4 attempts / 4 provider turns, all completed. Child delegated-task thread: 27 events, 3 items, 1 run completed. Native log: 798 lines, 4 turn/start requests but 5 native turns (extra = codex-native subagent thread 019f0c93-d260), 22 item/completed across 2 native threads, 5 reasoning items (all empty summaries), 1 native error (401 auth, is_error:true result)." + }, + { + "sessionKey": "a61e9269-codex", + "summary": "Short two-turn Codex session (\"hey\" then \"spawn a subagent and say hello\") that spawned one provider-native subagent thread. Pipeline health is very good: every user-visible native item (2 user messages, 2 assistant messages, webSearch, subAgentActivity, child assistant message) is ingested and projected with exact untruncated text; lineage is fully consistent (subagent item childThreadId -> child thread, forkedFrom points back to parent provider-thread/turn, provider_turns map to run_attempts); all runs/turns/nodes/items terminal completed matching the native turn/completed events; ordinals match orchestration_v2_turn_item_positions with no collisions or duplicates and projection order matches native emission order. 
Only gaps: an (empty) reasoning item and token-usage/rate-limit notifications are dropped by the adapter, which has no handlers for them.", + "stats": "Native (decoded, deduped): main thread 2 userMessage, 2 agentMessage, 1 subAgentActivity, 1 webSearch, 1 empty reasoning; child thread 1 agentMessage; 3 native turns all turn/completed; 0 native errors, 2 dev-feature warnings. Ingested: 74 events on main stream (versions 0-73, monotonic), 12 on child stream. Projected: 9 turn_items main (incl 2 synthetic checkpoints + workspace-preparation) + 1 child; 2 runs completed, 2 attempts completed, 3 provider_turns completed, 8 nodes completed; positions table matches item ordinals exactly, no duplicates." + }, + { + "sessionKey": "20296b49-grok-tasks", + "summary": "Short grok session (2026-06-29 21:46-21:48): user said \"yoo\", then \"spawn some subagents\"; grok ran 3 parallel Task subagents (git state, architecture docs, mobile connection changes) as child threads and summarized. Pipeline health is good: all runs/attempts/provider-turns completed, provider session stopped, lineage intact, no ordinal collisions, no duplicates, no truncation (subagent item result lengths 3721/6274/2647 exactly match child assistant_message lengths). The children's 1319-2711 events vs 2 projected items is EXPECTED: each child has exactly 2 distinct item ids (prompt + result) and the event volume is per-token streaming re-upserts of the result item, not lost items.", + "stats": "Main thread: 300 events, 14 turn_items, 2 runs (both completed), 2 run_attempts, 2 provider_turns (completed), 9 subagent.updated events -> 3 subagent rows (completed). Children: 3 threads, 5717 events (1319/2711/1687), 2 items each (6 total, all completed), 6 position rows. Native log: 3702 lines (3694 protocol \u2014 payloads redacted to byteLength/itemCount, 8 request), 2 session/prompt \"failed\" (errorTag Interrupt), no other errors. 
Integrity: 0 bad run/node/turn/parent refs, 0 ordinal collisions, 0 nativeItemRef dupes, 0 timestamp regressions." + }, + { + "sessionKey": "721fc23c-cursor", + "summary": "Short cursor (glm-5.2) session \"Hello therw!\" with 2 user turns and zero assistant output. The 7-line native log is genuine: the backend child process fatally crashed (Node error originating in @cursor/sdk/dist/esm/index.js) ~0.3s after run 1's run.started, killing the provider mid-turn; a startup reconcile then cancelled run 1. Run 2 (\"what did you just say\") resumed the agent but agent.send rejected 4ms after run.start and was projected as a failed run with only a generic error string; the process crashed again with the same @cursor/sdk fatal error 29s later. Projection internals (ordinals, positions, statuses, lineage forward-links) are consistent and all state is terminal \u2014 the problems are crash handling and error fidelity, not ingestion mechanics.", + "stats": "47 events on stream 721fc23c-... (seq 6271-6340, versions 0-46); 7 native log lines (all protocol/decoded, 0 assistant/tool output); 4 turn items (2 user_message, 1 command_execution \"Workspace ready\", 1 error); 2 runs (ordinal 1 cancelled, ordinal 2 failed); 2 messages (both user); 1 provider_turn (cancelled); 0 subagents/handoffs/transfers; 2 backend-child process crashes; 2 startup reconciles." + }, + { + "sessionKey": "48663fb7-cursor", + "summary": "Thread 48663fb7 is a single \"hello\" message sent to the cursor provider (glm-5.2) at 2026-06-29T21:49:07Z. The provider session opened and acked run.started at 21:49:09.097Z, then the server restarted; a startup runtime-reconcile at 21:49:10.775Z cancelled the run/attempt/provider-turn, idled the provider thread, and stopped the session. Pipeline health is good: no native output was lost (the provider emitted nothing after run.started), lineage and ordinals are consistent, and nothing is stuck in a non-terminal state. 
The only gap is that the user got no assistant response and no explanation of why the run was cancelled.", + "stats": "Native log: 4 decoded protocol lines (agent.open/agent.opened/run.start/run.started), 0 provider output items, 0 errors. Events: 28 (stream_version 0-27, seq 6318-6346 excl. 6340). Projections: 1 run (cancelled), 1 attempt (cancelled), 1 provider_turn (cancelled), 1 node (cancelled), 2 turn_items (both completed), 1 message, 0 subagents, 0 child threads. Provider session: stopped; provider thread: idle." + }, + { + "sessionKey": "3029dc85-opencode", + "summary": "Opencode chat session \"Hey!\" (2026-06-29 22:02-22:16, idle-reaped 22:46): 7 user turns spawning 5 native opencode subagent sessions, 2 cross-provider delegated claudeAgent tasks (results handed back via context-transfer/handoff), and 3 top-level threads (codex/grok/claude). Pipeline health is very good: all 9 runs/7 attempts/12 provider turns completed, every distinct item id in events has exactly one projection, ordinals match event arrival order with zero collisions or duplicates, lineage (run/node/provider-turn/subagent->child-thread) fully resolves, and the provider session was cleanly stopped after the 30-min idle window. Main issues found are debuggability gaps rather than data loss: the one failed tool call projected no error message (file_search items structurally cannot carry one), file_search outputs are never projected, a placeholder \"pending\" provider_thread row lingers, and the native log itself is payload-redacted so content-level verification is impossible.", + "stats": "Native log: 17,198 lines (15,542 part deltas, 723 part updates, 204 message updates, 7 promptAsync, 5 session.created, 12 session.idle; 0 error lines). Events: main 3,335 (1,047 turn-item.updated over 50 distinct items), opencode children 1051/1107/559/241/209, delegated-task children 94/34. 
Projected items: main 50, opencode children 185, delegated children 23 \u2014 every distinct event item id has exactly one projection (50/90/41/39/8/7 match 1:1). Runs: 7 main + 2 delegated, all completed; 7 attempts completed; 12 provider_turns completed; sessions stopped. 1 failed item (file_search), 0 stuck items, 0 ordinal collisions, 0 dangling run/node/turn/thread refs, 0 nativeItemRef dupes, positions table fully consistent." + }, + { + "sessionKey": "9f8d616d-opencode", + "summary": "Single-turn opencode session ('hey' -> greeting reply) on 2026-06-29: 1 run, 1 provider turn, ~5s of activity, then 30min of heartbeats until the idle reaper stopped the session. The 100-events/5-items ratio is benign: 25 of the 33 turn-item.updated events are streaming updates of one reasoning part. No data loss, no duplicates, no ordering violations, no errors in the native log, and all runs/turns/items are terminal. Only minor findings: the pending-vs-resolved provider-thread dual-row pattern leaves run/attempt/turn pointing at a different provider_thread_id than the items, and the resolved provider-thread row lingered 'active' for ~20h after session stop before being marked idle.", + "stats": "Native log: 273 lines (180 heartbeats, 32 file.watcher, 31 part deltas, 7 part updates, 6 message updates, 1 session.idle, 0 errors). Ingested: 100 events (stream_version 0-99 contiguous, 100 distinct event_ids), incl. 33 turn-item.updated over exactly 5 distinct items. Projected: 5 turn_items (ordinals 1000001-1000005, positions table matches 1:1), 2 messages, 1 run + 1 attempt + 1 provider_turn (all completed), 3 nodes completed, 2 checkpoints, 0 subagents/children, provider_session stopped." + }, + { + "sessionKey": "1156181e-delegated-task", + "summary": "MCP delegated task \"hello-opus-48\" sent one user prompt (\"Say hello briefly and stop.\") to claudeAgent (claude-opus-4-8). 
The API call never succeeded: the SDK logged two api_retry events with error_status 401 (authentication_failed) and then emitted a synthetic assistant message \"Failed to authenticate. API Error: 401 Invalid authentication credentials\" plus a result with subtype \"success\" but is_error:true / api_error_status:401. Ingestion and projection are structurally clean (all 3 items projected, ordinals consistent, lineage valid, nothing stuck), but the run/attempt/provider-turn were all recorded as \"completed\" with no failure info, so the delegated task looks successful despite executing zero tokens.", + "stats": "Native log: 10 decoded lines (1 query.open, 1 prompt.offer, 5 system incl. 2x api_retry 401, 1 synthetic assistant msg, 1 result is_error:true). Events: 30 (stream_version 0-29, sequences 116-168). Projections: 1 run, 1 attempt, 1 provider_turn, 2 messages, 3 turn_items (user_message, assistant_message, checkpoint), 0 child threads/subagents. Errors seen: 401 authentication_failed x3." + }, + { + "sessionKey": "6d618dc4-mcp-group", + "summary": "MCP-driven fan-out session: 2 claudeAgent delegated-task threads plus 3 parallel MCP threads (codex :0, grok :1, claudeAgent :2), all single-run. Pipeline health is good overall: all 122 items verified against native logs with exact tool-call counts (19/19 on :2, 1/1 and 16/16 on the delegated tasks), ordinals fully consistent with turn_item_positions, zero lineage orphans, no duplicates, and the codex 400 model error was ingested verbatim and surfaced as a user-visible failed error item. 
Main defects found: the claudeAgent adapter (as of the code that ran, fc23be8184) concatenates intra-turn assistant text segments without separators into one end-of-turn item, destroying segment boundaries and native interleaving; and the three claudeAgent provider sessions were never idle-reaped, staying 'ready' for ~21 hours until a simultaneous shutdown sweep.", + "stats": "5 threads, 1842 ingested events (34/94/22/1587/105), 122 projected items (4/19/2/75/22), 5 runs (4 completed, 1 failed), 5 provider turns, 5 provider sessions (all stopped). Native: mcp log 3763 lines (grok 3158 / claude 499 / codex 106), delegated log 495 lines. 1 provider error (codex 400), 0 orphan run/node/provider_turn/parent references, 0 ordinal collisions, 0 nativeItemRef dupes, positions match 122/122." + }, + { + "sessionKey": "5dcea72d-grok-subagent", + "summary": "Two-run grok session (\"spawn a subagent\"): run 1 asked the user clarifying questions (xai-question), spawned a background explore subagent; run 2 fetched its output via a blocking TaskOutput tool call and summarized it. Main-thread projections are complete and consistent (16 items, all completed, positions/ordinals match, no dupes, all terminal states correct, no backwards timestamps). The main problem is the subagent child thread: it froze at spawn time with a placeholder result, and the subagent's entire live transcript (116 tool calls over 74.9s) was never ingested \u2014 the real output survives only inside a parent-thread tool item.", + "stats": "Native log: 7028 lines, 3501 decoded incoming + 8 decoded outgoing protocol msgs, 10 request events (2 session/prompt terminations logged failed/Interrupt); payload contents are redacted (shape-only). Ingested: 257 events on main thread + 8 on child thread. Projected: 16 turn_items main + 2 child; 2 runs / 2 attempts / 2 provider_turns all completed; provider session stopped; provider thread idle; 1 subagent row (completed). 0 tool failures / rate limits observed." 
+ }, + { + "sessionKey": "7f1dfff1-claude-scheduled-tasks", + "summary": "Dev session on scheduled tasks: run 1 on codex, then a provider handoff to claudeAgent for runs 2-10 (8 completed, run 9 user-cancelled, run 10 failed 2s after query.open with no SDK response). Pipeline health is very good: zero lost native content in the claudeAgent portion, lineage fully intact, ordering matches native emission exactly, no duplicates, and nothing stuck non-terminal (session stopped, provider threads idle, 0 running items). The only real defect found is that run 10's failure was persisted with a hardcoded generic message, making it undebuggable from the DB.", + "stats": "Main thread: 1707 events, 363 items (169 codex run-1 + 194 claude), 69 messages, 10 runs, 10 provider turns (8 completed/1 cancelled/1 failed). 15 child threads: 2 Explore subagents (95+245 events, 20+50 items) + 13 bash-task threads (7-9 events, 2 items each); all child items completed. Native log 5336 lines (2 files, runs 2-10 only). Cross-checks: 156/156 main tool_use ids accounted for (154 toolu items + 2 Agent calls projected as task subagent items), 64/64 subagent tool_use ids, assistant text preserved exactly (27311 chars log = 27311 chars items), 94 thinking blocks all natively empty (SDK sends thinking_delta with empty text), 5/5 is_error tool_results projected as failed, 1 api_retry (recovered), 14 task_notifications + 1 task_updated-only completion (bqu8waxt3) all reconciled; 0 ordinal collisions, 0 position mismatches, 0 orphan lineage refs. All backgrounded-bash tasks completed within their runs, so cross-run/wakeup paths were not exercised in this session." + }, + { + "sessionKey": "47763f5e-claude-fable", + "summary": "Long claudeAgent coding session (PR #3638 scheduled-tasks work, 20:56-23:43 UTC on 2026-07-01) with 16 runs, 2 Task subagents, and heavy post-turn wakeup activity via task notifications. 
The persisted portion of the pipeline is healthy \u2014 lineage, ordinals/positions, ordering vs native emission, terminal states, and text fidelity all check out \u2014 but roughly 40% of the provider's actual output (10 full post-turn wakeup turns: 16 assistant messages and ~84 tool calls, including real git pushes) was never ingested at all, this being one of the threads on which that gap was originally diagnosed. Two error-handling gaps stand out: a 401 auth failure projected as a completed run, and run 10's failure persisted only as 'Claude Agent SDK query failed.' with no root cause recorded anywhere.", + "stats": "Native log: 8088 lines (all protocol/decoded): 25 results, 594 assistant msgs (68 distinct main-thread text msgs, 122 with thinking), 371 unique tool_use (307 main / 64 child), 49 task_notification, 4 rate_limit (all 'allowed'), 3 query.open, 1 query.close. Ingested: 1546 events on main stream. Projected: 16 runs (15 completed, 1 failed), 16 provider_turns, 308 main items + 70 child items = 378 (positions table matches 378, no collisions/dupes), 68 messages, 2 subagents (both completed, child threads linked). Coverage gap: 84 tool calls + 16 assistant texts in log with no projection (all in wakeup windows); 0 items stuck running/streaming." + }, + { + "sessionKey": "ea84f015-cursor-handoff", + "summary": "Long mixed-provider session (13 claudeAgent runs, then a clean claudeAgent\u2192cursor provider handoff at run 14, then 3 failed cursor runs) with two claudeAgent subagent child threads. Core pipeline health is good: ordering/positions are perfectly consistent, no duplicates, no stuck non-terminal items/nodes/messages, sampled long assistant text survives un-truncated, and handoff lineage (context-transfer consumed \u2192 handoff ready \u2192 run 14, thread.provider-switched) is fully intact. 
The big defect is the known 'invisible post-turn turns' problem: three post-run wakeup turns (10 and 17 native turns' worth of tool calls, including a GitHub comment being posted) produced zero events and zero projections. Secondary issues: 529-Overloaded turns (runs 11-13) projected as completed despite native is_error:true, and cursor run failures persisted with only a generic 'Provider turn failed.' string.", + "stats": "Events: 6482 (main) + 165 + 255 (two subagent child threads). Items: 695 main + 34 + 52 child; positions table matches 695/695 with zero ordinal collisions, zero duplicate nativeItemRefs. Runs: 17 total \u2014 13 claudeAgent completed, 1 cursor completed (14), 3 cursor failed (15/16/17; 15 had a superseded steering_restart attempt). 2 subagents, both completed and linked to child threads. Tool-call coverage: cursor run 15 window 51/51 projected; claudeAgent era 529 native tool_use ids, all projected except 42 post-turn-wakeup calls (lost) and 2 Agent spawns (correctly represented as subagent items). Nodes: 751, all terminal. Errors seen natively: 3 is_error results (529 Overloaded), 1 error_during_execution, 1 model_refusal_fallback, 29 api_retry, 3 cursor run status:error." + }, + { + "sessionKey": "c9e72a05-cursor-failing-turns", + "summary": "Two-run cursor session (composer-2.5, fastMode) where the user asked why cursor turns fail; run 1 completed a full debugging pass, run 2 hung silently for 440s and failed \u2014 reproducing the very bug being investigated. The ingestion/projection pipeline itself is in excellent health: all 123 native tool calls plus reasoning/assistant text project 1:1 with exact character counts (3664/3664 assistant, 3382/3382 thinking), ordering matches native emission exactly, positions table is consistent (150/150, 0 ordinal mismatches or dupes), lineage refs all resolve, no stuck states (run/attempt/turn/node all 'failed' for run 2, session 'stopped' after agent.close), and timestamps are monotonic. 
The one substantive problem is error-handling fidelity: the run 2 failure is projected only as failure {message:\"Provider turn failed.\", code:null}, because CursorAdapterV2 reads a non-existent result.error and never surfaces the Cursor SDK's real error_code (an authentication error), dropping even the requestId/durationMs present in the decoded payload.", + "stats": "Native log: 1920 decoded protocol lines, 2 native runs (run-1cbebefe finished; run-f4d64e64 status error), 123 distinct tool calls (87 shell / 18 read / 14 grep / 4 glob; 118 natively completed), 17 thinking blocks, 3664 assistant chars, 3382 thinking chars. Ingested: 906 orchestration_events for the stream. Projected: 150 turn items (2 user_message, 5 assistant_message, 17 reasoning, 88 command_execution, 36 file_search, 1 checkpoint, 1 error), 2 runs (1 completed / 1 failed), 2 run attempts, 2 provider turns, 7 messages, 0 subagents/child threads. 1 provider failure seen (generic)." + }, + { + "sessionKey": "8ee00dcc-cursor-spacing", + "summary": "Single-run Cursor (composer-2.5, fastMode) session: user asked to remove extra spacing on the mobile Threads/Home screen; the agent searched, read files, made 3 edits to HomeScreen.tsx, and summarized. The pipeline is essentially clean: every native item (33 tool calls, 10 thinking segments, 3 assistant messages, 1 user message) maps 1:1 to projected turn_items with exact content fidelity, ordinals are contiguous and match native emission order, all statuses reached terminal states, and no errors occurred in the native log. 
Only minor gaps found: token usage from the native turn-ended/run.completed is not ingested anywhere, and run_attempt.provider_turn_id is never backfilled.", + "stats": "Native log: 665 lines (all stage=decoded), 659 interaction.update (33 tool-call-started + 33 tool-call-completed, 10 thinking-completed, 94 text-delta, 107 thinking-delta, 377 token-delta, 3 partial-tool-call, 1 turn-ended, 1 step-completed) + run.start/started/completed + agent.open/opened/close. Events: 415 (stream versions 0-414 contiguous; 194 turn-item.updated, 194 node.updated). Projections: 49 turn_items (1 user_message, 1 workspace-prep command_execution, 3 assistant_message, 10 reasoning, 29 file_search, 3 file_change, 1 tool command_execution, 1 checkpoint), 49 positions, 1 run (completed), 1 attempt (completed), 1 provider_turn (completed), 4 messages. Zero errors in native log." + }, + { + "sessionKey": "codex-no-native-logs", + "summary": "Four sibling codex threads multiplexed onto the shared codex app-server session ('provider-session:provider-instance:codex:shared'). The missing native logs are explained: the codex protocol logger is bound to the threadId that opened the shared session (71e29ba5, whose own codex native thread is 019f1b50), so all four threads' traffic went to 71e29ba5's log file and was mostly rotated away by that thread's heavy claude traffic. Events-to-projections consistency is excellent: all 11 runs and provider turns terminal (10 completed, 1 user-interrupted with proper interrupt request/result items), zero lineage violations, zero ordinal collisions or duplicates, projection order matches native start order (verified against the surviving log window for c878541b runs 4-6), and message text verified byte-exact (1733/1733 chars). 
Only gaps: codex reasoning items (empty-content in this session) and one contextCompaction item are not projected at all.", + "stats": "4 sibling codex threads (not parent/child; 0 subagent rows), 1797 events total (1077/468/222/30), 11 runs (10 completed, 1 interrupted), 440 turn_items (117 assistant_message, 264 command_execution incl. 20 failed, 18 file_change, 8 dynamic_tool, 12 user_message, 11 checkpoint, 2 interrupt items), 129 messages sampled on c878541b (61 assistant/7 user = item counts). 0 lineage violations, 0 ordinal collisions, 0 native-ref dupes, 0 non-terminal items/runs/turns/nodes. Native log survives only for c878541b runs 4-6 (16:00-16:44Z window)." + }, + { + "sessionKey": "71e29ba5-claude-mega", + "summary": "Largest session in the store: the wakeup/blur dev thread, started on codex (runs 1-55), provider-switched to claudeAgent via a ready context handoff (runs 56-122), spawning 7 claude subagent child threads; still live during the audit (run 122 completed mid-audit at 17:13:36Z). In-run ingestion is very healthy: ordinals/positions are perfectly consistent across main + child threads with zero collisions or dangling lineage refs, projection order exactly matches native emission (run 121 verified item-by-item), no duplicates, no stuck items/turns/streaming messages, and all failed runs and user-interrupted subagents are surfaced with error/failed items. 
The two significant problems are both loss-of-data outside run windows: 76 completed claudeAgent tool calls plus assistant text from post-turn task_notification wakeup turns were never ingested (the known invisible-post-turn-turns bug this thread was used to develop the fix for \u2014 no provider_wakeup runs exist here), and a 30-minute codex-era ingestion gap (16:10-16:40 Jul 1) lost 119 command/fileChange items with only assistant text later backfilled at a single flattened timestamp.", + "stats": "Main thread: 14259 events (seq 13799-43607, still live during audit), 3390 projected turn_items (922 assistant_message, 1658 command_execution, 345 file_change, 144 dynamic_tool, 26 web_search, 7 subagent, 4 error, 169 user_message, 117 checkpoint, 1 handoff), 1091 messages, 122 runs = 122 provider_turns (117 completed / 4 failed / 1 cancelled, statuses exactly aligned). 7 subagent child threads (not 9): 79-260 events and 17-53 items each, 214 child items total. Native logs cover only 2026-07-01T16:00:59Z onward (~50k lines, 11 files; earlier rotations deleted). Claude era: 1130 native toolu ids vs 1045 projected (76 lost in wakeup gaps, 7 Agent-remapped, 2 in-flight). Codex era (in-log): 450 native call ids, 119 unprojected. 4 native error_during_execution, 2 orchestrator transport failures, 1 user cancel, 2 user-interrupted subagents." + } + ], + "confirmed": [ + { + "title": "Codex-native collab subagent (run 3) entirely missing from events and projections", + "category": "lost-data", + "severity": "high", + "confidence": "high", + "knownIssue": false, + "evidence": "User prompt run 3 was 'spawn a subagent'. 
Native log a5a643b2-...log shows: item/completed subAgentActivity call_9wPxyaS7ueh1MXl7XJt6zysq (kind 'started', agentThreadId 019f0c93-d260-7740-a477-0c0d19e8b299, agentPath /root/hello_agent) at 2026-06-28T04:54:02.530Z; collabAgentToolCall call_DO2fwaX8dQlHoNipTbnPWxIX (tool 'wait', inProgress 04:54:04.013Z -> completed 04:54:14.015Z); and on child native thread 019f0c93-d260: webSearch ws_08e14d73aa33777f016a40a8f75cd0819ba433374351c23d4f (query 'Codex OpenAI latest docs'), reasoning rs_08e14..., agentMessage msg_08e14d73aa33777f016a40a8fd816c819bbd856f295f6ce730 (text 'Hello.', completed 04:54:23, child turn 019f0c93-d421 completed 04:54:23.777Z -- AFTER parent run 3 finalized 04:54:15.874Z). SQL: 0 rows in orchestration_events for LIKE '%call_DO2fwaX8dQlHoNipTbnPWxIX%', '%call_9wPxyaS7ueh1MXl7XJt6zysq%', '%ws_08e14d73%', '%msg_08e14d73%', '%019f0c93-d260%'; orchestration_v2_projection_subagents has only the run-4 delegate-task row. Run 3 projected only user_message + 2 assistant messages + checkpoint; the parent assistant text ('Subagent spawned as /root/hello_agent. It hasn't returned within the short wait window yet.') is the only user-visible trace. 
Note: current CodexAdapterV2.ts (registerSubagentActivity/registerSubagentThreads, lines ~2815/2986; commits 3b864044c6 2026-06-28 'Adopt userdata-v2 and subagent activity mapping' and 57aa76ac14 2026-06-29) now maps these item types, so the session likely ran a pre-fix build -- but the historical data is unrecoverable and the child agent's post-run-completion output (arrived 8s after parent turn completed) remains a late-arrival hazard.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT COUNT(*) FROM orchestration_events WHERE payload_json LIKE '%call_DO2fwaX8dQlHoNipTbnPWxIX%' OR payload_json LIKE '%019f0c93-d260%'\" # returns 0; vs grep -c 'subAgentActivity\\|collabAgentToolCall' ~/.t3/userdata-v2/logs/provider/a5a643b2-6ca8-4250-9c54-ddefe7d55565.log", + "sessionKey": "a5a643b2-codex-heya", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Every evidence point reproduces: the native log contains subAgentActivity call_9wPxyaS7ueh1MXl7XJt6zysq, collabAgentToolCall call_DO2fwaX8dQlHoNipTbnPWxIX, and 29 child-thread (019f0c93-d260) lines including agentMessage msg_08e14d73 (\"Hello.\") at 04:54:23.755Z, while orchestration_events and all projections return 0 rows for every identifier; run 3 projected only user_message + 2 assistant messages + checkpoint, and the subagents table has only the run-4 delegate-task row. The child output also arrived 8s after run 3's projected completed_at (04:54:15.874Z), matching the late-arrival claim. So the data loss is real and unrecoverable. Severity adjusted to medium because the session predates fix commit 3b864044c6 (2026-06-28 17:58 PDT, ~20h after the session) and current CodexAdapterV2.ts (lines 2815/2986-2998) now maps both item types, so recurrence for these types is addressed; only the historical loss and an unverified late-arrival hazard remain." 
+ } + }, + { + "title": "Backend child process fatally crashed twice on @cursor/sdk, killing run 1 mid-turn with no user-visible error", + "category": "error-handling", + "severity": "high", + "confidence": "high", + "knownIssue": false, + "evidence": "server-child.log (~/.t3/userdata-v2/logs/server-child.log): stderr fatal-error dump referencing 'file:///Applications/T3 Code (Alpha).app/.../node_modules/@cursor/sdk/dist/esm/index.js:1' at 2026-06-29T21:48:20.166Z (0.33s after native run.started run-9d9ff525-e89c-41e3-90e6-1f7e841cc188 at 21:48:19.834Z) and again at 21:49:09.455Z (followed by 'Node.js v24.15.0' crash footer). Server restart at 21:48:21.35 logged 'V2 orchestration recovery completed { terminalizedRuns: 1, stoppedSessions: 2 }'. Reconcile events seq 6294-6299 (command:runtime-reconcile:startup...21:48:21.501Z) set run:thread:721fc23c...:ordinal:1, its attempt, and provider-turn run-9d9ff525 to 'cancelled'. Run 1 has no error/cancelled turn item \u2014 user asked 'Hello therw!' and got only the 'Workspace ready' item; the cancellation is silent (run status is the only signal). 
No assistant content was lost in the log sense (crash preceded any deltas), but the turn was killed by a server bug, not a user cancel, and is projected indistinguishably from a user cancellation.", + "repro": "grep -n 'cursor/sdk\\|terminalizedRuns' ~/.t3/userdata-v2/logs/server-child.log ; sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT event_type,occurred_at FROM orchestration_events WHERE stream_id='721fc23c-2cf3-42bf-9d84-edd94359dca9' AND event_id LIKE '%runtime-reconcile%'\"", + "sessionKey": "721fc23c-cursor", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Every cited artifact reproduces: the @cursor/sdk fatal stderr dump at 21:48:20.166Z (0.33s after native run.started run-9d9ff525), a second fatal crash at 21:49:09.455Z with Node.js footer and child exit code=1, the recovery log 'terminalizedRuns: 1, stoppedSessions: 2', reconcile events seq 6294-6299 setting run 1/attempt/node/provider-turn to 'cancelled', and no error or cancelled turn item on run 1 (only user_message + workspace item). ProviderRuntimeRecoveryService.ts intentionally marks interrupted runs 'cancelled' and never persists its 'server restarted' detail string user-visibly, so the crash-induced kill is projected identically to a user cancel with zero debuggable trace \u2014 a real gap, though the terminalization itself is designed behavior. Downgraded to medium because recovery worked cleanly (nothing stuck, no assistant content lost since the crash preceded any deltas), and the finding slightly overstates impact: run 2's failure WAS surfaced with a visible provider_error item; only run 1 died silently." 
+ } + }, + { + "title": "401 authentication failure recorded as fully successful run \u2014 is_error on SDK result ignored", + "category": "error-handling", + "severity": "high", + "confidence": "high", + "knownIssue": false, + "evidence": "Native log 2026-06-28T05:22:33.360Z: result payload has subtype:\"success\" but is_error:true, api_error_status:401, result:\"Failed to authenticate. API Error: 401 Invalid authentication credentials\", usage all zeros; preceded by two api_retry events (05:22:31.612Z, 05:22:32.312Z, error_status 401 authentication_failed). Projections: run run:...ordinal:1 status=completed (event seq 146, run.updated stream_version 27), run_attempt ...attempt:1 status=completed (seq 139), provider_turn ...attempt%3A1 status=completed (seq 138), assistant turn-item turn-item:provider:claudeAgent:native-item:03e7bb1c-2ee8-46a9-8bf2-f2b66c7501dc status=completed. No failure/error field anywhere in any payload_json; the only trace of the error is the synthetic assistant message text (which did survive verbatim, so it is user-visible as ordinary assistant text, not as an error state). Root cause: terminalStatusFromResult in /Users/julius/.t3/worktrees/codething-mvp/t3code-c1e5e1d1/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts:1652 returns \"completed\" whenever message.subtype === \"success\" and the failure branch at ~line 3254 only fires for non-success subtypes; is_error/api_error_status are never checked (only tool-result is_error at lines 1498-1500). 
An MCP delegate_task caller polling run status sees success for a task that never executed.", + "repro": "grep 'api_error_status' ~/.t3/userdata-v2/logs/provider/thread-delegated-task-command-3amcp-3a1156181e-e188-4fe4-8cc5-a294003ebfed-3adel.log ; sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT status FROM orchestration_v2_projection_runs WHERE thread_id LIKE '%1156181e%';\"", + "sessionKey": "1156181e-delegated-task", + "verdict": { + "isReal": true, + "adjustedSeverity": "high", + "reasoning": "Every evidence point reproduces: the native log at 2026-06-28T05:22:33.360Z shows a result with subtype \"success\" but is_error:true, api_error_status:401, zero usage, preceded by two api_retry 401 authentication_failed events (05:22:31.612Z, 05:22:32.312Z); projections show run (seq 146), run_attempt (seq 139), provider_turn (seq 138) and assistant turn-item 03e7bb1c-2ee8-46a9-8bf2-f2b66c7501dc all status=completed with no failure field in any payload_json. The code claim is accurate: terminalStatusFromResult (ClaudeAdapterV2.ts:1658) returns \"completed\" for any subtype===\"success\" and the failure branch at ~3254 only fires for non-success subtypes, while the SDK's SDKResultSuccess type explicitly exposes is_error and api_error_status (sdk.d.ts:2739-2740) \u2014 so this is an ignored error signal, not an SDK ambiguity or intentional design. Severity high stands because this thread is a delegated MCP task whose caller consumes run status programmatically; an auth failure that executed zero turns is reported as a fully successful run, with the only trace being ordinary assistant text." 
+ } + }, + { + "title": "Subagent live transcript and final output never projected to its child thread", + "category": "lost-data", + "severity": "high", + "confidence": "high", + "knownIssue": false, + "evidence": "Child thread 'thread:provider:grok:native-thread:019f1b5f-efdb-7da0-acd7-1a31b3fe00bc%3Atask%3Acall-cdbbacb3-0c58-4992-86ce-7b549ffb16ce-1' has only 8 events (sequences 14437-14451, last stream_version 7 at 2026-07-01T01:52:10.705Z) and 2 turn_items, both timestamped at spawn (01:52:10). Its assistant_message item 'turn-item:provider:grok:native-item:call-cdbbacb3-...-1%3Aresult' contains only the bootstrap text 'Subagent started in background... Use get_command_or_subagent_output ... to wait for results.' Yet the task actually ran 01:52:10Z-01:53:25Z with tool_calls=116 (per TaskOutput result in parent item turn-item:provider:grok:native-item:019f1b5f...%3Atool%3Acall-5ca8a4be-6fc8-4924-909c-b198de7dcbc8-2). Native log shows 811 decoded incoming ACP messages between 01:52:14 and 01:52:59 (gap between run 1 Interrupt at 01:52:13.249 and run 2 start at 01:52:59.9) with ZERO orchestration_events ingested for any session stream in that window \u2014 the subagent's streamed activity was dropped, likely because AcpAdapterV2's subagent context (context.subagents / subagentsBySessionId / pendingSubagentNotifications in /Users/julius/.t3/worktrees/codething-mvp/t3code-c1e5e1d1/apps/server/src/orchestration-v2/Adapters/AcpAdapterV2.ts ~line 559) is per-active-turn and turn 1 had already finalized. 
The 15,053-char final output is preserved un-truncated ONLY inside the parent's turn-2 dynamic_tool item; a user opening the child thread 'Explore codebase structure and architecture' sees none of the subagent's work.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT COUNT(*) FROM orchestration_events WHERE occurred_at>'2026-07-01T01:52:13.4' AND occurred_at<'2026-07-01T01:52:59' AND (stream_id LIKE '%5dcea72d%' OR stream_id LIKE 'thread:provider:grok:native-thread:019f1b5f%');\" -- returns 0, vs grep of the log showing 811 decoded incoming messages in the same window", + "sessionKey": "5dcea72d-grok-subagent", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "All evidence reproduces: child thread has only 8 spawn-time events (seq 14437-14451, all 01:52:10Z) and its result item contains only bootstrap text; 0 orchestration_events exist in the 01:52:13.4-01:52:59 gap despite continuous decoded incoming traffic in the log; run 1 prompt failed with errorTag Interrupt at 01:52:13.249Z and the subagent ran to 01:53:25Z (116 tool_calls per parent TaskOutput item). Code intent confirms this is a bug, not design: AcpAdapterV2's emitSubagentAssistant/projectSubagentNotification/pendingSubagentNotifications (lines 845-882, 1447-1461) are built to stream subagent text into the child thread, but the per-turn context's finalized guard plus run 2's empty subagentsBySessionId map dropped everything; the subagent projection row is also frozen at spawn (completed_at 01:52:10.702Z, result=bootstrap text). Downgraded to medium because the full 15,053-char final output is preserved un-truncated and user-visible in the parent's turn-2 dynamic_tool item, and even happy-path design only projects assistant text chunks (not tool calls) to the child thread \u2014 the loss is the child-thread transcript and an accurate subagent row, not the final answer." 
+ } + }, + { + "title": "10 post-turn wakeup turns entirely unpersisted (16 assistant messages + 84 tool calls lost)", + "category": "lost-data", + "severity": "high", + "confidence": "high", + "knownIssue": true, + "evidence": "Native log has 25 result messages but only 16 runs exist. 10 results have no corresponding run/message/item: 21:19:42 ('CodeRabbit's final pass came back clean'), 22:14:37, 22:19:27, 22:25:18, 22:32:25, 22:38:55 (review-fix 'Rounds 3-7' incl. real git pushes 'a3c87931a2', 'cb28a0842f', 'c435bec111', '57ca73e655', 'd851eb7b9c'), 23:12:04, 23:17:19, 23:32:05 ('Round 11 pushed c19ceca8a5'), 23:41:55 ('Converged. PR #3638 is fully green'). Diff of tool_use ids in log vs nativeItemRef in projections: 86 missing (84 after excluding 2 Agent ids that are represented as task-id subagent items) \u2014 all timestamped inside wakeup windows (21:19, 22:13-22:38, 22:59:24-32, 23:11:24-34, 23:16-23:23, 23:29-23:31, 23:41). 16 native assistant text messages missing (uuids incl. 6299b5cf-7941-4355-8063-eb9d60d7216f @21:19:41, d226f500-43a0-489e-a1bd-888747c56495 @22:38:54, 093e91b9-0d5b-4ad9-a7a9-2ef06c4c2021 @23:41:52). Zero orchestration_events on the stream between 22:10:30 and 22:48:00 except one provider-session.updated (stopped) at 22:41:50.039Z. No run/message has inputIntent provider_wakeup; all 16 user messages have creationSource=web. 
This is the documented 'invisible post-turn turns' issue diagnosed on this exact thread (fix implemented later on branch t3code/codex-turn-mapping); this session's data predates the fix.", + "repro": "SELECT COUNT(*) FROM orchestration_events WHERE stream_id='47763f5e-76c1-4d67-8037-442a280f1514' AND occurred_at>'2026-07-01T22:10:30' AND occurred_at<'2026-07-01T22:48:00'; -- =1 (session stop only), while the native log shows 5 full result turns in that window: grep '\"type\":\"result\"' ~/.t3/userdata-v2/logs/provider/47763f5e-76c1-4d67-8037-442a280f1514.log.1 | grep 'T22:[123]'", + "sessionKey": "47763f5e-claude-fable", + "verdict": { + "isReal": true, + "adjustedSeverity": "high", + "reasoning": "Every evidence point reproduces: 25 decoded result messages in the native log vs 16 projected runs, with the exact 10 cited orphan results unmatched by any run completed_at; the repro SQL returns 1 (only a provider-session stop at 22:41:50.039Z) for a window where the log shows 5 full turns; 86 top-level tool_use ids (371 in log vs 285 projected across all threads) are missing, all timestamped in the orphan-turn windows; cited assistant texts/uuids exist in the log but in zero projection rows; and no run/message carries provider_wakeup intent. This is real, substantial user-facing data loss (whole review-fix rounds with real git pushes invisible), matching the documented pre-fix \"invisible post-turn turns\" issue on this exact thread \u2014 known issue, but the lost session data is not backfilled by the fix, so high severity stands." 
+ } + }, + { + "title": "Invisible post-turn wakeup turns: 42 native tool calls + final assistant messages never ingested or projected", + "category": "lost-data", + "severity": "high", + "confidence": "high", + "knownIssue": true, + "evidence": "Three windows where the claudeAgent SDK kept working after the orchestrator closed the run: 23:31:27-23:36:48 (16 calls: Bash/Read/Edit/TaskCreate/TaskUpdate, after run 5 completed 23:30:07), 00:16:13-00:17:13 (9 calls, after run 6 completed 00:16:09), 00:33:32-00:37:24 (16 calls incl. mcp__t3-code__preview_* tools, after run 7 completed 00:33:22). Native results confirm full turns: [2026-07-02T00:17:51.602Z] subtype success num_turns:10 and [00:37:39.534Z] num_turns:17. Zero rows in orchestration_events for e.g. toolu_012dzkuS4eoKeRiAaQ3icFpD (Bash 00:17:05.797Z); zero main-stream events between 00:16:10 and 00:21:50. The wakeup turn's final assistant text 'Done \u2014 comment posted: https://github.com/pingdotgg/t3code/pull/2829#issuecomment-4861082710' (log.2, 00:17:4x) has 0 events and 0 turn_items \u2014 the agent posted a GitHub comment with no user-visible record. This is the diagnosed 'invisible post-turn turns' issue; the fix (turn.wakeup / ProviderWakeupService / attach_wakeup) is on this branch but this session ran on the pre-fix build (this thread is itself the diagnosis session, title 'check in on this thread: 47763f5e...').", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT count(*) FROM orchestration_events WHERE payload_json LIKE '%toolu_012dzkuS4eoKeRiAaQ3icFpD%'\" -- returns 0; grep toolu_012dzkuS4eoKeRiAaQ3icFpD ~/.t3/userdata-v2/logs/provider/ea84f015-*.log.2 shows the Bash tool_use", + "sessionKey": "ea84f015-cursor-handoff", + "verdict": { + "isReal": true, + "adjustedSeverity": "high", + "reasoning": "Re-ran the full evidence trail: run 5/6/7 completion timestamps match, all 9 window-2 tool ids (e.g. 
toolu_012dzkuS4eoKeRiAaQ3icFpD) and 33 window-1/3 ids have zero rows in orchestration_events, zero main-stream events exist in all three gap windows, and the final assistant text with the posted GitHub comment URL (issuecomment-4861082710) has 0 events and 0 turn_items despite 4 native log occurrences with a success result (num_turns:10 at 00:17:51.602Z). Minor correction: 5 ids the finding attributed to window 1 belong to run 6 and were ingested, but the total lost count is still exactly 42. The session ran on the pre-fix build (the fix lives uncommitted on this branch), so whole turns including a real-world side effect were permanently lost with no user-visible record \u2014 real lost-data, correctly flagged as a known issue, severity stays high." + } + }, + { + "title": "Cursor turn failure projected with only generic 'Provider turn failed.' \u2014 real error detail (auth error) never captured", + "category": "error-handling", + "severity": "high", + "confidence": "high", + "knownIssue": false, + "evidence": "Run 2 (native run-f4d64e64-0af6-42bc-8813-5b6ce53a328d) failed: native log line [2026-07-02T02:35:28.424Z] run.completed result {status:\"error\", requestId:\"beca30c7-7e90-481b-abf7-fdab84b3c48d\", durationMs:440732} \u2014 no error field in the decoded payload, and zero interaction.updates for the entire 440s turn (only run.started at 02:28:07.973Z then silence). Projected error item turn-item:provider:cursor:native-item:terminal-failure%3Aprovider-turn%3A...run-f4d64e64... (event seq 38493, occurred_at 2026-07-02T02:35:28.524Z) carries failure {class:\"provider_error\", message:\"Provider turn failed.\", code:null, retryable:null} \u2014 requestId and durationMs from the native payload are dropped too. Cause: CursorAdapterV2.ts:2165-2168 maps `cause: (result as { readonly error?: unknown }).error` which is undefined on the SDK RunResult, falling back to DEFAULT_PROVIDER_FAILURE_MESSAGE. 
The session's own in-turn diagnosis (final assistant item ...assistant%3Arun-1cbebefe...%3A5) confirmed the real cause lives only in the Cursor SDK store runs.error_code (\"Authentication error If you are logged in, try logging out and back in.\") which T3 never reads. The failure IS surfaced user-visibly (error item status failed, run/attempt/provider-turn all 'failed'), but with zero debuggable detail \u2014 this exact gap forced the user to spawn this debugging thread, and then recurred on the thread itself (run ordinal 2).", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT json_extract(payload_json,'$.failure') FROM orchestration_v2_projection_turn_items WHERE thread_id='c9e72a05-4c87-4dd5-b1b6-83834cc73afe' AND type='error';\" ; grep 'run.completed' ~/.t3/userdata-v2/logs/provider/c9e72a05-4c87-4dd5-b1b6-83834cc73afe.log | tail -1", + "sessionKey": "c9e72a05-cursor-failing-turns", + "verdict": { + "isReal": true, + "adjustedSeverity": "high", + "reasoning": "Fully reproduced: the projected error item for run-f4d64e64 carries only {\"message\":\"Provider turn failed.\",\"code\":null}, while the native log's run.completed (02:35:28.424Z, status \"error\", requestId beca30c7, durationMs 440732) confirms no detail was captured and requestId/durationMs were dropped. The cited code (CursorAdapterV2.ts:2165-2168) reads (result as {error?: unknown}).error, but @cursor/sdk@1.0.22's RunResult type has no error field whatsoever \u2014 the fallback is guaranteed dead-code behavior for EVERY failed Cursor turn, and the SDK's public store API does expose RunRecord.errorCode (\"Authentication error...\") which T3 never reads. The failure is surfaced user-visibly with correct terminal states (so nothing is swallowed), but the projection retains zero debuggable detail for a systematic, provider-wide failure path, which matches the audit's explicit error-handling criterion; severity high stands." 
+ } + }, + { + "title": "Shared codex session native logs written to opener thread's file, not the active thread's \u2014 no log file ever created for these threads", + "category": "other", + "severity": "high", + "confidence": "high", + "knownIssue": false, + "evidence": "All four threads bind provider_session_id 'provider-session:provider-instance:codex:shared'. Their codex app-server protocol traffic appears in 71e29ba5-654b-4f67-b363-ad400e5bd016.log.6/.9/.10 (e.g. 019f1b62-f532 = c878541b's nativeThreadRef: 4768 hits in .log.10, 2828 in .log.9), interleaved with that thread's own claudeAgent and codex (019f1b50-4248) traffic. Cause: in /Users/julius/.t3/worktrees/codething-mvp/t3code-c1e5e1d1/apps/server/src/orchestration-v2/Adapters/CodexAdapterV2.ts, codexAppServerClientFactoryFromSettingsLayer.open builds makeCodexAppServerProtocolLogger({ nativeEventLogger, threadId: input.threadId, ... }) once per app-server process; EventNdjsonLogger.write routes to `${threadSegment}.log` from that frozen threadId, so every thread multiplexed onto the shared session logs under the opener's file. No file named c878541b-*/de5f191a-*/68f7595b-*/af66fc2c-* was ever created.", + "repro": "grep -l '019f1b62-f532' ~/.t3/userdata-v2/logs/provider/*.log* # hits only 71e29ba5-654b-...log.{6,9,10}, never a file named after the audited threads", + "sessionKey": "codex-no-native-logs", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Fully reproduced: all four threads bind provider-session:provider-instance:codex:shared; c878541b's nativeThreadRef 019f1b62-f532 appears exactly 4768x in 71e29ba5.log.10 and 2828x in .log.9 (interleaved with 71e29ba5's own codex ref 019f1b50-4248), and no log file named after any of the four threads exists. 
Code confirms the cause: CodexAdapterV2.ts line 1136 freezes input.threadId into makeCodexAppServerProtocolLogger at app-server open, so every multiplexed thread's protocol traffic routes to the opener's `${threadSegment}.log` \u2014 defeating EventNdjsonLogger's per-write threadId routing, so this is a misrouting bug, not intent. Downgraded to medium because ingestion/projections for these threads are intact (observability-only defect), though rotation under the busy opener file has permanently deleted 3 of 4 threads' protocol history." + } + }, + { + "title": "Invisible post-turn wakeup turns (claudeAgent): 76 completed tool calls + assistant text never ingested or projected", + "category": "lost-data", + "severity": "high", + "confidence": "high", + "knownIssue": true, + "evidence": "1130 unique toolu_ ids in decoded native logs vs 1045 projected nativeItemRef ids. Of the 85 unmatched, 7 are Agent spawns (projected under task::subagent ids, fine) and 2 are stream-only in-flight blocks; the remaining 76 (60 Bash, 7 Edit, 5 Read, 4 ScheduleWakeup) ALL fall in gaps between run windows \u2014 0 fall inside any run. Example: run 56 completed 2026-07-01T22:18:57.958Z (result subtype=success 22:18:57.333Z); at 22:20:52.374Z 'system task_notification bk7bmoyti' re-woke the session, assistant emitted text 'The build failed \u2014 let me see the actual error.' (22:20:58.275Z) then Bash toolu_011YH7oQm6yGVjWTfxwQ1QsH (22:20:59, tool_result 22:21:00) etc. \u2014 none exist in orchestration_events or turn_items (LIKE search for that text returns 0). 
This thread has ZERO wakeup runs/messages (0 messages with inputIntent provider_wakeup / creationSource other than agent|provider,user|web), while 367 provider_wakeup events exist elsewhere in the DB (first at 2026-07-01T22:47:41) \u2014 this is the thread on which the fix was being developed (memory: diagnosed 2026-07-01 on threads 47763f5e & 71e29ba5).", + "repro": "comm -23 /tmp/native_toolu.txt /tmp/proj_toolu.txt (native ids from grep -h '\"stage\":\"decoded\"' ~/.t3/userdata-v2/logs/provider/71e29ba5-654b-4f67-b363-ad400e5bd016.log* | grep -o 'toolu_[A-Za-z0-9]*' vs SELECT json_extract(payload_json,'$.nativeItemRef.nativeId') FROM orchestration_v2_projection_turn_items WHERE thread_id LIKE '%71e29ba5%')", + "sessionKey": "71e29ba5-claude-mega", + "verdict": { + "isReal": true, + "adjustedSeverity": "high", + "reasoning": "Re-ran the full evidence trail: 85 genuine claudeAgent tool_use ids in decoded native logs have no projection; after excluding 7 Agent spawns (verified projected as task: items) and 2 stream-only in-flight blocks, exactly 76 completed tool calls (60 Bash, 7 Edit, 5 Read, 4 ScheduleWakeup) all fall in gaps between run windows (0 inside any of the 127 runs). The cited example reproduces: assistant text 'The build failed...' at 2026-07-01T22:20:58.275Z between run 56 (ended 22:18:57.958Z) and run 57 (started 22:22:38.734Z), with 0 matching rows in orchestration_events; the thread has 0 provider_wakeup events/messages while 367 exist elsewhere. This is the documented known issue the branch fixes, but for this session the provider-emitted tool calls, results, and assistant text are unrecoverably absent from ingestion and projections, with losses continuing into 2026-07-02 \u2014 real high-severity lost data, correctly flagged knownIssue=true." 
+ } + }, + { + "title": "Provider failures persist only generic error strings; underlying cause is unrecoverable from the DB", + "category": "error-handling", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "4 claudeAgent error items say only 'Claude Agent SDK query failed.' (class transport_error, code null) \u2014 e.g. run:thread:47763f5e-76c1-4d67-8037-442a280f1514:ordinal:10 failed 2026-07-01T22:48:38 with zero native-log traffic in that minute, so the real exception exists nowhere. Root cause: ClaudeAgentSdkQueryRunnerError.message getter in apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts:280 hardcodes the string and discards `cause`. Similarly 4 cursor items say 'Provider turn failed.' (code null) while the native run.completed carried status:'error' plus requestId d1cd7f44-91fb-45ce-bdd3-86e9b5cc8e3e and durationMs 2434335 that were dropped. Counter-example showing it can be done right: codex errors keep full detail ('stream disconnected before completion: error sending request for url (https://chatgpt.com/backend-api/codex/responses)' and the full 400 invalid_request_error body for gpt-5.3-codex).", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT thread_id, json_extract(payload_json,'$.failure.message') FROM orchestration_v2_projection_turn_items WHERE type='error';\"", + "sessionKey": "GLOBAL", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Reproduced the SQL: all 4 claudeAgent error items persist only 'Claude Agent SDK query failed.' (code/retryable null) and no richer detail exists in any orchestration_events row for those failures; the root cause is confirmed at ClaudeAdapterV2.ts:279-281 where the message getter hardcodes the string while the Schema.Defect cause field is discarded by makeProviderFailure (real exception goes only to Effect.logWarning). 
This is not intentional: ProviderFailure.ts has redaction/4096-char bounding built to persist real error text, and codex failures do persist full detail. Minor correction: the '47763f5e had zero native-log traffic at 22:48' claim is false (8 lines exist in .log.1, outgoing query.open/prompt.offer at 22:48:38.086), and the cursor generic message is partly upstream (native run.completed carried no error message, only requestId d1cd7f44/durationMs which were dropped) \u2014 but the core finding stands, so severity stays medium." + } + }, + { + "title": "Delegated Claude task failed with 401 auth error but all statuses say 'completed' (is_error ignored on success-subtype result)", + "category": "error-handling", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Child native log thread-delegated-task-command-3amcp-3a1156181e-...log (2026-06-28T05:22:33Z) shows SDK result {\"type\":\"result\",\"subtype\":\"success\",\"is_error\":true,\"result\":\"Failed to authenticate. API Error: 401 Invalid authentication credentials\"}. Projections: child run run:thread:thread%3Adelegated-task%3Acommand%253Amcp%253A1156181e-...:ordinal:1 status=completed; provider_turn status=completed; subagent row node:delegated-task:command%3Amcp%3A1156181e-...%3Ahello-opus-48 status='completed' with result='Failed to authenticate...'; delegate_task tool item call_qV0wnlB5N6Osa6CPxTGN1Ajh output.status='completed'. Root cause in current code: ClaudeAdapterV2.ts terminalStatusFromResult (~line 1658) returns 'completed' whenever message.subtype==='success' without checking is_error, and the result handler (~line 3254) only attaches a ProviderFailure when subtype!=='success'. 
Mitigating: the error text IS preserved and user-visible (child assistant item 03e7bb1c-2ee8-46a9-8bf2-f2b66c7501dc text = the 401 message; parent's final assistant message msg_...9bb460 explains the failure), so nothing was swallowed -- but status semantics are wrong for anything filtering on failed tasks.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT status, json_extract(payload_json,'$.result') FROM orchestration_v2_projection_subagents WHERE subagent_id LIKE '%hello-opus-48%'\" # completed | Failed to authenticate. API Error: 401 ...", + "sessionKey": "a5a643b2-codex-heya", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Re-ran every evidence step: the child native log shows SDK result subtype=\"success\" with is_error=true and api_error_status=401 (zero tokens, \"Failed to authenticate. API Error: 401...\"), yet the child run, provider_turn, subagent row, and the parent's delegate_task item (call_qV0wnlB5N6Osa6CPxTGN1Ajh, output.status=\"completed\") all report completed. Source confirms the cause: ClaudeAdapterV2.ts terminalStatusFromResult (line 1658) returns \"completed\" on subtype===\"success\" without checking is_error, and the result handler (lines 3250-3263) only attaches a ProviderFailure when subtype!==\"success\" \u2014 is_error is checked only for tool results, so this is an oversight, not intent. Medium stands: the error text is preserved and user-visible, but status semantics are wrong end-to-end for anything filtering on failed delegated tasks." 
+ } + }, + { + "title": "Streaming deltas persisted as full-row event pairs cause ~2800x event amplification on child result items", + "category": "other", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Child thread ...composer_call_kFjs8 accumulated 2711 events (1352 turn-item.updated + 1352 message.updated + overhead) to stream a single 6274-char result; the 3 children total 5702 turn-item/message events for ~12.6KB of final text. This one 2-minute session wrote 6017 rows to orchestration_events (43k total table), i.e. each streaming delta is persisted as a full-row upsert event for both the turn item and its message. Not a correctness bug (final projections are correct) but a storage/replay-cost anomaly.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT event_type, COUNT(*) FROM orchestration_events WHERE stream_id LIKE 'thread:provider:grok:native-thread:019f1558%' GROUP BY 1;\"", + "sessionKey": "20296b49-grok-tasks", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Evidence reproduces exactly: 1352 turn-item.updated + 1352 message.updated events on the single kFjs8 result item with monotonically growing text (4 -> 6274 chars, sequences 2160-4862), and the session totals exactly 6017 event rows. Code confirms the mechanism: AcpAdapterV2.emitSubagentAssistant emits a full-row message+turn-item event pair per agent_message_chunk and ProviderEventIngestor persists each pair durably with no coalescing, throttling, or event-log pruning (ProjectionMaintenance only rebuilds projections by replaying the full log). Quantified: 18.0 MB of payload_json for ~12.6 KB of final subagent text (~1400x, quadratic in output length), 15.5% of the entire event table from one 2-minute session. Final projections are correct so severity stays medium, but the unbounded quadratic storage/replay cost is a real, unmitigated pipeline anomaly, not intended behavior or a misreading." 
+ } + }, + { + "title": "Run 2 failure projected with generic wrapper message; actual cause dropped from projection and depth-elided in server log", + "category": "error-handling", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Error turn item turn-item:provider:cursor:native-item:terminal-failure%3Aprovider-turn%3A...ordinal%25253A2... (ordinal 2000002, status failed, 21:48:40.703Z) persists failure = {class:'provider_error', message:'Failed to start run run:thread:721fc23c-2cf3-42bf-9d84-edd94359dca9:ordinal:2 on cursor provider thread provider-thread:provider:cursor:native-thread:pending%3A...', code:null, retryable:null}. That string is ProviderAdapterTurnStartError's message getter (apps/server/src/orchestration-v2/ProviderAdapter.ts:291); RunExecutionService.ts:830 builds the failure via makeProviderFailure({cause: Cause.squash(cause)}) so the tagged error's wrapped `cause` (the real agent.send rejection from CursorAgentSdk runnerError 'run.start') never reaches the projection. The only other record, server-child.log line 89 at 21:48:40.703Z, printed the cause as \"{ _id: 'Cause', failures: [ [Object] ] }\" \u2014 depth-elided. The underlying reason run 2 failed 4ms after run.start is now unrecoverable. 
Positive: the failure WAS surfaced user-visibly as an error item and run/attempt/node all show 'failed'.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT payload_json FROM orchestration_v2_projection_turn_items WHERE thread_id='721fc23c-2cf3-42bf-9d84-edd94359dca9' AND type='error'\" ; grep -n '721fc23c' ~/.t3/userdata-v2/logs/server-child.log", + "sessionKey": "721fc23c-cursor", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "All cited evidence reproduces: the projected error item and its source event (orchestration_events sequence 6314) both persist only ProviderAdapterTurnStartError's generic message getter (ProviderAdapter.ts:280-293); RunExecutionService.ts:~831 feeds Cause.squash(cause) into makeProviderFailure, which (ProviderFailure.ts:90-113) reads only .message/.code and never unwraps the nested cause defect that CursorAdapterV2.ts:2203 attaches. server-child.log line 89 depth-elides the cause as \"[ [Object] ]\" and the native provider log ends at the outgoing run.start (21:48:40.699Z) with no error event, so the real reason run 2 failed is unrecoverable from any persisted artifact. The failure was surfaced user-visibly with correct terminal statuses, so medium (not high) is the honest severity \u2014 this is a debuggability gap, not silent loss." + } + }, + { + "title": "Native provider log never records failure frames \u2014 cursor SDK send errors and process death are invisible in the ground-truth log", + "category": "other", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Native log 721fc23c-2cf3-42bf-9d84-edd94359dca9.log ends at [2026-06-29T21:48:40.699Z] with outgoing run.start 'what did you just say' and no incoming frame, though the turn failed at 21:48:40.703Z. 
In Adapters/CursorAgentSdk.ts send(), the log writes cover outgoing run.start and incoming run.started/interaction.update/run.completed only; when agent.send rejects, Effect.tryPromise raises runnerError(cause,'run.start') with no protocol log write. Similarly run 1 shows run.started at 21:48:19.834Z and then nothing \u2014 no error, no lifecycle/exit record \u2014 because the process crash killed the logger too. Result: the 'ground truth' log cannot explain either failed turn, which is exactly why this session looked suspicious (7 lines vs 47 events).", + "repro": "cat ~/.t3/userdata-v2/logs/provider/721fc23c-2cf3-42bf-9d84-edd94359dca9.log | tail -2 ; grep -n 'runnerError(cause, \"run.start\")' apps/server/src/orchestration-v2/Adapters/CursorAgentSdk.ts", + "sessionKey": "721fc23c-cursor", + "verdict": { + "isReal": true, + "adjustedSeverity": "low", + "reasoning": "Evidence reproduces fully: the native log ends at 21:48:40.699Z with the outgoing run.start and no failure frame while events 6313-6315 record the run failing at 21:48:40.703Z, and CursorAgentSdk.ts's error paths (runnerError catches at lines 408/440; run.completed logged via success-only Effect.tap) confirm no error-frame logging exists \u2014 indeed no adapter has an error/lifecycle log kind at all. However, the pipeline did not swallow anything: the failure was ingested and user-surfaced as turn-item 6314 (type error, status failed) with correct terminal statuses, and the raw cause is logged to the server application log (CursorAdapterV2.ts:2188 logWarning with cause), so this is an observability gap in the debug/protocol log (compounded by the generic projected message with code:null) rather than a data-loss or projection bug. Run 1's \"process crash\" explanation is unproven (DB shows cancelled, lastError null), so I downgrade to low: real, design-wide log-scope limitation worth fixing, but errors remain debuggable and user-visible through other channels." 
+ } + }, + { + "title": "Failed tool item drops the provider error message (file_search projection has no error/output field)", + "category": "error-handling", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Child thread ses_0ea978228: turn-item:provider:opencode:native-item:prt_f15692f30001qL66As1xsUCQXc (ordinal 87) went running->failed at 2026-06-29T22:04:02.915Z (events seq 8284 -> 8286 on stream thread:provider:opencode:native-thread:ses_0ea978228ffeVM6qpFUgGn62Vl). Final payload contains only {status:'failed', type:'file_search', pattern:'/Users/julius/Development/Work/codething-mvp/CLAUDE.md'} \u2014 no error text at all. Root cause in OpenCodeAdapterV2.ts: toolOutput() (line 659) returns part.state.error for status 'error', but the file_search branch (~line 1376) only maps 'pattern'; output/error is only attached for dynamic_tool. The contract (packages/contracts/src/orchestrationV2.ts line 841) also has no output/error field on file_search items, so the failure reason is unrecoverable from projections. Failure IS user-visible via status=failed, but not debuggable.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT payload_json FROM orchestration_v2_projection_turn_items WHERE turn_item_id='turn-item:provider:opencode:native-item:prt_f15692f30001qL66As1xsUCQXc';\"", + "sessionKey": "3029dc85-opencode", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Reproduced the exact projection payload (failed file_search item prt_f15692f30001qL66As1xsUCQXc, ordinal 87, no error text) and the running->failed transition at events seq 8284->8286. Source confirms the mechanism: OpenCodeAdapterV2.ts toolOutput() (line 659) extracts part.state.error, but the file_search branch (lines 1376-1383) only maps pattern \u2014 output/error is attached only for command_execution/dynamic_tool \u2014 and the contract's file_search schema (orchestrationV2.ts line 841) has no field to hold it. 
The native log is additionally redacted (protocol payloads logged as fieldCount summaries only), so the error message is unrecoverable anywhere; failure is visible via status=failed but not debuggable, so medium stands." + } + }, + { + "title": "Claude assistant text segments merged without separator into a single end-of-turn item, losing interleaving", + "category": "lost-data", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Thread :2 native log has 5 assistant text blocks with distinct SDK uuids (341420a1 len 95 @22:09:54, 78f308a1 len 37 @22:09:58, 26b0aa8d len 90 @22:10:11, 31c4bfa0 len 73 @22:11:40, bfa7a134 len 10347 @22:13:07), interleaved with 19 tool calls. Projection has ONE assistant_message item turn-item:provider:claudeAgent:native-item:bfa7a134-04d5-4434-bb14-66927497fb91, ordinal 1000021 (after all tools), text length 10642 = exact sum, joined with no whitespace: '...read all the files thoroughly.I'll read all files in the directory.Let me gather context...'. The 4 intermediate uuids have ZERO orchestration_events (only seq 11544-11546 reference bfa7a134, all at 22:13:07). 
Root cause in code that ran (git fc23be8184 ClaudeAdapterV2.ts:2815): `context.assistant.text += assistantText.text` with single emission at result time; current worktree code (emitAssistantTextArtifacts per message.uuid) already emits per-segment items, so this appears fixed for future runs.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT substr(json_extract(payload_json,'$.text'),80,150) FROM orchestration_v2_projection_turn_items WHERE thread_id LIKE '%126456a2%:2' AND type='assistant_message';\"", + "sessionKey": "6d618dc4-mcp-group", + "verdict": { + "isReal": true, + "adjustedSeverity": "low", + "reasoning": "Reproduced every claim: native log has 5 assistant text blocks (lens 95/37/90/73/10347 at 22:09:54\u201322:13:07) but the projection has one assistant_message item (native ref bfa7a134, ordinal 1000021 after all 19 tools) of length 10642 = exact sum, joined with no separator (\"thoroughly.I'll read all files in the directory.Let me gather context\"); the 4 intermediate uuids have zero orchestration_events while bfa7a134 has seq 11544\u201311547. Historical code (fc23be8184 ClaudeAdapterV2.ts:2815 `context.assistant.text +=`) caused it; current worktree emitAssistantTextArtifacts emits per-nativeItemId, so it is fixed going forward. Real for this session's data, but downgraded to low because no text content was lost (only interleaving/segmentation) and the fix already landed." + } + }, + { + "title": "Subagent projection marked completed at spawn with placeholder result (lifecycle wrong by ~75s)", + "category": "other", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "orchestration_v2_projection_subagents row 'node:provider:grok:native-item:call-cdbbacb3-0c58-4992-86ce-7b549ffb16ce-1' has status=completed, started_at=2026-07-01T01:52:10.677Z, completed_at=2026-07-01T01:52:10.702Z and result='Subagent started in background...\\nUse get_command_or_subagent_output...' 
\u2014 but the task's real lifetime was 01:52:10Z to 01:53:25Z (duration_secs 74.874 per the TaskOutput payload). The parent 'subagent' turn_item (ordinal 1000007) mirrors the same premature completedAt and placeholder result. Cause visible in AcpAdapterV2.emitSubagent (~lines 1047-1060): when the spawn tool returns 'started in background', taskStatus is non-running and update.result (the bootstrap text) is adopted as assistantText/result; nothing later updates it when the background task actually finishes.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT status, completed_at, substr(json_extract(payload_json,'$.result'),1,60) FROM orchestration_v2_projection_subagents WHERE thread_id='5dcea72d-15e1-4ded-922b-0b00c587de6c';\"", + "sessionKey": "5dcea72d-grok-subagent", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Reproduced end-to-end: projection row 'node:provider:grok:native-item:call-cdbbacb3-...-1' has status=completed, completed_at=2026-07-01T01:52:10.702Z with result='Subagent started in background...' while the TaskOutput payload (event seq 14622, 01:53:28.477Z, item call-5ca8a4be-...) shows the task actually ran 01:52:10Z-01:53:25Z (duration_secs 74.874); only 3 subagent.updated events exist, all at spawn time, so nothing ever corrects it. Code confirms the cause: extractXAiAcpSubagentUpdate (XAiAcpExtension.ts:86-91) maps the spawn tool's completed status straight to subagent status with no background-spawn detection, and AcpAdapterV2.ts:1047-1053 adopts the bootstrap text as assistantText/result; the later TaskOutput tool call is projected as an unrelated dynamic_tool item and never routed back to the subagent, and the child thread's only assistant message is also the placeholder. 
Severity stays medium: the real 15.5KB output survives user-visibly in the separate dynamic_tool item, but the subagent card's lifecycle, result, and child-thread transcript are wrong, and this Grok/ACP path is not covered by the claudeAgent backgrounded-task known issue." + } + }, + { + "title": "Run 10 terminal failure persisted with only a generic error message", + "category": "error-handling", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Run run:thread:7f1dfff1-...:ordinal:10 failed at 2026-07-01T20:47:58.607Z. The projected error item (turn-item:provider:claudeAgent:native-item:terminal-failure%3A...ordinal%25253A10..., event seq 25810) has failure {class:\"transport_error\", message:\"Claude Agent SDK query failed.\", code:null, retryable:null}. run-attempt payload (seq 25811) carries no error detail either. Native log ends at 20:47:58.429Z with outgoing query.open + prompt.offer (user text \".\") and zero incoming SDK messages, so the DB retains no clue what actually failed. Root cause in code: ClaudeAdapterV2.ts:279-280 \u2014 ClaudeAgentSdkQueryRunnerError.message hardcodes \"Claude Agent SDK query failed.\" and the underlying `cause` defect is never propagated into the projected failure. 
The failure IS user-visible (error item + run failed), but is undebuggable after the fact.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT json_extract(payload_json,'$.failure') FROM orchestration_v2_projection_turn_items WHERE thread_id='7f1dfff1-4cbb-450b-8514-764d11c903e1' AND type='error';\"", + "sessionKey": "7f1dfff1-claude-scheduled-tasks", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Re-ran the repro: the error turn-item (event seq 25810) and run-attempt (seq 25811) for run ordinal 10 persist only {class:\"transport_error\", message:\"Claude Agent SDK query failed.\", code:null}, and the native log truly ends at 20:47:58.429Z with outgoing query.open/prompt.offer and no incoming SDK data, so no error detail exists anywhere in DB or provider log. Code confirms the mechanism: ClaudeAgentSdkQueryRunnerError.message is hardcoded at ClaudeAdapterV2.ts:279-280 and makeProviderFailure (ProviderFailure.ts:97-100) reads exactly that getter, dropping the Schema.Defect cause; the real cause only reaches an ephemeral Effect.logWarning. Other adapters (e.g. OpenCodeAdapterV2.ts:2202) propagate real detail, so this is an unintended gap, not design \u2014 finding stands at medium since the failure is user-visible but undebuggable after the fact." + } + }, + { + "title": "401 authentication error result projected as successful run + plain assistant message", + "category": "error-handling", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Native result at 2026-07-01T20:56:54.878Z has is_error=true with result 'Failed to authenticate. API Error: 401 Invalid authentication credentials'. Run run:thread:47763f5e-...:ordinal:1 is projected status='completed' (completed_at 20:56:54.916Z) and its provider_turn completed. 
The error text is user-visible only as a normal assistant_message (turn-item:provider:claudeAgent:native-item:94ed662f-814a-43d7-aad3-92e479b9dfe8, status=completed, type=assistant_message) \u2014 no error item, no failed status, so the run reads as a success.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT status FROM orchestration_v2_projection_runs WHERE run_id='run:thread:47763f5e-76c1-4d67-8037-442a280f1514:ordinal:1';\" vs grep -m1 '\"type\":\"result\"' ~/.t3/userdata-v2/logs/provider/47763f5e-76c1-4d67-8037-442a280f1514.log.1 (is_error:true)", + "sessionKey": "47763f5e-claude-fable", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Reproduced fully: native result at 20:56:54.878Z has subtype \"success\" with is_error:true and api_error_status:401, yet run ordinal:1 and its provider_turn are projected status=completed, and the error text survives only as a plain completed assistant_message item (94ed662f, itself the SDK's synthetic \"\"-model assistant message). Code confirms this is a gap, not intent: terminalStatusFromResult in ClaudeAdapterV2.ts only inspects message.subtype and the failure payload is attached only for non-\"success\" subtypes; is_error/api_error_status on result messages are never checked. The error text is user-visible via the SDK's synthetic message, but the run's structured status misreports failure as success and the 401 detail is not persisted, so medium severity stands." + } + }, + { + "title": "Run 10 failure persisted with generic message only \u2014 no root cause anywhere", + "category": "error-handling", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Run ordinal:10 failed in 0.5s (requested 22:48:37.674Z, completed 22:48:38.293Z). The only error record is the error turn-item terminal-failure...ordinal%25253A10 with failure {class:'transport_error', message:'Claude Agent SDK query failed.', code:null, retryable:null}. 
The native log contains no error detail either \u2014 line 6324 (.log.1) shows the outgoing query.open at 22:48:38.086Z using create-style param sessionId:'b896a082-a480-416f-8ffd-c42845e71993' (after the session was closed at 22:41:50), then nothing incoming; the retry run 11 at 22:48:55.300Z used resume:'b896a082...' and succeeded. The actual SDK failure reason was never captured, and the failed open plausibly stems from reopening with sessionId instead of resume. Related: the query.close at 22:41:50 came ~31.8 min after run 9 completed (22:10:01) despite native wakeup activity through 22:38:55 \u2014 consistent with the known idle-session-reaper-vs-wakeup issue.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT json_extract(payload_json,'$.failure') FROM orchestration_v2_projection_turn_items WHERE thread_id='47763f5e-76c1-4d67-8037-442a280f1514' AND type='error';\" and sed -n '6323,6327p' ~/.t3/userdata-v2/logs/provider/47763f5e-76c1-4d67-8037-442a280f1514.log.1", + "sessionKey": "47763f5e-claude-fable", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Fully reproduced: run ordinal:10 failed in ~0.6s and the only persisted failure anywhere (turn-item event seq 29464) is the generic {class:transport_error, message:'Claude Agent SDK query failed.'}; the native log (.log.1 lines 6324-6325) shows query.open at 22:48:38.086Z with create-style sessionId after the 22:41:50 query.close, no incoming lines, then a successful resume-based retry at 22:48:55. The genericness is by construction: ClaudeAgentSdkQueryRunnerError (ClaudeAdapterV2.ts:279-281) hardcodes its .message getter, defeating makeProviderFailure's cause.message extraction (ProviderFailure.ts), whose redaction/4096-char bounding shows real messages were intended to flow through. 
Even the fallback Effect.logWarning (server-child.log:457) collapsed the cause to '[Object]', so the root cause is unrecoverable from every persisted artifact \u2014 a real error-handling gap, though user-visibly surfaced and recovered by retry, so medium stands." + } + }, + { + "title": "Runs 11-13 projected as 'completed' although native result had is_error:true (API 529 Overloaded)", + "category": "error-handling", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Native results in log.2: [2026-07-02T00:51:30.884Z], [00:55:19.085Z], [01:00:28.143Z] all 'subtype':'success','is_error':true,'num_turns':1 (preceded by 29 api_retry 'overloaded' events and one model_refusal_fallback at 00:51:30.857Z). Projections: runs ordinal 11/12/13 status 'completed', provider_turns 'completed'. The only user-visible signal is that the SDK emitted the error text as an assistant message (turn-item:provider:claudeAgent:native-item:9c0cead6-f253-44c8-8cc7-3c95b58a326e, run 13, text 'API Error: 529 Overloaded...'; run 11 item text 'API Error: Overloaded'). Run/attempt/turn statuses do not reflect the failure, so downstream logic treating these runs as successful is wrong.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT run_id,status FROM orchestration_v2_projection_runs WHERE run_id LIKE '%ea84f015%ordinal:1_' \" vs grep '\"is_error\":true' ~/.t3/userdata-v2/logs/provider/ea84f015-*.log.2", + "sessionKey": "ea84f015-cursor-handoff", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Evidence fully reproduces: log.2 has three decoded results with subtype 'success' but is_error:true (00:51:30.884Z 'API Error: Overloaded', 00:55:19.085Z api_error_status:529, 01:00:28.143Z), while runs ordinal 11/12/13 and their provider_turns project as 'completed' with matching completed_at timestamps, and the only trace is the error text projected as a completed assistant_message (item 9c0cead6-... on run 13). 
Code confirms the gap: terminalStatusFromResult in ClaudeAdapterV2.ts returns 'completed' for any subtype==='success' and attaches a failure payload only for non-success subtypes, even though the SDK's SDKResultSuccess type explicitly carries is_error and api_error_status \u2014 the signal is typed and available but ignored, so this is a real mapping bug, not intended behavior. Severity stays medium: the error is user-visible as assistant text, but structured run/attempt/turn statuses misreport success and the 529 detail is not queryable, which misleads any downstream logic." + } + }, + { + "title": "Native ground truth for most of the session destroyed by rotation of the opener thread's log", + "category": "lost-data", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Because codex traffic shares 71e29ba5's high-churn log (10 files x 10MB), the oldest surviving rotation (71e29ba5-...log.10) starts 2026-07-01T16:00:59.024Z. Everything earlier is deleted: c878541b runs 1-3 (01:54-15:58Z), all of de5f191a runs 1-3 (07:42-15:49Z), all of 68f7595b (05:53-05:56Z) and af66fc2c incl. its interrupted run (05:52-05:53Z). Only c878541b runs 4-6 (16:00-16:44Z) retain native logs. Consequence of the finding above; debuggability loss only \u2014 projections for the lost window exist and are internally consistent.", + "repro": "for f in ~/.t3/userdata-v2/logs/provider/71e29ba5-*.log*; do head -1 \"$f\" | cut -c2-25; done # earliest = 2026-07-01T16:00:59.024Z", + "sessionKey": "codex-no-native-logs", + "verdict": { + "isReal": true, + "adjustedSeverity": "low", + "reasoning": "Evidence fully reproduces: oldest surviving rotation (71e29ba5-...log.10) starts 2026-07-01T16:00:59.024Z; DB run windows confirm c878541b runs 1-3, all of de5f191a/68f7595b/af66fc2c predate it; native item ids from c878541b run 1 (msg_024da2f2..., call_z8cGFhrA...) appear in zero log files while run 4's msg_0e493c0caf68... 
appears 82x in .log.10, and no dedicated logs exist for the codex threads because CodexAdapterV2's protocol logger is keyed to the shared session's opener thread. Downgraded to low: the loss is a by-design consequence of intentional bounded rotation (10x10MB in EventNdjsonLogger.ts), the durable pipeline record (events + projections) for the lost window is intact and consistent, and the actual defect (codex traffic sharing a high-churn foreign thread's rotation budget) is the parent finding this one merely derives from." + } + }, + { + "title": "Terminal-failure error items carry only generic message ('Claude Agent SDK query failed.'), underlying cause not preserved", + "category": "error-handling", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "All 4 failed runs (37, 54 codex; 89, 119 claudeAgent) DO get a user-visible type='error' item (good surfacing), but payload is generic: run 119 item failure = {class:'transport_error', message:'Claude Agent SDK query failed.', code:null, retryable:null}, title 'Provider error'. 
The native log at both claude failure sites (2026-07-02T04:47:07.200Z and 16:45:25.921Z) shows only outgoing query.open/prompt.offer with no captured error line, so neither projection nor provider log retains the actual failure reason \u2014 undebuggable after the fact.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT json_extract(payload_json,'$.failure'), json_extract(payload_json,'$.title') FROM orchestration_v2_projection_turn_items WHERE thread_id='71e29ba5-654b-4f67-b363-ad400e5bd016' AND type='error'\"", + "sessionKey": "71e29ba5-claude-mega", + "verdict": { + "isReal": true, + "adjustedSeverity": "medium", + "reasoning": "Reproduced fully: claudeAgent failed runs 89/119 project only {class:'transport_error', message:'Claude Agent SDK query failed.', code:null} (event seq 43072), and the provider log at both failure sites (04:47:07.200Z log.2:790, 16:45:25.921Z log:3026) shows only outgoing query.open/prompt.offer with no error line. Source confirms the mechanism: ClaudeAgentSdkQueryRunnerError (ClaudeAdapterV2.ts:280) hardcodes its message getter while the real error lives in its cause Defect, which makeProviderFailure (ProviderFailure.ts:90) never unwraps \u2014 defeating the redaction/bounding machinery built to carry real messages (codex failures on this same thread DO carry specific messages, proving intent). The adapter's Effect.logWarning with the full cause was not found in any retained server trace log, so the failure reason is genuinely lost; medium severity stands since errors are surfaced but undebuggable." 
+ } + } + ], + "refuted": [ + { + "title": "Orchestrator ingestion gap 16:10-16:40 Jul 1 (codex era): 119 native command/fileChange items lost; assistant messages backfilled with flattened timestamps", + "category": "lost-data", + "severity": "high", + "confidence": "high", + "knownIssue": true, + "evidence": "Zero orchestration_events for the stream between 2026-07-01T16:10:19 (run 50 completed) and 16:40:12 (run 51 requested), yet the native codex log shows substantive work in that window and around it: item/started fileChange items and commandExecutions incl. 'vp test .../OrchestratorReplayFixtures', 'vp check', 'git add apps/server/src/orchestration-v2/Adapters/Claude...' (call_0j3Wjk1K3an2l1CpCRgYBp8b 16:43:43) and 'git push origin t3code/codex-turn-mapping' (call_Hc9NQMOiQ9wo3by6RnzGbShQ 16:43:50), all with source 'unifiedExecStartup' on native thread 019f1b62-f532-79e2-9553-af817911965c. 119 of 450 native call_ ids have no projected turn_item (comm diff), all first seen 16:01-16:43 Jul 1. When run 51 started, 13 assistant messages (message:provider:codex:native-item:msg_*) were backfilled with IDENTICAL created_at 2026-07-01T16:40:14.821Z describing that missing work, but 0 turn_items have startedAt in 16:11-16:40. Text recovered, tool/file items and real timestamps lost. 
Mechanism (server down/restart vs codex turn-mapping bug) not fully determinable because logs older than 16:00:59Z were rotated away; loss itself is certain.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT count(*) FROM orchestration_events WHERE stream_id='71e29ba5-654b-4f67-b363-ad400e5bd016' AND occurred_at BETWEEN '2026-07-01T16:11' AND '2026-07-01T16:39'\" -> 0; SELECT count(*) FROM orchestration_v2_projection_messages WHERE thread_id='71e29ba5-654b-4f67-b363-ad400e5bd016' AND created_at='2026-07-01T16:40:14.821Z' -> 13", + "sessionKey": "71e29ba5-claude-mega", + "verdict": { + "isReal": false, + "reasoning": "The \"lost\" work belongs to a different session: the codex provider session is shared, so this thread's log contains other threads' traffic. All gap-window log lines (1322 decoded events, 16:11-16:39) carry threadId 019f1b62-f532-..., which per orchestration_v2_projection_provider_threads is owned by thread c878541b-a832-..., not 71e29ba5 (whose codex nativeThreadRef is 019f1b50-4248-...). Exactly 119 call_ ids in the log belong to 019f1b62 and ALL 119 (including the cited call_0j3Wjk1K3an2l1CpCRgYBp8b and call_Hc9NQMOiQ9wo3by6RnzGbShQ) are projected on thread c878541b; all 62 call_ ids on this thread's own native id are projected on 71e29ba5 (comm diff = 0 both ways). The zero-event window is genuine idle time between run 50 (completed 16:10:19) and run 51 (requested 16:40:12), and the 13 messages at created_at 16:40:14.821Z are run 51's live-streamed assistant messages (e.g. msg_00c4d1e6... streamed at 16:48:38 on threadId 019f1b50) \u2014 every one of the 55 codex runs shows the same pattern of assistant messages sharing created_at = run start, so this is systematic projection behavior, not backfill of lost work." 
+ } + }, + { + "title": "All thinking/reasoning blocks dropped for main thread (122 native thinking messages, 0 reasoning items)", + "category": "lost-data", + "severity": "medium", + "confidence": "medium", + "knownIssue": false, + "evidence": "Native log contains 122 distinct main-thread assistant messages with thinking content blocks plus 646 'thinking_tokens' system events, but the main thread has zero projected reasoning turn-items (the only 2 'reasoning' items in the session are subagent-progress placeholders on the child threads, e.g. turn-item:...task%3Aac08aa451ed54d3f6%3Aprogress). ClaudeAdapterV2.ts only extracts part.type==='text' blocks (line 730: content.flatMap((part) => (part.type === \"text\" ? [part.text] : []))). May be an intentional product choice for claudeAgent, but it means no reasoning is ever visible/persisted for this provider.", + "repro": "cat log.1 log | jq 'select(.event.payload.type==\"assistant\" and .event.payload.parent_tool_use_id==null) | select([.event.payload.message.content[]?|select(.type==\"thinking\")]|length>0) | .event.payload.uuid' | sort -u | wc -l # 122, vs SELECT COUNT(*) FROM orchestration_v2_projection_turn_items WHERE thread_id='47763f5e-76c1-4d67-8037-442a280f1514' AND type='reasoning'; -- 0", + "sessionKey": "47763f5e-claude-fable", + "verdict": { + "isReal": false, + "reasoning": "The counts reproduce (155 native thinking blocks, 0 projected reasoning items), but inspection of the actual block contents refutes the lost-data claim: every final thinking block in the native log is {\"thinking\":\"\",\"signature\":\"...\"} and all 593 thinking_delta stream events are {\"thinking\":\"\",\"estimated_tokens\":N} \u2014 the Claude Agent SDK redacted all thinking text at the source for claude-fable-5, so there was never any reasoning content to project. ClaudeAdapterV2.ts:730 extracting only text parts drops nothing user-visible here; projecting these blocks would produce 155 empty reasoning items. 
The 646 'thinking_tokens' system events are token telemetry, not content." + } + }, + { + "title": "Failed cursor runs 15/16/17 keep only a generic 'Provider turn failed.' with code:null \u2014 no diagnostic detail persisted", + "category": "error-handling", + "severity": "medium", + "confidence": "high", + "knownIssue": false, + "evidence": "Error items exist and are user-visible (turn-item:provider:cursor:native-item:terminal-failure%3A...run-abf255cf / run-ceb09abc / run-f5fe7fcf, status failed, ordinals 15000098/16000002/17000002) but failure = {class:'provider_error', message:'Provider turn failed.', code:null, retryable:null}; run and run_attempt payloads contain no error field at all. Native run.completed results (log: 02:05:24.474Z, 02:16:37.138Z, 02:19:07.097Z) carry status:'error' with no message either \u2014 so cursor itself gave nothing \u2014 but available context (result.status, requestId d1cd7f44/55687657/6de8f367, durationMs 2434335/1223/657) was dropped. Run 15 did 40 min of work then errored; runs 16/17 ('continue') died in ~1s each with only 'Provider error' shown.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT payload_json FROM orchestration_v2_projection_turn_items WHERE thread_id='ea84f015-e597-4f79-9557-d39b5a895fba' AND type='error'\"", + "sessionKey": "ea84f015-cursor-handoff", + "verdict": { + "isReal": false, + "reasoning": "The evidence reproduces (three failed cursor runs projected with the generic 'Provider turn failed.' failure), but the native log shows cursor's run.completed results carried status:'error' with NO error/message/code field at all, so there was no provider diagnostic to persist. The code path is intentional: CursorAdapterV2 forwards result.error as cause, and makeProviderFailure (ProviderFailure.ts) extracts message/code when supplied, falling back to the tested DEFAULT_PROVIDER_FAILURE_MESSAGE \u2014 including an explicit test asserting arbitrary cause objects are deliberately not serialized. 
The 'dropped context' claim largely fails: result.status is persisted as failed on item/run/provider_turn, and durationMs is recoverable from the persisted provider_turn startedAt/completedAt (01:24:50.203Z\u219202:05:24.475Z \u2248 2434335ms); only cursor's requestId is unpersisted, which the ProviderFailure contract intentionally omits and which remains in the provider log. This is an upstream cursor SDK limitation plus a possible enhancement (persist requestId), not a pipeline discrepancy.", + "adjustedSeverity": "low" + } + } + ], + "lowUnverified": [ + { + "title": "Interrupt-result turn item references an interrupt-request parent that was never emitted", + "category": "lineage", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "turn_item 'turn-item:run:run%3Athread%3Aea84f015-e597-4f79-9557-d39b5a895fba%3Aordinal%3A15:signal:interrupt-result' (event seq 34176, 2026-07-02T01:24:49.393Z) has parent_item_id '...:signal:interrupt-request' with no matching row/event. Contrast thread af66fc2c-73cd-4c30-bea5-66932ecdba29 where both request (seq 20000) and result (seq 20003) were emitted. 
Code: makeInterruptResultTurnItem in apps/server/src/orchestration-v2/RunExecutionService.ts (line ~874) unconditionally derives parentItemId from the request id without ensuring that item was emitted; the request emission lives on a separate path in Orchestrator.ts:4422.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT turn_item_id, parent_item_id FROM orchestration_v2_projection_turn_items ti WHERE parent_item_id IS NOT NULL AND NOT EXISTS (SELECT 1 FROM orchestration_v2_projection_turn_items p WHERE p.turn_item_id=ti.parent_item_id);\"", + "sessionKey": "GLOBAL" + }, + { + "title": "Synthesized terminal-failure error item references a provider_turn_id that was never projected", + "category": "lineage", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "turn_item 'turn-item:provider:cursor:native-item:terminal-failure%3A...' on thread 721fc23c-2cf3-42bf-9d84-edd94359dca9 (updated 2026-06-29T21:48:40.703Z) carries providerTurnId 'provider-turn:provider:cursor:native-turn:failed%3Arun-attempt%3A...ordinal%253A2%3Aattempt%3A1' which has no row in orchestration_v2_projection_provider_turns. 
The run failed 4ms after the outgoing run.start ('what did you just say', log 2026-06-29T21:48:40.699Z) with no incoming response; the synthetic 'failed' turn id was minted for the failure item but the turn itself never projected.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT turn_item_id FROM orchestration_v2_projection_turn_items ti WHERE ti.provider_turn_id IS NOT NULL AND NOT EXISTS (SELECT 1 FROM orchestration_v2_projection_provider_turns p WHERE p.provider_turn_id=ti.provider_turn_id);\"", + "sessionKey": "GLOBAL" + }, + { + "title": "provider_sessions stay status='ready' after provider-session.detached (projection ignores detach)", + "category": "stuck-state", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Session provider-session:...claudeAgent:thread:47763f5e...:1c70abe8-4ecb-... got provider-session.detached event seq 31344 (2026-07-01T23:44:29.976Z, reason 'Thread archived.') and session ...ea84f015...:21ed85a2-... got detached seq 32566 (2026-07-02T01:03:00.273Z), yet both projection rows still show status='ready' with stale updated_at (22:48:37 / 22:12:31 \u2014 predating even later provider-session.updated events like seq 31274 at 23:42:43). 
The 16 'stopped' sessions were transitioned via updated events; detach alone leaves rows 'ready' indefinitely on archived threads.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT s.provider_session_id, s.status, s.updated_at FROM orchestration_v2_projection_provider_sessions s WHERE s.status='ready' AND EXISTS (SELECT 1 FROM orchestration_events e WHERE e.event_type='provider-session.detached' AND e.payload_json LIKE '%'||s.provider_session_id||'%');\"", + "sessionKey": "GLOBAL" + }, + { + "title": "13 backgrounded local_bash tasks projected as subagents with empty prompts and synthetic child threads", + "category": "other", + "severity": "low", + "confidence": "high", + "knownIssue": true, + "evidence": "Thread 7f1dfff1-4cbb-450b-8514-764d11c903e1 run 2 (2026-07-01T15:51): tasks bw28jiyac, bzmkw8629, bnkw62tp7, etc. are task_type 'local_bash' in the native log (task_started has only a 'description' like 'vp run --filter @t3tools/web typecheck...', no prompt), yet each got a subagent row (prompt length 0), a child thread, and a user_message prompt item with text ''. Newer local_agent tasks (a-prefixed 16-hex ids) capture full prompts (1.9k-4.3k chars). 
Predates the backgrounded-Bash redesign on t3code/codex-turn-mapping where local_bash stays a running Bash item completed cross-run; these are stale artifacts of the old mapping, not current behavior.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT turn_item_id FROM orchestration_v2_projection_turn_items WHERE type='user_message' AND coalesce(json_extract(payload_json,'$.text'),'')='';\"", + "sessionKey": "GLOBAL" + }, + { + "title": "Cursor stream stall: abandoned partial-tool-call finalized as 'completed'; provider retry duplicated assistant text", + "category": "duplicate", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Thread ea84f015 run 15: partial-tool-call toolu_01HgLigck78NC9MSVyjitDXe (edit) at log 2026-07-02T01:54:20.622Z never received tool-call-started/completed (stream stalled 6.5 min, cursor retried at 02:00:47 with new callId toolu_01NkWmCzHTn5ofjtYi4NkhsJ); the abandoned call's item 'turn-item:provider:cursor:native-item:toolu_01HgLigck78NC9MSVyjitDXe' (type file_change) was finalized status='completed' at run end (02:05:24.475Z) though the edit never executed. 
The retry also re-emitted identical assistant text ('Now the `WorkGroupSection` filter and the `SimpleWorkEntryRow` indicator.') producing items ...abf255cf...%3A6 (ordinal 15000077) and %3A7 (15000079) with duplicate visible text \u2014 faithful to two native text-delta emissions (log 01:54:19.813Z and 02:00:47.715Z), so provider-side, but rendered twice to the user.", + "repro": "grep -a 'SimpleWorkEntryRow\\` indicator' ~/.t3/userdata-v2/logs/provider/ea84f015-e597-4f79-9557-d39b5a895fba.log | cut -c1-30", + "sessionKey": "GLOBAL" + }, + { + "title": "provider_turn status has no 'waiting' literal during ScheduleWakeup sleep loops", + "category": "stuck-state", + "severity": "low", + "confidence": "high", + "knownIssue": true, + "evidence": "Known issue per assignment; currently only 1 provider_turn 'running' (thread 71e29ba5, started 2026-07-02T17:03:04.470Z \u2014 genuinely live at audit time 17:07Z), so no stale instance observed in this pass, but the schema/status distribution confirms no 'waiting' state exists (statuses seen: cancelled/completed/failed/interrupted/running).", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT status, count(*) FROM orchestration_v2_projection_provider_turns GROUP BY status;\"", + "sessionKey": "GLOBAL" + }, + { + "title": "Codex reasoning items never ingested (no content lost in this session: all summaries empty)", + "category": "lost-data", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Native log contains 5 reasoning item/completed events (e.g. rs_0cfa4cea83165175016a40a8e17de08199a84ec646c88a40cd run 2, rs_0cfa4cea...8c48/9052/9329 run 4), all with summary: []. SQL: 0 orchestration_events on stream a5a643b2-... with payload LIKE '%native-item:rs_%', and 0 turn_items of reasoning type with a codex provider_thread anywhere in the DB (172 reasoning items exist for other providers). CodexAdapterV2.ts has no reasoning item mapping (grep hits only 'reasoningEffort' config). 
With empty summaries nothing user-visible was lost here, and dropping empty reasoning may be intentional, but non-empty reasoning summaries would be silently dropped.", + "repro": "sed 's/^\\[[^]]*\\] NTIVE: //' ~/.t3/userdata-v2/logs/provider/a5a643b2-6ca8-4250-9c54-ddefe7d55565.log | jq -c 'select(.event.stage==\"decoded\") | .event.payload | select(.method==\"item/completed\" and .params.item.type==\"reasoning\")' | wc -l # 5, vs sqlite count 0", + "sessionKey": "a5a643b2-codex-heya" + }, + { + "title": "Codex reasoning items are silently dropped by CodexAdapterV2 (no handler for item type \"reasoning\")", + "category": "lost-data", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Native log emits item/started + item/completed for reasoning item rs_056ea93bc8adc0f3016a42c0594d788194ad4afb537a0a0b16 at 2026-06-29T18:58:33.314Z/33.340Z (turn 019f14bf-4440-7bb0-9655-7194b384e690), but zero rows in orchestration_events reference it and no turn_item exists. Code confirms: CodexAdapterV2.ts item/started (line 2802) and item/completed (line 2875) handlers cover userMessage, subAgentActivity, commandExecution, mcpToolCall/dynamicToolCall, fileChange, webSearch, plan, collabAgentToolCall, agentMessage \u2014 no \"reasoning\" branch and no item/reasoning delta handler, despite capabilities declaring streamsReasoning: true (line 139). 
In THIS session the item had empty summary/content so no user-visible text was lost, hence severity low \u2014 but any non-empty reasoning summary would vanish.", + "repro": "grep 'rs_056ea93bc8adc0f3' ~/.t3/userdata-v2/logs/provider/a61e9269-0180-4f4c-988a-aca00b240469.log ; sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT count(*) FROM orchestration_events WHERE payload_json LIKE '%rs_056ea93bc8adc0f3%'\" -- returns 0", + "sessionKey": "a61e9269-codex" + }, + { + "title": "thread/tokenUsage/updated and account/rateLimits/updated notifications never ingested; runs have no usage data", + "category": "lost-data", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Native log contains 5 thread/tokenUsage/updated and 5 account/rateLimits/updated decoded notifications; CodexAdapterV2.ts registers no handler for either (only item/agentMessage/delta, item/plan/delta, turn/plan/updated, turn/started, item/started, item/completed, turn/completed at lines 2693-3395). Runs projection payload has usage:null for both run:thread:a61e9269-0180-4f4c-988a-aca00b240469:ordinal:1 and :2, and no orchestration_events row carries token usage. May be intentional scope, but per-turn token accounting from the provider is discarded.", + "repro": "sed 's/^\\[[^]]*\\] NTIVE: //' ~/.t3/userdata-v2/logs/provider/a61e9269-0180-4f4c-988a-aca00b240469.log | jq -r 'select(.event.stage==\"decoded\") | .event.payload.method' | grep -c tokenUsage", + "sessionKey": "a61e9269-codex" + }, + { + "title": "Child task threads projecting only 2 items is intended, not data loss", + "category": "lost-data", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Each child stream (e.g. ...composer_call_kFjs8) has exactly 2 distinct turn-item ids across its 2711 events: ...%3Aprompt (user_message, 1 event) and ...%3Aresult (assistant_message, 1352 events growing text 4->6274 chars). 
Grok's ACP Task envelope only exposes rawInput.prompt plus streamed output text (apps/server/src/provider/acp/XAiAcpExtension.ts extractXAiAcpSubagentUpdate / xAiTaskOutputText), and the child's intermediate narration is contained in the streamed result text ('I'll read the connection files...'). Parent subagent items' result lengths (3721/6274/2647) exactly equal child assistant_message text lengths \u2014 nothing truncated.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT json_extract(payload_json,'$.id'), COUNT(*), MAX(length(json_extract(payload_json,'$.text'))) FROM orchestration_events WHERE stream_id LIKE '%composer_call_kFjs8' AND event_type='turn-item.updated' GROUP BY 1;\"", + "sessionKey": "20296b49-grok-tasks" + }, + { + "title": "Every successful grok turn logs session/prompt as failed with errorTag Interrupt", + "category": "error-handling", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Native log events ec0b4762-8c1d-41c9-b4b3-2687da09d40b (2026-06-29T21:46:13.448Z) and 5a4a5b58-40ad-4f71-b2de-6e703e0c9173 (21:47:37.827Z): payload {method:'session/prompt',status:'failed',errorTag:'Interrupt'} \u2014 yet both runs completed successfully (run ordinal 1 completedAt 21:46:13.476Z, ordinal 2 completedAt 21:47:38.686Z). Cause: XAiAcpExtension races the direct prompt request against the private _x.ai/session/prompt_complete fallback; the losing fiber is interrupted and logged as a failed request. 
Correctly NOT surfaced as a user-visible error (nothing to surface), but the 'failed' log entries are misleading noise for future debugging.", + "repro": "grep '\"status\":\"failed\"' ~/.t3/userdata-v2/logs/provider/20296b49-edbf-420b-a862-d04238c10caf.log", + "sessionKey": "20296b49-grok-tasks" + }, + { + "title": "Grok native log redacts all payload content, limiting ground-truth audits", + "category": "other", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "All 3694 protocol lines in ~/.t3/userdata-v2/logs/provider/20296b49-edbf-420b-a862-d04238c10caf.log carry only shape summaries, e.g. payload {valueType:'string',byteLength:165} or {valueType:'array',itemCount:1} \u2014 no message text, tool names, or ids. Content-level loss/truncation verification against the provider is impossible from this log; verification here relied on internal consistency (event max text length == projection text length). If other providers' logs contain full JSON, grok's redaction is an inconsistency worth knowing about.", + "repro": "grep -c 'byteLength' ~/.t3/userdata-v2/logs/provider/20296b49-edbf-420b-a862-d04238c10caf.log", + "sessionKey": "20296b49-grok-tasks" + }, + { + "title": "Child task thread items carry null run_id/provider_thread_id/provider_turn_id", + "category": "lineage", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "All 6 child items have runId=null, providerThreadId=null, providerTurnId=null (e.g. turn-item ...composer_call_Olv1X%3Aprompt payload: runId:null, nodeId:'node:...%3Achild-root', creationSource:'provider'); child threads have zero run.created / provider-turn events. node_id resolves (child-root nodes exist) and the subagents projection links parent node -> child_thread_id, so navigation works via subagents rather than runs. 
Appears intentional for provider-native task children (no orchestrator run exists), but any consumer that groups items by run_id will see these as orphans.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT turn_item_id, run_id, provider_turn_id FROM orchestration_v2_projection_turn_items WHERE thread_id LIKE 'thread:provider:grok:native-thread:019f1558%';\"", + "sessionKey": "20296b49-grok-tasks" + }, + { + "title": "run_attempts.provider_turn_id is never populated (null for all 207 attempts DB-wide) \u2014 back-link from attempt to provider turn missing", + "category": "lineage", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "This session: run-attempt:run:run%3Athread%3A721fc23c...%3Aordinal%3A1:attempt:1 has providerTurnId:null although provider-turn:provider:cursor:native-turn:run-9d9ff525-e89c-41e3-90e6-1f7e841cc188 references that exact runAttemptId (forward link intact, nativeTurnRef strong). Global check: SELECT provider_turn_id IS NULL, count(*) FROM orchestration_v2_projection_run_attempts GROUP BY 1 shows every row (completed 188, failed 12, cancelled 4, interrupted 1, running 1, superseded 1) has NULL \u2014 the column/payload field exists but nothing ever writes it. 
Forward resolution via provider_turns.run_attempt_id works, so impact is limited to reverse-lookup ergonomics; may be intentional but the schema advertises the field.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT status, provider_turn_id IS NULL, count(*) FROM orchestration_v2_projection_run_attempts GROUP BY 1,2\"", + "sessionKey": "721fc23c-cursor" + }, + { + "title": "Run orphaned by server restart is cancelled with no user-visible or persisted reason; user's message got no response", + "category": "error-handling", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Native log ends at run.started 2026-06-29T21:49:09.097Z (runId run-f062e5d9-534a-48a8-8214-48fea97bc268) with zero assistant output. Events seq 6341-6346 (command:runtime-reconcile:startup:48663fb7-...:2026-06-29T21:49:10.775Z) set run:thread:48663fb7-...:ordinal:1, its attempt, and provider-turn:provider:cursor:native-turn:run-f062e5d9-... to status 'cancelled' with no reason field in any payload, indistinguishable from a user cancel. No error/system turn_item was emitted, so the timeline shows only the user 'hello' and 'Workspace ready' with nothing explaining the missing reply. Source: ProviderRuntimeRecoveryService.ts:107 builds detail 'Cancelled because the server restarted before the provider work completed.' 
but attaches it only to runtime-request payloads and outbox effect cancellations (lines 291/316), never to run/attempt/turn/item rows \u2014 intentional code path, but a debuggability/UX gap.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT sequence, event_type, payload_json FROM orchestration_events WHERE stream_id='48663fb7-fbd6-4be2-ad14-d8163282c069' AND command_id LIKE 'command:runtime-reconcile:%'\" \u2014 note status 'cancelled' payloads contain no reason; and grep -c NTIVE ~/.t3/userdata-v2/logs/provider/48663fb7-fbd6-4be2-ad14-d8163282c069.log shows only 4 lines with no assistant output.", + "sessionKey": "48663fb7-cursor" + }, + { + "title": "file_search items never carry results (read/grep/glob outputs dropped) despite contract supporting them", + "category": "lost-data", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "All 117 file_search items across the 5 opencode child threads have results=NULL (contract OrchestrationV2FileSearchResult array at orchestrationV2.ts:843 is never populated by OpenCodeAdapterV2 \u2014 its file_search branch emits only 'pattern'). Example: prt_f15688536002X4CTVKyjMIxGfL (ses_0ea978228, ordinal 3) has only pattern '/Users/julius/Development/Work/codething-mvp'. Tool outputs (file contents read, matches found) exist in opencode's part.state.output but are discarded for this item type. 
Likely intentional UI minimalism, but it is provider-emitted content with no projection.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT COUNT(*), SUM(json_extract(payload_json,'$.results') IS NOT NULL) FROM orchestration_v2_projection_turn_items WHERE thread_id LIKE 'thread:provider:opencode:native-thread:ses_%' AND type='file_search';\" -- returns 117|0", + "sessionKey": "3029dc85-opencode" + }, + { + "title": "Placeholder 'pending' provider_thread row persists alongside the real one; orchestrator-synthesized items split across the two", + "category": "lineage", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Main thread has TWO provider_thread rows: provider-thread:provider:opencode:native-thread:pending%3Arun%3Athread%3A3029dc85...%3Aordinal%3A1 (status idle, first/last run ordinals 1-7) and the real provider-thread:provider:opencode:native-thread:ses_0ea983868ffens4ayJQLKgPE6J (idle, 1-7). 20 items (7 user_message, 7 checkpoint, 3 thread_created, 2 delegated subagent, 1 command_execution) still reference the pending placeholder while all native items reference ses_0ea983868 \u2014 e.g. thread_created item turn-item:created-thread:...126456a2...%3A0 has providerThreadId '...pending%3Arun%3A...ordinal%3A1' but providerTurnId pointing at the real session ses_0ea983868. 
Lineage is queryable but split; the placeholder was never collapsed after the native session id was learned.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT provider_thread_id, status, first_run_ordinal, last_run_ordinal FROM orchestration_v2_projection_provider_threads WHERE thread_id='3029dc85-93b7-43c5-8993-1beff41b1c98';\"", + "sessionKey": "3029dc85-opencode" + }, + { + "title": "Single occurred_at inversion on main stream (checkpoint capture vs concurrent subagent-result handoff)", + "category": "ordering", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Stream 3029dc85...: stream_version 1868 (sequence 10004, checkpoint.captured, occurred_at 2026-06-29T22:09:54.344Z, checkpoint root:name:5 for run 5) was appended AFTER stream_versions 1863-1867 (sequences 9881-9885, delegated subagent d43de13f completion + context-handoff/transfer, occurred_at 22:09:54.779Z) \u2014 occurred_at goes backwards by 435ms. Benign concurrency between run-5 completion checkpointing and the delegated claudeAgent subagent result handoff landing; projection ordinals unaffected (0 ordinal inversions vs first-seen event order across all 6 threads).", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT stream_version, sequence, event_type, occurred_at FROM (SELECT stream_version, sequence, event_type, occurred_at, LAG(occurred_at) OVER (ORDER BY stream_version) prev FROM orchestration_events WHERE stream_id='3029dc85-93b7-43c5-8993-1beff41b1c98') WHERE occurred_at < prev;\"", + "sessionKey": "3029dc85-opencode" + }, + { + "title": "Native provider log payloads are fully redacted (valueType/fieldCount only), preventing content-level loss auditing", + "category": "other", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Every one of the 17,198 log lines in 3029dc85-93b7-43c5-8993-1beff41b1c98.log(.1) has kind='protocol' with inner payload reduced to {valueType:'object', fieldCount:N} \u2014 e.g. 
message.part.updated lines carry no part id, text, or error. Only method-level counts are auditable (15,542 message.part.delta, 723 message.part.updated, 204 message.updated, 7 session.promptAsync, 5 session.created, 12 session.idle). Method counts DO reconcile with projections (7 promptAsync = 7 runs; 5 session.created = 5 child threads; 12 session.idle = 7 main turns + 5 child turns), but text truncation and the actual error string of the failed file_search cannot be verified against the wire. If these logs are meant to be 'ground truth', the opencode SSE logger is not recording decoded payloads.", + "repro": "grep -h '\"message.part.updated\"' ~/.t3/userdata-v2/logs/provider/3029dc85-93b7-43c5-8993-1beff41b1c98.log.1 | head -1", + "sessionKey": "3029dc85-opencode" + }, + { + "title": "Split provider-thread identity: run/attempt/provider_turn reference the pending-keyed provider-thread row while turn items and threads.active_provider_thread_id reference the ses_-keyed row", + "category": "lineage", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Two rows in orchestration_v2_projection_provider_threads for the same native session: 'provider-thread:provider:opencode:native-thread:pending%3Arun%3Athread%3A9f8d616d-...%3Aordinal%3A1' and 'provider-thread:provider:opencode:native-thread:ses_0ea7ea5deffe0eNd8RbfyTfgAw' (both idle, same provider_session_id, and the pending row's payload has nativeThreadRef.nativeId=ses_0ea7ea5deffe0eNd8RbfyTfgAw strength strong). projection_runs.provider_thread_id, run_attempts.provider_thread_id and provider_turns.provider_thread_id all point at the pending-keyed row, while the two provider-part turn_items (prt_f158168eb001r0Ubqqry5Z31Q1, prt_f158169d0002CgOsT8eNRkPGI3) and threads.active_provider_thread_id point at the ses_-keyed row. Notably provider_turns row provider-turn:...ses_0ea7ea5deffe0eNd8RbfyTfgAw%3Aattempt... 
encodes ses_ in its own id but its provider_thread_id column is the pending row. Pending id minted in Orchestrator.ts (`nativeThreadId: \\`pending:${runId}\\``, lines 2384/2772); the pending row is resolved via nativeThreadRef rather than rebinding downstream rows, so joins on provider_thread_id split the turn across two ids.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT provider_thread_id FROM orchestration_v2_projection_provider_turns WHERE thread_id='9f8d616d-86ee-4560-bd52-2387d6adb1c3'; SELECT DISTINCT provider_thread_id FROM orchestration_v2_projection_turn_items WHERE thread_id='9f8d616d-86ee-4560-bd52-2387d6adb1c3';\"", + "sessionKey": "9f8d616d-opencode" + }, + { + "title": "Real provider-thread row stayed status 'active' for ~20h after the provider session stopped", + "category": "stuck-state", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "provider-session.updated seq 13792 marks session stopped at 2026-06-29T23:00:38.028Z (idle reaper ~30min after last activity 22:30:30; last native log line heartbeat 2026-06-29T23:00:27.024Z). The pending-keyed provider-thread went idle at turn completion (seq 13783, 22:30:30.877Z), but the ses_0ea7ea5deffe0eNd8RbfyTfgAw provider-thread's last status update before shutdown was 'active' (seq 13779, 22:30:30.876Z) and it was only flipped to idle by seq 13797 at 2026-06-30T18:59:15.308Z, ~20h later (presumably on next app start). During that window the projection showed an active provider-thread bound to a stopped session. 
Eventually reconciled; final state is consistent.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT sequence, occurred_at, json_extract(payload_json,'$.id'), json_extract(payload_json,'$.status') FROM orchestration_events WHERE stream_id='9f8d616d-86ee-4560-bd52-2387d6adb1c3' AND event_type IN ('provider-thread.updated','provider-session.updated') ORDER BY sequence\"", + "sessionKey": "9f8d616d-opencode" + }, + { + "title": "OpenCode native log payloads are redacted to shape summaries, preventing content-level ground-truth verification", + "category": "other", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Every one of the 273 lines in 9f8d616d-86ee-4560-bd52-2387d6adb1c3.log is kind 'protocol' (opencode-sdk.sse) with payload reduced to e.g. {\"payload\":{\"valueType\":\"object\",\"fieldCount\":3}} \u2014 no message text, part ids, or tool args (contrast with claude logs which carry full decoded JSON). Only method names/counts are auditable: 31 message.part.delta, 7 message.part.updated, 6 message.updated, 1 session.idle, 0 session.error. Projected content was spot-checked instead and looks sensible and un-truncated (user 'hey'; reasoning 'The user is just saying \"hey\" - a casual greeting...'; assistant 'Hey! What can I help you with?').", + "repro": "sed -E 's/^\\[[^]]+\\] NTIVE: //' ~/.t3/userdata-v2/logs/provider/9f8d616d-86ee-4560-bd52-2387d6adb1c3.log | jq -r '[.event.payload.direction,.event.payload.method]|@tsv' | sort | uniq -c", + "sessionKey": "9f8d616d-opencode" + }, + { + "title": "run_attempt.provider_turn_id never populated (null on all 207 attempts DB-wide)", + "category": "lineage", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "run-attempt row run-attempt:...ordinal%3A1:attempt:1 has providerTurnId:null in payload_json and empty provider_turn_id column, while the provider_turn row correctly back-references runAttemptId. 
DB-wide: SELECT count(*), sum(provider_turn_id IS NOT NULL AND provider_turn_id != '') FROM orchestration_v2_projection_run_attempts \u2192 207|0. Forward lineage attempt\u2192turn is only recoverable by scanning provider_turns; may be intentional (reverse-only linkage) but the column exists and is never filled.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT count(*), sum(provider_turn_id IS NOT NULL AND provider_turn_id != '') FROM orchestration_v2_projection_run_attempts;\"", + "sessionKey": "1156181e-delegated-task" + }, + { + "title": "Provider session stayed 'running' ~25h after turn ended; only reconciled to 'stopped' at server shutdown", + "category": "stuck-state", + "severity": "low", + "confidence": "medium", + "knownIssue": true, + "evidence": "Turn completed 2026-06-28T05:22:33.360Z (last native log line); no further session events until sequence 168 provider-session.updated at 2026-06-29T06:33:36.481Z via command:runtime-reconcile:shutdown, which set status=stopped. No provider-session.detached was ever emitted for this thread. Related to the known idle-session-reaper/session-lifecycle area; state is correct now, so no data impact, but the projection reported a live session for a day for a one-shot delegated task.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT sequence, event_type, occurred_at FROM orchestration_events WHERE stream_id LIKE '%1156181e%' AND event_type LIKE 'provider-session%' ORDER BY sequence;\"", + "sessionKey": "1156181e-delegated-task" + }, + { + "title": "claudeAgent provider sessions never idle-reaped; stayed 'ready' ~21 hours until simultaneous shutdown sweep", + "category": "stuck-state", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "codex session stopped 2026-06-29T22:39:55 (seq 13789, 30 min after its 22:09:55 failure) and grok stopped 22:41:44 (seq 13790, 30 min after 22:11:44 completion) \u2014 idle reaper working. 
But all 3 claudeAgent sessions (last activity 22:08:48 / 22:09:50 / 22:13:07 on 6/29) only transitioned ready->stopped at 2026-06-30T18:59:15.286/.289/.291 (seq 13793-13795), simultaneously, ~21h later \u2014 a shutdown sweep, not the 30-min reaper. Native logs show no claude activity after 22:13, so either processes lingered or projections misreported 'ready' for a day. Inverse of the known 'reaper kills long sleeps' issue: reaper never fired for claudeAgent.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT sequence, occurred_at, json_extract(payload_json,'$.status') FROM orchestration_events WHERE event_type LIKE 'provider-session%' AND stream_id LIKE '%6d618dc4%' ORDER BY sequence;\"", + "sessionKey": "6d618dc4-mcp-group" + }, + { + "title": "Codex model-rejection error fully preserved and user-visible (works as intended)", + "category": "error-handling", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Native log 22:09:55.424 method 'error' / turn/completed status failed with message 'The gpt-5.3-codex model is not supported when using Codex with a ChatGPT account' (400 invalid_request_error). Projection: turn_item turn-item:provider:codex:native-item:terminal-failure%3A...019f156e-851a... type=error status=failed carries the verbatim JSON error in failure.message; run/run_attempt/provider_turn all 'failed'. 
No swallowing \u2014 this is a positive verification, listed for completeness.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT json_extract(payload_json,'$.failure.message') FROM orchestration_v2_projection_turn_items WHERE turn_item_id LIKE '%terminal-failure%019f156e-851a%';\"", + "sessionKey": "6d618dc4-mcp-group" + }, + { + "title": "Grok session/prompt request ends with errorTag 'Interrupt' at the instant the turn completes", + "category": "other", + "severity": "low", + "confidence": "low", + "knownIssue": false, + "evidence": "Log 2026-06-29T22:11:44.909Z: {method:'session/prompt',status:'failed',errorTag:'Interrupt'} while provider_turn 019f156e-797f...:turn:1 completedAt 22:11:44.910 status=completed and the 6663-char assistant report is fully projected (item ordinal 1000074). Likely adapter-initiated interrupt cleanup after observing turn completion (grok capabilities: supportsSteeringByInterruptRestart=true); no data lost, but a native 'failed' prompt request maps to a 'completed' turn, which could confuse future debugging.", + "repro": "grep '\"method\":\"session/prompt\",\"status\":\"failed\"' ~/.t3/userdata-v2/logs/provider/thread-mcp-6d618dc4-25fd-453b-9193-fd22a6739d1f-126456a2-f1ff-4098-a0a0-fde04bb1.log", + "sessionKey": "6d618dc4-mcp-group" + }, + { + "title": "Grok file_search/read items persist only pattern + fileName echo, no tool output; grok native log fully redacted", + "category": "lost-data", + "severity": "low", + "confidence": "low", + "knownIssue": false, + "evidence": "49 file_search items on thread :1 store e.g. pattern='pnpm-workspace.yaml', results=[{'fileName':'pnpm-workspace.yaml'}] (item turn-item:provider:grok:native-item:...call-e4489e09-...-1, ordinal 1000004) \u2014 the file content the tool returned is not kept, unlike claudeAgent command items which keep full stdout envelopes. 
Additionally the grok log redacts all decoded payloads to shape summaries ({'valueType':'array','itemCount':1}), so native-vs-projection content verification is impossible for grok; command_execution and reasoning items do carry real content, so this may be an intentional thin mapping for read-style tools.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT json_extract(payload_json,'$.pattern'), json_extract(payload_json,'$.results') FROM orchestration_v2_projection_turn_items WHERE thread_id LIKE '%126456a2%:1' AND type='file_search' LIMIT 5;\"", + "sessionKey": "6d618dc4-mcp-group" + }, + { + "title": "Both session/prompt requests logged as failed with errorTag=Interrupt despite runs completing", + "category": "error-handling", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Native log request events: session/prompt started 01:51:45.517 -> failed errorTag=Interrupt at 01:52:13.249; session/prompt started 01:53:00.071 -> failed errorTag=Interrupt at 01:53:37.665. These timestamps exactly equal provider_turn completed_at values (turn 1: 01:52:13.249, turn 2: 01:53:37.665), and both runs/turns/items project as completed with full assistant content, so this appears to be intentional adapter teardown (interrupting the pending ACP prompt fiber once the turn result is reached; grok sets interruptPromptOnCancel:false in GrokAdapterV2.ts line 90/641). 
No content was lost and no spurious failure surfaced, but the log-level 'failed' status for successful turns will mislead debugging.", + "repro": "sed 's/^\\[[^]]*\\] NTIVE: //' ~/.t3/userdata-v2/logs/provider/5dcea72d-15e1-4ded-922b-0b00c587de6c.log | jq -c 'select(.event.kind==\"request\" and .event.payload.method==\"session/prompt\")'", + "sessionKey": "5dcea72d-grok-subagent" + }, + { + "title": "run_attempts.provider_turn_id never populated (reverse link only)", + "category": "lineage", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Both attempts for this session (run-attempt:run:run%3Athread%3A5dcea72d...ordinal%3A1:attempt:1 and ...ordinal%3A2:attempt:1) have provider_turn_id NULL in both the column and payload_json; DB-wide, 0 of all rows in orchestration_v2_projection_run_attempts have a non-NULL provider_turn_id. Lineage is still resolvable via orchestration_v2_projection_provider_turns.run_attempt_id (both turns correctly reference their attempts), so this may be an intentionally unused column, but the forward link is dead.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT COUNT(*) FROM orchestration_v2_projection_run_attempts WHERE provider_turn_id IS NOT NULL;\"", + "sessionKey": "5dcea72d-grok-subagent" + }, + { + "title": "Backgrounded-bash child threads have empty-text prompt user_message items", + "category": "other", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "All 13 backgrounded-bash child threads project a user_message item task%3A%3Aprompt with text:\"\" (e.g. turn-item:provider:claudeAgent:native-item:task%3Abw28jiyac%3Aprompt, startedAt 2026-07-01T15:51:26.160Z), even though the native task_started event carried description \"vp run --filter @t3tools/web typecheck 2>&1 | tail...\". 
The command is still user-visible via the main-thread subagent item title, so impact is cosmetic (empty bubble if the child thread is opened).", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT turn_item_id FROM orchestration_v2_projection_turn_items WHERE thread_id LIKE '%7f1dfff1%' AND type='user_message' AND json_extract(payload_json,'$.text')='';\"", + "sessionKey": "7f1dfff1-claude-scheduled-tasks" + }, + { + "title": "Codex run 1 (169 items) has no surviving native log to verify against", + "category": "other", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Run ordinal 1 (codex, 2026-07-01T01:45:03Z\u201302:01:10Z, 169 items incl. 10 failed command_executions) predates the oldest surviving log rotation: 7f1dfff1-...log.1 first line is 2026-07-01T15:48:00.224Z (run 2). The codex portion of the log was rotated away, so item-level loss/ordering for run 1 could not be ground-truthed; its events/projections are internally consistent (contiguous ordinals 1000001-1000169, all terminal).", + "repro": "head -1 ~/.t3/userdata-v2/logs/provider/7f1dfff1-4cbb-450b-8514-764d11c903e1.log.1", + "sessionKey": "7f1dfff1-claude-scheduled-tasks" + }, + { + "title": "Provider session projection stuck at status 'ready' after detach/archive", + "category": "stuck-state", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "provider-session.detached event at 2026-07-01T23:44:29.976Z (reason 'Thread archived.') for provider-session:...:1c70abe8-4ecb-43f8-8ee6-9ce19beddcfe, but the orchestration_v2_projection_provider_sessions row still shows status='ready' with updated_at=2026-07-01T22:48:37.724Z and thread_id still bound. 
The event stream also shows the session flip stopped\u2192ready correctly at 22:41:50/22:48:37, so the projection simply never reflects the final detached/terminated state.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT status, updated_at FROM orchestration_v2_projection_provider_sessions WHERE thread_id='47763f5e-76c1-4d67-8037-442a280f1514';\"", + "sessionKey": "47763f5e-claude-fable" + }, + { + "title": "Detached claudeAgent provider session left in status 'ready' forever after handoff", + "category": "stuck-state", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Event seq 32566 provider-session.detached at 2026-07-02T01:03:00.273Z (reason 'Provider or model selection changed.') for provider-session:provider-instance:claudeAgent:thread:ea84f015-...:21ed85a2-2ed1-4bd8-94be-1e5affe1aa72, but the orchestration_v2_projection_provider_sessions row still shows status='ready' \u2014 no stopped update was ever emitted. The replacement cursor session was correctly moved to 'stopped' at 02:49:07 (seq 38639).", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT provider_session_id,status FROM orchestration_v2_projection_provider_sessions WHERE provider_session_id LIKE '%ea84f015%'\"", + "sessionKey": "ea84f015-cursor-handoff" + }, + { + "title": "Subagent child-thread items have null run_id / provider_thread_id / provider_turn_id", + "category": "lineage", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "All 34 items of child thread ...a4bd1db1834cb086a and 52 items of ...ab96794f302d5fd03 have runId:null, providerThreadId:null, providerTurnId:null (e.g. turn-item:provider:claudeAgent:native-item:toolu_01L5SAT2nkGMce3gWf3qFFqJ). 
Lineage back to the parent exists only via orchestration_v2_projection_subagents (child_thread_id set, run_id = parent runs ordinal 2 and 5, both completed) and the two parent subagent turn items (task%3Aa4bd1db1834cb086a%3Asubagent / task%3Aab96794f302d5fd03%3Asubagent). Likely by design (child threads have no runs of their own), but items cannot be joined to any run/turn directly.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT count(*) FROM orchestration_v2_projection_turn_items WHERE thread_id LIKE 'thread:provider:claudeAgent%ea84f015%' AND run_id IS NULL\"", + "sessionKey": "ea84f015-cursor-handoff" + }, + { + "title": "error_during_execution result during run 5 coincides with steering message \u2014 absorbed without any projected trace", + "category": "error-handling", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "log.3 line 677 [2026-07-01T23:13:24.083Z] result subtype error_during_execution, is_error:true, duration_api_ms 2473149, session 44cbb4c7. A user steering message was ingested 50ms earlier (seq 30447 message.updated 23:13:24.031Z, message 513621d3-5a03-46e1-8fd7-cc4de4b417f1, run 5), so this looks like the intentional interrupt-for-steering path; run 5 continued and completed at 23:30:07 with items after 23:13. No event records the aborted query, which is acceptable if intentional but leaves no audit trail of the abort.", + "repro": "grep -n error_during_execution ~/.t3/userdata-v2/logs/provider/ea84f015-e597-4f79-9557-d39b5a895fba.log.3", + "sessionKey": "ea84f015-cursor-handoff" + }, + { + "title": "5 tool calls that never returned natively were auto-marked 'completed' (with no results) at turn end", + "category": "lost-data", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Native log has 123 distinct tool-call-started but only 118 tool-call-completed; the 5 without completion (tool_e1f3ce39... 
glob, tool_fd1ba594..., tool_e2f6262c..., tool_840aeaa9... grep, tool_671d1b53... read) are all projected as status 'completed' with completedAt forced to the turn-end timestamp 2026-07-02T02:25:57.991Z and no results payload (e.g. turn-item:provider:cursor:native-item:tool_e1f3ce39-4604-4ced-8039-bbd5e75946b ends with pattern:'**/*' and no results field). This is the intentional finalizeTurn flush (CursorAdapterV2.ts:1886-1889 emitToolArtifacts({completed:true})), but 'completed' misrepresents tools whose results never arrived; something like 'interrupted' or an explicit no-result marker would be more truthful. Minor: cursor itself abandoned these calls, so no user-visible content was lost.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT status, json_extract(payload_json,'$.completedAt') FROM orchestration_v2_projection_turn_items WHERE json_extract(payload_json,'$.nativeItemRef.nativeId')='tool_e1f3ce39-4604-4ced-8039-bbd5e75946b';\"", + "sessionKey": "c9e72a05-cursor-failing-turns" + }, + { + "title": "Run 2 was a 7.3-minute silent hang before failing \u2014 nothing projected during the window (faithful to provider, but no progress/heartbeat signal exists)", + "category": "other", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Native log: run.started for run-f4d64e64 at 02:28:07.973Z, next event is run.completed status 'error' at 02:35:28.424Z \u2014 zero interaction.updates in between. Ingested events for the stream in that window: 0 (SELECT COUNT(*) ... occurred_at BETWEEN '02:28:09' AND '02:35:28' returns 0). 
The pipeline mirrors the provider exactly (nothing lost), but the projection offers no signal distinguishing 'model working' from 'SDK dead due to auth error' during those 440s; the user just saw a running turn that eventually failed generically.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT COUNT(*) FROM orchestration_events WHERE stream_id='c9e72a05-4c87-4dd5-b1b6-83834cc73afe' AND occurred_at > '2026-07-02T02:28:09' AND occurred_at < '2026-07-02T02:35:28';\"", + "sessionKey": "c9e72a05-cursor-failing-turns" + }, + { + "title": "Native 'read' tool calls are projected as type 'file_search'", + "category": "other", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "18 native 'read' tool calls (e.g. tool_671d1b53-bb49-46ef-94de-8f682391c93 reading a file path) are projected as type 'file_search' with the file path as 'pattern'. Intentional per CursorAdapterV2.ts:1203-1219 (glob/grep/read/ls/readLints/semSearch all map to file_search), so the content is preserved, but the item type flattens semantically distinct operations (file reads rendered as searches). Cosmetic fidelity note only.", + "repro": "sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \"SELECT type FROM orchestration_v2_projection_turn_items WHERE json_extract(payload_json,'$.nativeItemRef.nativeId')='tool_671d1b53-bb49-46ef-94de-8f682391c93';\"", + "sessionKey": "c9e72a05-cursor-failing-turns" + }, + { + "title": "Token usage from turn-ended/run.completed is dropped by the ingestion pipeline", + "category": "lost-data", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Native log line 663 (2026-07-02T07:08:07.565Z) turn-ended carries usage {inputTokens:596379, outputTokens:6430, cacheReadTokens:522080}; line 664 run.completed repeats it with totalTokens:1124889. 
No event for stream_id=8ee00dcc-fa04-4fe7-9920-cd3432d45415 contains 'usage' or 'Tokens' (query returned 0 rows), and neither the run nor provider_turn projection payloads have a usage field. grep for 'usage' in apps/server/src/orchestration-v2/Adapters/CursorAdapterV2.ts, ProviderEventIngestor.ts and ProviderAdapter.ts returns nothing \u2014 the v2 contract simply doesn't model usage, so this looks like a pipeline-wide design gap rather than a session bug, but users lose per-turn token/cost data.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT count(*) FROM orchestration_events WHERE stream_id='8ee00dcc-fa04-4fe7-9920-cd3432d45415' AND payload_json LIKE '%Tokens%';\" (returns 0) vs grep -c 'inputTokens' ~/.t3/userdata-v2/logs/provider/8ee00dcc-fa04-4fe7-9920-cd3432d45415.log (returns 2)", + "sessionKey": "8ee00dcc-cursor-spacing" + }, + { + "title": "run_attempt.provider_turn_id never backfilled (forward link missing; reverse link intact)", + "category": "lineage", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "orchestration_v2_projection_run_attempts row run-attempt:run:run%3Athread%3A8ee00dcc-fa04-4fe7-9920-cd3432d45415%3Aordinal%3A1:attempt:1 has provider_turn_id='' and payload providerTurnId:null even after completion, while orchestration_v2_projection_provider_turns row provider-turn:provider:cursor:native-turn:run-ee800ef2-cd0e-45bd-9481-65fd5cfdae67 correctly points back via run_attempt_id to that attempt. 
Lineage is resolvable only in one direction (turn->attempt).", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT a.attempt_id, a.provider_turn_id, t.provider_turn_id FROM orchestration_v2_projection_run_attempts a JOIN orchestration_v2_projection_provider_turns t ON t.run_attempt_id=a.attempt_id WHERE a.thread_id='8ee00dcc-fa04-4fe7-9920-cd3432d45415';\"", + "sessionKey": "8ee00dcc-cursor-spacing" + }, + { + "title": "file_change diffStr has malformed diff headers (a//abs/path, b//abs/path)", + "category": "other", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "turn-item:provider:cursor:native-item:tool_fee434e5-6388-483d-82f1-9e6c6ab418c (ordinal 1000042) payload diffStr begins '--- a//Users/julius/.t3/worktrees/codething-mvp/t3code-3cba2658/apps/mobile/src/features/home/HomeScreen.tsx' \u2014 absolute path concatenated after 'a/' yields double slash and a non-standard diff header; same for the other two file_change items (tool_361abe3b, tool_b586252d). Diff hunk content itself is intact (additions:2 deletions:1 recorded). Cosmetic, but breaks strict unified-diff parsers.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT substr(json_extract(payload_json,'$.diffStr'),1,80) FROM orchestration_v2_projection_turn_items WHERE thread_id='8ee00dcc-fa04-4fe7-9920-cd3432d45415' AND type='file_change';\"", + "sessionKey": "8ee00dcc-cursor-spacing" + }, + { + "title": "Codex 'reasoning' items are silently dropped (no adapter handler, no projection)", + "category": "lost-data", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Surviving log window shows 45 item/started + 45 item/completed reasoning items for native thread 019f1b62-f532 (c878541b runs 4-6), but the projection has 0 reasoning-type turn_items for any of the four threads (types present: assistant_message, command_execution, file_change, user_message, checkpoint, dynamic_tool, run_interrupt_*). 
CodexAdapterV2.ts item/started / item/completed handlers dispatch on userMessage/commandExecution/mcpToolCall/dynamicToolCall/fileChange/webSearch/plan/collabAgentToolCall/subAgentActivity and fall through for 'reasoning' despite capabilities streamsReasoning:true (line 139). Mitigating: every observed reasoning item had empty summary/content (e.g. rs_0e493c0caf681939016a4539c1c0988193b7174c9c067dfdb2, summary:[], content:[]), so no user-visible text was actually lost in this session.", + "repro": "cat ~/.t3/userdata-v2/logs/provider/71e29ba5-*.log.10 | grep '\"decoded\"' | grep '\"item/completed\"' | grep '019f1b62' | grep -c '\"reasoning\"' vs SELECT count(*) FROM orchestration_v2_projection_turn_items WHERE thread_id='c878541b-a832-4757-a254-5550e3f6c2d1' AND type LIKE '%reason%';", + "sessionKey": "codex-no-native-logs" + }, + { + "title": "contextCompaction native item has no projection representation", + "category": "lost-data", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Native log shows item/completed contextCompaction id ad331e6d-f971-46a9-a04a-c70a1b86894f on turn 019f1e69-826e-7de2-9200-b6852ff53138 (c878541b run 4, ~16:00-16:05Z); no turn_item, node, or event for it exists (payload_json LIKE '%ompaction%' matches only incidental message text). Users cannot see that context was compacted mid-run. 
Native item carries only an id (no content), so the informational loss is small.", + "repro": "grep 'contextCompaction' ~/.t3/userdata-v2/logs/provider/71e29ba5-*.log.10 | grep 019f1b62", + "sessionKey": "codex-no-native-logs" + }, + { + "title": "Run 119 marked failed while its native query kept running; retry run 120 adopted the result, leaving a duplicated user message", + "category": "duplicate", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Run 119 failed at 2026-07-02T16:45:26.143Z (transport_error, ~1.5s after requested_at 16:45:24.618Z), but the native log shows query.open + prompt.offer at 16:45:25.921Z proceeding to SessionStart:resume (16:45:41) and a successful result at 16:47:00.764Z. Retry run 120 (requested 16:45:36.222Z) completed at 16:47:01.354Z with that assistant message (message:provider:claudeAgent:native-item:693d3576-4e56-4c10-9b0f-90ff9825f663, 3190 chars, intact). Two user messages with identical text exist: 9b135da3-dd1b-4a9a-90bc-ee4166bce48d (run 119) and 0f1b1078-74cb-4488-9626-e1ddbe933a9a (run 120) \u2014 timeline shows the prompt twice (plus the error item). Content preserved; cosmetic/duplication issue only.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT message_id, run_id, created_at FROM orchestration_v2_projection_messages WHERE thread_id='71e29ba5-654b-4f67-b363-ad400e5bd016' AND created_at BETWEEN '2026-07-02T16:45' AND '2026-07-02T16:46'\"", + "sessionKey": "71e29ba5-claude-mega" + }, + { + "title": "4 native error_during_execution results silently recovered in-adapter with no projection trace", + "category": "error-handling", + "severity": "low", + "confidence": "medium", + "knownIssue": false, + "evidence": "Native results with subtype=error_during_execution/is_error=true at 2026-07-02T01:34:23.046Z, 02:29:30.034Z, 05:08:27.090Z, 17:03:10.206Z fall inside runs 73/80/95/122 \u2014 all completed with a single run attempt and no error item/event. 
At 01:34:23 the error coincides with an outgoing prompt.offer (mid-stream steering) followed immediately by system init and continued streaming, so the adapter's transparent re-query is likely intentional; but the transient error is invisible to projections (debug info only in provider log).", + "repro": "grep -h '\"subtype\":\"error_during_execution\"' ~/.t3/userdata-v2/logs/provider/71e29ba5*.log* | grep '\"stage\":\"decoded\"'", + "sessionKey": "71e29ba5-claude-mega" + }, + { + "title": "Session has 7 subagent child threads, not 9 as briefed; all 7 fully linked and consistent", + "category": "lineage", + "severity": "low", + "confidence": "high", + "knownIssue": false, + "evidence": "Native log contains exactly 7 top-level Agent tool_use spawns (toolu_01CtiEb3..., 01CrTuja..., 01TiUmm..., 01W2SNA..., 01RjCn6..., 017H3QK..., 018VQWh...); projections have 7 type='subagent' items, 7 orchestration_v2_projection_subagents rows all with non-null child_thread_id resolving to existing thread rows, and 7 child-thread event streams (79-260 events each) with items on the correct thread ids (0 items with mismatched threadId). 
The 2 failed subagents (task%3Aafbafdde..., task%3Aa0e0b5f3..., completed_at 2026-07-02T08:34:33.868Z) match native '[Request interrupted by user for tool use]' is_error tool_results for toolu_017H3QKcEb9FrecsT4tyzg1M / toolu_018VQWhHKSAP1cFkZwZGRGe5 at 08:34:33.867Z \u2014 correct failure surfacing, not a bug.", + "repro": "sqlite3 ~/.t3/userdata-v2/state.sqlite \"SELECT count(*) FROM orchestration_v2_projection_subagents WHERE thread_id='71e29ba5-654b-4f67-b363-ad400e5bd016'\"", + "sessionKey": "71e29ba5-claude-mega" + } + ] +} diff --git a/.plans/22-orchestration-v2-audit-remediation.md b/.plans/22-orchestration-v2-audit-remediation.md new file mode 100644 index 00000000000..9ec825bd378 --- /dev/null +++ b/.plans/22-orchestration-v2-audit-remediation.md @@ -0,0 +1,524 @@ +# Plan: Orchestration V2 Session Audit Remediation + +## Summary + +A full audit of every v2 orchestrator session in `~/.t3/userdata-v2` (2026-07-02, 18 session +groups, ~43k events, ~6.3k projected turn items, all 5 providers) compared native provider logs +against ingested events and item projections. Core mechanics are healthy — zero stream_version +gaps, zero ordinal collisions, positions table 100% consistent, no duplicate `nativeItemRef`s, +no stuck non-terminal state — but 23 verified discrepancies cluster into the workstreams below. + +Every finding was adversarially re-verified against the raw DB/logs and the adapter source. +Repro commands are inline. File/line references are as of branch `t3code/codex-turn-mapping` +on the audit date; expect drift. + +Status legend: `[ ]` todo · `[~]` in progress · `[x]` done + +## Tracking checklist + +- [ ] 1. Honor `is_error` on Claude SDK results (runs marked completed on 401/529) +- [ ] 2. Preserve real failure causes in projected errors (Claude adapter + ProviderFailure) +- [ ] 3. Preserve cursor failure detail (requestId, durationMs, SDK `error_code`) +- [ ] 4. Log failure/lifecycle frames in native provider logs +- [ ] 5. 
Surface provider-process crashes / reconcile cancellations to the user +- [ ] 6. Ingest codex-native collab subagents +- [ ] 7. Fix grok/ACP background subagent lifecycle + transcript projection +- [x] 8. Invisible post-turn wakeup turns (fix already on this branch — verify against audit scenarios) +- [ ] 9. Route shared-codex-session native logs to the correct thread's file +- [ ] 10. Coalesce streaming-delta event persistence (~2800x amplification) +- [x] 11. Assistant text segments merged without separator (fixed in worktree — add regression fixture) +- [ ] 12. OpenCode `file_search` items drop error/output +- [ ] 13. Low-severity backlog (see section) +- [ ] 14. Cursor SDK unhandled `write EPIPE` crashes the backend child (recurring, post-SDK-bump) + +--- + +## 1. `is_error: true` on Claude SDK results ignored — failed runs recorded as completed + +**Severity: high.** The Claude Agent SDK emits terminal `result` messages with +`subtype: "success"` but `is_error: true` for API-level failures. The adapter only branches on +subtype, so the run, run_attempt, provider_turn, and (for delegated tasks) the subagent row and +the parent's `delegate_task` item all read `completed`. + +Observed in four independent places: + +- Delegated task `hello-opus-48` (thread `1156181e`): 401 auth failure, zero token usage, + everything projected `completed`. +- Same 401 in a5a643b2's run-4 delegated child — parent `delegate_task` item output.status + `completed`. +- Thread `47763f5e` run 1: 401 result projected as a completed run; error text survives only as + a plain completed `assistant_message` (the SDK's synthetic `<synthetic>`-model message) — no + error item, so the run reads as a success. +- Thread `ea84f015` runs 11–13: `API Error: 529 Overloaded` (29 `api_retry` events preceding), + all three runs `completed`.
+ +**Root cause:** `ClaudeAdapterV2.ts` — `terminalStatusFromResult` (~line 1652) returns +`"completed"` whenever `message.subtype === "success"`; the failure branch (~line 3254) only +fires for non-success subtypes. `is_error` / `api_error_status` are never consulted. + +**Proposed fix:** + +1. In `terminalStatusFromResult` (and the result handler), treat `subtype === "success" && + is_error === true` as a failure. +2. Build a `ProviderFailure` from the result: `message` = the `result` text (e.g. `Failed to + authenticate. API Error: 401 ...`), `code` = `api_error_status` when present, plausibly + `retryable: true` for 429/529. +3. Ensure a terminal-failure error item is emitted (not just the synthetic assistant message) + and delegated-task subagent rows go to `failed`. +4. Testkit: extend `ClaudeAdapterV2.testkit.ts` + replay fixtures with an + `is_error`-on-success-subtype result. + +**Repro:** + +```sh +sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ + "SELECT status FROM orchestration_v2_projection_runs WHERE thread_id LIKE '%1156181e%';" # completed +grep 'api_error_status' ~/.t3/userdata-v2/logs/provider/thread-delegated-task-command-3amcp-3a1156181e-*.log +``` + +- [ ] Status: not started + +## 2. Terminal failures persist only generic strings — root cause unrecoverable from DB + +**Severity: high (systemic).** Every claudeAgent failed run in the DB persists exactly +`{class: "transport_error", message: "Claude Agent SDK query failed.", code: null, +retryable: null}` (threads 47763f5e run 10, 7f1dfff1 run 10, 71e29ba5 runs 89/119). The native +log at those failure sites shows only the outgoing `query.open`/`prompt.offer` with nothing +incoming — so after the fact, **no record of the actual failure exists anywhere**. The same +pattern hit cursor (issue 3) — while codex proves it can be done right: codex failures keep the +full upstream error body. 
+ +**Root causes:** + +- `ClaudeAdapterV2.ts:279-281` — `ClaudeAgentSdkQueryRunnerError.message` getter hardcodes the + string; the wrapped `cause` defect is discarded. +- `ProviderFailure.ts:90-113` — `makeProviderFailure` reads only `.message`/`.code` off the + squashed cause and never unwraps nested causes of tagged wrapper errors. +- `RunExecutionService.ts:~830` — feeds `Cause.squash(cause)` into `makeProviderFailure`, so a + tagged error's inner cause never reaches the projection. The server log printed the cause + depth-elided (`failures: [ [Object] ]`), destroying the last copy. + +**Proposed fix:** + +1. Give wrapper errors (`ClaudeAgentSdkQueryRunnerError`, `ProviderAdapterTurnStartError` at + `ProviderAdapter.ts:~291`) a message that includes `String(cause)` (or a `detail` field). +2. In `makeProviderFailure`, walk the cause chain (`Cause.prettyErrors` / unwrap `.cause`) and + persist a structured `detail` (message chain + code) on the failure payload; contracts change + in `packages/contracts/src/orchestrationV2.ts` if the failure schema needs a `detail` field. +3. Fix the server-log print to use full-depth inspection for causes. +4. Side note from 47763f5e run 10: the failed `query.open` reused a **create-style `sessionId` + param after the session had been closed** (retry with `resume:` succeeded) — check + `ProviderSessionManager`/adapter open-vs-resume selection for closed sessions while fixing. + +**Repro:** + +```sh +sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ + "SELECT thread_id, json_extract(payload_json,'$.failure.message') \ + FROM orchestration_v2_projection_turn_items WHERE type='error';" +``` + +- [ ] Status: not started + +## 3. Cursor failure detail dropped (requestId, durationMs, SDK error_code) + +**Severity: high.** Thread `c9e72a05` (the "why are cursor turns failing" session): run 2 failed +after a 440s silent turn; native `run.completed` carried `status:"error"`, +`requestId:"beca30c7-..."`, `durationMs:440732`. 
Projection kept only +`{message: "Provider turn failed.", code: null}` — requestId and duration dropped. The real +cause ("Authentication error. If you are logged in, try logging out and back in.") lives only in +the Cursor SDK's local store (`runs.error_code`), which T3 never reads. + +**Root cause:** `CursorAdapterV2.ts:2165-2168` maps `cause: (result as {error?: unknown}).error` +— that field does not exist on the SDK `RunResult`, so it always falls back to +`DEFAULT_PROVIDER_FAILURE_MESSAGE`. + +**Proposed fix:** + +1. Map what the result actually carries: persist `requestId` and `durationMs` on the failure + payload (code field is a natural home for requestId). +2. On `status === "error"`, query the Cursor SDK store for `runs.error_code` / + error message for that run id and attach it. +3. Note: verify against current SDK shape — the audit also found (refuted-as-intentional) cases + where cursor's `run.completed` genuinely carries no detail; the fix is "persist everything + present", not "invent detail". + +**Repro:** + +```sh +sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ + "SELECT json_extract(payload_json,'$.failure') FROM orchestration_v2_projection_turn_items \ + WHERE thread_id='c9e72a05-4c87-4dd5-b1b6-83834cc73afe' AND type='error';" +grep 'run.completed' ~/.t3/userdata-v2/logs/provider/c9e72a05-*.log | tail -1 +``` + +- [ ] Status: not started + +## 4. Native provider logs never record failure/lifecycle frames + +**Severity: medium.** The "ground truth" logs cannot explain failed turns. In `721fc23c` the +log ends with an outgoing `run.start` and no trace of the `agent.send` rejection 4ms later, nor +of the provider child process fatally crashing (twice, `@cursor/sdk` Node error — visible only +in `server-child.log`). No adapter has an error/lifecycle log kind at all. + +**Root cause:** e.g. 
`CursorAgentSdk.ts` `send()` — log writes cover outgoing `run.start` and +incoming frames only; `runnerError(cause, "run.start")` paths (lines ~408/440) write nothing. +Process exit is not logged to the per-thread native log either. + +**Proposed fix:** + +1. Add an `error`/`lifecycle` kind to the native event logger contract and write a frame on: + adapter send/open rejections, runner errors, provider process spawn/exit (with exit code), + and turn-abort paths — across all adapters (cursor, claude, codex, ACP, opencode). +2. Keep payloads small (message + code + native run/turn id), no secrets. + +- [ ] Status: not started + +## 5. Provider crashes / reconcile cancellations are silent to the user + +**Severity: high.** In `721fc23c` the cursor backend crashed mid-turn; startup reconcile +cancelled run 1. In `48663fb7` a server restart orphaned the run; reconcile cancelled it. In +both cases the user asked something, got **no response, no error item, and no explanation** — +a reconcile-cancelled run is projected indistinguishably from a user cancellation. + +**Root cause:** the startup runtime-reconcile (`command:runtime-reconcile:startup...`) +terminalizes runs/attempts/turns to `cancelled` but emits no user-visible item and no reason. + +**Proposed fix:** + +1. When reconcile terminalizes a run it did not start, emit an error (or `interrupted`) turn + item with a reason: e.g. "Provider process exited unexpectedly" / "Run was interrupted by an + app restart". +2. Record the cancellation reason on the run payload (`cancelReason: "runtime_reconcile" | + "user" | ...`) so UI and debugging can distinguish. +3. Optional: auto-offer retry in UI for reconcile-cancelled runs. 
+ +**Repro:** + +```sh +sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ + "SELECT event_type, occurred_at FROM orchestration_events \ + WHERE stream_id='721fc23c-2cf3-42bf-9d84-edd94359dca9' AND event_id LIKE '%runtime-reconcile%';" +grep -n 'cursor/sdk\|terminalizedRuns' ~/.t3/userdata-v2/logs/server-child.log +``` + +- [ ] Status: not started + +## 6. Codex-native collab subagents entirely missing from events and projections + +**Severity: high (lost data).** In `a5a643b2` run 3 ("spawn a subagent"), codex spawned a +native collab subagent: `subAgentActivity` item, `collabAgentToolCall` (`wait`), and a child +native thread (`019f0c93-d260`) with webSearch, reasoning, and a final `agentMessage` +("Hello."). **Zero** of it was ingested — no events, no items, no subagent row, no child +thread. The child also completed *after* the parent turn finalized, so late-arriving child +items need routing even post-turn. + +**Root cause:** `CodexAdapterV2.ts` has no handlers for `subAgentActivity` / +`collabAgentToolCall` item types, and events on non-primary native threads of the shared +app-server session are not attributed to any T3 thread. + +**Proposed fix:** + +1. Handle `subAgentActivity`: create a subagent projection row + child provider thread (mirror + of what `a61e9269`'s newer session shape already does — that session projected + `subAgentActivity` fine, so check what differs: likely experimental collab API vs newer + subagent API). +2. Handle `collabAgentToolCall` as a tool item on the parent. +3. Route child-native-thread items to the child T3 thread, including after the parent turn + completed (same routing-loosening pattern used for owned-provider-thread updates in the + wakeup work). 
+ +**Repro:** + +```sh +sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ + "SELECT COUNT(*) FROM orchestration_events WHERE payload_json LIKE '%019f0c93-d260%';" # 0 +grep -c 'subAgentActivity\|collabAgentToolCall' ~/.t3/userdata-v2/logs/provider/a5a643b2-*.log # >0 +``` + +- [ ] Status: not started + +## 7. Grok/ACP background subagent: wrong lifecycle + transcript never projected + +**Severity: high (lost data) + medium (lifecycle).** Thread `5dcea72d` ("spawn a subagent"): + +- The subagent row and parent `subagent` item were marked `completed` at spawn time + (01:52:10) with the placeholder result "Subagent started in background..." — the task + actually ran 75s (116 tool calls, per the later `TaskOutput` payload). +- The child thread got only its 2 spawn-time items; the live transcript (811 decoded incoming + ACP messages between runs) produced **zero** ingested events. The subagent's real final + output never reached the child thread. + +**Root cause:** `AcpAdapterV2.ts` `emitSubagent` (~lines 1047-1060) adopts the spawn tool's +bootstrap text as the result and a non-running taskStatus as terminal; session notifications +arriving **between runs** (no active turn) are dropped rather than buffered/routed to the +subagent's child thread. + +**Proposed fix:** + +1. Keep the subagent `running` until a terminal task status; update result/completedAt from + `TaskOutput` (or task-completion notification) when it arrives. +2. Route/buffer ACP session notifications for background tasks outside active turns to the + child thread (reuse the wakeup-buffer pattern from ClaudeAdapterV2 if applicable to ACP). +3. Related low finding: child-thread items carry null `run_id`/`provider_thread_id`/ + `provider_turn_id` — populate lineage when projecting child items. 
+ +**Repro:** + +```sh +sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ + "SELECT status, completed_at, substr(json_extract(payload_json,'$.result'),1,60) \ + FROM orchestration_v2_projection_subagents WHERE thread_id='5dcea72d-15e1-4ded-922b-0b00c587de6c';" +``` + +- [ ] Status: not started + +## 8. Invisible post-turn wakeup turns (known — fix on this branch) + +**Severity: high (lost data), known issue.** The audit quantified the pre-fix damage: + +- `47763f5e`: 10 whole wakeup turns unpersisted — 16 assistant messages + 84 tool calls, + including 5 real git pushes and the final "Converged. PR #3638 is fully green". +- `ea84f015`: 42 tool calls + final assistant messages across 3 windows — including a GitHub + comment posted with no user-visible record (`pull/2829#issuecomment-4861082710`). +- `71e29ba5`: 76 completed tool calls (60 Bash, 7 Edit, 5 Read, 4 ScheduleWakeup), all falling + in gaps between run windows, zero inside any run. + +The fix (`turn.wakeup` / `ProviderWakeupService` / `attach_wakeup` / backgrounded-bash +adoption) is implemented on `t3code/codex-turn-mapping`; these sessions ran on pre-fix builds. +First `provider_wakeup` events in the DB appear 2026-07-01T22:47:41 — post-fix threads persist +wakeups. + +**Remaining work:** + +- [x] Core fix implemented (this branch, replay fixture `claude_provider_wakeup`) +- [ ] Sanity-check the three audit scenarios against the fixture set (task-notification wakeup, + ScheduleWakeup sleep-loop, backgrounded Bash completion) — the audit evidence makes good + additional fixture material +- [ ] Known follow-ups (from memory): idle session reaper vs long sleeps; superseded-wakeup + buffer only replays task bookkeeping; provider_turn has no `waiting` status literal + +## 9. 
Shared codex session logs written to opener thread's file + +**Severity: medium.** Four codex threads (`c878541b`, `de5f191a`, `68f7595b`, `af66fc2c`) have +**no native log file at all** — their app-server traffic was written into +`71e29ba5-...log.*` because that thread opened the shared codex app-server session. +Consequence: rotation of the busy opener's log (10 files × 10MB) **destroyed the native ground +truth** for most of those threads' runs; it also produced a false "ingestion gap" signal during +the audit (another thread's traffic interleaved in 71e29ba5's log). + +**Root cause:** `CodexAdapterV2.ts:~1136` — `codexAppServerClientFactoryFromSettingsLayer.open` +builds `makeCodexAppServerProtocolLogger({ threadId: input.threadId })` once per app-server +process; `EventNdjsonLogger.write` routes to `${threadSegment}.log` from that frozen threadId. + +**Proposed fix:** + +1. Resolve the log target per message, not per process: maintain a native-thread-id → T3 + thread-id map (the adapter already tracks owned native threads) and route each protocol + frame to the owning thread's log; fall back to a shared + `codex-shared-session.log` for unattributable frames (initialize, thread/start, etc.). +2. Consider retention bump for provider logs, since they are the only ground truth + (`.plans/06-provider-logstream-lifecycle.md` is the related prior art). + +**Repro:** + +```sh +grep -l '019f1b62-f532' ~/.t3/userdata-v2/logs/provider/*.log* # only 71e29ba5-*.log.{6,9,10} +ls ~/.t3/userdata-v2/logs/provider/ | grep -c 'c878541b\|de5f191a\|68f7595b\|af66fc2c' # 0 +``` + +- [ ] Status: not started + +## 10. Streaming deltas persisted as full-row event pairs (~2800x amplification) + +**Severity: medium (cost/scale, not correctness).** Grok/ACP streams a child task's result +per-token; each chunk is persisted as a **full-row** `turn-item.updated` + `message.updated` +event pair. 
One 6,274-char result accumulated 2,704 events; a 2-minute session wrote 6,017 +rows (14% of the whole 43k-row table). Replay and projection costs scale with this. + +**Root cause:** `AcpAdapterV2.emitSubagentAssistant` emits a full event pair per +`agent_message_chunk`, and `ProviderEventIngestor` persists every emission. + +**Proposed fix:** + +1. Split live streaming from persistence: broadcast deltas to subscribers in-memory, persist + coalesced snapshots (e.g. on item completion + every N seconds/K bytes while running). +2. Alternatively debounce persistence per item id in the ingestor so any adapter gets the + benefit. +3. Check other adapters for the same pattern (claude/codex emit per-block, which is fine; + ACP per-token is the outlier). + +**Repro:** + +```sh +sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ + "SELECT event_type, COUNT(*) FROM orchestration_events \ + WHERE stream_id LIKE 'thread:provider:grok:native-thread:019f1558%' GROUP BY 1;" +``` + +- [ ] Status: not started + +## 11. Claude assistant text segments merged without separator (fixed — needs regression fixture) + +**Severity: medium, already fixed in worktree.** In the `6d618dc4` MCP group (build +`fc23be8184`), 5 interleaved assistant text blocks were accumulated +(`context.assistant.text += ...`) and emitted as ONE item at end of turn — 10,642 chars joined +with no separator, ordered after all 19 tool calls, losing interleaving. Current worktree code +(`emitAssistantTextArtifacts` per `message.uuid`) already emits per-segment items. + +**Remaining work:** + +- [x] Fix (already in worktree) +- [ ] Add a replay fixture asserting multi-segment assistant text produces one item per SDK + uuid, interleaved with tool items at correct ordinals + +## 12. 
OpenCode `file_search` items drop error/output + +**Severity: medium.** Child session `ses_0ea978228`: a failed `file_search` item projects only +`{status: 'failed', type: 'file_search', pattern: '...'}` — the provider's error message is +unrecoverable. Same shape drops successful read/grep/glob outputs (low finding). + +**Root cause:** `OpenCodeAdapterV2.ts` — `toolOutput()` (line ~659) extracts +`part.state.error`, but the `file_search` mapping branch (~line 1376) only maps `pattern`; +output/error only attach for `dynamic_tool`. The contract type +(`packages/contracts/src/orchestrationV2.ts` ~line 841) has no output/error field on +`file_search` either. + +**Proposed fix:** + +1. Add optional `output`/`error` to the `file_search` item contract. +2. Map `part.state.error` (and output where present) in the opencode/ACP `file_search` branch. +3. Grok has the same gap (`file_search`/`read` persist only pattern/fileName echo — low + finding); fix at the shared mapping level if possible. + +**Repro:** + +```sh +sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ + "SELECT payload_json FROM orchestration_v2_projection_turn_items \ + WHERE turn_item_id='turn-item:provider:opencode:native-item:prt_f15692f30001qL66As1xsUCQXc';" +``` + +- [ ] Status: not started + +## 13. Low-severity backlog + +Unverified (single-auditor) findings, grouped. 
Tick when triaged/fixed: + +**Lineage** +- [ ] `run_attempts.provider_turn_id` is null on ALL 207 attempts DB-wide (reverse link only) — + populate the forward link or drop the column +- [ ] Opencode/claude "pending"-keyed provider-thread placeholder row persists alongside the + real `ses_`-keyed row; runs/attempts reference one, items/active_provider_thread_id the + other (3029dc85, 9f8d616d) +- [ ] Subagent child-thread items carry null run/provider_thread/provider_turn ids (grok, + ea84f015) +- [ ] Interrupt-result turn item references an interrupt-request parent item that was never + emitted (GLOBAL) +- [ ] Synthesized terminal-failure error item references a provider_turn_id that was never + projected (GLOBAL) + +**Stuck state / lifecycle** +- [ ] Projection ignores `provider-session.detached` — sessions stay `ready`/`running` for + ~20-25h until shutdown sweep (GLOBAL, 1156181e, 6d618dc4, 47763f5e, ea84f015); also + claudeAgent sessions are never idle-reaped +- [ ] Provider-thread row stayed `active` ~20h after its session stopped (9f8d616d) +- [ ] Run 119 (71e29ba5) marked failed while its native query kept running; retry run 120 + adopted the result leaving a duplicated user message — dedupe/adopt semantics on retry + +**Dropped provider data (no user-visible loss observed yet)** +- [ ] Codex `reasoning` items silently dropped (no adapter handler; all observed summaries were + empty — would lose data once codex emits populated summaries) +- [ ] Codex `thread/tokenUsage/updated` + `account/rateLimits/updated` never ingested — runs + have no usage data; cursor `run.completed` usage also dropped (8ee00dcc) +- [ ] Codex `contextCompaction` item has no projection representation +- [ ] Cursor tool calls that never returned natively are auto-marked `completed` with no + results at turn end (c9e72a05) — should be `failed`/`interrupted` +- [ ] 4 native `error_during_execution` results silently recovered in-adapter with no + projection trace (71e29ba5); one absorbed 
during a steering message (ea84f015) + +**Cosmetic / observability** +- [ ] Grok + opencode native logs redact payloads to shape summaries — no content-level ground + truth for audits; consider a debug setting to log content +- [ ] Grok logs every successful `session/prompt` as `failed` with `errorTag: Interrupt` +- [ ] Cursor native `read` tool calls projected as type `file_search` (semantic mismatch) +- [ ] Cursor `file_change` diffStr has malformed headers (`a//abs/path`, `b//abs/path`) +- [ ] 13 old backgrounded `local_bash` tasks projected as subagents with empty prompts and + synthetic child threads (GLOBAL); backgrounded-bash child threads have empty-text prompt + user_message items (7f1dfff1) +- [ ] Cursor 7.3-minute silent hang before failing (c9e72a05) — no progress/heartbeat signal + exists to distinguish a hung turn from a thinking one + +## 14. Cursor SDK unhandled `write EPIPE` crashes the backend child (recurring) + +**Severity: high.** Added 2026-07-02 after new cursor sessions reproduced backend crashes. +Distinct from the June-29 crash that the `@cursor/sdk` version bump addressed. + +Three occurrences in `server-child.log`, identical signature — `Unhandled 'error' event / +Error: write EPIPE` on a `net.Socket`, followed by the `Node.js v24.15.0` footer (process +death): + +- **2026-07-02T19:04:57.558Z** — stack definitively inside `@cursor/sdk/dist/esm/357.js` + (`et.execute` → `We.execute` → `Is.execute`): a connect-RPC execute writing to a + cursor-agent socket whose far end had closed. Context: TWO cursor sessions streaming + concurrently (d1bfdd3d run-0294b303, 4f3381e5 run-c7e93ba0, each with its own provider + session, plus a native subagent task thread); both streamed token-deltas until 19:04:55.5. +- **2026-07-01T20:47:34.573Z** and **2026-07-02T04:46:20.557Z** — same signature, but stack is + only the async write-completion frame (`WriteWrap.onWriteComplete`), so origin unproven. 
No + cursor turns were active; consistent with a lingering cursor-agent connection dying idle and + a later write hitting EPIPE. + +**Collateral (ties into issues 2/4/5):** + +- d1bfdd3d run 1 + 4f3381e5 run 1 silently `cancelled` by startup reconcile at 19:05:01; + 4f3381e5 has NO error item at all (issue 5). +- d1bfdd3d run 2 (retry) failed with the generic `Failed to start run ... on cursor provider + thread ...pending...` wrapper, cause depth-elided in the log (issue 2), and targeted the + stale `pending`-keyed provider thread (issue 13 lineage note). +- The two earlier EPIPE crashes explain two previously "undebuggable" audit failures: the + restarted child killed the live claude CLI processes, so 7f1dfff1 run 10 (requested + 20:47:56, `claude-query-stream-failed` 20:47:58) and 71e29ba5 run 89 (requested 04:47:03) + failed ~20-40s after each crash with the generic "Claude Agent SDK query failed." +- The June-29 crash's actual error text is unknowable: the stderr dump was truncated at + 51,674 chars (log line cap) BEFORE the error message — only minified bundle source survived. + +**Proposed fix:** + +1. Containment: attach `error` handlers to (or wrap) the cursor SDK's sockets, and add a + `process.on('uncaughtException'/'unhandledRejection')` policy in the backend child that + fails the owning provider session instead of dying. Longer-term: isolate provider SDKs in + their own child process so an SDK crash cannot kill the orchestrator. +2. Report upstream to Cursor: SDK leaves its agent socket without an `error` listener; + `write EPIPE` after agent-process exit is fatal to the host. +3. Fix stderr log truncation: when capping captured child output, keep the TAIL (where Node + prints the error + stack), not just the head. +4. Issues 2/4/5 remain the reason this was hard to diagnose — they get us error items with + real causes, native-log failure frames, and user-visible reconcile cancellations. 
+ +**Repro:** + +```sh +grep -n "Unhandled 'error' event" ~/.t3/userdata-v2/logs/server-child.log # 3 hits +sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ + "SELECT run_id,status,completed_at FROM orchestration_v2_projection_runs \ + WHERE thread_id IN ('d1bfdd3d-38cc-4ff3-ab75-8be6dc592b00','4f3381e5-89e2-45a8-bca1-bbe5d520bbba');" +``` + +- [ ] Status: not started + +## Refuted during verification (do NOT act on) + +- 71e29ba5 "ingestion gap, 119 items lost" — traffic belonged to c878541b via the shared codex + session (see issue 9). +- 47763f5e "122 thinking blocks dropped" — all native thinking blocks were empty + (`{"thinking":""}`); nothing to lose. (Reasoning ingestion is still absent, tracked as low.) +- ea84f015 failed cursor runs 15-17 "generic error" — native results genuinely carried no + detail; nothing to persist (but see issues 3/4 for making cursor carry detail). + +## Validation + +- Re-run the audit repro queries above after each fix; each issue's repro should flip. +- Extend replay fixtures (`testkit/fixtures/`) per issue where marked; the audit evidence + (thread ids + native log excerpts) is fixture source material. +- Full audit data with per-finding evidence, verifier verdicts, and repro commands: + `.plans/22-orchestration-v2-audit-findings.json`. From a195a63c5f0811f68ae358b05484a5c4be08a104 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 02:12:32 -0700 Subject: [PATCH 03/23] Fail Claude runs on is_error results and resume idle-released sessions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two reliability fixes for the Claude adapter, both found by the session audit (.plans/22): - The SDK reports API-level failures (401 auth, 529 overloaded) as result subtype "success" with is_error: true. The adapter only checked the subtype, so those runs projected as completed with no failure recorded (threads 1156181e, a5a643b2, 47763f5e, ea84f015). 
Terminal status now honors is_error and the failure keeps the API error text and status code. - The create-vs-resume decision for query.open lived only in the in-memory openedNativeThreads set, which dies with the session runtime. After the 30-minute idle release (or an app restart) the next turn re-created an existing native session and failed in under a second — the first message after every idle gap was burned (11 occurrences across 6 threads). The persisted provider thread's firstRunOrdinal now serves as the durable resume signal, and threads are only marked opened after a successful open. New replay fixtures claude_result_is_error (from thread 47763f5e run 1) and claude_idle_resume (from thread d0fe9018 runs 6-8, using a new advance_clock fixture step that drives the real idle reaper via the test clock). Co-Authored-By: Claude Fable 5 --- .../Adapters/ClaudeAdapterV2.ts | 76 ++++++++++++++----- ...rchestratorReplayFixtures.contract.test.ts | 10 ++- .../claude_transcript.ndjson | 19 +++++ .../fixtures/claude_idle_resume/input.ts | 25 ++++++ .../fixtures/claude_idle_resume/output.ts | 42 ++++++++++ .../claude_transcript.ndjson | 10 +++ .../fixtures/claude_result_is_error/input.ts | 22 ++++++ .../fixtures/claude_result_is_error/output.ts | 71 +++++++++++++++++ .../testkit/fixtures/index.ts | 31 ++++++++ .../testkit/fixtures/shared.ts | 14 ++++ 10 files changed, 296 insertions(+), 24 deletions(-) create mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/claude_transcript.ndjson create mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/input.ts create mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/output.ts create mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/claude_transcript.ndjson create mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/input.ts create mode 100644 
apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/output.ts diff --git a/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts b/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts index 77b6ec1c959..b4171c6064b 100644 --- a/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts +++ b/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts @@ -1656,7 +1656,9 @@ function terminalStatusFromResult( "completed" | "interrupted" | "failed" | "cancelled" > { if (message.subtype === "success") { - return "completed"; + // The SDK reports API-level failures (401 auth, 529 overloaded, …) as + // subtype "success" with is_error set; the turn produced no real work. + return message.is_error ? "failed" : "completed"; } const errorText = message.errors.join("\n").toLowerCase(); if (errorText.includes("interrupt")) { @@ -1672,6 +1674,28 @@ function isClaudeActiveSteeringAbortResult(message: SDKResultMessage): boolean { return message.terminal_reason === "aborted_streaming"; } +function providerFailureFromResult( + message: SDKResultMessage, +): OrchestrationV2ProviderFailure | null { + if (message.subtype !== "success") { + return makeProviderFailure({ + message: message.errors.join("\n"), + code: message.subtype, + class: "provider_error", + }); + } + if (!message.is_error) { + return null; + } + const apiErrorStatus = message.api_error_status ?? null; + return makeProviderFailure({ + message: message.result, + code: apiErrorStatus === null ? "sdk_result_error" : `api_error_${apiErrorStatus}`, + class: "provider_error", + retryable: apiErrorStatus === 429 || apiErrorStatus === 529 ? 
true : null, + }); +} + function buildAssistantArtifacts(input: { readonly idAllocator: IdAllocatorV2Shape; readonly turnInput: ProviderAdapterV2TurnInput; @@ -3203,7 +3227,12 @@ export function makeClaudeAdapterV2( return; } - const resultText = resultTextFromSdkMessage(message); + // An is_error result's text is the error message; it belongs on the + // terminal-failure item, not on a synthetic assistant message. + const resultText = + message.type === "result" && message.subtype === "success" && message.is_error + ? null + : resultTextFromSdkMessage(message); if ( context.assistant.emittedNativeItemIds.size === 0 && context.assistant.fallbackText.length === 0 && @@ -3224,6 +3253,7 @@ export function makeClaudeAdapterV2( if ( !interrupted && message.subtype === "success" && + !message.is_error && context.pendingScheduledWakeup !== null ) { // The turn yielded on ScheduleWakeup (result stop_reason @@ -3247,19 +3277,12 @@ export function makeClaudeAdapterV2( next.delete(context.providerTurnId); return next; }); + const resultFailure = interrupted ? null : providerFailureFromResult(message); yield* finalizeActiveTurn({ context, status: interrupted ? "interrupted" : terminalStatusFromResult(message), completedAt, - ...(message.subtype === "success" || interrupted - ? {} - : { - failure: makeProviderFailure({ - message: message.errors.join("\n"), - code: message.subtype, - class: "provider_error", - }), - }), + ...(resultFailure === null ? 
{} : { failure: resultFailure }), }); } }); @@ -3395,16 +3418,17 @@ export function makeClaudeAdapterV2( yield* existing.query.close.pipe(Effect.ignore); } - const openedWithResume = yield* Ref.modify(openedNativeThreads, (current) => { - const hasOpenedThread = current.has(nativeThreadId); - if (hasOpenedThread) { - return [true, current]; - } - const updated = new Set(current); - updated.add(nativeThreadId); - return [false, updated]; - }); - const shouldResume = resumeSessionAt !== undefined || openedWithResume; + const openedThisRuntime = (yield* Ref.get(openedNativeThreads)).has(nativeThreadId); + // openedNativeThreads dies with this session runtime (idle release, + // app restart). The persisted provider thread is the durable signal: + // firstRunOrdinal is stamped when a turn starts, so an earlier run + // means the native session was already opened and must be resumed — + // a create-style open for an existing session fails the whole turn. + const establishedByPriorRun = + turnInput.providerThread.firstRunOrdinal !== null && + turnInput.providerThread.firstRunOrdinal < turnInput.runOrdinal; + const shouldResume = + resumeSessionAt !== undefined || openedThisRuntime || establishedByPriorRun; const querySession = yield* queryRunner.open({ threadId: turnInput.threadId, providerSessionId: input.providerSessionId, @@ -3425,6 +3449,16 @@ export function makeClaudeAdapterV2( ...(shouldInstallClaudePermissionCallback(queryPolicy) ? { canUseTool } : {}), }), }); + // Marked only after a successful open: a failed create must not + // leave the runtime believing the native session exists. 
+ yield* Ref.update(openedNativeThreads, (current) => { + if (current.has(nativeThreadId)) { + return current; + } + const updated = new Set(current); + updated.add(nativeThreadId); + return updated; + }); const closed = yield* Deferred.make(); const context: ClaudeLiveQueryContext = { nativeThreadId, diff --git a/apps/server/src/orchestration-v2/testkit/OrchestratorReplayFixtures.contract.test.ts b/apps/server/src/orchestration-v2/testkit/OrchestratorReplayFixtures.contract.test.ts index 2916816a02b..e4d146f4f40 100644 --- a/apps/server/src/orchestration-v2/testkit/OrchestratorReplayFixtures.contract.test.ts +++ b/apps/server/src/orchestration-v2/testkit/OrchestratorReplayFixtures.contract.test.ts @@ -69,11 +69,15 @@ describe("orchestrator replay fixture contract", () => { throw new Error(`${fixture.name}/${provider.driver} must start with thread.create`); } assert.equal(firstCommand.threadId, materialized.projectionThreadIds[0]); - // await_provider_wakeup_run steps only await runs minted by the - // wakeup dispatcher; every other input step dispatches a command. + // await_provider_wakeup_run only awaits runs minted by the wakeup + // dispatcher and advance_clock only moves the test clock; every + // other input step dispatches a command. 
const commandProducingSteps = fixture .buildInput() - .steps.filter((step) => step.type !== "await_provider_wakeup_run"); + .steps.filter( + (step) => + step.type !== "await_provider_wakeup_run" && step.type !== "advance_clock", + ); assert.equal(materialized.commands.length, commandProducingSteps.length + 1); assert.isAtLeast(materialized.steps.length, materialized.commands.length); assert.equal(typeof provider.assertOutput, "function"); diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/claude_transcript.ndjson b/apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/claude_transcript.ndjson new file mode 100644 index 00000000000..ba7fd8e044a --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/claude_transcript.ndjson @@ -0,0 +1,19 @@ +{"type":"transcript_start","provider":"claudeAgent","protocol":"claude-agent-sdk.query","version":"0.2.111","scenario":"claude_idle_resume","metadata":{"prompts":["Respond with exactly: idle resume first turn complete","Respond with exactly: idle resume second turn complete"],"model":"claude-sonnet-4-6","nativeSessionId":"1d1e5e55-1041-4e40-8e40-000000000001","queryMode":"restart","tools":"claude_code","permissionMode":"bypassPermissions","generatedBy":"manual-replay-from-thread-d0fe9018 runs 6-8 (idle-released session must reopen with resume, 2026-07-02T20:46:05Z)"}} +{"type":"expect_outbound","label":"query.open:1","frame":{"type":"query.open","options":{"model":"claude-sonnet-4-6","tools":{"type":"preset","preset":"claude_code"},"permissionMode":"bypassPermissions","allowDangerouslySkipPermissions":true,"sessionId":"1d1e5e55-1041-4e40-8e40-000000000001"}}} +{"type":"expect_outbound","label":"prompt.offer:1","frame":{"type":"prompt.offer","message":{"type":"user","message":{"role":"user","content":"Respond with exactly: idle resume first turn complete"},"parent_tool_use_id":null}}} 
+{"type":"emit_inbound","label":"system","frame":{"type":"system","subtype":"hook_started","hook_id":"96353a54-8984-4e77-8060-b1dc6b030cf6","hook_name":"SessionStart:startup","hook_event":"SessionStart","uuid":"ffbfcc28-dd82-4c2a-a99f-58e5af6e393e","session_id":"1d1e5e55-1041-4e40-8e40-000000000001"}} +{"type":"emit_inbound","label":"system","frame":{"type":"system","subtype":"hook_response","hook_id":"96353a54-8984-4e77-8060-b1dc6b030cf6","hook_name":"SessionStart:startup","hook_event":"SessionStart","output":"","stdout":"","stderr":"","exit_code":0,"outcome":"success","uuid":"bbf29fb9-d4b3-4ab2-b0e3-9d2aa833099c","session_id":"1d1e5e55-1041-4e40-8e40-000000000001"}} +{"type":"emit_inbound","label":"system","frame":{"type":"system","subtype":"init","agents":[],"apiKeySource":"none","claude_code_version":"2.1.111","cwd":"/tmp/claude-replay-claude_idle_resume","tools":[],"mcp_servers":[],"model":"claude-sonnet-4-6","permissionMode":"bypassPermissions","slash_commands":[],"output_style":"default","skills":[],"plugins":[],"fast_mode_state":"off","uuid":"f468f37d-e0e6-43cf-bcd7-71e34045083e","session_id":"1d1e5e55-1041-4e40-8e40-000000000001"}} +{"type":"emit_inbound","label":"assistant","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01DKpWHtEMhugcX5nJwxKfeM","type":"message","role":"assistant","content":[{"type":"text","text":"idle resume first turn complete"}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":0,"cache_read_input_tokens":11792,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"1d1e5e55-1041-4e40-8e40-000000000001","uuid":"7e96c9db-0543-4276-b97d-ced7f53b93c4"}} 
+{"type":"emit_inbound","label":"rate_limit_event","frame":{"type":"rate_limit_event","rate_limit_info":{"status":"allowed"},"uuid":"28620e49-aa10-42fa-a7f9-768c9c79c093","session_id":"1d1e5e55-1041-4e40-8e40-000000000001"}} +{"type":"emit_inbound","label":"result","frame":{"type":"result","subtype":"success","is_error":false,"api_error_status":null,"duration_ms":2809,"duration_api_ms":2523,"num_turns":1,"result":"idle resume first turn complete","stop_reason":"end_turn","session_id":"1d1e5e55-1041-4e40-8e40-000000000001","total_cost_usd":0.0036516,"usage":{"input_tokens":3,"cache_creation_input_tokens":0,"cache_read_input_tokens":11792,"output_tokens":7,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":0,"ephemeral_5m_input_tokens":0},"inference_geo":"","iterations":[{"input_tokens":3,"output_tokens":7,"cache_read_input_tokens":11792,"cache_creation_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"type":"message"}],"speed":"standard"},"modelUsage":{"claude-sonnet-4-6":{"inputTokens":3,"outputTokens":7,"cacheReadInputTokens":11792,"cacheCreationInputTokens":0,"webSearchRequests":0,"costUSD":0.0036516,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"terminal_reason":"completed","fast_mode_state":"off","uuid":"d88c31ae-a8fd-4ed4-a168-96cfd3b0edd7"}} +{"type":"runtime_exit","status":"success"} +{"type":"expect_outbound","label":"query.open:2","frame":{"type":"query.open","options":{"model":"claude-sonnet-4-6","tools":{"type":"preset","preset":"claude_code"},"permissionMode":"bypassPermissions","allowDangerouslySkipPermissions":true,"resume":"1d1e5e55-1041-4e40-8e40-000000000001"}}} +{"type":"expect_outbound","label":"prompt.offer:2","frame":{"type":"prompt.offer","message":{"type":"user","message":{"role":"user","content":"Respond with exactly: idle resume second turn complete"},"parent_tool_use_id":null}}} 
+{"type":"emit_inbound","label":"system","frame":{"type":"system","subtype":"hook_started","hook_id":"4db1e61e-de28-49ee-b7ef-c375670d0aae","hook_name":"SessionStart:resume","hook_event":"SessionStart","uuid":"66e900fe-b02e-41a0-ac08-92260125b13d","session_id":"377e06a0-fe46-420e-bfaa-b4b9fcc8b871"}} +{"type":"emit_inbound","label":"system","frame":{"type":"system","subtype":"hook_response","hook_id":"4db1e61e-de28-49ee-b7ef-c375670d0aae","hook_name":"SessionStart:resume","hook_event":"SessionStart","output":"","stdout":"","stderr":"","exit_code":0,"outcome":"success","uuid":"03f2754d-daee-4128-b180-c1a39faa03f0","session_id":"377e06a0-fe46-420e-bfaa-b4b9fcc8b871"}} +{"type":"emit_inbound","label":"system","frame":{"type":"system","subtype":"init","agents":[],"apiKeySource":"none","claude_code_version":"2.1.111","cwd":"/tmp/claude-replay-claude_idle_resume","tools":[],"mcp_servers":[],"model":"claude-sonnet-4-6","permissionMode":"bypassPermissions","slash_commands":[],"output_style":"default","skills":[],"plugins":[],"fast_mode_state":"off","uuid":"5e1d64c9-a2ca-4ad1-ba2c-546ba18d1785","session_id":"1d1e5e55-1041-4e40-8e40-000000000001"}} +{"type":"emit_inbound","label":"assistant","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01CiTVsN2HCfqr6Ge166z8hv","type":"message","role":"assistant","content":[{"type":"text","text":"idle resume second turn complete"}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":0,"cache_read_input_tokens":11811,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":7,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"1d1e5e55-1041-4e40-8e40-000000000001","uuid":"e821cdff-a217-4fb3-b1d0-f89387dbac41"}} 
+{"type":"emit_inbound","label":"rate_limit_event","frame":{"type":"rate_limit_event","rate_limit_info":{"status":"allowed"},"uuid":"21e2dfd3-a7f5-48d1-a343-ddb1be3198c8","session_id":"1d1e5e55-1041-4e40-8e40-000000000001"}} +{"type":"emit_inbound","label":"result","frame":{"type":"result","subtype":"success","is_error":false,"api_error_status":null,"duration_ms":2103,"duration_api_ms":1644,"num_turns":1,"result":"idle resume second turn complete","stop_reason":"end_turn","session_id":"1d1e5e55-1041-4e40-8e40-000000000001","total_cost_usd":0.0036573,"usage":{"input_tokens":3,"cache_creation_input_tokens":0,"cache_read_input_tokens":11811,"output_tokens":7,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":0,"ephemeral_5m_input_tokens":0},"inference_geo":"","iterations":[{"input_tokens":3,"output_tokens":7,"cache_read_input_tokens":11811,"cache_creation_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"type":"message"}],"speed":"standard"},"modelUsage":{"claude-sonnet-4-6":{"inputTokens":3,"outputTokens":7,"cacheReadInputTokens":11811,"cacheCreationInputTokens":0,"webSearchRequests":0,"costUSD":0.0036573,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"terminal_reason":"completed","fast_mode_state":"off","uuid":"a876579c-a18c-4237-ba4e-6105b2162bbb"}} +{"type":"runtime_exit","status":"success"} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/input.ts b/apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/input.ts new file mode 100644 index 00000000000..8ba8310f3e4 --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/input.ts @@ -0,0 +1,25 @@ +import type { OrchestratorFixtureInput } from "../shared.ts"; + +export const CLAUDE_IDLE_RESUME_PROMPT_1 = "Respond with exactly: idle resume first turn complete"; +export const 
CLAUDE_IDLE_RESUME_PROMPT_2 = "Respond with exactly: idle resume second turn complete"; + +/** + * Regression for the stale-session first-message failure (threads 47763f5e + * run 10 and d0fe9018 runs 7/8, 2026-07-01/02): the provider session manager + * idle-releases the session runtime between turns, wiping the adapter's + * in-memory openedNativeThreads set. The next turn must reopen the SDK query + * with `resume` (the persisted provider thread proves the native session + * exists) — a create-style `sessionId` open fails against an existing + * session and burned the user's first message after every idle gap. + */ +export function claudeIdleResumeInput(): OrchestratorFixtureInput { + return { + steps: [ + { type: "message", text: CLAUDE_IDLE_RESUME_PROMPT_1 }, + // Past ProviderSessionManager's 30-minute idle timeout: the reaper + // releases the live session entry and closes the SDK query. + { type: "advance_clock", duration: "31 minutes" }, + { type: "message", text: CLAUDE_IDLE_RESUME_PROMPT_2 }, + ], + }; +} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/output.ts b/apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/output.ts new file mode 100644 index 00000000000..fc747532762 --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_idle_resume/output.ts @@ -0,0 +1,42 @@ +import { assert } from "@effect/vitest"; +import type { ProviderReplayTranscript } from "@t3tools/contracts"; + +import type { OrchestratorV2ScenarioResult } from "../../OrchestratorScenario.ts"; +import { + assertBaseProjection, + assertSemanticProjectionIntegrity, + assertUserMessagesInclude, + projectionFor, +} from "../shared.ts"; +import { CLAUDE_IDLE_RESUME_PROMPT_1, CLAUDE_IDLE_RESUME_PROMPT_2 } from "./input.ts"; + +export function assertClaudeIdleResumeOutput( + result: OrchestratorV2ScenarioResult, + transcript: ProviderReplayTranscript, +) { + // The transcript itself enforces the regression: query.open:2 
expects a + // `resume`-style open. A create-style reopen (the bug) fails the replay at + // the boundary before these assertions run. Both runs must complete — the + // pre-fix behavior burned the first post-idle message as a failed run. + assertBaseProjection({ + result, + transcript, + runCount: 2, + runStatuses: ["completed", "completed"], + }); + + const projection = projectionFor(result, transcript.scenario); + assertSemanticProjectionIntegrity(projection); + assertUserMessagesInclude(projection, [ + CLAUDE_IDLE_RESUME_PROMPT_1, + CLAUDE_IDLE_RESUME_PROMPT_2, + ]); + + const assistantTexts = projection.turnItems.flatMap((item) => + item.type === "assistant_message" ? [item.text] : [], + ); + assert.deepEqual(assistantTexts, [ + "idle resume first turn complete", + "idle resume second turn complete", + ]); +} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/claude_transcript.ndjson b/apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/claude_transcript.ndjson new file mode 100644 index 00000000000..e5393a74466 --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/claude_transcript.ndjson @@ -0,0 +1,10 @@ +{"type":"transcript_start","provider":"claudeAgent","protocol":"claude-agent-sdk.query","version":"0.2.111","scenario":"claude_result_is_error","metadata":{"prompts":["Say hello before the credentials expire.","Try again now that auth is back."],"model":"claude-sonnet-4-6","nativeSessionId":"aa401aa4-e597-4e40-8e40-1e401e401e40","queryMode":"streaming","tools":"claude_code","permissionMode":"bypassPermissions","generatedBy":"manual-replay-from-thread-47763f5e run 1 (subtype success + is_error 401 result, 2026-07-01T20:56:54Z)"}} 
+{"type":"expect_outbound","label":"query.open","frame":{"type":"query.open","options":{"model":"claude-sonnet-4-6","tools":{"type":"preset","preset":"claude_code"},"permissionMode":"bypassPermissions","allowDangerouslySkipPermissions":true,"sessionId":"aa401aa4-e597-4e40-8e40-1e401e401e40"}}} +{"type":"expect_outbound","label":"prompt.offer:1","frame":{"type":"prompt.offer","message":{"type":"user","message":{"role":"user","content":"Say hello before the credentials expire."},"parent_tool_use_id":null}}} +{"type":"emit_inbound","label":"system:init:1","frame":{"type":"system","subtype":"init","agents":[],"apiKeySource":"none","claude_code_version":"2.1.183","cwd":"/tmp/claude-replay-claude_result_is_error","tools":[],"mcp_servers":[],"model":"claude-sonnet-4-6","permissionMode":"bypassPermissions","slash_commands":[],"output_style":"default","skills":[],"plugins":[],"fast_mode_state":"off","uuid":"aa401aa4-0000-4000-8000-000000000001","session_id":"aa401aa4-e597-4e40-8e40-1e401e401e40"}} +{"type":"emit_inbound","label":"assistant:synthetic_auth_error","frame":{"type":"assistant","message":{"id":"7f28a402-6d4c-491e-abbb-2b3c231a06f0","container":null,"model":"","role":"assistant","stop_details":null,"stop_reason":"stop_sequence","stop_sequence":"","type":"message","usage":{"input_tokens":0,"output_tokens":0,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":null,"cache_creation":{"ephemeral_1h_input_tokens":0,"ephemeral_5m_input_tokens":0},"inference_geo":null,"iterations":null,"speed":null},"content":[{"type":"text","text":"Failed to authenticate. 
API Error: 401 Invalid authentication credentials"}],"context_management":null},"parent_tool_use_id":null,"session_id":"aa401aa4-e597-4e40-8e40-1e401e401e40","uuid":"aa401aa4-0000-4000-8000-000000000002","error":"authentication_failed"}} +{"type":"emit_inbound","label":"result:1_is_error","frame":{"type":"result","subtype":"success","is_error":true,"api_error_status":401,"duration_ms":2116,"duration_api_ms":0,"num_turns":1,"result":"Failed to authenticate. API Error: 401 Invalid authentication credentials","stop_reason":"stop_sequence","session_id":"aa401aa4-e597-4e40-8e40-1e401e401e40","total_cost_usd":0,"usage":{"input_tokens":0,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":0,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":0,"ephemeral_5m_input_tokens":0},"inference_geo":"","iterations":[],"speed":"standard"},"modelUsage":{},"permission_denials":[],"terminal_reason":"completed","fast_mode_state":"off","uuid":"aa401aa4-0000-4000-8000-000000000003"}} +{"type":"expect_outbound","label":"prompt.offer:2","frame":{"type":"prompt.offer","message":{"type":"user","message":{"role":"user","content":"Try again now that auth is back."},"parent_tool_use_id":null}}} +{"type":"emit_inbound","label":"assistant:recovered","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01iserrorfixture1","type":"message","role":"assistant","content":[{"type":"text","text":"claude result is_error fixture recovered"}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"aa401aa4-e597-4e40-8e40-1e401e401e40","uuid":"aa401aa4-0000-4000-8000-000000000004"}} 
+{"type":"emit_inbound","label":"result:2_success","frame":{"type":"result","subtype":"success","is_error":false,"api_error_status":null,"duration_ms":4000,"duration_api_ms":3500,"num_turns":1,"result":"claude result is_error fixture recovered","stop_reason":"end_turn","session_id":"aa401aa4-e597-4e40-8e40-1e401e401e40","total_cost_usd":0.001,"usage":{"input_tokens":2,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":2,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":0,"ephemeral_5m_input_tokens":0},"inference_geo":"","iterations":[{"input_tokens":1,"output_tokens":1,"cache_read_input_tokens":0,"cache_creation_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"type":"message"}],"speed":"standard"},"modelUsage":{"claude-sonnet-4-6":{"inputTokens":2,"outputTokens":2,"cacheReadInputTokens":0,"cacheCreationInputTokens":0,"webSearchRequests":0,"costUSD":0.001,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"terminal_reason":"completed","fast_mode_state":"off","uuid":"aa401aa4-0000-4000-8000-000000000005"}} +{"type":"runtime_exit","status":"success"} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/input.ts b/apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/input.ts new file mode 100644 index 00000000000..0f7af1a6757 --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/input.ts @@ -0,0 +1,22 @@ +import { + CLAUDE_RESULT_IS_ERROR_FOLLOW_UP, + CLAUDE_RESULT_IS_ERROR_PROMPT, + type OrchestratorFixtureInput, +} from "../shared.ts"; + +/** + * Recorded from thread 47763f5e run 1 (2026-07-01): the SDK reports API-level + * failures (401 auth, 529 overloaded, …) as a result with subtype "success" + * but is_error: true, alongside a synthetic-model assistant message carrying + * the error text. 
The run must finalize as failed with the API error on the + * failure item — not as a completed run — and the session must stay usable + * for the next turn. + */ +export function claudeResultIsErrorInput(): OrchestratorFixtureInput { + return { + steps: [ + { type: "message", text: CLAUDE_RESULT_IS_ERROR_PROMPT }, + { type: "message", text: CLAUDE_RESULT_IS_ERROR_FOLLOW_UP }, + ], + }; +} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/output.ts b/apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/output.ts new file mode 100644 index 00000000000..8d676460112 --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_result_is_error/output.ts @@ -0,0 +1,71 @@ +import { assert } from "@effect/vitest"; +import type { ProviderReplayTranscript } from "@t3tools/contracts"; + +import type { OrchestratorV2ScenarioResult } from "../../OrchestratorScenario.ts"; +import { + assertBaseProjection, + assertSemanticProjectionIntegrity, + assertUserMessagesInclude, + CLAUDE_RESULT_IS_ERROR_FOLLOW_UP, + CLAUDE_RESULT_IS_ERROR_PROMPT, + projectionFor, +} from "../shared.ts"; + +const AUTH_ERROR_TEXT = "Failed to authenticate. API Error: 401 Invalid authentication credentials"; + +export function assertClaudeResultIsErrorOutput( + result: OrchestratorV2ScenarioResult, + transcript: ProviderReplayTranscript, +) { + assertBaseProjection({ + result, + transcript, + runCount: 2, + runStatuses: ["failed", "completed"], + }); + + const projection = projectionFor(result, transcript.scenario); + assertSemanticProjectionIntegrity(projection); + assertUserMessagesInclude(projection, [ + CLAUDE_RESULT_IS_ERROR_PROMPT, + CLAUDE_RESULT_IS_ERROR_FOLLOW_UP, + ]); + + // Run 1 ended with subtype "success" + is_error: the run and its provider + // turn must be failed, and the failure must preserve the API error verbatim + // (message + status code), not a generic wrapper string. 
+ const failedRun = projection.runs.find((run) => run.ordinal === 1); + assert.isDefined(failedRun); + assert.equal(failedRun?.status, "failed"); + const failedTurn = projection.providerTurns.find( + (turn) => turn.status !== "completed" && turn.status !== "running", + ); + assert.equal(failedTurn?.status, "failed"); + + const errorItem = projection.turnItems.find( + (item) => item.runId === failedRun?.id && item.type === "error", + ); + assert.isDefined(errorItem); + if (errorItem?.type !== "error") throw new Error("expected error item"); + assert.equal(errorItem.failure.message, AUTH_ERROR_TEXT); + assert.equal(errorItem.failure.code, "api_error_401"); + assert.equal(errorItem.failure.class, "provider_error"); + + // The SDK's synthetic assistant message still surfaces as ordinary + // assistant text (that is what the stream contained), but the error text + // must not be duplicated a second time via the result-text fallback. + const run1AssistantTexts = projection.turnItems.flatMap((item) => + item.runId === failedRun?.id && item.type === "assistant_message" ? [item.text] : [], + ); + assert.deepEqual(run1AssistantTexts, [AUTH_ERROR_TEXT]); + + // The failed turn must not poison the thread: the follow-up run reuses the + // same open query and completes. + const recoveredRun = projection.runs.find((run) => run.ordinal === 2); + assert.isDefined(recoveredRun); + assert.equal(recoveredRun?.status, "completed"); + const recoveredTexts = projection.turnItems.flatMap((item) => + item.runId === recoveredRun?.id && item.type === "assistant_message" ? 
[item.text] : [], + ); + assert.deepEqual(recoveredTexts, ["claude result is_error fixture recovered"]); +} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/index.ts b/apps/server/src/orchestration-v2/testkit/fixtures/index.ts index 0cc15d19410..fd756e3f10a 100644 --- a/apps/server/src/orchestration-v2/testkit/fixtures/index.ts +++ b/apps/server/src/orchestration-v2/testkit/fixtures/index.ts @@ -4,6 +4,10 @@ import { claudeLocalBashTaskInput } from "./claude_local_bash_task/input.ts"; import { assertClaudeLocalBashTaskOutput } from "./claude_local_bash_task/output.ts"; import { claudeProviderWakeupInput } from "./claude_provider_wakeup/input.ts"; import { assertClaudeProviderWakeupOutput } from "./claude_provider_wakeup/output.ts"; +import { claudeIdleResumeInput } from "./claude_idle_resume/input.ts"; +import { assertClaudeIdleResumeOutput } from "./claude_idle_resume/output.ts"; +import { claudeResultIsErrorInput } from "./claude_result_is_error/input.ts"; +import { assertClaudeResultIsErrorOutput } from "./claude_result_is_error/output.ts"; import { grokSubagentLineageInput } from "./grok_subagent_lineage/input.ts"; import { assertGrokSubagentLineageOutput } from "./grok_subagent_lineage/output.ts"; import { assertClaudeMessageSteeringOutput } from "./message_steering/claude_output.ts"; @@ -112,6 +116,33 @@ export const ORCHESTRATOR_REPLAY_FIXTURES = [ }, ], }, + { + name: "claude_idle_resume", + buildInput: claudeIdleResumeInput, + providers: [ + { + driver: ProviderDriverKind.make("claudeAgent"), + transcriptFile: new URL("./claude_idle_resume/claude_transcript.ndjson", import.meta.url), + modelSelection: CLAUDE_MODEL_SELECTION, + assertOutput: assertClaudeIdleResumeOutput, + }, + ], + }, + { + name: "claude_result_is_error", + buildInput: claudeResultIsErrorInput, + providers: [ + { + driver: ProviderDriverKind.make("claudeAgent"), + transcriptFile: new URL( + "./claude_result_is_error/claude_transcript.ndjson", + import.meta.url, + ), + 
modelSelection: CLAUDE_MODEL_SELECTION, + assertOutput: assertClaudeResultIsErrorOutput, + }, + ], + }, { name: "grok_subagent_lineage", buildInput: grokSubagentLineageInput, diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/shared.ts b/apps/server/src/orchestration-v2/testkit/fixtures/shared.ts index b24b8364c53..99fa0d023c1 100644 --- a/apps/server/src/orchestration-v2/testkit/fixtures/shared.ts +++ b/apps/server/src/orchestration-v2/testkit/fixtures/shared.ts @@ -18,6 +18,7 @@ import { type ProviderReplayTranscript, type ProviderUserInputAnswers, } from "@t3tools/contracts"; +import type * as Duration from "effect/Duration"; import * as Effect from "effect/Effect"; import type { @@ -41,6 +42,8 @@ export const CLAUDE_LOCAL_BASH_TASK_PROMPT = export const CLAUDE_PROVIDER_WAKEUP_PROMPT = "Start a background watcher that polls for new bot reviews, then confirm you are waiting."; export const CLAUDE_PROVIDER_WAKEUP_FOLLOW_UP = "Wrap up with a summary."; +export const CLAUDE_RESULT_IS_ERROR_PROMPT = "Say hello before the credentials expire."; +export const CLAUDE_RESULT_IS_ERROR_FOLLOW_UP = "Try again now that auth is back."; export const TOOL_CALL_WRITE_PROMPT = "Create or overwrite .codex-probe-write-action.txt with exactly this text: codex app-server approval fixture. Use a local shell command or file edit only, then briefly report what happened. Do not read package metadata, use GitHub, use web, or use MCP."; export const MESSAGE_STEERING_INITIAL_PROMPT = @@ -205,6 +208,14 @@ export type OrchestratorFixtureInputStep = readonly type: "await_provider_wakeup_run"; readonly runOrdinal: number; readonly status?: OrchestrationV2RunStatus; + } + | { + /** + * Advance the deterministic test clock, e.g. past the provider session + * manager's idle timeout so the next message must reopen the session. 
+ */ + readonly type: "advance_clock"; + readonly duration: Duration.Input; }; export interface OrchestratorFixtureInput { @@ -551,6 +562,9 @@ export function materializeFixtureInput(input: { status: step.status ?? "completed", }); break; + case "advance_clock": + steps.push({ type: "advance_clock", duration: step.duration }); + break; case "steer": messageIndex += 1; steps.push({ From e13be35f537139e80729ab1e19534fe899f48c6a Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 02:24:32 -0700 Subject: [PATCH 04/23] Update audit plan: tick items 1 and 15, add sections 15/16 Co-Authored-By: Claude Fable 5 --- .../22-orchestration-v2-audit-remediation.md | 68 ++++++++++++++++++- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/.plans/22-orchestration-v2-audit-remediation.md b/.plans/22-orchestration-v2-audit-remediation.md index 9ec825bd378..71d44ab272a 100644 --- a/.plans/22-orchestration-v2-audit-remediation.md +++ b/.plans/22-orchestration-v2-audit-remediation.md @@ -16,7 +16,7 @@ Status legend: `[ ]` todo · `[~]` in progress · `[x]` done ## Tracking checklist -- [ ] 1. Honor `is_error` on Claude SDK results (runs marked completed on 401/529) +- [x] 1. Honor `is_error` on Claude SDK results (runs marked completed on 401/529) - [ ] 2. Preserve real failure causes in projected errors (Claude adapter + ProviderFailure) - [ ] 3. Preserve cursor failure detail (requestId, durationMs, SDK `error_code`) - [ ] 4. Log failure/lifecycle frames in native provider logs @@ -29,7 +29,9 @@ Status legend: `[ ]` todo · `[~]` in progress · `[x]` done - [x] 11. Assistant text segments merged without separator (fixed in worktree — add regression fixture) - [ ] 12. OpenCode `file_search` items drop error/output - [ ] 13. Low-severity backlog (see section) -- [ ] 14. Cursor SDK unhandled `write EPIPE` crashes the backend child (recurring, post-SDK-bump) +- [ ] 14. 
Cursor SDK unhandled `write EPIPE` crashes the backend child (recurring, post-SDK-bump) — reported upstream to Cursor, on hold +- [x] 15. Stale Claude session: first message after idle gap always fails, retry succeeds +- [ ] 16. Steering latency invisible: queued→steer offers sit unconsumed with no UI feedback --- @@ -76,7 +78,11 @@ sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ grep 'api_error_status' ~/.t3/userdata-v2/logs/provider/thread-delegated-task-command-3amcp-3a1156181e-*.log ``` -- [ ] Status: not started +- [x] Status: FIXED (commit 8188f974be) — `terminalStatusFromResult` honors `is_error`; + `providerFailureFromResult` keeps result text + `api_error_<status>` code, retryable for + 429/529; ScheduleWakeup hold-open and result-text fallback gated on `!is_error`. Replay + fixture `claude_result_is_error` (from thread 47763f5e run 1). App-level error injection + not practical (needs a real 401 from the API); verified normal flows in-app instead. ## 2. Terminal failures persist only generic strings — root cause unrecoverable from DB @@ -506,6 +512,62 @@ sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ - [ ] Status: not started +## 15. Stale Claude session: first message after idle gap always fails (FIXED) + +**Severity: high (recurring UX failure).** Diagnosed 2026-07-03. Every claudeAgent thread left +idle past the session manager's 30-minute timeout burned the user's next message: the run +failed in <1s with the generic "Claude Agent SDK query failed.", and the immediate retry +succeeded. 11 occurrences across 6 threads (7 idle-reaper, 4 restart-triggered — including the +post-EPIPE-crash instant failures on 7f1dfff1 run 10 and 71e29ba5 run 89, retroactively +explained). + +**Root cause:** `ClaudeAdapterV2.openQuery` decided create-vs-resume solely from the in-memory +`openedNativeThreads` set (allocated per `openSession` runtime at line ~1951). 
Idle release / +crash / restart destroys the runtime; the next open saw an empty set → create-style +`sessionId:` open for a native session that already exists → SDK error. The failed attempt +pre-inserted the thread into the set, which is why the retry resumed and succeeded — the +failure itself "fixed" the state. + +**Fix (commit 8188f974be):** `shouldResume` now also consults the persisted provider thread — +`firstRunOrdinal < runOrdinal` proves an earlier run already opened the native session (note: +`firstRunOrdinal` is stamped at turn start by `ProviderTurnStartService`, so a plain non-null +check would break first-ever opens). Threads are marked opened only after `queryRunner.open` +succeeds, so a failed create no longer poisons the in-memory state either. Replay fixture +`claude_idle_resume` drives the real idle reaper via a new `advance_clock` fixture step +(31 simulated minutes) and asserts the reopen uses `resume:` with both runs completing. + +- [x] Status: FIXED (commit 8188f974be) + replay fixture. App-verified 2026-07-03 against a + real claudeAgent (Haiku) session in an isolated dev instance: turn 1 opened with + `sessionId:` (create), backend process restarted mid-thread, turn 2 opened with + `resume:` and completed on the FIRST attempt; turn 3 reused the live query + (no extra query.open). Zero failed runs, zero error items. + +## 16. Steering latency invisible: queued→steer offers sit unconsumed with no feedback + +**Severity: medium (UX, caused a perceived total failure).** Diagnosed 2026-07-03 from thread +7c366fdb (05:37–06:06 UTC): user queued a message during a 27-minute claudeAgent turn, promoted +it to steer at 06:04:50; the SDK only consumes offered messages at an internal step boundary, +which took ~76 seconds. 
Nothing in the events or UI distinguishes "steer accepted by app" from +"provider actually acting on it", so the user saw silence, nudged twice more (each nudge fired +another `query.offer` into the same unconsumed stream), concluded the session was dead, and +manually restarted the app — which cleanly stopped every provider session (mass +`provider-session → stopped` at 06:06:18.6, recovery `terminalizedRuns: 0` at 06:06:27 proves +no crash). The first post-restart run then insta-failed (issue 15), completing the "session +died" impression. NOT part of the EPIPE crash family. + +**Proposed fix:** + +1. Emit a steering-pending signal when the `provider-turn.steer` effect is dispatched + (`Orchestrator.ts` `dispatchSteerIntoRun`), and resolve it when the adapter observes the + SDK's `aborted_streaming` result (`ClaudeAdapterV2.ts` steering-abort branch) — UI shows + "steering — the agent will pick this up at its next step" instead of silence. +2. Coalesce repeated steers targeting the same run while one is unacknowledged (queue as + follow-up context rather than stacking `query.offer`s). +3. Verify a steered-but-unconsumed message survives an app restart (or is re-queued) rather + than being dropped. 
+ +- [ ] Status: not started + ## Refuted during verification (do NOT act on) - 71e29ba5 "ingestion gap, 119 items lost" — traffic belonged to c878541b via the shared codex From 37c59fdaad71a0da16737f92b6a1798075a70978 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 02:36:14 -0700 Subject: [PATCH 05/23] Preserve nested failure causes in projected provider errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adapter errors are tagged wrappers whose message getter is a fixed string ("Claude Agent SDK query failed.", "Failed to start run ...") while the real defect lives in .cause — makeProviderFailure only read the wrapper, so every failed run persisted a generic message with the root cause recorded nowhere (audit plan #2: 8 undebuggable failures across 5 threads). makeProviderFailure now walks the cause chain (bounded depth, deduped) and joins the messages, and picks up the deepest error code. The run-execution failure log also prints the Cause pretty-formatted instead of a depth-elided object. App-verified: breaking the Claude binary path now projects "Claude Agent SDK query failed. ← Claude Code native binary not found at /tmp/nonexistent-claude-binary. ..." as the user-visible error item. 
Co-Authored-By: Claude Fable 5 --- .../orchestration-v2/ProviderFailure.test.ts | 22 +++++++++ .../src/orchestration-v2/ProviderFailure.ts | 49 +++++++++++++++++-- .../orchestration-v2/RunExecutionService.ts | 5 +- 3 files changed, 71 insertions(+), 5 deletions(-) diff --git a/apps/server/src/orchestration-v2/ProviderFailure.test.ts b/apps/server/src/orchestration-v2/ProviderFailure.test.ts index 68cd5e2d227..94a8d500af1 100644 --- a/apps/server/src/orchestration-v2/ProviderFailure.test.ts +++ b/apps/server/src/orchestration-v2/ProviderFailure.test.ts @@ -66,6 +66,28 @@ it("does not split a surrogate pair at the truncation boundary", () => { assert.notMatch(failure.message.slice(0, -1), /[\uD800-\uDBFF]$/u); }); +it("preserves the nested cause chain of wrapper errors", () => { + const sdkError = new Error("spawn claude ENOENT"); + (sdkError as Error & { code?: string }).code = "ENOENT"; + const wrapper = new Error("Claude Agent SDK query failed.", { cause: sdkError }); + + const failure = makeProviderFailure({ cause: wrapper, class: "transport_error" }); + + assert.equal(failure.message, "Claude Agent SDK query failed. 
← spawn claude ENOENT"); + assert.equal(failure.code, "ENOENT"); +}); + +it("uses the deepest available message without duplicating repeated ones", () => { + const failure = makeProviderFailure({ + cause: { + message: "Failed to start run", + cause: { message: "Failed to start run", cause: "socket hang up" }, + }, + }); + + assert.equal(failure.message, "Failed to start run ← socket hang up"); +}); + it("does not serialize arbitrary provider causes", () => { const failure = makeProviderFailure({ cause: { diff --git a/apps/server/src/orchestration-v2/ProviderFailure.ts b/apps/server/src/orchestration-v2/ProviderFailure.ts index 866d7740662..d797f1e8f22 100644 --- a/apps/server/src/orchestration-v2/ProviderFailure.ts +++ b/apps/server/src/orchestration-v2/ProviderFailure.ts @@ -87,6 +87,43 @@ function boundedText(value: string, maxLength: number): string { return `${redacted.slice(0, end)}…`; } +const MAX_FAILURE_CAUSE_DEPTH = 6; + +/** + * Adapter errors are tagged wrappers whose `message` getter is a fixed + * human-readable string while the actual defect lives in `.cause` (often + * nested several levels deep). Walking the chain is what keeps the real + * failure reason debuggable after the fact — persisting only the wrapper + * message produced errors like "Claude Agent SDK query failed." with the + * underlying exception recorded nowhere. 
+ */ +function causeChain(value: unknown): { + readonly messages: ReadonlyArray; + readonly code: string | undefined; +} { + const messages: Array = []; + let code: string | undefined; + const seen = new Set(); + let current: unknown = value; + for (let depth = 0; depth < MAX_FAILURE_CAUSE_DEPTH; depth += 1) { + if (current === null || current === undefined || seen.has(current)) break; + seen.add(current); + if (typeof current === "string") { + if (current.length > 0 && messages.at(-1) !== current) messages.push(current); + break; + } + if (typeof current !== "object" && !(current instanceof Error)) break; + const message = + current instanceof Error ? current.message : stringField(current, "message"); + if (typeof message === "string" && message.length > 0 && messages.at(-1) !== message) { + messages.push(message); + } + code ??= stringField(current, "code"); + current = (current as { readonly cause?: unknown }).cause; + } + return { messages, code }; +} + export function makeProviderFailure(input: { readonly cause?: unknown; readonly message?: string | undefined; @@ -94,12 +131,16 @@ export function makeProviderFailure(input: { readonly class?: OrchestrationV2ProviderFailureClass; readonly retryable?: boolean | null; }): OrchestrationV2ProviderFailure { + const chain = causeChain(input.cause); + const chainMessage = chain.messages.length === 0 ? undefined : chain.messages.join(" ← "); const rawMessage = - input.message ?? - (input.cause instanceof Error ? input.cause.message : stringField(input.cause, "message")) ?? - DEFAULT_PROVIDER_FAILURE_MESSAGE; + input.message === undefined + ? (chainMessage ?? DEFAULT_PROVIDER_FAILURE_MESSAGE) + : chainMessage === undefined || chainMessage === input.message + ? input.message + : `${input.message} ← ${chainMessage}`; const message = boundedText(rawMessage, MAX_PROVIDER_FAILURE_MESSAGE_LENGTH); - const rawCode = input.code ?? stringField(input.cause, "code") ?? null; + const rawCode = input.code ?? chain.code ?? 
null; const code = rawCode === null ? null : boundedText(rawCode, MAX_PROVIDER_FAILURE_CODE_LENGTH) || null; diff --git a/apps/server/src/orchestration-v2/RunExecutionService.ts b/apps/server/src/orchestration-v2/RunExecutionService.ts index a1010daa699..d45469d1d77 100644 --- a/apps/server/src/orchestration-v2/RunExecutionService.ts +++ b/apps/server/src/orchestration-v2/RunExecutionService.ts @@ -810,7 +810,10 @@ export const layer: Layer.Layer< Effect.catchCause((cause) => Effect.logError("orchestration V2 provider turn start failed", { runId: input.run.id, - cause, + // Pretty-printed: the structured object gets depth-elided to + // "[ [Object] ]" by the log sink, destroying the only copy + // of the real failure reason. + cause: Cause.pretty(cause), }).pipe( Effect.andThen(Fiber.interrupt(providerEventFiber)), Effect.andThen(Ref.get(latestProviderThread)), From cd0178e4f23405235425b29e1433de984903ec63 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 02:36:48 -0700 Subject: [PATCH 06/23] Tick audit plan item 2 Co-Authored-By: Claude Fable 5 --- .plans/22-orchestration-v2-audit-remediation.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.plans/22-orchestration-v2-audit-remediation.md b/.plans/22-orchestration-v2-audit-remediation.md index 71d44ab272a..38fdc17318d 100644 --- a/.plans/22-orchestration-v2-audit-remediation.md +++ b/.plans/22-orchestration-v2-audit-remediation.md @@ -17,7 +17,7 @@ Status legend: `[ ]` todo · `[~]` in progress · `[x]` done ## Tracking checklist - [x] 1. Honor `is_error` on Claude SDK results (runs marked completed on 401/529) -- [ ] 2. Preserve real failure causes in projected errors (Claude adapter + ProviderFailure) +- [x] 2. Preserve real failure causes in projected errors (Claude adapter + ProviderFailure) - [ ] 3. Preserve cursor failure detail (requestId, durationMs, SDK `error_code`) - [ ] 4. Log failure/lifecycle frames in native provider logs - [ ] 5. 
Surface provider-process crashes / reconcile cancellations to the user @@ -124,7 +124,12 @@ sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ FROM orchestration_v2_projection_turn_items WHERE type='error';" ``` -- [ ] Status: not started +- [x] Status: FIXED (commit 5669dc4644) — `makeProviderFailure` walks the cause chain (bounded, + deduped) and joins messages with " ← ", picks up the deepest `code`; run-execution failure + log prints `Cause.pretty`. Unit tests added. App-verified 2026-07-03: with a broken Claude + binary path, the user-visible error item reads "Claude Agent SDK query failed. ← Claude + Code native binary not found at ...". The open-vs-resume side note was fixed under + issue 15. Cursor-specific detail (requestId/durationMs/error_code) tracked in issue 3. ## 3. Cursor failure detail dropped (requestId, durationMs, SDK error_code) From 24f3a78176e990c587b5b00614d39612f2c3294f Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 02:46:06 -0700 Subject: [PATCH 07/23] Persist cursor run correlation data on turn failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A failed cursor turn projected only the generic "Provider turn failed." because the adapter read a nonexistent `error` field off RunResult (@cursor/sdk 1.0.22 carries no error text on results; errorCode lives only in the SDK's internal run store). Persist what the result does carry — run id, requestId, and duration — so failures can be matched to Cursor-side logs (audit plan #3, thread c9e72a05 run 2 lost requestId beca30c7 + 440s duration). The speculative `error` read stays as a makeProviderFailure cause for future SDK versions. 
Co-Authored-By: Claude Fable 5 --- .../orchestration-v2/Adapters/CursorAdapterV2.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/apps/server/src/orchestration-v2/Adapters/CursorAdapterV2.ts b/apps/server/src/orchestration-v2/Adapters/CursorAdapterV2.ts index bda1e0d909c..ae5ae177648 100644 --- a/apps/server/src/orchestration-v2/Adapters/CursorAdapterV2.ts +++ b/apps/server/src/orchestration-v2/Adapters/CursorAdapterV2.ts @@ -2162,7 +2162,22 @@ export function makeCursorAdapterV2( status, ...(status === "failed" ? { + // RunResult carries no error text in @cursor/sdk + // 1.0.22 (errorCode lives only in the SDK's + // internal run store); persist the correlation + // data we do get so the failure can be matched to + // Cursor-side logs. `error` is read speculatively + // for future SDK versions. failure: makeProviderFailure({ + message: [ + `Cursor run ${result.id} ended with status "error".`, + ...(result.requestId === undefined + ? [] + : [`requestId ${result.requestId}`]), + ...(result.durationMs === undefined + ? [] + : [`after ${Math.round(result.durationMs / 1000)}s`]), + ].join(" "), cause: (result as { readonly error?: unknown }).error, class: "provider_error", }), From 2ff9384c04c858dc62d08cfbc42212c44f784169 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 02:46:36 -0700 Subject: [PATCH 08/23] Tick audit plan item 3 Co-Authored-By: Claude Fable 5 --- .plans/22-orchestration-v2-audit-remediation.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.plans/22-orchestration-v2-audit-remediation.md b/.plans/22-orchestration-v2-audit-remediation.md index 38fdc17318d..ff034219421 100644 --- a/.plans/22-orchestration-v2-audit-remediation.md +++ b/.plans/22-orchestration-v2-audit-remediation.md @@ -18,7 +18,7 @@ Status legend: `[ ]` todo · `[~]` in progress · `[x]` done - [x] 1. Honor `is_error` on Claude SDK results (runs marked completed on 401/529) - [x] 2. 
Preserve real failure causes in projected errors (Claude adapter + ProviderFailure) -- [ ] 3. Preserve cursor failure detail (requestId, durationMs, SDK `error_code`) +- [x] 3. Preserve cursor failure detail (requestId, durationMs; SDK exposes no error text yet) - [ ] 4. Log failure/lifecycle frames in native provider logs - [ ] 5. Surface provider-process crashes / reconcile cancellations to the user - [ ] 6. Ingest codex-native collab subagents @@ -163,7 +163,11 @@ sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ grep 'run.completed' ~/.t3/userdata-v2/logs/provider/c9e72a05-*.log | tail -1 ``` -- [ ] Status: not started +- [x] Status: FIXED for what the SDK provides (commit after 5669dc4644) — failed cursor turns + now persist "Cursor run <id> ended with status \"error\". requestId <requestId> after <n>s". + @cursor/sdk 1.0.22 exposes NO error text/errorCode on RunResult or Run (store-internal + only) — ask Cursor to surface errorCode alongside the EPIPE report (issue 14). Real + cursor-failure app verification deferred: isolated dev home has no cursor credentials. ## 4. Native provider logs never record failure/lifecycle frames From 7e9eede624bfac8be3c6a9aca2d00c1a239c332c Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 02:56:42 -0700 Subject: [PATCH 09/23] Keep output/error on opencode file_search items MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Failed read/grep/glob tools projected only {status: failed, pattern} — the provider's error message was unrecoverable from projections, and successful outputs were dropped too (audit plan #12, thread 3029dc85 item prt_f15692f3). The file_search contract gains optional output and error fields; the OpenCode adapter maps part.state.output/error by terminal status. App-verified with a real OpenCode agent: a successful package.json read projects the content in output, and a read of a nonexistent file projects status failed with error "File not found: ...". 
Co-Authored-By: Claude Fable 5 --- .../src/orchestration-v2/Adapters/OpenCodeAdapterV2.ts | 8 ++++++++ packages/contracts/src/orchestrationV2.ts | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/apps/server/src/orchestration-v2/Adapters/OpenCodeAdapterV2.ts b/apps/server/src/orchestration-v2/Adapters/OpenCodeAdapterV2.ts index 7d8b8a8c47f..34f31f100b4 100644 --- a/apps/server/src/orchestration-v2/Adapters/OpenCodeAdapterV2.ts +++ b/apps/server/src/orchestration-v2/Adapters/OpenCodeAdapterV2.ts @@ -1380,6 +1380,14 @@ export function makeOpenCodeAdapterV2(options: OpenCodeAdapterV2Options): Provid ...(recordString(input, "pattern", "query", "path", "filePath") === undefined ? {} : { pattern: recordString(input, "pattern", "query", "path", "filePath")! }), + // Keep the tool's outcome: a failed read/grep/glob previously + // projected only the pattern, leaving the provider's error + // message unrecoverable (audit plan #12). + ...(output === undefined + ? {} + : part.state.status === "error" + ? 
{ error: output } + : { output }), }; } else if (projectionKind === "web_search") { const pattern = recordString(input, "query", "url", "pattern"); diff --git a/packages/contracts/src/orchestrationV2.ts b/packages/contracts/src/orchestrationV2.ts index a8ed633c602..e2657d26b7f 100644 --- a/packages/contracts/src/orchestrationV2.ts +++ b/packages/contracts/src/orchestrationV2.ts @@ -841,6 +841,8 @@ export const OrchestrationV2TurnItem = Schema.Union([ type: Schema.Literal("file_search"), pattern: Schema.optional(Schema.String), results: Schema.optional(Schema.Array(OrchestrationV2FileSearchResult)), + output: Schema.optional(Schema.String), + error: Schema.optional(Schema.String), }), Schema.Struct({ ...OrchestrationV2TurnItemBaseFields, @@ -1435,6 +1437,8 @@ export const OrchestrationV2TurnItemJson = Schema.Union([ type: Schema.Literal("file_search"), pattern: Schema.optional(Schema.String), results: Schema.optional(Schema.Array(OrchestrationV2FileSearchResult)), + output: Schema.optional(Schema.String), + error: Schema.optional(Schema.String), }), Schema.Struct({ ...OrchestrationV2TurnItemJsonBaseFields, From 5ab65ac5f2ed71f27e6fc05222c86ae574907de1 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 02:56:56 -0700 Subject: [PATCH 10/23] Tick audit plan item 12 Co-Authored-By: Claude Fable 5 --- .plans/22-orchestration-v2-audit-remediation.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.plans/22-orchestration-v2-audit-remediation.md b/.plans/22-orchestration-v2-audit-remediation.md index ff034219421..3ba0bc6cb07 100644 --- a/.plans/22-orchestration-v2-audit-remediation.md +++ b/.plans/22-orchestration-v2-audit-remediation.md @@ -27,7 +27,7 @@ Status legend: `[ ]` todo · `[~]` in progress · `[x]` done - [ ] 9. Route shared-codex-session native logs to the correct thread's file - [ ] 10. Coalesce streaming-delta event persistence (~2800x amplification) - [x] 11. 
Assistant text segments merged without separator (fixed in worktree — add regression fixture) -- [ ] 12. OpenCode `file_search` items drop error/output +- [x] 12. OpenCode `file_search` items drop error/output - [ ] 13. Low-severity backlog (see section) - [ ] 14. Cursor SDK unhandled `write EPIPE` crashes the backend child (recurring, post-SDK-bump) — reported upstream to Cursor, on hold - [x] 15. Stale Claude session: first message after idle gap always fails, retry succeeds @@ -414,7 +414,11 @@ sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ WHERE turn_item_id='turn-item:provider:opencode:native-item:prt_f15692f30001qL66As1xsUCQXc';" ``` -- [ ] Status: not started +- [x] Status: FIXED — file_search contract gains optional output/error; OpenCode adapter maps + part.state.output/error by terminal status. App-verified with a real OpenCode agent + (successful read → output populated; missing file → status failed with the provider's + "File not found: ..." error preserved). Grok/ACP file_search already carries + results[].preview; its redaction gap stays in the low backlog. ## 13. Low-severity backlog From 59eaa56c044408fcef7fc5a69311a03c4d29fd0b Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 03:01:26 -0700 Subject: [PATCH 11/23] Add claude_text_segments regression fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Locks in per-uuid assistant text segmentation (audit plan #11): the 6d618dc4 MCP session on build fc23be8184 merged 5 interleaved assistant text segments into one separator-less end-of-turn item ordered after all tool calls. The current emitAssistantTextArtifacts path already projects one item per SDK message uuid; this fixture asserts text → command → text ordering survives replay. 
Co-Authored-By: Claude Fable 5 --- .../claude_transcript.ndjson | 10 +++++ .../fixtures/claude_text_segments/input.ts | 18 +++++++++ .../fixtures/claude_text_segments/output.ts | 40 +++++++++++++++++++ .../testkit/fixtures/index.ts | 17 ++++++++ 4 files changed, 85 insertions(+) create mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/claude_transcript.ndjson create mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/input.ts create mode 100644 apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/output.ts diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/claude_transcript.ndjson b/apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/claude_transcript.ndjson new file mode 100644 index 00000000000..bd02994f55f --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/claude_transcript.ndjson @@ -0,0 +1,10 @@ +{"type":"transcript_start","provider":"claudeAgent","protocol":"claude-agent-sdk.query","version":"0.2.111","scenario":"claude_text_segments","metadata":{"prompts":["List the workspace files, narrating before and after."],"model":"claude-sonnet-4-6","nativeSessionId":"5e97e975-1041-4e40-8e40-000000000002","queryMode":"streaming","tools":"claude_code","permissionMode":"bypassPermissions","generatedBy":"manual-replay-from-thread-mcp-6d618dc4 (5 interleaved assistant text segments were merged into one end-of-turn item, 2026-06-29T22:13:07Z)"}} +{"type":"expect_outbound","label":"query.open","frame":{"type":"query.open","options":{"model":"claude-sonnet-4-6","tools":{"type":"preset","preset":"claude_code"},"permissionMode":"bypassPermissions","allowDangerouslySkipPermissions":true,"sessionId":"5e97e975-1041-4e40-8e40-000000000002"}}} +{"type":"expect_outbound","label":"prompt.offer:1","frame":{"type":"prompt.offer","message":{"type":"user","message":{"role":"user","content":"List the workspace files, 
narrating before and after."},"parent_tool_use_id":null}}} +{"type":"emit_inbound","label":"system:init:1","frame":{"type":"system","subtype":"init","agents":[],"apiKeySource":"none","claude_code_version":"2.1.183","cwd":"/tmp/claude-replay-claude_text_segments","tools":[],"mcp_servers":[],"model":"claude-sonnet-4-6","permissionMode":"bypassPermissions","slash_commands":[],"output_style":"default","skills":[],"plugins":[],"fast_mode_state":"off","uuid":"5e975e97-0000-4000-8000-000000000001","session_id":"5e97e975-1041-4e40-8e40-000000000002"}} +{"type":"emit_inbound","label":"assistant:segment_one","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01segmentfixture1","type":"message","role":"assistant","content":[{"type":"text","text":"Listing the workspace now."}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"5e97e975-1041-4e40-8e40-000000000002","uuid":"5e975e97-0000-4000-8000-000000000002"}} +{"type":"emit_inbound","label":"assistant:ls_bash","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01segmentfixture2","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01segmentfixturels1","name":"Bash","input":{"command":"ls","description":"List workspace files"},"caller":{"type":"direct"}}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"5e97e975-1041-4e40-8e40-000000000002","uuid":"5e975e97-0000-4000-8000-000000000003"}} 
+{"type":"emit_inbound","label":"user:ls_result","frame":{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01segmentfixturels1","type":"tool_result","content":"package.json\ntsconfig.json","is_error":false}]},"parent_tool_use_id":null,"session_id":"5e97e975-1041-4e40-8e40-000000000002","uuid":"5e975e97-0000-4000-8000-000000000004","tool_use_result":{"stdout":"package.json\ntsconfig.json","stderr":"","interrupted":false,"isImage":false,"noOutputExpected":false}}} +{"type":"emit_inbound","label":"assistant:segment_two","frame":{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01segmentfixture3","type":"message","role":"assistant","content":[{"type":"text","text":"Two files found: package.json and tsconfig.json."}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"5e97e975-1041-4e40-8e40-000000000002","uuid":"5e975e97-0000-4000-8000-000000000005"}} +{"type":"emit_inbound","label":"result:1","frame":{"type":"result","subtype":"success","is_error":false,"api_error_status":null,"duration_ms":4000,"duration_api_ms":3500,"num_turns":2,"result":"Two files found: package.json and 
tsconfig.json.","stop_reason":"end_turn","session_id":"5e97e975-1041-4e40-8e40-000000000002","total_cost_usd":0.001,"usage":{"input_tokens":2,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":2,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":0,"ephemeral_5m_input_tokens":0},"inference_geo":"","iterations":[],"speed":"standard"},"modelUsage":{"claude-sonnet-4-6":{"inputTokens":2,"outputTokens":2,"cacheReadInputTokens":0,"cacheCreationInputTokens":0,"webSearchRequests":0,"costUSD":0.001,"contextWindow":200000,"maxOutputTokens":32000}},"permission_denials":[],"terminal_reason":"completed","fast_mode_state":"off","uuid":"5e975e97-0000-4000-8000-000000000006"}} +{"type":"runtime_exit","status":"success"} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/input.ts b/apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/input.ts new file mode 100644 index 00000000000..bc457235647 --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/input.ts @@ -0,0 +1,18 @@ +import type { OrchestratorFixtureInput } from "../shared.ts"; + +export const CLAUDE_TEXT_SEGMENTS_PROMPT = + "List the workspace files, narrating before and after."; + +/** + * Regression for merged assistant text (audit plan #11, thread + * thread:mcp:6d618dc4 on build fc23be8184): interleaved assistant text + * segments were accumulated and emitted as ONE item at result time — text + * joined with no separator and ordered after all tool calls, losing the + * narrate → tool → narrate structure. Each SDK assistant message uuid must + * project its own assistant_message item at its position in the stream. 
+ */ +export function claudeTextSegmentsInput(): OrchestratorFixtureInput { + return { + steps: [{ type: "message", text: CLAUDE_TEXT_SEGMENTS_PROMPT }], + }; +} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/output.ts b/apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/output.ts new file mode 100644 index 00000000000..04126ecbade --- /dev/null +++ b/apps/server/src/orchestration-v2/testkit/fixtures/claude_text_segments/output.ts @@ -0,0 +1,40 @@ +import { assert } from "@effect/vitest"; +import type { ProviderReplayTranscript } from "@t3tools/contracts"; + +import type { OrchestratorV2ScenarioResult } from "../../OrchestratorScenario.ts"; +import { + assertBaseProjection, + assertSemanticProjectionIntegrity, + assertUserMessagesInclude, + projectionFor, +} from "../shared.ts"; +import { CLAUDE_TEXT_SEGMENTS_PROMPT } from "./input.ts"; + +export function assertClaudeTextSegmentsOutput( + result: OrchestratorV2ScenarioResult, + transcript: ProviderReplayTranscript, +) { + assertBaseProjection({ + result, + transcript, + runCount: 1, + runStatuses: ["completed"], + }); + + const projection = projectionFor(result, transcript.scenario); + assertSemanticProjectionIntegrity(projection); + assertUserMessagesInclude(projection, [CLAUDE_TEXT_SEGMENTS_PROMPT]); + + // The stream interleaved text → tool → text. Each assistant message uuid + // must be its own item, ordered around the command — never one merged + // blob after all tools. + const ordered = [...projection.turnItems] + .sort((a, b) => a.ordinal - b.ordinal) + .filter((item) => item.type === "assistant_message" || item.type === "command_execution") + .map((item) => (item.type === "assistant_message" ? 
`text:${item.text}` : "command")); + assert.deepEqual(ordered, [ + "text:Listing the workspace now.", + "command", + "text:Two files found: package.json and tsconfig.json.", + ]); +} diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/index.ts b/apps/server/src/orchestration-v2/testkit/fixtures/index.ts index fd756e3f10a..5cabe48dc87 100644 --- a/apps/server/src/orchestration-v2/testkit/fixtures/index.ts +++ b/apps/server/src/orchestration-v2/testkit/fixtures/index.ts @@ -8,6 +8,8 @@ import { claudeIdleResumeInput } from "./claude_idle_resume/input.ts"; import { assertClaudeIdleResumeOutput } from "./claude_idle_resume/output.ts"; import { claudeResultIsErrorInput } from "./claude_result_is_error/input.ts"; import { assertClaudeResultIsErrorOutput } from "./claude_result_is_error/output.ts"; +import { claudeTextSegmentsInput } from "./claude_text_segments/input.ts"; +import { assertClaudeTextSegmentsOutput } from "./claude_text_segments/output.ts"; import { grokSubagentLineageInput } from "./grok_subagent_lineage/input.ts"; import { assertGrokSubagentLineageOutput } from "./grok_subagent_lineage/output.ts"; import { assertClaudeMessageSteeringOutput } from "./message_steering/claude_output.ts"; @@ -143,6 +145,21 @@ export const ORCHESTRATOR_REPLAY_FIXTURES = [ }, ], }, + { + name: "claude_text_segments", + buildInput: claudeTextSegmentsInput, + providers: [ + { + driver: ProviderDriverKind.make("claudeAgent"), + transcriptFile: new URL( + "./claude_text_segments/claude_transcript.ndjson", + import.meta.url, + ), + modelSelection: CLAUDE_MODEL_SELECTION, + assertOutput: assertClaudeTextSegmentsOutput, + }, + ], + }, { name: "grok_subagent_lineage", buildInput: grokSubagentLineageInput, From 6c17b5d41c1645186bad70fd58badd7a37d90102 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 03:01:36 -0700 Subject: [PATCH 12/23] Tick audit plan item 11 Co-Authored-By: Claude Fable 5 --- .plans/22-orchestration-v2-audit-remediation.md | 6 +++--- 1 
file changed, 3 insertions(+), 3 deletions(-) diff --git a/.plans/22-orchestration-v2-audit-remediation.md b/.plans/22-orchestration-v2-audit-remediation.md index 3ba0bc6cb07..1376acb6ac5 100644 --- a/.plans/22-orchestration-v2-audit-remediation.md +++ b/.plans/22-orchestration-v2-audit-remediation.md @@ -26,7 +26,7 @@ Status legend: `[ ]` todo · `[~]` in progress · `[x]` done - [x] 8. Invisible post-turn wakeup turns (fix already on this branch — verify against audit scenarios) - [ ] 9. Route shared-codex-session native logs to the correct thread's file - [ ] 10. Coalesce streaming-delta event persistence (~2800x amplification) -- [x] 11. Assistant text segments merged without separator (fixed in worktree — add regression fixture) +- [x] 11. Assistant text segments merged without separator (fixed; regression fixture claude_text_segments added) - [x] 12. OpenCode `file_search` items drop error/output - [ ] 13. Low-severity backlog (see section) - [ ] 14. Cursor SDK unhandled `write EPIPE` crashes the backend child (recurring, post-SDK-bump) — reported upstream to Cursor, on hold @@ -384,8 +384,8 @@ with no separator, ordered after all 19 tool calls, losing interleaving. Current **Remaining work:** - [x] Fix (already in worktree) -- [ ] Add a replay fixture asserting multi-segment assistant text produces one item per SDK - uuid, interleaved with tool items at correct ordinals +- [x] Replay fixture `claude_text_segments` asserts text → command → text ordering with one + assistant item per SDK uuid ## 12. 
OpenCode `file_search` items drop error/output From 16ae8678024c9dfe51c2bb7939c3805b22998213 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 09:50:26 -0700 Subject: [PATCH 13/23] Surface runtime-reconcile cancellations as visible turn items MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A run cancelled by the startup/shutdown reconcile was indistinguishable from a user cancel: the user's message simply never got an answer and no explanation appeared anywhere (audit plan #5, threads 721fc23c and 48663fb7 — cursor crash and server restart both ate the turn silently). The reconcile now appends a "Run interrupted" error item per terminalized run carrying the reason ("Cancelled because the server restarted/shut down before the provider work completed."), ordinal-appended after all projected items to respect the thread-wide position uniqueness. App-verified: killed the backend mid-turn; after restart the thread shows "Run interrupted — Cancelled because the server restarted before the provider work completed." Co-Authored-By: Claude Fable 5 --- .../ProviderRuntimeRecoveryService.test.ts | 3 ++ .../ProviderRuntimeRecoveryService.ts | 38 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/apps/server/src/orchestration-v2/ProviderRuntimeRecoveryService.test.ts b/apps/server/src/orchestration-v2/ProviderRuntimeRecoveryService.test.ts index 932beec8995..a048a8ec6ee 100644 --- a/apps/server/src/orchestration-v2/ProviderRuntimeRecoveryService.test.ts +++ b/apps/server/src/orchestration-v2/ProviderRuntimeRecoveryService.test.ts @@ -515,6 +515,9 @@ it.effect( ["provider-turn.updated", "cancelled"], ["message.updated", null], ["turn-item.updated", "cancelled"], + // Synthesized "Run interrupted" notice: reconcile cancellations + // must be user-visible, not silent. 
+ ["turn-item.updated", "cancelled"], ["provider-thread.updated", "idle"], ["provider-session.updated", "stopped"], ], diff --git a/apps/server/src/orchestration-v2/ProviderRuntimeRecoveryService.ts b/apps/server/src/orchestration-v2/ProviderRuntimeRecoveryService.ts index 5dadaa7b01e..e9bbc3e03a8 100644 --- a/apps/server/src/orchestration-v2/ProviderRuntimeRecoveryService.ts +++ b/apps/server/src/orchestration-v2/ProviderRuntimeRecoveryService.ts @@ -2,6 +2,7 @@ import { CommandId, type OrchestrationV2DomainEvent, type OrchestrationV2ThreadProjection, + type OrchestrationV2TurnItem, ThreadId, } from "@t3tools/contracts"; import * as Context from "effect/Context"; @@ -15,6 +16,7 @@ import * as EffectOutbox from "./EffectOutbox.ts"; import * as EventSink from "./EventSink.ts"; import * as IdAllocator from "./IdAllocator.ts"; import * as ProjectionStore from "./ProjectionStore.ts"; +import { makeProviderFailure } from "./ProviderFailure.ts"; export class ProviderRuntimeRecoveryError extends Schema.TaggedErrorClass()( "ProviderRuntimeRecoveryError", @@ -120,6 +122,10 @@ export const make = Effect.gen(function* () { ), ); const events: Array = []; + // Thread-wide ordinals are UNIQUE-constrained in the positions table; + // synthesized cancellation notices append after everything projected. + let nextSynthesizedItemOrdinal = + (projection.turnItems ?? []).reduce((max, item) => Math.max(max, item.ordinal), 0) + 1; for (const request of requests) { events.push({ id: yield* allocateEventId(), @@ -252,6 +258,38 @@ export const make = Effect.gen(function* () { payload: { ...item, status: "cancelled", completedAt: now, updatedAt: now }, }); } + // Without a visible notice, a reconcile-cancelled run is + // indistinguishable from a user cancel: the user's message simply + // never gets an answer (audit plan #5, threads 721fc23c/48663fb7). 
+ const cancellationNotice: OrchestrationV2TurnItem = { + id: ids.derive.runSignalTurnItem({ runId: run.id, signal: "runtime-reconcile" }), + threadId: projection.thread.id, + runId: run.id, + nodeId: run.rootNodeId, + providerThreadId: run.providerThreadId, + providerTurnId: null, + nativeItemRef: null, + parentItemId: null, + ordinal: nextSynthesizedItemOrdinal, + status: "cancelled", + title: "Run interrupted", + startedAt: now, + completedAt: now, + updatedAt: now, + type: "error", + failure: makeProviderFailure({ message: detail, class: "transport_error" }), + }; + nextSynthesizedItemOrdinal += 1; + events.push({ + id: yield* allocateEventId(), + type: "turn-item.updated", + threadId: projection.thread.id, + runId: run.id, + ...(run.rootNodeId === null ? {} : { nodeId: run.rootNodeId }), + providerInstanceId: run.providerInstanceId, + occurredAt: now, + payload: cancellationNotice, + }); } for (const providerThread of projection.providerThreads.filter( (candidate) => candidate.status === "active", From 85b00e7c6972a7abab44a8ab812c2ca94bc47187 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 09:50:26 -0700 Subject: [PATCH 14/23] Tick audit plan item 5 Co-Authored-By: Claude Fable 5 --- .plans/22-orchestration-v2-audit-remediation.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.plans/22-orchestration-v2-audit-remediation.md b/.plans/22-orchestration-v2-audit-remediation.md index 1376acb6ac5..a3a10e5a361 100644 --- a/.plans/22-orchestration-v2-audit-remediation.md +++ b/.plans/22-orchestration-v2-audit-remediation.md @@ -20,7 +20,7 @@ Status legend: `[ ]` todo · `[~]` in progress · `[x]` done - [x] 2. Preserve real failure causes in projected errors (Claude adapter + ProviderFailure) - [x] 3. Preserve cursor failure detail (requestId, durationMs; SDK exposes no error text yet) - [ ] 4. Log failure/lifecycle frames in native provider logs -- [ ] 5. 
Surface provider-process crashes / reconcile cancellations to the user +- [x] 5. Surface provider-process crashes / reconcile cancellations to the user - [ ] 6. Ingest codex-native collab subagents - [ ] 7. Fix grok/ACP background subagent lifecycle + transcript projection - [x] 8. Invisible post-turn wakeup turns (fix already on this branch — verify against audit scenarios) @@ -208,6 +208,13 @@ terminalizes runs/attempts/turns to `cancelled` but emits no user-visible item a "user" | ...`) so UI and debugging can distinguish. 3. Optional: auto-offer retry in UI for reconcile-cancelled runs. +- [x] Status: FIXED — the reconcile appends a "Run interrupted" error item (status cancelled) + per terminalized run with the restart/shutdown reason; unit test updated. App-verified + 2026-07-03: killed the backend mid-turn, after restart the thread renders "Run + interrupted — Cancelled because the server restarted before the provider work + completed." A `cancelReason` field on runs and UI retry affordance remain optional + follow-ups. + **Repro:** ```sh From f326bdc66e9f99642295ab0b16207c286666feeb Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 10:23:58 -0700 Subject: [PATCH 15/23] Write runner.error frames to native provider logs on SDK failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The native provider logs are the debugging ground truth, but SDK rejections (query open, message stream, prompt offer, agent open, run start/wait) raised errors without any log write — a failed turn left the log ending mid-conversation with no explanation (audit plan #4: neither of thread 721fc23c's failed turns was explainable from its log). The Claude and Cursor runners now tap every fallible SDK boundary and write a runner.error frame carrying the redacted cause chain. App-verified: with a broken Claude binary the native log now records `runner.error messages.stream | Claude Agent SDK query failed. 
← Claude Code native binary not found at ...` where it previously went silent. Co-Authored-By: Claude Fable 5 --- .../Adapters/ClaudeAdapterV2.ts | 43 ++++++++++++++++++- .../Adapters/CursorAgentSdk.ts | 34 ++++++++++++++- 2 files changed, 74 insertions(+), 3 deletions(-) diff --git a/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts b/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts index b4171c6064b..8a961cc5a91 100644 --- a/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts +++ b/apps/server/src/orchestration-v2/Adapters/ClaudeAdapterV2.ts @@ -398,6 +398,18 @@ export type ClaudeAgentSdkProtocolLogEvent = readonly direction: "incoming"; readonly stage: "decoded"; readonly payload: SDKMessage; + } + | { + // Failure frame: without it the native log ends mid-conversation with + // no trace of WHY (audit plan #4 — failed opens/streams left the + // "ground truth" log unable to explain any failed turn). + readonly direction: "incoming"; + readonly stage: "decoded"; + readonly payload: { + readonly type: "runner.error"; + readonly method: string; + readonly message: string; + }; }; export type ClaudeAgentSdkProtocolLogger = ( @@ -485,6 +497,22 @@ export const claudeAgentSdkQueryRunnerLiveLayer: Layer.Layer< }); const logProtocolEvent = (event: ClaudeAgentSdkProtocolLogEvent) => protocolLogger === undefined ? 
Effect.void : protocolLogger(event); + const logRunnerFailure = + (method: string) => + (effect: Effect.Effect): Effect.Effect => + effect.pipe( + Effect.tapError((error) => + logProtocolEvent({ + direction: "incoming", + stage: "decoded", + payload: { + type: "runner.error", + method, + message: makeProviderFailure({ cause: error }).message, + }, + }), + ), + ); const promptQueue = yield* Queue.unbounded(); const prompt = Stream.fromQueue(promptQueue).pipe( Stream.catchCause((cause) => @@ -499,7 +527,7 @@ export const claudeAgentSdkQueryRunnerLiveLayer: Layer.Layer< options: input.options, }), catch: (cause) => queryRunnerError(cause, "query"), - }); + }).pipe(logRunnerFailure("query.open")); yield* logProtocolEvent({ direction: "outgoing", stage: "decoded", @@ -520,6 +548,17 @@ export const claudeAgentSdkQueryRunnerLiveLayer: Layer.Layer< payload: message, }), ), + Stream.tapError((error) => + logProtocolEvent({ + direction: "incoming", + stage: "decoded", + payload: { + type: "runner.error", + method: "messages.stream", + message: makeProviderFailure({ cause: error }).message, + }, + }), + ), ), offer: (message) => Queue.offer(promptQueue, message).pipe( @@ -534,12 +573,14 @@ export const claudeAgentSdkQueryRunnerLiveLayer: Layer.Layer< }, }), ), + logRunnerFailure("prompt.offer"), ), setModel: (model) => Effect.tryPromise({ try: () => queryRuntime.setModel(model), catch: (cause) => queryRunnerError(cause, "setModel"), }).pipe( + logRunnerFailure("query.set_model"), Effect.tap(() => logProtocolEvent({ direction: "outgoing", diff --git a/apps/server/src/orchestration-v2/Adapters/CursorAgentSdk.ts b/apps/server/src/orchestration-v2/Adapters/CursorAgentSdk.ts index ab96f548afa..04c9c2d0aff 100644 --- a/apps/server/src/orchestration-v2/Adapters/CursorAgentSdk.ts +++ b/apps/server/src/orchestration-v2/Adapters/CursorAgentSdk.ts @@ -18,6 +18,7 @@ import * as Layer from "effect/Layer"; import * as Schema from "effect/Schema"; import { ServerConfig } from 
"../../config.ts"; +import { makeProviderFailure } from "../ProviderFailure.ts"; import { type EventNdjsonLogger, makeEventNdjsonLogger, @@ -197,6 +198,18 @@ export type CursorAgentSdkProtocolLogEvent = readonly type: "agent.close"; readonly agentId: string; }; + } + | { + // Failure frame: SDK rejections previously left the native log ending + // mid-conversation with no trace of why (audit plan #4 — thread + // 721fc23c's failed turns were unexplainable from the log). + readonly direction: "incoming"; + readonly stage: "decoded"; + readonly payload: { + readonly type: "runner.error"; + readonly method: string; + readonly message: string; + }; }; export type CursorAgentSdkProtocolLogger = ( @@ -316,6 +329,22 @@ export const cursorAgentSdkRunnerLiveLayer: Layer.Layer protocolLogger === undefined ? Effect.void : protocolLogger(event); + const logRunnerFailure = + (method: string) => + (effect: Effect.Effect): Effect.Effect => + effect.pipe( + Effect.tapError((error) => + log({ + direction: "incoming", + stage: "decoded", + payload: { + type: "runner.error", + method, + message: makeProviderFailure({ cause: error }).message, + }, + }), + ), + ); yield* log({ direction: "outgoing", @@ -334,7 +363,7 @@ export const cursorAgentSdkRunnerLiveLayer: Layer.Layer runnerError(cause, `agent.${input.operation}`), - }); + }).pipe(logRunnerFailure(`agent.${input.operation}`)); yield* log({ direction: "incoming", @@ -406,7 +435,7 @@ export const cursorAgentSdkRunnerLiveLayer: Layer.Layer runnerError(cause, "run.start"), - }); + }).pipe(logRunnerFailure("run.start")); runId = run.id; yield* log({ direction: "incoming", @@ -449,6 +478,7 @@ export const cursorAgentSdkRunnerLiveLayer: Layer.Layer Date: Fri, 3 Jul 2026 10:23:58 -0700 Subject: [PATCH 16/23] Tick audit plan item 4 Co-Authored-By: Claude Fable 5 --- .plans/22-orchestration-v2-audit-remediation.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.plans/22-orchestration-v2-audit-remediation.md 
b/.plans/22-orchestration-v2-audit-remediation.md index a3a10e5a361..2f709acaf36 100644 --- a/.plans/22-orchestration-v2-audit-remediation.md +++ b/.plans/22-orchestration-v2-audit-remediation.md @@ -19,7 +19,7 @@ Status legend: `[ ]` todo · `[~]` in progress · `[x]` done - [x] 1. Honor `is_error` on Claude SDK results (runs marked completed on 401/529) - [x] 2. Preserve real failure causes in projected errors (Claude adapter + ProviderFailure) - [x] 3. Preserve cursor failure detail (requestId, durationMs; SDK exposes no error text yet) -- [ ] 4. Log failure/lifecycle frames in native provider logs +- [x] 4. Log failure/lifecycle frames in native provider logs (claude + cursor runners) - [x] 5. Surface provider-process crashes / reconcile cancellations to the user - [ ] 6. Ingest codex-native collab subagents - [ ] 7. Fix grok/ACP background subagent lifecycle + transcript projection @@ -187,7 +187,15 @@ Process exit is not logged to the per-thread native log either. and turn-abort paths — across all adapters (cursor, claude, codex, ACP, opencode). 2. Keep payloads small (message + code + native run/turn id), no secrets. -- [ ] Status: not started +- [x] Status: FIXED for the two adapters with observed gaps — Claude and Cursor runners tap + every fallible SDK boundary (query open, messages stream, prompt offer, set_model / + agent open, run.start, run.wait) and write a `runner.error` frame with the redacted + cause chain (reuses makeProviderFailure redaction). App-verified: broken Claude binary + now leaves `runner.error messages.stream | ... native binary not found ...` in the + native log. Codex already logs upstream errors as protocol messages; ACP/opencode + logging remains payload-redacted (low backlog). Process spawn/exit lifecycle frames + deferred — the cursor SDK is in-process and claude CLI exits already surface as stream + errors. ## 5. 
Provider crashes / reconcile cancellations are silent to the user From 260b3a8f6429ed220e91653a9d90b8555ab158fd Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 10:44:06 -0700 Subject: [PATCH 17/23] Route shared codex app-server logs to the owning app thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One codex app-server process multiplexes many app threads (the opener plus every native subagent child), but the protocol logger froze the opener's threadId at open time, so all traffic landed in the opener's log file — and rotation of that busy file destroyed other threads' native ground truth (audit plan #9, threads c878541b/de5f191a/68f7595b/ af66fc2c had no log file at all). The logger now resolves the target per frame: it extracts the native thread id from each frame and looks it up in a per-session route map seeded when a root turn or subagent thread registers, falling back to the opener for unrouted frames. App-verified with a real codex subagent spawn: the two subagent native threads each got their own dedicated log file containing only their own traffic (94 and 54 frames), where previously everything multiplexed into the parent's file. Unit tests cover native-thread-id extraction across decoded/raw frame shapes and the two-thread routing decision. Co-Authored-By: Claude Fable 5 --- .../Adapters/CodexAdapterV2.test.ts | 69 ++++++++++++++++++ .../Adapters/CodexAdapterV2.ts | 72 ++++++++++++++++++- 2 files changed, 138 insertions(+), 3 deletions(-) diff --git a/apps/server/src/orchestration-v2/Adapters/CodexAdapterV2.test.ts b/apps/server/src/orchestration-v2/Adapters/CodexAdapterV2.test.ts index e7e33e85449..2d7b37dd409 100644 --- a/apps/server/src/orchestration-v2/Adapters/CodexAdapterV2.test.ts +++ b/apps/server/src/orchestration-v2/Adapters/CodexAdapterV2.test.ts @@ -25,6 +25,7 @@ import type { EventNdjsonLogger } from "../../provider/Layers/EventNdjsonLogger. 
import { buildCodexTurnStartParams, CODEX_DRIVER_KIND, + codexNativeThreadIdFromProtocolEvent, codexThreadRuntimeParams, type CodexAgentMessageDeltaUpdate, makeCodexAgentMessageDeltaCoalescer, @@ -432,6 +433,74 @@ describe("CodexAdapterV2 native protocol logging", () => { assert.equal(protocolLogger, undefined); }); + + it("extracts the native thread id from decoded and raw frames", () => { + assert.equal( + codexNativeThreadIdFromProtocolEvent({ + payload: { method: "thread/event", params: { threadId: "019f-abc" } }, + }), + "019f-abc", + ); + assert.equal( + codexNativeThreadIdFromProtocolEvent({ + payload: { result: { thread: { id: "019f-def" } } }, + }), + "019f-def", + ); + assert.equal( + codexNativeThreadIdFromProtocolEvent({ payload: '{"id":1,"params":{"threadId":"019f-raw"}}' }), + "019f-raw", + ); + assert.equal( + codexNativeThreadIdFromProtocolEvent({ payload: { method: "initialize" } }), + undefined, + ); + }); + + it.effect("routes shared-session frames to the owning app thread's log", () => + Effect.gen(function* () { + const writes: Array<{ readonly threadId: ThreadId | null }> = []; + const logger: EventNdjsonLogger = { + filePath: "/tmp/events.log", + write: (_event, threadId) => + Effect.sync(() => { + writes.push({ threadId }); + }), + close: () => Effect.void, + }; + const openerThreadId = ThreadId.make("thread-opener"); + // Two app threads multiplexed on one app-server process: the opener and + // a second thread whose native id maps elsewhere (audit plan #9). + const routes = new Map([["019f-owned", ThreadId.make("thread-owned")]]); + const protocolLogger = makeCodexAppServerProtocolLogger({ + nativeEventLogger: logger, + threadId: openerThreadId, + providerSessionId: ProviderSessionId.make("provider-session-shared"), + resolveThreadId: (nativeThreadId) => + (nativeThreadId === undefined ? undefined : routes.get(nativeThreadId)) ?? 
openerThreadId, + }); + if (protocolLogger === undefined) { + assert.fail("expected a protocol logger"); + return; + } + + yield* protocolLogger({ + direction: "incoming", + stage: "decoded", + payload: { method: "thread/event", params: { threadId: "019f-owned" } }, + }); + yield* protocolLogger({ + direction: "incoming", + stage: "decoded", + payload: { method: "thread/event", params: { threadId: "019f-unknown" } }, + }); + + assert.deepEqual( + writes.map((write) => write.threadId), + [ThreadId.make("thread-owned"), openerThreadId], + ); + }), + ); }); describe("CodexAdapterV2 rollback mapping", () => { diff --git a/apps/server/src/orchestration-v2/Adapters/CodexAdapterV2.ts b/apps/server/src/orchestration-v2/Adapters/CodexAdapterV2.ts index c77d7b484c3..86e95e82ae3 100644 --- a/apps/server/src/orchestration-v2/Adapters/CodexAdapterV2.ts +++ b/apps/server/src/orchestration-v2/Adapters/CodexAdapterV2.ts @@ -939,6 +939,13 @@ export interface CodexAppServerClientFactoryShape { readonly runtimePolicy: ProviderAdapterV2RuntimePolicy; readonly settings: CodexSettings; readonly environment: NodeJS.ProcessEnv; + /** + * Routes protocol log frames to the app thread that owns the native + * codex thread. One shared app-server process multiplexes many app + * threads; without per-frame routing every thread's traffic lands in + * the opener's log file (audit plan #9). + */ + readonly resolveLogThreadId?: (nativeThreadId: string | undefined) => ThreadId; }) => Effect.Effect< CodexClient.CodexAppServerClient["Service"], ProviderAdapterOpenSessionError, @@ -1045,18 +1052,57 @@ export const makeCodexAppServerClientFactoryCommandLayer = ( }), ); +const CODEX_RAW_THREAD_ID_PATTERN = /"threadId"\s*:\s*"([^"]+)"/; + +/** Best-effort native thread id extraction from a protocol log frame. 
*/ +export function codexNativeThreadIdFromProtocolEvent(event: unknown): string | undefined { + if (typeof event !== "object" || event === null) { + return undefined; + } + const payload = (event as { readonly payload?: unknown }).payload; + if (typeof payload === "string") { + return CODEX_RAW_THREAD_ID_PATTERN.exec(payload)?.[1]; + } + if (typeof payload !== "object" || payload === null) { + return undefined; + } + const record = payload as Record; + for (const container of [record["params"], record["result"], record]) { + if (typeof container !== "object" || container === null) { + continue; + } + const threadId = (container as Record)["threadId"]; + if (typeof threadId === "string") { + return threadId; + } + const thread = (container as Record)["thread"]; + if (typeof thread === "object" && thread !== null) { + const id = (thread as Record)["id"]; + if (typeof id === "string") { + return id; + } + } + } + return undefined; +} + export function makeCodexAppServerProtocolLogger(input: { readonly nativeEventLogger: EventNdjsonLogger | undefined; readonly threadId: ThreadId; readonly providerSessionId: OrchestrationV2ProviderSession["id"]; + readonly resolveThreadId?: (nativeThreadId: string | undefined) => ThreadId; }): CodexClient.CodexAppServerClientOptions["logger"] | undefined { const { nativeEventLogger } = input; if (nativeEventLogger === undefined) { return undefined; } - return (event) => - nativeEventLogger + return (event) => { + const threadId = + input.resolveThreadId === undefined + ? 
input.threadId + : input.resolveThreadId(codexNativeThreadIdFromProtocolEvent(event)); + return nativeEventLogger .write( { provider: CODEX_PROVIDER, @@ -1065,9 +1111,10 @@ export function makeCodexAppServerProtocolLogger(input: { providerSessionId: input.providerSessionId, event: redactCodexProtocolValue(event), }, - input.threadId, + threadId, ) .pipe(Effect.ignore); + }; } export function redactCodexProtocolValue(value: unknown): unknown { @@ -1137,6 +1184,9 @@ export const codexAppServerClientFactoryFromSettingsLayer: Layer.Layer< nativeEventLogger, threadId: input.threadId, providerSessionId: input.providerSessionId, + ...(input.resolveLogThreadId === undefined + ? {} + : { resolveThreadId: input.resolveLogThreadId }), }); const clientOptions: CodexClient.CodexAppServerClientOptions = protocolLogger === undefined @@ -1250,6 +1300,14 @@ export function makeCodexAdapterV2(adapterOptions: CodexAdapterV2Options): Provi planSelectionTransition: () => Effect.succeed(turnScopedSelectionTransition()), openSession: (input) => Effect.gen(function* () { + // native codex thread id → owning app thread. Mutable on purpose: + // the resolver closes over it before any turn registers a route. + // Unknown ids fall back to the opener (its own handshake frames carry + // no route yet); a subagent's earliest frames may land here before its + // route registers, but the bulk of its traffic gets a dedicated file — + // enough that the opener log's rotation no longer destroys other + // threads' ground truth (audit plan #9). + const logThreadRoutes = new Map(); const client = yield* clientFactory.open({ instanceId: adapterOptions.instanceId, threadId: input.threadId, @@ -1257,6 +1315,9 @@ export function makeCodexAdapterV2(adapterOptions: CodexAdapterV2Options): Provi runtimePolicy: input.runtimePolicy, settings: adapterOptions.settings, environment: adapterOptions.environment, + resolveLogThreadId: (nativeThreadId) => + (nativeThreadId === undefined ? 
undefined : logThreadRoutes.get(nativeThreadId)) ?? + input.threadId, }); const initialized = yield* Ref.make(false); const ensureInitialized = Effect.gen(function* () { @@ -1306,6 +1367,10 @@ export function makeCodexAdapterV2(adapterOptions: CodexAdapterV2Options): Provi readonly startedAt: DateTime.Utc; }) => Effect.gen(function* () { + const nativeThreadId = input.turnInput.providerThread.nativeThreadRef?.nativeId; + if (nativeThreadId !== undefined && nativeThreadId !== null) { + logThreadRoutes.set(nativeThreadId, input.turnInput.threadId); + } const existing = (yield* Ref.get(activeTurns)).get(input.nativeTurnId); if (existing !== undefined) { return existing; @@ -1619,6 +1684,7 @@ export function makeCodexAdapterV2(adapterOptions: CodexAdapterV2Options): Provi driver: CODEX_PROVIDER, nativeThreadId: input.nativeThreadId, }); + logThreadRoutes.set(input.nativeThreadId, childThreadId); const turnItemOrdinal = yield* resolveItemOrdinal(input.context, input.nativeItemId); const providerThread = { id: idAllocator.derive.providerThread({ From 4f82406c61e6de781569c0db118dcc7e95e01d7a Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 10:44:07 -0700 Subject: [PATCH 18/23] Tick audit plan item 9 Co-Authored-By: Claude Fable 5 --- .plans/22-orchestration-v2-audit-remediation.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.plans/22-orchestration-v2-audit-remediation.md b/.plans/22-orchestration-v2-audit-remediation.md index 2f709acaf36..fce1adcc6f4 100644 --- a/.plans/22-orchestration-v2-audit-remediation.md +++ b/.plans/22-orchestration-v2-audit-remediation.md @@ -24,7 +24,7 @@ Status legend: `[ ]` todo · `[~]` in progress · `[x]` done - [ ] 6. Ingest codex-native collab subagents - [ ] 7. Fix grok/ACP background subagent lifecycle + transcript projection - [x] 8. Invisible post-turn wakeup turns (fix already on this branch — verify against audit scenarios) -- [ ] 9. 
Route shared-codex-session native logs to the correct thread's file +- [x] 9. Route shared-codex-session native logs to the correct thread's file - [ ] 10. Coalesce streaming-delta event persistence (~2800x amplification) - [x] 11. Assistant text segments merged without separator (fixed; regression fixture claude_text_segments added) - [x] 12. OpenCode `file_search` items drop error/output @@ -357,7 +357,14 @@ grep -l '019f1b62-f532' ~/.t3/userdata-v2/logs/provider/*.log* # only 71e29ba5- ls ~/.t3/userdata-v2/logs/provider/ | grep -c 'c878541b\|de5f191a\|68f7595b\|af66fc2c' # 0 ``` -- [ ] Status: not started +- [x] Status: FIXED — the codex protocol logger resolves the target per frame via + `codexNativeThreadIdFromProtocolEvent` + a per-session native→app-thread route map + seeded at root-turn / subagent-thread registration; unrouted frames fall back to the + opener. App-verified: a real codex subagent spawn produced dedicated per-native-thread + log files (only that thread's ids inside). A subagent's earliest pre-registration frames + may still land in the opener file, but the bulk gets a dedicated file — enough that the + opener log's rotation no longer erases other threads' ground truth. Retention bump + (.plans/06) is the complementary follow-up. ## 10. Streaming deltas persisted as full-row event pairs (~2800x amplification) From eb7452d41c56d1c498eaf08d7e9e427ed8473446 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 10:57:50 -0700 Subject: [PATCH 19/23] Coalesce streamed ACP subagent assistant deltas before persistence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ACP streams a subagent's result per token, and emitSubagentAssistant persisted a full-row message.updated + turn-item.updated event pair per chunk — one 6KB grok subagent result amplified into ~2700 stored events, 14% of a whole session's event table (audit plan #10). 
Streamed chunks now accumulate and emit at most once per 100ms window; the final text is always flushed immediately on task completion, so the projection stays exact. The grok_subagent_lineage fixture now asserts each child result persists a single coalesced message.updated event (was one per chunk). App-checked: a real streaming ACP result (4190 chars) persisted 4 message.updated events instead of thousands. Co-Authored-By: Claude Fable 5 --- .../orchestration-v2/Adapters/AcpAdapterV2.ts | 74 ++++++++++++++++--- .../fixtures/grok_subagent_lineage/output.ts | 15 ++++ 2 files changed, 78 insertions(+), 11 deletions(-) diff --git a/apps/server/src/orchestration-v2/Adapters/AcpAdapterV2.ts b/apps/server/src/orchestration-v2/Adapters/AcpAdapterV2.ts index c3e5dd1e849..318f4cdc795 100644 --- a/apps/server/src/orchestration-v2/Adapters/AcpAdapterV2.ts +++ b/apps/server/src/orchestration-v2/Adapters/AcpAdapterV2.ts @@ -26,6 +26,7 @@ import { modelSelectionsEqual } from "@t3tools/shared/model"; import * as Cause from "effect/Cause"; import * as DateTime from "effect/DateTime"; import * as Deferred from "effect/Deferred"; +import * as Duration from "effect/Duration"; import * as Effect from "effect/Effect"; import * as FileSystem from "effect/FileSystem"; import * as Option from "effect/Option"; @@ -84,6 +85,13 @@ import { export const ACP_PROTOCOL = "acp.ndjson-jsonrpc" as const; +/** + * Window for coalescing streamed subagent assistant deltas into one persisted + * snapshot. Matches the codex agent-message coalescer cadence; the final text + * is always flushed on task completion regardless of this interval. 
+ */ +const SUBAGENT_STREAM_FLUSH_INTERVAL_MS = 100; + export interface AcpAdapterV2RuntimeInput { readonly cwd: string; readonly mcpServers: ReadonlyArray; @@ -576,6 +584,12 @@ interface ActiveAcpSubagent { childSessionId: string | null; assistantText: string; nextChildOrdinal: number; + // Streaming-emit throttle: ACP streams the subagent result per token, so a + // full-row event pair per chunk amplified one 6KB result into ~2700 stored + // events (audit plan #10). We coalesce intermediate emits and always flush + // the final text. + streamFlushScheduled: boolean; + streamPendingText: boolean; } type PendingRuntimeRequest = { @@ -842,12 +856,10 @@ export function makeAcpAdapterV2(options: AcpAdapterV2Options): ProviderAdapterV yield* emitTextSegment(context, kind, false); }); - const emitSubagentAssistant = Effect.fnUntraced(function* ( + const emitSubagentAssistantSnapshot = Effect.fnUntraced(function* ( subagent: ActiveAcpSubagent, - text: string, ) { - if (text.length === 0) return; - subagent.assistantText += text; + if (subagent.assistantText.length === 0) return; const now = yield* DateTime.now; const nativeItemId = `${subagent.task.nativeTaskRef?.nativeId ?? subagent.task.id}:result`; const artifacts = makeSubagentConversationArtifacts({ @@ -871,13 +883,48 @@ export function makeAcpAdapterV2(options: AcpAdapterV2Options): ProviderAdapterV }); }); + // Streaming append: accumulate and emit at most once per flush window. 
+ const streamSubagentAssistant = Effect.fnUntraced(function* ( + subagent: ActiveAcpSubagent, + text: string, + ) { + if (text.length === 0) return; + subagent.assistantText += text; + subagent.streamPendingText = true; + if (subagent.streamFlushScheduled) return; + subagent.streamFlushScheduled = true; + yield* Effect.sleep(Duration.millis(SUBAGENT_STREAM_FLUSH_INTERVAL_MS)).pipe( + Effect.andThen( + Effect.suspend(() => { + subagent.streamFlushScheduled = false; + if (!subagent.streamPendingText) return Effect.void; + subagent.streamPendingText = false; + return emitSubagentAssistantSnapshot(subagent); + }), + ), + Effect.forkIn(sessionScope), + ); + }); + + // Terminal/one-shot emit: always persists the final text immediately. + const flushSubagentAssistant = Effect.fnUntraced(function* ( + subagent: ActiveAcpSubagent, + finalText?: string, + ) { + if (finalText !== undefined && finalText.length > 0) { + subagent.assistantText += finalText; + } + subagent.streamPendingText = false; + yield* emitSubagentAssistantSnapshot(subagent); + }); + const projectSubagentNotification = Effect.fnUntraced(function* ( subagent: ActiveAcpSubagent, notification: EffectAcpSchema.SessionNotification, ) { const update = notification.update; if (update.sessionUpdate === "agent_message_chunk" && update.content.type === "text") { - yield* emitSubagentAssistant(subagent, update.content.text); + yield* streamSubagentAssistant(subagent, update.content.text); } }); @@ -952,6 +999,8 @@ export function makeAcpAdapterV2(options: AcpAdapterV2Options): ProviderAdapterV childSessionId: null, assistantText: "", nextChildOrdinal: 101, + streamFlushScheduled: false, + streamPendingText: false, }; subagent.task = task; context.subagents.set(update.nativeTaskId, subagent); @@ -1044,12 +1093,15 @@ export function makeAcpAdapterV2(options: AcpAdapterV2Options): ProviderAdapterV ); } - if ( - taskStatus !== "running" && - subagent.assistantText.length === 0 && - update.result !== null - ) { - yield* 
emitSubagentAssistant(subagent, update.result); + if (taskStatus !== "running") { + // Terminal: flush the final text immediately (adopting the + // one-shot result when nothing streamed) instead of leaving the + // last throttled snapshot possibly unemitted. + if (subagent.assistantText.length === 0 && update.result !== null) { + yield* flushSubagentAssistant(subagent, update.result); + } else if (subagent.streamPendingText) { + yield* flushSubagentAssistant(subagent); + } } const result = subagent.assistantText || update.result; subagent.task = { diff --git a/apps/server/src/orchestration-v2/testkit/fixtures/grok_subagent_lineage/output.ts b/apps/server/src/orchestration-v2/testkit/fixtures/grok_subagent_lineage/output.ts index 5b290cb201b..d01c98e050d 100644 --- a/apps/server/src/orchestration-v2/testkit/fixtures/grok_subagent_lineage/output.ts +++ b/apps/server/src/orchestration-v2/testkit/fixtures/grok_subagent_lineage/output.ts @@ -82,6 +82,21 @@ export function assertGrokSubagentLineageOutput( const assistant = child.messages.find((message) => message.role === "assistant"); assert.isDefined(assistant); assert.isBelow(assistant.text.indexOf(expected.first), assistant.text.indexOf(expected.second)); + + // Coalescing (audit plan #10): the subagent streamed a progress chunk and + // a result chunk into this one result message. Without coalescing each + // chunk persisted its own full-row message.updated event; the throttle + // collapses intermediate emits so the final text lands in a single flush. 
+ const resultMessageEvents = result.storedEvents.filter((stored) => { + if (stored.event.type !== "message.updated") return false; + const payload = stored.event.payload as { readonly id?: unknown; readonly threadId?: unknown }; + return payload.threadId === subagent.childThreadId && payload.id === assistant.id; + }); + assert.lengthOf( + resultMessageEvents, + 1, + `expected coalesced subagent result for ${expected.title}, got ${resultMessageEvents.length} message.updated events`, + ); for (const other of EXPECTED_CHILDREN) { if (other.sessionId !== expected.sessionId) { assert.notInclude( From dd30e28578a39a4095e6f8576d60219b5c139d67 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 10:57:50 -0700 Subject: [PATCH 20/23] Tick audit plan item 10 Co-Authored-By: Claude Fable 5 --- .plans/22-orchestration-v2-audit-remediation.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.plans/22-orchestration-v2-audit-remediation.md b/.plans/22-orchestration-v2-audit-remediation.md index fce1adcc6f4..47e996418f0 100644 --- a/.plans/22-orchestration-v2-audit-remediation.md +++ b/.plans/22-orchestration-v2-audit-remediation.md @@ -25,7 +25,7 @@ Status legend: `[ ]` todo · `[~]` in progress · `[x]` done - [ ] 7. Fix grok/ACP background subagent lifecycle + transcript projection - [x] 8. Invisible post-turn wakeup turns (fix already on this branch — verify against audit scenarios) - [x] 9. Route shared-codex-session native logs to the correct thread's file -- [ ] 10. Coalesce streaming-delta event persistence (~2800x amplification) +- [x] 10. Coalesce streaming-delta event persistence (~2800x amplification) - [x] 11. Assistant text segments merged without separator (fixed; regression fixture claude_text_segments added) - [x] 12. OpenCode `file_search` items drop error/output - [ ] 13. 
Low-severity backlog (see section) @@ -393,7 +393,14 @@ sqlite3 -readonly ~/.t3/userdata-v2/state.sqlite \ WHERE stream_id LIKE 'thread:provider:grok:native-thread:019f1558%' GROUP BY 1;" ``` -- [ ] Status: not started +- [x] Status: FIXED for the ACP subagent path (the audited amplifier) — emitSubagentAssistant + throttles streamed deltas to one snapshot per 100ms window, with a guaranteed terminal + flush. grok_subagent_lineage fixture asserts one coalesced message.updated per child + result (was one per chunk). App-checked: a 4190-char streamed result persisted 4 + message.updated events. Note: this coalesces PERSISTED events; live UI streaming + granularity drops to the flush cadence (acceptable — the projection is unchanged). The + main-assistant ACP path already emits few large chunks; codex/claude use per-block (not + per-token) emits and were never the amplifier. ## 11. Claude assistant text segments merged without separator (fixed — needs regression fixture) From b1a0017d0f27f698e20a05acd7c782e41fb6f5ee Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 16:00:59 -0700 Subject: [PATCH 21/23] Address review findings on wakeup lifecycle and ACP coalescer - Decorate attachTurn with the same ensureThreadAttached/markBusy/markIdle pairing as startTurn: a wakeup-attached run left busyCount at 0, so the idle reaper could release the live session mid-turn (macroscope, High). - Guard the turn.wakeup pump branch with the same runtime-liveness check as persistProviderSessionUpdate so stale buffered wakeups drained after releaseEntry cannot dispatch attaches against a released or replaced session (macroscope, Medium). - Dispatch wakeups concurrently, one in flight per thread: quiescence waiting on one busy thread no longer head-of-line-blocks every other thread's wakeup; duplicate wakeups for a thread with one in flight are coalesced since the adapter buffers all activity behind a single attach (macroscope, Medium). 
- Flush throttled subagent assistant text in ACP finalizeTurn: after the turn ends the run fiber that routes child-thread events may be gone, so relying on the coalescer's timer could drop the stream tail (cursor bugbot, Medium). Co-Authored-By: Claude Fable 5 --- .../orchestration-v2/Adapters/AcpAdapterV2.ts | 8 +++ .../ProviderSessionManager.ts | 43 +++++++++++++-- .../orchestration-v2/ProviderWakeupService.ts | 54 ++++++++++++++++--- 3 files changed, 93 insertions(+), 12 deletions(-) diff --git a/apps/server/src/orchestration-v2/Adapters/AcpAdapterV2.ts b/apps/server/src/orchestration-v2/Adapters/AcpAdapterV2.ts index 318f4cdc795..f67fa61a56b 100644 --- a/apps/server/src/orchestration-v2/Adapters/AcpAdapterV2.ts +++ b/apps/server/src/orchestration-v2/Adapters/AcpAdapterV2.ts @@ -2054,6 +2054,14 @@ export function makeAcpAdapterV2(options: AcpAdapterV2Options): ProviderAdapterV if (context.finalized) return; context.finalized = true; yield* closeTextStreams(context); + // Persist any throttled subagent text now: after the turn ends the + // run fiber that routes child-thread events may be gone, so waiting + // for the coalescer's timer flush could drop the tail of a stream. 
+ for (const subagent of context.subagents.values()) { + if (subagent.streamPendingText) { + yield* flushSubagentAssistant(subagent); + } + } const now = yield* DateTime.now; const turn = providerTurnPayload(context, status, now); yield* Ref.update(providerTurns, (current) => { diff --git a/apps/server/src/orchestration-v2/ProviderSessionManager.ts b/apps/server/src/orchestration-v2/ProviderSessionManager.ts index 15b3f776600..45de202c582 100644 --- a/apps/server/src/orchestration-v2/ProviderSessionManager.ts +++ b/apps/server/src/orchestration-v2/ProviderSessionManager.ts @@ -37,6 +37,7 @@ import { type ProviderAdapterV2Event, type ProviderAdapterV2EventSubscription, type ProviderAdapterV2SessionRuntime, + type ProviderAdapterV2TurnInput, } from "./ProviderAdapter.ts"; import { ProviderAdapterRegistryV2 } from "./ProviderAdapterRegistry.ts"; import { ProjectionStoreV2 } from "./ProjectionStore.ts"; @@ -979,6 +980,30 @@ export const layerWithOptions = ( ), ), ), + // A wakeup-attached run occupies the session exactly like a started + // one: without the markBusy pairing the idle reaper sees busyCount 0 + // and can release the live session mid-turn. + ...(runtime.attachTurn === undefined + ? 
{} + : { + attachTurn: (input: ProviderAdapterV2TurnInput) => + observeActivity( + providerSessionId, + ensureThreadAttached({ + providerSessionId, + threadId: input.threadId, + providerInstanceId: runtime.instanceId, + }), + ).pipe( + Effect.andThen(observeActivity(providerSessionId, markBusy(providerSessionId))), + Effect.andThen(runtime.attachTurn!(input)), + Effect.catch((error) => + observeActivity(providerSessionId, markIdle(providerSessionId)).pipe( + Effect.andThen(Effect.fail(error)), + ), + ), + ), + }), steerTurn: (input) => observeActivity(providerSessionId, touchActivity(providerSessionId)).pipe( Effect.andThen(runtime.steerTurn(input)), @@ -1024,15 +1049,25 @@ export const layerWithOptions = ( entry: LiveSessionEntry, event: Extract, ) => - wakeupObserver - .onWakeup({ + Effect.gen(function* () { + // Same staleness guard as persistProviderSessionUpdate: the pump + // can still be draining buffered events after releaseEntry removed + // (or replaced) this session — a stale wakeup must not dispatch an + // attach against the released/replaced runtime. 
+ const current = (yield* Ref.get(sessions)).get( + sessionKey(entry.runtime.providerSessionId), + ); + if (current?.runtime !== entry.runtime) { + return; + } + yield* wakeupObserver.onWakeup({ threadId: event.threadId, providerThreadId: event.providerThreadId, providerInstanceId: entry.runtime.instanceId, providerSessionId: entry.runtime.providerSessionId, origin: event.origin, - }) - .pipe( + }); + }).pipe( Effect.catchCause((cause) => Effect.logWarning("orchestration-v2.driver-session.wakeup-observer-failed", { providerSessionId: entry.runtime.providerSessionId, diff --git a/apps/server/src/orchestration-v2/ProviderWakeupService.ts b/apps/server/src/orchestration-v2/ProviderWakeupService.ts index 1c0624afd16..73da88efb7e 100644 --- a/apps/server/src/orchestration-v2/ProviderWakeupService.ts +++ b/apps/server/src/orchestration-v2/ProviderWakeupService.ts @@ -11,6 +11,7 @@ import * as Context from "effect/Context"; import * as Effect from "effect/Effect"; import * as Layer from "effect/Layer"; import * as Queue from "effect/Queue"; +import * as Ref from "effect/Ref"; import { OrchestratorV2, type OrchestratorV2Shape } from "./Orchestrator.ts"; import { ProviderWakeupObserver } from "./ProviderSessionManager.ts"; @@ -163,14 +164,51 @@ export const runWakeupDispatcher: Effect.Effect< never, never, ProviderWakeupRelay | OrchestratorV2 -> = Effect.gen(function* () { - const relay = yield* ProviderWakeupRelay; - const orchestrator = yield* OrchestratorV2; - return yield* relay.take.pipe( - Effect.flatMap((input) => dispatchWakeup(orchestrator, input)), - Effect.forever, - ); -}); +> = Effect.scoped( + Effect.gen(function* () { + const relay = yield* ProviderWakeupRelay; + const orchestrator = yield* OrchestratorV2; + // Wakeups dispatch concurrently, one in flight per thread: quiescence + // waiting for one busy thread must not head-of-line-block every other + // thread's wakeup (which could be superseded while it waits). 
A wakeup + // arriving while its thread already has one in flight is coalesced away — + // the adapter buffers all pending activity behind a single attach. + const inFlightThreads = yield* Ref.make(new Set()); + return yield* relay.take.pipe( + Effect.flatMap((input) => + Ref.modify(inFlightThreads, (current) => { + if (current.has(input.threadId)) { + return [false, current] as const; + } + const next = new Set(current); + next.add(input.threadId); + return [true, next] as const; + }).pipe( + Effect.flatMap((claimed) => + claimed + ? dispatchWakeup(orchestrator, input).pipe( + Effect.ensuring( + Ref.update(inFlightThreads, (current) => { + const next = new Set(current); + next.delete(input.threadId); + return next; + }), + ), + Effect.forkScoped, + Effect.asVoid, + ) + : Effect.logInfo("orchestration-v2.provider-wakeup.coalesced", { + threadId: input.threadId, + providerThreadId: input.providerThreadId, + origin: input.origin, + }), + ), + ), + ), + Effect.forever, + ); + }), +); export const wakeupDispatcherDaemonLayer: Layer.Layer< never, From 2a8052cbf36c5316d7916138af67cf4e62ec11e6 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 16:10:51 -0700 Subject: [PATCH 22/23] Park coalesced wakeups as follow-ups instead of dropping them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A wakeup arriving while its thread already had a dispatch in flight was logged and discarded — if the in-flight attempt then gave up (quiescence timeout) or failed, the parked request never got another attempt (cursor bugbot, Medium). Each thread now keeps a keep-latest follow-up slot that dispatches after the in-flight attempt finishes regardless of its outcome. Keep-latest is sufficient because the adapter buffers all pending activity behind a single attach. 
Co-Authored-By: Claude Fable 5 --- .../orchestration-v2/ProviderWakeupService.ts | 55 ++++++++++++------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/apps/server/src/orchestration-v2/ProviderWakeupService.ts b/apps/server/src/orchestration-v2/ProviderWakeupService.ts index 73da88efb7e..0a0808ed635 100644 --- a/apps/server/src/orchestration-v2/ProviderWakeupService.ts +++ b/apps/server/src/orchestration-v2/ProviderWakeupService.ts @@ -160,6 +160,11 @@ const dispatchWakeup = (orchestrator: OrchestratorV2Shape, input: ProviderWakeup Effect.asVoid, ); +type WakeupThreadState = { + readonly inFlight: true; + readonly followUp: ProviderWakeupRequest | null; +}; + export const runWakeupDispatcher: Effect.Effect< never, never, @@ -171,33 +176,45 @@ export const runWakeupDispatcher: Effect.Effect< // Wakeups dispatch concurrently, one in flight per thread: quiescence // waiting for one busy thread must not head-of-line-block every other // thread's wakeup (which could be superseded while it waits). A wakeup - // arriving while its thread already has one in flight is coalesced away — - // the adapter buffers all pending activity behind a single attach. - const inFlightThreads = yield* Ref.make(new Set()); + // arriving while its thread already has one in flight parks in that + // thread's follow-up slot (keep-latest — the adapter buffers all pending + // activity behind a single attach, so intermediate requests are + // redundant) and dispatches after the in-flight attempt finishes, even + // when that attempt gave up or failed. 
+ const threadStates = yield* Ref.make(new Map()); + + const drainThread = (input: ProviderWakeupRequest): Effect.Effect => + dispatchWakeup(orchestrator, input).pipe( + Effect.andThen( + Ref.modify(threadStates, (current) => { + const state = current.get(input.threadId); + const next = new Map(current); + if (state?.followUp != null) { + next.set(input.threadId, { inFlight: true, followUp: null }); + return [state.followUp, next] as const; + } + next.delete(input.threadId); + return [null, next] as const; + }), + ), + Effect.flatMap((followUp) => (followUp === null ? Effect.void : drainThread(followUp))), + ); + return yield* relay.take.pipe( Effect.flatMap((input) => - Ref.modify(inFlightThreads, (current) => { + Ref.modify(threadStates, (current) => { + const next = new Map(current); if (current.has(input.threadId)) { - return [false, current] as const; + next.set(input.threadId, { inFlight: true, followUp: input }); + return [false, next] as const; } - const next = new Set(current); - next.add(input.threadId); + next.set(input.threadId, { inFlight: true, followUp: null }); return [true, next] as const; }).pipe( Effect.flatMap((claimed) => claimed - ? dispatchWakeup(orchestrator, input).pipe( - Effect.ensuring( - Ref.update(inFlightThreads, (current) => { - const next = new Set(current); - next.delete(input.threadId); - return next; - }), - ), - Effect.forkScoped, - Effect.asVoid, - ) - : Effect.logInfo("orchestration-v2.provider-wakeup.coalesced", { + ? 
drainThread(input).pipe(Effect.forkScoped, Effect.asVoid) + : Effect.logInfo("orchestration-v2.provider-wakeup.follow-up-parked", { threadId: input.threadId, providerThreadId: input.providerThreadId, origin: input.origin, From 0e0df04d40250fa72994bbd8bd3960dd1d226fa8 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Fri, 3 Jul 2026 16:19:04 -0700 Subject: [PATCH 23/23] Clear wakeup in-flight marker on drain interruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An interrupted drainThread fiber left its thread marked in flight, which would park every later wakeup for that thread forever (cursor bugbot, High). Interruption only occurs when the dispatcher scope tears down — taking the state map with it — but the invariant now holds locally via an interrupt-scoped cleanup instead of depending on that lifecycle. Interrupt-only rather than ensuring: on the success path the marker may already belong to a new claim. Co-Authored-By: Claude Fable 5 --- .../src/orchestration-v2/ProviderWakeupService.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/apps/server/src/orchestration-v2/ProviderWakeupService.ts b/apps/server/src/orchestration-v2/ProviderWakeupService.ts index 0a0808ed635..319ee95bcb3 100644 --- a/apps/server/src/orchestration-v2/ProviderWakeupService.ts +++ b/apps/server/src/orchestration-v2/ProviderWakeupService.ts @@ -9,6 +9,7 @@ import { } from "@t3tools/contracts"; import * as Context from "effect/Context"; import * as Effect from "effect/Effect"; +import * as Exit from "effect/Exit"; import * as Layer from "effect/Layer"; import * as Queue from "effect/Queue"; import * as Ref from "effect/Ref"; @@ -198,6 +199,20 @@ export const runWakeupDispatcher: Effect.Effect< }), ), Effect.flatMap((followUp) => (followUp === null ? 
Effect.void : drainThread(followUp))), + // Interruption only happens when the dispatcher scope tears down, but + // clear the in-flight marker anyway so the invariant does not depend + // on that lifecycle: a stale marker would park every later wakeup for + // this thread forever. Interrupt-only (not ensuring): on the success + // path the marker may already belong to a NEW claim. + Effect.onExit((exit) => + Exit.hasInterrupts(exit) + ? Ref.update(threadStates, (current) => { + const next = new Map(current); + next.delete(input.threadId); + return next; + }) + : Effect.void, + ), ); return yield* relay.take.pipe(