diff --git a/src/chat.ts b/src/chat.ts index 8ecc8d8..d6fda33 100644 --- a/src/chat.ts +++ b/src/chat.ts @@ -178,9 +178,16 @@ async function chatLoop(_profile: { host?: string }): Promise { JSON.stringify(toolsForApi.map((t) => t.function.name)) ); - let maxIterations = 10; + // Agent-loop budget. 40 covers real-world polling-heavy patterns like + // waiting on a job run with periodic status checks plus several follow-up + // tool calls. When the loop exhausts the budget we surface a clear error + // below instead of silently returning to idle. + const ITERATION_BUDGET = 40; + let maxIterations = ITERATION_BUDGET; + let iterationsUsed = 0; while (maxIterations-- > 0) { + iterationsUsed += 1; const chatToken = await getAuthToken(); const sel = modelEl.value; let chatGateway = getGatewayUrl(); @@ -244,13 +251,19 @@ async function chatLoop(_profile: { host?: string }): Promise { window.api.onChatChunk((chunk: any) => { if (firstChunk) { firstChunk = false; - removeThinking(); clearWelcome(); streamingEl = document.createElement("div"); streamingEl.className = "msg assistant"; streamingEl.style.whiteSpace = "pre-wrap"; if (messagesEl) { messagesEl.appendChild(streamingEl); + // Keep the building-bricks indicator visible *below* the streaming + // bubble so users see "still working" even when chunks pause + // mid-stream (Opus 4.7 often pauses between paragraphs). The + // thinking div was appended earlier; move it after streamingEl + // so DOM order is text-then-bricks. + const thinking = document.getElementById("thinkingIndicator"); + if (thinking) messagesEl.appendChild(thinking); messagesEl.scrollTop = messagesEl.scrollHeight; } } @@ -548,4 +561,14 @@ async function chatLoop(_profile: { host?: string }): Promise { break; } + + // If we exit the loop without hitting `return` in the text branch, we + // either ran out of iteration budget mid-tool-loop or hit an unexpected + // result type. Tell the user clearly — silent stops at this point look + // like a hang. + const hitBudget = iterationsUsed >= ITERATION_BUDGET; + const message = hitBudget + ? `Agent loop hit the ${ITERATION_BUDGET}-step budget — the model was making tool calls but never produced a final answer. Send another message (e.g. "continue" or a more specific question) to keep going. If this happens repeatedly, break the task into smaller steps or narrow the toolset.` + : "Conversation ended unexpectedly. Send another message to continue."; + addMessageEl("error", message); } diff --git a/src/main.ts b/src/main.ts index 94eae85..1d41d97 100644 --- a/src/main.ts +++ b/src/main.ts @@ -131,6 +131,42 @@ function chatFetch( // (Gemini, some Anthropic responses, etc. return `content: [{type:"text", text:"..."}]`). // Without this, the renderer feeds an array to marked() and gets a confusing // "input parameter is of type [object Array], string expected" error. +// Anthropic prompt-caching helper. Sets cache_control: {type: "ephemeral"} +// breakpoints on the heaviest static portions of the prompt (tools + last +// system message) so repeated turns within a 5-minute window read at ~10% of +// the input cost. No-op for non-Claude models. Databricks AI Gateway passes +// cache_control through to Anthropic; the response's usage.cache_read_input_tokens +// surfaces whether a cache hit occurred (logged below). +function applyAnthropicCaching(body: any, model: string): void { + if (typeof model !== "string") return; + if (!model.toLowerCase().includes("claude")) return; + + // Mark the last tool. Anthropic caches everything up to and including the + // marked element, so a single breakpoint at the end covers all tools. + if (Array.isArray(body.tools) && body.tools.length > 0) { + const lastIdx = body.tools.length - 1; + body.tools[lastIdx] = { + ...body.tools[lastIdx], + cache_control: { type: "ephemeral" }, + }; + } + + // Mark the last system message. Covers skills manifest + user system prompt + // + tool-aware nudge — everything stable before the dynamic chat history. + if (Array.isArray(body.messages)) { + let lastSystemIdx = -1; + for (let i = 0; i < body.messages.length; i++) { + if (body.messages[i]?.role === "system") lastSystemIdx = i; + } + if (lastSystemIdx >= 0) { + body.messages[lastSystemIdx] = { + ...body.messages[lastSystemIdx], + cache_control: { type: "ephemeral" }, + }; + } + } +} + // Sanitize tool_calls before returning to the renderer. Two failure modes: // • Empty arguments: providers stream tools that take no params as // function.arguments="" — the Databricks AI Gateway rejects that on the @@ -2018,7 +2054,23 @@ ipcMain.handle( } } - if (shouldStream) body.stream = true; + if (shouldStream) { + body.stream = true; + // Ask the upstream to include a final usage chunk in the SSE stream so + // we can log cache hits and total token counts. Without this, the + // streaming path never sees usage and we can't verify caching is + // actually engaging. + body.stream_options = { include_usage: true }; + } + + // Anthropic prompt caching. Tool schemas dominate every turn (~16K tokens + // for ~80 tools at ~200 tokens each). With cache_control on the last tool, + // Anthropic caches the entire tools prefix for 5 minutes — subsequent + // turns within the cache window read at 10% of input cost. Also mark the + // last system message so the static instruction prefix gets cached too. + // OpenAI prefixes >1024 tokens cache automatically (no opt-in needed). + // Gemini/Meta/Qwen have no standard caching API — leave untouched. + applyAnthropicCaching(body, String(model)); const res = await chatFetch( url, @@ -2059,6 +2111,7 @@ ipcMain.handle( const reader = (res.body as any).getReader(); const decoder = new TextDecoder(); let buffer = ""; + let lastUsage: any = null; while (true) { const { done, value } = await reader.read(); @@ -2098,9 +2151,23 @@ ipcMain.handle( } } } + // Databricks Gateway sends usage in every SSE chunk with running + // totals — capture the latest and log once after the stream ends + // (logging here floods the console with N copies). + if (chunk.usage) lastUsage = chunk.usage; } catch (_) {} } } + if (lastUsage) { + const u = lastUsage; + const cReq = u.cache_creation_input_tokens; + const cRead = u.cache_read_input_tokens; + const inTok = u.input_tokens ?? u.prompt_tokens ?? "?"; + const outTok = u.output_tokens ?? u.completion_tokens ?? "?"; + console.log( + `[CHAT] Usage (streamed) — input: ${inTok}, output: ${outTok}, cache_created: ${cReq || 0}, cache_read: ${cRead || 0}` + ); + } // Compact (in case the stream skipped an index) and sanitize tool args // (empty or malformed JSON → "{}") before returning. const toolCalls = sanitizeToolCalls( @@ -2119,6 +2186,19 @@ ipcMain.handle( const data: any = await res.json(); + // Surface Anthropic cache metrics when present so we can verify caching + // is actually engaging. Both fields are reported in data.usage; absence + // means either non-Claude or no cache hit/creation this turn. + if (data?.usage) { + const cReq = data.usage.cache_creation_input_tokens; + const cRead = data.usage.cache_read_input_tokens; + if (cReq || cRead) { + console.log( + `[CHAT] Cache usage — created: ${cReq || 0}, read: ${cRead || 0}, input: ${data.usage.input_tokens || data.usage.prompt_tokens || "?"}` + ); + } + } + if (isResponses) { const items = data.output || []; const toolCalls = items diff --git a/src/utils.ts b/src/utils.ts index 2ab2f5b..5f2a6c6 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -113,5 +113,32 @@ function trimHistory(history: MasonMessage[]): MasonMessage[] { } } - return [...systems, ...kept]; + // Drop orphan tool results — a role:"tool" whose tool_call_id isn't matched + // by a tool_calls[].id in an earlier role:"assistant" message within the + // kept slice. Slicing or char-budget trimming can leave these dangling when + // the corresponding assistant message gets pruned, and Anthropic (via the + // Databricks Gateway) rejects the whole request: "unexpected tool_use_id + // found in tool_result blocks ... must have a corresponding tool_use block + // in the previous message." + const knownToolUseIds = new Set(); + const cleaned: MasonMessage[] = []; + for (const m of kept) { + const anyM = m as any; + if (anyM.role === "assistant" && Array.isArray(anyM.tool_calls)) { + for (const tc of anyM.tool_calls) { + if (tc?.id) knownToolUseIds.add(tc.id); + } + cleaned.push(m); + continue; + } + if (anyM.role === "tool") { + const id = anyM.tool_call_id; + if (id && knownToolUseIds.has(id)) cleaned.push(m); + // orphan tool result — drop silently + continue; + } + cleaned.push(m); + } + + return [...systems, ...cleaned]; }