databricks-solutions · DoyleDev · May 21, 2026 · May 20, 2026 · May 21, 2026 · May 21, 2026
diff --git a/src/chat.ts b/src/chat.ts
@@ -178,9 +178,16 @@ async function chatLoop(_profile: { host?: string }): Promise<void> {
       JSON.stringify(toolsForApi.map((t) => t.function.name))
     );
 
-  let maxIterations = 10;
+  // Agent-loop budget. 40 covers real-world polling-heavy patterns like
+  // waiting on a job run with periodic status checks plus several follow-up
+  // tool calls. When the loop exhausts the budget we surface a clear error
+  // below instead of silently returning to idle.
+  const ITERATION_BUDGET = 40;
+  let maxIterations = ITERATION_BUDGET;
+  let iterationsUsed = 0;
 
   while (maxIterations-- > 0) {
+    iterationsUsed += 1;
     const chatToken = await getAuthToken();
     const sel = modelEl.value;
     let chatGateway = getGatewayUrl();
@@ -244,13 +251,19 @@ async function chatLoop(_profile: { host?: string }): Promise<void> {
       window.api.onChatChunk((chunk: any) => {
         if (firstChunk) {
           firstChunk = false;
-          removeThinking();
           clearWelcome();
           streamingEl = document.createElement("div");
           streamingEl.className = "msg assistant";
           streamingEl.style.whiteSpace = "pre-wrap";
           if (messagesEl) {
             messagesEl.appendChild(streamingEl);
+            // Keep the building-bricks indicator visible *below* the streaming
+            // bubble so users see "still working" even when chunks pause
+            // mid-stream (Opus 4.7 often pauses between paragraphs). The
+            // thinking div was appended earlier; move it after streamingEl
+            // so DOM order is text-then-bricks.
+            const thinking = document.getElementById("thinkingIndicator");
+            if (thinking) messagesEl.appendChild(thinking);
             messagesEl.scrollTop = messagesEl.scrollHeight;
           }
         }
@@ -548,4 +561,14 @@ async function chatLoop(_profile: { host?: string }): Promise<void> {
 
     break;
   }
+
+  // Reaching this point means the loop ended without the text branch's
+  // `return`: either the iteration budget ran out mid-tool-loop or an unexpected
+  // result type hit `break`. Tell the user clearly — a silent stop here looks like
+  // a hang. NOTE(review): a `break` on the final iteration also sets hitBudget.
+  const hitBudget = iterationsUsed >= ITERATION_BUDGET;
+  const message = hitBudget
+    ? `Agent loop hit the ${ITERATION_BUDGET}-step budget — the model was making tool calls but never produced a final answer. Send another message (e.g. "continue" or a more specific question) to keep going. If this happens repeatedly, break the task into smaller steps or narrow the toolset.`
+    : "Conversation ended unexpectedly. Send another message to continue.";
+  addMessageEl("error", message);
 }
diff --git a/src/main.ts b/src/main.ts
@@ -131,6 +131,42 @@ function chatFetch(
 // (Gemini, some Anthropic responses, etc. return `content: [{type:"text", text:"..."}]`).
 // Without this, the renderer feeds an array to marked() and gets a confusing
 // "input parameter is of type [object Array], string expected" error.
+// Anthropic prompt-caching helper. Adds cache_control: {type: "ephemeral"} to the
+// last entry of body.tools and to the last role:"system" entry of body.messages
+// so repeated turns within a 5-minute window read at ~10% of the input cost.
+// Edits `body` in place (each marked entry is swapped for a shallow copy) and is
+// a no-op unless `model` contains "claude" in any letter case. Databricks AI Gateway
+// passes cache_control through to Anthropic; usage.cache_read_input_tokens shows hits.
+function applyAnthropicCaching(body: any, model: string): void {
+  if (typeof model !== "string") return;
+  if (!model.toLowerCase().includes("claude")) return;
+
+  // Mark the last tool. Anthropic caches everything up to and including the
+  // marked element, so a single breakpoint at the end covers all tools.
+  if (Array.isArray(body.tools) && body.tools.length > 0) {
+    const lastIdx = body.tools.length - 1;
+    body.tools[lastIdx] = {
+      ...body.tools[lastIdx],
+      cache_control: { type: "ephemeral" },
+    };
+  }
+
+  // Mark the last system message: skills manifest + user system prompt + tool-aware nudge,
+  // the stable prefix before chat history. NOTE(review): verify message-level cache_control is honored.
+  if (Array.isArray(body.messages)) {
+    let lastSystemIdx = -1;
+    for (let i = 0; i < body.messages.length; i++) {
+      if (body.messages[i]?.role === "system") lastSystemIdx = i;
+    }
+    if (lastSystemIdx >= 0) {
+      body.messages[lastSystemIdx] = {
+        ...body.messages[lastSystemIdx],
+        cache_control: { type: "ephemeral" },
+      };
+    }
+  }
+}
+
 // Sanitize tool_calls before returning to the renderer. Two failure modes:
 //   • Empty arguments: providers stream tools that take no params as
 //     function.arguments="" — the Databricks AI Gateway rejects that on the
@@ -2018,7 +2054,23 @@ ipcMain.handle(
       }
     }
 
-    if (shouldStream) body.stream = true;
+    if (shouldStream) {
+      body.stream = true;
+      // Ask the upstream to include a final usage chunk in the SSE stream so
+      // we can log cache hits and total token counts. Without this, the
+      // streaming path never sees usage and we can't verify caching is
+      // actually engaging.
+      body.stream_options = { include_usage: true };
+    }
+
+    // Anthropic prompt caching. Tool schemas dominate every turn (~16K tokens
+    // for ~80 tools at ~200 tokens each). With cache_control on the last tool,
+    // Anthropic caches the entire tools prefix for 5 minutes — subsequent
+    // turns within the cache window read at 10% of input cost. Also mark the
+    // last system message so the static instruction prefix gets cached too.
+    // OpenAI prefixes >1024 tokens cache automatically (no opt-in needed).
+    // Gemini/Meta/Qwen have no standard caching API — leave untouched.
+    applyAnthropicCaching(body, String(model));
 
     const res = await chatFetch(
       url,
@@ -2059,6 +2111,7 @@ ipcMain.handle(
       const reader = (res.body as any).getReader();
       const decoder = new TextDecoder();
       let buffer = "";
+      let lastUsage: any = null;
 
       while (true) {
         const { done, value } = await reader.read();
@@ -2098,9 +2151,23 @@ ipcMain.handle(
                 }
               }
             }
+            // Databricks Gateway sends usage in every SSE chunk with running
+            // totals — capture the latest and log once after the stream ends
+            // (logging here floods the console with N copies).
+            if (chunk.usage) lastUsage = chunk.usage;
           } catch (_) {}
         }
       }
+      if (lastUsage) {
+        const u = lastUsage;
+        const cReq = u.cache_creation_input_tokens;
+        const cRead = u.cache_read_input_tokens;
+        const inTok = u.input_tokens ?? u.prompt_tokens ?? "?";
+        const outTok = u.output_tokens ?? u.completion_tokens ?? "?";
+        console.log(
+          `[CHAT] Usage (streamed) — input: ${inTok}, output: ${outTok}, cache_created: ${cReq || 0}, cache_read: ${cRead || 0}`
+        );
+      }
       // Compact (in case the stream skipped an index) and sanitize tool args
       // (empty or malformed JSON → "{}") before returning.
       const toolCalls = sanitizeToolCalls(
@@ -2119,6 +2186,19 @@ ipcMain.handle(
 
     const data: any = await res.json();
 
+    // Surface Anthropic cache metrics when present so we can verify caching
+    // is actually engaging. Both fields are reported in data.usage; absence
+    // means either non-Claude or no cache hit/creation this turn.
+    if (data?.usage) {
+      const cReq = data.usage.cache_creation_input_tokens;
+      const cRead = data.usage.cache_read_input_tokens;
+      if (cReq || cRead) {
+        console.log(
+          `[CHAT] Cache usage — created: ${cReq || 0}, read: ${cRead || 0}, input: ${data.usage.input_tokens || data.usage.prompt_tokens || "?"}`
+        );
+      }
+    }
+
     if (isResponses) {
       const items = data.output || [];
       const toolCalls = items

diff --git a/src/utils.ts b/src/utils.ts
@@ -113,5 +113,32 @@ function trimHistory(history: MasonMessage[]): MasonMessage[] {
     }
   }
 
-  return [...systems, ...kept];
+  // Drop orphan tool results — a role:"tool" whose tool_call_id isn't matched
+  // by a tool_calls[].id in an earlier role:"assistant" message within the
+  // kept slice. Slicing or char-budget trimming can leave these dangling when
+  // the corresponding assistant message gets pruned, and Anthropic (via the
+  // Databricks Gateway) rejects the whole request: "unexpected tool_use_id
+  // found in tool_result blocks ... must have a corresponding tool_use block
+  // in the previous message."
+  const knownToolUseIds = new Set<string>();
+  const cleaned: MasonMessage[] = [];
+  for (const m of kept) {
+    const anyM = m as any;
+    if (anyM.role === "assistant" && Array.isArray(anyM.tool_calls)) {
+      for (const tc of anyM.tool_calls) {
+        if (tc?.id) knownToolUseIds.add(tc.id);
+      }
+      cleaned.push(m);
+      continue;
+    }
+    if (anyM.role === "tool") {
+      const id = anyM.tool_call_id;
+      if (id && knownToolUseIds.has(id)) cleaned.push(m);
+      // otherwise (missing or unmatched tool_call_id): orphan tool result — drop silently
+      continue;
+    }
+    cleaned.push(m);
+  }
+
+  return [...systems, ...cleaned];
 }