From 48574a68f619818b66b97d63e842857cc3127ceb Mon Sep 17 00:00:00 2001
From: Grant Doyle <grant.doyle1@outlook.com>
Date: Wed, 20 May 2026 16:42:43 -0500
Subject: [PATCH 1/6] fix(chat): drop orphan tool results from trimHistory
 output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Anthropic via Databricks Gateway rejects requests where a tool_result
block has no matching tool_use in a prior assistant message:

  API 400: messages.0.content.0: unexpected tool_use_id found in
  tool_result blocks: toolu_bdrk_…. Each tool_result block must have
  a corresponding tool_use block in the previous message.

trimHistory could leave orphans in two ways:
1. The 50-message slice cut between an assistant-with-tool_calls and
   its tool result message.
2. The char-budget loop shifted off the assistant message but kept
   subsequent tool results.

Now: after slicing/trimming, walk the kept array tracking
tool_calls[].id from assistant messages. Any role:"tool" message
whose tool_call_id isn't in that set is dropped. Non-tool messages
pass through unchanged.

Co-authored-by: Isaac
---
 src/utils.ts | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)
diff --git a/src/utils.ts b/src/utils.ts
index 2ab2f5b..5f2a6c6 100644
--- a/src/utils.ts
+++ b/src/utils.ts
@@ -113,5 +113,32 @@ function trimHistory(history: MasonMessage[]): MasonMessage[] {
     }
   }
 
-  return [...systems, ...kept];
+  // Drop orphan tool results — a role:"tool" whose tool_call_id isn't matched
+  // by a tool_calls[].id in an earlier role:"assistant" message within the
+  // kept slice. Slicing or char-budget trimming can leave these dangling when
+  // the corresponding assistant message gets pruned, and Anthropic (via the
+  // Databricks Gateway) rejects the whole request: "unexpected tool_use_id
+  // found in tool_result blocks ... must have a corresponding tool_use block
+  // in the previous message."
+  const knownToolUseIds = new Set<string>();
+  const cleaned: MasonMessage[] = [];
+  for (const m of kept) {
+    const anyM = m as any;
+    if (anyM.role === "assistant" && Array.isArray(anyM.tool_calls)) {
+      for (const tc of anyM.tool_calls) {
+        if (tc?.id) knownToolUseIds.add(tc.id);
+      }
+      cleaned.push(m);
+      continue;
+    }
+    if (anyM.role === "tool") {
+      const id = anyM.tool_call_id;
+      if (id && knownToolUseIds.has(id)) cleaned.push(m);
+      // orphan tool result — drop silently
+      continue;
+    }
+    cleaned.push(m);
+  }
+
+  return [...systems, ...cleaned];
 }

From 1d2dbd9c3c80713934df5c4cc8de539e499db8a8 Mon Sep 17 00:00:00 2001
From: Grant Doyle <grant.doyle1@outlook.com>
Date: Wed, 20 May 2026 20:01:28 -0500
Subject: [PATCH 2/6] =?UTF-8?q?fix(chat):=20keep=20"Building=E2=80=A6"=20v?=
 =?UTF-8?q?isible=20below=20streaming=20text?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the brick indicator was removed when the first content chunk
arrived — the streaming text itself was supposed to be the "still
working" cue. But Opus 4.7 often pauses mid-stream between paragraphs
and the typewriter catches up, leaving static text on screen with no
indication that more is coming. Users assume Mason has frozen.

Stop calling removeThinking() on the first chunk. After creating the
streaming bubble, move the thinking div to *after* it so DOM order is
text-then-bricks. The brick stays visible through the entire stream.
addMessageEl (when "Calling tool:" lands) removes the thinking div if
the response had tool_calls, and send()'s finally removes it when
chatLoop exits for a text-only response.

Co-authored-by: Isaac
---
 src/chat.ts | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/chat.ts b/src/chat.ts
index 8ecc8d8..e268fc7 100644
--- a/src/chat.ts
+++ b/src/chat.ts
@@ -244,13 +244,19 @@ async function chatLoop(_profile: { host?: string }): Promise<void> {
       window.api.onChatChunk((chunk: any) => {
         if (firstChunk) {
           firstChunk = false;
-          removeThinking();
           clearWelcome();
           streamingEl = document.createElement("div");
           streamingEl.className = "msg assistant";
           streamingEl.style.whiteSpace = "pre-wrap";
           if (messagesEl) {
             messagesEl.appendChild(streamingEl);
+            // Keep the building-bricks indicator visible *below* the streaming
+            // bubble so users see "still working" even when chunks pause
+            // mid-stream (Opus 4.7 often pauses between paragraphs). The
+            // thinking div was appended earlier; move it after streamingEl
+            // so DOM order is text-then-bricks.
+            const thinking = document.getElementById("thinkingIndicator");
+            if (thinking) messagesEl.appendChild(thinking);
             messagesEl.scrollTop = messagesEl.scrollHeight;
           }
         }

From 015c140273ed5e074d5b55fa45742ed06ef4a3be Mon Sep 17 00:00:00 2001
From: Grant Doyle <grant.doyle1@outlook.com>
Date: Wed, 20 May 2026 20:06:48 -0500
Subject: [PATCH 3/6] feat(chat): Anthropic prompt caching for tools + system
 messages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tool schemas are the heaviest static portion of every turn. With ~80
tools at ~200 tokens each, Mason was re-sending ~16K tokens of static
tool definitions on every single turn — roughly $0.24 per turn at
Opus 4.7 input pricing, $12 over a 50-turn agentic session, just for
tool overhead.

Anthropic supports prompt caching via cache_control: {type: "ephemeral"}.
Marked content is cached for 5 minutes; subsequent turns within the
window read at ~10% of input cost.

Mason now applies cache_control to:
- The last tool definition (Anthropic caches everything up to and
  including the marked element, so a single breakpoint covers the
  entire tools array)
- The last system message (covers skills manifest + user system
  prompt + tool-aware nudge — everything stable before chat history)

No-op for non-Claude models. OpenAI already caches prompt prefixes
longer than 1024 tokens automatically; Gemini uses a separate
cachedContents API that doesn't fit our request shape; Meta/Qwen have
no standard caching API.

Also logs data.usage.cache_creation_input_tokens and
cache_read_input_tokens from non-streaming responses so we can
verify caching is engaging in practice (and how much it saves).

Co-authored-by: Isaac
---
 src/main.ts | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/src/main.ts b/src/main.ts
index 94eae85..7052cd9 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -131,6 +131,42 @@ function chatFetch(
 // (Gemini, some Anthropic responses, etc. return `content: [{type:"text", text:"..."}]`).
 // Without this, the renderer feeds an array to marked() and gets a confusing
 // "input parameter is of type [object Array], string expected" error.
+// Anthropic prompt-caching helper. Mutates `body` in place: the last tool and
+// the last role:"system" message are replaced with shallow copies carrying
+// cache_control: {type: "ephemeral"}, so repeated turns within the 5-minute
+// cache window read that prefix at ~10% of input cost. No-op unless `model`
+// contains "claude" (case-insensitive). NOTE(review): the system breakpoint sits
+// on the message object, not a content block — confirm the gateway honors it.
+function applyAnthropicCaching(body: any, model: string): void {
+  if (typeof model !== "string") return;
+  if (!model.toLowerCase().includes("claude")) return;
+
+  // Mark the last tool. Anthropic caches everything up to and including the
+  // marked element, so a single breakpoint at the end covers all tools.
+  if (Array.isArray(body.tools) && body.tools.length > 0) {
+    const lastIdx = body.tools.length - 1;
+    body.tools[lastIdx] = {
+      ...body.tools[lastIdx],
+      cache_control: { type: "ephemeral" },
+    };
+  }
+
+  // Mark the last system message. Covers skills manifest + user system prompt
+  // + tool-aware nudge — everything stable before the dynamic chat history.
+  if (Array.isArray(body.messages)) {
+    let lastSystemIdx = -1;
+    for (let i = 0; i < body.messages.length; i++) {
+      if (body.messages[i]?.role === "system") lastSystemIdx = i;
+    }
+    if (lastSystemIdx >= 0) {
+      body.messages[lastSystemIdx] = {
+        ...body.messages[lastSystemIdx],
+        cache_control: { type: "ephemeral" },
+      };
+    }
+  }
+}
+
 // Sanitize tool_calls before returning to the renderer. Two failure modes:
 //   • Empty arguments: providers stream tools that take no params as
 //     function.arguments="" — the Databricks AI Gateway rejects that on the
@@ -2020,6 +2056,15 @@ ipcMain.handle(
 
     if (shouldStream) body.stream = true;
 
+    // Anthropic prompt caching. Tool schemas dominate every turn (~16K tokens
+    // for ~80 tools at ~200 tokens each). With cache_control on the last tool,
+    // Anthropic caches the entire tools prefix for 5 minutes — subsequent
+    // turns within the cache window read at 10% of input cost. Also mark the
+    // last system message so the static instruction prefix gets cached too.
+    // OpenAI prefixes >1024 tokens cache automatically (no opt-in needed).
+    // Gemini/Meta/Qwen have no standard caching API — leave untouched.
+    applyAnthropicCaching(body, String(model));
+
     const res = await chatFetch(
       url,
       {
@@ -2119,6 +2164,19 @@ ipcMain.handle(
 
     const data: any = await res.json();
 
+    // Surface Anthropic cache metrics when present so we can verify caching
+    // is actually engaging. Both fields are reported in data.usage; absence
+    // means either non-Claude or no cache hit/creation this turn.
+    if (data?.usage) {
+      const cReq = data.usage.cache_creation_input_tokens;
+      const cRead = data.usage.cache_read_input_tokens;
+      if (cReq || cRead) {
+        console.log(
+          `[CHAT] Cache usage — created: ${cReq || 0}, read: ${cRead || 0}, input: ${data.usage.input_tokens || data.usage.prompt_tokens || "?"}`
+        );
+      }
+    }
+
     if (isResponses) {
       const items = data.output || [];
       const toolCalls = items

From a9c4921e0c15d21cdcb6e8e6e162710362b1d7f1 Mon Sep 17 00:00:00 2001
From: Grant Doyle <grant.doyle1@outlook.com>
Date: Wed, 20 May 2026 20:19:09 -0500
Subject: [PATCH 4/6] fix(chat): raise agent-loop budget 10 -> 40 + surface
 clear stop reason
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The chat loop silently exited after 10 tool-call rounds. Polling
patterns (waiting on a job run with manage_job_runs status checks)
or multi-step builds (creating tables + seeding + verifying) blow
past 10 easily — users saw the brick indicator disappear and the UI
return to idle with no explanation.

- Raise ITERATION_BUDGET to 40. That covers realistic polling plus
  several follow-ups in the same turn.
- Track iterationsUsed and, after the while loop exits without a
  clean text-branch return, surface an in-chat error explaining
  what happened and how to recover ("send 'continue' or break the
  task into smaller steps"). Different message when we exit via
  the type-mismatch break vs the budget exhaustion path.

Co-authored-by: Isaac
---
 src/chat.ts | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/chat.ts b/src/chat.ts
index e268fc7..d6fda33 100644
--- a/src/chat.ts
+++ b/src/chat.ts
@@ -178,9 +178,16 @@ async function chatLoop(_profile: { host?: string }): Promise<void> {
       JSON.stringify(toolsForApi.map((t) => t.function.name))
     );
 
-  let maxIterations = 10;
+  // Agent-loop budget. 40 covers real-world polling-heavy patterns like
+  // waiting on a job run with periodic status checks plus several follow-up
+  // tool calls. When the loop exhausts the budget we surface a clear error
+  // below instead of silently returning to idle.
+  const ITERATION_BUDGET = 40;
+  let maxIterations = ITERATION_BUDGET;
+  let iterationsUsed = 0;
 
   while (maxIterations-- > 0) {
+    iterationsUsed += 1;
     const chatToken = await getAuthToken();
     const sel = modelEl.value;
     let chatGateway = getGatewayUrl();
@@ -554,4 +561,14 @@ async function chatLoop(_profile: { host?: string }): Promise<void> {
 
     break;
   }
+
+  // If we exit the loop without hitting `return` in the text branch, we
+  // either ran out of iteration budget mid-tool-loop or hit an unexpected
+  // result type. Tell the user clearly — silent stops at this point look
+  // like a hang.
+  const hitBudget = iterationsUsed >= ITERATION_BUDGET;
+  const message = hitBudget
+    ? `Agent loop hit the ${ITERATION_BUDGET}-step budget — the model was making tool calls but never produced a final answer. Send another message (e.g. "continue" or a more specific question) to keep going. If this happens repeatedly, break the task into smaller steps or narrow the toolset.`
+    : "Conversation ended unexpectedly. Send another message to continue.";
+  addMessageEl("error", message);
 }

From e01789fbdf38fcc7701be0e7ed46422323917679 Mon Sep 17 00:00:00 2001
From: Grant Doyle <grant.doyle1@outlook.com>
Date: Wed, 20 May 2026 20:28:52 -0500
Subject: [PATCH 5/6] fix(chat): surface streaming usage so cache hits are
 visible in logs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cache-usage log only fired on non-streamed responses, but every
chat with tools now streams (see PR #9). Without seeing the usage
breakdown there was no way to verify the Anthropic prompt caching
actually engaged.

- Set stream_options.include_usage = true on the chat completions
  body when shouldStream is set. The upstream then emits one final
  SSE chunk with chunk.usage populated and empty choices.
- Parse chunk.usage in the SSE loop and log:
    [CHAT] Usage (streamed) — input: X, output: Y,
                              cache_created: A, cache_read: B
- cache_created > 0 means the cache was warmed this turn.
  cache_read > 0 means a prior cache was reused — that's the savings
  signal.

Now users can verify caching by watching the main-process console (or
the packaged-build log) after a few turns — the line is logged from
src/main.ts, not from the renderer. If cache_read is non-zero turn
after turn and stays close to the tool-schema size, caching is doing
its job.

Co-authored-by: Isaac
---
 src/main.ts | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/main.ts b/src/main.ts
index 7052cd9..1c0f1fd 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -2054,7 +2054,14 @@ ipcMain.handle(
       }
     }
 
-    if (shouldStream) body.stream = true;
+    if (shouldStream) {
+      body.stream = true;
+      // Ask the upstream to include a final usage chunk in the SSE stream so
+      // we can log cache hits and total token counts. Without this, the
+      // streaming path never sees usage and we can't verify caching is
+      // actually engaging.
+      body.stream_options = { include_usage: true };
+    }
 
     // Anthropic prompt caching. Tool schemas dominate every turn (~16K tokens
     // for ~80 tools at ~200 tokens each). With cache_control on the last tool,
@@ -2143,6 +2150,19 @@ ipcMain.handle(
                 }
               }
             }
+            // The final stream chunk (when include_usage is set) carries
+            // usage stats with empty choices. Surface cache + token counts
+            // so we can confirm prompt caching is engaging.
+            if (chunk.usage) {
+              const u = chunk.usage;
+              const cReq = u.cache_creation_input_tokens;
+              const cRead = u.cache_read_input_tokens;
+              const inTok = u.input_tokens ?? u.prompt_tokens ?? "?";
+              const outTok = u.output_tokens ?? u.completion_tokens ?? "?";
+              console.log(
+                `[CHAT] Usage (streamed) — input: ${inTok}, output: ${outTok}, cache_created: ${cReq || 0}, cache_read: ${cRead || 0}`
+              );
+            }
           } catch (_) {}
         }
       }

From 2d27179f7940445dd5e619121eb585f4192209fc Mon Sep 17 00:00:00 2001
From: Grant Doyle <grant.doyle1@outlook.com>
Date: Thu, 21 May 2026 14:03:13 -0500
Subject: [PATCH 6/6] fix(chat): log streamed usage once (final), not per chunk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Databricks Gateway sends chunk.usage in every streaming SSE event
with running totals — Mason was logging the line ~38 times per turn,
each with output_tokens still climbing. Capture the latest usage and
log once after the stream loop completes so the console shows one
clear line per turn.

Co-authored-by: Isaac
---
 src/main.ts | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/main.ts b/src/main.ts
index 1c0f1fd..1d41d97 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -2111,6 +2111,7 @@ ipcMain.handle(
       const reader = (res.body as any).getReader();
       const decoder = new TextDecoder();
       let buffer = "";
+      let lastUsage: any = null;
 
       while (true) {
         const { done, value } = await reader.read();
@@ -2150,22 +2151,23 @@ ipcMain.handle(
                 }
               }
             }
-            // The final stream chunk (when include_usage is set) carries
-            // usage stats with empty choices. Surface cache + token counts
-            // so we can confirm prompt caching is engaging.
-            if (chunk.usage) {
-              const u = chunk.usage;
-              const cReq = u.cache_creation_input_tokens;
-              const cRead = u.cache_read_input_tokens;
-              const inTok = u.input_tokens ?? u.prompt_tokens ?? "?";
-              const outTok = u.output_tokens ?? u.completion_tokens ?? "?";
-              console.log(
-                `[CHAT] Usage (streamed) — input: ${inTok}, output: ${outTok}, cache_created: ${cReq || 0}, cache_read: ${cRead || 0}`
-              );
-            }
+            // Databricks Gateway sends usage in every SSE chunk with running
+            // totals — capture the latest and log once after the stream ends
+            // (logging here floods the console with N copies).
+            if (chunk.usage) lastUsage = chunk.usage;
           } catch (_) {}
         }
       }
+      if (lastUsage) {
+        const u = lastUsage;
+        const cReq = u.cache_creation_input_tokens;
+        const cRead = u.cache_read_input_tokens;
+        const inTok = u.input_tokens ?? u.prompt_tokens ?? "?";
+        const outTok = u.output_tokens ?? u.completion_tokens ?? "?";
+        console.log(
+          `[CHAT] Usage (streamed) — input: ${inTok}, output: ${outTok}, cache_created: ${cReq || 0}, cache_read: ${cRead || 0}`
+        );
+      }
       // Compact (in case the stream skipped an index) and sanitize tool args
       // (empty or malformed JSON → "{}") before returning.
       const toolCalls = sanitizeToolCalls(