Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions src/chat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,16 @@ async function chatLoop(_profile: { host?: string }): Promise<void> {
JSON.stringify(toolsForApi.map((t) => t.function.name))
);

let maxIterations = 10;
// Agent-loop budget. 40 covers real-world polling-heavy patterns like
// waiting on a job run with periodic status checks plus several follow-up
// tool calls. When the loop exhausts the budget we surface a clear error
// below instead of silently returning to idle.
const ITERATION_BUDGET = 40;
let maxIterations = ITERATION_BUDGET;
let iterationsUsed = 0;

while (maxIterations-- > 0) {
iterationsUsed += 1;
const chatToken = await getAuthToken();
const sel = modelEl.value;
let chatGateway = getGatewayUrl();
Expand Down Expand Up @@ -244,13 +251,19 @@ async function chatLoop(_profile: { host?: string }): Promise<void> {
window.api.onChatChunk((chunk: any) => {
if (firstChunk) {
firstChunk = false;
removeThinking();
clearWelcome();
streamingEl = document.createElement("div");
streamingEl.className = "msg assistant";
streamingEl.style.whiteSpace = "pre-wrap";
if (messagesEl) {
messagesEl.appendChild(streamingEl);
// Keep the building-bricks indicator visible *below* the streaming
// bubble so users see "still working" even when chunks pause
// mid-stream (Opus 4.7 often pauses between paragraphs). The
// thinking div was appended earlier; move it after streamingEl
// so DOM order is text-then-bricks.
const thinking = document.getElementById("thinkingIndicator");
if (thinking) messagesEl.appendChild(thinking);
messagesEl.scrollTop = messagesEl.scrollHeight;
}
}
Expand Down Expand Up @@ -548,4 +561,14 @@ async function chatLoop(_profile: { host?: string }): Promise<void> {

break;
}

// If we exit the loop without hitting `return` in the text branch, we
// either ran out of iteration budget mid-tool-loop or hit an unexpected
// result type. Tell the user clearly — silent stops at this point look
// like a hang.
const hitBudget = iterationsUsed >= ITERATION_BUDGET;
const message = hitBudget
? `Agent loop hit the ${ITERATION_BUDGET}-step budget — the model was making tool calls but never produced a final answer. Send another message (e.g. "continue" or a more specific question) to keep going. If this happens repeatedly, break the task into smaller steps or narrow the toolset.`
: "Conversation ended unexpectedly. Send another message to continue.";
addMessageEl("error", message);
}
82 changes: 81 additions & 1 deletion src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,42 @@ function chatFetch(
// (Gemini, some Anthropic responses, etc. return `content: [{type:"text", text:"..."}]`).
// Without this, the renderer feeds an array to marked() and gets a confusing
// "input parameter is of type [object Array], string expected" error.
// Prompt-caching helper for Anthropic (Claude) models.
//
// Mutates `body` in place by attaching `cache_control: { type: "ephemeral" }`
// to two entries:
//   1. the final element of `body.tools`, and
//   2. the last message in `body.messages` whose role is "system".
// Each marked entry is replaced with a shallow copy, so the object that was
// originally stored in the array is never modified itself.
//
// The intent is to put cache breakpoints on the heaviest static parts of the
// prompt (tool schemas and the system prefix) so repeated turns inside the
// provider's cache window are billed at the reduced cached-input rate. The
// request goes through the Databricks AI Gateway, which is expected to
// forward cache_control to Anthropic; whether a hit occurred shows up in the
// response's usage.cache_read_input_tokens.
//
// Does nothing when `model` is not a string or does not contain "claude"
// (case-insensitive), when `body.tools` is missing/empty, or when there is no
// system message.
function applyAnthropicCaching(body: any, model: string): void {
  const targetsClaude =
    typeof model === "string" && model.toLowerCase().includes("claude");
  if (!targetsClaude) return;

  // Tools: one breakpoint on the final tool. Anthropic caches everything up
  // to and including the marked element, so this covers the whole tool list.
  const toolList = body.tools;
  if (Array.isArray(toolList) && toolList.length > 0) {
    const tail = toolList.length - 1;
    toolList[tail] = { ...toolList[tail], cache_control: { type: "ephemeral" } };
  }

  // Messages: one breakpoint on the last system message, i.e. the end of the
  // stable instruction prefix that precedes the dynamic chat history.
  const history = body.messages;
  if (!Array.isArray(history)) return;

  // Walk backwards and stop at the first system message we meet — that is
  // the last one in document order.
  let systemAt = -1;
  for (let i = history.length - 1; i >= 0; i--) {
    if (history[i]?.role === "system") {
      systemAt = i;
      break;
    }
  }
  if (systemAt < 0) return;

  history[systemAt] = {
    ...history[systemAt],
    cache_control: { type: "ephemeral" },
  };
}

// Sanitize tool_calls before returning to the renderer. Two failure modes:
// • Empty arguments: providers stream tools that take no params as
// function.arguments="" — the Databricks AI Gateway rejects that on the
Expand Down Expand Up @@ -2018,7 +2054,23 @@ ipcMain.handle(
}
}

if (shouldStream) body.stream = true;
if (shouldStream) {
body.stream = true;
// Ask the upstream to include a final usage chunk in the SSE stream so
// we can log cache hits and total token counts. Without this, the
// streaming path never sees usage and we can't verify caching is
// actually engaging.
body.stream_options = { include_usage: true };
}

// Anthropic prompt caching. Tool schemas dominate every turn (~16K tokens
// for ~80 tools at ~200 tokens each). With cache_control on the last tool,
// Anthropic caches the entire tools prefix for 5 minutes — subsequent
// turns within the cache window read at 10% of input cost. Also mark the
// last system message so the static instruction prefix gets cached too.
// OpenAI prefixes >1024 tokens cache automatically (no opt-in needed).
// Gemini/Meta/Qwen have no standard caching API — leave untouched.
applyAnthropicCaching(body, String(model));

const res = await chatFetch(
url,
Expand Down Expand Up @@ -2059,6 +2111,7 @@ ipcMain.handle(
const reader = (res.body as any).getReader();
const decoder = new TextDecoder();
let buffer = "";
let lastUsage: any = null;

while (true) {
const { done, value } = await reader.read();
Expand Down Expand Up @@ -2098,9 +2151,23 @@ ipcMain.handle(
}
}
}
// Databricks Gateway sends usage in every SSE chunk with running
// totals — capture the latest and log once after the stream ends
// (logging here floods the console with N copies).
if (chunk.usage) lastUsage = chunk.usage;
} catch (_) {}
}
}
if (lastUsage) {
const u = lastUsage;
const cReq = u.cache_creation_input_tokens;
const cRead = u.cache_read_input_tokens;
const inTok = u.input_tokens ?? u.prompt_tokens ?? "?";
const outTok = u.output_tokens ?? u.completion_tokens ?? "?";
console.log(
`[CHAT] Usage (streamed) — input: ${inTok}, output: ${outTok}, cache_created: ${cReq || 0}, cache_read: ${cRead || 0}`
);
}
// Compact (in case the stream skipped an index) and sanitize tool args
// (empty or malformed JSON → "{}") before returning.
const toolCalls = sanitizeToolCalls(
Expand All @@ -2119,6 +2186,19 @@ ipcMain.handle(

const data: any = await res.json();

// Surface Anthropic cache metrics when present so we can verify caching
// is actually engaging. Both fields are reported in data.usage; absence
// means either non-Claude or no cache hit/creation this turn.
if (data?.usage) {
const cReq = data.usage.cache_creation_input_tokens;
const cRead = data.usage.cache_read_input_tokens;
if (cReq || cRead) {
console.log(
`[CHAT] Cache usage — created: ${cReq || 0}, read: ${cRead || 0}, input: ${data.usage.input_tokens || data.usage.prompt_tokens || "?"}`
);
}
}

if (isResponses) {
const items = data.output || [];
const toolCalls = items
Expand Down
29 changes: 28 additions & 1 deletion src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -113,5 +113,32 @@ function trimHistory(history: MasonMessage[]): MasonMessage[] {
}
}

return [...systems, ...kept];
// Drop orphan tool results — a role:"tool" whose tool_call_id isn't matched
// by a tool_calls[].id in an earlier role:"assistant" message within the
// kept slice. Slicing or char-budget trimming can leave these dangling when
// the corresponding assistant message gets pruned, and Anthropic (via the
// Databricks Gateway) rejects the whole request: "unexpected tool_use_id
// found in tool_result blocks ... must have a corresponding tool_use block
// in the previous message."
const knownToolUseIds = new Set<string>();
const cleaned: MasonMessage[] = [];
for (const m of kept) {
const anyM = m as any;
if (anyM.role === "assistant" && Array.isArray(anyM.tool_calls)) {
for (const tc of anyM.tool_calls) {
if (tc?.id) knownToolUseIds.add(tc.id);
}
cleaned.push(m);
continue;
}
if (anyM.role === "tool") {
const id = anyM.tool_call_id;
if (id && knownToolUseIds.has(id)) cleaned.push(m);
// orphan tool result — drop silently
continue;
}
cleaned.push(m);
}

return [...systems, ...cleaned];
}
Loading