diff --git a/CHANGELOG.md b/CHANGELOG.md index 948f8f6..1abcbee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.3.2] - 2026-06-24 + +### Fixed + +- **MatterAI inference routed to the wrong backend.** `AxonClient` was + building the OpenAI `baseURL` by running `API_GATEWAY_PATH` + (`https://api2.matterai.so/v1/web/`) through `getUrlFromToken`, which + rehosts *any* `api.matterai.so` target onto the control-plane host + resolved from the JWT — so every inference request silently hit + `https://api.matterai.so/v1/web/` instead of the gateway at + `https://api2.matterai.so/v1/web/`. The gateway URL is now used + directly, with the per-model `baseUrl` override still winning for + local dev. Profile, task title, balance, and web search/fetch still + go through the rehost helper (they intentionally target the control + plane). + ## [0.3.1] - 2026-06-23 ### Changed diff --git a/package.json b/package.json index 25c1fcd..2b4b6a8 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@matterailab/orbcode", - "version": "0.3.1", + "version": "0.3.2", "description": "OrbCode CLI — agentic coding in your terminal, powered by Axon models by MatterAI", "type": "module", "bin": { diff --git a/src/api/client.ts b/src/api/client.ts index fb1467c..67c91b0 100644 --- a/src/api/client.ts +++ b/src/api/client.ts @@ -1,185 +1,216 @@ -import OpenAI from "openai" - -import { API_GATEWAY_PATH, getUrlFromToken } from "../auth/auth.js" -import { DEFAULT_HEADERS, X_AXONCODE_TASKID, X_AXON_REPO, X_ORGANIZATIONID } from "./headers.js" -import { stripReasoningDetails, type LLMClient } from "./llmClient.js" -import { getModel } from "./models.js" -import type { ApiStreamChunk } from "./stream.js" +import OpenAI from "openai"; + +import { API_GATEWAY_PATH } from "../auth/auth.js"; +import { + DEFAULT_HEADERS, + X_AXONCODE_TASKID, + X_AXON_REPO, + X_ORGANIZATIONID, +} from "./headers.js"; +import { stripReasoningDetails, type LLMClient } from "./llmClient.js"; +import { getModel } from "./models.js"; +import type { ApiStreamChunk } from "./stream.js"; interface CompletionUsage { - completion_tokens?: number - completion_tokens_details?: { reasoning_tokens?: number } - prompt_tokens?: number - prompt_tokens_details?: { cached_tokens?: number } - total_tokens?: number - cost?: number - is_byok?: boolean - cost_details?: { upstream_inference_cost?: number } + completion_tokens?: number; + completion_tokens_details?: { reasoning_tokens?: number }; + prompt_tokens?: number; + prompt_tokens_details?: { cached_tokens?: number }; + total_tokens?: number; + cost?: number; + is_byok?: boolean; + cost_details?: { upstream_inference_cost?: number }; } export interface AxonClientOptions { - token: string - modelId: string - taskId: string - organizationId?: string - /** git remote URL or workspace folder name, sent as X-AXON-REPO */ - repo?: string - /** custom OpenAI-compatible gateway; defaults to the MatterAI gateway derived from the token */ - baseUrl?: string + token: string; + modelId: string; + taskId: string; + organizationId?: string; + /** git remote URL or workspace folder name, sent as X-AXON-REPO */ + repo?: string; + /** custom OpenAI-compatible gateway; defaults to the MatterAI gateway derived from the token */ + baseUrl?: string; } export class AxonClient implements LLMClient { - private client: OpenAI - private options: AxonClientOptions - - constructor(options: AxonClientOptions) { - this.options = options - this.client = new OpenAI({ - baseURL: options.baseUrl || getUrlFromToken(API_GATEWAY_PATH, options.token), - apiKey: options.token, - defaultHeaders: DEFAULT_HEADERS, - }) - } - - private requestHeaders(): Record { - const headers: Record = { [X_AXONCODE_TASKID]: this.options.taskId } - if (this.options.organizationId) headers[X_ORGANIZATIONID] = this.options.organizationId - if (this.options.repo) headers[X_AXON_REPO] = this.options.repo - return headers - } - - async *createMessage( - systemPrompt: string, - messages: OpenAI.Chat.ChatCompletionMessageParam[], - tools: OpenAI.Chat.ChatCompletionTool[], - abortSignal?: AbortSignal, - ): AsyncGenerator { - const model = getModel(this.options.modelId) - - const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = { - model: model.id, - temperature: 0.2, - messages: [{ role: "system", content: systemPrompt }, ...stripReasoningDetails(messages)], - stream: true, - stream_options: { include_usage: true }, - max_tokens: model.maxOutputTokens, - } - if (tools.length > 0) { - requestOptions.tools = tools - requestOptions.tool_choice = "auto" - requestOptions.parallel_tool_calls = true - } - - const stream = await this.client.chat.completions.create(requestOptions, { - headers: this.requestHeaders(), - signal: abortSignal, - }) - - let lastUsage: CompletionUsage | undefined - let inferenceProvider: string | undefined - let fullContent = "" - let isThinking = false - - for await (const chunk of stream) { - // The gateway may return an error object instead of throwing. - if ("error" in chunk) { - const error = (chunk as { error?: { message?: string; code?: number } }).error - const err = new Error(`Axon API Error ${error?.code ?? ""}: ${error?.message ?? "unknown error"}`) - ;(err as { status?: number }).status = error?.code - throw err - } - - if ("provider" in chunk && typeof (chunk as { provider?: string }).provider === "string") { - inferenceProvider = (chunk as { provider?: string }).provider - } - - if (chunk.usage) { - lastUsage = chunk.usage as CompletionUsage - } - - const delta = chunk.choices?.[0]?.delta - if (!delta) continue - - if (delta.content) { - // Some backends re-send cumulative content; only forward the new suffix. - let newText = delta.content - if (fullContent && newText.startsWith(fullContent)) { - newText = newText.substring(fullContent.length) - } - fullContent = delta.content - - if (newText) { - // blocks embedded in content are reasoning. - if (newText.includes("")) { - isThinking = true - } - if (newText.includes("") || newText.includes("") || isThinking) { - if (newText.includes("")) { - isThinking = false - } - yield { type: "reasoning", text: newText } - } else { - yield { type: "text", text: newText } - } - } - } - - // Models report reasoning under either key. - const deltaRecord = delta as Record - if (typeof deltaRecord.reasoning === "string" && deltaRecord.reasoning) { - yield { type: "reasoning", text: deltaRecord.reasoning } - } - if (typeof deltaRecord.reasoning_content === "string" && deltaRecord.reasoning_content) { - yield { type: "reasoning", text: deltaRecord.reasoning_content } - } - - if (delta.tool_calls && delta.tool_calls.length > 0) { - const validToolCalls = delta.tool_calls - .filter((tc) => tc.function) - .filter((tc) => { - // First delta carries id + name; later deltas carry only - // index + argument fragments. Drop pure placeholders. - const hasValidId = tc.id !== null && tc.id !== undefined - const hasValidName = !!tc.function!.name - const hasArguments = - typeof tc.function!.arguments === "string" && tc.function!.arguments.length > 0 - return hasValidId || hasValidName || hasArguments - }) - .map((tc) => ({ - index: tc.index, - id: tc.id, - type: tc.type, - function: { - name: tc.function!.name || "", - arguments: tc.function!.arguments || "", - }, - })) - - if (validToolCalls.length > 0) { - yield { type: "native_tool_calls", toolCalls: validToolCalls } - } - } - } - - if (lastUsage) { - yield { - type: "usage", - inputTokens: lastUsage.prompt_tokens || 0, - outputTokens: lastUsage.completion_tokens || 0, - cacheReadTokens: lastUsage.prompt_tokens_details?.cached_tokens, - reasoningTokens: lastUsage.completion_tokens_details?.reasoning_tokens, - totalCost: this.getTotalCost(lastUsage), - inferenceProvider, - } - } - } - - private getTotalCost(lastUsage: CompletionUsage): number { - const model = getModel(this.options.modelId) - if (model.free) return 0 - if (lastUsage.is_byok) { - return lastUsage.cost_details?.upstream_inference_cost || 0 - } - return lastUsage.cost || 0 - } + private client: OpenAI; + private options: AxonClientOptions; + + constructor(options: AxonClientOptions) { + this.options = options; + this.client = new OpenAI({ + // Use the gateway URL directly. Do not rehost it through + // getUrlFromToken — that helper rewrites any api.matterai.so host + // onto the control plane, which would send inference to + // api.matterai.so instead of the gateway at api2.matterai.so. + // Per-model `baseUrl` overrides (e.g. local dev) still win. + baseURL: options.baseUrl || API_GATEWAY_PATH, + apiKey: options.token, + defaultHeaders: DEFAULT_HEADERS, + }); + } + + private requestHeaders(): Record { + const headers: Record = { + [X_AXONCODE_TASKID]: this.options.taskId, + }; + if (this.options.organizationId) + headers[X_ORGANIZATIONID] = this.options.organizationId; + if (this.options.repo) headers[X_AXON_REPO] = this.options.repo; + return headers; + } + + async *createMessage( + systemPrompt: string, + messages: OpenAI.Chat.ChatCompletionMessageParam[], + tools: OpenAI.Chat.ChatCompletionTool[], + abortSignal?: AbortSignal, + ): AsyncGenerator { + const model = getModel(this.options.modelId); + + const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = + { + model: model.id, + temperature: 0.2, + messages: [ + { role: "system", content: systemPrompt }, + ...stripReasoningDetails(messages), + ], + stream: true, + stream_options: { include_usage: true }, + max_tokens: model.maxOutputTokens, + }; + if (tools.length > 0) { + requestOptions.tools = tools; + requestOptions.tool_choice = "auto"; + requestOptions.parallel_tool_calls = true; + } + + const stream = await this.client.chat.completions.create(requestOptions, { + headers: this.requestHeaders(), + signal: abortSignal, + }); + + let lastUsage: CompletionUsage | undefined; + let inferenceProvider: string | undefined; + let fullContent = ""; + let isThinking = false; + + for await (const chunk of stream) { + // The gateway may return an error object instead of throwing. + if ("error" in chunk) { + const error = (chunk as { error?: { message?: string; code?: number } }) + .error; + const err = new Error( + `Axon API Error ${error?.code ?? ""}: ${error?.message ?? "unknown error"}`, + ); + (err as { status?: number }).status = error?.code; + throw err; + } + + if ( + "provider" in chunk && + typeof (chunk as { provider?: string }).provider === "string" + ) { + inferenceProvider = (chunk as { provider?: string }).provider; + } + + if (chunk.usage) { + lastUsage = chunk.usage as CompletionUsage; + } + + const delta = chunk.choices?.[0]?.delta; + if (!delta) continue; + + if (delta.content) { + // Some backends re-send cumulative content; only forward the new suffix. + let newText = delta.content; + if (fullContent && newText.startsWith(fullContent)) { + newText = newText.substring(fullContent.length); + } + fullContent = delta.content; + + if (newText) { + // blocks embedded in content are reasoning. + if (newText.includes("")) { + isThinking = true; + } + if ( + newText.includes("") || + newText.includes("") || + isThinking + ) { + if (newText.includes("")) { + isThinking = false; + } + yield { type: "reasoning", text: newText }; + } else { + yield { type: "text", text: newText }; + } + } + } + + // Models report reasoning under either key. + const deltaRecord = delta as Record; + if (typeof deltaRecord.reasoning === "string" && deltaRecord.reasoning) { + yield { type: "reasoning", text: deltaRecord.reasoning }; + } + if ( + typeof deltaRecord.reasoning_content === "string" && + deltaRecord.reasoning_content + ) { + yield { type: "reasoning", text: deltaRecord.reasoning_content }; + } + + if (delta.tool_calls && delta.tool_calls.length > 0) { + const validToolCalls = delta.tool_calls + .filter((tc) => tc.function) + .filter((tc) => { + // First delta carries id + name; later deltas carry only + // index + argument fragments. Drop pure placeholders. + const hasValidId = tc.id !== null && tc.id !== undefined; + const hasValidName = !!tc.function!.name; + const hasArguments = + typeof tc.function!.arguments === "string" && + tc.function!.arguments.length > 0; + return hasValidId || hasValidName || hasArguments; + }) + .map((tc) => ({ + index: tc.index, + id: tc.id, + type: tc.type, + function: { + name: tc.function!.name || "", + arguments: tc.function!.arguments || "", + }, + })); + + if (validToolCalls.length > 0) { + yield { type: "native_tool_calls", toolCalls: validToolCalls }; + } + } + } + + if (lastUsage) { + yield { + type: "usage", + inputTokens: lastUsage.prompt_tokens || 0, + outputTokens: lastUsage.completion_tokens || 0, + cacheReadTokens: lastUsage.prompt_tokens_details?.cached_tokens, + reasoningTokens: lastUsage.completion_tokens_details?.reasoning_tokens, + totalCost: this.getTotalCost(lastUsage), + inferenceProvider, + }; + } + } + + private getTotalCost(lastUsage: CompletionUsage): number { + const model = getModel(this.options.modelId); + if (model.free) return 0; + if (lastUsage.is_byok) { + return lastUsage.cost_details?.upstream_inference_cost || 0; + } + return lastUsage.cost || 0; + } }