diff --git a/packages/core/src/models.ts b/packages/core/src/models.ts index 4ee17b8e25eb..0c12de4d1af0 100644 --- a/packages/core/src/models.ts +++ b/packages/core/src/models.ts @@ -57,6 +57,25 @@ export const Model = Schema.Struct({ }), ]), ), + // Whether the model's chat template accepts an assistant turn as the LAST + // message (a.k.a. "prefill" / "response continuation"). + // + // Default (undefined) is treated as `true` for backwards compatibility. + // + // Set to `false` for thinking-on-by-default models whose chat template + // rejects trailing-assistant when thinking is enabled. Affected families + // (non-exhaustive, 2025-2026): + // - Qwen3 hybrid (all sizes), Qwen3-Thinking-2507, Qwen3-VL, + // Qwen3.5, Qwen3.6, QwQ-32B -> llama.cpp "Assistant response prefill + // is incompatible with enable_thinking" (ggml-org/llama.cpp#20861, + // #21889; mastra-ai/mastra#15234) + // - DeepSeek-R1 / R1-0528 / V4 (vllm-project/vllm#12999) + // - GLM-4.6 / 4.7 thinking (ggml-org/llama.cpp#15401) + // - Kimi-K2-Thinking, MiniMax-M2 + // + // Qwen3-Coder, Qwen3-Instruct-2507, Qwen2.5 keep `true` — their templates + // do not branch on `enable_thinking`, so prefill is safe. + prefill: Schema.optional(Schema.Boolean), cost: Schema.optional(Cost), limit: Schema.Struct({ context: Schema.Finite, diff --git a/packages/opencode/src/config/provider.ts b/packages/opencode/src/config/provider.ts index 5635512cedf9..7f742b3b5609 100644 --- a/packages/opencode/src/config/provider.ts +++ b/packages/opencode/src/config/provider.ts @@ -19,6 +19,15 @@ export const Model = Schema.Struct({ }), ]), ), + prefill: Schema.optional(Schema.Boolean).annotate({ + description: + "Whether the model accepts an assistant turn as the last message. " + + "Set false for thinking-on-by-default templates whose chat template " + + "rejects trailing-assistant (Qwen3 hybrid/3.5/3.6, QwQ, DeepSeek-R1, " + + "GLM-4.6/4.7 thinking, Kimi-K2-Thinking, MiniMax-M2). 
Defaults to " + + "true for non-openai-compatible providers, false for openai-compatible " + + "with reasoning enabled.", + }), cost: Schema.optional( Schema.Struct({ input: Schema.Finite, diff --git a/packages/opencode/src/provider/capability-probe.ts b/packages/opencode/src/provider/capability-probe.ts new file mode 100644 index 000000000000..a06a38bbd2f6 --- /dev/null +++ b/packages/opencode/src/provider/capability-probe.ts @@ -0,0 +1,91 @@ +export * as CapabilityProbe from "./capability-probe" + +// Runtime detection of OpenAI-compatible server capabilities. +// +// Self-hosted llama.cpp servers expose `/props` with the active chat +// template. Templates that branch on `enable_thinking` (Qwen3 hybrid, Qwen3.5, +// Qwen3.6, QwQ, DeepSeek-R1, GLM-4.6/4.7-thinking, Kimi-K2-Thinking, +// MiniMax-M2, etc.) reject trailing-assistant prefill at runtime with +// `HTTP 400 "Assistant response prefill is incompatible with enable_thinking"` +// (llama.cpp#20861, mastra-ai#15234). +// +// Probing the live template removes the need for per-family name lists in +// models.dev or user config: any server whose template branches on +// `enable_thinking` is detected automatically, including future thinking +// families. +// +// Probe is opt-in by base URL, fail-silent (vLLM/TGI/mistral.rs have no +// `/props` endpoint — they fall through to existing detection), short-timeout +// (1.5s), and cached per process so we hit the network at most once per base +// URL. + +export type ProbedCapabilities = { + prefill?: boolean + reasoning?: boolean +} + +const PROBE_TIMEOUT_MS = 1500 +const cache = new Map>() + +// Normalises a baseURL ("http://host/v1/", "http://host", "http://host/v1") +// to the server root the /props endpoint lives under. 
// Reduces an OpenAI-compatible base URL to the server root that `/props` is
// requested from. Trailing slashes are removed BEFORE the `/v1` suffix is
// dropped, so "http://host/v1", "http://host/v1/" and "http://host/v1//" all
// yield "http://host"; a final pass removes any slash left in front of the
// dropped suffix ("http://host//v1"). Returns "" when nothing but slashes
// and/or "/v1" was given.
function rootURL(baseURL: string): string {
  return baseURL
    .replace(/\/+$/, "")
    .replace(/\/v1$/, "")
    .replace(/\/+$/, "")
}

// Performs a single `GET {root}/props` request and maps the response onto
// ProbedCapabilities.
//
// Never rejects: a non-2xx status, a network error, a timeout
// (PROBE_TIMEOUT_MS, enforced through an AbortController), an unparsable body
// or a JSON body that is not an object all resolve to `{}`, i.e. "nothing
// detected".
async function probeOnce(baseURL: string): Promise<ProbedCapabilities> {
  const root = rootURL(baseURL)
  if (!root) return {}

  const result: ProbedCapabilities = {}
  const ctrl = new AbortController()
  // The abort signal stays armed until `finally`, so it bounds both the
  // request and the reading of the response body.
  const timer = setTimeout(() => ctrl.abort(), PROBE_TIMEOUT_MS)
  try {
    const resp = await fetch(`${root}/props`, { signal: ctrl.signal })
    if (!resp.ok) return {}

    const payload: unknown = await resp.json()
    // `null`, strings, numbers and booleans carry no capability fields; bail
    // out explicitly instead of relying on a TypeError reaching the catch.
    if (typeof payload !== "object" || payload === null) return {}
    const data = payload as {
      chat_template?: unknown
      chat_template_caps?: { supports_preserve_reasoning?: unknown } | null
    }

    // Primary signal: the chat template branches on `enable_thinking`. This
    // is the condition associated with the prefill-incompatible 400 — the
    // template emits its thinking markers differently depending on whether
    // a generation prompt is requested, and a trailing-assistant turn (no
    // generation prompt) lands in the path that conflicts with reasoning.
    if (typeof data.chat_template === "string" && data.chat_template.includes("enable_thinking")) {
      result.prefill = false
      result.reasoning = true
    }

    // Secondary signal: llama.cpp also exposes `supports_preserve_reasoning`
    // on chat_template_caps for thinking templates. This catches templates
    // that don't use the literal `enable_thinking` keyword. It says nothing
    // about prefill, so only `reasoning` is set.
    if (data.chat_template_caps?.supports_preserve_reasoning === true) {
      result.reasoning = true
    }
  } catch {
    // Deliberately best-effort: the server has no /props (vLLM/TGI/
    // mistral.rs), is offline, timed out, or returned invalid JSON. Fall back
    // to other detection paths silently.
  } finally {
    clearTimeout(timer)
  }
  return result
}

// Returns probed capabilities for the given openai-compatible base URL.
// Result is cached per base URL for the process lifetime; concurrent callers
// share the same in-flight probe.
+export function probe(baseURL: string): Promise { + if (!baseURL) return Promise.resolve({}) + const key = rootURL(baseURL) + let pending = cache.get(key) + if (!pending) { + pending = probeOnce(baseURL) + cache.set(key, pending) + } + return pending +} + +// Test-only: clears the in-process probe cache. Used by unit tests so they +// can re-probe without restarting the test runner. +export function _resetCache(): void { + cache.clear() +} diff --git a/packages/opencode/src/provider/provider.ts b/packages/opencode/src/provider/provider.ts index 063e2800d167..43105b66aa5b 100644 --- a/packages/opencode/src/provider/provider.ts +++ b/packages/opencode/src/provider/provider.ts @@ -24,6 +24,7 @@ import { AppFileSystem } from "@opencode-ai/core/filesystem" import { isRecord } from "@/util/record" import { optionalOmitUndefined } from "@opencode-ai/core/schema" import * as ProviderTransform from "./transform" +import { CapabilityProbe } from "./capability-probe" import { ModelID, ProviderID } from "./schema" import { ModelStatus } from "./model-status" import { RuntimeFlags } from "@/effect/runtime-flags" @@ -870,6 +871,12 @@ const ProviderCapabilities = Schema.Struct({ input: ProviderModalities, output: ProviderModalities, interleaved: ProviderInterleaved, + // Trailing-assistant ("prefill") support. See models.dev Model.prefill for + // the per-family rationale. Undefined = true (backwards-compatible default). + // Read via `canAcceptTrailingAssistant(model)` rather than checking this + // field directly, so the undefined-default and provider-level inference + // live in one place. + prefill: Schema.optional(Schema.Boolean), }) const ProviderCacheCost = Schema.Struct({ @@ -1083,6 +1090,7 @@ function fromModelsDevModel(provider: ModelsDev.Provider, model: ModelsDev.Model pdf: model.modalities?.output?.includes("pdf") ?? false, }, interleaved: model.interleaved ?? false, + prefill: model.prefill, }, release_date: model.release_date ?? 
"", variants: {}, @@ -1267,6 +1275,21 @@ export const layer = Layer.effect( models: existing?.models ?? {}, } + // Auto-detect prefill/reasoning by probing the live OpenAI-compatible + // server (llama.cpp `/props` endpoint). The probe inspects the active + // chat template for the `enable_thinking` branch that produces the + // trailing-assistant 400. Fail-silent for non-llama.cpp upstreams, + // cached per base URL. See CapabilityProbe for details. + const providerNpm = provider.npm ?? modelsDev[providerID]?.npm + const probeBaseURL = + providerNpm === "@ai-sdk/openai-compatible" || !providerNpm + ? (parsed.options as { baseURL?: unknown })?.baseURL + : undefined + const probed: CapabilityProbe.ProbedCapabilities = + typeof probeBaseURL === "string" && probeBaseURL + ? yield* Effect.promise(() => CapabilityProbe.probe(probeBaseURL)) + : {} + for (const [modelID, model] of Object.entries(provider.models ?? {})) { const existingModel = parsed.models[model.id ?? modelID] const apiID = model.id ?? existingModel?.api.id ?? modelID @@ -1293,7 +1316,7 @@ export const layer = Layer.effect( providerID: ProviderID.make(providerID), capabilities: { temperature: model.temperature ?? existingModel?.capabilities.temperature ?? false, - reasoning: model.reasoning ?? existingModel?.capabilities.reasoning ?? false, + reasoning: model.reasoning ?? existingModel?.capabilities.reasoning ?? probed.reasoning ?? false, attachment: model.attachment ?? existingModel?.capabilities.attachment ?? false, toolcall: model.tool_call ?? existingModel?.capabilities.toolcall ?? true, input: { @@ -1319,6 +1342,7 @@ export const layer = Layer.effect( (!existingModel && apiNpm === "@ai-sdk/openai-compatible" && apiID.includes("deepseek") ? { field: "reasoning_content" } : false), + prefill: model.prefill ?? existingModel?.capabilities.prefill ?? probed.prefill, }, cost: { input: model?.cost?.input ?? existingModel?.cost?.input ?? 
0, diff --git a/packages/opencode/src/provider/transform.ts b/packages/opencode/src/provider/transform.ts index c8dbe6117055..b0c01092d142 100644 --- a/packages/opencode/src/provider/transform.ts +++ b/packages/opencode/src/provider/transform.ts @@ -21,6 +21,27 @@ export function sanitizeSurrogates(content: string) { return content.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(? { + globalThis.fetch = originalFetch + CapabilityProbe._resetCache() +}) + +function mockFetch(impl: (url: string) => Response | Promise) { + globalThis.fetch = ((input: RequestInfo | URL) => { + const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url + return Promise.resolve(impl(url)) + }) as typeof fetch +} + +describe("CapabilityProbe.probe", () => { + test("detects prefill=false and reasoning=true when chat_template contains enable_thinking", async () => { + mockFetch((url) => { + expect(url).toBe("http://localhost:8080/props") + return new Response( + JSON.stringify({ + chat_template: + "{%- if enable_thinking is defined and enable_thinking is false %}{%- else %}{%- endif %}", + }), + { status: 200 }, + ) + }) + const result = await CapabilityProbe.probe("http://localhost:8080/v1") + expect(result).toEqual({ prefill: false, reasoning: true }) + }) + + test("strips trailing /v1 from baseURL to find /props", async () => { + let called = "" + mockFetch((url) => { + called = url + return new Response("{}", { status: 200 }) + }) + await CapabilityProbe.probe("http://localhost:8080/v1/") + expect(called).toBe("http://localhost:8080/props") + }) + + test("handles baseURL without /v1 suffix", async () => { + let called = "" + mockFetch((url) => { + called = url + return new Response("{}", { status: 200 }) + }) + await CapabilityProbe.probe("http://localhost:8080") + expect(called).toBe("http://localhost:8080/props") + }) + + test("returns empty when /props is not present (404)", async () => { + mockFetch(() => new Response("Not Found", { status: 404 
})) + const result = await CapabilityProbe.probe("http://localhost:8080/v1") + expect(result).toEqual({}) + }) + + test("returns empty when chat_template lacks enable_thinking", async () => { + mockFetch( + () => + new Response(JSON.stringify({ chat_template: "<|user|>{{ messages }}<|assistant|>" }), { + status: 200, + }), + ) + const result = await CapabilityProbe.probe("http://localhost:8080/v1") + expect(result).toEqual({}) + }) + + test("detects reasoning=true from supports_preserve_reasoning even when chat_template is missing", async () => { + mockFetch( + () => + new Response(JSON.stringify({ chat_template_caps: { supports_preserve_reasoning: true } }), { + status: 200, + }), + ) + const result = await CapabilityProbe.probe("http://localhost:8080/v1") + expect(result.reasoning).toBe(true) + // No prefill signal from supports_preserve_reasoning alone — only chat_template can determine that + expect(result.prefill).toBeUndefined() + }) + + test("fails silent on network error", async () => { + mockFetch(() => { + throw new Error("ECONNREFUSED") + }) + const result = await CapabilityProbe.probe("http://localhost:8080/v1") + expect(result).toEqual({}) + }) + + test("fails silent on invalid JSON", async () => { + mockFetch(() => new Response("not-json", { status: 200 })) + const result = await CapabilityProbe.probe("http://localhost:8080/v1") + expect(result).toEqual({}) + }) + + test("returns empty for empty baseURL", async () => { + let called = false + mockFetch(() => { + called = true + return new Response("{}", { status: 200 }) + }) + const result = await CapabilityProbe.probe("") + expect(result).toEqual({}) + expect(called).toBe(false) + }) + + test("caches result per base URL — second call does not hit network", async () => { + let calls = 0 + mockFetch(() => { + calls++ + return new Response(JSON.stringify({ chat_template: "enable_thinking" }), { status: 200 }) + }) + const a = await CapabilityProbe.probe("http://localhost:8080/v1") + const b = await 
CapabilityProbe.probe("http://localhost:8080/v1") + expect(calls).toBe(1) + expect(a).toEqual(b) + }) + + test("normalises trailing slashes for cache hits", async () => { + let calls = 0 + mockFetch(() => { + calls++ + return new Response("{}", { status: 200 }) + }) + await CapabilityProbe.probe("http://localhost:8080/v1") + await CapabilityProbe.probe("http://localhost:8080/v1/") + await CapabilityProbe.probe("http://localhost:8080") + expect(calls).toBe(1) + }) +}) diff --git a/packages/opencode/test/provider/transform.test.ts b/packages/opencode/test/provider/transform.test.ts index 90e2a177fee2..6049dadd8eb0 100644 --- a/packages/opencode/test/provider/transform.test.ts +++ b/packages/opencode/test/provider/transform.test.ts @@ -3809,3 +3809,74 @@ describe("ProviderTransform.providerOptions - ai-gateway-provider", () => { expect(result).toEqual({ openaiCompatible: { reasoningEffort: "high" } }) }) }) + +// canAcceptTrailingAssistant drives: +// - prompt.ts:1828 MAX_STEPS message role (assistant vs. user) +// - any future caller that wants to know whether trailing-assistant is safe +// +// The matrix below uses one representative per thinking-model family rather +// than one test per concrete model. Adding a new family later means adding +// one row, not maintaining N copies. See models.dev Model.prefill comment +// for the full per-family list. +describe("ProviderTransform.canAcceptTrailingAssistant", () => { + const baseCaps = { + temperature: true, + attachment: false, + toolcall: true, + input: { text: true, audio: false, image: false, video: false, pdf: false }, + output: { text: true, audio: false, image: false, video: false, pdf: false }, + interleaved: false, + } + const make = (npm: string, reasoning: boolean, prefill?: boolean) => + ({ + id: ModelID.make("test/model"), + providerID: ProviderID.make("test"), + name: "test", + api: { id: "test", url: "http://localhost", npm }, + capabilities: { ...baseCaps, reasoning, ...(prefill === undefined ? 
{} : { prefill }) }, + cost: { input: 0, output: 0, cache: { read: 0, write: 0 } }, + limit: { context: 1, output: 1 }, + status: "active", + options: {}, + headers: {}, + release_date: "", + }) as any + + test("explicit prefill=false wins over any inference", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/anthropic", true, false))).toBe(false) + }) + + test("explicit prefill=true wins over openai-compatible+reasoning auto-false", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai-compatible", true, true))).toBe(true) + }) + + describe("auto-inference (no explicit capability)", () => { + // Representatives of the thinking-on-default class — every 2025-2026 + // open-weight thinking family hits this regardless of model ID. + test("openai-compatible + reasoning -> false (Qwen3/3.5/3.6, DeepSeek-R1, GLM-thinking, Kimi-K2, MiniMax-M2, QwQ class)", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai-compatible", true))).toBe(false) + }) + + test("openai-compatible WITHOUT reasoning -> true (Qwen3-Coder, Qwen2.5 class)", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai-compatible", false))).toBe(true) + }) + + // Non-openai-compatible packages have native handling for thinking; + // prefill is generally safe (and Anthropic relies on it). 
+ test("anthropic + reasoning -> true (prefill is a Claude feature)", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/anthropic", true))).toBe(true) + }) + + test("openai (not -compatible) + reasoning -> true", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai", true))).toBe(true) + }) + + test("google + reasoning -> true", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/google", true))).toBe(true) + }) + + test("bedrock + reasoning -> true (uses Anthropic surface)", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/amazon-bedrock", true))).toBe(true) + }) + }) +})