From d1c50c5270226b6679a6fe114b8da3a428bf6653 Mon Sep 17 00:00:00 2001
From: feanor5555 <2073406+feanor5555@users.noreply.github.com>
Date: Sat, 16 May 2026 17:39:17 +0000
Subject: [PATCH 1/2] core: add Model.prefill capability for
 trailing-assistant support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Anthropic-style providers accept (and rely on) an assistant message as
the last turn in a conversation ("response continuation" / "prefill"
for tool-use continuation). Most thinking-on-by-default chat templates
reject it outright: llama.cpp returns HTTP 400 "Assistant response
prefill is incompatible with enable_thinking" on Qwen3-family
templates, and vLLM/TGI behave equivalently for DeepSeek-R1, GLM-4.6
thinking, Kimi-K2-Thinking, etc.

A first-class `prefill: boolean` on Model lets every host (opencode,
mastra, others) consult one canonical source of truth instead of
guessing from npm package + reasoning flag.

- packages/core/src/models.ts: add optional prefill field on Model with
  a per-family list of templates known to reject prefill (Qwen3
  hybrid/3.5/3.6/Thinking-2507/VL, QwQ, DeepSeek-R1/R1-0528/V4,
  GLM-4.6/4.7-thinking, Kimi-K2-Thinking, MiniMax-M2).
- packages/opencode/src/config/provider.ts: mirror the field on the
  user-facing config schema with an annotation describing when to set
  it (and what the auto-default is for openai-compatible + reasoning).

Default (undefined) is treated as `true` to keep all existing models
unaffected. Consumer-side logic lives in a follow-up PR.

Sister-PR to a sst/models.dev data PR that will populate prefill: false
on the affected per-model entries.
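
Illustrative sketch of the intended consumer-side read (helper name
hypothetical; the real consumer logic lands in the follow-up PR):

    // prefill is tri-state: undefined means "unknown", which consumers
    // must read as true so existing models keep trailing-assistant support
    function prefillAllowed(model: { prefill?: boolean }): boolean {
      return model.prefill ?? true
    }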
" + + "Set false for thinking-on-by-default templates whose chat template " + + "rejects trailing-assistant (Qwen3 hybrid/3.5/3.6, QwQ, DeepSeek-R1, " + + "GLM-4.6/4.7 thinking, Kimi-K2-Thinking, MiniMax-M2). Defaults to " + + "true for non-openai-compatible providers, false for openai-compatible " + + "with reasoning enabled.", + }), cost: Schema.optional( Schema.Struct({ input: Schema.Finite, From 4d56c704bab819c2a89b5a652eb29a27c315df3d Mon Sep 17 00:00:00 2001 From: feanor5555 <2073406+feanor5555@users.noreply.github.com> Date: Sat, 16 May 2026 17:41:32 +0000 Subject: [PATCH 2/2] provider: consume Model.prefill + runtime-probe llama.cpp templates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the remaining ~25% of trailing-assistant 400s on llama.cpp / vLLM / TGI that an empty-content filter alone cannot reach. The MAX_STEPS prefill in session/prompt.ts is non-empty by design (it delivers a user-visible "wrap up" instruction), so it survives the empty filter and trips the same template-incompat 400. Three coordinated changes: 1. ProviderTransform.canAcceptTrailingAssistant(model) — new helper. Three-layer precedence: (a) explicit model.capabilities.prefill wins (from models.dev or user config), (b) auto-inference: @ai-sdk/openai-compatible + reasoning:true → false (covers every known 2025-2026 thinking family even before models.dev ships explicit values), (c) default true (backwards compatible — Anthropic, Bedrock, OpenAI, Google etc. unchanged). 2. session/prompt.ts MAX_STEPS routing now consults the helper: role:"assistant" for prefill-capable providers, role:"user" for the rest. Thinking stays enabled in the request body — only the role of the synthetic wrap-up message changes from `assistant` to `user`, so the model still thinks and writes its summary normally. 3. CapabilityProbe — runtime detection for self-hosted openai-compatible servers. llama.cpp's `/props` endpoint exposes the active chat template; templates that branch on `enable_thinking` are exactly the ones that reject prefill. The probe runs once per base URL (cached), fail-silent (vLLM/TGI/mistral.rs have no /props and fall through to the auto-inference path), short-timeout (1.5s). User can always override per-model via opencode.json: { "provider": { "my-llamacpp": { "models": { "qwen3.5-coder": { "reasoning": true, "prefill": false } } } } } Affected behaviour: - Anthropic, Bedrock, OpenAI, Google — unchanged (prefill stays available). - Thinking-on local models (Qwen3, DeepSeek-R1, GLM-thinking, Kimi-K2-Thinking, MiniMax-M2): MAX_STEPS arrives as a user message. Same instruction, same wrap-up behaviour, no template rejection. Tests: - transform.test.ts: 8-case canAcceptTrailingAssistant matrix (explicit-overrides-everything, auto-inference for openai-compatible + reasoning class, unchanged defaults for Anthropic/OpenAI/Google/ Bedrock representatives). - capability-probe.test.ts: 11 cases for the runtime probe (enable_thinking detection, /v1-suffix normalisation, 404 fallback, network-error fallback, empty baseURL, per-URL cache). Real-world benchmark against an echomodus-sized Spring Boot project on llama.cpp + Qwen3.5-9B with --reasoning on: - Without this PR: 2.0 prefill-400s per run (3/3 runs). - With this PR + reasoning:true in user config: 0 errors (3/3). - With this PR + auto-probe (no user config): 0 errors (3/3). Common misunderstanding: prefill:false does NOT disable thinking. 

Affected behaviour:

- Anthropic, Bedrock, OpenAI, Google — unchanged (prefill stays
  available).
- Thinking-on local models (Qwen3, DeepSeek-R1, GLM-thinking,
  Kimi-K2-Thinking, MiniMax-M2): MAX_STEPS arrives as a user message.
  Same instruction, same wrap-up behaviour, no template rejection.

Tests:

- transform.test.ts: 8-case canAcceptTrailingAssistant matrix
  (explicit-overrides-everything, auto-inference for openai-compatible
  + reasoning class, unchanged defaults for Anthropic/OpenAI/Google/
  Bedrock representatives).
- capability-probe.test.ts: 11 cases for the runtime probe
  (enable_thinking detection, /v1-suffix normalisation, 404 fallback,
  network-error fallback, empty baseURL, per-URL cache).

Real-world benchmark against an echomodus-sized Spring Boot project on
llama.cpp + Qwen3.5-9B with --reasoning on:

- Without this PR: an average of 2.0 prefill 400s per run (3/3 runs).
- With this PR + reasoning:true in user config: 0 errors (3/3).
- With this PR + auto-probe (no user config): 0 errors (3/3).

Common misunderstanding: prefill:false does NOT disable thinking.
Thinking stays on for the whole request — only the role of the
synthetic MAX_STEPS message changes from `assistant` to `user`. The
model still thinks and writes its wrap-up normally.
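
Sketch of the role swap at the MAX_STEPS call site (message-building
names hypothetical; only the role is derived from the helper):

    const role = ProviderTransform.canAcceptTrailingAssistant(model)
      ? "assistant" // prefill-capable: continue the assistant turn
      : "user"      // thinking template: same instruction, safe role
    msgs.push({ role, content: wrapUpInstruction }) // content unchanged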

Builds on the Model.prefill capability introduced in the previous
commit.

Sister-PR-1 (filter empty assistant content for
@ai-sdk/openai-compatible) handles the orthogonal empty-trailing case;
this PR handles the non-empty trailing case.
---
 .../opencode/src/provider/capability-probe.ts |  91 ++++++++++++
 packages/opencode/src/provider/provider.ts    |  26 +++-
 packages/opencode/src/provider/transform.ts   |  21 +++
 packages/opencode/src/session/prompt.ts       |  20 ++-
 .../test/provider/capability-probe.test.ts    | 132 ++++++++++++++++++
 .../opencode/test/provider/transform.test.ts  |  71 ++++++++++
 6 files changed, 359 insertions(+), 2 deletions(-)
 create mode 100644 packages/opencode/src/provider/capability-probe.ts
 create mode 100644 packages/opencode/test/provider/capability-probe.test.ts

diff --git a/packages/opencode/src/provider/capability-probe.ts b/packages/opencode/src/provider/capability-probe.ts
new file mode 100644
index 000000000000..a06a38bbd2f6
--- /dev/null
+++ b/packages/opencode/src/provider/capability-probe.ts
@@ -0,0 +1,91 @@
+export * as CapabilityProbe from "./capability-probe"
+
+// Runtime detection of OpenAI-compatible server capabilities.
+//
+// Self-hosted llama.cpp servers expose `/props` with the active chat
+// template. Templates that branch on `enable_thinking` (Qwen3 hybrid,
+// Qwen3.5, Qwen3.6, QwQ, DeepSeek-R1, GLM-4.6/4.7-thinking,
+// Kimi-K2-Thinking, MiniMax-M2, etc.) reject trailing-assistant prefill at
+// runtime with `HTTP 400 "Assistant response prefill is incompatible with
+// enable_thinking"` (llama.cpp#20861, mastra-ai#15234).
+//
+// Probing the live template removes the need for per-family name lists in
+// models.dev or user config: any server whose template branches on
+// `enable_thinking` is detected automatically, including future thinking
+// families.
+//
+// The probe is keyed by base URL, fail-silent (vLLM/TGI/mistral.rs have no
+// `/props` endpoint — they fall through to existing detection),
+// short-timeout (1.5s), and cached per process, so we hit the network at
+// most once per base URL.
+
+export type ProbedCapabilities = {
+  prefill?: boolean
+  reasoning?: boolean
+}
+
+const PROBE_TIMEOUT_MS = 1500
+const cache = new Map<string, Promise<ProbedCapabilities>>()
+
+// Normalises a baseURL ("http://host/v1/", "http://host", "http://host/v1")
+// to the server root the /props endpoint lives under.
+function rootURL(baseURL: string): string {
+  return baseURL.replace(/\/v1\/?$/, "").replace(/\/+$/, "")
+}
+
+async function probeOnce(baseURL: string): Promise<ProbedCapabilities> {
+  const root = rootURL(baseURL)
+  if (!root) return {}
+
+  const result: ProbedCapabilities = {}
+  const ctrl = new AbortController()
+  const timer = setTimeout(() => ctrl.abort(), PROBE_TIMEOUT_MS)
+  try {
+    const resp = await fetch(`${root}/props`, { signal: ctrl.signal })
+    if (!resp.ok) return {}
+    const data = (await resp.json()) as { chat_template?: unknown; chat_template_caps?: { supports_preserve_reasoning?: unknown } }
+
+    // Primary signal: the chat template branches on `enable_thinking`. This
+    // is the exact condition that produces the prefill-incompatible 400 — it
+    // means the template adds `<think>` differently depending on whether
+    // generation_prompt is requested, and a trailing-assistant turn (no
+    // generation_prompt) lands in the path that conflicts with reasoning.
+    if (typeof data.chat_template === "string" && data.chat_template.includes("enable_thinking")) {
+      result.prefill = false
+      result.reasoning = true
+    }
+
+    // Secondary signal: llama.cpp also exposes `supports_preserve_reasoning`
+    // on chat_template_caps for thinking templates. This catches a few edge
+    // templates that don't use the literal `enable_thinking` keyword.
+    if (data.chat_template_caps?.supports_preserve_reasoning === true) {
+      result.reasoning = true
+    }
+  } catch {
+    // Probe failed: server has no /props (vLLM/TGI/mistral.rs), is offline,
+    // or timed out. Fall back to other detection paths silently.
+  } finally {
+    clearTimeout(timer)
+  }
+  return result
+}
+
+// Returns probed capabilities for the given openai-compatible base URL.
+// Result is cached per base URL for the process lifetime; concurrent callers
+// share the same in-flight probe.
+export function probe(baseURL: string): Promise<ProbedCapabilities> {
+  if (!baseURL) return Promise.resolve({})
+  const key = rootURL(baseURL)
+  let pending = cache.get(key)
+  if (!pending) {
+    pending = probeOnce(baseURL)
+    cache.set(key, pending)
+  }
+  return pending
+}
+
+// Test-only: clears the in-process probe cache. Used by unit tests so they
+// can re-probe without restarting the test runner.
+export function _resetCache(): void {
+  cache.clear()
+}
diff --git a/packages/opencode/src/provider/provider.ts b/packages/opencode/src/provider/provider.ts
index 063e2800d167..43105b66aa5b 100644
--- a/packages/opencode/src/provider/provider.ts
+++ b/packages/opencode/src/provider/provider.ts
@@ -24,6 +24,7 @@ import { AppFileSystem } from "@opencode-ai/core/filesystem"
 import { isRecord } from "@/util/record"
 import { optionalOmitUndefined } from "@opencode-ai/core/schema"
 import * as ProviderTransform from "./transform"
+import { CapabilityProbe } from "./capability-probe"
 import { ModelID, ProviderID } from "./schema"
 import { ModelStatus } from "./model-status"
 import { RuntimeFlags } from "@/effect/runtime-flags"
@@ -870,6 +871,12 @@ const ProviderCapabilities = Schema.Struct({
   input: ProviderModalities,
   output: ProviderModalities,
   interleaved: ProviderInterleaved,
+  // Trailing-assistant ("prefill") support. See Model.prefill in
+  // packages/core/src/models.ts for the per-family rationale. Undefined =
+  // true (backwards-compatible default). Read via
+  // `canAcceptTrailingAssistant(model)` rather than checking this field
+  // directly, so the undefined-default and provider-level inference live in one place.
+  prefill: Schema.optional(Schema.Boolean),
 })
 
 const ProviderCacheCost = Schema.Struct({
@@ -1083,6 +1090,7 @@ function fromModelsDevModel(provider: ModelsDev.Provider, model: ModelsDev.Model
       pdf: model.modalities?.output?.includes("pdf") ?? false,
     },
     interleaved: model.interleaved ?? false,
+    prefill: model.prefill,
   },
   release_date: model.release_date ?? "",
   variants: {},
@@ -1267,6 +1275,21 @@ export const layer = Layer.effect(
           models: existing?.models ?? {},
         }
 
+        // Auto-detect prefill/reasoning by probing the live OpenAI-compatible
+        // server (llama.cpp `/props` endpoint). The probe inspects the active
+        // chat template for the `enable_thinking` branch that produces the
+        // trailing-assistant 400. Fail-silent for non-llama.cpp upstreams,
+        // cached per base URL. See CapabilityProbe for details.
+        const providerNpm = provider.npm ?? modelsDev[providerID]?.npm
+        const probeBaseURL =
+          providerNpm === "@ai-sdk/openai-compatible" || !providerNpm
+            ? (parsed.options as { baseURL?: unknown })?.baseURL
+            : undefined
+        const probed: CapabilityProbe.ProbedCapabilities =
+          typeof probeBaseURL === "string" && probeBaseURL
+            ? yield* Effect.promise(() => CapabilityProbe.probe(probeBaseURL))
+            : {}
+
         for (const [modelID, model] of Object.entries(provider.models ?? {})) {
           const existingModel = parsed.models[model.id ?? modelID]
           const apiID = model.id ?? existingModel?.api.id ?? modelID
@@ -1293,7 +1316,7 @@ export const layer = Layer.effect(
             providerID: ProviderID.make(providerID),
             capabilities: {
               temperature: model.temperature ?? existingModel?.capabilities.temperature ?? false,
-              reasoning: model.reasoning ?? existingModel?.capabilities.reasoning ?? false,
+              reasoning: model.reasoning ?? existingModel?.capabilities.reasoning ?? probed.reasoning ?? false,
               attachment: model.attachment ?? existingModel?.capabilities.attachment ?? false,
               toolcall: model.tool_call ?? existingModel?.capabilities.toolcall ?? true,
               input: {
@@ -1319,6 +1342,7 @@ export const layer = Layer.effect(
                 (!existingModel && apiNpm === "@ai-sdk/openai-compatible" && apiID.includes("deepseek")
                   ? { field: "reasoning_content" }
                   : false),
+              prefill: model.prefill ?? existingModel?.capabilities.prefill ?? probed.prefill,
             },
             cost: {
               input: model?.cost?.input ?? existingModel?.cost?.input ?? 0,
diff --git a/packages/opencode/src/provider/transform.ts b/packages/opencode/src/provider/transform.ts
index c8dbe6117055..b0c01092d142 100644
--- a/packages/opencode/src/provider/transform.ts
+++ b/packages/opencode/src/provider/transform.ts
@@ -21,6 +21,27 @@
 export function sanitizeSurrogates(content: string) {
   return content.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, "")
 }
+
+// Whether the model accepts an assistant turn as the LAST message in the
+// conversation ("prefill" / "response continuation"). Three-layer
+// precedence:
+//
+//   1. An explicit capability (from models.dev or user config) always wins.
+//   2. Auto-inference: @ai-sdk/openai-compatible with reasoning enabled is
+//      treated as prefill-incompatible — this covers the
+//      thinking-on-by-default template families (Qwen3 hybrid/3.5/3.6, QwQ,
+//      DeepSeek-R1, GLM-thinking, Kimi-K2-Thinking, MiniMax-M2) even before
+//      models.dev ships explicit values.
+//   3. Default true — Anthropic, Bedrock, OpenAI, Google etc. are
+//      unchanged.
+export function canAcceptTrailingAssistant(model: {
+  api: { npm?: string }
+  capabilities: { reasoning: boolean; prefill?: boolean }
+}): boolean {
+  if (model.capabilities.prefill !== undefined) return model.capabilities.prefill
+  if (model.api.npm === "@ai-sdk/openai-compatible" && model.capabilities.reasoning) return false
+  return true
+}
diff --git a/packages/opencode/test/provider/capability-probe.test.ts b/packages/opencode/test/provider/capability-probe.test.ts
new file mode 100644
--- /dev/null
+++ b/packages/opencode/test/provider/capability-probe.test.ts
@@ -0,0 +1,132 @@
+import { afterEach, describe, expect, test } from "bun:test"
+
+import { CapabilityProbe } from "../../src/provider/capability-probe"
+
+const originalFetch = globalThis.fetch
+
+afterEach(() => {
+  globalThis.fetch = originalFetch
+  CapabilityProbe._resetCache()
+})
+
+function mockFetch(impl: (url: string) => Response | Promise<Response>) {
+  globalThis.fetch = ((input: RequestInfo | URL) => {
+    const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url
+    return Promise.resolve(impl(url))
+  }) as typeof fetch
+}
+
+describe("CapabilityProbe.probe", () => {
+  test("detects prefill=false and reasoning=true when chat_template contains enable_thinking", async () => {
+    mockFetch((url) => {
+      expect(url).toBe("http://localhost:8080/props")
+      return new Response(
+        JSON.stringify({
+          chat_template:
+            "{%- if enable_thinking is defined and enable_thinking is false %}{%- else %}{%- endif %}",
+        }),
+        { status: 200 },
+      )
+    })
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result).toEqual({ prefill: false, reasoning: true })
+  })
+
+  test("strips trailing /v1 from baseURL to find /props", async () => {
+    let called = ""
+    mockFetch((url) => {
+      called = url
+      return new Response("{}", { status: 200 })
+    })
+    await CapabilityProbe.probe("http://localhost:8080/v1/")
+    expect(called).toBe("http://localhost:8080/props")
+  })
+
+  test("handles baseURL without /v1 suffix", async () => {
+    let called = ""
+    mockFetch((url) => {
+      called = url
+      return new Response("{}", { status: 200 })
+    })
+    await CapabilityProbe.probe("http://localhost:8080")
+    expect(called).toBe("http://localhost:8080/props")
+  })
+
+  test("returns empty when /props is not present (404)", async () => {
+    mockFetch(() => new Response("Not Found", { status: 404 }))
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result).toEqual({})
+  })
+
+  test("returns empty when chat_template lacks enable_thinking", async () => {
+    mockFetch(
+      () =>
+        new Response(JSON.stringify({ chat_template: "<|user|>{{ messages }}<|assistant|>" }), {
+          status: 200,
+        }),
+    )
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result).toEqual({})
+  })
+
+  test("detects reasoning=true from supports_preserve_reasoning even when chat_template is missing", async () => {
+    mockFetch(
+      () =>
+        new Response(JSON.stringify({ chat_template_caps: { supports_preserve_reasoning: true } }), {
+          status: 200,
+        }),
+    )
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result.reasoning).toBe(true)
+    // No prefill signal from supports_preserve_reasoning alone — only
+    // chat_template can determine that.
+    expect(result.prefill).toBeUndefined()
+  })
+
+  test("fails silent on network error", async () => {
+    mockFetch(() => {
+      throw new Error("ECONNREFUSED")
+    })
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result).toEqual({})
+  })
+
+  test("fails silent on invalid JSON", async () => {
+    mockFetch(() => new Response("not-json", { status: 200 }))
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result).toEqual({})
+  })
+
+  test("returns empty for empty baseURL", async () => {
+    let called = false
+    mockFetch(() => {
+      called = true
+      return new Response("{}", { status: 200 })
+    })
+    const result = await CapabilityProbe.probe("")
+    expect(result).toEqual({})
+    expect(called).toBe(false)
+  })
+
+  test("caches result per base URL — second call does not hit network", async () => {
+    let calls = 0
+    mockFetch(() => {
+      calls++
+      return new Response(JSON.stringify({ chat_template: "enable_thinking" }), { status: 200 })
+    })
+    const a = await CapabilityProbe.probe("http://localhost:8080/v1")
+    const b = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(calls).toBe(1)
+    expect(a).toEqual(b)
+  })
+
+  test("normalises trailing slashes for cache hits", async () => {
+    let calls = 0
+    mockFetch(() => {
+      calls++
+      return new Response("{}", { status: 200 })
+    })
+    await CapabilityProbe.probe("http://localhost:8080/v1")
+    await CapabilityProbe.probe("http://localhost:8080/v1/")
+    await CapabilityProbe.probe("http://localhost:8080")
+    expect(calls).toBe(1)
+  })
+})
diff --git a/packages/opencode/test/provider/transform.test.ts b/packages/opencode/test/provider/transform.test.ts
index 90e2a177fee2..6049dadd8eb0 100644
--- a/packages/opencode/test/provider/transform.test.ts
+++ b/packages/opencode/test/provider/transform.test.ts
@@ -3809,3 +3809,74 @@ describe("ProviderTransform.providerOptions - ai-gateway-provider", () => {
     expect(result).toEqual({ openaiCompatible: { reasoningEffort: "high" } })
   })
 })
+
+// canAcceptTrailingAssistant drives:
+// - prompt.ts:1828 MAX_STEPS message role (assistant vs. user)
+// - any future caller that wants to know whether trailing-assistant is safe
+//
+// The matrix below uses one representative per thinking-model family rather
+// than one test per concrete model. Adding a new family later means adding
+// one row, not maintaining N copies. See the Model.prefill comment in
+// packages/core/src/models.ts for the full per-family list.
+describe("ProviderTransform.canAcceptTrailingAssistant", () => {
+  const baseCaps = {
+    temperature: true,
+    attachment: false,
+    toolcall: true,
+    input: { text: true, audio: false, image: false, video: false, pdf: false },
+    output: { text: true, audio: false, image: false, video: false, pdf: false },
+    interleaved: false,
+  }
+  const make = (npm: string, reasoning: boolean, prefill?: boolean) =>
+    ({
+      id: ModelID.make("test/model"),
+      providerID: ProviderID.make("test"),
+      name: "test",
+      api: { id: "test", url: "http://localhost", npm },
+      capabilities: { ...baseCaps, reasoning, ...(prefill === undefined ? {} : { prefill }) },
+      cost: { input: 0, output: 0, cache: { read: 0, write: 0 } },
+      limit: { context: 1, output: 1 },
+      status: "active",
+      options: {},
+      headers: {},
+      release_date: "",
+    }) as any
+
+  test("explicit prefill=false wins over any inference", () => {
+    expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/anthropic", true, false))).toBe(false)
+  })
+
+  test("explicit prefill=true wins over openai-compatible+reasoning auto-false", () => {
+    expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai-compatible", true, true))).toBe(true)
+  })
+
+  describe("auto-inference (no explicit capability)", () => {
+    // Representatives of the thinking-on-default class — every 2025-2026
+    // open-weight thinking family hits this regardless of model ID.
+    test("openai-compatible + reasoning -> false (Qwen3/3.5/3.6, DeepSeek-R1, GLM-thinking, Kimi-K2, MiniMax-M2, QwQ class)", () => {
+      expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai-compatible", true))).toBe(false)
+    })
+
+    test("openai-compatible WITHOUT reasoning -> true (Qwen3-Coder, Qwen2.5 class)", () => {
+      expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai-compatible", false))).toBe(true)
+    })
+
+    // Non-openai-compatible packages have native handling for thinking;
+    // prefill is generally safe (and Anthropic relies on it).
+ test("anthropic + reasoning -> true (prefill is a Claude feature)", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/anthropic", true))).toBe(true) + }) + + test("openai (not -compatible) + reasoning -> true", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai", true))).toBe(true) + }) + + test("google + reasoning -> true", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/google", true))).toBe(true) + }) + + test("bedrock + reasoning -> true (uses Anthropic surface)", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/amazon-bedrock", true))).toBe(true) + }) + }) +})