19 changes: 19 additions & 0 deletions packages/core/src/models.ts
@@ -57,6 +57,25 @@ export const Model = Schema.Struct({
}),
]),
),
// Whether the model's chat template accepts an assistant turn as the LAST
// message (a.k.a. "prefill" / "response continuation").
//
// Default (undefined) is treated as `true` for backwards compatibility.
//
// Set to `false` for thinking-on-by-default models whose chat template
// rejects trailing-assistant when thinking is enabled. Affected families
// (non-exhaustive, 2025-2026):
// - Qwen3 hybrid (all sizes), Qwen3-Thinking-2507, Qwen3-VL,
// Qwen3.5, Qwen3.6, QwQ-32B -> llama.cpp "Assistant response prefill
// is incompatible with enable_thinking" (ggml-org/llama.cpp#20861,
// #21889; mastra-ai/mastra#15234)
// - DeepSeek-R1 / R1-0528 / V4 (vllm-project/vllm#12999)
// - GLM-4.6 / 4.7 thinking (ggml-org/llama.cpp#15401)
// - Kimi-K2-Thinking, MiniMax-M2
//
// Qwen3-Coder, Qwen3-Instruct-2507, Qwen2.5 keep `true` — their templates
// do not branch on `enable_thinking`, so prefill is safe.
prefill: Schema.optional(Schema.Boolean),
cost: Schema.optional(Cost),
limit: Schema.Struct({
context: Schema.Finite,
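A minimal sketch of the intended read pattern for this field; the helper name `prefillOrDefault` is illustrative and not part of this change. The point is that `undefined` must fall back to `true`, so existing models.dev entries keep their current behaviour:

```ts
// Illustrative helper (assumed name, not introduced by this PR).
function prefillOrDefault(prefill: boolean | undefined): boolean {
  // undefined -> true preserves the backwards-compatible default described above
  return prefill ?? true
}

prefillOrDefault(undefined) // true: legacy entries that never set the field
prefillOrDefault(true)      // true: e.g. Qwen3-Coder, Qwen3-Instruct-2507, Qwen2.5
prefillOrDefault(false)     // false: e.g. Qwen3 hybrid, DeepSeek-R1, GLM-4.6 thinking
```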
9 changes: 9 additions & 0 deletions packages/opencode/src/config/provider.ts
@@ -19,6 +19,15 @@ export const Model = Schema.Struct({
}),
]),
),
prefill: Schema.optional(Schema.Boolean).annotate({
description:
"Whether the model accepts an assistant turn as the last message. " +
"Set false for thinking-on-by-default templates whose chat template " +
"rejects trailing-assistant (Qwen3 hybrid/3.5/3.6, QwQ, DeepSeek-R1, " +
"GLM-4.6/4.7 thinking, Kimi-K2-Thinking, MiniMax-M2). Defaults to " +
"true for non-openai-compatible providers, false for openai-compatible " +
"with reasoning enabled.",
}),
cost: Schema.optional(
Schema.Struct({
input: Schema.Finite,
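For orientation, a hedged sketch of what a per-model override could look like in provider config; the provider and model IDs and surrounding fields are placeholders, and only `prefill` plus `options.baseURL` are taken from this change:

```ts
// Hypothetical provider entry; IDs and unrelated fields are placeholders.
const localProvider = {
  npm: "@ai-sdk/openai-compatible",
  options: { baseURL: "http://localhost:8080/v1" }, // probed for /props (see capability-probe.ts below)
  models: {
    "qwen3-32b": {
      // Explicit override: skip inference and probing, never send a trailing assistant turn.
      prefill: false,
    },
  },
}
```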
91 changes: 91 additions & 0 deletions packages/opencode/src/provider/capability-probe.ts
@@ -0,0 +1,91 @@
export * as CapabilityProbe from "./capability-probe"

// Runtime detection of OpenAI-compatible server capabilities.
//
// Self-hosted llama.cpp servers expose `<root>/props` with the active chat
// template. Templates that branch on `enable_thinking` (Qwen3 hybrid, Qwen3.5,
// Qwen3.6, QwQ, DeepSeek-R1, GLM-4.6/4.7-thinking, Kimi-K2-Thinking,
// MiniMax-M2, etc.) reject trailing-assistant prefill at runtime with
// `HTTP 400 "Assistant response prefill is incompatible with enable_thinking"`
// (llama.cpp#20861, mastra-ai#15234).
//
// Probing the live template removes the need for per-family name lists in
// models.dev or user config: any server whose template branches on
// `enable_thinking` is detected automatically, including future thinking
// families.
//
// The probe is opt-in by base URL, fail-silent (vLLM/TGI/mistral.rs have no
// `/props` endpoint — they fall through to existing detection), short-timeout
// (1.5s), and cached per process so we hit the network at most once per base
// URL.

export type ProbedCapabilities = {
prefill?: boolean
reasoning?: boolean
}

const PROBE_TIMEOUT_MS = 1500
const cache = new Map<string, Promise<ProbedCapabilities>>()

// Normalises a baseURL ("http://host/v1/", "http://host", "http://host/v1")
// to the server root the /props endpoint lives under.
function rootURL(baseURL: string): string {
return baseURL.replace(/\/v1\/?$/, "").replace(/\/+$/, "")
}

async function probeOnce(baseURL: string): Promise<ProbedCapabilities> {
const root = rootURL(baseURL)
if (!root) return {}

const result: ProbedCapabilities = {}
const ctrl = new AbortController()
const timer = setTimeout(() => ctrl.abort(), PROBE_TIMEOUT_MS)
try {
const resp = await fetch(`${root}/props`, { signal: ctrl.signal })
if (!resp.ok) return {}
const data = (await resp.json()) as { chat_template?: unknown; chat_template_caps?: { supports_preserve_reasoning?: unknown } }

// Primary signal: the chat template branches on `enable_thinking`. This
// is the exact condition that produces the prefill-incompatible 400 — it
// means the template adds `<think>` differently depending on whether
// generation_prompt is requested, and a trailing-assistant turn (no
// generation_prompt) lands in the path that conflicts with reasoning.
if (typeof data.chat_template === "string" && data.chat_template.includes("enable_thinking")) {
result.prefill = false
result.reasoning = true
}

// Secondary signal: llama.cpp also exposes `supports_preserve_reasoning`
// on chat_template_caps for thinking templates. This catches a few edge
// templates that don't use the literal `enable_thinking` keyword.
if (data.chat_template_caps?.supports_preserve_reasoning === true) {
result.reasoning = true
}
} catch {
// Probe failed: server has no /props (vLLM/TGI/mistral.rs), is offline,
// or timed out. Fall back to other detection paths silently.
} finally {
clearTimeout(timer)
}
return result
}

// Returns probed capabilities for the given openai-compatible base URL.
// Result is cached per base URL for the process lifetime; concurrent callers
// share the same in-flight probe.
export function probe(baseURL: string): Promise<ProbedCapabilities> {
if (!baseURL) return Promise.resolve({})
const key = rootURL(baseURL)
let pending = cache.get(key)
if (!pending) {
pending = probeOnce(baseURL)
cache.set(key, pending)
}
return pending
}

// Test-only: clears the in-process probe cache. Used by unit tests so they
// can re-probe without restarting the test runner.
export function _resetCache(): void {
cache.clear()
}
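A usage sketch, with a placeholder base URL and the same import alias the test file below uses: against a llama.cpp server whose active template branches on `enable_thinking`, the probe resolves to `{ prefill: false, reasoning: true }`; against a server without `/props` it resolves to `{}` and callers fall back to their existing defaults.

```ts
import { CapabilityProbe } from "@/provider/capability-probe"

// Placeholder URL for a self-hosted llama.cpp instance.
const caps = await CapabilityProbe.probe("http://localhost:8080/v1")

// Treat "no signal" as "prefill allowed", matching the backwards-compatible default.
const allowTrailingAssistant = caps.prefill ?? true
console.log(allowTrailingAssistant)
```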
26 changes: 25 additions & 1 deletion packages/opencode/src/provider/provider.ts
@@ -24,6 +24,7 @@ import { AppFileSystem } from "@opencode-ai/core/filesystem"
import { isRecord } from "@/util/record"
import { optionalOmitUndefined } from "@opencode-ai/core/schema"
import * as ProviderTransform from "./transform"
import { CapabilityProbe } from "./capability-probe"
import { ModelID, ProviderID } from "./schema"
import { ModelStatus } from "./model-status"
import { RuntimeFlags } from "@/effect/runtime-flags"
@@ -870,6 +871,12 @@ const ProviderCapabilities = Schema.Struct({
input: ProviderModalities,
output: ProviderModalities,
interleaved: ProviderInterleaved,
// Trailing-assistant ("prefill") support. See models.dev Model.prefill for
// the per-family rationale. Undefined = true (backwards-compatible default).
// Read via `canAcceptTrailingAssistant(model)` rather than checking this
// field directly, so the undefined-default and provider-level inference
// live in one place.
prefill: Schema.optional(Schema.Boolean),
})

const ProviderCacheCost = Schema.Struct({
@@ -1083,6 +1090,7 @@ function fromModelsDevModel(provider: ModelsDev.Provider, model: ModelsDev.Model
pdf: model.modalities?.output?.includes("pdf") ?? false,
},
interleaved: model.interleaved ?? false,
prefill: model.prefill,
},
release_date: model.release_date ?? "",
variants: {},
@@ -1267,6 +1275,21 @@ export const layer = Layer.effect(
models: existing?.models ?? {},
}

// Auto-detect prefill/reasoning by probing the live OpenAI-compatible
// server (llama.cpp `/props` endpoint). The probe inspects the active
// chat template for the `enable_thinking` branch that produces the
// trailing-assistant 400. Fail-silent for non-llama.cpp upstreams,
// cached per base URL. See CapabilityProbe for details.
const providerNpm = provider.npm ?? modelsDev[providerID]?.npm
const probeBaseURL =
providerNpm === "@ai-sdk/openai-compatible" || !providerNpm
? (parsed.options as { baseURL?: unknown })?.baseURL
: undefined
const probed: CapabilityProbe.ProbedCapabilities =
typeof probeBaseURL === "string" && probeBaseURL
? yield* Effect.promise(() => CapabilityProbe.probe(probeBaseURL))
: {}

for (const [modelID, model] of Object.entries(provider.models ?? {})) {
const existingModel = parsed.models[model.id ?? modelID]
const apiID = model.id ?? existingModel?.api.id ?? modelID
@@ -1293,7 +1316,7 @@ export const layer = Layer.effect(
providerID: ProviderID.make(providerID),
capabilities: {
temperature: model.temperature ?? existingModel?.capabilities.temperature ?? false,
reasoning: model.reasoning ?? existingModel?.capabilities.reasoning ?? false,
reasoning: model.reasoning ?? existingModel?.capabilities.reasoning ?? probed.reasoning ?? false,
attachment: model.attachment ?? existingModel?.capabilities.attachment ?? false,
toolcall: model.tool_call ?? existingModel?.capabilities.toolcall ?? true,
input: {
(!existingModel && apiNpm === "@ai-sdk/openai-compatible" && apiID.includes("deepseek")
? { field: "reasoning_content" }
: false),
prefill: model.prefill ?? existingModel?.capabilities.prefill ?? probed.prefill,
},
cost: {
input: model?.cost?.input ?? existingModel?.cost?.input ?? 0,
21 changes: 21 additions & 0 deletions packages/opencode/src/provider/transform.ts
@@ -21,6 +21,27 @@ export function sanitizeSurrogates(content: string) {
return content.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, "\uFFFD")
}

// Whether the model accepts an assistant message as the LAST turn in a
// request ("prefill" / "response continuation").
//
// Anthropic and Bedrock-Anthropic accept it (and rely on it for tool-use
// continuation). Most OpenAI-compatible servers do not when reasoning is
// enabled, because the chat template branches on `enable_thinking` and rejects
// a trailing assistant. See the per-family list in models.dev Model.prefill.
//
// Precedence:
// 1. Explicit `model.capabilities.prefill` (from models.dev / user config) wins.
// 2. Else: openai-compatible + reasoning-capable models default to false,
// because every known 2025-2026 open-weight thinking family hits the
// template incompat (Qwen3/3.5/3.6 hybrid + Thinking variants, QwQ,
// DeepSeek-R1, GLM-4.6/4.7-thinking, Kimi-K2-Thinking, MiniMax-M2).
// 3. Else: true.
export function canAcceptTrailingAssistant(model: Provider.Model): boolean {
if (model.capabilities.prefill !== undefined) return model.capabilities.prefill
if (model.api.npm === "@ai-sdk/openai-compatible" && model.capabilities.reasoning) return false
return true
}

// Maps npm package to the key the AI SDK expects for providerOptions
function sdkKey(npm: string): string | undefined {
switch (npm) {
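The same precedence restated as a self-contained sketch; `MiniModel` is a pared-down stand-in for `Provider.Model`, which carries more fields:

```ts
// Simplified stand-in for Provider.Model (illustration only).
type MiniModel = { api: { npm?: string }; capabilities: { prefill?: boolean; reasoning: boolean } }

function accepts(model: MiniModel): boolean {
  if (model.capabilities.prefill !== undefined) return model.capabilities.prefill // 1. explicit value wins
  if (model.api.npm === "@ai-sdk/openai-compatible" && model.capabilities.reasoning) return false // 2. inferred thinking template
  return true // 3. default: prefill allowed
}

accepts({ api: { npm: "@ai-sdk/openai-compatible" }, capabilities: { prefill: true, reasoning: true } }) // true: explicit override
accepts({ api: { npm: "@ai-sdk/openai-compatible" }, capabilities: { reasoning: true } })                // false: openai-compatible + reasoning
accepts({ api: { npm: "@ai-sdk/anthropic" }, capabilities: { reasoning: true } })                        // true: Anthropic-style prefill is supported
```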
20 changes: 19 additions & 1 deletion packages/opencode/src/session/prompt.ts
@@ -1825,7 +1825,25 @@ NOTE: At any point in time through this workflow you should feel free to ask the
sessionID,
parentSessionID: session.parentID,
system,
messages: [...modelMsgs, ...(isLastStep ? [{ role: "assistant" as const, content: MAX_STEPS }] : [])],
// On the final step we inject the MAX_STEPS instruction so the
// model wraps up instead of calling more tools. Anthropic-style
// providers accept this as an assistant-prefill ("response
// continuation"); thinking-on-by-default templates (most
// openai-compatible local servers — Qwen3 hybrid/3.5/3.6,
// DeepSeek-R1, GLM-thinking, etc.) reject any trailing-assistant
// outright. For those, deliver the same instruction as a user
// message so it reaches the model without tripping the template.
messages: [
...modelMsgs,
...(isLastStep
? [
{
role: ProviderTransform.canAcceptTrailingAssistant(model) ? ("assistant" as const) : ("user" as const),
content: MAX_STEPS,
},
]
: []),
],
tools,
model,
toolChoice: format.type === "json_schema" ? "required" : undefined,
132 changes: 132 additions & 0 deletions packages/opencode/test/provider/capability-probe.test.ts
@@ -0,0 +1,132 @@
import { afterEach, describe, expect, test } from "bun:test"
import { CapabilityProbe } from "@/provider/capability-probe"

const originalFetch = globalThis.fetch

afterEach(() => {
globalThis.fetch = originalFetch
CapabilityProbe._resetCache()
})

function mockFetch(impl: (url: string) => Response | Promise<Response>) {
globalThis.fetch = ((input: RequestInfo | URL) => {
const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url
return Promise.resolve(impl(url))
}) as typeof fetch
}

describe("CapabilityProbe.probe", () => {
test("detects prefill=false and reasoning=true when chat_template contains enable_thinking", async () => {
mockFetch((url) => {
expect(url).toBe("http://localhost:8080/props")
return new Response(
JSON.stringify({
chat_template:
"{%- if enable_thinking is defined and enable_thinking is false %}<think></think>{%- else %}<think>{%- endif %}",
}),
{ status: 200 },
)
})
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result).toEqual({ prefill: false, reasoning: true })
})

test("strips trailing /v1 from baseURL to find /props", async () => {
let called = ""
mockFetch((url) => {
called = url
return new Response("{}", { status: 200 })
})
await CapabilityProbe.probe("http://localhost:8080/v1/")
expect(called).toBe("http://localhost:8080/props")
})

test("handles baseURL without /v1 suffix", async () => {
let called = ""
mockFetch((url) => {
called = url
return new Response("{}", { status: 200 })
})
await CapabilityProbe.probe("http://localhost:8080")
expect(called).toBe("http://localhost:8080/props")
})

test("returns empty when /props is not present (404)", async () => {
mockFetch(() => new Response("Not Found", { status: 404 }))
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result).toEqual({})
})

test("returns empty when chat_template lacks enable_thinking", async () => {
mockFetch(
() =>
new Response(JSON.stringify({ chat_template: "<|user|>{{ messages }}<|assistant|>" }), {
status: 200,
}),
)
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result).toEqual({})
})

test("detects reasoning=true from supports_preserve_reasoning even when chat_template is missing", async () => {
mockFetch(
() =>
new Response(JSON.stringify({ chat_template_caps: { supports_preserve_reasoning: true } }), {
status: 200,
}),
)
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result.reasoning).toBe(true)
// No prefill signal from supports_preserve_reasoning alone — only chat_template can determine that
expect(result.prefill).toBeUndefined()
})

test("fails silent on network error", async () => {
mockFetch(() => {
throw new Error("ECONNREFUSED")
})
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result).toEqual({})
})

test("fails silent on invalid JSON", async () => {
mockFetch(() => new Response("not-json", { status: 200 }))
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result).toEqual({})
})

test("returns empty for empty baseURL", async () => {
let called = false
mockFetch(() => {
called = true
return new Response("{}", { status: 200 })
})
const result = await CapabilityProbe.probe("")
expect(result).toEqual({})
expect(called).toBe(false)
})

test("caches result per base URL — second call does not hit network", async () => {
let calls = 0
mockFetch(() => {
calls++
return new Response(JSON.stringify({ chat_template: "enable_thinking" }), { status: 200 })
})
const a = await CapabilityProbe.probe("http://localhost:8080/v1")
const b = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(calls).toBe(1)
expect(a).toEqual(b)
})

test("normalises trailing slashes for cache hits", async () => {
let calls = 0
mockFetch(() => {
calls++
return new Response("{}", { status: 200 })
})
await CapabilityProbe.probe("http://localhost:8080/v1")
await CapabilityProbe.probe("http://localhost:8080/v1/")
await CapabilityProbe.probe("http://localhost:8080")
expect(calls).toBe(1)
})
})