19 changes: 19 additions & 0 deletions packages/core/src/models.ts
@@ -57,6 +57,25 @@ export const Model = Schema.Struct({
}),
]),
),
// Whether the model's chat template accepts an assistant turn as the LAST
// message (a.k.a. "prefill" / "response continuation").
//
// Default (undefined) is treated as `true` for backwards compatibility.
//
// Set to `false` for thinking-on-by-default models whose chat template
// rejects trailing-assistant when thinking is enabled. Affected families
// (non-exhaustive, 2025-2026):
// - Qwen3 hybrid (all sizes), Qwen3-Thinking-2507, Qwen3-VL,
// Qwen3.5, Qwen3.6, QwQ-32B -> llama.cpp "Assistant response prefill
// is incompatible with enable_thinking" (ggml-org/llama.cpp#20861,
// #21889; mastra-ai/mastra#15234)
// - DeepSeek-R1 / R1-0528 / V4 (vllm-project/vllm#12999)
// - GLM-4.6 / 4.7 thinking (ggml-org/llama.cpp#15401)
// - Kimi-K2-Thinking, MiniMax-M2
//
// Qwen3-Coder, Qwen3-Instruct-2507, Qwen2.5 keep `true` — their templates
// do not branch on `enable_thinking`, so prefill is safe.
prefill: Schema.optional(Schema.Boolean),
cost: Schema.optional(Cost),
limit: Schema.Struct({
context: Schema.Finite,
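A minimal sketch of the intended read pattern for this field; the helper name `prefillOrDefault` is illustrative and not part of this change. The point is that `undefined` must fall back to `true`, so existing models.dev entries keep their current behaviour:

```ts
// Illustrative helper (assumed name, not introduced by this PR).
function prefillOrDefault(prefill: boolean | undefined): boolean {
  // undefined -> true preserves the backwards-compatible default described above
  return prefill ?? true
}

prefillOrDefault(undefined) // true: legacy entries that never set the field
prefillOrDefault(true)      // true: e.g. Qwen3-Coder, Qwen3-Instruct-2507, Qwen2.5
prefillOrDefault(false)     // false: e.g. Qwen3 hybrid, DeepSeek-R1, GLM-4.6 thinking
```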
9 changes: 9 additions & 0 deletions packages/opencode/src/config/provider.ts
@@ -19,6 +19,15 @@ export const Model = Schema.Struct({
}),
]),
),
prefill: Schema.optional(Schema.Boolean).annotate({
description:
"Whether the model accepts an assistant turn as the last message. " +
"Set false for thinking-on-by-default templates whose chat template " +
"rejects trailing-assistant (Qwen3 hybrid/3.5/3.6, QwQ, DeepSeek-R1, " +
"GLM-4.6/4.7 thinking, Kimi-K2-Thinking, MiniMax-M2). Defaults to " +
"true for non-openai-compatible providers, false for openai-compatible " +
"with reasoning enabled.",
}),
cost: Schema.optional(
Schema.Struct({
input: Schema.Finite,
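For orientation, a hedged sketch of what a per-model override could look like in provider config; the provider and model IDs and surrounding fields are placeholders, and only `prefill` plus `options.baseURL` are taken from this change:

```ts
// Hypothetical provider entry; IDs and unrelated fields are placeholders.
const localProvider = {
  npm: "@ai-sdk/openai-compatible",
  options: { baseURL: "http://localhost:8080/v1" }, // probed for /props (see capability-probe.ts below)
  models: {
    "qwen3-32b": {
      // Explicit override: skip inference and probing, never send a trailing assistant turn.
      prefill: false,
    },
  },
}
```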
91 changes: 91 additions & 0 deletions packages/opencode/src/provider/capability-probe.ts
@@ -0,0 +1,91 @@
export * as CapabilityProbe from "./capability-probe"

// Runtime detection of OpenAI-compatible server capabilities.
//
// Self-hosted llama.cpp servers expose `<root>/props` with the active chat
// template. Templates that branch on `enable_thinking` (Qwen3 hybrid, Qwen3.5,
// Qwen3.6, QwQ, DeepSeek-R1, GLM-4.6/4.7-thinking, Kimi-K2-Thinking,
// MiniMax-M2, etc.) reject trailing-assistant prefill at runtime with
// `HTTP 400 "Assistant response prefill is incompatible with enable_thinking"`
// (llama.cpp#20861, mastra-ai#15234).
//
// Probing the live template removes the need for per-family name lists in
// models.dev or user config: any server whose template branches on
// `enable_thinking` is detected automatically, including future thinking
// families.
//
// The probe is opt-in by base URL, fail-silent (vLLM/TGI/mistral.rs have no
// `/props` endpoint — they fall through to existing detection), short-timeout
// (1.5s), and cached per process so we hit the network at most once per base
// URL.

export type ProbedCapabilities = {
prefill?: boolean
reasoning?: boolean
}

const PROBE_TIMEOUT_MS = 1500
const cache = new Map<string, Promise<ProbedCapabilities>>()

// Normalises a baseURL ("http://host/v1/", "http://host", "http://host/v1")
// to the server root the /props endpoint lives under.
function rootURL(baseURL: string): string {
return baseURL.replace(/\/v1\/?$/, "").replace(/\/+$/, "")
}

async function probeOnce(baseURL: string): Promise<ProbedCapabilities> {
const root = rootURL(baseURL)
if (!root) return {}

const result: ProbedCapabilities = {}
const ctrl = new AbortController()
const timer = setTimeout(() => ctrl.abort(), PROBE_TIMEOUT_MS)
try {
const resp = await fetch(`${root}/props`, { signal: ctrl.signal })
if (!resp.ok) return {}
const data = (await resp.json()) as { chat_template?: unknown; chat_template_caps?: { supports_preserve_reasoning?: unknown } }

// Primary signal: the chat template branches on `enable_thinking`. This
// is the exact condition that produces the prefill-incompatible 400 — it
// means the template adds `<think>` differently depending on whether
// generation_prompt is requested, and a trailing-assistant turn (no
// generation_prompt) lands in the path that conflicts with reasoning.
if (typeof data.chat_template === "string" && data.chat_template.includes("enable_thinking")) {
result.prefill = false
result.reasoning = true
}

// Secondary signal: llama.cpp also exposes `supports_preserve_reasoning`
// on chat_template_caps for thinking templates. This catches a few edge
// templates that don't use the literal `enable_thinking` keyword.
if (data.chat_template_caps?.supports_preserve_reasoning === true) {
result.reasoning = true
}
} catch {
// Probe failed: server has no /props (vLLM/TGI/mistral.rs), is offline,
// or timed out. Fall back to other detection paths silently.
} finally {
clearTimeout(timer)
}
return result
}

// Returns probed capabilities for the given openai-compatible base URL.
// Result is cached per base URL for the process lifetime; concurrent callers
// share the same in-flight probe.
export function probe(baseURL: string): Promise<ProbedCapabilities> {
if (!baseURL) return Promise.resolve({})
const key = rootURL(baseURL)
let pending = cache.get(key)
if (!pending) {
pending = probeOnce(baseURL)
cache.set(key, pending)
}
return pending
}

// Test-only: clears the in-process probe cache. Used by unit tests so they
// can re-probe without restarting the test runner.
export function _resetCache(): void {
cache.clear()
}
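A usage sketch, with a placeholder base URL and the same import alias the test file below uses: against a llama.cpp server whose active template branches on `enable_thinking`, the probe resolves to `{ prefill: false, reasoning: true }`; against a server without `/props` it resolves to `{}` and callers fall back to their existing defaults.

```ts
import { CapabilityProbe } from "@/provider/capability-probe"

// Placeholder URL for a self-hosted llama.cpp instance.
const caps = await CapabilityProbe.probe("http://localhost:8080/v1")

// Treat "no signal" as "prefill allowed", matching the backwards-compatible default.
const allowTrailingAssistant = caps.prefill ?? true
console.log(allowTrailingAssistant)
```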
26 changes: 25 additions & 1 deletion packages/opencode/src/provider/provider.ts
@@ -24,6 +24,7 @@ import { AppFileSystem } from "@opencode-ai/core/filesystem"
import { isRecord } from "@/util/record"
import { optionalOmitUndefined } from "@opencode-ai/core/schema"
import * as ProviderTransform from "./transform"
import { CapabilityProbe } from "./capability-probe"
import { ModelID, ProviderID } from "./schema"
import { ModelStatus } from "./model-status"
import { RuntimeFlags } from "@/effect/runtime-flags"
@@ -870,6 +871,12 @@ const ProviderCapabilities = Schema.Struct({
input: ProviderModalities,
output: ProviderModalities,
interleaved: ProviderInterleaved,
// Trailing-assistant ("prefill") support. See models.dev Model.prefill for
// the per-family rationale. Undefined = true (backwards-compatible default).
// Read via `canAcceptTrailingAssistant(model)` rather than checking this
// field directly, so the undefined-default and provider-level inference
// live in one place.
prefill: Schema.optional(Schema.Boolean),
})

const ProviderCacheCost = Schema.Struct({
@@ -1083,6 +1090,7 @@ function fromModelsDevModel(provider: ModelsDev.Provider, model: ModelsDev.Model
pdf: model.modalities?.output?.includes("pdf") ?? false,
},
interleaved: model.interleaved ?? false,
prefill: model.prefill,
},
release_date: model.release_date ?? "",
variants: {},
@@ -1267,6 +1275,21 @@ export const layer = Layer.effect(
models: existing?.models ?? {},
}

// Auto-detect prefill/reasoning by probing the live OpenAI-compatible
// server (llama.cpp `/props` endpoint). The probe inspects the active
// chat template for the `enable_thinking` branch that produces the
// trailing-assistant 400. Fail-silent for non-llama.cpp upstreams,
// cached per base URL. See CapabilityProbe for details.
const providerNpm = provider.npm ?? modelsDev[providerID]?.npm
const probeBaseURL =
providerNpm === "@ai-sdk/openai-compatible" || !providerNpm
? (parsed.options as { baseURL?: unknown })?.baseURL
: undefined
const probed: CapabilityProbe.ProbedCapabilities =
typeof probeBaseURL === "string" && probeBaseURL
? yield* Effect.promise(() => CapabilityProbe.probe(probeBaseURL))
: {}

for (const [modelID, model] of Object.entries(provider.models ?? {})) {
const existingModel = parsed.models[model.id ?? modelID]
const apiID = model.id ?? existingModel?.api.id ?? modelID
@@ -1293,7 +1316,7 @@ export const layer = Layer.effect(
providerID: ProviderID.make(providerID),
capabilities: {
temperature: model.temperature ?? existingModel?.capabilities.temperature ?? false,
reasoning: model.reasoning ?? existingModel?.capabilities.reasoning ?? false,
reasoning: model.reasoning ?? existingModel?.capabilities.reasoning ?? probed.reasoning ?? false,
attachment: model.attachment ?? existingModel?.capabilities.attachment ?? false,
toolcall: model.tool_call ?? existingModel?.capabilities.toolcall ?? true,
input: {
(!existingModel && apiNpm === "@ai-sdk/openai-compatible" && apiID.includes("deepseek")
? { field: "reasoning_content" }
: false),
prefill: model.prefill ?? existingModel?.capabilities.prefill ?? probed.prefill,
},
cost: {
input: model?.cost?.input ?? existingModel?.cost?.input ?? 0,
21 changes: 21 additions & 0 deletions packages/opencode/src/provider/transform.ts
@@ -21,6 +21,27 @@ export function sanitizeSurrogates(content: string) {
return content.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, "\uFFFD")
}

// Whether the model accepts an assistant message as the LAST turn in a
// request ("prefill" / "response continuation").
//
// Anthropic and Bedrock-Anthropic accept it (and rely on it for tool-use
// continuation). Most OpenAI-compatible servers do not when reasoning is
// enabled, because the chat template branches on `enable_thinking` and rejects
// a trailing assistant. See the per-family list in models.dev Model.prefill.
//
// Precedence:
// 1. Explicit `model.capabilities.prefill` (from models.dev / user config) wins.
// 2. Else: openai-compatible + reasoning-capable models default to false,
// because every known 2025-2026 open-weight thinking family hits the
// template incompat (Qwen3/3.5/3.6 hybrid + Thinking variants, QwQ,
// DeepSeek-R1, GLM-4.6/4.7-thinking, Kimi-K2-Thinking, MiniMax-M2).
// 3. Else: true.
export function canAcceptTrailingAssistant(model: Provider.Model): boolean {
if (model.capabilities.prefill !== undefined) return model.capabilities.prefill
if (model.api.npm === "@ai-sdk/openai-compatible" && model.capabilities.reasoning) return false
return true
}

// Maps npm package to the key the AI SDK expects for providerOptions
function sdkKey(npm: string): string | undefined {
switch (npm) {
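The same precedence restated as a self-contained sketch; `MiniModel` is a pared-down stand-in for `Provider.Model`, which carries more fields:

```ts
// Simplified stand-in for Provider.Model (illustration only).
type MiniModel = { api: { npm?: string }; capabilities: { prefill?: boolean; reasoning: boolean } }

function accepts(model: MiniModel): boolean {
  if (model.capabilities.prefill !== undefined) return model.capabilities.prefill // 1. explicit value wins
  if (model.api.npm === "@ai-sdk/openai-compatible" && model.capabilities.reasoning) return false // 2. inferred thinking template
  return true // 3. default: prefill allowed
}

accepts({ api: { npm: "@ai-sdk/openai-compatible" }, capabilities: { prefill: true, reasoning: true } }) // true: explicit override
accepts({ api: { npm: "@ai-sdk/openai-compatible" }, capabilities: { reasoning: true } })                // false: openai-compatible + reasoning
accepts({ api: { npm: "@ai-sdk/anthropic" }, capabilities: { reasoning: true } })                        // true: Anthropic-style prefill is supported
```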
20 changes: 19 additions & 1 deletion packages/opencode/src/session/prompt.ts
@@ -1825,7 +1825,25 @@ NOTE: At any point in time through this workflow you should feel free to ask the
sessionID,
parentSessionID: session.parentID,
system,
messages: [...modelMsgs, ...(isLastStep ? [{ role: "assistant" as const, content: MAX_STEPS }] : [])],
// On the final step we inject the MAX_STEPS instruction so the
// model wraps up instead of calling more tools. Anthropic-style
// providers accept this as an assistant-prefill ("response
// continuation"); thinking-on-by-default templates (most
// openai-compatible local servers — Qwen3 hybrid/3.5/3.6,
// DeepSeek-R1, GLM-thinking, etc.) reject any trailing-assistant
// outright. For those, deliver the same instruction as a user
// message so it reaches the model without tripping the template.
messages: [
...modelMsgs,
...(isLastStep
? [
{
role: ProviderTransform.canAcceptTrailingAssistant(model) ? ("assistant" as const) : ("user" as const),
content: MAX_STEPS,
},
]
: []),
],
tools,
model,
toolChoice: format.type === "json_schema" ? "required" : undefined,
132 changes: 132 additions & 0 deletions packages/opencode/test/provider/capability-probe.test.ts
@@ -0,0 +1,132 @@
import { afterEach, describe, expect, test } from "bun:test"
import { CapabilityProbe } from "@/provider/capability-probe"

const originalFetch = globalThis.fetch

afterEach(() => {
globalThis.fetch = originalFetch
CapabilityProbe._resetCache()
})

function mockFetch(impl: (url: string) => Response | Promise<Response>) {
globalThis.fetch = ((input: RequestInfo | URL) => {
const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url
return Promise.resolve(impl(url))
}) as typeof fetch
}

describe("CapabilityProbe.probe", () => {
test("detects prefill=false and reasoning=true when chat_template contains enable_thinking", async () => {
mockFetch((url) => {
expect(url).toBe("http://localhost:8080/props")
return new Response(
JSON.stringify({
chat_template:
"{%- if enable_thinking is defined and enable_thinking is false %}<think></think>{%- else %}<think>{%- endif %}",
}),
{ status: 200 },
)
})
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result).toEqual({ prefill: false, reasoning: true })
})

test("strips trailing /v1 from baseURL to find /props", async () => {
let called = ""
mockFetch((url) => {
called = url
return new Response("{}", { status: 200 })
})
await CapabilityProbe.probe("http://localhost:8080/v1/")
expect(called).toBe("http://localhost:8080/props")
})

test("handles baseURL without /v1 suffix", async () => {
let called = ""
mockFetch((url) => {
called = url
return new Response("{}", { status: 200 })
})
await CapabilityProbe.probe("http://localhost:8080")
expect(called).toBe("http://localhost:8080/props")
})

test("returns empty when /props is not present (404)", async () => {
mockFetch(() => new Response("Not Found", { status: 404 }))
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result).toEqual({})
})

test("returns empty when chat_template lacks enable_thinking", async () => {
mockFetch(
() =>
new Response(JSON.stringify({ chat_template: "<|user|>{{ messages }}<|assistant|>" }), {
status: 200,
}),
)
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result).toEqual({})
})

test("detects reasoning=true from supports_preserve_reasoning even when chat_template is missing", async () => {
mockFetch(
() =>
new Response(JSON.stringify({ chat_template_caps: { supports_preserve_reasoning: true } }), {
status: 200,
}),
)
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result.reasoning).toBe(true)
// No prefill signal from supports_preserve_reasoning alone — only chat_template can determine that
expect(result.prefill).toBeUndefined()
})

test("fails silent on network error", async () => {
mockFetch(() => {
throw new Error("ECONNREFUSED")
})
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result).toEqual({})
})

test("fails silent on invalid JSON", async () => {
mockFetch(() => new Response("not-json", { status: 200 }))
const result = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(result).toEqual({})
})

test("returns empty for empty baseURL", async () => {
let called = false
mockFetch(() => {
called = true
return new Response("{}", { status: 200 })
})
const result = await CapabilityProbe.probe("")
expect(result).toEqual({})
expect(called).toBe(false)
})

test("caches result per base URL — second call does not hit network", async () => {
let calls = 0
mockFetch(() => {
calls++
return new Response(JSON.stringify({ chat_template: "enable_thinking" }), { status: 200 })
})
const a = await CapabilityProbe.probe("http://localhost:8080/v1")
const b = await CapabilityProbe.probe("http://localhost:8080/v1")
expect(calls).toBe(1)
expect(a).toEqual(b)
})

test("normalises trailing slashes for cache hits", async () => {
let calls = 0
mockFetch(() => {
calls++
return new Response("{}", { status: 200 })
})
await CapabilityProbe.probe("http://localhost:8080/v1")
await CapabilityProbe.probe("http://localhost:8080/v1/")
await CapabilityProbe.probe("http://localhost:8080")
expect(calls).toBe(1)
})
})