From d1c50c5270226b6679a6fe114b8da3a428bf6653 Mon Sep 17 00:00:00 2001
From: feanor5555 <2073406+feanor5555@users.noreply.github.com>
Date: Sat, 16 May 2026 17:39:17 +0000
Subject: [PATCH] core: add Model.prefill capability for trailing-assistant
 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Anthropic-style providers accept (and rely on) an assistant message as
the last turn in a conversation ("response continuation" / "prefill",
used e.g. for tool-use continuation). Most thinking-on-by-default chat
templates, by contrast, reject it outright: llama.cpp returns HTTP 400
"Assistant response prefill is incompatible with enable_thinking" on
Qwen3-family templates, and vLLM/TGI behave equivalently for
DeepSeek-R1, GLM-4.6 thinking, Kimi-K2-Thinking, etc.

A first-class `prefill: boolean` on Model lets every host (opencode,
mastra, others) consult one canonical source of truth instead of
guessing from npm package + reasoning flag.

- packages/core/src/models.ts: add an optional prefill field on Model,
  documented with a per-family list of templates known to reject
  prefill (Qwen3 hybrid/3.5/3.6/Thinking-2507/VL, QwQ,
  DeepSeek-R1/R1-0528/V4, GLM-4.6/4.7-thinking, Kimi-K2-Thinking,
  MiniMax-M2).
- packages/opencode/src/config/provider.ts: mirror the field on the
  user-facing config schema, with an annotation describing when to set
  it (and what the auto-default is for openai-compatible+reasoning).

Default (undefined) is treated as `true` to keep all existing models
unaffected. Consumer-side logic lives in a follow-up PR (a sketch
follows the diff below).

Sister PR to an sst/models.dev data PR that will populate
`prefill: false` on the affected per-model entries.
---
 packages/core/src/models.ts              | 19 +++++++++++++++++++
 packages/opencode/src/config/provider.ts |  9 +++++++++
 2 files changed, 28 insertions(+)

diff --git a/packages/core/src/models.ts b/packages/core/src/models.ts
index 4ee17b8e25eb..0c12de4d1af0 100644
--- a/packages/core/src/models.ts
+++ b/packages/core/src/models.ts
@@ -57,6 +57,25 @@ export const Model = Schema.Struct({
       }),
     ]),
   ),
+  // Whether the model's chat template accepts an assistant turn as the LAST
+  // message (a.k.a. "prefill" / "response continuation").
+  //
+  // Default (undefined) is treated as `true` for backwards compatibility.
+  //
+  // Set to `false` for thinking-on-by-default models whose chat template
+  // rejects a trailing assistant turn when thinking is enabled. Affected
+  // families (non-exhaustive, 2025-2026):
+  //   - Qwen3 hybrid (all sizes), Qwen3-Thinking-2507, Qwen3-VL, Qwen3.5,
+  //     Qwen3.6, QwQ-32B -> llama.cpp "Assistant response prefill is
+  //     incompatible with enable_thinking" (ggml-org/llama.cpp#20861,
+  //     #21889; mastra-ai/mastra#15234)
+  //   - DeepSeek-R1 / R1-0528 / V4 (vllm-project/vllm#12999)
+  //   - GLM-4.6 / 4.7 thinking (ggml-org/llama.cpp#15401)
+  //   - Kimi-K2-Thinking, MiniMax-M2
+  //
+  // Qwen3-Coder, Qwen3-Instruct-2507, and Qwen2.5 keep `true`; their
+  // templates do not branch on `enable_thinking`, so prefill is safe.
+  prefill: Schema.optional(Schema.Boolean),
   cost: Schema.optional(Cost),
   limit: Schema.Struct({
     context: Schema.Finite,
diff --git a/packages/opencode/src/config/provider.ts b/packages/opencode/src/config/provider.ts
index 5635512cedf9..7f742b3b5609 100644
--- a/packages/opencode/src/config/provider.ts
+++ b/packages/opencode/src/config/provider.ts
@@ -19,6 +19,15 @@ export const Model = Schema.Struct({
       }),
     ]),
   ),
+  prefill: Schema.optional(Schema.Boolean).annotations({
+    description:
+      "Whether the model accepts an assistant turn as the last message. " +
+      "Set false for thinking-on-by-default templates whose chat template " +
+      "rejects a trailing assistant turn (Qwen3 hybrid/3.5/3.6, QwQ, " +
+      "DeepSeek-R1, GLM-4.6/4.7 thinking, Kimi-K2-Thinking, MiniMax-M2). " +
+      "Defaults to true for non-openai-compatible providers, false for " +
+      "openai-compatible with reasoning enabled.",
+  }),
   cost: Schema.optional(
     Schema.Struct({
       input: Schema.Finite,