From d1c50c5270226b6679a6fe114b8da3a428bf6653 Mon Sep 17 00:00:00 2001
From: feanor5555 <2073406+feanor5555@users.noreply.github.com>
Date: Sat, 16 May 2026 17:39:17 +0000
Subject: [PATCH] core: add Model.prefill capability for trailing-assistant
 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Anthropic-style providers accept (and rely on) an assistant message as
the last turn in a conversation ("response continuation" / "prefill",
used e.g. for tool-use continuation). Most thinking-on-by-default chat
templates, by contrast, reject it outright: llama.cpp returns HTTP 400
"Assistant response prefill is incompatible with enable_thinking" on
Qwen3-family templates, and vLLM/TGI behave equivalently for
DeepSeek-R1, GLM-4.6 thinking, Kimi-K2-Thinking, etc.

A first-class `prefill: boolean` on Model lets every host (opencode,
mastra, others) consult one canonical source of truth instead of
guessing from npm package + reasoning flag.

- packages/core/src/models.ts: add an optional prefill field on Model,
  documented with a per-family list of templates known to reject
  prefill (Qwen3 hybrid/3.5/3.6/Thinking-2507/VL, QwQ,
  DeepSeek-R1/R1-0528/V4, GLM-4.6/4.7-thinking, Kimi-K2-Thinking,
  MiniMax-M2).
- packages/opencode/src/config/provider.ts: mirror the field on the
  user-facing config schema, with an annotation describing when to set
  it (and what the auto-default is for openai-compatible+reasoning).

Default (undefined) is treated as `true` to keep all existing models
unaffected. Consumer-side logic lives in a follow-up PR (a sketch
follows the diff below).

Sister PR to an sst/models.dev data PR that will populate
`prefill: false` on the affected per-model entries.
---
 packages/core/src/models.ts              | 19 +++++++++++++++++++
 packages/opencode/src/config/provider.ts |  9 +++++++++
 2 files changed, 28 insertions(+)

diff --git a/packages/core/src/models.ts b/packages/core/src/models.ts
index 4ee17b8e25eb..0c12de4d1af0 100644
--- a/packages/core/src/models.ts
+++ b/packages/core/src/models.ts
@@ -57,6 +57,25 @@ export const Model = Schema.Struct({
       }),
     ]),
   ),
+  // Whether the model's chat template accepts an assistant turn as the LAST
+  // message (a.k.a. "prefill" / "response continuation").
+  //
+  // Default (undefined) is treated as `true` for backwards compatibility.
+  //
+  // Set to `false` for thinking-on-by-default models whose chat template
+  // rejects a trailing assistant turn when thinking is enabled. Affected
+  // families (non-exhaustive, 2025-2026):
+  //   - Qwen3 hybrid (all sizes), Qwen3-Thinking-2507, Qwen3-VL, Qwen3.5,
+  //     Qwen3.6, QwQ-32B -> llama.cpp "Assistant response prefill is
+  //     incompatible with enable_thinking" (ggml-org/llama.cpp#20861,
+  //     #21889; mastra-ai/mastra#15234)
+  //   - DeepSeek-R1 / R1-0528 / V4 (vllm-project/vllm#12999)
+  //   - GLM-4.6 / 4.7 thinking (ggml-org/llama.cpp#15401)
+  //   - Kimi-K2-Thinking, MiniMax-M2
+  //
+  // Qwen3-Coder, Qwen3-Instruct-2507, and Qwen2.5 keep `true`; their
+  // templates do not branch on `enable_thinking`, so prefill is safe.
+  prefill: Schema.optional(Schema.Boolean),
   cost: Schema.optional(Cost),
   limit: Schema.Struct({
     context: Schema.Finite,
diff --git a/packages/opencode/src/config/provider.ts b/packages/opencode/src/config/provider.ts
index 5635512cedf9..7f742b3b5609 100644
--- a/packages/opencode/src/config/provider.ts
+++ b/packages/opencode/src/config/provider.ts
@@ -19,6 +19,15 @@ export const Model = Schema.Struct({
       }),
     ]),
   ),
+  prefill: Schema.optional(Schema.Boolean).annotations({
+    description:
+      "Whether the model accepts an assistant turn as the last message. " +
+      "Set false for thinking-on-by-default templates whose chat template " +
+      "rejects a trailing assistant turn (Qwen3 hybrid/3.5/3.6, QwQ, " +
+      "DeepSeek-R1, GLM-4.6/4.7 thinking, Kimi-K2-Thinking, MiniMax-M2). " +
+      "Defaults to true for non-openai-compatible providers, false for " +
+      "openai-compatible with reasoning enabled.",
+  }),
   cost: Schema.optional(
     Schema.Struct({
       input: Schema.Finite,