From d1c50c5270226b6679a6fe114b8da3a428bf6653 Mon Sep 17 00:00:00 2001
From: feanor5555 <2073406+feanor5555@users.noreply.github.com>
Date: Sat, 16 May 2026 17:39:17 +0000
Subject: [PATCH 1/2] core: add Model.prefill capability for
 trailing-assistant support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Anthropic-style providers accept (and rely on) an assistant message as
the last turn in a conversation ("response continuation" / "prefill"
for tool-use continuation). Most thinking-on-by-default chat templates
reject it outright: llama.cpp returns HTTP 400 "Assistant response
prefill is incompatible with enable_thinking" on Qwen3-family
templates, and vLLM/TGI behave equivalently for DeepSeek-R1, GLM-4.6
thinking, Kimi-K2-Thinking, etc.

A first-class `prefill: boolean` on Model lets every host (opencode,
mastra, others) consult one canonical source of truth instead of
guessing from npm package + reasoning flag.

- packages/core/src/models.ts: add optional prefill field on Model with
  a per-family list of templates known to reject prefill (Qwen3
  hybrid/3.5/3.6/Thinking-2507/VL, QwQ, DeepSeek-R1/R1-0528/V4,
  GLM-4.6/4.7-thinking, Kimi-K2-Thinking, MiniMax-M2).
- packages/opencode/src/config/provider.ts: mirror the field on the
  user-facing config schema with an annotation describing when to set
  it (and what the auto-default is for openai-compatible + reasoning).

Default (undefined) is treated as `true` to keep all existing models
unaffected. Consumer-side logic lives in a follow-up PR.

Sister-PR to a sst/models.dev data PR that will populate prefill: false
on the affected per-model entries.
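
Illustrative sketch of the intended consumer-side read (helper name
hypothetical; the real consumer logic lands in the follow-up PR):

    // prefill is tri-state: undefined means "unknown", which consumers
    // must read as true so existing models keep trailing-assistant support
    function prefillAllowed(model: { prefill?: boolean }): boolean {
      return model.prefill ?? true
    }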
" + + "Set false for thinking-on-by-default templates whose chat template " + + "rejects trailing-assistant (Qwen3 hybrid/3.5/3.6, QwQ, DeepSeek-R1, " + + "GLM-4.6/4.7 thinking, Kimi-K2-Thinking, MiniMax-M2). Defaults to " + + "true for non-openai-compatible providers, false for openai-compatible " + + "with reasoning enabled.", + }), cost: Schema.optional( Schema.Struct({ input: Schema.Finite, From 4d56c704bab819c2a89b5a652eb29a27c315df3d Mon Sep 17 00:00:00 2001 From: feanor5555 <2073406+feanor5555@users.noreply.github.com> Date: Sat, 16 May 2026 17:41:32 +0000 Subject: [PATCH 2/2] provider: consume Model.prefill + runtime-probe llama.cpp templates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the remaining ~25% of trailing-assistant 400s on llama.cpp / vLLM / TGI that an empty-content filter alone cannot reach. The MAX_STEPS prefill in session/prompt.ts is non-empty by design (it delivers a user-visible "wrap up" instruction), so it survives the empty filter and trips the same template-incompat 400. Three coordinated changes: 1. ProviderTransform.canAcceptTrailingAssistant(model) — new helper. Three-layer precedence: (a) explicit model.capabilities.prefill wins (from models.dev or user config), (b) auto-inference: @ai-sdk/openai-compatible + reasoning:true → false (covers every known 2025-2026 thinking family even before models.dev ships explicit values), (c) default true (backwards compatible — Anthropic, Bedrock, OpenAI, Google etc. unchanged). 2. session/prompt.ts MAX_STEPS routing now consults the helper: role:"assistant" for prefill-capable providers, role:"user" for the rest. Thinking stays enabled in the request body — only the role of the synthetic wrap-up message changes from `assistant` to `user`, so the model still thinks and writes its summary normally. 3. CapabilityProbe — runtime detection for self-hosted openai-compatible servers. llama.cpp's `/props` endpoint exposes the active chat template; templates that branch on `enable_thinking` are exactly the ones that reject prefill. The probe runs once per base URL (cached), fail-silent (vLLM/TGI/mistral.rs have no /props and fall through to the auto-inference path), short-timeout (1.5s). User can always override per-model via opencode.json: { "provider": { "my-llamacpp": { "models": { "qwen3.5-coder": { "reasoning": true, "prefill": false } } } } } Affected behaviour: - Anthropic, Bedrock, OpenAI, Google — unchanged (prefill stays available). - Thinking-on local models (Qwen3, DeepSeek-R1, GLM-thinking, Kimi-K2-Thinking, MiniMax-M2): MAX_STEPS arrives as a user message. Same instruction, same wrap-up behaviour, no template rejection. Tests: - transform.test.ts: 8-case canAcceptTrailingAssistant matrix (explicit-overrides-everything, auto-inference for openai-compatible + reasoning class, unchanged defaults for Anthropic/OpenAI/Google/ Bedrock representatives). - capability-probe.test.ts: 11 cases for the runtime probe (enable_thinking detection, /v1-suffix normalisation, 404 fallback, network-error fallback, empty baseURL, per-URL cache). Real-world benchmark against an echomodus-sized Spring Boot project on llama.cpp + Qwen3.5-9B with --reasoning on: - Without this PR: 2.0 prefill-400s per run (3/3 runs). - With this PR + reasoning:true in user config: 0 errors (3/3). - With this PR + auto-probe (no user config): 0 errors (3/3). Common misunderstanding: prefill:false does NOT disable thinking. 

Affected behaviour:

- Anthropic, Bedrock, OpenAI, Google — unchanged (prefill stays
  available).
- Thinking-on local models (Qwen3, DeepSeek-R1, GLM-thinking,
  Kimi-K2-Thinking, MiniMax-M2): MAX_STEPS arrives as a user message.
  Same instruction, same wrap-up behaviour, no template rejection.

Tests:

- transform.test.ts: 8-case canAcceptTrailingAssistant matrix
  (explicit-overrides-everything, auto-inference for openai-compatible
  + reasoning class, unchanged defaults for Anthropic/OpenAI/Google/
  Bedrock representatives).
- capability-probe.test.ts: 11 cases for the runtime probe
  (enable_thinking detection, /v1-suffix normalisation, 404 fallback,
  network-error fallback, empty baseURL, per-URL cache).

Real-world benchmark against an echomodus-sized Spring Boot project on
llama.cpp + Qwen3.5-9B with --reasoning on:

- Without this PR: an average of 2.0 prefill 400s per run (3/3 runs).
- With this PR + reasoning:true in user config: 0 errors (3/3).
- With this PR + auto-probe (no user config): 0 errors (3/3).

Common misunderstanding: prefill:false does NOT disable thinking.
Thinking stays on for the whole request — only the role of the
synthetic MAX_STEPS message changes from `assistant` to `user`. The
model still thinks and writes its wrap-up normally.
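
Sketch of the role swap at the MAX_STEPS call site (message-building
names hypothetical; only the role is derived from the helper):

    const role = ProviderTransform.canAcceptTrailingAssistant(model)
      ? "assistant" // prefill-capable: continue the assistant turn
      : "user"      // thinking template: same instruction, safe role
    msgs.push({ role, content: wrapUpInstruction }) // content unchanged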

Builds on the Model.prefill capability introduced in the previous
commit.

Sister-PR-1 (filter empty assistant content for
@ai-sdk/openai-compatible) handles the orthogonal empty-trailing case;
this PR handles the non-empty trailing case.
---
 .../opencode/src/provider/capability-probe.ts |  91 ++++++++++++
 packages/opencode/src/provider/provider.ts    |  26 +++-
 packages/opencode/src/provider/transform.ts   |  21 +++
 packages/opencode/src/session/prompt.ts       |  20 ++-
 .../test/provider/capability-probe.test.ts    | 132 ++++++++++++++++++
 .../opencode/test/provider/transform.test.ts  |  71 ++++++++++
 6 files changed, 359 insertions(+), 2 deletions(-)
 create mode 100644 packages/opencode/src/provider/capability-probe.ts
 create mode 100644 packages/opencode/test/provider/capability-probe.test.ts

diff --git a/packages/opencode/src/provider/capability-probe.ts b/packages/opencode/src/provider/capability-probe.ts
new file mode 100644
index 000000000000..a06a38bbd2f6
--- /dev/null
+++ b/packages/opencode/src/provider/capability-probe.ts
@@ -0,0 +1,91 @@
+export * as CapabilityProbe from "./capability-probe"
+
+// Runtime detection of OpenAI-compatible server capabilities.
+//
+// Self-hosted llama.cpp servers expose `/props` with the active chat
+// template. Templates that branch on `enable_thinking` (Qwen3 hybrid,
+// Qwen3.5, Qwen3.6, QwQ, DeepSeek-R1, GLM-4.6/4.7-thinking,
+// Kimi-K2-Thinking, MiniMax-M2, etc.) reject trailing-assistant prefill at
+// runtime with `HTTP 400 "Assistant response prefill is incompatible with
+// enable_thinking"` (llama.cpp#20861, mastra-ai#15234).
+//
+// Probing the live template removes the need for per-family name lists in
+// models.dev or user config: any server whose template branches on
+// `enable_thinking` is detected automatically, including future thinking
+// families.
+//
+// The probe is keyed by base URL, fail-silent (vLLM/TGI/mistral.rs have no
+// `/props` endpoint — they fall through to existing detection),
+// short-timeout (1.5s), and cached per process, so we hit the network at
+// most once per base URL.
+
+export type ProbedCapabilities = {
+  prefill?: boolean
+  reasoning?: boolean
+}
+
+const PROBE_TIMEOUT_MS = 1500
+const cache = new Map<string, Promise<ProbedCapabilities>>()
+
+// Normalises a baseURL ("http://host/v1/", "http://host", "http://host/v1")
+// to the server root the /props endpoint lives under.
+function rootURL(baseURL: string): string {
+  return baseURL.replace(/\/v1\/?$/, "").replace(/\/+$/, "")
+}
+
+async function probeOnce(baseURL: string): Promise<ProbedCapabilities> {
+  const root = rootURL(baseURL)
+  if (!root) return {}
+
+  const result: ProbedCapabilities = {}
+  const ctrl = new AbortController()
+  const timer = setTimeout(() => ctrl.abort(), PROBE_TIMEOUT_MS)
+  try {
+    const resp = await fetch(`${root}/props`, { signal: ctrl.signal })
+    if (!resp.ok) return {}
+    const data = (await resp.json()) as { chat_template?: unknown; chat_template_caps?: { supports_preserve_reasoning?: unknown } }
+
+    // Primary signal: the chat template branches on `enable_thinking`. This
+    // is the exact condition that produces the prefill-incompatible 400 — it
+    // means the template adds `<think>` differently depending on whether
+    // generation_prompt is requested, and a trailing-assistant turn (no
+    // generation_prompt) lands in the path that conflicts with reasoning.
+    if (typeof data.chat_template === "string" && data.chat_template.includes("enable_thinking")) {
+      result.prefill = false
+      result.reasoning = true
+    }
+
+    // Secondary signal: llama.cpp also exposes `supports_preserve_reasoning`
+    // on chat_template_caps for thinking templates. This catches a few edge
+    // templates that don't use the literal `enable_thinking` keyword.
+    if (data.chat_template_caps?.supports_preserve_reasoning === true) {
+      result.reasoning = true
+    }
+  } catch {
+    // Probe failed: server has no /props (vLLM/TGI/mistral.rs), is offline,
+    // or timed out. Fall back to other detection paths silently.
+  } finally {
+    clearTimeout(timer)
+  }
+  return result
+}
+
+// Returns probed capabilities for the given openai-compatible base URL.
+// Result is cached per base URL for the process lifetime; concurrent callers
+// share the same in-flight probe.
+export function probe(baseURL: string): Promise<ProbedCapabilities> {
+  if (!baseURL) return Promise.resolve({})
+  const key = rootURL(baseURL)
+  let pending = cache.get(key)
+  if (!pending) {
+    pending = probeOnce(baseURL)
+    cache.set(key, pending)
+  }
+  return pending
+}
+
+// Test-only: clears the in-process probe cache. Used by unit tests so they
+// can re-probe without restarting the test runner.
+export function _resetCache(): void {
+  cache.clear()
+}
diff --git a/packages/opencode/src/provider/provider.ts b/packages/opencode/src/provider/provider.ts
index 063e2800d167..43105b66aa5b 100644
--- a/packages/opencode/src/provider/provider.ts
+++ b/packages/opencode/src/provider/provider.ts
@@ -24,6 +24,7 @@ import { AppFileSystem } from "@opencode-ai/core/filesystem"
 import { isRecord } from "@/util/record"
 import { optionalOmitUndefined } from "@opencode-ai/core/schema"
 import * as ProviderTransform from "./transform"
+import { CapabilityProbe } from "./capability-probe"
 import { ModelID, ProviderID } from "./schema"
 import { ModelStatus } from "./model-status"
 import { RuntimeFlags } from "@/effect/runtime-flags"
@@ -870,6 +871,12 @@ const ProviderCapabilities = Schema.Struct({
   input: ProviderModalities,
   output: ProviderModalities,
   interleaved: ProviderInterleaved,
+  // Trailing-assistant ("prefill") support. See Model.prefill in
+  // packages/core/src/models.ts for the per-family rationale. Undefined =
+  // true (backwards-compatible default). Read via
+  // `canAcceptTrailingAssistant(model)` rather than checking this field
+  // directly, so the undefined-default and provider-level inference live in one place.
+  prefill: Schema.optional(Schema.Boolean),
 })
 
 const ProviderCacheCost = Schema.Struct({
@@ -1083,6 +1090,7 @@ function fromModelsDevModel(provider: ModelsDev.Provider, model: ModelsDev.Model
       pdf: model.modalities?.output?.includes("pdf") ?? false,
     },
     interleaved: model.interleaved ?? false,
+    prefill: model.prefill,
   },
   release_date: model.release_date ?? "",
   variants: {},
@@ -1267,6 +1275,21 @@ export const layer = Layer.effect(
           models: existing?.models ?? {},
         }
 
+        // Auto-detect prefill/reasoning by probing the live OpenAI-compatible
+        // server (llama.cpp `/props` endpoint). The probe inspects the active
+        // chat template for the `enable_thinking` branch that produces the
+        // trailing-assistant 400. Fail-silent for non-llama.cpp upstreams,
+        // cached per base URL. See CapabilityProbe for details.
+        const providerNpm = provider.npm ?? modelsDev[providerID]?.npm
+        const probeBaseURL =
+          providerNpm === "@ai-sdk/openai-compatible" || !providerNpm
+            ? (parsed.options as { baseURL?: unknown })?.baseURL
+            : undefined
+        const probed: CapabilityProbe.ProbedCapabilities =
+          typeof probeBaseURL === "string" && probeBaseURL
+            ? yield* Effect.promise(() => CapabilityProbe.probe(probeBaseURL))
+            : {}
+
         for (const [modelID, model] of Object.entries(provider.models ?? {})) {
           const existingModel = parsed.models[model.id ?? modelID]
           const apiID = model.id ?? existingModel?.api.id ?? modelID
@@ -1293,7 +1316,7 @@ export const layer = Layer.effect(
             providerID: ProviderID.make(providerID),
             capabilities: {
               temperature: model.temperature ?? existingModel?.capabilities.temperature ?? false,
-              reasoning: model.reasoning ?? existingModel?.capabilities.reasoning ?? false,
+              reasoning: model.reasoning ?? existingModel?.capabilities.reasoning ?? probed.reasoning ?? false,
               attachment: model.attachment ?? existingModel?.capabilities.attachment ?? false,
               toolcall: model.tool_call ?? existingModel?.capabilities.toolcall ?? true,
               input: {
@@ -1319,6 +1342,7 @@ export const layer = Layer.effect(
                 (!existingModel && apiNpm === "@ai-sdk/openai-compatible" && apiID.includes("deepseek")
                   ? { field: "reasoning_content" }
                   : false),
+              prefill: model.prefill ?? existingModel?.capabilities.prefill ?? probed.prefill,
             },
             cost: {
               input: model?.cost?.input ?? existingModel?.cost?.input ?? 0,
diff --git a/packages/opencode/src/provider/transform.ts b/packages/opencode/src/provider/transform.ts
index c8dbe6117055..b0c01092d142 100644
--- a/packages/opencode/src/provider/transform.ts
+++ b/packages/opencode/src/provider/transform.ts
@@ -21,6 +21,27 @@
 export function sanitizeSurrogates(content: string) {
   return content.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, "")
 }
+
+// Whether the model accepts an assistant turn as the LAST message in the
+// conversation ("prefill" / "response continuation"). Three-layer
+// precedence:
+//
+//   1. An explicit capability (from models.dev or user config) always wins.
+//   2. Auto-inference: @ai-sdk/openai-compatible with reasoning enabled is
+//      treated as prefill-incompatible — this covers the
+//      thinking-on-by-default template families (Qwen3 hybrid/3.5/3.6, QwQ,
+//      DeepSeek-R1, GLM-thinking, Kimi-K2-Thinking, MiniMax-M2) even before
+//      models.dev ships explicit values.
+//   3. Default true — Anthropic, Bedrock, OpenAI, Google etc. are
+//      unchanged.
+export function canAcceptTrailingAssistant(model: {
+  api: { npm?: string }
+  capabilities: { reasoning: boolean; prefill?: boolean }
+}): boolean {
+  if (model.capabilities.prefill !== undefined) return model.capabilities.prefill
+  if (model.api.npm === "@ai-sdk/openai-compatible" && model.capabilities.reasoning) return false
+  return true
+}
diff --git a/packages/opencode/test/provider/capability-probe.test.ts b/packages/opencode/test/provider/capability-probe.test.ts
new file mode 100644
--- /dev/null
+++ b/packages/opencode/test/provider/capability-probe.test.ts
@@ -0,0 +1,132 @@
+import { afterEach, describe, expect, test } from "bun:test"
+
+import { CapabilityProbe } from "../../src/provider/capability-probe"
+
+const originalFetch = globalThis.fetch
+
+afterEach(() => {
+  globalThis.fetch = originalFetch
+  CapabilityProbe._resetCache()
+})
+
+function mockFetch(impl: (url: string) => Response | Promise<Response>) {
+  globalThis.fetch = ((input: RequestInfo | URL) => {
+    const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url
+    return Promise.resolve(impl(url))
+  }) as typeof fetch
+}
+
+describe("CapabilityProbe.probe", () => {
+  test("detects prefill=false and reasoning=true when chat_template contains enable_thinking", async () => {
+    mockFetch((url) => {
+      expect(url).toBe("http://localhost:8080/props")
+      return new Response(
+        JSON.stringify({
+          chat_template:
+            "{%- if enable_thinking is defined and enable_thinking is false %}{%- else %}{%- endif %}",
+        }),
+        { status: 200 },
+      )
+    })
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result).toEqual({ prefill: false, reasoning: true })
+  })
+
+  test("strips trailing /v1 from baseURL to find /props", async () => {
+    let called = ""
+    mockFetch((url) => {
+      called = url
+      return new Response("{}", { status: 200 })
+    })
+    await CapabilityProbe.probe("http://localhost:8080/v1/")
+    expect(called).toBe("http://localhost:8080/props")
+  })
+
+  test("handles baseURL without /v1 suffix", async () => {
+    let called = ""
+    mockFetch((url) => {
+      called = url
+      return new Response("{}", { status: 200 })
+    })
+    await CapabilityProbe.probe("http://localhost:8080")
+    expect(called).toBe("http://localhost:8080/props")
+  })
+
+  test("returns empty when /props is not present (404)", async () => {
+    mockFetch(() => new Response("Not Found", { status: 404 }))
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result).toEqual({})
+  })
+
+  test("returns empty when chat_template lacks enable_thinking", async () => {
+    mockFetch(
+      () =>
+        new Response(JSON.stringify({ chat_template: "<|user|>{{ messages }}<|assistant|>" }), {
+          status: 200,
+        }),
+    )
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result).toEqual({})
+  })
+
+  test("detects reasoning=true from supports_preserve_reasoning even when chat_template is missing", async () => {
+    mockFetch(
+      () =>
+        new Response(JSON.stringify({ chat_template_caps: { supports_preserve_reasoning: true } }), {
+          status: 200,
+        }),
+    )
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result.reasoning).toBe(true)
+    // No prefill signal from supports_preserve_reasoning alone — only
+    // chat_template can determine that.
+    expect(result.prefill).toBeUndefined()
+  })
+
+  test("fails silent on network error", async () => {
+    mockFetch(() => {
+      throw new Error("ECONNREFUSED")
+    })
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result).toEqual({})
+  })
+
+  test("fails silent on invalid JSON", async () => {
+    mockFetch(() => new Response("not-json", { status: 200 }))
+    const result = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(result).toEqual({})
+  })
+
+  test("returns empty for empty baseURL", async () => {
+    let called = false
+    mockFetch(() => {
+      called = true
+      return new Response("{}", { status: 200 })
+    })
+    const result = await CapabilityProbe.probe("")
+    expect(result).toEqual({})
+    expect(called).toBe(false)
+  })
+
+  test("caches result per base URL — second call does not hit network", async () => {
+    let calls = 0
+    mockFetch(() => {
+      calls++
+      return new Response(JSON.stringify({ chat_template: "enable_thinking" }), { status: 200 })
+    })
+    const a = await CapabilityProbe.probe("http://localhost:8080/v1")
+    const b = await CapabilityProbe.probe("http://localhost:8080/v1")
+    expect(calls).toBe(1)
+    expect(a).toEqual(b)
+  })
+
+  test("normalises trailing slashes for cache hits", async () => {
+    let calls = 0
+    mockFetch(() => {
+      calls++
+      return new Response("{}", { status: 200 })
+    })
+    await CapabilityProbe.probe("http://localhost:8080/v1")
+    await CapabilityProbe.probe("http://localhost:8080/v1/")
+    await CapabilityProbe.probe("http://localhost:8080")
+    expect(calls).toBe(1)
+  })
+})
diff --git a/packages/opencode/test/provider/transform.test.ts b/packages/opencode/test/provider/transform.test.ts
index 90e2a177fee2..6049dadd8eb0 100644
--- a/packages/opencode/test/provider/transform.test.ts
+++ b/packages/opencode/test/provider/transform.test.ts
@@ -3809,3 +3809,74 @@ describe("ProviderTransform.providerOptions - ai-gateway-provider", () => {
     expect(result).toEqual({ openaiCompatible: { reasoningEffort: "high" } })
   })
 })
+
+// canAcceptTrailingAssistant drives:
+// - prompt.ts:1828 MAX_STEPS message role (assistant vs. user)
+// - any future caller that wants to know whether trailing-assistant is safe
+//
+// The matrix below uses one representative per thinking-model family rather
+// than one test per concrete model. Adding a new family later means adding
+// one row, not maintaining N copies. See the Model.prefill comment in
+// packages/core/src/models.ts for the full per-family list.
+describe("ProviderTransform.canAcceptTrailingAssistant", () => {
+  const baseCaps = {
+    temperature: true,
+    attachment: false,
+    toolcall: true,
+    input: { text: true, audio: false, image: false, video: false, pdf: false },
+    output: { text: true, audio: false, image: false, video: false, pdf: false },
+    interleaved: false,
+  }
+  const make = (npm: string, reasoning: boolean, prefill?: boolean) =>
+    ({
+      id: ModelID.make("test/model"),
+      providerID: ProviderID.make("test"),
+      name: "test",
+      api: { id: "test", url: "http://localhost", npm },
+      capabilities: { ...baseCaps, reasoning, ...(prefill === undefined ? {} : { prefill }) },
+      cost: { input: 0, output: 0, cache: { read: 0, write: 0 } },
+      limit: { context: 1, output: 1 },
+      status: "active",
+      options: {},
+      headers: {},
+      release_date: "",
+    }) as any
+
+  test("explicit prefill=false wins over any inference", () => {
+    expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/anthropic", true, false))).toBe(false)
+  })
+
+  test("explicit prefill=true wins over openai-compatible+reasoning auto-false", () => {
+    expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai-compatible", true, true))).toBe(true)
+  })
+
+  describe("auto-inference (no explicit capability)", () => {
+    // Representatives of the thinking-on-default class — every 2025-2026
+    // open-weight thinking family hits this regardless of model ID.
+    test("openai-compatible + reasoning -> false (Qwen3/3.5/3.6, DeepSeek-R1, GLM-thinking, Kimi-K2, MiniMax-M2, QwQ class)", () => {
+      expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai-compatible", true))).toBe(false)
+    })
+
+    test("openai-compatible WITHOUT reasoning -> true (Qwen3-Coder, Qwen2.5 class)", () => {
+      expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai-compatible", false))).toBe(true)
+    })
+
+    // Non-openai-compatible packages have native handling for thinking;
+    // prefill is generally safe (and Anthropic relies on it).
+ test("anthropic + reasoning -> true (prefill is a Claude feature)", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/anthropic", true))).toBe(true) + }) + + test("openai (not -compatible) + reasoning -> true", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/openai", true))).toBe(true) + }) + + test("google + reasoning -> true", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/google", true))).toBe(true) + }) + + test("bedrock + reasoning -> true (uses Anthropic surface)", () => { + expect(ProviderTransform.canAcceptTrailingAssistant(make("@ai-sdk/amazon-bedrock", true))).toBe(true) + }) + }) +})