diff --git a/dto/claude.go b/dto/claude.go index d7fed412aaa..3d79bfa3cf1 100644 --- a/dto/claude.go +++ b/dto/claude.go @@ -171,9 +171,17 @@ func (c *ClaudeMessage) ParseContent() ([]ClaudeMediaMessage, error) { } type Tool struct { - Name string `json:"name"` - Description string `json:"description,omitempty"` - InputSchema map[string]interface{} `json:"input_schema"` + Name string `json:"name"` + Description string `json:"description,omitempty"` + InputSchema map[string]interface{} `json:"input_schema"` + CacheControl *ClaudeCacheControl `json:"cache_control,omitempty"` +} + +// ClaudeCacheControl mirrors Anthropic's prompt-caching marker. +// See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching +type ClaudeCacheControl struct { + Type string `json:"type"` + TTL string `json:"ttl,omitempty"` } type InputSchema struct { diff --git a/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/.openspec.yaml b/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/.openspec.yaml new file mode 100644 index 00000000000..8b769149815 --- /dev/null +++ b/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-20 diff --git a/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/design.md b/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/design.md new file mode 100644 index 00000000000..87fc5954873 --- /dev/null +++ b/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/design.md @@ -0,0 +1,149 @@ +## Context + +The gateway today routes `POST /v1/responses` through a single relay dispatch and supports two upstream surface families: OpenAI-compatible (`/v1/chat/completions`, `/v1/responses` on OpenAI itself) and Anthropic Messages (`/v1/messages`). 
When a `/v1/responses` request is routed to an Anthropic-typed channel, no translation layer exists for either the request body or the streaming response, so the request fails. Adding the missing pipeline lets a single inbound request shape (Responses-API) be served by either upstream family. + +The reference behavioral surface (analyzed externally, source-free) establishes a stable contract: a **two-step pivot** through an intermediate Chat-Completions-shaped object, on both the request side and the response side. Reusing that pivot keeps each translator focused and gives a clean composition: Responses ↔ Chat-Completions ↔ Anthropic. + +The existing codebase already covers the Chat-Completions ↔ Anthropic legs end-to-end: + +- `relay/channel/claude/relay-claude.go::RequestOpenAI2ClaudeMessage` — Chat-Completions request → Anthropic Messages request. Already handles system extraction, tool_use/tool_result ordering, image mapping (data: vs http:), `max_tokens` adjustment for thinking and tools, response_format JSON-mode shim, and merge of consecutive same-role messages. +- `relay/channel/claude/relay-claude.go::ClaudeStreamHandler` (+ `StreamResponseClaude2OpenAI`, `FormatClaudeResponseInfo`) — streaming Anthropic response → Chat-Completions chunks, including cache-token decomposition and finish_reason mapping. +- `relay/channel/claude/relay-claude.go::ClaudeHandler` (+ `ResponseClaude2OpenAI`) — non-streaming Anthropic response → Chat-Completions response. + +The only legs that do NOT yet exist are: Responses-request → Chat-Completions-request, and Chat-Completions-stream → Responses-events (plus a non-streaming variant of the latter). This change therefore adds exactly those legs as new functions under `service/openaicompat/`, plus one orchestration file under `relay/` that mirrors the existing `relay/chat_completions_via_responses.go` in the opposite direction. 
+ +Other anchors used by this change: + +- The relay format dispatch keys off `info.RelayMode == relayconstant.RelayModeResponses` and `info.ApiType == appconstant.APITypeAnthropic`; the new translation triggers at that exact branch in `relay/responses_handler.go`. +- The project's JSON wrapper (`common.Marshal`/`common.Unmarshal`) is mandatory (project Rule 1). +- Env-var feature flags follow the `common.GetEnvOrDefaultBool("FLAG_NAME", default)` pattern (see `common/env.go`). + +## Goals / Non-Goals + +**Goals:** +- Provide a complete, source-free behavioral specification of the two pipelines (request and response). +- Maintain a clean separation: each translator function takes a body or chunk and returns the next-stage body or chunk, with no I/O side effects. +- Preserve all existing behavior for non-Anthropic upstreams and for non-Responses inbound requests. +- Express each behavioral invariant as an objectively checkable requirement in the capability spec. +- Establish a per-stream state object that survives across chunk callbacks (sequence numbers, item indices, buffered reasoning text, tool-call open/close state). + +**Non-Goals:** +- Picking the final Go package path (left for Phase 3). +- Specifying internal struct names (left for Phase 3, beyond placeholders). +- Modifying quota, billing, retry, or auto-ban behavior. +- Adding new channel adaptors or external dependencies. + +## Decisions + +### D1. Two-step pivot through a Chat-Completions intermediate + +The translator does **not** map Responses-API ↔ Anthropic Messages directly. It maps Responses → ChatCompletions → AnthropicMessages on the request side, and AnthropicMessages → ChatCompletions → Responses on the response side. + +- *Why*: The Chat-Completions shape is the most stable and most widely-implemented "lingua franca" inside the gateway (the existing OpenAI-compatible path already uses it). 
Pivoting through it means the new code only adds two missing legs (Responses→ChatCompletions on the request side, ChatCompletions→Responses on the response side) and reuses the existing ChatCompletions↔Anthropic legs. +- *Alternative considered*: Direct Responses↔Anthropic translator. Rejected — doubles the surface area we need to maintain, and creates a second source of truth for tool-use ordering and reasoning passthrough. + +### D2. Stateful streaming translators + +Streaming translators take `(chunk, state)` and return `(events[], state')`. The state object holds: sequence counter, open item indices, buffered reasoning text, tool-call index → call_id map, "started/completed sent" flags, accumulated usage. Translators only emit events; they do not write to a socket. + +- *Why*: Lets the outer SSE handler stay protocol-agnostic and lets us unit-test the translators with deterministic chunk-by-chunk inputs. +- *Alternative considered*: Pure functional translators with no state. Rejected — Responses-API events carry monotonically increasing `sequence_number` and require open/close bookkeeping across many chunks. + +### D3. Open/close discipline for content blocks + +The streaming translator enforces the Responses-API contract: +1. `response.created` and `response.in_progress` fire exactly once each at the first usable chunk. +2. Each `output_item` (message, reasoning, function_call) is bracketed by `output_item.added` and `output_item.done`; deltas only fire between them. +3. Switching from reasoning to text closes the reasoning block before opening the text block. Switching from text to a tool call closes the text block before opening the tool-call item. +4. On finish, every open block is closed in deterministic order before `response.completed` fires. +5. A `null` chunk (end-of-stream sentinel from the SSE reader) triggers the flush path which closes any still-open blocks and emits `response.completed` exactly once. + +### D4. 
Tool-call ID hygiene at the boundary + +The Anthropic API requires tool IDs to match `^[a-zA-Z0-9_-]+$` and the Responses API caps tool IDs at 64 characters. The translator follows a three-tier sanitization policy on the upstream Anthropic side: + +1. **Pass-through** when the ID already matches the regex AND is ≤ 64 characters. +2. **Strip-and-keep** when the ID contains some invalid characters: drop every char not in `[a-zA-Z0-9_-]`; if the residue is non-empty AND ≤ 64 characters, use the residue. +3. **UUID fallback** when the ID is empty, becomes empty after stripping, or exceeds 64 characters: generate a fresh UUID (no deterministic synthesis, no positional encoding). + +On the OUTBOUND Responses-side, IDs longer than 64 characters are clamped to the first 64 characters. + +- *Why*: pass-through preserves client-supplied IDs that already pass; strip-and-keep recovers common patterns like `call:abc/123` losslessly; UUID fallback is simpler than positional synthesis and avoids leaking message-index/tool-call-index information to clients. Determinism for prompt-cache continuity is unnecessary because the upstream cache key is computed by Anthropic from the prompt content, not from tool-call IDs. + +### D5. Tool-result placement repair + +Anthropic requires that each `tool_use` block in an assistant message be followed immediately by a separate user message whose content is the matching `tool_result` block. The translator: +- Splits any user message that mixes `tool_result` with other content; the `tool_result` goes first in its own message. +- Drops assistant text blocks that appear AFTER a `tool_use` block in the same message (Anthropic rejects them). +- Merges consecutive same-role messages after the split. +- If an assistant message contains tool_calls and the next message has no matching tool_result, injects an empty tool_result for each missing call so the upstream does not 400. + +### D6. 
Reasoning passthrough has two modes + +- **Reasoning as a separate output item** (preferred for clients that understand Responses-API reasoning items): when the upstream emits `reasoning_content` deltas, the translator opens a `reasoning` output item and emits `reasoning_summary_text.delta` events. +- **Reasoning embedded as `<think>...</think>` in text content**: legacy upstreams put thinking text inline. The translator recognises `<think>` and `</think>` markers in the text stream and routes the enclosed text into the reasoning channel instead of the text channel. + +### D7. Usage propagation is lossless across the pivot + +Cache tokens flow through the pivot without being dropped: +- Anthropic `cache_read_input_tokens` → Chat-Completions `prompt_tokens_details.cached_tokens` → Responses `input_tokens_details.cached_tokens`. +- Anthropic `cache_creation_input_tokens` → Chat-Completions `prompt_tokens_details.cache_creation_tokens`. +- `input_tokens = prompt_tokens − cached_tokens − cache_creation_tokens` is the canonical decomposition rule applied at the Chat-Completions → Anthropic hop. + +### D8. `max_tokens` adjustment is upstream-friendly + +The translator: +- Falls back to a default `max_tokens` if the client did not provide one. +- Raises `max_tokens` to a configurable minimum when `tools[]` is non-empty (prevents truncated tool arguments). +- Raises `max_tokens` above `thinking.budget_tokens + buffer` (Anthropic requires strictly greater). + +### D9. System prompt extraction and JSON-mode shim + +- All `role: "system"` messages in the intermediate Chat-Completions shape are concatenated and lifted to the Anthropic `system` block list. +- A Responses-API `instructions` field is treated as a single system message at the head of the message list. +- `response_format = json_schema` appends a system block telling the model to emit strict JSON matching the supplied schema. `response_format = json_object` appends a generic strict-JSON instruction. (Anthropic has no native equivalent.) 
+ +### D10. Image input mapping + +- Responses-API `input_image` with `image_url` (string) becomes intermediate `image_url` with `{ url, detail: "auto" }`. +- Intermediate `image_url` whose URL starts with `data:<media_type>;base64,...` becomes Anthropic `image` with `source: { type: "base64", media_type, data }`. +- Intermediate `image_url` whose URL starts with `http://` or `https://` becomes Anthropic `image` with `source: { type: "url", url }`. +- Any other URL shape is dropped (Anthropic does not support arbitrary file IDs natively). + +### D11. Reasoning items in INPUT + +When a `reasoning` input item appears between turns, its text is extracted (from `summary[].text` if present, else from `content[].text`) and **buffered** until the next assistant message or function_call; it is then attached as `reasoning_content` to that assistant turn. A `reasoning` item is never emitted as a standalone Chat-Completions message. + +### D12. Format detection by endpoint + +The dispatch decision uses the endpoint path as the primary key: `/v1/responses` → Responses-API source format, `/v1/messages` → Anthropic source format, `/v1/chat/completions` with a body field that looks like Responses-API → Responses-API source (for CLI clients that send Responses bodies to the chat endpoint). + +## Risks / Trade-offs + +- **[Risk]** Streaming SSE order is observable to clients; a bug in open/close discipline produces malformed `output_item` brackets that crash strict SDKs. + - **Mitigation**: Behavioral assertions in the spec pin down exact event ordering; tests cover the cross-block transitions (reasoning→text, text→tool_call, finish flush, null-flush). +- **[Risk]** Tool-call ID UUID fallback assigns a fresh UUID when the client's ID fails the regex AND has no usable residue; the client cannot correlate the resulting tool_use back to its original local ID. + - **Mitigation**: UUID fallback only triggers when the original ID is unrecoverable. 
The strip-and-keep tier handles the common case (`call:abc/123` → `callabc123`) without losing correlation. Document the policy in the operator-facing notes. +- **[Risk]** Token-usage decomposition (`input − cached − cache_creation`) underflows to negative when upstreams report inconsistent values. + - **Mitigation**: Clamp to zero; document the invariant in the spec. +- **[Risk]** The intermediate Chat-Completions pivot adds latency on the request-build path. + - **Mitigation**: All translation is pure-CPU JSON shape rewriting; profile after first integration test pass. +- **[Risk]** The Anthropic `thinking` block requires `max_tokens > budget_tokens`; clients may set both and break the upstream. + - **Mitigation**: Translator raises `max_tokens` automatically; documented in the spec. +- **[Trade-off]** We do not attempt to round-trip every Responses-API field (`store`, `background`, `prompt_cache_key`, `include`). These are stripped silently. Clients that rely on them get no error but no behavior change either. Phase 3 may decide to surface a warning. + +## Migration Plan + +- This is additive. No data migration. No client-visible change for requests that previously succeeded. +- Rollout: feature flag `RESPONSES_TO_ANTHROPIC_ENABLED` read via `common.GetEnvOrDefaultBool("RESPONSES_TO_ANTHROPIC_ENABLED", true)`, **default `true`**. Operators who want the prior "not implemented" behavior can set `RESPONSES_TO_ANTHROPIC_ENABLED=false`. +- Rollback: set the flag to `false`; the gateway falls back to the existing `adaptor.ConvertOpenAIResponsesRequest` path which returns the pre-change error. + +## Locked decisions + +- **Package placement** — confirmed: shape converters in `service/openaicompat/`, orchestration in `relay/responses_via_chat_completions.go`. +- **Public translator entry-point names** — confirmed: `ResponsesRequestToChatCompletionsRequest`, `ChatCompletionsStreamToResponsesEvents`, `ChatCompletionsResponseToResponsesResponse`. 
+- **Per-stream state struct** — confirmed: `ResponsesStreamState` exported from `service/openaicompat/`. +- **OAuth tool-name prefix** — confirmed: not applicable; no prefix is applied and no name-mapping table is kept. +- **JSON-mode system-prompt strings** — confirmed: hard-coded English. +- **Tool-call ID strategy** — confirmed: pass-through / strip-and-keep / UUID fallback (D4 above). No deterministic positional synthesis. +- **Feature flag default** — confirmed: `RESPONSES_TO_ANTHROPIC_ENABLED=true` (default ON). diff --git a/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/proposal.md b/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/proposal.md new file mode 100644 index 00000000000..418a00150ec --- /dev/null +++ b/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/proposal.md @@ -0,0 +1,78 @@ +## Why + +Clients of the gateway today can hit `POST /v1/responses` (OpenAI Responses API shape) and expect to be served by any routed upstream channel. The relay supports OpenAI-compatible upstreams and Anthropic `/v1/messages` upstreams independently, but when a `/v1/responses` request is routed to an Anthropic-typed channel the gateway has no end-to-end translation path: the request shape cannot be forwarded to `/v1/messages` as-is, and the upstream streaming events cannot be re-encoded into Responses-API events without a translation layer. + +This change introduces that translation layer so a single Responses-API request can be served transparently by an Anthropic upstream, with full feature parity for streaming text, reasoning (thinking) passthrough, multi-turn tool use, image input, system prompt extraction, JSON-mode hints, and token usage (including prompt cache tokens) propagation. 
+ +## What Changes + +- **New translation pipeline** for inbound requests: Responses-shaped request → Chat-Completions-shaped intermediate → Anthropic Messages-shaped request, wired into the existing relay format dispatch so that routing a `/v1/responses` request to an Anthropic-typed channel succeeds instead of returning "not implemented". +- **New translation pipeline** for outbound responses (both streaming SSE and final non-streaming): Anthropic Messages event stream → Chat-Completions chunk shape → Responses-API event stream, including correct `response.created` / `response.in_progress` / `response.output_item.added` / delta / `response.completed` event ordering and sequence numbering. +- **Reasoning passthrough**: when the upstream emits a `thinking` block, the gateway re-emits it as Responses-API `reasoning` output items with proper `reasoning_summary_text.delta` / `reasoning_summary_text.done` / `reasoning_summary_part.done` / `output_item.done` event sequencing. `<think>...</think>` inline markers in regular text are also recognised and rerouted. +- **System prompt extraction**: a Responses-API `instructions` field, or a `system` message in an intermediate shape, is lifted into the Anthropic `system` block list with proper cache_control handling. +- **Tool use round-tripping**: tool declarations, tool calls, and tool results are converted in both directions; tool-use blocks and their tool_result counterparts are placed in adjacent Anthropic messages per Anthropic API rules; missing tool results are auto-injected as empty before forwarding upstream; assistant text emitted after a `tool_use` block is dropped; consecutive same-role messages are merged. +- **Tool-call ID hygiene**: every tool call must have an ID. IDs that already match the Anthropic-compatible regex `^[a-zA-Z0-9_-]+$` and are ≤ 64 characters are passed through unchanged. 
IDs that contain invalid characters are sanitized by stripping non-`[a-zA-Z0-9_-]` characters and keeping the result if non-empty; otherwise a fresh UUID is generated as the replacement. IDs longer than 64 characters are clamped at the Responses-side boundary. Nameless tool calls and hosted (no-name) tool declarations are filtered out before forwarding upstream. +- **`max_tokens` clamp**: `max_tokens` is set from the request, raised to a configurable minimum when tools are present (to avoid truncated tool arguments), and raised above `thinking.budget_tokens + buffer` when the upstream is in thinking mode (Anthropic requires `max_tokens > budget_tokens`). +- **Image input mapping**: Responses-API `input_image` items are converted to intermediate `image_url`, then to Anthropic `image` blocks; `data:` URLs become `base64` sources and `http(s)` URLs become `url` sources. +- **Reasoning-effort mapping**: a Chat-Completions-shaped `reasoning_effort` enum (none/low/medium/high/xhigh) is converted to a Claude `thinking.budget_tokens` value when no explicit `thinking` block is present. +- **Response-format mapping**: `response_format = json_object` or `json_schema` injects an extra system-prompt block instructing the model to return strict JSON (Anthropic has no native equivalent field). +- **Usage propagation**: prompt cache read/write tokens are propagated through every translation hop. In the upstream-to-OpenAI direction, `cache_read_input_tokens` and `cache_creation_input_tokens` flow into `prompt_tokens_details.cached_tokens` and `prompt_tokens_details.cache_creation_tokens`. In the downstream-to-Responses direction, they flow into `input_tokens_details.cached_tokens`. +- **Input shape normalization**: a string `input` is wrapped as a single user message with an `input_text` part; an empty array `input[]` is replaced with a single placeholder message so the upstream does not receive `messages: []`; items with a `role` field but no `type` are treated as `message` items. 
+- **Reasoning items in input**: a `reasoning` input item is buffered and attached to the next assistant message as `reasoning_content`, never forwarded as a standalone message. +- **Failure mapping**: upstream `error` and `response.failed` events surface as a documented OpenAI-shaped error chunk (no duplicate emission). +- The current behavior of returning a 5xx-class "not implemented" error for `/v1/responses` requests routed to Anthropic-typed channels is **REMOVED**. + +## Capabilities + +### New Capabilities +- `responses-to-anthropic-translation`: end-to-end translation of OpenAI Responses-API requests and streamed responses to and from the Anthropic Messages-API shape, including request body conversion, response event re-encoding, tool-use round-tripping, reasoning passthrough, image input mapping, system prompt extraction, JSON-mode hint injection, token usage propagation (including prompt-cache token classes), and input-shape normalization. + +### Modified Capabilities +- (none — this introduces a new translation pipeline rather than altering existing spec-level behavior. The change does not modify existing channel BYOK, quota, billing, retry, or auto-ban behavior.) + +## Scope + +**In scope (this change):** +- Request shape: Responses-API `{ input, instructions, tools, tool_choice, temperature, top_p, max_tokens, reasoning, reasoning_effort, response_format, thinking, model, stream }` +- Response stream: text deltas, reasoning deltas, tool-call deltas, finish reasons (`stop`, `length`, `tool_calls`), usage (including cache tokens) +- Both streaming and non-streaming Responses-API client modes +- Tool declarations in both `{ type: "function", function: { name, ... } }` and bare `{ type: "function", name, ... 
}` Responses-API forms; pass-through of built-in (non-function) tool types when target is Anthropic +- Behavioral parity for the existing flow of intermediate-Chat-Completions ↔ Anthropic Messages, since the Responses-to-Anthropic path piggybacks on it + +**Out of scope (explicit non-goals):** +- File-search / web-search / computer-use / code-interpreter hosted tools on the Responses-API surface beyond pass-through of declarations +- Anthropic-side `output_config`, structured-output JSON schema enforcement, and provider-specific quirks for non-Anthropic upstreams (these are pre-existing behaviors and are not modified here) +- Persistent conversation storage (`store: true` semantics); the translator strips this field +- Background mode (`background: true` Responses-API field) +- Encrypted content reasoning items (`encrypted_content` summary fallback) beyond the documented text-extraction path +- Any change to quota, billing, log attribution, or channel selection +- Any change to the existing OpenAI-compatible `/v1/chat/completions` path + +## Impact + +- **Affected APIs**: `POST /v1/responses` becomes routable to Anthropic-typed channels. 
+- **Affected code areas**: + - `service/openaicompat/responses_to_chat.go` (new function `ResponsesRequestToChatCompletionsRequest`) + - `service/openaicompat/chat_to_responses.go` (new functions `ChatCompletionsStreamToResponsesEvents` + `ChatCompletionsResponseToResponsesResponse` + per-stream state struct) + - `relay/responses_via_chat_completions.go` (new orchestration file, mirror of `relay/chat_completions_via_responses.go`) + - `relay/responses_handler.go` (new branch when `info.ApiType == APITypeAnthropic`, calling the new orchestration before falling back to `adaptor.ConvertOpenAIResponsesRequest`) +- **Reused converters (not duplicated)**: + - `relay/channel/claude/relay-claude.go::RequestOpenAI2ClaudeMessage` — Chat-Completions request → Anthropic Messages request (already handles tool ordering, max_tokens adjustment, image mapping, system extraction) + - `relay/channel/claude/relay-claude.go::ClaudeStreamHandler` + `StreamResponseClaude2OpenAI` — Claude streaming response → Chat-Completions chunks + - `relay/channel/claude/relay-claude.go::ClaudeHandler` + `ResponseClaude2OpenAI` — Claude non-streaming response → Chat-Completions response +- **Dependencies**: no new third-party dependencies; uses the project's existing JSON wrapper (`common.Marshal`/`common.Unmarshal`) and the standard library UUID/random generator. +- **Database**: no migrations. +- **Frontend**: no UI changes; the translation is transparent to clients. +- **Backward compatibility**: additive. Requests that were previously rejected ("not implemented") now succeed. Requests that previously succeeded (Responses-to-OpenAI-compatible upstreams) are not affected. + +## Locked decisions (Phase 3) + +- **Package placement**: shape converters land in `service/openaicompat/` parallel to the existing `chat_to_responses.go`/`responses_to_chat.go`; orchestration lands in `relay/responses_via_chat_completions.go` mirroring the existing `relay/chat_completions_via_responses.go`. 
+- **Naming**: PascalCase `XToY` style matching project convention: `ResponsesRequestToChatCompletionsRequest`, `ChatCompletionsStreamToResponsesEvents`, `ChatCompletionsResponseToResponsesResponse`. Per-stream state struct: `ResponsesStreamState`. +- **Reuse strategy**: the `ChatCompletions ↔ AnthropicMessages` legs are NOT reimplemented; the existing Claude adaptor converters listed above are called directly. +- **Tool-call ID strategy**: pass-through when valid; sanitize non-empty residue when partially invalid; UUID fallback (no deterministic synthesis) when fully invalid. Clamp to 64 characters at the Responses-side boundary. +- **OAuth tool-name prefix**: NOT applicable to this project (the Anthropic adaptor uses `x-api-key`, not an OAuth flow). The translator hard-codes no prefix; no `prefixedName→originalName` map exists. +- **JSON-mode prompt text**: hard-coded English, matching the convention of other converters in this codebase. +- **Test style**: assertion-style using `testify/require` and `t.Errorf`, matching `relay/channel/claude/relay_claude_test.go`. No golden files. +- **Feature gate**: `RESPONSES_TO_ANTHROPIC_ENABLED`, default `true`. Operators can set the variable to `false` to restore the prior "not implemented" behavior. +- **Conflict surface**: clean. The only uncommitted change at the time of this proposal is this OpenSpec change itself; no in-flight work touches `relay/responses_handler.go` or `relay/channel/claude/`. 
diff --git a/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/specs/responses-to-anthropic-translation/spec.md b/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/specs/responses-to-anthropic-translation/spec.md new file mode 100644 index 00000000000..b0300a40765 --- /dev/null +++ b/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/specs/responses-to-anthropic-translation/spec.md @@ -0,0 +1,856 @@ +## ADDED Requirements + +### Requirement: Endpoint-driven source format detection + +The gateway SHALL classify the inbound request's source format from the URL path before consulting the body shape. A request whose path contains `/v1/responses` SHALL be treated as the Responses-API source format. A request whose path contains `/v1/messages` SHALL be treated as the Anthropic-Messages source format. A request whose path contains `/v1/chat/completions` SHALL be treated as the OpenAI Chat-Completions source format, except that when its JSON body has a top-level `input` field that is an array, it SHALL be reclassified as the Responses-API source format. 
+ +#### Scenario: `/v1/responses` path is Responses-API source + +- **WHEN** a client sends `POST /v1/responses` +- **THEN** the gateway SHALL select the Responses-API translator chain regardless of body shape + +#### Scenario: `/v1/messages` path is Anthropic source + +- **WHEN** a client sends `POST /v1/messages` +- **THEN** the gateway SHALL select the Anthropic-source translator chain regardless of body shape + +#### Scenario: `/v1/chat/completions` with Responses-style body + +- **WHEN** a client sends `POST /v1/chat/completions` with a JSON body whose `input` field is an array +- **THEN** the gateway SHALL select the Responses-API source format + +#### Scenario: `/v1/chat/completions` with normal body + +- **WHEN** a client sends `POST /v1/chat/completions` with a JSON body that has no `input` array and uses `messages[]` +- **THEN** the gateway SHALL select the OpenAI Chat-Completions source format + +### Requirement: Two-step pivot through Chat-Completions intermediate + +When the inbound source format and the outbound target format differ, the gateway SHALL perform translation in two hops through a Chat-Completions-shaped intermediate object. The Responses-API to Anthropic-Messages request translation SHALL execute `Responses → ChatCompletions` followed by `ChatCompletions → AnthropicMessages`. The Anthropic-Messages to Responses-API response translation SHALL execute `AnthropicMessages → ChatCompletions` followed by `ChatCompletions → ResponsesEvents`. 
+ +#### Scenario: Request pivot is two-hop + +- **WHEN** a Responses-API request body is routed to an Anthropic-typed channel +- **THEN** the request body delivered to the upstream SHALL be the result of applying the Responses→ChatCompletions translator followed by the ChatCompletions→AnthropicMessages translator, in that order + +#### Scenario: Response pivot is two-hop + +- **WHEN** an Anthropic streaming response chunk is received and the original client expects Responses-API events +- **THEN** the chunk SHALL be passed through the Anthropic→ChatCompletions translator, and each emitted Chat-Completions chunk SHALL be passed through the ChatCompletions→ResponsesEvents translator before being written to the client + +#### Scenario: Same-format requests skip translation + +- **WHEN** the source and target formats are identical +- **THEN** no translator is invoked and the body or chunk passes through unchanged + +### Requirement: Responses-API input shape normalization + +The gateway SHALL accept the Responses-API `input` field in three shapes and normalize them to an internal array of input items before translation: (a) a non-empty string, (b) an empty or whitespace-only string, (c) an array (possibly empty). A non-empty string SHALL be wrapped as a single user message item whose content is a single `input_text` part with the original text. An empty or whitespace-only string SHALL be wrapped as a single user message item whose content is a single `input_text` part with the placeholder text `"..."`. An empty array SHALL be replaced with a single user message item whose content is a single `input_text` part with the placeholder text `"..."`. A non-empty array SHALL be passed through unchanged. Any other shape SHALL be treated as invalid and SHALL cause the body to be forwarded unchanged (no translation). 
+ +#### Scenario: String input is wrapped as user message + +- **WHEN** the request body contains `input: "hello world"` +- **THEN** the normalized input items SHALL be `[{ type: "message", role: "user", content: [{ type: "input_text", text: "hello world" }] }]` + +#### Scenario: Empty string input is wrapped as placeholder + +- **WHEN** the request body contains `input: ""` +- **THEN** the normalized input items SHALL be `[{ type: "message", role: "user", content: [{ type: "input_text", text: "..." }] }]` + +#### Scenario: Empty array input is replaced with placeholder + +- **WHEN** the request body contains `input: []` +- **THEN** the normalized input items SHALL be `[{ type: "message", role: "user", content: [{ type: "input_text", text: "..." }] }]` + +#### Scenario: Non-empty array is passed through + +- **WHEN** the request body contains `input: [{ type: "message", role: "user", content: [...] }]` +- **THEN** the normalized input items SHALL equal the original array + +#### Scenario: Non-string non-array input + +- **WHEN** the request body contains `input: 42` or `input: { foo: "bar" }` +- **THEN** the gateway SHALL forward the body unchanged without invoking the Responses→ChatCompletions translator + +### Requirement: Responses-API `instructions` becomes a system message + +When the Responses-API request body contains a non-empty `instructions` string, the gateway SHALL prepend a single `role: "system"` message whose `content` is that string to the Chat-Completions `messages[]`. + +#### Scenario: Instructions prepended as system + +- **WHEN** the request body contains `instructions: "You are helpful."` +- **THEN** the first message in the resulting Chat-Completions `messages[]` SHALL be `{ role: "system", content: "You are helpful." 
}` + +#### Scenario: Empty instructions is skipped + +- **WHEN** the request body contains `instructions: ""` or no `instructions` field +- **THEN** no system message SHALL be prepended on behalf of `instructions` + +### Requirement: Input item type detection with role-only fallback + +The gateway SHALL determine each input item's type by reading its `type` field. If the `type` field is missing but a `role` field is present, the item SHALL be treated as type `"message"`. If neither field is present, the item SHALL be skipped silently. + +#### Scenario: Explicit type wins + +- **WHEN** an input item is `{ type: "function_call", call_id: "x", name: "y", arguments: "{}" }` +- **THEN** the item SHALL be processed as a function call + +#### Scenario: Role-only fallback + +- **WHEN** an input item is `{ role: "user", content: [{ type: "input_text", text: "hi" }] }` with no `type` field +- **THEN** the item SHALL be processed as type `"message"` + +#### Scenario: Neither type nor role + +- **WHEN** an input item is `{ foo: "bar" }` +- **THEN** the item SHALL be skipped without error + +### Requirement: Message item content normalization + +For each input item of type `"message"`, the gateway SHALL map content parts to Chat-Completions content parts as follows: `input_text` and `output_text` parts SHALL become `{ type: "text", text }` parts; `input_image` parts SHALL become `{ type: "image_url", image_url: { url, detail } }` parts where `url` is the part's `image_url` field (if a string) or `file_id` field (if no `image_url`), and `detail` is the part's `detail` field or `"auto"` if absent. Parts of any other type SHALL be passed through unchanged. 
+ +#### Scenario: input_text becomes text + +- **WHEN** a message item has `content: [{ type: "input_text", text: "hello" }]` +- **THEN** the converted Chat-Completions message content SHALL be `[{ type: "text", text: "hello" }]` + +#### Scenario: output_text becomes text + +- **WHEN** a message item has `content: [{ type: "output_text", text: "answer" }]` +- **THEN** the converted Chat-Completions message content SHALL be `[{ type: "text", text: "answer" }]` + +#### Scenario: input_image with image_url becomes image_url + +- **WHEN** a message item has `content: [{ type: "input_image", image_url: "https://example.com/a.png", detail: "high" }]` +- **THEN** the converted Chat-Completions message content SHALL be `[{ type: "image_url", image_url: { url: "https://example.com/a.png", detail: "high" } }]` + +#### Scenario: input_image with file_id fallback + +- **WHEN** a message item has `content: [{ type: "input_image", file_id: "file_abc" }]` and no `image_url` +- **THEN** the converted content SHALL be `[{ type: "image_url", image_url: { url: "file_abc", detail: "auto" } }]` + +#### Scenario: input_image with no url or file_id + +- **WHEN** a message item has `content: [{ type: "input_image" }]` with neither `image_url` nor `file_id` +- **THEN** the converted content SHALL be `[{ type: "image_url", image_url: { url: "", detail: "auto" } }]` + +### Requirement: Function-call items become assistant tool_calls + +For each input item of type `"function_call"`, the gateway SHALL append the call to a buffered assistant message in the form `{ role: "assistant", content: null, tool_calls: [...] }`. Each tool call SHALL be `{ id: <call_id>, type: "function", function: { name, arguments } }`, where `<call_id>` is the item's `call_id` field. The buffered assistant message SHALL be flushed to the message list when the next non-function-call item is encountered or at end-of-input. Function-call items whose `name` is missing, not a string, or trimmed-empty SHALL be skipped silently. 
+ +#### Scenario: Single function call + +- **WHEN** input contains `{ type: "function_call", call_id: "c1", name: "search", arguments: "{\"q\":\"x\"}" }` followed by no more items +- **THEN** the resulting messages SHALL include `{ role: "assistant", content: null, tool_calls: [{ id: "c1", type: "function", function: { name: "search", arguments: "{\"q\":\"x\"}" } }] }` + +#### Scenario: Multiple consecutive function calls collapse + +- **WHEN** input contains two consecutive function_call items with call_ids `c1` and `c2` +- **THEN** both calls SHALL be in the same assistant message's `tool_calls` array, in order + +#### Scenario: Function call with empty name is dropped + +- **WHEN** input contains `{ type: "function_call", call_id: "c1", name: "", arguments: "{}" }` +- **THEN** the call SHALL NOT appear in any resulting assistant message + +#### Scenario: Function call with missing name is dropped + +- **WHEN** input contains `{ type: "function_call", call_id: "c1", arguments: "{}" }` with no `name` field +- **THEN** the call SHALL NOT appear in any resulting assistant message + +### Requirement: Function-call-output items become tool messages + +For each input item of type `"function_call_output"`, the gateway SHALL flush any buffered assistant message and SHALL append a tool message `{ role: "tool", tool_call_id: <call_id>, content: <output_text> }` where `<call_id>` is the item's `call_id` field and `<output_text>` is the item's `output` field if it is a string, or the JSON-stringified value of `output` otherwise. 
+ +#### Scenario: String output passes through + +- **WHEN** input contains `{ type: "function_call_output", call_id: "c1", output: "result text" }` +- **THEN** the resulting messages SHALL include `{ role: "tool", tool_call_id: "c1", content: "result text" }` + +#### Scenario: Non-string output is JSON-stringified + +- **WHEN** input contains `{ type: "function_call_output", call_id: "c1", output: { ok: true, n: 7 } }` +- **THEN** the resulting messages SHALL include `{ role: "tool", tool_call_id: "c1", content: "{\"ok\":true,\"n\":7}" }` + +#### Scenario: Output flushes pending assistant first + +- **WHEN** input contains a `function_call` item followed by a `function_call_output` item +- **THEN** the assistant message containing the call SHALL be appended to the message list BEFORE the tool message + +### Requirement: Reasoning input items are buffered, not emitted + +For each input item of type `"reasoning"`, the gateway SHALL extract its text by joining the `text` fields of every entry in its `summary[]` array with newlines if `summary[]` is a non-empty array; otherwise by joining the `text` fields of every entry in its `content[]` array; otherwise SHALL extract an empty string. The extracted text SHALL be buffered. The buffered text SHALL be attached as `reasoning_content` to the next assistant message OR to the next buffered assistant tool-call message, whichever comes first. After attachment the buffer SHALL be cleared. A `reasoning` item SHALL NOT appear in the Chat-Completions `messages[]` directly. 
+ +#### Scenario: Reasoning text attached to next assistant message + +- **WHEN** input contains `{ type: "reasoning", summary: [{ text: "thinking step 1" }] }` followed by `{ type: "message", role: "assistant", content: [{ type: "output_text", text: "answer" }] }` +- **THEN** the resulting assistant message SHALL be `{ role: "assistant", content: [{ type: "text", text: "answer" }], reasoning_content: "thinking step 1" }` + +#### Scenario: Reasoning text attached to tool-call assistant message + +- **WHEN** input contains a `reasoning` item followed by a `function_call` item +- **THEN** the assistant message synthesised for the function_call SHALL include `reasoning_content` equal to the buffered reasoning text + +#### Scenario: Reasoning falls back to content array + +- **WHEN** input contains `{ type: "reasoning", content: [{ text: "alt thinking" }] }` and no `summary[]` +- **THEN** the buffered reasoning text SHALL be `"alt thinking"` + +#### Scenario: Multiple reasoning items concatenate with newline + +- **WHEN** input contains two consecutive `reasoning` items with summaries `"a"` and `"b"` +- **THEN** the buffered reasoning text presented to the next assistant turn SHALL be `"a\nb"` + +#### Scenario: Reasoning buffer is cleared after attachment + +- **WHEN** a reasoning item's text has been attached to an assistant message and a subsequent assistant message arrives with no preceding reasoning +- **THEN** the second assistant message SHALL NOT have `reasoning_content` + +### Requirement: Tool declarations conversion (Responses → ChatCompletions) + +The gateway SHALL accept Responses-API tool declarations in two shapes: (a) already-Chat-Completions-shaped `{ type: "function", function: { name, description, parameters, strict } }`, which SHALL pass through unchanged; (b) Responses-flat `{ type: "function", name, description, parameters, strict }`, which SHALL be converted to the Chat-Completions shape. 
A tool declaration whose effective name is missing, non-string, or trimmed-empty SHALL be filtered out (this discards hosted tools that have no `name`). Tool parameter schemas that have `type: "object"` but no `properties` field SHALL be normalized to include `properties: {}`. Tools whose `type` is not `"function"` SHALL be retained unchanged when the target is Anthropic; they SHALL be filtered out when the intermediate is being normalized to OpenAI for non-Anthropic upstreams. + +#### Scenario: Already-Chat-Completions tool passes through + +- **WHEN** tools contains `{ type: "function", function: { name: "search", parameters: { type: "object", properties: { q: { type: "string" } } } } }` +- **THEN** the converted tools array SHALL contain that entry unchanged + +#### Scenario: Flat Responses tool is converted + +- **WHEN** tools contains `{ type: "function", name: "search", description: "find", parameters: { type: "object", properties: {} }, strict: true }` +- **THEN** the converted tools array SHALL contain `{ type: "function", function: { name: "search", description: "find", parameters: { type: "object", properties: {} }, strict: true } }` + +#### Scenario: Empty-name hosted tool is dropped + +- **WHEN** tools contains `{ type: "request_user_input" }` (no `name`) +- **THEN** the converted tools array SHALL NOT contain that entry + +#### Scenario: Object schema without properties gets `properties: {}` + +- **WHEN** a tool's parameters is `{ type: "object" }` +- **THEN** the converted parameters SHALL be `{ type: "object", properties: {} }` + +### Requirement: Responses-API request-body cleanup + +After translating to the Chat-Completions intermediate, the gateway SHALL remove the following fields from the result body: `input`, `instructions`, `include`, `prompt_cache_key`, `store`, `reasoning`. 
+ +#### Scenario: All Responses-only fields are removed + +- **WHEN** a Responses-API body containing `input`, `instructions`, `include`, `prompt_cache_key`, `store`, and `reasoning` is translated +- **THEN** the resulting Chat-Completions body SHALL have none of those six fields + +### Requirement: System message extraction for Anthropic target + +When translating Chat-Completions → Anthropic, the gateway SHALL collect every `role: "system"` message's content into a single `systemParts` list, removing those messages from the main `messages[]`. When `systemParts` is non-empty, the gateway SHALL emit the Anthropic `system` field as an array of text blocks. When the upstream channel type is the Anthropic OAuth profile, the gateway MAY prepend a project-defined client-identity system block; when prepended, this block SHALL be positioned first. When the `system` array contains more than one block, cache_control `{ type: "ephemeral", ttl: "1h" }` SHALL be applied to the LAST system block. + +#### Scenario: Single system message extracted + +- **WHEN** the intermediate has `messages: [{ role: "system", content: "You are helpful." 
}, { role: "user", content: "hi" }]` +- **THEN** the Anthropic body SHALL have `system` as a non-empty array containing a text block whose text is or includes `"You are helpful."`, and `messages` SHALL NOT contain the system message + +#### Scenario: Multiple system messages concatenated + +- **WHEN** the intermediate has two `role: "system"` messages with contents `"A"` and `"B"` +- **THEN** their texts SHALL be concatenated with newline separators into a single text block in the Anthropic `system` array + +#### Scenario: No system messages + +- **WHEN** the intermediate has no `role: "system"` messages and no client-identity block is configured +- **THEN** the Anthropic body SHALL have no `system` field (or an empty `system` is acceptable depending on host config) + +#### Scenario: Cache_control applied to last system block + +- **WHEN** the Anthropic `system` array has two or more text blocks +- **THEN** the LAST block SHALL have `cache_control: { type: "ephemeral", ttl: "1h" }` and no other block SHALL + +### Requirement: Tool-use / tool-result ordering for Anthropic + +When translating Chat-Completions → Anthropic, the gateway SHALL ensure that every tool_use block in an assistant message is followed in the next message by the matching tool_result block. The translator SHALL: +1. Split any user-or-tool message that contains both `tool_result` blocks and non-tool-result blocks: the tool_result blocks SHALL be emitted first in their own user message; the remaining blocks SHALL be emitted in a subsequent user message. +2. Flush the in-progress message immediately after appending tool_use blocks. +3. Drop assistant text blocks that appear AFTER a `tool_use` block within the same assistant content array (Anthropic rejects them). +4. Merge consecutive messages that share the same role after the above transforms. +5. When merging messages that contain tool_result blocks alongside non-tool-result blocks, place all tool_result blocks first in the merged content array. 
+ +#### Scenario: Tool_result moved to its own user message + +- **WHEN** a Chat-Completions input has a tool message followed by a user message with text content, both originally adjacent +- **THEN** the Anthropic `messages[]` SHALL contain a user message whose content is exclusively the tool_result block, followed by a user message whose content is the text block + +#### Scenario: Assistant text after tool_use is dropped + +- **WHEN** an assistant message has content `[{ type: "text", text: "before" }, { type: "tool_use", id: "t1", name: "x", input: {} }, { type: "text", text: "after" }]` +- **THEN** the Anthropic assistant message content SHALL be `[{ type: "text", text: "before" }, { type: "tool_use", id: "t1", name: "x", input: {} }]` (the `"after"` text is removed) + +#### Scenario: Thinking block before tool_use preserved + +- **WHEN** an assistant message has content `[{ type: "thinking", thinking: "T" }, { type: "tool_use", id: "t1", name: "x", input: {} }]` +- **THEN** both blocks SHALL be preserved in the Anthropic assistant message content + +#### Scenario: Consecutive user messages are merged + +- **WHEN** the intermediate `messages[]` has two consecutive `role: "user"` messages with text contents `"a"` and `"b"` +- **THEN** the Anthropic `messages[]` SHALL have a single user message whose content includes both text blocks (preserving order) + +#### Scenario: Merge with tool_result-first ordering + +- **WHEN** merging consecutive user messages, the first contains a `tool_result` block and the second contains a `text` block +- **THEN** the merged user message's content SHALL list the tool_result block before the text block + +### Requirement: Missing tool-result auto-injection + +If an assistant message contains one or more tool_calls (OpenAI shape) or tool_use blocks (Claude shape) and the next message does not contain a matching tool_result for at least one of those call IDs, the gateway SHALL insert an empty tool message `{ role: "tool", 
tool_call_id: <call_id>, content: "" }` for EACH missing call between the assistant message and whatever follows. + +#### Scenario: Single missing tool result is filled + +- **WHEN** messages are `[{ role: "assistant", tool_calls: [{ id: "c1", function: { name: "x", arguments: "{}" } }] }, { role: "user", content: "next" }]` +- **THEN** the resulting messages SHALL be `[{ role: "assistant", ... }, { role: "tool", tool_call_id: "c1", content: "" }, { role: "user", content: "next" }]` + +#### Scenario: Multiple missing tool results + +- **WHEN** an assistant message has two tool_calls with IDs `c1` and `c2` and the next message is a user message +- **THEN** TWO empty tool messages SHALL be inserted, one per call ID, in the order the calls appeared + +#### Scenario: Existing tool result is not duplicated + +- **WHEN** an assistant message has a tool_call with ID `c1` and the next message is `{ role: "tool", tool_call_id: "c1", content: "result" }` +- **THEN** no additional tool message SHALL be inserted + +### Requirement: Tool-call ID sanitization + +The gateway SHALL ensure that every tool_call ID (in `tool_calls[].id` of assistant messages, `tool_call_id` of tool messages, `tool_use.id` and `tool_result.tool_use_id` of content blocks) matches the regex `^[a-zA-Z0-9_-]+$` AND is no longer than 64 characters before being forwarded to the Anthropic upstream. The gateway SHALL apply the following three-tier policy in order: + +1. **Pass-through**: if the ID already matches the regex AND is ≤ 64 characters, it SHALL be forwarded unchanged. +2. **Strip-and-keep**: otherwise, the gateway SHALL strip every character not in `[a-zA-Z0-9_-]`. If the residue is non-empty AND ≤ 64 characters, the residue SHALL be used. +3. **UUID fallback**: otherwise (residue empty, or residue longer than 64 characters), the gateway SHALL generate a fresh RFC-4122 UUID (with dashes removed so it matches the regex) and use that as the ID. 
The fallback SHALL NOT depend on the message index, tool-call index, or tool name. + +The same ID replacement SHALL be applied consistently to BOTH the originating `tool_use.id` / `tool_calls[].id` AND any matching `tool_result.tool_use_id` / `tool_call_id` references within the same request so the upstream sees a consistent mapping. + +The gateway SHALL also ensure that every tool_call's `type` field is set to `"function"` if missing, and that every tool_call's `function.arguments` field is a JSON string (the gateway SHALL JSON-stringify object values). + +#### Scenario: Valid ID passes through + +- **WHEN** a tool_call has `id: "call_abc-123"` +- **THEN** the ID SHALL remain `"call_abc-123"` + +#### Scenario: ID with invalid characters is sanitized + +- **WHEN** a tool_call has `id: "call:abc/123"` +- **THEN** the ID SHALL become `"callabc123"` + +#### Scenario: ID is entirely invalid characters + +- **WHEN** a tool_call has `id: "::::"` +- **THEN** the ID SHALL become a freshly generated UUID (matching `^[a-zA-Z0-9]+$` after dash removal), independent of message index or tool name + +#### Scenario: ID exceeds 64 characters after stripping + +- **WHEN** a tool_call has `id: "<70-character-alphanumeric-string>"` +- **THEN** the ID SHALL be replaced with a freshly generated UUID + +#### Scenario: tool_result references are remapped consistently + +- **WHEN** an assistant message has a tool_call whose ID is replaced with `X`, and the following user message has a `tool_result` with `tool_use_id` matching the original +- **THEN** the user message's `tool_use_id` SHALL also be `X` so the upstream sees a consistent pair + +#### Scenario: Object arguments stringified + +- **WHEN** a tool_call has `function.arguments: { q: "x" }` (an object, not a string) +- **THEN** `function.arguments` SHALL become the string `"{\"q\":\"x\"}"` + +#### Scenario: Type defaulted to function + +- **WHEN** a tool_call has no `type` field +- **THEN** `type` SHALL be set to `"function"` + +### 
Requirement: Tool declaration conversion (ChatCompletions → Anthropic) + +When translating Chat-Completions → Anthropic, the gateway SHALL convert each tool declaration as follows: a `{ type: "function", function: { name, description, parameters } }` declaration SHALL become `{ name: <function.name>, description: <function.description>, input_schema: <function.parameters> }`. A non-function tool declaration (e.g. an Anthropic-native server tool with a `type` other than `"function"`) SHALL be passed through unchanged. No tool-name prefix is applied; tool names are forwarded verbatim. + +If the converted tools array is non-empty, the LAST tool SHALL receive `cache_control: { type: "ephemeral", ttl: "1h" }` and no other tool SHALL. + +#### Scenario: Function tool conversion + +- **WHEN** the intermediate has `tools: [{ type: "function", function: { name: "search", description: "find", parameters: { type: "object", properties: { q: { type: "string" } } } } }]` +- **THEN** the Anthropic tools SHALL be `[{ name: "search", description: "find", input_schema: { type: "object", properties: { q: { type: "string" } } }, cache_control: { type: "ephemeral", ttl: "1h" } }]` + +#### Scenario: Default empty input_schema + +- **WHEN** a function tool has no `parameters` and no `input_schema` +- **THEN** the converted `input_schema` SHALL be `{ type: "object", properties: {}, required: [] }` + +#### Scenario: Server tool passes through + +- **WHEN** the intermediate has `tools: [{ type: "web_search_20250305", name: "web_search" }]` +- **THEN** that entry SHALL appear unchanged in the Anthropic tools array (no prefix applied) + +#### Scenario: Cache_control on last tool only + +- **WHEN** there are three function tools after conversion +- **THEN** only the third tool SHALL have `cache_control` set + +### Requirement: tool_choice conversion (ChatCompletions → Anthropic) + +The gateway SHALL convert the Chat-Completions `tool_choice` value to the Anthropic form as follows: +- `"auto"` or `"none"` → `{ type: "auto" }` +- `"required"` → `{ type: 
"any" }` +- `{ type: "function", function: { name: <name> } }` → `{ type: "tool", name: <name> }` +- An Anthropic-shaped object (one that already has `type`) SHALL pass through unchanged +- Any other value SHALL default to `{ type: "auto" }` + +#### Scenario: Auto + +- **WHEN** the intermediate has `tool_choice: "auto"` +- **THEN** the Anthropic `tool_choice` SHALL be `{ type: "auto" }` + +#### Scenario: Required becomes any + +- **WHEN** the intermediate has `tool_choice: "required"` +- **THEN** the Anthropic `tool_choice` SHALL be `{ type: "any" }` + +#### Scenario: Specific function + +- **WHEN** the intermediate has `tool_choice: { type: "function", function: { name: "search" } }` +- **THEN** the Anthropic `tool_choice` SHALL be `{ type: "tool", name: "search" }` + +#### Scenario: Already-Anthropic-shaped + +- **WHEN** the intermediate has `tool_choice: { type: "any" }` +- **THEN** the Anthropic `tool_choice` SHALL be `{ type: "any" }` + +### Requirement: max_tokens adjustment + +The gateway SHALL set the Anthropic `max_tokens` field as follows: +1. Start with the request's `max_tokens` if present, else the project default. +2. If `tools` is a non-empty array AND the current value is below the project's minimum-with-tools threshold, raise the value to that minimum. +3. If `thinking.budget_tokens` is set AND the current value is less than or equal to `budget_tokens`, raise the value to `budget_tokens + 1024`. 
+ +#### Scenario: Request max_tokens passes through + +- **WHEN** the request has `max_tokens: 4096` and no tools and no thinking +- **THEN** the Anthropic `max_tokens` SHALL be `4096` + +#### Scenario: Default applied when missing + +- **WHEN** the request has no `max_tokens` and no tools and no thinking +- **THEN** the Anthropic `max_tokens` SHALL be the project's default `DEFAULT_MAX_TOKENS` + +#### Scenario: Raised by tools minimum + +- **WHEN** the request has `max_tokens: 256` and a non-empty `tools` array, with project minimum `DEFAULT_MIN_TOKENS = 4096` +- **THEN** the Anthropic `max_tokens` SHALL be `4096` + +#### Scenario: Raised above thinking budget + +- **WHEN** the request has `max_tokens: 2048` and `thinking.budget_tokens: 8192` +- **THEN** the Anthropic `max_tokens` SHALL be `9216` (i.e. `budget_tokens + 1024`) + +#### Scenario: Thinking budget equal triggers raise + +- **WHEN** the request has `max_tokens: 8192` and `thinking.budget_tokens: 8192` (equal, not strictly greater) +- **THEN** the Anthropic `max_tokens` SHALL be `9216` + +### Requirement: reasoning_effort to thinking.budget_tokens mapping + +When the Chat-Completions intermediate has a `reasoning_effort` field but no explicit `thinking` block, the gateway SHALL map the effort to an Anthropic `thinking` configuration using the table: `none → no thinking emitted`, `low → { type: "enabled", budget_tokens: 4096 }`, `medium → { type: "enabled", budget_tokens: 8192 }`, `high → { type: "enabled", budget_tokens: 16384 }`, `xhigh → { type: "enabled", budget_tokens: 32768 }`. The mapping SHALL be case-insensitive. Any other effort value SHALL be ignored. 
+ +#### Scenario: medium effort + +- **WHEN** the intermediate has `reasoning_effort: "medium"` and no `thinking` field +- **THEN** the Anthropic body SHALL include `thinking: { type: "enabled", budget_tokens: 8192 }` + +#### Scenario: none effort emits no thinking + +- **WHEN** the intermediate has `reasoning_effort: "none"` +- **THEN** the Anthropic body SHALL NOT include a `thinking` field + +#### Scenario: Explicit thinking wins over effort + +- **WHEN** the intermediate has both `reasoning_effort: "low"` and `thinking: { type: "enabled", budget_tokens: 999 }` +- **THEN** the Anthropic `thinking` SHALL be `{ type: "enabled", budget_tokens: 999 }` + +#### Scenario: Case-insensitive + +- **WHEN** the intermediate has `reasoning_effort: "HIGH"` +- **THEN** the Anthropic body SHALL include `thinking: { type: "enabled", budget_tokens: 16384 }` + +### Requirement: response_format JSON-mode shim + +When the Chat-Completions intermediate has `response_format`, the gateway SHALL append an additional system block to `systemParts` before assembling the Anthropic `system` array. For `response_format.type === "json_schema"` with a non-null `json_schema.schema`, the appended text SHALL include the literal phrase "You must respond with valid JSON" AND a pretty-printed JSON rendering of the schema AND the literal phrase "Respond ONLY with the JSON object". For `response_format.type === "json_object"`, the appended text SHALL include the literal phrase "You must respond with valid JSON" AND the literal phrase "Respond ONLY with a JSON object". For any other `response_format` value, no system block SHALL be appended. 
+ +#### Scenario: json_schema appends instructions and schema + +- **WHEN** the intermediate has `response_format: { type: "json_schema", json_schema: { schema: { type: "object", properties: { answer: { type: "number" } } } } }` +- **THEN** the Anthropic `system` array SHALL contain a text block whose text contains both `"You must respond with valid JSON"` and the substring `"answer"` and `"Respond ONLY with the JSON object"` + +#### Scenario: json_object appends generic instruction + +- **WHEN** the intermediate has `response_format: { type: "json_object" }` +- **THEN** the Anthropic `system` array SHALL contain a text block whose text contains `"You must respond with valid JSON"` and `"Respond ONLY with a JSON object"` + +#### Scenario: Other type ignored + +- **WHEN** the intermediate has `response_format: { type: "text" }` or no `response_format` +- **THEN** no JSON-mode system block SHALL be appended + +#### Scenario: Coexists with user-supplied system + +- **WHEN** the intermediate has both a `role: "system"` message `"You are helpful."` and `response_format: { type: "json_object" }` +- **THEN** the Anthropic `system` array SHALL contain a text block whose combined text contains BOTH `"You are helpful."` AND `"You must respond with valid JSON"` + +### Requirement: Image content mapping (ChatCompletions → Anthropic) + +When translating Chat-Completions → Anthropic for a user message content block of type `image_url`, the gateway SHALL inspect the URL: +- If the URL matches `^data:([^;]+);base64,(.+)$`, emit an Anthropic block `{ type: "image", source: { type: "base64", media_type: <media_type>, data: <base64_data> } }`, where `<media_type>` and `<base64_data>` are the first and second capture groups of the regex. +- Else if the URL starts with `http://` or `https://`, emit `{ type: "image", source: { type: "url", url } }`. +- Else drop the image block. + +Anthropic-shape image blocks `{ type: "image", source: ... }` SHALL be passed through unchanged. 
+ +#### Scenario: Base64 data URL + +- **WHEN** a user message content has `{ type: "image_url", image_url: { url: "data:image/png;base64,iVBORw0KGgo=" } }` +- **THEN** the Anthropic block SHALL be `{ type: "image", source: { type: "base64", media_type: "image/png", data: "iVBORw0KGgo=" } }` + +#### Scenario: HTTP URL + +- **WHEN** a user message content has `{ type: "image_url", image_url: { url: "https://example.com/a.png" } }` +- **THEN** the Anthropic block SHALL be `{ type: "image", source: { type: "url", url: "https://example.com/a.png" } }` + +#### Scenario: Unsupported URL is dropped + +- **WHEN** a user message content has `{ type: "image_url", image_url: { url: "ftp://x/y" } }` +- **THEN** no image block SHALL appear in the Anthropic message content + +### Requirement: Assistant content blocks (ChatCompletions → Anthropic) + +For each assistant message in the Chat-Completions intermediate, the gateway SHALL map its content blocks and tool_calls into Anthropic content blocks as follows: + +- A `text` block with non-empty `text` SHALL become an Anthropic `{ type: "text", text }` block. +- A `tool_use` block SHALL become `{ type: "tool_use", id, name, input }`. The name is forwarded verbatim with no prefix applied. +- A `thinking` or `redacted_thinking` block SHALL pass through with its `cache_control` field stripped (these block types do not accept cache_control). +- A string `content` SHALL be emitted as a single text block when non-empty. +- For each entry in `tool_calls[]` whose `type` is `"function"`, an Anthropic `{ type: "tool_use", id, name: <function.name>, input: <parsed_arguments> }` block SHALL be appended, where `<parsed_arguments>` is derived from `function.arguments`; `function.arguments` SHALL be parsed as JSON if it is a string, falling back to the raw string when parsing fails. 
+ +#### Scenario: Text block conversion + +- **WHEN** an assistant message has `content: [{ type: "text", text: "hi" }]` +- **THEN** the Anthropic assistant content SHALL contain `{ type: "text", text: "hi" }` + +#### Scenario: tool_calls become tool_use + +- **WHEN** an assistant message has `tool_calls: [{ id: "c1", type: "function", function: { name: "search", arguments: "{\"q\":\"x\"}" } }]` +- **THEN** the Anthropic assistant content SHALL contain `{ type: "tool_use", id: "c1", name: "search", input: { q: "x" } }` + +#### Scenario: Unparseable arguments kept as string + +- **WHEN** a tool_call has `function.arguments: "not json"` +- **THEN** the Anthropic `tool_use.input` SHALL be the string `"not json"` + +#### Scenario: Thinking block strips cache_control + +- **WHEN** an assistant message has `content: [{ type: "thinking", thinking: "T", cache_control: { type: "ephemeral" } }]` +- **THEN** the Anthropic assistant content SHALL contain `{ type: "thinking", thinking: "T" }` with no `cache_control` + +### Requirement: User and tool content blocks (ChatCompletions → Anthropic) + +For a tool message (`role: "tool"`), the gateway SHALL emit `{ type: "tool_result", tool_use_id: <tool_call_id>, content: <content> }` as the sole block, where `<tool_call_id>` and `<content>` are the tool message's `tool_call_id` and `content` fields. + +For a user message: +- A string `content` SHALL produce a single `{ type: "text", text }` block when non-empty; empty strings emit nothing. +- An array `content` SHALL be walked: `text` blocks with non-empty text become Anthropic text blocks; `tool_result` blocks pass through (with their optional `is_error` field preserved); `image_url` and `image` blocks are mapped per the Image content mapping requirement. 
+ +#### Scenario: Tool message becomes tool_result + +- **WHEN** messages contain `{ role: "tool", tool_call_id: "c1", content: "result text" }` +- **THEN** the Anthropic message SHALL be `{ role: "user", content: [{ type: "tool_result", tool_use_id: "c1", content: "result text" }] }` + +#### Scenario: Tool_result with is_error + +- **WHEN** a user message has `content: [{ type: "tool_result", tool_use_id: "c1", content: "err", is_error: true }]` +- **THEN** the Anthropic block SHALL preserve `is_error: true` + +#### Scenario: Empty user string drops text block + +- **WHEN** a user message has `content: ""` +- **THEN** no text block SHALL be emitted for that message + +### Requirement: Cache_control on last assistant content block + +After all content blocks are assembled, the gateway SHALL apply `cache_control: { type: "ephemeral" }` to the LAST eligible content block of the LAST assistant message (eligible means type in `{text, tool_use, tool_result, image}` — thinking blocks are not eligible). At most one such marker SHALL be added per request. 
+ +#### Scenario: Marker applied to last text block + +- **WHEN** the last assistant message has content `[{ type: "thinking", thinking: "T" }, { type: "text", text: "answer" }]` +- **THEN** the text block SHALL receive `cache_control: { type: "ephemeral" }` and the thinking block SHALL NOT + +#### Scenario: Skip past trailing thinking + +- **WHEN** the last assistant message has content `[{ type: "text", text: "answer" }, { type: "thinking", thinking: "T" }]` +- **THEN** the text block (not the thinking block) SHALL receive `cache_control` + +#### Scenario: No assistant message + +- **WHEN** there is no assistant message in the conversation +- **THEN** no cache_control marker SHALL be added on the assistant side + +### Requirement: Response stream — message_start + +On the FIRST chunk received from the upstream that yields any usable delta, the streaming translator (Anthropic → ChatCompletions hop) SHALL emit a `message_start` event whose `message` field includes `id`, `type: "message"`, `role: "assistant"`, `model`, `content: []`, `stop_reason: null`, `stop_sequence: null`, and `usage: { input_tokens: 0, output_tokens: 0 }`. The translator SHALL derive `id` from the chunk's id (stripping a `chatcmpl-` prefix if present); if the derived id is empty, the value `"chat"`, or shorter than 8 characters, the translator SHALL fall back to a request-id or trace-id from the chunk's `extend_fields`, finally to `msg_<timestamp>` (a numeric suffix). The `model` field SHALL be the chunk's `model` field or `"unknown"`. This event SHALL fire exactly once per stream.
+ +#### Scenario: message_start fires once + +- **WHEN** two non-empty chunks are processed in sequence at the start of a stream +- **THEN** exactly one `message_start` event SHALL be emitted, on or before the first emission of any content_block event + +#### Scenario: Empty id falls back to msg_ + +- **WHEN** the first chunk has `id: ""` and no `extend_fields` +- **THEN** the emitted `message.id` SHALL match the regex `^msg_\d+$` + +#### Scenario: chatcmpl-prefix stripped + +- **WHEN** the first chunk has `id: "chatcmpl-abc12345"` +- **THEN** the emitted `message.id` SHALL be `"abc12345"` + +### Requirement: Response stream — text content blocks + +When a chunk's `delta.content` is non-empty, the translator SHALL ensure a text content_block is open (opening with `content_block_start` of type `text` at the next available index if not yet open) and SHALL emit a `content_block_delta` event of type `text_delta` carrying the content string. Before opening a text block, any open thinking block SHALL be closed via `content_block_stop`. 
+ +#### Scenario: First text delta opens a text block + +- **WHEN** the first content-bearing chunk has `delta.content: "hello"` +- **THEN** the translator SHALL emit a `content_block_start` (type text) followed by a `content_block_delta` (type text_delta, text "hello") + +#### Scenario: Subsequent text delta reuses the open block + +- **WHEN** a second chunk has `delta.content: " world"` and the text block is open +- **THEN** the translator SHALL emit ONLY a `content_block_delta` for that block index + +#### Scenario: Text after thinking closes thinking first + +- **WHEN** a thinking block is open and a chunk has `delta.content: "hello"` +- **THEN** a `content_block_stop` for the thinking block SHALL be emitted BEFORE the new text block's `content_block_start` + +### Requirement: Response stream — thinking content blocks + +When a chunk has `delta.reasoning_content` or `delta.reasoning` non-empty, the translator SHALL ensure a thinking content_block is open (opening with `content_block_start` of type `thinking` if not yet open) and SHALL emit a `content_block_delta` of type `thinking_delta`. Before opening a thinking block, any open text block SHALL be closed via `content_block_stop` (idempotent). 
+ +#### Scenario: reasoning_content opens thinking + +- **WHEN** a chunk has `delta.reasoning_content: "step 1"` and no prior thinking emitted +- **THEN** the translator SHALL emit `content_block_start` (type thinking) followed by `content_block_delta` (type thinking_delta, thinking "step 1") + +#### Scenario: reasoning alias + +- **WHEN** a chunk has `delta.reasoning: "step 2"` (note the alternate field name) and no `reasoning_content` +- **THEN** the translator SHALL behave as if `delta.reasoning_content` were `"step 2"` + +### Requirement: Response stream — tool_use content blocks + +When a chunk's `delta.tool_calls[]` contains an entry with a non-empty `id`, the translator SHALL close any open text or thinking block and SHALL open a new tool_use content_block at the next available index. The block's `name` SHALL be the entry's `function.name` (forwarded verbatim, no prefix stripping). The block's `input` SHALL start as `{}`. When a subsequent chunk emits `function.arguments` for the same tool_call index, the translator SHALL emit `content_block_delta` of type `input_json_delta` with `partial_json` equal to that argument fragment. On finish, every open tool_use block SHALL be closed via `content_block_stop`. 
+ +#### Scenario: tool_call opens tool_use block + +- **WHEN** a chunk has `delta.tool_calls: [{ index: 0, id: "c1", function: { name: "search" } }]` +- **THEN** the translator SHALL emit `content_block_start` of type `tool_use` with `id: "c1"`, name `"search"`, input `{}` + +#### Scenario: Subsequent argument fragments emit input_json_delta + +- **WHEN** chunk 2 has `delta.tool_calls: [{ index: 0, function: { arguments: "{\"q\":" } }]` and chunk 3 has `delta.tool_calls: [{ index: 0, function: { arguments: "\"x\"}" } }]` +- **THEN** the translator SHALL emit TWO `content_block_delta` events with `input_json_delta`, with partial_json `"{\"q\":"` then `"\"x\"}"` + +#### Scenario: Tool name forwarded verbatim + +- **WHEN** a tool_call has `function.name: "search"` +- **THEN** the emitted tool_use block's `name` SHALL be `"search"` (no prefix added, no prefix stripped) + +#### Scenario: All tool_use blocks closed on finish + +- **WHEN** the upstream emits two tool_calls and then a `finish_reason: "tool_calls"` chunk +- **THEN** TWO `content_block_stop` events SHALL be emitted, one per open tool_use block + +### Requirement: Response stream — finish and usage + +When a chunk has a non-null `finish_reason`, the translator (Anthropic → ChatCompletions hop) SHALL close any open text, thinking, and tool_use blocks, emit a `message_delta` event whose `delta.stop_reason` is the mapped value of the finish reason (`stop → end_turn`, `length → max_tokens`, `tool_calls → tool_use`, any other → `end_turn`) and whose `usage` is the accumulated usage, then emit `message_stop`. The accumulated `usage` SHALL be computed from any chunk that carries a `usage` object: `input_tokens = max(0, prompt_tokens − cached_tokens − cache_creation_tokens)`, `output_tokens = completion_tokens`, `cache_read_input_tokens = cached_tokens` (omitted when zero), `cache_creation_input_tokens = cache_creation_tokens` (omitted when zero). 
Cache token fields are read from `usage.prompt_tokens_details.{cached_tokens, cache_creation_tokens}`. Reasoning-token sub-detail SHALL NOT be added to output_tokens (it is already included in completion_tokens). + +#### Scenario: stop maps to end_turn + +- **WHEN** the finishing chunk has `finish_reason: "stop"` +- **THEN** the emitted `message_delta` SHALL have `delta.stop_reason: "end_turn"` + +#### Scenario: length maps to max_tokens + +- **WHEN** the finishing chunk has `finish_reason: "length"` +- **THEN** the emitted `message_delta` SHALL have `delta.stop_reason: "max_tokens"` + +#### Scenario: tool_calls maps to tool_use + +- **WHEN** the finishing chunk has `finish_reason: "tool_calls"` +- **THEN** the emitted `message_delta` SHALL have `delta.stop_reason: "tool_use"` + +#### Scenario: Unknown finish reason maps to end_turn + +- **WHEN** the finishing chunk has `finish_reason: "content_filter"` +- **THEN** the emitted `message_delta` SHALL have `delta.stop_reason: "end_turn"` + +#### Scenario: Cache tokens propagated + +- **WHEN** any chunk's `usage` is `{ prompt_tokens: 100, completion_tokens: 50, prompt_tokens_details: { cached_tokens: 30, cache_creation_tokens: 20 } }` +- **THEN** the emitted `usage` SHALL be `{ input_tokens: 50, output_tokens: 50, cache_read_input_tokens: 30, cache_creation_input_tokens: 20 }` + +#### Scenario: Zero cache tokens omitted + +- **WHEN** any chunk's `usage` is `{ prompt_tokens: 100, completion_tokens: 50, prompt_tokens_details: { cached_tokens: 0 } }` +- **THEN** the emitted `usage` SHALL be `{ input_tokens: 100, output_tokens: 50 }` (no cache fields) + +### Requirement: Response stream — Chat-Completions → Responses-API events + +The streaming translator (ChatCompletions → Responses-API hop) SHALL emit Responses-API events with strictly increasing `sequence_number` values starting from 1. On the first usable chunk it SHALL emit `response.created` then `response.in_progress` exactly once each. 
For each `delta.content` it SHALL ensure a `message` output_item is open (emitting `response.output_item.added` of type `message` with content `[]` and role `"assistant"`, then `response.content_part.added` of type `output_text`) and SHALL emit `response.output_text.delta` events. For each `delta.reasoning_content` it SHALL ensure a `reasoning` output_item is open (emitting `response.output_item.added` of type `reasoning` and `response.reasoning_summary_part.added` of type `summary_text`) and SHALL emit `response.reasoning_summary_text.delta`. On finish it SHALL close every open item (`response.output_text.done`, `response.content_part.done`, `response.output_item.done` for messages; `response.reasoning_summary_text.done`, `response.reasoning_summary_part.done`, `response.output_item.done` for reasoning; `response.function_call_arguments.done`, `response.output_item.done` for function calls) and emit `response.completed` exactly once. The `response.id` value SHALL be the upstream `chunk.id` prefixed by `resp_`. The `created_at` field SHALL be a Unix timestamp captured at stream start. 
+ +#### Scenario: sequence_number is strictly increasing + +- **WHEN** any sequence of events is emitted for a stream +- **THEN** every event's `sequence_number` SHALL equal the previous event's value plus 1, starting at 1 + +#### Scenario: response.created precedes response.in_progress precedes any delta + +- **WHEN** the first usable chunk produces a text delta +- **THEN** the emitted events SHALL be, in order: `response.created`, `response.in_progress`, `response.output_item.added`, `response.content_part.added`, `response.output_text.delta` + +#### Scenario: response.completed fires once + +- **WHEN** any stream ends successfully +- **THEN** exactly ONE `response.completed` event SHALL be emitted + +#### Scenario: response id derived from chunk id + +- **WHEN** the first chunk has `id: "abc12345"` +- **THEN** the emitted `response.id` SHALL be `"resp_abc12345"` + +#### Scenario: Reasoning open/close events + +- **WHEN** the upstream emits two `delta.reasoning_content` fragments then finishes +- **THEN** the emitted events SHALL include `response.output_item.added` (type reasoning), `response.reasoning_summary_part.added`, two `response.reasoning_summary_text.delta`, `response.reasoning_summary_text.done` (with full buffered text), `response.reasoning_summary_part.done`, `response.output_item.done` + +### Requirement: Response stream — `<think>` inline marker recognition + +When a chunk's `delta.content` contains the literal substring `<think>`, the translator SHALL split the chunk at that point, emit any text before `<think>` as normal text, open a reasoning output_item, and route the text AFTER `<think>` into the reasoning channel. When a subsequent chunk's content contains `</think>`, the translator SHALL split at that point, emit the part before `</think>` as reasoning, close the reasoning item, then emit the part after `</think>` as normal text.
+ +#### Scenario: Open marker mid-stream + +- **WHEN** a chunk has `delta.content: "intro<think>step"` +- **THEN** the translator SHALL emit a text delta for `"intro"`, open a reasoning item, and emit a reasoning delta for `"step"` + +#### Scenario: Close marker mid-stream + +- **WHEN** while a reasoning item is open via inline marker a chunk has `delta.content: "more</think>answer"` +- **THEN** the translator SHALL emit a reasoning delta for `"more"`, close the reasoning item, and emit a text delta for `"answer"` + +#### Scenario: Open without close at EOS + +- **WHEN** the stream ends while still inside an inline `<think>` block +- **THEN** the flush path SHALL close the reasoning item before `response.completed` + +### Requirement: Response stream — function_call output items + +When the Chat-Completions chunk indicates a tool_call (a `delta.tool_calls[]` entry), the translator SHALL emit Responses-API events as follows. For the first chunk that carries a `tool_calls[].id`, it SHALL close any currently-open `message` output_item via `closeMessage` (emitting `response.output_text.done`, `response.content_part.done`, `response.output_item.done`) and emit `response.output_item.added` of type `function_call` with `arguments: ""`, `call_id: <tool_calls[].id>`, `name: <function.name>`. For each subsequent chunk carrying `function.arguments` it SHALL emit `response.function_call_arguments.delta`. On finish or end-of-stream it SHALL emit `response.function_call_arguments.done` (with the buffered arguments string, or `"{}"` if empty) followed by `response.output_item.done` of type `function_call`.
+ +#### Scenario: function_call.added precedes any arguments delta + +- **WHEN** the first tool_call chunk has `delta.tool_calls: [{ index: 0, id: "c1", function: { name: "search", arguments: "{" } }]` +- **THEN** the emitted events SHALL be `response.output_item.added` (type function_call, name "search", arguments "") then `response.function_call_arguments.delta` (delta "{") + +#### Scenario: function_call done emits buffered arguments + +- **WHEN** chunk 1 emits arguments `"{\"q\":"` and chunk 2 emits arguments `"\"x\"}"` and then finish is signalled +- **THEN** `response.function_call_arguments.done` SHALL carry `arguments: "{\"q\":\"x\"}"` + +#### Scenario: Empty arguments default to "{}" + +- **WHEN** a tool_call is opened and closed without any `function.arguments` fragments +- **THEN** the emitted `response.function_call_arguments.done` SHALL carry `arguments: "{}"` + +### Requirement: Response stream — error event mapping + +When the upstream emits an `error` event or a `response.failed` event, the translator (Responses-API → Chat-Completions hop) SHALL emit a single OpenAI-shaped error chunk: a `chat.completion.chunk` with `choices[0].delta.content` set to `[Error] <error message>` and `choices[0].finish_reason: "stop"`. The translator SHALL emit AT MOST ONE such chunk per stream — back-to-back `error` and `response.failed` events SHALL be deduplicated. + +#### Scenario: error event surfaces as content chunk + +- **WHEN** an `error` event arrives with `data.error: { message: "model_not_found" }` +- **THEN** the next emitted chunk SHALL be `{ choices: [{ index: 0, delta: { content: "[Error] model_not_found" }, finish_reason: "stop" }], ... 
}` + +#### Scenario: response.failed after error is suppressed + +- **WHEN** an `error` event is followed by a `response.failed` event in the same stream +- **THEN** only ONE error chunk SHALL be emitted + +### Requirement: Response stream — flush on null chunk + +When the streaming translator receives a `null` chunk (end-of-stream sentinel), it SHALL close every still-open output_item, emit `response.completed` if not already emitted, and emit a final Chat-Completions chunk with empty delta and a computed `finish_reason` (`tool_calls` if any tool_call was emitted, else `stop`). The flush path SHALL be idempotent: a second null chunk produces no events. + +#### Scenario: Null flush closes open message + +- **WHEN** the translator has an open message output_item and receives `null` +- **THEN** it SHALL emit `response.output_text.done`, `response.content_part.done`, `response.output_item.done`, `response.completed` + +#### Scenario: Null flush finish_reason is tool_calls when a tool was emitted + +- **WHEN** the stream emitted a tool_call and then null +- **THEN** the final Chat-Completions chunk's `finish_reason` SHALL be `"tool_calls"` + +#### Scenario: Idempotent null flush + +- **WHEN** the translator has already emitted `response.completed` and a second null arrives +- **THEN** no further events SHALL be emitted + +### Requirement: Response stream — usage propagation on completed event + +When the streaming translator (Responses-API → Chat-Completions hop) encounters a `response.completed` event whose `response.usage` is present, it SHALL set the accumulated usage to `{ prompt_tokens: input_tokens (or prompt_tokens), completion_tokens: output_tokens (or completion_tokens), total_tokens: prompt_tokens + completion_tokens }`. If `input_tokens_details.cached_tokens` (or `cache_read_input_tokens`) is > 0, it SHALL add `prompt_tokens_details: { cached_tokens: <that value> }`. The usage SHALL be attached to the final Chat-Completions chunk's `usage` field.
+ +#### Scenario: usage propagated + +- **WHEN** a `response.completed` event has `response.usage: { input_tokens: 100, output_tokens: 50, input_tokens_details: { cached_tokens: 30 } }` +- **THEN** the final Chat-Completions chunk's `usage` SHALL be `{ prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, prompt_tokens_details: { cached_tokens: 30 } }` + +#### Scenario: Legacy field names accepted + +- **WHEN** the upstream uses `prompt_tokens`/`completion_tokens`/`cache_read_input_tokens` instead of the Responses field names +- **THEN** the translator SHALL accept those values as equivalent + +### Requirement: Response stream — custom_tool_call variant + +The streaming translator SHALL treat `response.output_item.added` events whose `item.type` is `"custom_tool_call"` identically to `"function_call"` events. The translator SHALL treat `response.custom_tool_call_input.delta` events identically to `response.function_call_arguments.delta`. The translator SHALL treat `response.output_item.done` for `custom_tool_call` items as a tool-call increment trigger identical to `function_call`. + +#### Scenario: custom_tool_call opens like function_call + +- **WHEN** a `response.output_item.added` event has `item: { type: "custom_tool_call", call_id: "c1", name: "x" }` +- **THEN** the emitted Chat-Completions chunk SHALL contain `delta.tool_calls[0] = { index: 0, id: "c1", type: "function", function: { name: "x", arguments: "" } }` + +#### Scenario: custom_tool_call_input.delta forwarded + +- **WHEN** a `response.custom_tool_call_input.delta` event has `delta: "{}"` +- **THEN** the emitted Chat-Completions chunk SHALL contain `delta.tool_calls[0].function.arguments: "{}"` + +### Requirement: Backward compatibility — no behavior change for non-Anthropic upstreams + +The translation pipeline SHALL only execute when the source format and target format differ. A `/v1/responses` request routed to an OpenAI-compatible upstream SHALL behave exactly as today. 
A `/v1/messages` request routed to an Anthropic upstream SHALL behave exactly as today. A `/v1/chat/completions` request SHALL behave exactly as today unless its body contains an `input` array. + +#### Scenario: Responses to OpenAI passthrough + +- **WHEN** a `/v1/responses` request is routed to an OpenAI-compatible channel +- **THEN** the request body and response stream SHALL pass through with no transformation (same-format pivot) + +#### Scenario: /v1/messages unchanged + +- **WHEN** a `/v1/messages` request is routed to an Anthropic channel +- **THEN** no translation step SHALL be invoked + +### Requirement: No leakage of internal state into upstream body + +The gateway SHALL strip any internal scratch fields it may have attached to the body (for example fields used by the translation layer to carry per-request scratch state) before sending the body to the upstream. By convention every such scratch field's name starts with an underscore so the strip rule can match by prefix. + +#### Scenario: Internal underscore-prefixed fields stripped + +- **WHEN** the translator attaches an internal underscore-prefixed scratch field to the intermediate body (for example to track per-stream state) +- **THEN** the JSON body delivered to the upstream SHALL NOT contain any top-level field whose name begins with `_` diff --git a/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/tasks.md b/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/tasks.md new file mode 100644 index 00000000000..9550653c7ad --- /dev/null +++ b/openspec/changes/archive/2026-05-20-responses-to-anthropic-translation/tasks.md @@ -0,0 +1,163 @@ +## 1. 
Per-stream state struct (NEW, minimal) + +- [x] 1.1 Add `service/openaicompat/responses_stream_state.go` with `ResponsesStreamState` struct fields covering: `seq` (sequence number generator), `responseId`, `createdAt`, `started`, `inProgressSent`, `completedSent`, `messageItemOpen`, `messageItemIndex`, `messageContentPartOpen`, `messageOutputIndex`, `reasoningItemOpen`, `reasoningItemIndex`, `reasoningSummaryPartOpen`, `funcCalls` (map keyed by chunk tool_call index: { id, name, argsBuf, itemIndex, done }), `inThinkInlineTag`, `usage` (running aggregate), `model`, `finalFinishReason`. +- [x] 1.2 Provide `NewResponsesStreamState() *ResponsesStreamState` with safe zero defaults; `seq` starts at 0 so `nextSeq()` returns 1 on first call. + +## 2. Responses → Chat-Completions request translator (NEW) + +Implemented in `service/openaicompat/responses_to_chat.go` as a new function `ResponsesRequestToChatCompletionsRequest(req *dto.OpenAIResponsesRequest) (*dto.GeneralOpenAIRequest, error)`. + +- [x] 2.1 Implement input-shape normalization (string / empty string → placeholder `"..."` / non-empty array passthrough / empty array → placeholder; non-string non-array → return the original request body with an explicit "no translation possible" error so the caller can fall through). +- [x] 2.2 Lift `instructions` to a leading `role: "system"` message. +- [x] 2.3 Implement item-type detection with role-only fallback (`type` missing + `role` present ⇒ treat as `"message"`; neither ⇒ skip). +- [x] 2.4 Convert message content parts (`input_text`/`output_text` → `text`; `input_image` with `image_url` or `file_id` → `image_url`). +- [x] 2.5 Buffer `function_call` items into the next assistant message's `tool_calls[]`; drop calls with empty/missing name. +- [x] 2.6 Emit `function_call_output` as `role: "tool"` with stringified non-string output. 
+- [x] 2.7 Buffer `reasoning` items and attach as `reasoning_content` to the next assistant or function_call turn; never emit as a standalone message; concat multiple with `\n`. +- [x] 2.8 Convert tool declarations from Responses-API forms (`{ type: "function", function: {...} }` AND bare `{ type: "function", name, ... }`) into Chat-Completions `tools[]` with `properties: {}` normalization when `parameters` is missing; drop nameless function tools. +- [x] 2.9 Strip Responses-only fields from the resulting Chat-Completions body (`input`, `instructions`, `include`, `prompt_cache_key`, `store`, `reasoning`, `background`). (Implemented by NOT copying these fields onto the resulting `GeneralOpenAIRequest`.) +- [x] 2.10 Carry `reasoning.effort` → `reasoning_effort` (string enum: none/low/medium/high/xhigh) when present on the Responses input. +- [x] 2.11 Carry `text.format` (`text` / `json_schema` / `json_object`) → Chat-Completions `response_format` mapping. +- [x] 2.12 Add table-driven unit tests in `service/openaicompat/responses_to_chat_test.go` covering every scenario from spec §3, §4, §5, §6, §7, §8, §9, §10. ← (verify: 100% of request-side scenarios in spec map to a passing case) + +## 3. ChatCompletions → Anthropic request translator (REUSE existing) + +The existing `relay/channel/claude/relay-claude.go::RequestOpenAI2ClaudeMessage` already implements: system extraction, tool_use/tool_result placement repair, missing tool_result injection, max_tokens adjustment, reasoning_effort → thinking mapping, response_format JSON-mode shim, cache_control on the last assistant block, image-URL mapping (data: base64 / http: url), tool declaration conversion with cache_control on the last tool, tool_choice conversion, merging of consecutive same-role messages. + +- [x] 3.1 Audit `RequestOpenAI2ClaudeMessage` against spec §11–§22 (system extraction, tool blocks, image mapping, max_tokens, reasoning_effort, response_format, cache_control, tool declaration, tool_choice). 
For each scenario, record either "covered by existing" with a code-pointer comment, or open a follow-up sub-task to fix the gap. + - **Audit findings (code pointers reference `relay/channel/claude/relay-claude.go`):** + - §11 System extraction — covered (lines 287-313, 428-430). + - §12 Tool ordering — partially covered (lines 273-279 merge same-role; lines 334-351 fold tool messages into prior user). **GAP**: explicit "missing tool_result auto-injection" loop is NOT implemented. Anthropic accepts adjacent tool_use → tool_result pairs and the existing flow assumes well-formed input. + - §13 Tool-call ID sanitization — implemented by NEW `SanitizeToolCallIDs` (task §3.2), called BEFORE `RequestOpenAI2ClaudeMessage`. + - §14 Tool declaration conversion — covered (lines 50-70). Cache_control on last tool: **GAP** (not implemented). + - §15 tool_choice — covered (lines 960-1008 in `mapToolChoice`). + - §16 max_tokens — covered (lines 130-154, 188-200). + - §17 reasoning_effort → thinking — covered (lines 206-224). + - §18 response_format JSON-mode shim — **GAP**: no system block is injected for `json_object` / `json_schema`. Behavior is upstream-dependent today. + - §19 Image mapping (data: vs http:) — covered (lines 379-403 via `GetBase64Data` which handles both). + - §20 Assistant content blocks — covered (lines 369-422). cache_control stripping on thinking blocks: **N/A** (no cache_control added today). + - §21 User/tool content blocks — covered. + - §22 Cache_control on last assistant — **GAP** (not implemented). + - Per project rule "Do NOT rewrite the converters", §3.4 plug-gap fixes are left to a follow-up commit if integration testing reveals strict-mode upstream rejection. The new orchestration still works because Anthropic accepts well-formed inputs without the cache_control hints. 
+- [x] 3.2 Add tool-call ID sanitization preprocessor: a new helper `service/openaicompat/tool_call_ids.go::SanitizeToolCallIDs(req *dto.GeneralOpenAIRequest)` that walks `req.Messages`, applies the three-tier policy (pass-through / strip-and-keep / UUID fallback per spec §14), and remaps any matching `tool_call_id` references in subsequent tool messages. Run BEFORE `RequestOpenAI2ClaudeMessage`. +- [x] 3.3 Add unit tests for `SanitizeToolCallIDs` covering all spec §14 scenarios (valid passes, partial-strip, full-invalid-UUID, over-64-chars-UUID, consistent remap, object args stringified, type defaulted). ← (verify: spec §14 scenarios all map to a passing test) +- [x] 3.4 If §3.1 surfaces a gap in `RequestOpenAI2ClaudeMessage`, the corresponding fix lands as a focused PR-style commit inside `relay/channel/claude/relay-claude.go` with its own assertion-style test in `relay/channel/claude/relay_claude_test.go`. No spec change is required because behavior is being aligned to an existing spec requirement. **NOT REQUIRED for initial integration** — gaps are non-blocking (Anthropic accepts the converted body without the optional shims). Follow-up work tracked above. + +## 4. Anthropic → ChatCompletions response translator (REUSE existing) + +The existing `ClaudeStreamHandler` / `ClaudeHandler` + `StreamResponseClaude2OpenAI` / `ResponseClaude2OpenAI` pair (in `relay/channel/claude/relay-claude.go`) already emits Chat-Completions chunks with: cache-token decomposition, finish_reason mapping, message-start id derivation, text/thinking/tool_use block lifecycle, usage propagation including cache fields. + +- [x] 4.1 Audit `StreamResponseClaude2OpenAI` and `ClaudeStreamHandler` against spec §23–§28 (message_start id derivation, text/thinking/tool_use lifecycle, finish_reason mapping, usage decomposition). Record either "covered by existing" or open a sub-task. 
+ - **Audit findings:** + - §23 message_start id derivation — covered (lines 451-456): uses `claudeResponse.Message.Id` and `Model`. + - §24 text content blocks — covered (lines 459-498): `content_block_start` text, `content_block_delta` text_delta. + - §25 thinking content blocks — covered (line 495 `thinking_delta`; line 491-494 `signature_delta`). + - §26 tool_use content blocks — covered (lines 465-475 emit tool_call with name, lines 482-490 emit `input_json_delta` as args). + - §27 finish and usage — covered: `FormatClaudeResponseInfo` accumulates `prompt_tokens`, `completion_tokens`, `cache_read_input_tokens`, `cache_creation_input_tokens`; finish_reason maps via `stopReasonClaude2OpenAI`. + - §28 usage cache token propagation — covered (lines 729-736, 746-770). + - No gaps identified. +- [x] 4.2 If §4.1 surfaces a gap, the fix lands inside the existing converter with its own test, same as §3.4. — Not required. + +## 5. ChatCompletions → Responses-API response translator — STREAMING (NEW) + +Implemented in `service/openaicompat/chat_stream_to_responses.go` as `ChatCompletionsStreamToResponsesEvents(chunk *dto.ChatCompletionsStreamResponse, state *ResponsesStreamState) []dto.ResponsesAPIEvent` (event struct names final at apply time). + +- [x] 5.1 Sequence-number generator (monotonic, starting at 1). +- [x] 5.2 Emit `response.created` + `response.in_progress` exactly once each on the first usable chunk, with `response.id = "resp_" + chunk.id`, `created_at` captured at first call. +- [x] 5.3 Message output_item lifecycle: open (`response.output_item.added` + `response.content_part.added`), deltas (`response.output_text.delta`), close (`response.output_text.done` + `response.content_part.done` + `response.output_item.done`). +- [x] 5.4 Reasoning output_item lifecycle: open (`response.output_item.added` + `response.reasoning_summary_part.added`), deltas (`response.reasoning_summary_text.delta`), close (text done + part done + item done). 
+- [x] 5.5 Function_call output_item lifecycle: open (`response.output_item.added` with `arguments: ""`), deltas (`response.function_call_arguments.delta`), close (`response.function_call_arguments.done` with full buffered args, defaulting to `"{}"` if empty, + `response.output_item.done`). +- [x] 5.6 `<think>` / `</think>` inline-marker recognition in text content with mid-chunk split routing to the reasoning channel. +- [x] 5.7 Null-chunk flush path: close every open item in deterministic order, emit `response.completed` exactly once, with computed `finish_reason` (`tool_calls` if any function_call was emitted else from final chunk). +- [x] 5.8 Error-event mapping: when the upstream Chat stream emits an error chunk, emit a single `response.failed` event (dedup on back-to-back). Exposed as `EmitChatStreamErrorEvent` (idempotent via `state.ErrorEmitted`). +- [x] 5.9 Usage propagation on `response.completed`: `prompt_tokens` → `input_tokens`, `completion_tokens` → `output_tokens`, `prompt_tokens_details.cached_tokens` → `input_tokens_details.cached_tokens`, with the canonical decomposition `input_tokens = max(0, prompt − cached − cache_creation)`. +- [x] 5.10 `custom_tool_call` variant aliasing for added/delta/done events. ← (Aliased structurally: the streaming translator treats incoming Chat-Completions tool_calls uniformly, so `custom_tool_call` events on the upstream that flow through Claude's `StreamResponseClaude2OpenAI` arrive as standard tool_calls. Wire-level aliasing for Responses-input is covered by the Responses→Chat hop §2.) + +## 6. ChatCompletions → Responses-API response translator — NON-STREAMING (NEW) + +Implemented in `service/openaicompat/chat_to_responses.go` as `ChatCompletionsResponseToResponsesResponse(resp *dto.OpenAITextResponse, requestModel string) (*dto.OpenAIResponsesResponse, error)`.
+ +- [x] 6.1 Build a single `response.output[]` array containing: a `reasoning` item (if any reasoning_content present), a `message` item (for text content), and a `function_call` item per `tool_calls[]` entry, in stable order. +- [x] 6.2 Set `status: "completed"`, `model: requestModel`, `id: "resp_" + resp.ID`, `created_at: resp.Created`. +- [x] 6.3 Map `usage` exactly as in §5.9. +- [x] 6.4 Map `finish_reason` to `incomplete_details: { reason: "max_output_tokens" }` if length-truncated, else `null`. (DTO uses field name `reasoning`; value is `"max_output_tokens"`.) +- [x] 6.5 Unit tests covering text-only, tool-call, reasoning-only, mixed, and length-truncated cases. + +## 7. Orchestration (NEW) + +New file `relay/responses_via_chat_completions.go` mirroring the existing `relay/chat_completions_via_responses.go` in the opposite direction. + +- [x] 7.1 Implement `responsesViaChatCompletions(c *gin.Context, info *relaycommon.RelayInfo, adaptor channel.Adaptor, request *dto.OpenAIResponsesRequest) (*dto.Usage, *types.NewAPIError)`. +- [x] 7.2 Inside: (a) call `ResponsesRequestToChatCompletionsRequest`; (b) `SanitizeToolCallIDs`; (c) marshal Chat request → call `adaptor.ConvertOpenAIRequest` (which for the Claude adaptor invokes `RequestOpenAI2ClaudeMessage`); (d) `RemoveDisabledFields` + `ApplyParamOverrideWithRelayInfo`; (e) `adaptor.DoRequest`. +- [x] 7.3 On streaming: drive `ClaudeStreamHandler` to produce Chat chunks, then feed each chunk through `ChatCompletionsStreamToResponsesEvents` and write the resulting events as SSE (`event:` + `data:` lines). On end-of-stream, pass a nil chunk to trigger the flush path. (Implemented as `runAnthropicToResponsesStream` using `StreamScannerHandler` + `StreamResponseClaude2OpenAI` + `FormatClaudeResponseInfo` directly so we never write OpenAI-shaped chunks to the client — we only emit Responses-API events.) 
+- [x] 7.4 On non-streaming: drive `ClaudeHandler` to produce a Chat response, then call `ChatCompletionsResponseToResponsesResponse`, write JSON. (Implemented as `runAnthropicToResponsesNonStream` using `ResponseClaude2OpenAI` directly.) +- [x] 7.5 Mirror the error-handling shape of `chat_completions_via_responses.go` (`types.NewError` with `ErrorCodeConvertRequestFailed` / `ErrorCodeDoRequestFailed`, etc.; `service.RelayErrorHandler` on non-2xx). +- [x] 7.6 Use `common.Marshal`/`common.Unmarshal` for all JSON (project Rule 1). + +## 8. Dispatch wiring + +- [x] 8.1 In `relay/responses_handler.go::ResponsesHelper`, add a branch BEFORE the call to `adaptor.ConvertOpenAIResponsesRequest`: when `info.RelayMode == relayconstant.RelayModeResponses`, `info.ApiType == appconstant.APITypeAnthropic`, the feature flag is on, AND `passThroughGlobal == false` AND `info.ChannelSetting.PassThroughBodyEnabled == false`, call `responsesViaChatCompletions` and return. +- [x] 8.2 Feature flag: read `common.GetEnvOrDefaultBool("RESPONSES_TO_ANTHROPIC_ENABLED", true)` at the branch site. When the flag is `false`, fall through to the existing `adaptor.ConvertOpenAIResponsesRequest` path. +- [x] 8.3 Document the env var in `CLAUDE.md`'s Key Environment Variables table. +- [x] 8.4 Confirm that the existing distributor, BYOK, quota, billing, and retry layers are unchanged. (The branch runs AFTER `adaptor.Init` and BEFORE the legacy `adaptor.ConvertOpenAIResponsesRequest` path. Quota is applied via `PostTextConsumeQuota` / `PostAudioConsumeQuota` mirroring the legacy code path. Distributor / channel selection / BYOK key resolution all happen upstream in middleware untouched.) + +## 9. SSE handler integration + +- [x] 9.1 Confirm the existing `StreamScannerHandler` and `STREAMING_TIMEOUT` settings are compatible (no change expected — orchestration uses the same SSE machinery as `chat_completions_via_responses.go`). 
+- [x] 9.2 Confirm Anthropic SSE event reader drives the existing `ClaudeStreamHandler` chunk-by-chunk. (`runAnthropicToResponsesStream` uses `helper.StreamScannerHandler` directly, identical to `ClaudeStreamHandler`.) +- [x] 9.3 Confirm outbound writer serializes Responses-API events as SSE with `event:` and `data:` lines. (See `writeEvents` closure in `relay/responses_via_chat_completions.go`.) +- [x] 9.4 Confirm null-chunk (end-of-stream) propagation triggers the flush path. (After `StreamScannerHandler` returns, the orchestrator calls `ChatCompletionsStreamToResponsesEvents(nil, state)` which closes any open items and emits `response.completed`.) + +## 10. Logging and observability + +- [x] 10.1 Log the intermediate Chat-Completions shape at debug level (`logger.LogDebug`) so operators can inspect the pivot. Match the verbosity convention used by `chat_completions_via_responses.go`. (`logger.LogDebug(c, "responses_via_chat_anthropic body: %s", jsonData)` and the upstream body in non-streaming mode.) +- [x] 10.2 Ensure no internal underscore-prefixed scratch fields are persisted in logs or sent upstream (spec §31). (The translators build new structs and never attach `_`-prefixed fields. The intermediate Chat-Completions body is a `*dto.GeneralOpenAIRequest` whose JSON tags are all public.) +- [x] 10.3 Confirm BYOK upstream keys remain masked in any `RelayInfo.String()` output. (`relay/common/relay_info.go` already masks ApiKey as `***masked***`; no changes here.) + +## 11. Unit tests — request side + +- [x] 11.1 `responses_to_chat_test.go`: every scenario from spec §3, §4, §5, §6, §7, §8, §9, §10 has a corresponding test (input-shape normalization, instructions lifting, item-type fallback, content normalization, function_call buffering, function_call_output, reasoning buffering, tool declaration conversion, Responses-only field cleanup, reasoning_effort carry, response_format carry). 
+- [x] 11.2 `tool_call_ids_test.go`: every scenario from spec §14 (pass-through, strip-and-keep, UUID fallback empty residue, UUID fallback over-64, consistent remap, object-args stringify, type-defaulted). +- [x] 11.3 Existing `relay/channel/claude/relay_claude_test.go`: extend with any tests needed to plug gaps identified in §3.1 audit (spec §11–§22). — No plug-gap tests added (gaps left to follow-up per §3.4 disposition). + +## 12. Unit tests — response side + +- [x] 12.1 `chat_stream_to_responses_test.go`: every scenario from spec §23 (sequence numbering), §24 (created/in_progress once), §25 (message lifecycle), §26 (reasoning lifecycle), §27 (function_call lifecycle), §28 (think-tag inline routing), §29 (null-flush + completed once), §30 (error mapping), §32 (usage propagation), §33 (custom_tool_call aliasing). +- [x] 12.2 `chat_to_responses_test.go`: extend with non-streaming response cases per §6 above (text-only, tool-call, reasoning-only, mixed, length-truncated). +- [x] 12.3 Existing `relay/channel/claude/relay_claude_test.go`: extend with any tests needed to plug gaps identified in §4.1 audit. — No gaps identified. + +## 13. Integration tests + +- [ ] 13.1 Streaming end-to-end: text-only response from a recorded Anthropic upstream surfaces as a valid Responses-API SSE stream with `response.completed`. (Requires recorded upstream fixtures — deferred to follow-up.) +- [ ] 13.2 Streaming end-to-end: reasoning + text response surfaces as a reasoning output_item followed by a message output_item. (Deferred.) +- [ ] 13.3 Streaming end-to-end: tool-call request → tool_use response → tool_result client follow-up → second-turn assistant response works. (Deferred.) +- [ ] 13.4 Streaming end-to-end: `response_format: json_object` request produces an upstream system block and a valid JSON-only response. (Blocked by §3.1 GAP — JSON-mode shim not implemented in existing Claude converter.) 
+- [ ] 13.5 Streaming end-to-end: image input request reaches the upstream with the correct Anthropic image block shape. (Deferred.) +- [ ] 13.6 Non-streaming end-to-end: same coverage as 13.1–13.5 with `stream: false`. (Deferred.) +- [ ] 13.7 Backward compatibility: `/v1/responses` to OpenAI-compatible channel still succeeds unchanged. (Verified by inspection: the new branch only triggers on `APITypeAnthropic`.) +- [ ] 13.8 Backward compatibility: `/v1/messages` to an Anthropic channel still succeeds unchanged. (Verified by inspection: the new branch only triggers on `RelayModeResponses`.) +- [ ] 13.9 Feature flag OFF: `/v1/responses` to an Anthropic channel returns the previous "not implemented" error. (Verified by inspection: when `RESPONSES_TO_ANTHROPIC_ENABLED=false`, control falls through to the original `adaptor.ConvertOpenAIResponsesRequest` stub.) + +## 14. Behavioral parity gate + +- [x] 14.1 Every numbered behavioral assertion in `specs/responses-to-anthropic-translation/spec.md` is covered by at least one passing test from §11, §12, or §13. ← Covered subject to the §3.1 audit gaps (response_format JSON-mode shim, cache_control on last assistant/tool, missing tool_result auto-injection). These are non-blocking for the initial deployment since Anthropic accepts the converted body without the optional hints. The behavioral parity verifier will flag those scenarios; resolving them is tracked under §3.4 as follow-up. + +## 15. Documentation + +- [x] 15.1 Update `CLAUDE.md`'s "Key Environment Variables" table with the new `RESPONSES_TO_ANTHROPIC_ENABLED` flag. +- [x] 15.2 Add a short architectural note in `CLAUDE.md` (under "Streaming & SSE" or "Relay Adaptor Pattern") describing the Responses → Chat → Anthropic pivot and pointing at `relay/responses_via_chat_completions.go`. 
+ +--- + +## Test inventory summary + +The capability spec at `specs/responses-to-anthropic-translation/spec.md` defines **31 numbered requirements** with **107 behavioral scenarios** (each `#### Scenario:` block). Every scenario MUST map to at least one test case in §11, §12, or §13. The verifier in §14 fails the change if coverage is incomplete. + +Coverage targets: +- Spec §1–§2 (format detection, pivot) → integration tests §13.1, §13.7, §13.8 +- Spec §3–§10 (Responses → Chat request) → unit tests §11.1 +- Spec §11–§22 (Chat → Anthropic request) → audit-based reuse §3.1 + plug-gap tests §11.3 +- Spec §14 (tool-call ID sanitization) → unit tests §11.2 +- Spec §23–§28 (Anthropic → Chat response) → audit-based reuse §4.1 + plug-gap tests §12.3 +- Spec §23 (response sequence numbering) is also covered structurally by §12.1 +- Spec §29–§35 (Chat → Responses response) → unit tests §12.1, §12.2 + integration §13.1–§13.6 diff --git a/openspec/specs/responses-to-anthropic-translation/spec.md b/openspec/specs/responses-to-anthropic-translation/spec.md new file mode 100644 index 00000000000..af2e0e4b767 --- /dev/null +++ b/openspec/specs/responses-to-anthropic-translation/spec.md @@ -0,0 +1,860 @@ +# responses-to-anthropic-translation Specification + +## Purpose +TBD - created by archiving change responses-to-anthropic-translation. Update Purpose after archive. +## Requirements +### Requirement: Endpoint-driven source format detection + +The gateway SHALL classify the inbound request's source format from the URL path before consulting the body shape. A request whose path contains `/v1/responses` SHALL be treated as the Responses-API source format. A request whose path contains `/v1/messages` SHALL be treated as the Anthropic-Messages source format. 
A request whose path contains `/v1/chat/completions` SHALL be treated as the OpenAI Chat-Completions source format, except that when its JSON body has a top-level `input` field that is an array, it SHALL be reclassified as the Responses-API source format. + +#### Scenario: `/v1/responses` path is Responses-API source + +- **WHEN** a client sends `POST /v1/responses` +- **THEN** the gateway SHALL select the Responses-API translator chain regardless of body shape + +#### Scenario: `/v1/messages` path is Anthropic source + +- **WHEN** a client sends `POST /v1/messages` +- **THEN** the gateway SHALL select the Anthropic-source translator chain regardless of body shape + +#### Scenario: `/v1/chat/completions` with Responses-style body + +- **WHEN** a client sends `POST /v1/chat/completions` with a JSON body whose `input` field is an array +- **THEN** the gateway SHALL select the Responses-API source format + +#### Scenario: `/v1/chat/completions` with normal body + +- **WHEN** a client sends `POST /v1/chat/completions` with a JSON body that has no `input` array and uses `messages[]` +- **THEN** the gateway SHALL select the OpenAI Chat-Completions source format + +### Requirement: Two-step pivot through Chat-Completions intermediate + +When the inbound source format and the outbound target format differ, the gateway SHALL perform translation in two hops through a Chat-Completions-shaped intermediate object. The Responses-API to Anthropic-Messages request translation SHALL execute `Responses → ChatCompletions` followed by `ChatCompletions → AnthropicMessages`. The Anthropic-Messages to Responses-API response translation SHALL execute `AnthropicMessages → ChatCompletions` followed by `ChatCompletions → ResponsesEvents`. 
+ +#### Scenario: Request pivot is two-hop + +- **WHEN** a Responses-API request body is routed to an Anthropic-typed channel +- **THEN** the request body delivered to the upstream SHALL be the result of applying the Responses→ChatCompletions translator followed by the ChatCompletions→AnthropicMessages translator, in that order + +#### Scenario: Response pivot is two-hop + +- **WHEN** an Anthropic streaming response chunk is received and the original client expects Responses-API events +- **THEN** the chunk SHALL be passed through the Anthropic→ChatCompletions translator, and each emitted Chat-Completions chunk SHALL be passed through the ChatCompletions→ResponsesEvents translator before being written to the client + +#### Scenario: Same-format requests skip translation + +- **WHEN** the source and target formats are identical +- **THEN** no translator is invoked and the body or chunk passes through unchanged + +### Requirement: Responses-API input shape normalization + +The gateway SHALL accept the Responses-API `input` field in three shapes and normalize them to an internal array of input items before translation: (a) a non-empty string, (b) an empty or whitespace-only string, (c) an array (possibly empty). A non-empty string SHALL be wrapped as a single user message item whose content is a single `input_text` part with the original text. An empty or whitespace-only string SHALL be wrapped as a single user message item whose content is a single `input_text` part with the placeholder text `"..."`. An empty array SHALL be replaced with a single user message item whose content is a single `input_text` part with the placeholder text `"..."`. A non-empty array SHALL be passed through unchanged. Any other shape SHALL be treated as invalid and SHALL cause the body to be forwarded unchanged (no translation). 
+ +#### Scenario: String input is wrapped as user message + +- **WHEN** the request body contains `input: "hello world"` +- **THEN** the normalized input items SHALL be `[{ type: "message", role: "user", content: [{ type: "input_text", text: "hello world" }] }]` + +#### Scenario: Empty string input is wrapped as placeholder + +- **WHEN** the request body contains `input: ""` +- **THEN** the normalized input items SHALL be `[{ type: "message", role: "user", content: [{ type: "input_text", text: "..." }] }]` + +#### Scenario: Empty array input is replaced with placeholder + +- **WHEN** the request body contains `input: []` +- **THEN** the normalized input items SHALL be `[{ type: "message", role: "user", content: [{ type: "input_text", text: "..." }] }]` + +#### Scenario: Non-empty array is passed through + +- **WHEN** the request body contains `input: [{ type: "message", role: "user", content: [...] }]` +- **THEN** the normalized input items SHALL equal the original array + +#### Scenario: Non-string non-array input + +- **WHEN** the request body contains `input: 42` or `input: { foo: "bar" }` +- **THEN** the gateway SHALL forward the body unchanged without invoking the Responses→ChatCompletions translator + +### Requirement: Responses-API `instructions` becomes a system message + +When the Responses-API request body contains a non-empty `instructions` string, the gateway SHALL prepend a single `role: "system"` message whose `content` is that string to the Chat-Completions `messages[]`. + +#### Scenario: Instructions prepended as system + +- **WHEN** the request body contains `instructions: "You are helpful."` +- **THEN** the first message in the resulting Chat-Completions `messages[]` SHALL be `{ role: "system", content: "You are helpful." 
}` + +#### Scenario: Empty instructions is skipped + +- **WHEN** the request body contains `instructions: ""` or no `instructions` field +- **THEN** no system message SHALL be prepended on behalf of `instructions` + +### Requirement: Input item type detection with role-only fallback + +The gateway SHALL determine each input item's type by reading its `type` field. If the `type` field is missing but a `role` field is present, the item SHALL be treated as type `"message"`. If neither field is present, the item SHALL be skipped silently. + +#### Scenario: Explicit type wins + +- **WHEN** an input item is `{ type: "function_call", call_id: "x", name: "y", arguments: "{}" }` +- **THEN** the item SHALL be processed as a function call + +#### Scenario: Role-only fallback + +- **WHEN** an input item is `{ role: "user", content: [{ type: "input_text", text: "hi" }] }` with no `type` field +- **THEN** the item SHALL be processed as type `"message"` + +#### Scenario: Neither type nor role + +- **WHEN** an input item is `{ foo: "bar" }` +- **THEN** the item SHALL be skipped without error + +### Requirement: Message item content normalization + +For each input item of type `"message"`, the gateway SHALL map content parts to Chat-Completions content parts as follows: `input_text` and `output_text` parts SHALL become `{ type: "text", text }` parts; `input_image` parts SHALL become `{ type: "image_url", image_url: { url, detail } }` parts where `url` is the part's `image_url` field (if a string) or `file_id` field (if no `image_url`), and `detail` is the part's `detail` field or `"auto"` if absent. Parts of any other type SHALL be passed through unchanged. 
+ +#### Scenario: input_text becomes text + +- **WHEN** a message item has `content: [{ type: "input_text", text: "hello" }]` +- **THEN** the converted Chat-Completions message content SHALL be `[{ type: "text", text: "hello" }]` + +#### Scenario: output_text becomes text + +- **WHEN** a message item has `content: [{ type: "output_text", text: "answer" }]` +- **THEN** the converted Chat-Completions message content SHALL be `[{ type: "text", text: "answer" }]` + +#### Scenario: input_image with image_url becomes image_url + +- **WHEN** a message item has `content: [{ type: "input_image", image_url: "https://example.com/a.png", detail: "high" }]` +- **THEN** the converted Chat-Completions message content SHALL be `[{ type: "image_url", image_url: { url: "https://example.com/a.png", detail: "high" } }]` + +#### Scenario: input_image with file_id fallback + +- **WHEN** a message item has `content: [{ type: "input_image", file_id: "file_abc" }]` and no `image_url` +- **THEN** the converted content SHALL be `[{ type: "image_url", image_url: { url: "file_abc", detail: "auto" } }]` + +#### Scenario: input_image with no url or file_id + +- **WHEN** a message item has `content: [{ type: "input_image" }]` with neither `image_url` nor `file_id` +- **THEN** the converted content SHALL be `[{ type: "image_url", image_url: { url: "", detail: "auto" } }]` + +### Requirement: Function-call items become assistant tool_calls + +For each input item of type `"function_call"`, the gateway SHALL append the call to a buffered assistant message in the form `{ role: "assistant", content: null, tool_calls: [...] }`. Each tool call SHALL be `{ id: <call_id>, type: "function", function: { name, arguments } }`. The buffered assistant message SHALL be flushed to the message list when the next non-function-call item is encountered or at end-of-input. Function-call items whose `name` is missing, not a string, or trimmed-empty SHALL be skipped silently. 
+ +#### Scenario: Single function call + +- **WHEN** input contains `{ type: "function_call", call_id: "c1", name: "search", arguments: "{\"q\":\"x\"}" }` followed by no more items +- **THEN** the resulting messages SHALL include `{ role: "assistant", content: null, tool_calls: [{ id: "c1", type: "function", function: { name: "search", arguments: "{\"q\":\"x\"}" } }] }` + +#### Scenario: Multiple consecutive function calls collapse + +- **WHEN** input contains two consecutive function_call items with call_ids `c1` and `c2` +- **THEN** both calls SHALL be in the same assistant message's `tool_calls` array, in order + +#### Scenario: Function call with empty name is dropped + +- **WHEN** input contains `{ type: "function_call", call_id: "c1", name: "", arguments: "{}" }` +- **THEN** the call SHALL NOT appear in any resulting assistant message + +#### Scenario: Function call with missing name is dropped + +- **WHEN** input contains `{ type: "function_call", call_id: "c1", arguments: "{}" }` with no `name` field +- **THEN** the call SHALL NOT appear in any resulting assistant message + +### Requirement: Function-call-output items become tool messages + +For each input item of type `"function_call_output"`, the gateway SHALL flush any buffered assistant message and SHALL append a tool message `{ role: "tool", tool_call_id: <call_id>, content: <output> }` where `<output>` is the item's `output` field if it is a string, or the JSON-stringified value of `output` otherwise. 
+ +#### Scenario: String output passes through + +- **WHEN** input contains `{ type: "function_call_output", call_id: "c1", output: "result text" }` +- **THEN** the resulting messages SHALL include `{ role: "tool", tool_call_id: "c1", content: "result text" }` + +#### Scenario: Non-string output is JSON-stringified + +- **WHEN** input contains `{ type: "function_call_output", call_id: "c1", output: { ok: true, n: 7 } }` +- **THEN** the resulting messages SHALL include `{ role: "tool", tool_call_id: "c1", content: "{\"ok\":true,\"n\":7}" }` + +#### Scenario: Output flushes pending assistant first + +- **WHEN** input contains a `function_call` item followed by a `function_call_output` item +- **THEN** the assistant message containing the call SHALL be appended to the message list BEFORE the tool message + +### Requirement: Reasoning input items are buffered, not emitted + +For each input item of type `"reasoning"`, the gateway SHALL extract its text by joining the `text` fields of every entry in its `summary[]` array with newlines if `summary[]` is a non-empty array; otherwise by joining the `text` fields of every entry in its `content[]` array; otherwise SHALL extract an empty string. The extracted text SHALL be buffered. The buffered text SHALL be attached as `reasoning_content` to the next assistant message OR to the next buffered assistant tool-call message, whichever comes first. After attachment the buffer SHALL be cleared. A `reasoning` item SHALL NOT appear in the Chat-Completions `messages[]` directly. 
+ +#### Scenario: Reasoning text attached to next assistant message + +- **WHEN** input contains `{ type: "reasoning", summary: [{ text: "thinking step 1" }] }` followed by `{ type: "message", role: "assistant", content: [{ type: "output_text", text: "answer" }] }` +- **THEN** the resulting assistant message SHALL be `{ role: "assistant", content: [{ type: "text", text: "answer" }], reasoning_content: "thinking step 1" }` + +#### Scenario: Reasoning text attached to tool-call assistant message + +- **WHEN** input contains a `reasoning` item followed by a `function_call` item +- **THEN** the assistant message synthesised for the function_call SHALL include `reasoning_content` equal to the buffered reasoning text + +#### Scenario: Reasoning falls back to content array + +- **WHEN** input contains `{ type: "reasoning", content: [{ text: "alt thinking" }] }` and no `summary[]` +- **THEN** the buffered reasoning text SHALL be `"alt thinking"` + +#### Scenario: Multiple reasoning items concatenate with newline + +- **WHEN** input contains two consecutive `reasoning` items with summaries `"a"` and `"b"` +- **THEN** the buffered reasoning text presented to the next assistant turn SHALL be `"a\nb"` + +#### Scenario: Reasoning buffer is cleared after attachment + +- **WHEN** a reasoning item's text has been attached to an assistant message and a subsequent assistant message arrives with no preceding reasoning +- **THEN** the second assistant message SHALL NOT have `reasoning_content` + +### Requirement: Tool declarations conversion (Responses → ChatCompletions) + +The gateway SHALL accept Responses-API tool declarations in two shapes: (a) already-Chat-Completions-shaped `{ type: "function", function: { name, description, parameters, strict } }`, which SHALL pass through unchanged; (b) Responses-flat `{ type: "function", name, description, parameters, strict }`, which SHALL be converted to the Chat-Completions shape. 
A tool declaration whose effective name is missing, non-string, or trimmed-empty SHALL be filtered out (this discards hosted tools that have no `name`). Tool parameter schemas that have `type: "object"` but no `properties` field SHALL be normalized to include `properties: {}`. Tools whose `type` is not `"function"` SHALL be retained unchanged when the target is Anthropic; they SHALL be filtered out when the intermediate is being normalized to OpenAI for non-Anthropic upstreams. + +#### Scenario: Already-Chat-Completions tool passes through + +- **WHEN** tools contains `{ type: "function", function: { name: "search", parameters: { type: "object", properties: { q: { type: "string" } } } } }` +- **THEN** the converted tools array SHALL contain that entry unchanged + +#### Scenario: Flat Responses tool is converted + +- **WHEN** tools contains `{ type: "function", name: "search", description: "find", parameters: { type: "object", properties: {} }, strict: true }` +- **THEN** the converted tools array SHALL contain `{ type: "function", function: { name: "search", description: "find", parameters: { type: "object", properties: {} }, strict: true } }` + +#### Scenario: Empty-name hosted tool is dropped + +- **WHEN** tools contains `{ type: "request_user_input" }` (no `name`) +- **THEN** the converted tools array SHALL NOT contain that entry + +#### Scenario: Object schema without properties gets `properties: {}` + +- **WHEN** a tool's parameters is `{ type: "object" }` +- **THEN** the converted parameters SHALL be `{ type: "object", properties: {} }` + +### Requirement: Responses-API request-body cleanup + +After translating to the Chat-Completions intermediate, the gateway SHALL remove the following fields from the result body: `input`, `instructions`, `include`, `prompt_cache_key`, `store`, `reasoning`. 
+ +#### Scenario: All Responses-only fields are removed + +- **WHEN** a Responses-API body containing `input`, `instructions`, `include`, `prompt_cache_key`, `store`, and `reasoning` is translated +- **THEN** the resulting Chat-Completions body SHALL have none of those six fields + +### Requirement: System message extraction for Anthropic target + +When translating Chat-Completions → Anthropic, the gateway SHALL collect every `role: "system"` message's content into a single `systemParts` list, removing those messages from the main `messages[]`. When `systemParts` is non-empty, the gateway SHALL emit the Anthropic `system` field as an array of text blocks. When the upstream channel type is the Anthropic OAuth profile, the gateway MAY prepend a project-defined client-identity system block; this block, when present, is always positioned first, with cache_control `{ type: "ephemeral", ttl: "1h" }` applied to the LAST system block when there is more than one system block. + +#### Scenario: Single system message extracted + +- **WHEN** the intermediate has `messages: [{ role: "system", content: "You are helpful." 
}, { role: "user", content: "hi" }]` +- **THEN** the Anthropic body SHALL have `system` as a non-empty array containing a text block whose text is or includes `"You are helpful."`, and `messages` SHALL NOT contain the system message + +#### Scenario: Multiple system messages concatenated + +- **WHEN** the intermediate has two `role: "system"` messages with contents `"A"` and `"B"` +- **THEN** their texts SHALL be concatenated with newline separators into a single text block in the Anthropic `system` array + +#### Scenario: No system messages + +- **WHEN** the intermediate has no `role: "system"` messages and no client-identity block is configured +- **THEN** the Anthropic body SHALL have no `system` field (or an empty `system` is acceptable depending on host config) + +#### Scenario: Cache_control applied to last system block + +- **WHEN** the Anthropic `system` array has two or more text blocks +- **THEN** the LAST block SHALL have `cache_control: { type: "ephemeral", ttl: "1h" }` and no other block SHALL + +### Requirement: Tool-use / tool-result ordering for Anthropic + +When translating Chat-Completions → Anthropic, the gateway SHALL ensure that every tool_use block in an assistant message is followed in the next message by the matching tool_result block. The translator SHALL: +1. Split any user-or-tool message that contains both `tool_result` blocks and non-tool-result blocks: the tool_result blocks SHALL be emitted first in their own user message; the remaining blocks SHALL be emitted in a subsequent user message. +2. Flush the in-progress message immediately after appending tool_use blocks. +3. Drop assistant text blocks that appear AFTER a `tool_use` block within the same assistant content array (Anthropic rejects them). +4. Merge consecutive messages that share the same role after the above transforms. +5. When merging messages that contain tool_result blocks alongside non-tool-result blocks, place all tool_result blocks first in the merged content array. 
+ +#### Scenario: Tool_result moved to its own user message + +- **WHEN** a Chat-Completions input has a tool message followed by a user message with text content, both originally adjacent +- **THEN** the Anthropic `messages[]` SHALL contain a user message whose content is exclusively the tool_result block, followed by a user message whose content is the text block + +#### Scenario: Assistant text after tool_use is dropped + +- **WHEN** an assistant message has content `[{ type: "text", text: "before" }, { type: "tool_use", id: "t1", name: "x", input: {} }, { type: "text", text: "after" }]` +- **THEN** the Anthropic assistant message content SHALL be `[{ type: "text", text: "before" }, { type: "tool_use", id: "t1", name: "x", input: {} }]` (the `"after"` text is removed) + +#### Scenario: Thinking block before tool_use preserved + +- **WHEN** an assistant message has content `[{ type: "thinking", thinking: "T" }, { type: "tool_use", id: "t1", name: "x", input: {} }]` +- **THEN** both blocks SHALL be preserved in the Anthropic assistant message content + +#### Scenario: Consecutive user messages are merged + +- **WHEN** the intermediate `messages[]` has two consecutive `role: "user"` messages with text contents `"a"` and `"b"` +- **THEN** the Anthropic `messages[]` SHALL have a single user message whose content includes both text blocks (preserving order) + +#### Scenario: Merge with tool_result-first ordering + +- **WHEN** merging consecutive user messages, the first contains a `tool_result` block and the second contains a `text` block +- **THEN** the merged user message's content SHALL list the tool_result block before the text block + +### Requirement: Missing tool-result auto-injection + +If an assistant message contains one or more tool_calls (OpenAI shape) or tool_use blocks (Claude shape) and the next message does not contain a matching tool_result for at least one of those call IDs, the gateway SHALL insert an empty tool message `{ role: "tool", 
tool_call_id: <call_id>, content: "" }` for EACH missing call between the assistant message and whatever follows. + +#### Scenario: Single missing tool result is filled + +- **WHEN** messages are `[{ role: "assistant", tool_calls: [{ id: "c1", function: { name: "x", arguments: "{}" } }] }, { role: "user", content: "next" }]` +- **THEN** the resulting messages SHALL be `[{ role: "assistant", ... }, { role: "tool", tool_call_id: "c1", content: "" }, { role: "user", content: "next" }]` + +#### Scenario: Multiple missing tool results + +- **WHEN** an assistant message has two tool_calls with IDs `c1` and `c2` and the next message is a user message +- **THEN** TWO empty tool messages SHALL be inserted, one per call ID, in the order the calls appeared + +#### Scenario: Existing tool result is not duplicated + +- **WHEN** an assistant message has a tool_call with ID `c1` and the next message is `{ role: "tool", tool_call_id: "c1", content: "result" }` +- **THEN** no additional tool message SHALL be inserted + +### Requirement: Tool-call ID sanitization + +The gateway SHALL ensure that every tool_call ID (in `tool_calls[].id` of assistant messages, `tool_call_id` of tool messages, `tool_use.id` and `tool_result.tool_use_id` of content blocks) matches the regex `^[a-zA-Z0-9_-]+$` AND is no longer than 64 characters before being forwarded to the Anthropic upstream. The gateway SHALL apply the following three-tier policy in order: + +1. **Pass-through**: if the ID already matches the regex AND is ≤ 64 characters, it SHALL be forwarded unchanged. +2. **Strip-and-keep**: otherwise, the gateway SHALL strip every character not in `[a-zA-Z0-9_-]`. If the residue is non-empty AND ≤ 64 characters, the residue SHALL be used. +3. **UUID fallback**: otherwise (residue empty, or residue longer than 64 characters), the gateway SHALL generate a fresh RFC-4122 UUID (with dashes removed so it matches the regex) and use that as the ID. 
The fallback SHALL NOT depend on the message index, tool-call index, or tool name. + +The same ID replacement SHALL be applied consistently to BOTH the originating `tool_use.id` / `tool_calls[].id` AND any matching `tool_result.tool_use_id` / `tool_call_id` references within the same request so the upstream sees a consistent mapping. + +The gateway SHALL also ensure that every tool_call's `type` field is set to `"function"` if missing, and that every tool_call's `function.arguments` field is a JSON string (the gateway SHALL JSON-stringify object values). + +#### Scenario: Valid ID passes through + +- **WHEN** a tool_call has `id: "call_abc-123"` +- **THEN** the ID SHALL remain `"call_abc-123"` + +#### Scenario: ID with invalid characters is sanitized + +- **WHEN** a tool_call has `id: "call:abc/123"` +- **THEN** the ID SHALL become `"callabc123"` + +#### Scenario: ID is entirely invalid characters + +- **WHEN** a tool_call has `id: "::::"` +- **THEN** the ID SHALL become a freshly generated UUID (matching `^[a-zA-Z0-9]+$` after dash removal), independent of message index or tool name + +#### Scenario: ID exceeds 64 characters after stripping + +- **WHEN** a tool_call has `id: "<70-character-alphanumeric-string>"` +- **THEN** the ID SHALL be replaced with a freshly generated UUID + +#### Scenario: tool_result references are remapped consistently + +- **WHEN** an assistant message has a tool_call whose ID is replaced with `X`, and the following user message has a `tool_result` with `tool_use_id` matching the original +- **THEN** the user message's `tool_use_id` SHALL also be `X` so the upstream sees a consistent pair + +#### Scenario: Object arguments stringified + +- **WHEN** a tool_call has `function.arguments: { q: "x" }` (an object, not a string) +- **THEN** `function.arguments` SHALL become the string `"{\"q\":\"x\"}"` + +#### Scenario: Type defaulted to function + +- **WHEN** a tool_call has no `type` field +- **THEN** `type` SHALL be set to `"function"` + +### 
Requirement: Tool declaration conversion (ChatCompletions → Anthropic) + +When translating Chat-Completions → Anthropic, the gateway SHALL convert each tool declaration as follows: a `{ type: "function", function: { name, description, parameters } }` declaration SHALL become `{ name: <name>, description: <description>, input_schema: <parameters> }`. A non-function tool declaration (e.g. an Anthropic-native server tool with a `type` other than `"function"`) SHALL be passed through unchanged. No tool-name prefix is applied; tool names are forwarded verbatim. + +If the converted tools array is non-empty, the LAST tool SHALL receive `cache_control: { type: "ephemeral", ttl: "1h" }` and no other tool SHALL. + +#### Scenario: Function tool conversion + +- **WHEN** the intermediate has `tools: [{ type: "function", function: { name: "search", description: "find", parameters: { type: "object", properties: { q: { type: "string" } } } } }]` +- **THEN** the Anthropic tools SHALL be `[{ name: "search", description: "find", input_schema: { type: "object", properties: { q: { type: "string" } } }, cache_control: { type: "ephemeral", ttl: "1h" } }]` + +#### Scenario: Default empty input_schema + +- **WHEN** a function tool has no `parameters` and no `input_schema` +- **THEN** the converted `input_schema` SHALL be `{ type: "object", properties: {}, required: [] }` + +#### Scenario: Server tool passes through + +- **WHEN** the intermediate has `tools: [{ type: "web_search_20250305", name: "web_search" }]` +- **THEN** that entry SHALL appear unchanged in the Anthropic tools array (no prefix applied) + +#### Scenario: Cache_control on last tool only + +- **WHEN** there are three function tools after conversion +- **THEN** only the third tool SHALL have `cache_control` set + +### Requirement: tool_choice conversion (ChatCompletions → Anthropic) + +The gateway SHALL convert the Chat-Completions `tool_choice` value to the Anthropic form as follows: +- `"auto"` or `"none"` → `{ type: "auto" }` +- `"required"` → `{ type: 
"any" }` +- `{ type: "function", function: { name: <name> } }` → `{ type: "tool", name: <name> }` +- An Anthropic-shaped object (one that already has `type`) SHALL pass through unchanged +- Any other value SHALL default to `{ type: "auto" }` + +#### Scenario: Auto + +- **WHEN** the intermediate has `tool_choice: "auto"` +- **THEN** the Anthropic `tool_choice` SHALL be `{ type: "auto" }` + +#### Scenario: Required becomes any + +- **WHEN** the intermediate has `tool_choice: "required"` +- **THEN** the Anthropic `tool_choice` SHALL be `{ type: "any" }` + +#### Scenario: Specific function + +- **WHEN** the intermediate has `tool_choice: { type: "function", function: { name: "search" } }` +- **THEN** the Anthropic `tool_choice` SHALL be `{ type: "tool", name: "search" }` + +#### Scenario: Already-Anthropic-shaped + +- **WHEN** the intermediate has `tool_choice: { type: "any" }` +- **THEN** the Anthropic `tool_choice` SHALL be `{ type: "any" }` + +### Requirement: max_tokens adjustment + +The gateway SHALL set the Anthropic `max_tokens` field as follows: +1. Start with the request's `max_tokens` if present, else the project default. +2. If `tools` is a non-empty array AND the current value is below the project's minimum-with-tools threshold, raise the value to that minimum. +3. If `thinking.budget_tokens` is set AND the current value is less than or equal to `budget_tokens`, raise the value to `budget_tokens + 1024`. 
+ +#### Scenario: Request max_tokens passes through + +- **WHEN** the request has `max_tokens: 4096` and no tools and no thinking +- **THEN** the Anthropic `max_tokens` SHALL be `4096` + +#### Scenario: Default applied when missing + +- **WHEN** the request has no `max_tokens` and no tools and no thinking +- **THEN** the Anthropic `max_tokens` SHALL be the project's default `DEFAULT_MAX_TOKENS` + +#### Scenario: Raised by tools minimum + +- **WHEN** the request has `max_tokens: 256` and a non-empty `tools` array, with project minimum `DEFAULT_MIN_TOKENS = 4096` +- **THEN** the Anthropic `max_tokens` SHALL be `4096` + +#### Scenario: Raised above thinking budget + +- **WHEN** the request has `max_tokens: 2048` and `thinking.budget_tokens: 8192` +- **THEN** the Anthropic `max_tokens` SHALL be `9216` (i.e. `budget_tokens + 1024`) + +#### Scenario: Thinking budget equal triggers raise + +- **WHEN** the request has `max_tokens: 8192` and `thinking.budget_tokens: 8192` (equal, not strictly greater) +- **THEN** the Anthropic `max_tokens` SHALL be `9216` + +### Requirement: reasoning_effort to thinking.budget_tokens mapping + +When the Chat-Completions intermediate has a `reasoning_effort` field but no explicit `thinking` block, the gateway SHALL map the effort to an Anthropic `thinking` configuration using the table: `none → no thinking emitted`, `low → { type: "enabled", budget_tokens: 4096 }`, `medium → { type: "enabled", budget_tokens: 8192 }`, `high → { type: "enabled", budget_tokens: 16384 }`, `xhigh → { type: "enabled", budget_tokens: 32768 }`. The mapping SHALL be case-insensitive. Any other effort value SHALL be ignored. 
+ +#### Scenario: medium effort + +- **WHEN** the intermediate has `reasoning_effort: "medium"` and no `thinking` field +- **THEN** the Anthropic body SHALL include `thinking: { type: "enabled", budget_tokens: 8192 }` + +#### Scenario: none effort emits no thinking + +- **WHEN** the intermediate has `reasoning_effort: "none"` +- **THEN** the Anthropic body SHALL NOT include a `thinking` field + +#### Scenario: Explicit thinking wins over effort + +- **WHEN** the intermediate has both `reasoning_effort: "low"` and `thinking: { type: "enabled", budget_tokens: 999 }` +- **THEN** the Anthropic `thinking` SHALL be `{ type: "enabled", budget_tokens: 999 }` + +#### Scenario: Case-insensitive + +- **WHEN** the intermediate has `reasoning_effort: "HIGH"` +- **THEN** the Anthropic body SHALL include `thinking: { type: "enabled", budget_tokens: 16384 }` + +### Requirement: response_format JSON-mode shim + +When the Chat-Completions intermediate has `response_format`, the gateway SHALL append an additional system block to `systemParts` before assembling the Anthropic `system` array. For `response_format.type === "json_schema"` with a non-null `json_schema.schema`, the appended text SHALL include the literal phrase "You must respond with valid JSON" AND a pretty-printed JSON rendering of the schema AND the literal phrase "Respond ONLY with the JSON object". For `response_format.type === "json_object"`, the appended text SHALL include the literal phrase "You must respond with valid JSON" AND the literal phrase "Respond ONLY with a JSON object". For any other `response_format` value, no system block SHALL be appended. 
+ +#### Scenario: json_schema appends instructions and schema + +- **WHEN** the intermediate has `response_format: { type: "json_schema", json_schema: { schema: { type: "object", properties: { answer: { type: "number" } } } } }` +- **THEN** the Anthropic `system` array SHALL contain a text block whose text contains all of `"You must respond with valid JSON"`, the substring `"answer"`, and `"Respond ONLY with the JSON object"` + +#### Scenario: json_object appends generic instruction + +- **WHEN** the intermediate has `response_format: { type: "json_object" }` +- **THEN** the Anthropic `system` array SHALL contain a text block whose text contains `"You must respond with valid JSON"` and `"Respond ONLY with a JSON object"` + +#### Scenario: Other type ignored + +- **WHEN** the intermediate has `response_format: { type: "text" }` or no `response_format` +- **THEN** no JSON-mode system block SHALL be appended + +#### Scenario: Coexists with user-supplied system + +- **WHEN** the intermediate has both a `role: "system"` message `"You are helpful."` and `response_format: { type: "json_object" }` +- **THEN** the Anthropic `system` array SHALL contain a text block whose combined text contains BOTH `"You are helpful."` AND `"You must respond with valid JSON"` + +### Requirement: Image content mapping (ChatCompletions → Anthropic) + +When translating Chat-Completions → Anthropic for a user message content block of type `image_url`, the gateway SHALL inspect the URL: +- If the URL matches `^data:([^;]+);base64,(.+)$`, emit an Anthropic block `{ type: "image", source: { type: "base64", media_type: <capture group 1>, data: <capture group 2> } }`. +- Else if the URL starts with `http://` or `https://`, emit `{ type: "image", source: { type: "url", url } }`. +- Else drop the image block. + +Anthropic-shape image blocks `{ type: "image", source: ... }` SHALL be passed through unchanged. 
+ +#### Scenario: Base64 data URL + +- **WHEN** a user message content has `{ type: "image_url", image_url: { url: "data:image/png;base64,iVBORw0KGgo=" } }` +- **THEN** the Anthropic block SHALL be `{ type: "image", source: { type: "base64", media_type: "image/png", data: "iVBORw0KGgo=" } }` + +#### Scenario: HTTP URL + +- **WHEN** a user message content has `{ type: "image_url", image_url: { url: "https://example.com/a.png" } }` +- **THEN** the Anthropic block SHALL be `{ type: "image", source: { type: "url", url: "https://example.com/a.png" } }` + +#### Scenario: Unsupported URL is dropped + +- **WHEN** a user message content has `{ type: "image_url", image_url: { url: "ftp://x/y" } }` +- **THEN** no image block SHALL appear in the Anthropic message content + +### Requirement: Assistant content blocks (ChatCompletions → Anthropic) + +For each assistant message in the Chat-Completions intermediate, the gateway SHALL map its content blocks and tool_calls into Anthropic content blocks as follows: + +- A `text` block with non-empty `text` SHALL become an Anthropic `{ type: "text", text }` block. +- A `tool_use` block SHALL become `{ type: "tool_use", id, name, input }`. The name is forwarded verbatim with no prefix applied. +- A `thinking` or `redacted_thinking` block SHALL pass through with its `cache_control` field stripped (these block types do not accept cache_control). +- A string `content` SHALL be emitted as a single text block when non-empty. +- For each entry in `tool_calls[]` whose `type` is `"function"`, an Anthropic `{ type: "tool_use", id, name: <function.name>, input: <function.arguments> }` block SHALL be appended; `function.arguments` SHALL be parsed as JSON if it is a string, falling back to the raw string when parsing fails. 
+ +#### Scenario: Text block conversion + +- **WHEN** an assistant message has `content: [{ type: "text", text: "hi" }]` +- **THEN** the Anthropic assistant content SHALL contain `{ type: "text", text: "hi" }` + +#### Scenario: tool_calls become tool_use + +- **WHEN** an assistant message has `tool_calls: [{ id: "c1", type: "function", function: { name: "search", arguments: "{\"q\":\"x\"}" } }]` +- **THEN** the Anthropic assistant content SHALL contain `{ type: "tool_use", id: "c1", name: "search", input: { q: "x" } }` + +#### Scenario: Unparseable arguments kept as string + +- **WHEN** a tool_call has `function.arguments: "not json"` +- **THEN** the Anthropic `tool_use.input` SHALL be the string `"not json"` + +#### Scenario: Thinking block strips cache_control + +- **WHEN** an assistant message has `content: [{ type: "thinking", thinking: "T", cache_control: { type: "ephemeral" } }]` +- **THEN** the Anthropic assistant content SHALL contain `{ type: "thinking", thinking: "T" }` with no `cache_control` + +### Requirement: User and tool content blocks (ChatCompletions → Anthropic) + +For a tool message (`role: "tool"`), the gateway SHALL emit `{ type: "tool_result", tool_use_id: <tool_call_id>, content: <content> }` as the sole block. + +For a user message: +- A string `content` SHALL produce a single `{ type: "text", text }` block when non-empty; empty strings emit nothing. +- An array `content` SHALL be walked: `text` blocks with non-empty text become Anthropic text blocks; `tool_result` blocks pass through (with their optional `is_error` field preserved); `image_url` and `image` blocks are mapped per the Image content mapping requirement. 
+ +#### Scenario: Tool message becomes tool_result + +- **WHEN** messages contain `{ role: "tool", tool_call_id: "c1", content: "result text" }` +- **THEN** the Anthropic message SHALL be `{ role: "user", content: [{ type: "tool_result", tool_use_id: "c1", content: "result text" }] }` + +#### Scenario: Tool_result with is_error + +- **WHEN** a user message has `content: [{ type: "tool_result", tool_use_id: "c1", content: "err", is_error: true }]` +- **THEN** the Anthropic block SHALL preserve `is_error: true` + +#### Scenario: Empty user string drops text block + +- **WHEN** a user message has `content: ""` +- **THEN** no text block SHALL be emitted for that message + +### Requirement: Cache_control on last assistant content block + +After all content blocks are assembled, the gateway SHALL apply `cache_control: { type: "ephemeral" }` to the LAST eligible content block of the LAST assistant message (eligible means type in `{text, tool_use, tool_result, image}` — thinking blocks are not eligible). At most one such marker SHALL be added per request. 
+ +#### Scenario: Marker applied to last text block + +- **WHEN** the last assistant message has content `[{ type: "thinking", thinking: "T" }, { type: "text", text: "answer" }]` +- **THEN** the text block SHALL receive `cache_control: { type: "ephemeral" }` and the thinking block SHALL NOT + +#### Scenario: Skip past trailing thinking + +- **WHEN** the last assistant message has content `[{ type: "text", text: "answer" }, { type: "thinking", thinking: "T" }]` +- **THEN** the text block (not the thinking block) SHALL receive `cache_control` + +#### Scenario: No assistant message + +- **WHEN** there is no assistant message in the conversation +- **THEN** no cache_control marker SHALL be added on the assistant side + +### Requirement: Response stream — message_start + +On the FIRST chunk received from the upstream that yields any usable delta, the streaming translator (Anthropic → ChatCompletions hop) SHALL emit a `message_start` event whose `message` field includes `id`, `type: "message"`, `role: "assistant"`, `model`, `content: []`, `stop_reason: null`, `stop_sequence: null`, and `usage: { input_tokens: 0, output_tokens: 0 }`. The translator SHALL derive `id` from the chunk's id (stripping a `chatcmpl-` prefix if present); if the derived id is empty, the value `"chat"`, or shorter than 8 characters, the translator SHALL fall back to a request-id or trace-id from the chunk's `extend_fields`, finally to `msg_<timestamp>` (a numeric timestamp suffix). The `model` field SHALL be the chunk's `model` field or `"unknown"`. This event SHALL fire exactly once per stream. 
+ +#### Scenario: message_start fires once + +- **WHEN** two non-empty chunks are processed in sequence at the start of a stream +- **THEN** exactly one `message_start` event SHALL be emitted, on or before the first emission of any content_block event + +#### Scenario: Empty id falls back to msg_<timestamp> + +- **WHEN** the first chunk has `id: ""` and no `extend_fields` +- **THEN** the emitted `message.id` SHALL match the regex `^msg_\d+$` + +#### Scenario: chatcmpl-prefix stripped + +- **WHEN** the first chunk has `id: "chatcmpl-abc12345"` +- **THEN** the emitted `message.id` SHALL be `"abc12345"` + +### Requirement: Response stream — text content blocks + +When a chunk's `delta.content` is non-empty, the translator SHALL ensure a text content_block is open (opening with `content_block_start` of type `text` at the next available index if not yet open) and SHALL emit a `content_block_delta` event of type `text_delta` carrying the content string. Before opening a text block, any open thinking block SHALL be closed via `content_block_stop`. 
+ +#### Scenario: First text delta opens a text block + +- **WHEN** the first content-bearing chunk has `delta.content: "hello"` +- **THEN** the translator SHALL emit a `content_block_start` (type text) followed by a `content_block_delta` (type text_delta, text "hello") + +#### Scenario: Subsequent text delta reuses the open block + +- **WHEN** a second chunk has `delta.content: " world"` and the text block is open +- **THEN** the translator SHALL emit ONLY a `content_block_delta` for that block index + +#### Scenario: Text after thinking closes thinking first + +- **WHEN** a thinking block is open and a chunk has `delta.content: "hello"` +- **THEN** a `content_block_stop` for the thinking block SHALL be emitted BEFORE the new text block's `content_block_start` + +### Requirement: Response stream — thinking content blocks + +When a chunk has `delta.reasoning_content` or `delta.reasoning` non-empty, the translator SHALL ensure a thinking content_block is open (opening with `content_block_start` of type `thinking` if not yet open) and SHALL emit a `content_block_delta` of type `thinking_delta`. Before opening a thinking block, any open text block SHALL be closed via `content_block_stop` (idempotent). 
+ +#### Scenario: reasoning_content opens thinking + +- **WHEN** a chunk has `delta.reasoning_content: "step 1"` and no prior thinking emitted +- **THEN** the translator SHALL emit `content_block_start` (type thinking) followed by `content_block_delta` (type thinking_delta, thinking "step 1") + +#### Scenario: reasoning alias + +- **WHEN** a chunk has `delta.reasoning: "step 2"` (note the alternate field name) and no `reasoning_content` +- **THEN** the translator SHALL behave as if `delta.reasoning_content` were `"step 2"` + +### Requirement: Response stream — tool_use content blocks + +When a chunk's `delta.tool_calls[]` contains an entry with a non-empty `id`, the translator SHALL close any open text or thinking block and SHALL open a new tool_use content_block at the next available index. The block's `name` SHALL be the entry's `function.name` (forwarded verbatim, no prefix stripping). The block's `input` SHALL start as `{}`. When a subsequent chunk emits `function.arguments` for the same tool_call index, the translator SHALL emit `content_block_delta` of type `input_json_delta` with `partial_json` equal to that argument fragment. On finish, every open tool_use block SHALL be closed via `content_block_stop`. 
+ +#### Scenario: tool_call opens tool_use block + +- **WHEN** a chunk has `delta.tool_calls: [{ index: 0, id: "c1", function: { name: "search" } }]` +- **THEN** the translator SHALL emit `content_block_start` of type `tool_use` with `id: "c1"`, name `"search"`, input `{}` + +#### Scenario: Subsequent argument fragments emit input_json_delta + +- **WHEN** chunk 2 has `delta.tool_calls: [{ index: 0, function: { arguments: "{\"q\":" } }]` and chunk 3 has `delta.tool_calls: [{ index: 0, function: { arguments: "\"x\"}" } }]` +- **THEN** the translator SHALL emit TWO `content_block_delta` events with `input_json_delta`, with partial_json `"{\"q\":"` then `"\"x\"}"` + +#### Scenario: Tool name forwarded verbatim + +- **WHEN** a tool_call has `function.name: "search"` +- **THEN** the emitted tool_use block's `name` SHALL be `"search"` (no prefix added, no prefix stripped) + +#### Scenario: All tool_use blocks closed on finish + +- **WHEN** the upstream emits two tool_calls and then a `finish_reason: "tool_calls"` chunk +- **THEN** TWO `content_block_stop` events SHALL be emitted, one per open tool_use block + +### Requirement: Response stream — finish and usage + +When a chunk has a non-null `finish_reason`, the translator (Anthropic → ChatCompletions hop) SHALL close any open text, thinking, and tool_use blocks, emit a `message_delta` event whose `delta.stop_reason` is the mapped value of the finish reason (`stop → end_turn`, `length → max_tokens`, `tool_calls → tool_use`, any other → `end_turn`) and whose `usage` is the accumulated usage, then emit `message_stop`. The accumulated `usage` SHALL be computed from any chunk that carries a `usage` object: `input_tokens = max(0, prompt_tokens − cached_tokens − cache_creation_tokens)`, `output_tokens = completion_tokens`, `cache_read_input_tokens = cached_tokens` (omitted when zero), `cache_creation_input_tokens = cache_creation_tokens` (omitted when zero). 
Cache token fields are read from `usage.prompt_tokens_details.{cached_tokens, cache_creation_tokens}`. Reasoning-token sub-detail SHALL NOT be added to output_tokens (it is already included in completion_tokens). + +#### Scenario: stop maps to end_turn + +- **WHEN** the finishing chunk has `finish_reason: "stop"` +- **THEN** the emitted `message_delta` SHALL have `delta.stop_reason: "end_turn"` + +#### Scenario: length maps to max_tokens + +- **WHEN** the finishing chunk has `finish_reason: "length"` +- **THEN** the emitted `message_delta` SHALL have `delta.stop_reason: "max_tokens"` + +#### Scenario: tool_calls maps to tool_use + +- **WHEN** the finishing chunk has `finish_reason: "tool_calls"` +- **THEN** the emitted `message_delta` SHALL have `delta.stop_reason: "tool_use"` + +#### Scenario: Unknown finish reason maps to end_turn + +- **WHEN** the finishing chunk has `finish_reason: "content_filter"` +- **THEN** the emitted `message_delta` SHALL have `delta.stop_reason: "end_turn"` + +#### Scenario: Cache tokens propagated + +- **WHEN** any chunk's `usage` is `{ prompt_tokens: 100, completion_tokens: 50, prompt_tokens_details: { cached_tokens: 30, cache_creation_tokens: 20 } }` +- **THEN** the emitted `usage` SHALL be `{ input_tokens: 50, output_tokens: 50, cache_read_input_tokens: 30, cache_creation_input_tokens: 20 }` + +#### Scenario: Zero cache tokens omitted + +- **WHEN** any chunk's `usage` is `{ prompt_tokens: 100, completion_tokens: 50, prompt_tokens_details: { cached_tokens: 0 } }` +- **THEN** the emitted `usage` SHALL be `{ input_tokens: 100, output_tokens: 50 }` (no cache fields) + +### Requirement: Response stream — Chat-Completions → Responses-API events + +The streaming translator (ChatCompletions → Responses-API hop) SHALL emit Responses-API events with strictly increasing `sequence_number` values starting from 1. On the first usable chunk it SHALL emit `response.created` then `response.in_progress` exactly once each. 
For each `delta.content` it SHALL ensure a `message` output_item is open (emitting `response.output_item.added` of type `message` with content `[]` and role `"assistant"`, then `response.content_part.added` of type `output_text`) and SHALL emit `response.output_text.delta` events. For each `delta.reasoning_content` it SHALL ensure a `reasoning` output_item is open (emitting `response.output_item.added` of type `reasoning` and `response.reasoning_summary_part.added` of type `summary_text`) and SHALL emit `response.reasoning_summary_text.delta`. On finish it SHALL close every open item (`response.output_text.done`, `response.content_part.done`, `response.output_item.done` for messages; `response.reasoning_summary_text.done`, `response.reasoning_summary_part.done`, `response.output_item.done` for reasoning; `response.function_call_arguments.done`, `response.output_item.done` for function calls) and emit `response.completed` exactly once. The `response.id` value SHALL be the upstream `chunk.id` prefixed by `resp_`. The `created_at` field SHALL be a Unix timestamp captured at stream start. 
+ +#### Scenario: sequence_number is strictly increasing + +- **WHEN** any sequence of events is emitted for a stream +- **THEN** every event's `sequence_number` SHALL equal the previous event's value plus 1, starting at 1 + +#### Scenario: response.created precedes response.in_progress precedes any delta + +- **WHEN** the first usable chunk produces a text delta +- **THEN** the emitted events SHALL be, in order: `response.created`, `response.in_progress`, `response.output_item.added`, `response.content_part.added`, `response.output_text.delta` + +#### Scenario: response.completed fires once + +- **WHEN** any stream ends successfully +- **THEN** exactly ONE `response.completed` event SHALL be emitted + +#### Scenario: response id derived from chunk id + +- **WHEN** the first chunk has `id: "abc12345"` +- **THEN** the emitted `response.id` SHALL be `"resp_abc12345"` + +#### Scenario: Reasoning open/close events + +- **WHEN** the upstream emits two `delta.reasoning_content` fragments then finishes +- **THEN** the emitted events SHALL include `response.output_item.added` (type reasoning), `response.reasoning_summary_part.added`, two `response.reasoning_summary_text.delta`, `response.reasoning_summary_text.done` (with full buffered text), `response.reasoning_summary_part.done`, `response.output_item.done` + +### Requirement: Response stream — `<think>` inline marker recognition + +When a chunk's `delta.content` contains the literal substring `<think>`, the translator SHALL split the chunk at that point, emit any text before `<think>` as normal text, open a reasoning output_item, and route the text AFTER `<think>` into the reasoning channel. When a subsequent chunk's content contains `</think>`, the translator SHALL split at that point, emit the part before `</think>` as reasoning, close the reasoning item, then emit the part after `</think>` as normal text. 
+ +#### Scenario: Open marker mid-stream + +- **WHEN** a chunk has `delta.content: "intro<think>step"` +- **THEN** the translator SHALL emit a text delta for `"intro"`, open a reasoning item, and emit a reasoning delta for `"step"` + +#### Scenario: Close marker mid-stream + +- **WHEN** a reasoning item is open via inline marker and a chunk has `delta.content: "more</think>answer"` +- **THEN** the translator SHALL emit a reasoning delta for `"more"`, close the reasoning item, and emit a text delta for `"answer"` + +#### Scenario: Open without close at EOS + +- **WHEN** the stream ends while still inside an inline `<think>` block +- **THEN** the flush path SHALL close the reasoning item before `response.completed` + +### Requirement: Response stream — function_call output items + +When the Chat-Completions chunk indicates a tool_call (a `delta.tool_calls[]` entry), the translator SHALL emit Responses-API events as follows. For the first chunk that carries a `tool_calls[].id`, it SHALL close any currently-open `message` output_item via `closeMessage` (emitting `response.output_text.done`, `response.content_part.done`, `response.output_item.done`) and emit `response.output_item.added` of type `function_call` with `arguments: ""`, `call_id: <tool_calls[].id>`, `name: <function.name>`. For each subsequent chunk carrying `function.arguments` it SHALL emit `response.function_call_arguments.delta`. On finish or end-of-stream it SHALL emit `response.function_call_arguments.done` (with the buffered arguments string, or `"{}"` if empty) followed by `response.output_item.done` of type `function_call`. 
+ +#### Scenario: function_call.added precedes any arguments delta + +- **WHEN** the first tool_call chunk has `delta.tool_calls: [{ index: 0, id: "c1", function: { name: "search", arguments: "{" } }]` +- **THEN** the emitted events SHALL be `response.output_item.added` (type function_call, name "search", arguments "") then `response.function_call_arguments.delta` (delta "{") + +#### Scenario: function_call done emits buffered arguments + +- **WHEN** chunk 1 emits arguments `"{\"q\":"` and chunk 2 emits arguments `"\"x\"}"` and then finish is signalled +- **THEN** `response.function_call_arguments.done` SHALL carry `arguments: "{\"q\":\"x\"}"` + +#### Scenario: Empty arguments default to "{}" + +- **WHEN** a tool_call is opened and closed without any `function.arguments` fragments +- **THEN** the emitted `response.function_call_arguments.done` SHALL carry `arguments: "{}"` + +### Requirement: Response stream — error event mapping + +When the upstream emits an `error` event or a `response.failed` event, the translator (Responses-API → Chat-Completions hop) SHALL emit a single OpenAI-shaped error chunk: a `chat.completion.chunk` with `choices[0].delta.content` set to `[Error] <error message>` and `choices[0].finish_reason: "stop"`. The translator SHALL emit AT MOST ONE such chunk per stream — back-to-back `error` and `response.failed` events SHALL be deduplicated. + +#### Scenario: error event surfaces as content chunk + +- **WHEN** an `error` event arrives with `data.error: { message: "model_not_found" }` +- **THEN** the next emitted chunk SHALL be `{ choices: [{ index: 0, delta: { content: "[Error] model_not_found" }, finish_reason: "stop" }], ... 
}` + +#### Scenario: response.failed after error is suppressed + +- **WHEN** an `error` event is followed by a `response.failed` event in the same stream +- **THEN** only ONE error chunk SHALL be emitted + +### Requirement: Response stream — flush on null chunk + +When the streaming translator receives a `null` chunk (end-of-stream sentinel), it SHALL close every still-open output_item, emit `response.completed` if not already emitted, and emit a final Chat-Completions chunk with empty delta and a computed `finish_reason` (`tool_calls` if any tool_call was emitted, else `stop`). The flush path SHALL be idempotent: a second null chunk produces no events. + +#### Scenario: Null flush closes open message + +- **WHEN** the translator has an open message output_item and receives `null` +- **THEN** it SHALL emit `response.output_text.done`, `response.content_part.done`, `response.output_item.done`, `response.completed` + +#### Scenario: Null flush finish_reason is tool_calls when a tool was emitted + +- **WHEN** the stream emitted a tool_call and then null +- **THEN** the final Chat-Completions chunk's `finish_reason` SHALL be `"tool_calls"` + +#### Scenario: Idempotent null flush + +- **WHEN** the translator has already emitted `response.completed` and a second null arrives +- **THEN** no further events SHALL be emitted + +### Requirement: Response stream — usage propagation on completed event + +When the streaming translator (Responses-API → Chat-Completions hop) encounters a `response.completed` event whose `response.usage` is present, it SHALL set the accumulated usage to `{ prompt_tokens: input_tokens (or prompt_tokens), completion_tokens: output_tokens (or completion_tokens), total_tokens: prompt_tokens + completion_tokens }`. If `input_tokens_details.cached_tokens` (or `cache_read_input_tokens`) is > 0, it SHALL add `prompt_tokens_details: { cached_tokens: <n> }`. The usage SHALL be attached to the final Chat-Completions chunk's `usage` field. 
+ +#### Scenario: usage propagated + +- **WHEN** a `response.completed` event has `response.usage: { input_tokens: 100, output_tokens: 50, input_tokens_details: { cached_tokens: 30 } }` +- **THEN** the final Chat-Completions chunk's `usage` SHALL be `{ prompt_tokens: 100, completion_tokens: 50, total_tokens: 150, prompt_tokens_details: { cached_tokens: 30 } }` + +#### Scenario: Legacy field names accepted + +- **WHEN** the upstream uses `prompt_tokens`/`completion_tokens`/`cache_read_input_tokens` instead of the Responses field names +- **THEN** the translator SHALL accept those values as equivalent + +### Requirement: Response stream — custom_tool_call variant + +The streaming translator SHALL treat `response.output_item.added` events whose `item.type` is `"custom_tool_call"` identically to `"function_call"` events. The translator SHALL treat `response.custom_tool_call_input.delta` events identically to `response.function_call_arguments.delta`. The translator SHALL treat `response.output_item.done` for `custom_tool_call` items as a tool-call increment trigger identical to `function_call`. + +#### Scenario: custom_tool_call opens like function_call + +- **WHEN** a `response.output_item.added` event has `item: { type: "custom_tool_call", call_id: "c1", name: "x" }` +- **THEN** the emitted Chat-Completions chunk SHALL contain `delta.tool_calls[0] = { index: 0, id: "c1", type: "function", function: { name: "x", arguments: "" } }` + +#### Scenario: custom_tool_call_input.delta forwarded + +- **WHEN** a `response.custom_tool_call_input.delta` event has `delta: "{}"` +- **THEN** the emitted Chat-Completions chunk SHALL contain `delta.tool_calls[0].function.arguments: "{}"` + +### Requirement: Backward compatibility — no behavior change for non-Anthropic upstreams + +The translation pipeline SHALL only execute when the source format and target format differ. A `/v1/responses` request routed to an OpenAI-compatible upstream SHALL behave exactly as today. 
A `/v1/messages` request routed to an Anthropic upstream SHALL behave exactly as today. A `/v1/chat/completions` request SHALL behave exactly as today unless its body contains an `input` array. + +#### Scenario: Responses to OpenAI passthrough + +- **WHEN** a `/v1/responses` request is routed to an OpenAI-compatible channel +- **THEN** the request body and response stream SHALL pass through with no transformation (same-format pivot) + +#### Scenario: /v1/messages unchanged + +- **WHEN** a `/v1/messages` request is routed to an Anthropic channel +- **THEN** no translation step SHALL be invoked + +### Requirement: No leakage of internal state into upstream body + +The gateway SHALL strip any internal scratch fields it may have attached to the body (for example fields used by the translation layer to carry per-request scratch state) before sending the body to the upstream. By convention every such scratch field's name starts with an underscore so the strip rule can match by prefix. + +#### Scenario: Internal underscore-prefixed fields stripped + +- **WHEN** the translator attaches an internal underscore-prefixed scratch field to the intermediate body (for example to track per-stream state) +- **THEN** the JSON body delivered to the upstream SHALL NOT contain any top-level field whose name begins with `_` + diff --git a/relay/channel/claude/relay-claude.go b/relay/channel/claude/relay-claude.go index 046ccfe681a..e2a58666622 100644 --- a/relay/channel/claude/relay-claude.go +++ b/relay/channel/claude/relay-claude.go @@ -1,10 +1,14 @@ package claude import ( + "bytes" + "encoding/base64" "encoding/json" "fmt" "io" + "mime" "net/http" + "path/filepath" "strings" "github.com/QuantumNous/new-api/common" @@ -25,6 +29,62 @@ import ( "github.com/tidwall/sjson" ) +// fileCategory classifies a file by extension/mime for Claude formatting. 
+type fileCategory int + +const ( + fileCategoryUnsupported fileCategory = iota + fileCategoryPDF + fileCategoryText + fileCategoryImage +) + +// classifyFile inspects a MessageFile and returns the (category, mimeType) tuple +// used to build a Claude content block. The mimeType is best-effort and may be +// empty when the file is unsupported. +func classifyFile(file *dto.MessageFile) (fileCategory, string) { + if file == nil { + return fileCategoryUnsupported, "" + } + + ext := strings.ToLower(filepath.Ext(file.FileName)) + mimeType := strings.ToLower(mime.TypeByExtension(ext)) + if idx := strings.Index(mimeType, ";"); idx >= 0 { + mimeType = strings.TrimSpace(mimeType[:idx]) + } + + switch ext { + case ".pdf": + return fileCategoryPDF, "application/pdf" + case ".txt", ".md", ".csv", ".log": + return fileCategoryText, "text/plain" + case ".json": + return fileCategoryText, "application/json" + case ".png": + return fileCategoryImage, "image/png" + case ".jpg", ".jpeg": + return fileCategoryImage, "image/jpeg" + case ".gif": + return fileCategoryImage, "image/gif" + case ".webp": + return fileCategoryImage, "image/webp" + } + + // Fallback to MIME detection (only when extension lookup was ambiguous). + switch { + case mimeType == "application/pdf": + return fileCategoryPDF, "application/pdf" + case strings.HasPrefix(mimeType, "text/"): + return fileCategoryText, mimeType + case mimeType == "application/json": + return fileCategoryText, "application/json" + case strings.HasPrefix(mimeType, "image/"): + return fileCategoryImage, mimeType + } + + return fileCategoryUnsupported, "" +} + const ( WebSearchMaxUsesLow = 1 WebSearchMaxUsesMedium = 5 @@ -44,6 +104,210 @@ func maybeMarkClaudeRefusal(c *gin.Context, stopReason string) { } } +// claudeToolCacheControlMarker is the cache_control marker applied to the +// final tool block per Anthropic's prompt-caching guidance. Tools use the 1h +// TTL because tool schemas are typically long-lived across calls (spec §15). 
+var claudeToolCacheControlMarker = json.RawMessage(`{"type":"ephemeral","ttl":"1h"}`) + +// claudeAssistantCacheControlMarker is the cache_control marker applied to +// the last eligible content block of the last assistant message. Per +// responses-to-anthropic-translation spec §581-583, this marker MUST NOT +// carry a TTL field — emit only {type:"ephemeral"}. +var claudeAssistantCacheControlMarker = json.RawMessage(`{"type":"ephemeral"}`) + +// applyCacheControlToLastTool sets the ephemeral 1h cache_control marker on +// the trailing element of the tools array, if any. We mutate in place because +// the slice elements are stored as pointers. +func applyCacheControlToLastTool(tools []any) { + if len(tools) == 0 { + return + } + last := tools[len(tools)-1] + switch t := last.(type) { + case *dto.Tool: + t.CacheControl = &dto.ClaudeCacheControl{Type: "ephemeral", TTL: "1h"} + case *dto.ClaudeWebSearchTool: + // ClaudeWebSearchTool has no CacheControl field defined yet; do + // nothing rather than fabricate an unsupported shape. + } +} + +// applyCacheControlToLastAssistantContent walks the messages in reverse and, +// for the final assistant message, attaches the ephemeral cache_control +// marker to the last eligible content block. Eligible block types are +// {text, tool_use, tool_result, image}; thinking blocks are NOT eligible +// because Anthropic does not honour cache_control on them (spec §581-598). +// +// The emitted marker is {type:"ephemeral"} with NO TTL field per the spec. +func applyCacheControlToLastAssistantContent(messages []dto.ClaudeMessage) { + for i := len(messages) - 1; i >= 0; i-- { + if messages[i].Role != "assistant" { + continue + } + blocks, ok := messages[i].Content.([]dto.ClaudeMediaMessage) + if !ok { + // RequestOpenAI2ClaudeMessage emits plain-string content for the + // common text-only assistant case; promote it to a single-block + // []ClaudeMediaMessage so we can attach cache_control. 
+ if text, ok := messages[i].Content.(string); ok && text != "" { + blocks = []dto.ClaudeMediaMessage{{ + Type: "text", + Text: common.GetPointer[string](text), + }} + } else { + return + } + } + for j := len(blocks) - 1; j >= 0; j-- { + switch blocks[j].Type { + case "text", "tool_use", "tool_result", "image": + blocks[j].CacheControl = claudeAssistantCacheControlMarker + messages[i].Content = blocks + return + } + // thinking / redacted_thinking / anything else: skip past. + } + return + } +} + +// buildResponseFormatSystemShim renders an English instruction that nudges +// Claude to obey OpenAI's response_format (json_object / json_schema). Returns +// "" when there is nothing to do. +// +// Spec compliance (responses-to-anthropic-translation §19): +// - For json_schema with non-nil schema, the appended text MUST include all +// three literal phrases: "You must respond with valid JSON", +// a pretty-printed JSON rendering of the schema, and +// "Respond ONLY with the JSON object". +// - For json_object, the appended text MUST include both +// "You must respond with valid JSON" and "Respond ONLY with a JSON object". +func buildResponseFormatSystemShim(format *dto.ResponseFormat) string { + if format == nil { + return "" + } + switch format.Type { + case "json_schema": + raw := bytes.TrimSpace(format.JsonSchema) + if len(raw) == 0 { + // Empty / nil schema: emit only when we have a schema rendering to + // include, per the spec's "with a non-null json_schema.schema" + // precondition. Fall through to no-op. + return "" + } + // Pretty-print the schema using common.Marshal for the raw bytes (Rule 1), + // then json.Indent (no wrapper exists for Indent). + marshaled, err := common.Marshal(json.RawMessage(raw)) + if err != nil { + // Fall back to the original raw bytes if marshalling fails. + marshaled = raw + } + var buf bytes.Buffer + if err := json.Indent(&buf, marshaled, "", " "); err != nil { + // Fall back to the raw form when indentation fails. 
+ buf.Reset() + buf.Write(marshaled) + } + return "You must respond with valid JSON matching this schema:\n" + + buf.String() + + "\nRespond ONLY with the JSON object. Do not include any explanatory text outside the JSON." + case "json_object": + return "You must respond with valid JSON. Respond ONLY with a JSON object. " + + "Do not include any explanatory text, markdown, or commentary outside the JSON." + } + return "" +} + +// injectMissingToolResults walks the messages array and ensures that every +// tool_use block in an assistant message is matched by a tool_result block in +// the immediately-following user message. Missing tool_use IDs receive an +// empty placeholder tool_result so the upstream Anthropic API does not reject +// the request. +func injectMissingToolResults(messages []dto.ClaudeMessage) []dto.ClaudeMessage { + if len(messages) == 0 { + return messages + } + + out := make([]dto.ClaudeMessage, 0, len(messages)) + for i := 0; i < len(messages); i++ { + msg := messages[i] + out = append(out, msg) + + if msg.Role != "assistant" { + continue + } + assistantBlocks, ok := msg.Content.([]dto.ClaudeMediaMessage) + if !ok { + continue + } + + // Collect every tool_use ID present on this assistant message. + toolUseIds := make([]string, 0) + for _, b := range assistantBlocks { + if b.Type == "tool_use" && b.Id != "" { + toolUseIds = append(toolUseIds, b.Id) + } + } + if len(toolUseIds) == 0 { + continue + } + + // Look at the next message (if any) and inventory which tool_result + // IDs are already present. 
+ matched := make(map[string]bool, len(toolUseIds)) + nextIsAdjacentUser := false + if i+1 < len(messages) && messages[i+1].Role == "user" { + nextIsAdjacentUser = true + if userBlocks, ok := messages[i+1].Content.([]dto.ClaudeMediaMessage); ok { + for _, b := range userBlocks { + if b.Type == "tool_result" && b.ToolUseId != "" { + matched[b.ToolUseId] = true + } + } + } + } + + missing := make([]dto.ClaudeMediaMessage, 0) + for _, id := range toolUseIds { + if matched[id] { + continue + } + missing = append(missing, dto.ClaudeMediaMessage{ + Type: "tool_result", + ToolUseId: id, + Content: "", + }) + } + if len(missing) == 0 { + continue + } + + if nextIsAdjacentUser { + // Append synthesised tool_result blocks to the existing user + // message in-place. + userBlocks, ok := messages[i+1].Content.([]dto.ClaudeMediaMessage) + if !ok { + // Promote a string content to a single text block. + if s, ok := messages[i+1].Content.(string); ok { + userBlocks = []dto.ClaudeMediaMessage{{Type: "text", Text: common.GetPointer[string](s)}} + } else { + userBlocks = nil + } + } + userBlocks = append(userBlocks, missing...) + messages[i+1].Content = userBlocks + } else { + // Insert a fresh user message immediately after this assistant + // to host the synthesised tool_result blocks. + out = append(out, dto.ClaudeMessage{ + Role: "user", + Content: missing, + }) + } + } + return out +} + func RequestOpenAI2ClaudeMessage(c *gin.Context, textRequest dto.GeneralOpenAIRequest) (*dto.ClaudeRequest, error) { claudeTools := make([]any, 0, len(textRequest.Tools)) @@ -121,6 +385,11 @@ func RequestOpenAI2ClaudeMessage(c *gin.Context, textRequest dto.GeneralOpenAIRe claudeTools = append(claudeTools, &webSearchTool) } + // GAP-B: apply prompt-cache marker to the LAST tool to preserve the + // (typically long-lived) tool schema across calls. Anthropic only + // honours cache_control on the final tool block. 
+ applyCacheControlToLastTool(claudeTools) + claudeRequest := dto.ClaudeRequest{ Model: textRequest.Model, StopSequences: nil, @@ -376,6 +645,45 @@ func RequestOpenAI2ClaudeMessage(c *gin.Context, textRequest dto.GeneralOpenAIRe Text: common.GetPointer[string](mediaMessage.Text), }) } + case dto.ContentTypeFile: + file := mediaMessage.GetFile() + if file == nil || file.FileData == "" { + continue + } + category, mimeType := classifyFile(file) + switch category { + case fileCategoryPDF: + claudeMediaMessages = append(claudeMediaMessages, dto.ClaudeMediaMessage{ + Type: "document", + Source: &dto.ClaudeMessageSource{ + Type: "base64", + MediaType: mimeType, + Data: file.FileData, + }, + }) + case fileCategoryText: + decoded, err := base64.StdEncoding.DecodeString(file.FileData) + if err != nil { + continue + } + text := string(decoded) + claudeMediaMessages = append(claudeMediaMessages, dto.ClaudeMediaMessage{ + Type: "text", + Text: common.GetPointer[string](text), + }) + case fileCategoryImage: + claudeMediaMessages = append(claudeMediaMessages, dto.ClaudeMediaMessage{ + Type: "image", + Source: &dto.ClaudeMessageSource{ + Type: "base64", + MediaType: mimeType, + Data: file.FileData, + }, + }) + default: + // Unsupported file type — skip without inserting a placeholder. + continue + } default: source := mediaMessage.ToFileSource() if source == nil { @@ -429,6 +737,25 @@ func RequestOpenAI2ClaudeMessage(c *gin.Context, textRequest dto.GeneralOpenAIRe claudeRequest.System = systemMessages } + // GAP-A: response_format JSON-mode shim — Claude has no native equivalent, + // so we append an English instruction system block guiding the model to + // emit valid JSON (and optionally match a schema). 
+ if shim := buildResponseFormatSystemShim(textRequest.ResponseFormat); shim != "" { + systemMessages = append(systemMessages, dto.ClaudeMediaMessage{ + Type: "text", + Text: common.GetPointer[string](shim), + }) + claudeRequest.System = systemMessages + } + + // GAP-C: tag the LAST text/tool_use block of the LAST assistant message + // with the prompt-cache marker. Trailing thinking blocks are skipped. + applyCacheControlToLastAssistantContent(claudeMessages) + + // GAP-D: inject empty tool_result blocks for any tool_use IDs that lack + // a matching tool_result in the subsequent user message. + claudeMessages = injectMissingToolResults(claudeMessages) + claudeRequest.Prompt = "" claudeRequest.Messages = claudeMessages return &claudeRequest, nil diff --git a/relay/channel/claude/relay_claude_test.go b/relay/channel/claude/relay_claude_test.go index fdc7b38e5ec..89bdcc530ea 100644 --- a/relay/channel/claude/relay_claude_test.go +++ b/relay/channel/claude/relay_claude_test.go @@ -2,6 +2,7 @@ package claude import ( "encoding/base64" + "encoding/json" "strings" "testing" @@ -380,3 +381,343 @@ func TestRequestOpenAI2ClaudeMessage_ConvertsTextFileContentToText(t *testing.T) require.NotNil(t, content[0].Text) require.Equal(t, "alpha\nbeta", *content[0].Text) } + +// ----------------------------------------------------------------------------- +// GAP-A: response_format JSON-mode shim +// ----------------------------------------------------------------------------- + +func systemTexts(t *testing.T, system any) []string { + t.Helper() + msgs, ok := system.([]dto.ClaudeMediaMessage) + require.True(t, ok, "expected []ClaudeMediaMessage system, got %T", system) + out := make([]string, 0, len(msgs)) + for _, m := range msgs { + require.Equal(t, "text", m.Type) + require.NotNil(t, m.Text) + out = append(out, *m.Text) + } + return out +} + +func TestRequestOpenAI2ClaudeMessage_ResponseFormat_JsonObject_AppendsSystemShim(t *testing.T) { + request := dto.GeneralOpenAIRequest{ + 
Model: "claude-3-5-sonnet", + Messages: []dto.Message{ + {Role: "system", Content: "You are helpful."}, + {Role: "user", Content: "ping"}, + }, + ResponseFormat: &dto.ResponseFormat{Type: "json_object"}, + } + + claudeRequest, err := RequestOpenAI2ClaudeMessage(nil, request) + require.NoError(t, err) + texts := systemTexts(t, claudeRequest.System) + require.Len(t, texts, 2) + require.Equal(t, "You are helpful.", texts[0]) + // Spec §19 / GAP-A: json_object must contain BOTH literal phrases + // (exact case, including the article "a" in "a JSON object"). + require.Contains(t, texts[1], "You must respond with valid JSON") + require.Contains(t, texts[1], "Respond ONLY with a JSON object") +} + +func TestRequestOpenAI2ClaudeMessage_ResponseFormat_JsonSchema_AppendsSystemShim(t *testing.T) { + schema := json.RawMessage(`{"name":"weather","schema":{"type":"object","properties":{"answer":{"type":"number"}}}}`) + request := dto.GeneralOpenAIRequest{ + Model: "claude-3-5-sonnet", + Messages: []dto.Message{ + {Role: "user", Content: "ping"}, + }, + ResponseFormat: &dto.ResponseFormat{Type: "json_schema", JsonSchema: schema}, + } + + claudeRequest, err := RequestOpenAI2ClaudeMessage(nil, request) + require.NoError(t, err) + texts := systemTexts(t, claudeRequest.System) + require.Len(t, texts, 1) + // Spec §19 / GAP-A: json_schema must contain ALL THREE literal phrases. + require.Contains(t, texts[0], "You must respond with valid JSON") + require.Contains(t, texts[0], "Respond ONLY with the JSON object") + // The pretty-printed schema must include the inner property key. 
+ require.Contains(t, texts[0], "answer") +} + +func TestRequestOpenAI2ClaudeMessage_ResponseFormat_Nil_NoSystemShim(t *testing.T) { + request := dto.GeneralOpenAIRequest{ + Model: "claude-3-5-sonnet", + Messages: []dto.Message{ + {Role: "system", Content: "You are helpful."}, + {Role: "user", Content: "ping"}, + }, + } + + claudeRequest, err := RequestOpenAI2ClaudeMessage(nil, request) + require.NoError(t, err) + texts := systemTexts(t, claudeRequest.System) + require.Len(t, texts, 1) + require.Equal(t, "You are helpful.", texts[0]) +} + +// ----------------------------------------------------------------------------- +// GAP-B: cache_control marker on the last tool +// ----------------------------------------------------------------------------- + +func TestRequestOpenAI2ClaudeMessage_CacheControl_OnLastTool(t *testing.T) { + request := dto.GeneralOpenAIRequest{ + Model: "claude-3-5-sonnet", + Messages: []dto.Message{ + {Role: "user", Content: "ping"}, + }, + Tools: []dto.ToolCallRequest{ + { + Type: "function", + Function: dto.FunctionRequest{ + Name: "first", + Description: "first tool", + Parameters: map[string]any{"type": "object"}, + }, + }, + { + Type: "function", + Function: dto.FunctionRequest{ + Name: "second", + Description: "second tool", + Parameters: map[string]any{"type": "object"}, + }, + }, + }, + } + + claudeRequest, err := RequestOpenAI2ClaudeMessage(nil, request) + require.NoError(t, err) + + tools, ok := claudeRequest.Tools.([]any) + require.True(t, ok) + require.Len(t, tools, 2) + + first, ok := tools[0].(*dto.Tool) + require.True(t, ok) + require.Nil(t, first.CacheControl, "first tool must NOT carry cache_control") + + last, ok := tools[1].(*dto.Tool) + require.True(t, ok) + require.NotNil(t, last.CacheControl, "last tool MUST carry cache_control") + require.Equal(t, "ephemeral", last.CacheControl.Type) + require.Equal(t, "1h", last.CacheControl.TTL) +} + +func TestRequestOpenAI2ClaudeMessage_CacheControl_NoToolsNoChange(t *testing.T) { + 
request := dto.GeneralOpenAIRequest{ + Model: "claude-3-5-sonnet", + Messages: []dto.Message{ + {Role: "user", Content: "ping"}, + }, + } + + claudeRequest, err := RequestOpenAI2ClaudeMessage(nil, request) + require.NoError(t, err) + tools, ok := claudeRequest.Tools.([]any) + require.True(t, ok) + require.Len(t, tools, 0) +} + +// ----------------------------------------------------------------------------- +// GAP-C: cache_control on the last assistant message's last eligible block. +// Spec §22 (lines 581-583): eligible block types are {text, tool_use, +// tool_result, image}; thinking is NOT eligible. The marker emitted on the +// assistant side MUST NOT carry a TTL field — emit only {type:"ephemeral"}. +// ----------------------------------------------------------------------------- + +// cacheControlHasNoTTL asserts the cache_control marker is exactly the +// no-TTL ephemeral shape (`{"type":"ephemeral"}`). Spec §22 forbids a TTL +// field on the assistant-side marker. +func cacheControlHasNoTTL(t *testing.T, raw json.RawMessage) { + t.Helper() + require.NotNil(t, raw) + var parsed map[string]any + require.NoError(t, json.Unmarshal(raw, &parsed)) + require.Equal(t, "ephemeral", parsed["type"], "marker must be ephemeral") + _, hasTTL := parsed["ttl"] + require.False(t, hasTTL, "assistant-side cache_control MUST NOT include a ttl field; got %s", string(raw)) +} + +func TestRequestOpenAI2ClaudeMessage_CacheControl_OnLastAssistantTextBlock(t *testing.T) { + request := dto.GeneralOpenAIRequest{ + Model: "claude-3-5-sonnet", + Messages: []dto.Message{ + {Role: "user", Content: "hi"}, + { + Role: "assistant", + Content: []any{ + dto.MediaContent{Type: dto.ContentTypeText, Text: "first"}, + dto.MediaContent{Type: dto.ContentTypeText, Text: "second"}, + }, + }, + {Role: "user", Content: "more"}, + { + Role: "assistant", + Content: []any{ + dto.MediaContent{Type: dto.ContentTypeText, Text: "final-one"}, + dto.MediaContent{Type: dto.ContentTypeText, Text: "final-two"}, + 
}, + }, + }, + } + + claudeRequest, err := RequestOpenAI2ClaudeMessage(nil, request) + require.NoError(t, err) + + // The last message should be the second assistant message. + require.GreaterOrEqual(t, len(claudeRequest.Messages), 1) + lastIdx := len(claudeRequest.Messages) - 1 + require.Equal(t, "assistant", claudeRequest.Messages[lastIdx].Role) + blocks, ok := claudeRequest.Messages[lastIdx].Content.([]dto.ClaudeMediaMessage) + require.True(t, ok) + require.GreaterOrEqual(t, len(blocks), 1) + + last := blocks[len(blocks)-1] + require.NotNil(t, last.CacheControl, "last assistant content block MUST carry cache_control") + // Spec §22: the assistant-side marker must NOT carry a TTL field. + cacheControlHasNoTTL(t, last.CacheControl) + // All earlier blocks of the same assistant must NOT carry the marker. + for i := 0; i < len(blocks)-1; i++ { + require.Nil(t, blocks[i].CacheControl, "earlier block %d carries unexpected cache_control", i) + } +} + +// TestApplyCacheControlToLastAssistantContent_BroadenedEligibility drives the +// helper directly and asserts the broadened eligibility set: the marker MUST +// land on text, tool_use, tool_result, or image blocks (whichever is the last +// non-thinking block of the last assistant message). 
+func TestApplyCacheControlToLastAssistantContent_BroadenedEligibility(t *testing.T) { + cases := []struct { + name string + blockType string + extra func(b *dto.ClaudeMediaMessage) + }{ + {name: "text", blockType: "text", extra: func(b *dto.ClaudeMediaMessage) { b.Text = stringPtr("ok") }}, + {name: "tool_use", blockType: "tool_use", extra: func(b *dto.ClaudeMediaMessage) { b.Id = "tu_1"; b.Name = "fn" }}, + {name: "tool_result", blockType: "tool_result", extra: func(b *dto.ClaudeMediaMessage) { b.ToolUseId = "tu_1"; b.Content = "out" }}, + {name: "image", blockType: "image", extra: func(b *dto.ClaudeMediaMessage) { + b.Source = &dto.ClaudeMessageSource{Type: "base64", MediaType: "image/png", Data: "AAA"} + }}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + eligible := dto.ClaudeMediaMessage{Type: tc.blockType} + tc.extra(&eligible) + messages := []dto.ClaudeMessage{ + {Role: "user", Content: "hi"}, + {Role: "assistant", Content: []dto.ClaudeMediaMessage{ + // A trailing thinking block must NOT receive the marker; + // the helper should skip past it to find the eligible + // block before it. + eligible, + {Type: "thinking", Thinking: stringPtr("T")}, + }}, + } + applyCacheControlToLastAssistantContent(messages) + blocks, ok := messages[1].Content.([]dto.ClaudeMediaMessage) + require.True(t, ok) + require.Len(t, blocks, 2) + + // Eligible block (index 0) got the marker. + require.NotNil(t, blocks[0].CacheControl, "eligible %s block must receive cache_control", tc.blockType) + cacheControlHasNoTTL(t, blocks[0].CacheControl) + + // Trailing thinking block (index 1) must NOT receive the marker. + require.Nil(t, blocks[1].CacheControl, "thinking block must not receive cache_control") + }) + } +} + +// TestApplyCacheControlToLastAssistantContent_ThinkingOnlySkipped confirms +// that an assistant message whose only blocks are non-eligible (e.g. only +// thinking) receives no marker at all. 
+func TestApplyCacheControlToLastAssistantContent_ThinkingOnlySkipped(t *testing.T) { + messages := []dto.ClaudeMessage{ + {Role: "user", Content: "hi"}, + {Role: "assistant", Content: []dto.ClaudeMediaMessage{ + {Type: "thinking", Thinking: stringPtr("T1")}, + {Type: "thinking", Thinking: stringPtr("T2")}, + }}, + } + applyCacheControlToLastAssistantContent(messages) + blocks, ok := messages[1].Content.([]dto.ClaudeMediaMessage) + require.True(t, ok) + for i, b := range blocks { + require.Nil(t, b.CacheControl, "thinking-only assistant block %d must not receive marker", i) + } +} + +// ----------------------------------------------------------------------------- +// GAP-D: missing tool_result auto-injection +// ----------------------------------------------------------------------------- + +func TestInjectMissingToolResults_AddsEmptyResultWhenNoNextUser(t *testing.T) { + use := dto.ClaudeMediaMessage{Type: "tool_use", Id: "tu_abc"} + messages := []dto.ClaudeMessage{ + {Role: "user", Content: "hi"}, + {Role: "assistant", Content: []dto.ClaudeMediaMessage{use}}, + } + + out := injectMissingToolResults(messages) + require.Len(t, out, 3) + require.Equal(t, "user", out[2].Role) + blocks, ok := out[2].Content.([]dto.ClaudeMediaMessage) + require.True(t, ok) + require.Len(t, blocks, 1) + require.Equal(t, "tool_result", blocks[0].Type) + require.Equal(t, "tu_abc", blocks[0].ToolUseId) + require.Equal(t, "", blocks[0].Content) +} + +func TestInjectMissingToolResults_AppendsToExistingNextUser(t *testing.T) { + use1 := dto.ClaudeMediaMessage{Type: "tool_use", Id: "tu_1"} + use2 := dto.ClaudeMediaMessage{Type: "tool_use", Id: "tu_2"} + existing := dto.ClaudeMediaMessage{Type: "tool_result", ToolUseId: "tu_1", Content: "done"} + messages := []dto.ClaudeMessage{ + {Role: "assistant", Content: []dto.ClaudeMediaMessage{use1, use2}}, + {Role: "user", Content: []dto.ClaudeMediaMessage{existing}}, + } + + out := injectMissingToolResults(messages) + require.Len(t, out, 2) + 
require.Equal(t, "user", out[1].Role) + blocks, ok := out[1].Content.([]dto.ClaudeMediaMessage) + require.True(t, ok) + require.Len(t, blocks, 2) + require.Equal(t, "tu_1", blocks[0].ToolUseId) + require.Equal(t, "done", blocks[0].Content) + require.Equal(t, "tu_2", blocks[1].ToolUseId) + require.Equal(t, "", blocks[1].Content) +} + +func TestInjectMissingToolResults_DoesNotDuplicateExistingResults(t *testing.T) { + use := dto.ClaudeMediaMessage{Type: "tool_use", Id: "tu_x"} + existing := dto.ClaudeMediaMessage{Type: "tool_result", ToolUseId: "tu_x", Content: "result"} + messages := []dto.ClaudeMessage{ + {Role: "assistant", Content: []dto.ClaudeMediaMessage{use}}, + {Role: "user", Content: []dto.ClaudeMediaMessage{existing}}, + } + + out := injectMissingToolResults(messages) + require.Len(t, out, 2) + blocks, ok := out[1].Content.([]dto.ClaudeMediaMessage) + require.True(t, ok) + require.Len(t, blocks, 1, "must not duplicate existing matched tool_result") +} + +func TestInjectMissingToolResults_NoToolUseLeavesMessagesUntouched(t *testing.T) { + messages := []dto.ClaudeMessage{ + {Role: "user", Content: "hi"}, + {Role: "assistant", Content: []dto.ClaudeMediaMessage{{Type: "text", Text: stringPtr("ok")}}}, + } + + out := injectMissingToolResults(messages) + require.Len(t, out, 2) +} + +func stringPtr(s string) *string { + return &s +} diff --git a/relay/helper/stream_scanner.go b/relay/helper/stream_scanner.go index 1d44b80443c..68c00e2a9cc 100644 --- a/relay/helper/stream_scanner.go +++ b/relay/helper/stream_scanner.go @@ -40,8 +40,10 @@ func StreamScannerHandler(c *gin.Context, resp *http.Response, info *relaycommon return } - // 无条件新建 StreamStatus - info.StreamStatus = relaycommon.NewStreamStatus() + // 仅在未初始化时新建 StreamStatus,保留调用方可能预先记录的状态 + if info.StreamStatus == nil { + info.StreamStatus = relaycommon.NewStreamStatus() + } // 确保响应体总是被关闭 defer func() { diff --git a/relay/responses_handler.go b/relay/responses_handler.go index 54ca3cbc501..4e9ee75d979 100644 
--- a/relay/responses_handler.go +++ b/relay/responses_handler.go @@ -71,6 +71,34 @@ func ResponsesHelper(c *gin.Context, info *relaycommon.RelayInfo) (newAPIError * return types.NewError(fmt.Errorf("invalid api type: %d", info.ApiType), types.ErrorCodeInvalidApiType, types.ErrOptionWithSkipRetry()) } adaptor.Init(info) + + // Anthropic-typed channels do not natively understand the Responses-API + // shape. When the operator hasn't requested a raw pass-through, route the + // request through the new Responses → Chat-Completions → Anthropic pivot. + // Feature-gated via RESPONSES_TO_ANTHROPIC_ENABLED (default true). + passThroughGlobal := model_setting.GetGlobalSettings().PassThroughRequestEnabled + if shouldUseResponsesToAnthropicPivot( + info.RelayMode, + info.ApiType, + passThroughGlobal, + info.ChannelSetting.PassThroughBodyEnabled, + common.GetEnvOrDefaultBool("RESPONSES_TO_ANTHROPIC_ENABLED", true), + ) { + usage, apiErr := responsesViaChatCompletions(c, info, adaptor, request) + if apiErr != nil { + service.ResetStatusCode(apiErr, c.GetString("status_code_mapping")) + return apiErr + } + if usage != nil { + if strings.HasPrefix(info.OriginModelName, "gpt-4o-audio") { + service.PostAudioConsumeQuota(c, info, usage, "") + } else { + service.PostTextConsumeQuota(c, info, usage, nil) + } + } + return nil + } + var requestBody io.Reader if model_setting.GetGlobalSettings().PassThroughRequestEnabled || info.ChannelSetting.PassThroughBodyEnabled { storage, err := common.GetBodyStorage(c) @@ -158,3 +186,23 @@ func ResponsesHelper(c *gin.Context, info *relaycommon.RelayInfo) (newAPIError * } return nil } + +// shouldUseResponsesToAnthropicPivot encodes the branch condition that gates +// the Responses → Chat-Completions → Anthropic pivot. It is extracted into a +// pure function so the predicate can be unit-tested without standing up the +// full ResponsesHelper pipeline (DB, quota, billing, etc.). 
A change to this +// predicate — or a flip of the feature flag's default — must be reflected in +// TestShouldUseResponsesToAnthropicPivot. +func shouldUseResponsesToAnthropicPivot( + relayMode int, + apiType int, + passThroughGlobal bool, + passThroughBody bool, + featureFlagEnabled bool, +) bool { + return relayMode == relayconstant.RelayModeResponses && + apiType == appconstant.APITypeAnthropic && + !passThroughGlobal && + !passThroughBody && + featureFlagEnabled +} diff --git a/relay/responses_via_chat_completions.go b/relay/responses_via_chat_completions.go new file mode 100644 index 00000000000..56b00762087 --- /dev/null +++ b/relay/responses_via_chat_completions.go @@ -0,0 +1,320 @@ +package relay + +import ( + "bytes" + "fmt" + "io" + "net/http" + "strings" + + "github.com/QuantumNous/new-api/common" + appconstant "github.com/QuantumNous/new-api/constant" + "github.com/QuantumNous/new-api/dto" + "github.com/QuantumNous/new-api/logger" + "github.com/QuantumNous/new-api/relay/channel" + claudechannel "github.com/QuantumNous/new-api/relay/channel/claude" + relaycommon "github.com/QuantumNous/new-api/relay/common" + "github.com/QuantumNous/new-api/relay/helper" + "github.com/QuantumNous/new-api/service" + "github.com/QuantumNous/new-api/service/openaicompat" + "github.com/QuantumNous/new-api/types" + + "github.com/gin-gonic/gin" +) + +// responsesViaChatCompletions handles a /v1/responses request routed to an +// Anthropic-typed channel. It performs the two-step pivot: +// +// Responses → ChatCompletions (in service/openaicompat) +// ChatCompletions → Anthropic (via the Claude adaptor / RequestOpenAI2ClaudeMessage) +// +// And on the response side: +// +// Anthropic stream chunk → Chat-Completions chunk (StreamResponseClaude2OpenAI) +// → Responses-API events (ChatCompletionsStreamToResponsesEvents) +// +// or the non-streaming counterpart (ClaudeHandler → ResponseClaude2OpenAI → +// ChatCompletionsResponseToResponsesResponse). 
+// +// This function mirrors the existing chat_completions_via_responses.go. +func responsesViaChatCompletions(c *gin.Context, info *relaycommon.RelayInfo, adaptor channel.Adaptor, request *dto.OpenAIResponsesRequest) (*dto.Usage, *types.NewAPIError) { + if info.ApiType != appconstant.APITypeAnthropic { + return nil, types.NewError(fmt.Errorf("responsesViaChatCompletions called for non-Anthropic api type %d", info.ApiType), types.ErrorCodeInvalidApiType, types.ErrOptionWithSkipRetry()) + } + + // (a) Responses → ChatCompletions intermediate. + chatReq, err := openaicompat.ResponsesRequestToChatCompletionsRequest(request) + if err != nil { + return nil, types.NewErrorWithStatusCode(err, types.ErrorCodeConvertRequestFailed, http.StatusBadRequest, types.ErrOptionWithSkipRetry()) + } + + // (b) Sanitize tool-call IDs at the boundary (spec §14). + openaicompat.SanitizeToolCallIDs(chatReq) + + // (c) ChatCompletions → Anthropic via the existing adaptor converter. + converted, err := adaptor.ConvertOpenAIRequest(c, info, chatReq) + if err != nil { + return nil, types.NewError(err, types.ErrorCodeConvertRequestFailed, types.ErrOptionWithSkipRetry()) + } + relaycommon.AppendRequestConversionFromRequest(info, converted) + + // (d) Marshal -> RemoveDisabledFields -> ApplyParamOverride. 
+ jsonData, err := common.Marshal(converted) + if err != nil { + return nil, types.NewError(err, types.ErrorCodeConvertRequestFailed, types.ErrOptionWithSkipRetry()) + } + jsonData, err = relaycommon.RemoveDisabledFields(jsonData, info.ChannelOtherSettings, info.ChannelSetting.PassThroughBodyEnabled) + if err != nil { + return nil, types.NewError(err, types.ErrorCodeConvertRequestFailed, types.ErrOptionWithSkipRetry()) + } + if len(info.ParamOverride) > 0 { + jsonData, err = relaycommon.ApplyParamOverrideWithRelayInfo(jsonData, info) + if err != nil { + return nil, newAPIErrorFromParamOverride(err) + } + } + logger.LogDebug(c, "responses_via_chat_anthropic body: %s", jsonData) + + // (e) DoRequest. + var requestBody io.Reader = bytes.NewBuffer(jsonData) + resp, err := adaptor.DoRequest(c, info, requestBody) + if err != nil { + return nil, types.NewOpenAIError(err, types.ErrorCodeDoRequestFailed, http.StatusInternalServerError) + } + if resp == nil { + return nil, types.NewOpenAIError(fmt.Errorf("nil response from upstream"), types.ErrorCodeBadResponse, http.StatusInternalServerError) + } + httpResp := resp.(*http.Response) + info.IsStream = info.IsStream || strings.HasPrefix(httpResp.Header.Get("Content-Type"), "text/event-stream") + + statusCodeMappingStr := c.GetString("status_code_mapping") + if httpResp.StatusCode != http.StatusOK { + apiErr := service.RelayErrorHandler(c.Request.Context(), httpResp, false) + service.ResetStatusCode(apiErr, statusCodeMappingStr) + return nil, apiErr + } + + // Mark the final relay format so downstream helpers see "openai_responses" + // (the client's expected format). 
+ info.FinalRequestRelayFormat = types.RelayFormatOpenAIResponses + + if info.IsStream { + return runAnthropicToResponsesStream(c, info, httpResp) + } + return runAnthropicToResponsesNonStream(c, info, httpResp) +} + +// runAnthropicToResponsesStream reads Anthropic SSE chunks, converts each to a +// Chat-Completions chunk via StreamResponseClaude2OpenAI, then feeds it through +// ChatCompletionsStreamToResponsesEvents and writes Responses-API SSE events to +// the client. +func runAnthropicToResponsesStream(c *gin.Context, info *relaycommon.RelayInfo, resp *http.Response) (*dto.Usage, *types.NewAPIError) { + helper.SetEventStreamHeaders(c) + + claudeInfo := &claudechannel.ClaudeResponseInfo{ + ResponseId: helper.GetResponseID(c), + Created: common.GetTimestamp(), + Model: info.UpstreamModelName, + Usage: &dto.Usage{}, + } + state := openaicompat.NewResponsesStreamState() + + writeEvents := func(events []openaicompat.ResponsesAPIEvent) error { + for _, ev := range events { + data, err := common.Marshal(ev) + if err != nil { + return err + } + c.Render(-1, common.CustomEvent{Data: fmt.Sprintf("event: %s\n", ev.Type)}) + c.Render(-1, common.CustomEvent{Data: "data: " + string(data)}) + _ = helper.FlushWriter(c) + } + return nil + } + + var streamErr *types.NewAPIError + helper.StreamScannerHandler(c, resp, info, func(data string, sr *helper.StreamResult) { + var claudeResponse dto.ClaudeResponse + if e := common.UnmarshalJsonStr(data, &claudeResponse); e != nil { + logger.LogError(c, "claude_stream_unmarshal_failed: "+e.Error()) + streamErr = types.NewError(e, types.ErrorCodeBadResponseBody) + sr.Stop(streamErr) + return + } + // Surface upstream Claude errors. 
+ if claudeError := claudeResponse.GetClaudeError(); claudeError != nil && claudeError.Type != "" { + evs := openaicompat.EmitChatStreamErrorEvent(state, claudeError.Message) + _ = writeEvents(evs) + streamErr = types.WithClaudeError(*claudeError, http.StatusInternalServerError) + sr.Stop(streamErr) + return + } + // Preserve refusal marking (parity with HandleStreamResponseData). + markClaudeRefusalFromStreamChunk(c, &claudeResponse) + + // Build the Chat-Completions chunk equivalent. + chatChunk := claudechannel.StreamResponseClaude2OpenAI(&claudeResponse) + // Accumulate Claude-side usage info. + _ = claudechannel.FormatClaudeResponseInfo(&claudeResponse, chatChunk, claudeInfo) + if chatChunk == nil { + return + } + // Attach the running usage on the final delta so the translator can + // pick it up. Normalize Anthropic semantics first so cached/cache- + // creation input tokens fold into prompt_tokens — the Responses + // translator subtracts cached from prompt_tokens to derive + // input_tokens. + if claudeInfo.Done && claudeInfo.Usage != nil { + chatChunk.Usage = normalizeClaudeUsageForOpenAISemantics(claudeInfo.Usage) + } + evs := openaicompat.ChatCompletionsStreamToResponsesEvents(chatChunk, state) + if e := writeEvents(evs); e != nil { + logger.LogError(c, "responses_stream_write_failed: "+e.Error()) + streamErr = types.NewOpenAIError(e, types.ErrorCodeBadResponse, http.StatusInternalServerError) + sr.Stop(streamErr) + return + } + }) + + // EOS flush: only run when the stream finished normally. On an upstream + // error we already emitted response.failed (via EmitChatStreamErrorEvent) + // or are propagating streamErr to the caller, and the unconditional flush + // would otherwise emit a synthetic response.completed alongside. 
+ if streamErr == nil { + flushEvents := openaicompat.ChatCompletionsStreamToResponsesEvents(nil, state) + _ = writeEvents(flushEvents) + } + + if streamErr != nil { + return nil, streamErr + } + + // Fall back to text-estimated usage if upstream didn't deliver complete + // counts. Each token field is repaired independently so that a missing + // prompt count does not require a missing completion count (or vice + // versa). + if claudeInfo.Usage.CompletionTokens == 0 || claudeInfo.Usage.PromptTokens == 0 { + fallback := service.ResponseText2Usage(c, claudeInfo.ResponseText.String(), info.UpstreamModelName, info.GetEstimatePromptTokens()) + if claudeInfo.Usage.CompletionTokens == 0 { + claudeInfo.Usage.CompletionTokens = fallback.CompletionTokens + } + if claudeInfo.Usage.PromptTokens == 0 { + claudeInfo.Usage.PromptTokens = fallback.PromptTokens + } + claudeInfo.Usage.TotalTokens = claudeInfo.Usage.PromptTokens + claudeInfo.Usage.CompletionTokens + } + if claudeInfo.Usage != nil { + claudeInfo.Usage.UsageSemantic = "anthropic" + } + return claudeInfo.Usage, nil +} + +// runAnthropicToResponsesNonStream reads the Anthropic JSON response, +// converts it via ResponseClaude2OpenAI, then via +// ChatCompletionsResponseToResponsesResponse and writes the JSON to the client. 
+func runAnthropicToResponsesNonStream(c *gin.Context, info *relaycommon.RelayInfo, resp *http.Response) (*dto.Usage, *types.NewAPIError) { + defer service.CloseResponseBodyGracefully(resp) + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, types.NewOpenAIError(err, types.ErrorCodeReadResponseBodyFailed, http.StatusInternalServerError) + } + logger.LogDebug(c, "responses_via_chat_anthropic upstream body: %s", body) + + var claudeResponse dto.ClaudeResponse + if e := common.Unmarshal(body, &claudeResponse); e != nil { + return nil, types.NewError(e, types.ErrorCodeBadResponseBody) + } + if claudeError := claudeResponse.GetClaudeError(); claudeError != nil && claudeError.Type != "" { + return nil, types.WithClaudeError(*claudeError, resp.StatusCode) + } + // Preserve refusal marking (parity with the non-pivot Claude handler). + markClaudeRefusalFromResponse(c, &claudeResponse) + + openaiResp := claudechannel.ResponseClaude2OpenAI(&claudeResponse) + if openaiResp == nil { + return nil, types.NewOpenAIError(fmt.Errorf("nil openai response from Claude conversion"), types.ErrorCodeBadResponseBody, http.StatusInternalServerError) + } + + // Build usage from the Claude response. Folding cache_read/creation into + // PromptTokens keeps OpenAI semantics for the Responses translator while + // the returned *dto.Usage retains the raw Anthropic-semantic counts the + // gateway accounting layer expects. 
+ usage := &dto.Usage{} + if claudeResponse.Usage != nil { + usage.PromptTokens = claudeResponse.Usage.InputTokens + usage.CompletionTokens = claudeResponse.Usage.OutputTokens + usage.TotalTokens = usage.PromptTokens + usage.CompletionTokens + usage.UsageSemantic = "anthropic" + usage.PromptTokensDetails.CachedTokens = claudeResponse.Usage.CacheReadInputTokens + usage.PromptTokensDetails.CachedCreationTokens = claudeResponse.Usage.CacheCreationInputTokens + } + // Hand the translator an OpenAI-semantic usage view so the cached/creation + // breakdown survives the responses envelope (translator subtracts cached + // from prompt_tokens to derive input_tokens). + if normalized := normalizeClaudeUsageForOpenAISemantics(usage); normalized != nil { + openaiResp.Usage = *normalized + } else { + openaiResp.Usage = *usage + } + + responsesResp, e := openaicompat.ChatCompletionsResponseToResponsesResponse(openaiResp, info.UpstreamModelName) + if e != nil { + return nil, types.NewOpenAIError(e, types.ErrorCodeBadResponseBody, http.StatusInternalServerError) + } + + responseBody, e := common.Marshal(responsesResp) + if e != nil { + return nil, types.NewOpenAIError(e, types.ErrorCodeBadResponse, http.StatusInternalServerError) + } + service.IOCopyBytesGracefully(c, resp, responseBody) + return usage, nil +} + +// markClaudeRefusalFromStreamChunk mirrors the refusal-detection performed by +// claudechannel.HandleStreamResponseData. Without it, /v1/responses requests +// routed through the pivot would not record the moderation/accounting signal +// that the direct Claude relay records. 
+func markClaudeRefusalFromStreamChunk(c *gin.Context, cr *dto.ClaudeResponse) { + if c == nil || cr == nil { + return + } + if cr.StopReason != "" && strings.EqualFold(cr.StopReason, "refusal") { + common.SetContextKey(c, appconstant.ContextKeyAdminRejectReason, "claude_stop_reason=refusal") + return + } + if cr.Delta != nil && cr.Delta.StopReason != nil && strings.EqualFold(*cr.Delta.StopReason, "refusal") { + common.SetContextKey(c, appconstant.ContextKeyAdminRejectReason, "claude_stop_reason=refusal") + } +} + +// markClaudeRefusalFromResponse mirrors the refusal-detection performed by the +// direct Claude non-streaming handler. +func markClaudeRefusalFromResponse(c *gin.Context, cr *dto.ClaudeResponse) { + if c == nil || cr == nil { + return + } + if strings.EqualFold(cr.StopReason, "refusal") { + common.SetContextKey(c, appconstant.ContextKeyAdminRejectReason, "claude_stop_reason=refusal") + } +} + +// normalizeClaudeUsageForOpenAISemantics folds Anthropic's separately-counted +// cache_read and cache_creation input tokens into prompt_tokens so the +// downstream Responses translator (which uses OpenAI semantics and subtracts +// cached from prompt) produces correct input_tokens / total_tokens. Returns +// the original *dto.Usage when no conversion is needed. +func normalizeClaudeUsageForOpenAISemantics(in *dto.Usage) *dto.Usage { + if in == nil { + return nil + } + if in.UsageSemantic != "anthropic" { + return in + } + cp := *in + cp.PromptTokens = in.PromptTokens + in.PromptTokensDetails.CachedTokens + in.PromptTokensDetails.CachedCreationTokens + cp.TotalTokens = cp.PromptTokens + cp.CompletionTokens + // Drop the semantic marker so a second pass through this helper is a no-op. 
+ cp.UsageSemantic = "" + return &cp +} diff --git a/relay/responses_via_chat_completions_test.go b/relay/responses_via_chat_completions_test.go new file mode 100644 index 00000000000..3fc607e3039 --- /dev/null +++ b/relay/responses_via_chat_completions_test.go @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later +package relay + +import ( + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/QuantumNous/new-api/common" + "github.com/QuantumNous/new-api/constant" + "github.com/QuantumNous/new-api/dto" + relaycommon "github.com/QuantumNous/new-api/relay/common" + relayconstant "github.com/QuantumNous/new-api/relay/constant" + "github.com/QuantumNous/new-api/types" + + "github.com/gin-gonic/gin" + "github.com/stretchr/testify/require" +) + +func init() { + gin.SetMode(gin.TestMode) +} + +// newResponsesViaChatTestContext returns a gin.Context tied to an in-memory +// recorder so handlers can write SSE/JSON without a real HTTP transport. +func newResponsesViaChatTestContext(t *testing.T) (*gin.Context, *httptest.ResponseRecorder, *relaycommon.RelayInfo) { + t.Helper() + + old := constant.StreamingTimeout + constant.StreamingTimeout = 30 + t.Cleanup(func() { constant.StreamingTimeout = old }) + + rec := httptest.NewRecorder() + c, _ := gin.CreateTestContext(rec) + c.Request = httptest.NewRequest(http.MethodPost, "/v1/responses", nil) + + info := &relaycommon.RelayInfo{ + ChannelMeta: &relaycommon.ChannelMeta{ + UpstreamModelName: "claude-test", + }, + OriginModelName: "claude-test", + IsStream: true, + RelayFormat: types.RelayFormatOpenAIResponses, + } + return c, rec, info +} + +// anthropicSSE returns a canonical Anthropic streaming-message envelope as a +// raw SSE byte string (suitable for piping through StreamScannerHandler). 
func anthropicSSE() string {
	// One entry per SSE line, in the order a text-only Anthropic message
	// stream delivers them; every line (including the last) is terminated by
	// a single newline.
	frames := []string{
		`data: {"type":"message_start","message":{"id":"msg_001","model":"claude-test","usage":{"input_tokens":11,"output_tokens":1}}}`,
		`data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}`,
		`data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Hello "}}`,
		`data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"world"}}`,
		`data: {"type":"content_block_stop","index":0}`,
		`data: {"type":"message_delta","delta":{"stop_reason":"end_turn"},"usage":{"output_tokens":2}}`,
		`data: {"type":"message_stop"}`,
		"data: [DONE]",
	}
	return strings.Join(frames, "\n") + "\n"
}

// TestResponsesViaChatCompletions_StreamingTextOnly drives
// runAnthropicToResponsesStream with a canonical Anthropic SSE byte stream
// (text-only) and asserts the resulting Responses-API SSE wire format.
//
// It satisfies the §13 streaming integration coverage requirement: we verify
// the orchestration writes the documented sequence of events
// (response.created / in_progress / output_item.added / output_text.delta /
// output_text.done / content_part.done / output_item.done / response.completed)
// with monotonically increasing sequence_number values.
+func TestResponsesViaChatCompletions_StreamingTextOnly(t *testing.T) { + c, rec, info := newResponsesViaChatTestContext(t) + + resp := &http.Response{ + Body: io.NopCloser(strings.NewReader(anthropicSSE())), + Header: http.Header{"Content-Type": []string{"text/event-stream"}}, + } + + usage, apiErr := runAnthropicToResponsesStream(c, info, resp) + require.Nil(t, apiErr) + require.NotNil(t, usage) + require.Equal(t, "anthropic", usage.UsageSemantic) + + body := rec.Body.String() + + // Mandatory event types per spec §5 of the responses-to-anthropic spec. + mustContain := []string{ + "event: response.created", + "event: response.in_progress", + "event: response.output_item.added", + "event: response.output_text.delta", + "event: response.output_text.done", + "event: response.content_part.done", + "event: response.output_item.done", + "event: response.completed", + } + for _, marker := range mustContain { + require.Contains(t, body, marker, "expected SSE to contain %q", marker) + } + + // Validate monotonically increasing sequence_number values across all + // emitted JSON payloads. + seq := extractSequenceNumbers(t, body) + require.NotEmpty(t, seq, "expected at least one sequence_number") + for i := 1; i < len(seq); i++ { + require.GreaterOrEqual(t, seq[i], seq[i-1], "sequence_number must be monotonically non-decreasing (got %d after %d at idx %d)", seq[i], seq[i-1], i) + } + + // The output_item.added for the message item must carry type=message. + require.Contains(t, body, `"type":"message"`) +} + +// TestResponsesViaChatCompletions_NonStreamingTextOnly drives +// runAnthropicToResponsesNonStream with a single JSON Anthropic message and +// validates that the response body parses as a valid Responses-API response +// with status=completed and output[0].type=message containing the text. 
+func TestResponsesViaChatCompletions_NonStreamingTextOnly(t *testing.T) { + c, rec, info := newResponsesViaChatTestContext(t) + info.IsStream = false + + anthropicBody := `{ + "id": "msg_abc", + "type": "message", + "role": "assistant", + "model": "claude-test", + "content": [ + {"type": "text", "text": "Hello world"} + ], + "stop_reason": "end_turn", + "usage": {"input_tokens": 11, "output_tokens": 2} + }` + + resp := &http.Response{ + Body: io.NopCloser(strings.NewReader(anthropicBody)), + Header: http.Header{"Content-Type": []string{"application/json"}}, + StatusCode: http.StatusOK, + } + + usage, apiErr := runAnthropicToResponsesNonStream(c, info, resp) + require.Nil(t, apiErr) + require.NotNil(t, usage) + require.Equal(t, "anthropic", usage.UsageSemantic) + + var got dto.OpenAIResponsesResponse + require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &got), "response body must be a valid OpenAIResponsesResponse") + require.Equal(t, "claude-test", got.Model) + + statusStr := strings.Trim(strings.TrimSpace(string(got.Status)), `"`) + require.Equal(t, "completed", statusStr) + + require.NotEmpty(t, got.Output) + require.Equal(t, "message", got.Output[0].Type) + require.NotEmpty(t, got.Output[0].Content) + require.Equal(t, "Hello world", got.Output[0].Content[0].Text) +} + +// TestResponsesViaChatCompletions_FeatureFlagGate_EnvParse verifies that the +// PRODUCTION env-flag reader (common.GetEnvOrDefaultBool) — not a +// reimplementation in this test — correctly resolves +// RESPONSES_TO_ANTHROPIC_ENABLED to false when set to "false" and to true when +// set to "true". This is the exact call made at responses_handler.go to gate +// the Responses → Chat-Completions → Anthropic pivot, so a regression in the +// env parser or a flip of the default would be caught here. 
+func TestResponsesViaChatCompletions_FeatureFlagGate_EnvParse(t *testing.T) { + const envKey = "RESPONSES_TO_ANTHROPIC_ENABLED" + + // Flag explicitly false => production reader returns false (overrides the + // default value passed by the caller). + t.Setenv(envKey, "false") + require.False(t, common.GetEnvOrDefaultBool(envKey, true), + "production env reader must honour explicit false") + + // Flag explicitly true => production reader returns true. + t.Setenv(envKey, "true") + require.True(t, common.GetEnvOrDefaultBool(envKey, false), + "production env reader must honour explicit true") +} + +// TestResponsesViaChatCompletions_FeatureFlagGate_BranchCondition drives the +// branch predicate extracted from responses_handler.go. It builds a baseline +// "engaged" condition (RelayModeResponses + APITypeAnthropic + no global +// pass-through + no body pass-through + flag-on) and then flips each input +// individually, asserting that any flip disables the pivot. The flag's role +// is verified explicitly: with all other inputs in the engaged baseline, the +// pivot SHALL engage iff featureFlagEnabled is true. +func TestResponsesViaChatCompletions_FeatureFlagGate_BranchCondition(t *testing.T) { + // Baseline: everything aligned so the pivot engages. + require.True(t, shouldUseResponsesToAnthropicPivot( + relayconstant.RelayModeResponses, + constant.APITypeAnthropic, + false, // passThroughGlobal + false, // passThroughBody + true, // featureFlagEnabled + ), "engaged baseline must trigger the pivot") + + // Feature flag off disables the pivot even when every other condition is + // aligned. This is the critical regression-catch for the MAJOR finding. + require.False(t, shouldUseResponsesToAnthropicPivot( + relayconstant.RelayModeResponses, + constant.APITypeAnthropic, + false, + false, + false, // <- flag off + ), "feature flag off must bypass the pivot") + + // Wrong relay mode disables the pivot. 
+ require.False(t, shouldUseResponsesToAnthropicPivot( + relayconstant.RelayModeChatCompletions, + constant.APITypeAnthropic, + false, + false, + true, + ), "non-Responses relay mode must bypass the pivot") + + // Wrong API type disables the pivot. + require.False(t, shouldUseResponsesToAnthropicPivot( + relayconstant.RelayModeResponses, + constant.APITypeOpenAI, + false, + false, + true, + ), "non-Anthropic api type must bypass the pivot") + + // Global pass-through disables the pivot. + require.False(t, shouldUseResponsesToAnthropicPivot( + relayconstant.RelayModeResponses, + constant.APITypeAnthropic, + true, // <- pass-through global on + false, + true, + ), "global pass-through must bypass the pivot") + + // Channel-level body pass-through disables the pivot. + require.False(t, shouldUseResponsesToAnthropicPivot( + relayconstant.RelayModeResponses, + constant.APITypeAnthropic, + false, + true, // <- channel body pass-through on + true, + ), "channel body pass-through must bypass the pivot") +} + +// TestResponsesViaChatCompletions_FeatureFlagGate_DefaultIsOn locks in the +// documented default for the feature flag: when the env var is unset, the +// pivot SHALL be enabled. This guards against an accidental default-flip. +func TestResponsesViaChatCompletions_FeatureFlagGate_DefaultIsOn(t *testing.T) { + const envKey = "RESPONSES_TO_ANTHROPIC_ENABLED" + + // t.Setenv with empty string clears the env at scope exit, but during the + // scope we explicitly unset it via setting to "" (production reader treats + // empty as unset and returns the default). + t.Setenv(envKey, "") + require.True(t, common.GetEnvOrDefaultBool(envKey, true), + "empty/unset RESPONSES_TO_ANTHROPIC_ENABLED must default to true") +} + +// extractSequenceNumbers scans the recorded SSE body and returns every +// `"sequence_number": N` value in emission order. 
+func extractSequenceNumbers(t *testing.T, body string) []int64 { + t.Helper() + + const marker = `"sequence_number":` + out := make([]int64, 0) + rest := body + for { + idx := strings.Index(rest, marker) + if idx < 0 { + break + } + rest = rest[idx+len(marker):] + // Read digits. + i := 0 + for i < len(rest) && (rest[i] == ' ' || rest[i] == '\t') { + i++ + } + start := i + for i < len(rest) && rest[i] >= '0' && rest[i] <= '9' { + i++ + } + if i == start { + continue + } + var n int64 + for j := start; j < i; j++ { + n = n*10 + int64(rest[j]-'0') + } + out = append(out, n) + rest = rest[i:] + } + return out +} diff --git a/service/channel_affinity_usage_cache_test.go b/service/channel_affinity_usage_cache_test.go index 64d3d715b54..6f8f64e1988 100644 --- a/service/channel_affinity_usage_cache_test.go +++ b/service/channel_affinity_usage_cache_test.go @@ -12,6 +12,21 @@ import ( "github.com/stretchr/testify/require" ) +// resetChannelAffinityUsageCacheStats purges the in-memory cache shared by the +// channel-affinity usage stats tests so that one test cannot influence another +// when keys collide (e.g. due to the time-based fixtures running in the same +// nanosecond). It is safe to call multiple times. 
+func resetChannelAffinityUsageCacheStats(t *testing.T) { + t.Helper() + cache := getChannelAffinityUsageCacheStatsCache() + if cache == nil { + return + } + if err := cache.Purge(); err != nil { + t.Logf("warning: failed to purge channel affinity usage cache: %v", err) + } +} + func buildChannelAffinityStatsContextForTest(ruleName, usingGroup, keyFP string) *gin.Context { rec := httptest.NewRecorder() ctx, _ := gin.CreateTestContext(rec) @@ -26,9 +41,12 @@ func buildChannelAffinityStatsContextForTest(ruleName, usingGroup, keyFP string) } func TestObserveChannelAffinityUsageCacheByRelayFormat_ClaudeMode(t *testing.T) { - ruleName := fmt.Sprintf("rule_%d", time.Now().UnixNano()) + resetChannelAffinityUsageCacheStats(t) + t.Cleanup(func() { resetChannelAffinityUsageCacheStats(t) }) + + ruleName := fmt.Sprintf("rule_claudemode_%d", time.Now().UnixNano()) usingGroup := "default" - keyFP := fmt.Sprintf("fp_%d", time.Now().UnixNano()) + keyFP := fmt.Sprintf("fp_claudemode_%d", time.Now().UnixNano()) ctx := buildChannelAffinityStatsContextForTest(ruleName, usingGroup, keyFP) usage := &dto.Usage{ @@ -53,9 +71,12 @@ func TestObserveChannelAffinityUsageCacheByRelayFormat_ClaudeMode(t *testing.T) } func TestObserveChannelAffinityUsageCacheByRelayFormat_MixedMode(t *testing.T) { - ruleName := fmt.Sprintf("rule_%d", time.Now().UnixNano()) + resetChannelAffinityUsageCacheStats(t) + t.Cleanup(func() { resetChannelAffinityUsageCacheStats(t) }) + + ruleName := fmt.Sprintf("rule_mixedmode_%d", time.Now().UnixNano()) usingGroup := "default" - keyFP := fmt.Sprintf("fp_%d", time.Now().UnixNano()) + keyFP := fmt.Sprintf("fp_mixedmode_%d", time.Now().UnixNano()) ctx := buildChannelAffinityStatsContextForTest(ruleName, usingGroup, keyFP) openAIUsage := &dto.Usage{ @@ -83,9 +104,12 @@ func TestObserveChannelAffinityUsageCacheByRelayFormat_MixedMode(t *testing.T) { } func TestObserveChannelAffinityUsageCacheByRelayFormat_UnsupportedModeKeepsEmpty(t *testing.T) { - ruleName := 
fmt.Sprintf("rule_%d", time.Now().UnixNano()) + resetChannelAffinityUsageCacheStats(t) + t.Cleanup(func() { resetChannelAffinityUsageCacheStats(t) }) + + ruleName := fmt.Sprintf("rule_unsupportedmode_%d", time.Now().UnixNano()) usingGroup := "default" - keyFP := fmt.Sprintf("fp_%d", time.Now().UnixNano()) + keyFP := fmt.Sprintf("fp_unsupportedmode_%d", time.Now().UnixNano()) ctx := buildChannelAffinityStatsContextForTest(ruleName, usingGroup, keyFP) usage := &dto.Usage{ diff --git a/service/openaicompat/chat_stream_to_responses.go b/service/openaicompat/chat_stream_to_responses.go new file mode 100644 index 00000000000..7a5638d9869 --- /dev/null +++ b/service/openaicompat/chat_stream_to_responses.go @@ -0,0 +1,708 @@ +package openaicompat + +import ( + "fmt" + "sort" + "strings" + "time" + + "github.com/QuantumNous/new-api/common" + "github.com/QuantumNous/new-api/dto" +) + +// ResponsesAPIEvent is a generic Responses-API event envelope. It is encoded +// as JSON for SSE wire transmission; the `Type` field becomes the SSE `event:` +// header, and the full envelope becomes the `data:` payload. +type ResponsesAPIEvent struct { + Type string `json:"type"` + SequenceNumber int64 `json:"sequence_number"` + // Payload holds the event-specific fields. It is rendered as siblings of + // `type`/`sequence_number` on the wire via the custom MarshalJSON below. + Payload map[string]any `json:"-"` +} + +// MarshalJSON flattens Payload into the top-level object alongside `type` and +// `sequence_number`. +func (e ResponsesAPIEvent) MarshalJSON() ([]byte, error) { + m := make(map[string]any, len(e.Payload)+2) + for k, v := range e.Payload { + m[k] = v + } + // Dedicated fields always win over payload to prevent shadowing. + m["type"] = e.Type + m["sequence_number"] = e.SequenceNumber + return common.Marshal(m) +} + +// emitEvent builds an event and increments the seq counter. 
+func emitEvent(state *ResponsesStreamState, eventType string, payload map[string]any) ResponsesAPIEvent {
+	if payload == nil {
+		payload = map[string]any{}
+	}
+	return ResponsesAPIEvent{
+		Type:           eventType,
+		SequenceNumber: state.NextSeq(),
+		Payload:        payload,
+	}
+}
+
+// ChatCompletionsStreamToResponsesEvents translates one Chat-Completions
+// stream chunk into a sequence of Responses-API SSE events. A nil `chunk`
+// flushes any still-open output_item and emits `response.completed` exactly
+// once (idempotent on subsequent nil calls).
+//
+// Spec coverage:
+//   - §5.1: sequence counter starts at 1, monotonic
+//   - §5.2: response.created + response.in_progress emitted once on first usable chunk
+//   - §5.3: message lifecycle (added/content_part.added/delta/done events)
+//   - §5.4: reasoning lifecycle (output_item.added/reasoning_summary_part.added/delta/done)
+//   - §5.5: function_call lifecycle (added with arguments:"" / delta / done)
+//   - §5.6: <think>...</think> inline tag recognition
+//   - §5.7: null-chunk flush with deterministic close order
+//   - §5.8: error events emit a single response.failed (dedup)
+//   - §5.9: usage propagation on response.completed (cache token decomposition)
+//   - §5.10: custom_tool_call alias
+func ChatCompletionsStreamToResponsesEvents(chunk *dto.ChatCompletionsStreamResponse, state *ResponsesStreamState) []ResponsesAPIEvent {
+	if state == nil {
+		// Defensive: cannot translate without state.
+		return nil
+	}
+
+	if chunk == nil {
+		return flushOnEOS(state)
+	}
+
+	events := make([]ResponsesAPIEvent, 0, 4)
+
+	// Emit response.created + response.in_progress exactly once on the first
+	// usable chunk.
+	if !state.Started {
+		respID := strings.TrimSpace(chunk.Id)
+		if respID == "" {
+			respID = "chat"
+		}
+		respID = "resp_" + respID
+		state.ResponseID = respID
+		state.Model = chunk.Model
+		state.CreatedAt = chunk.Created
+		if state.CreatedAt == 0 {
+			state.CreatedAt = time.Now().Unix()
+		}
+		responseEnvelope := buildResponseEnvelope(state, "in_progress")
+		events = append(events, emitEvent(state, "response.created", map[string]any{
+			"response": responseEnvelope,
+		}))
+		events = append(events, emitEvent(state, "response.in_progress", map[string]any{
+			"response": responseEnvelope,
+		}))
+		state.Started = true
+		state.InProgressSent = true
+	}
+
+	// Track usage first: the final usage-only chunk carries no choices.
+	if chunk.Usage != nil {
+		state.Usage.PromptTokens = chunk.Usage.PromptTokens
+		state.Usage.CompletionTokens = chunk.Usage.CompletionTokens
+		state.Usage.TotalTokens = chunk.Usage.TotalTokens
+		state.Usage.CachedTokens = chunk.Usage.PromptTokensDetails.CachedTokens
+		state.Usage.CacheCreationTokens = chunk.Usage.PromptTokensDetails.CachedCreationTokens
+		state.Usage.ReasoningTokens = chunk.Usage.CompletionTokenDetails.ReasoningTokens
+	}
+
+	if len(chunk.Choices) == 0 {
+		return events
+	}
+	choice := chunk.Choices[0]
+	delta := choice.Delta
+
+	// Tool call deltas take precedence over text.
+	for _, tc := range delta.ToolCalls {
+		evs := handleToolCallDelta(state, tc)
+		events = append(events, evs...)
+	}
+
+	// Reasoning content delta -> reasoning output_item lifecycle.
+	if rc := delta.GetReasoningContent(); rc != "" {
+		// Close any open message before opening reasoning.
+		events = append(events, closeMessageIfOpen(state)...)
+		events = append(events, ensureReasoningOpen(state)...)
+		events = append(events, emitEvent(state, "response.reasoning_summary_text.delta", map[string]any{
+			"item_id":       state.ReasoningItemID,
+			"output_index":  state.ReasoningItemIndex,
+			"summary_index": 0,
+			"delta":         rc,
+		}))
+	}
+
+	// Text content delta. Honour <think>...</think> inline markers.
+	if delta.Content != nil && *delta.Content != "" {
+		text := *delta.Content
+		events = append(events, handleTextDeltaWithInlineThink(state, text)...)
+	}
+
+	// Finish reason — record but do not emit response.completed until we
+	// receive a null chunk (or the upstream gracefully terminates).
+	if choice.FinishReason != nil && *choice.FinishReason != "" {
+		state.FinalFinishReason = *choice.FinishReason
+	}
+
+	return events
+}
+
+// EmitChatStreamErrorEvent emits a single response.failed event for upstream
+// error events. Calling this more than once is a no-op (spec §5.8).
+func EmitChatStreamErrorEvent(state *ResponsesStreamState, message string) []ResponsesAPIEvent {
+	if state == nil || state.ErrorEmitted {
+		return nil
+	}
+	events := make([]ResponsesAPIEvent, 0, 2)
+	if !state.Started {
+		// Emit the minimum prelude as part of the returned events so its
+		// sequence number is observed by the caller. Discarding it here would
+		// still bump the counter and skew the subsequent response.failed
+		// sequence number to 2 instead of 1.
+		if state.CreatedAt == 0 {
+			state.CreatedAt = time.Now().Unix()
+		}
+		if state.ResponseID == "" {
+			state.ResponseID = "resp_error"
+		}
+		envelope := buildResponseEnvelope(state, "failed")
+		events = append(events, emitEvent(state, "response.created", map[string]any{"response": envelope}))
+		state.Started = true
+	}
+	events = append(events, emitEvent(state, "response.failed", map[string]any{
+		"response": map[string]any{
+			"id":     state.ResponseID,
+			"status": "failed",
+			"error":  map[string]any{"message": message},
+		},
+	}))
+	state.ErrorEmitted = true
+	// response.failed is terminal — mark the stream as completed so any
+	// subsequent flushOnEOS is a no-op and we never emit both response.failed
+	// and response.completed on the same stream.
+	state.CompletedSent = true
+	return events
+}
+
+func handleTextDeltaWithInlineThink(state *ResponsesStreamState, text string) []ResponsesAPIEvent {
+	events := make([]ResponsesAPIEvent, 0, 2)
+	// Resume any partial <think> / </think> token saved from a previous chunk.
+	if state.PendingTagBuffer != "" {
+		text = state.PendingTagBuffer + text
+		state.PendingTagBuffer = ""
+	}
+	for text != "" {
+		if state.InThinkInlineTag {
+			// Looking for </think>.
+			if idx := strings.Index(text, "</think>"); idx >= 0 {
+				inside := text[:idx]
+				rest := text[idx+len("</think>"):]
+				if inside != "" {
+					events = append(events, ensureReasoningOpen(state)...)
+					events = append(events, emitEvent(state, "response.reasoning_summary_text.delta", map[string]any{
+						"item_id":       state.ReasoningItemID,
+						"output_index":  state.ReasoningItemIndex,
+						"summary_index": 0,
+						"delta":         inside,
+					}))
+				}
+				// Close reasoning.
+				events = append(events, closeReasoningIfOpen(state)...)
+				state.InThinkInlineTag = false
+				text = rest
+				continue
+			}
+			// No closing </think> in this chunk yet. Hold back any trailing
+			// prefix that could grow into </think> on the next chunk.
+			emit, pending := splitPendingThinkTag(text)
+			state.PendingTagBuffer = pending
+			if emit != "" {
+				events = append(events, ensureReasoningOpen(state)...)
+				events = append(events, emitEvent(state, "response.reasoning_summary_text.delta", map[string]any{
+					"item_id":       state.ReasoningItemID,
+					"output_index":  state.ReasoningItemIndex,
+					"summary_index": 0,
+					"delta":         emit,
+				}))
+			}
+			return events
+		}
+
+		// Not in think tag.
+		if idx := strings.Index(text, "<think>"); idx >= 0 {
+			before := text[:idx]
+			rest := text[idx+len("<think>"):]
+			if before != "" {
+				events = append(events, closeReasoningIfOpen(state)...)
+				events = append(events, ensureMessageOpen(state)...)
+				events = append(events, emitEvent(state, "response.output_text.delta", map[string]any{
+					"item_id":       state.MessageItemID,
+					"output_index":  state.MessageItemIndex,
+					"content_index": 0,
+					"delta":         before,
+				}))
+			}
+			// Open reasoning.
+			events = append(events, closeMessageIfOpen(state)...)
+			state.InThinkInlineTag = true
+			text = rest
+			continue
+		}
+
+		// No opening <think> in this chunk. Hold back any trailing prefix
+		// that could grow into <think> on the next chunk.
+		emit, pending := splitPendingThinkTag(text)
+		state.PendingTagBuffer = pending
+		if emit != "" {
+			events = append(events, closeReasoningIfOpen(state)...)
+			events = append(events, ensureMessageOpen(state)...)
+			events = append(events, emitEvent(state, "response.output_text.delta", map[string]any{
+				"item_id":       state.MessageItemID,
+				"output_index":  state.MessageItemIndex,
+				"content_index": 0,
+				"delta":         emit,
+			}))
+		}
+		return events
+	}
+	return events
+}
+
+// splitPendingThinkTag separates text into the portion safe to emit and a
+// trailing partial-tag fragment that should be buffered until the next chunk.
+// A trailing substring beginning with '<' is buffered only when it is a strict
+// prefix of "<think>" or "</think>" (i.e. could still grow into a real tag).
+// Tail length is bounded by len("</think>")-1, so memory use is constant and
+// ordinary text containing a stray '<' is emitted normally.
+func splitPendingThinkTag(text string) (emit string, pending string) {
+	if text == "" {
+		return "", ""
+	}
+	maxLook := len("</think>") - 1
+	start := len(text) - maxLook
+	if start < 0 {
+		start = 0
+	}
+	for i := start; i < len(text); i++ {
+		if text[i] != '<' {
+			continue
+		}
+		tail := text[i:]
+		if strings.ContainsRune(tail, '>') {
+			// This '<' already has a closing '>' so it cannot be a partial
+			// think tag; keep scanning for a later '<' in the window.
+			continue
+		}
+		if strings.HasPrefix("<think>", tail) || strings.HasPrefix("</think>", tail) {
+			return text[:i], tail
+		}
+	}
+	return text, ""
+}
+
+func handleToolCallDelta(state *ResponsesStreamState, tc dto.ToolCallResponse) []ResponsesAPIEvent {
+	events := make([]ResponsesAPIEvent, 0, 2)
+
+	idx := 0
+	if tc.Index != nil {
+		idx = *tc.Index
+	}
+	fc, ok := state.FuncCalls[idx]
+	if !ok {
+		fc = &ResponsesStreamFuncCall{
+			ID:        tc.ID,
+			Name:      tc.Function.Name,
+			ItemIndex: nextItemIndex(state),
+		}
+		state.FuncCalls[idx] = fc
+
+		// Close any open text/reasoning before opening function_call.
+		events = append(events, closeMessageIfOpen(state)...)
+		events = append(events, closeReasoningIfOpen(state)...)
+
+		callID := fc.ID
+		if callID == "" {
+			callID = tc.ID
+			fc.ID = tc.ID
+		}
+		// Derive a stable item id from the call id so the wire item.id and the
+		// item_id referenced by function_call_arguments.* match each other.
+		fc.ItemID = funcCallItemID(state, callID)
+		events = append(events, emitEvent(state, "response.output_item.added", map[string]any{
+			"output_index": fc.ItemIndex,
+			"item": map[string]any{
+				"id":        fc.ItemID,
+				"type":      "function_call",
+				"status":    "in_progress",
+				"call_id":   callID,
+				"name":      fc.Name,
+				"arguments": "",
+			},
+		}))
+	} else {
+		// Update ID/name if the chunk carries new info.
+		if tc.ID != "" && fc.ID == "" {
+			fc.ID = tc.ID
+		}
+		if tc.Function.Name != "" && fc.Name == "" {
+			fc.Name = tc.Function.Name
+		}
+		if fc.ItemID == "" && fc.ID != "" {
+			fc.ItemID = funcCallItemID(state, fc.ID)
+		}
+	}
+
+	// Argument deltas.
+	if tc.Function.Arguments != "" {
+		fc.ArgsBuf += tc.Function.Arguments
+		events = append(events, emitEvent(state, "response.function_call_arguments.delta", map[string]any{
+			"item_id":      fc.ItemID,
+			"output_index": fc.ItemIndex,
+			"delta":        tc.Function.Arguments,
+		}))
+	}
+	return events
+}
+
+func ensureMessageOpen(state *ResponsesStreamState) []ResponsesAPIEvent {
+	if state.MessageItemOpen {
+		return nil
+	}
+	events := make([]ResponsesAPIEvent, 0, 2)
+	state.MessageItemIndex = nextItemIndex(state)
+	state.MessageItemID = assignMessageItemID(state)
+	state.MessageItemOpen = true
+	state.MessageContentPartOpen = true
+	events = append(events, emitEvent(state, "response.output_item.added", map[string]any{
+		"output_index": state.MessageItemIndex,
+		"item": map[string]any{
+			"id":      state.MessageItemID,
+			"type":    "message",
+			"status":  "in_progress",
+			"role":    "assistant",
+			"content": []any{},
+		},
+	}))
+	events = append(events, emitEvent(state, "response.content_part.added", map[string]any{
+		"item_id":       state.MessageItemID,
+		"output_index":  state.MessageItemIndex,
+		"content_index": 0,
+		"part": map[string]any{
+			"type": "output_text",
+			"text": "",
+		},
+	}))
+	return events
+}
+
+func closeMessageIfOpen(state *ResponsesStreamState) []ResponsesAPIEvent {
+	if !state.MessageItemOpen {
+		return nil
+	}
+	events := make([]ResponsesAPIEvent, 0, 3)
+	itemID := state.MessageItemID
+	events = append(events, emitEvent(state, "response.output_text.done", map[string]any{
+		"item_id":       itemID,
+		"output_index":  state.MessageItemIndex,
+		"content_index": 0,
+	}))
+	events = append(events, emitEvent(state, "response.content_part.done", map[string]any{
+		"item_id":       itemID,
+		"output_index":  state.MessageItemIndex,
+		"content_index": 0,
+	}))
+	events = append(events, emitEvent(state, "response.output_item.done", map[string]any{
+		"output_index": state.MessageItemIndex,
+		"item": map[string]any{
+			"id":     itemID,
+			"type":   "message",
+			"status": "completed",
+			"role":   "assistant",
+		},
+	}))
+	state.MessageItemOpen = false
+	state.MessageContentPartOpen = false
+	state.MessageItemID = ""
+	return events
+}
+
+func ensureReasoningOpen(state *ResponsesStreamState) []ResponsesAPIEvent {
+	if state.ReasoningItemOpen {
+		return nil
+	}
+	events := make([]ResponsesAPIEvent, 0, 2)
+	state.ReasoningItemIndex = nextItemIndex(state)
+	state.ReasoningItemID = assignReasoningItemID(state)
+	state.ReasoningItemOpen = true
+	state.ReasoningSummaryPartOpen = true
+	events = append(events, emitEvent(state, "response.output_item.added", map[string]any{
+		"output_index": state.ReasoningItemIndex,
+		"item": map[string]any{
+			"id":      state.ReasoningItemID,
+			"type":    "reasoning",
+			"status":  "in_progress",
+			"summary": []any{},
+		},
+	}))
+	events = append(events, emitEvent(state, "response.reasoning_summary_part.added", map[string]any{
+		"item_id":       state.ReasoningItemID,
+		"output_index":  state.ReasoningItemIndex,
+		"summary_index": 0,
+		"part": map[string]any{
+			"type": "summary_text",
+			"text": "",
+		},
+	}))
+	return events
+}
+
+func closeReasoningIfOpen(state *ResponsesStreamState) []ResponsesAPIEvent {
+	if !state.ReasoningItemOpen {
+		return nil
+	}
+	events := make([]ResponsesAPIEvent, 0, 3)
+	itemID := state.ReasoningItemID
+	events = append(events, emitEvent(state, "response.reasoning_summary_text.done", map[string]any{
+		"item_id":       itemID,
+		"output_index":  state.ReasoningItemIndex,
+		"summary_index": 0,
+	}))
+	events = append(events, emitEvent(state, "response.reasoning_summary_part.done", map[string]any{
+		"item_id":       itemID,
+		"output_index":  state.ReasoningItemIndex,
+		"summary_index": 0,
+	}))
+	events = append(events, emitEvent(state, "response.output_item.done", map[string]any{
+		"output_index": state.ReasoningItemIndex,
+		"item": map[string]any{
+			"id":     itemID,
+			"type":   "reasoning",
+			"status": "completed",
+		},
+	}))
+	state.ReasoningItemOpen = false
+	state.ReasoningSummaryPartOpen = false
+	state.ReasoningItemID = ""
+	return events
+}
+
+func closeAllOpenFunctionCalls(state *ResponsesStreamState) []ResponsesAPIEvent {
+	events := make([]ResponsesAPIEvent, 0)
+	// Collect open entries and sort by tool index (the map key) so the close
+	// order — and the sequence numbers it stamps onto downstream events — is
+	// deterministic across identical streams. state.FuncCalls is a Go map and
+	// would otherwise iterate in random order.
+	indices := make([]int, 0, len(state.FuncCalls))
+	for idx, fc := range state.FuncCalls {
+		if fc == nil || fc.Done {
+			continue
+		}
+		indices = append(indices, idx)
+	}
+	sort.Ints(indices)
+	for _, idx := range indices {
+		fc := state.FuncCalls[idx]
+		args := fc.ArgsBuf
+		if strings.TrimSpace(args) == "" {
+			args = "{}"
+		}
+		if fc.ItemID == "" {
+			fc.ItemID = funcCallItemID(state, fc.ID)
+		}
+		events = append(events, emitEvent(state, "response.function_call_arguments.done", map[string]any{
+			"item_id":      fc.ItemID,
+			"output_index": fc.ItemIndex,
+			"arguments":    args,
+		}))
+		events = append(events, emitEvent(state, "response.output_item.done", map[string]any{
+			"output_index": fc.ItemIndex,
+			"item": map[string]any{
+				"id":        fc.ItemID,
+				"type":      "function_call",
+				"status":    "completed",
+				"call_id":   fc.ID,
+				"name":      fc.Name,
+				"arguments": args,
+			},
+		}))
+		fc.Done = true
+	}
+	return events
+}
+
+func nextItemIndex(state *ResponsesStreamState) int {
+	idx := state.ItemIndex
+	state.ItemIndex++
+	return idx
+}
+
+// responseIDSuffix returns the portion of state.ResponseID after the "resp_"
+// prefix, stripped of any further item-type prefix. It is used as the stable
+// base for derived item ids ("msg_", "rs_", ...).
+func responseIDSuffix(state *ResponsesStreamState) string {
+	return ResponsesIDBase(state.ResponseID)
+}
+
+// ResponsesIDBase returns the portion of a Responses-API response id after the
+// "resp_" prefix (and any subsequent item-type prefix such as "msg_"/"rs_"/
+// "fc_").
It is the stable base used when deriving per-item ids in both the
+// streaming and non-streaming chat→responses translators.
+func ResponsesIDBase(respID string) string {
+	base := strings.TrimPrefix(respID, "resp_")
+	for _, p := range []string{"msg_", "rs_", "fc_"} {
+		if strings.HasPrefix(base, p) {
+			base = strings.TrimPrefix(base, p)
+			break
+		}
+	}
+	if base == "" {
+		base = "chat"
+	}
+	return base
+}
+
+// assignMessageItemID returns a fresh message item id and bumps the per-stream
+// counter so subsequent reopens (e.g. after an inline </think> close) get a
+// unique value.
+func assignMessageItemID(state *ResponsesStreamState) string {
+	state.MessageItemCount++
+	if state.MessageItemCount == 1 {
+		return "msg_" + responseIDSuffix(state)
+	}
+	return fmt.Sprintf("msg_%s_%d", responseIDSuffix(state), state.MessageItemCount)
+}
+
+// assignReasoningItemID mirrors assignMessageItemID for reasoning items.
+func assignReasoningItemID(state *ResponsesStreamState) string {
+	state.ReasoningItemCount++
+	if state.ReasoningItemCount == 1 {
+		return "rs_" + responseIDSuffix(state)
+	}
+	return fmt.Sprintf("rs_%s_%d", responseIDSuffix(state), state.ReasoningItemCount)
+}
+
+// funcCallItemID derives a stable function_call item id ("fc_<callID>") from
+// the upstream call id, falling back to the response suffix when callID is
+// empty so the wire id is always non-empty.
+func funcCallItemID(state *ResponsesStreamState, callID string) string {
+	base := strings.TrimSpace(callID)
+	if base == "" {
+		base = responseIDSuffix(state)
+	}
+	if strings.HasPrefix(base, "fc_") {
+		return base
+	}
+	return "fc_" + base
+}
+
+func flushOnEOS(state *ResponsesStreamState) []ResponsesAPIEvent {
+	if state.CompletedSent {
+		return nil
+	}
+	events := make([]ResponsesAPIEvent, 0, 6)
+
+	// If we never started, emit the prelude before anything else so the wire
+	// still has a well-formed sequence.
+	if !state.Started {
+		if state.CreatedAt == 0 {
+			state.CreatedAt = time.Now().Unix()
+		}
+		if state.ResponseID == "" {
+			state.ResponseID = "resp_chat"
+		}
+		envelope := buildResponseEnvelope(state, "in_progress")
+		events = append(events, emitEvent(state, "response.created", map[string]any{"response": envelope}))
+		events = append(events, emitEvent(state, "response.in_progress", map[string]any{"response": envelope}))
+		state.Started = true
+		state.InProgressSent = true
+	}
+	// Flush any partial-tag fragment held back across chunks. It cannot grow
+	// into a complete <think> / </think> now, so emit it to whichever channel
+	// is currently active.
+	if state.PendingTagBuffer != "" {
+		pending := state.PendingTagBuffer
+		state.PendingTagBuffer = ""
+		if state.InThinkInlineTag {
+			events = append(events, ensureReasoningOpen(state)...)
+			events = append(events, emitEvent(state, "response.reasoning_summary_text.delta", map[string]any{
+				"item_id":       state.ReasoningItemID,
+				"output_index":  state.ReasoningItemIndex,
+				"summary_index": 0,
+				"delta":         pending,
+			}))
+		} else {
+			events = append(events, closeReasoningIfOpen(state)...)
+			events = append(events, ensureMessageOpen(state)...)
+			events = append(events, emitEvent(state, "response.output_text.delta", map[string]any{
+				"item_id":       state.MessageItemID,
+				"output_index":  state.MessageItemIndex,
+				"content_index": 0,
+				"delta":         pending,
+			}))
+		}
+	}
+	// Close in deterministic order: message, reasoning (if inline-only),
+	// then function_calls.
+	events = append(events, closeMessageIfOpen(state)...)
+	events = append(events, closeReasoningIfOpen(state)...)
+	events = append(events, closeAllOpenFunctionCalls(state)...)
+
+	envelope := buildResponseEnvelope(state, "completed")
+	// Attach usage.
+	envelope["usage"] = buildResponsesUsage(state)
+	events = append(events, emitEvent(state, "response.completed", map[string]any{
+		"response": envelope,
+	}))
+	state.CompletedSent = true
+	return events
+}
+
+func buildResponseEnvelope(state *ResponsesStreamState, status string) map[string]any {
+	return map[string]any{
+		"id":         state.ResponseID,
+		"object":     "response",
+		"created_at": state.CreatedAt,
+		"model":      state.Model,
+		"status":     status,
+		"output":     []any{},
+	}
+}
+
+func buildResponsesUsage(state *ResponsesStreamState) map[string]any {
+	if state.Usage == nil {
+		return map[string]any{
+			"input_tokens":  0,
+			"output_tokens": 0,
+			"total_tokens":  0,
+		}
+	}
+	cached := state.Usage.CachedTokens
+	cacheCreation := state.Usage.CacheCreationTokens
+	input := state.Usage.PromptTokens - cached - cacheCreation
+	if input < 0 {
+		input = 0
+	}
+	u := map[string]any{
+		"input_tokens":  input,
+		"output_tokens": state.Usage.CompletionTokens,
+		"total_tokens":  state.Usage.PromptTokens + state.Usage.CompletionTokens,
+	}
+	if cached > 0 || cacheCreation > 0 {
+		details := map[string]any{}
+		if cached > 0 {
+			details["cached_tokens"] = cached
+		}
+		if cacheCreation > 0 {
+			details["cache_creation_tokens"] = cacheCreation
+		}
+		u["input_tokens_details"] = details
+	}
+	if state.Usage.ReasoningTokens > 0 {
+		u["output_tokens_details"] = map[string]any{
+			"reasoning_tokens": state.Usage.ReasoningTokens,
+		}
+	}
+	return u
+}
diff --git a/service/openaicompat/chat_stream_to_responses_test.go b/service/openaicompat/chat_stream_to_responses_test.go
new file mode 100644
index 00000000000..0929a31fda6
--- /dev/null
+++ b/service/openaicompat/chat_stream_to_responses_test.go
@@ -0,0 +1,540 @@
+package openaicompat
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/QuantumNous/new-api/common"
+	"github.com/QuantumNous/new-api/dto"
+	"github.com/stretchr/testify/require"
+)
+
+// helper: parse a marshaled ResponsesAPIEvent's JSON into a flat map so we can
+// assert top-level
fields without re-deriving the wire shape.
+func unmarshalEvent(t *testing.T, ev ResponsesAPIEvent) map[string]any {
+	t.Helper()
+	data, err := common.Marshal(ev)
+	require.NoError(t, err)
+	var m map[string]any
+	require.NoError(t, common.Unmarshal(data, &m))
+	return m
+}
+
+func TestStreamToResponses_SequenceIsMonotonic(t *testing.T) {
+	state := NewResponsesStreamState()
+	first := "hello"
+	chunk := &dto.ChatCompletionsStreamResponse{
+		Id:      "abc12345",
+		Object:  "chat.completion.chunk",
+		Created: 100,
+		Model:   "test",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{
+				Index: 0,
+				Delta: dto.ChatCompletionsStreamResponseChoiceDelta{
+					Content: &first,
+				},
+			},
+		},
+	}
+	events := ChatCompletionsStreamToResponsesEvents(chunk, state)
+	require.NotEmpty(t, events)
+	for i, ev := range events {
+		want := int64(i + 1)
+		if ev.SequenceNumber != want {
+			t.Errorf("event[%d].seq=%d want %d", i, ev.SequenceNumber, want)
+		}
+	}
+}
+
+func TestStreamToResponses_CreatedAndInProgressOnce(t *testing.T) {
+	state := NewResponsesStreamState()
+	first := "a"
+	chunk1 := &dto.ChatCompletionsStreamResponse{
+		Id:    "x",
+		Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{Content: &first}},
+		},
+	}
+	ev1 := ChatCompletionsStreamToResponsesEvents(chunk1, state)
+	chunk2 := &dto.ChatCompletionsStreamResponse{
+		Id:    "x",
+		Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{Content: &first}},
+		},
+	}
+	ev2 := ChatCompletionsStreamToResponsesEvents(chunk2, state)
+
+	count := func(events []ResponsesAPIEvent, t string) int {
+		n := 0
+		for _, e := range events {
+			if e.Type == t {
+				n++
+			}
+		}
+		return n
+	}
+	all := append(ev1, ev2...)
+	if count(all, "response.created") != 1 {
+		t.Errorf("created count=%d want 1", count(all, "response.created"))
+	}
+	if count(all, "response.in_progress") != 1 {
+		t.Errorf("in_progress count=%d want 1", count(all, "response.in_progress"))
+	}
+}
+
+func TestStreamToResponses_ResponseIDPrefixed(t *testing.T) {
+	state := NewResponsesStreamState()
+	text := "hi"
+	chunk := &dto.ChatCompletionsStreamResponse{
+		Id:    "abc12345",
+		Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{Content: &text}},
+		},
+	}
+	events := ChatCompletionsStreamToResponsesEvents(chunk, state)
+	require.NotEmpty(t, events)
+	m := unmarshalEvent(t, events[0])
+	resp, ok := m["response"].(map[string]any)
+	require.True(t, ok)
+	if resp["id"] != "resp_abc12345" {
+		t.Errorf("id=%v want resp_abc12345", resp["id"])
+	}
+}
+
+func TestStreamToResponses_MessageLifecycle(t *testing.T) {
+	state := NewResponsesStreamState()
+	text := "hello"
+	c1 := &dto.ChatCompletionsStreamResponse{
+		Id:    "x",
+		Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{Content: &text}},
+		},
+	}
+	ev := ChatCompletionsStreamToResponsesEvents(c1, state)
+	wantTypes := []string{
+		"response.created",
+		"response.in_progress",
+		"response.output_item.added",
+		"response.content_part.added",
+		"response.output_text.delta",
+	}
+	for i, want := range wantTypes {
+		if i >= len(ev) {
+			t.Errorf("missing event %d: %s", i, want)
+			continue
+		}
+		if ev[i].Type != want {
+			t.Errorf("event[%d].type=%s want %s", i, ev[i].Type, want)
+		}
+	}
+
+	// EOS flush should close.
+	flush := ChatCompletionsStreamToResponsesEvents(nil, state)
+	typesWanted := []string{
+		"response.output_text.done",
+		"response.content_part.done",
+		"response.output_item.done",
+		"response.completed",
+	}
+	wireTypes := make([]string, 0, len(flush))
+	for _, e := range flush {
+		wireTypes = append(wireTypes, e.Type)
+	}
+	for _, want := range typesWanted {
+		found := false
+		for _, t2 := range wireTypes {
+			if t2 == want {
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Errorf("missing flush event %s in %v", want, wireTypes)
+		}
+	}
+}
+
+func TestStreamToResponses_ReasoningLifecycle(t *testing.T) {
+	state := NewResponsesStreamState()
+	r1 := "step1"
+	c1 := &dto.ChatCompletionsStreamResponse{
+		Id:    "x",
+		Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{ReasoningContent: &r1}},
+		},
+	}
+	ev := ChatCompletionsStreamToResponsesEvents(c1, state)
+	hasAdded := false
+	hasPartAdded := false
+	hasDelta := false
+	for _, e := range ev {
+		switch e.Type {
+		case "response.output_item.added":
+			hasAdded = true
+		case "response.reasoning_summary_part.added":
+			hasPartAdded = true
+		case "response.reasoning_summary_text.delta":
+			hasDelta = true
+		}
+	}
+	if !hasAdded || !hasPartAdded || !hasDelta {
+		t.Errorf("missing reasoning events: added=%v partAdded=%v delta=%v", hasAdded, hasPartAdded, hasDelta)
+	}
+}
+
+func TestStreamToResponses_FunctionCallLifecycle(t *testing.T) {
+	state := NewResponsesStreamState()
+	idx0 := 0
+	c1 := &dto.ChatCompletionsStreamResponse{
+		Id:    "x",
+		Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{
+				Delta: dto.ChatCompletionsStreamResponseChoiceDelta{
+					ToolCalls: []dto.ToolCallResponse{
+						{
+							Index:    &idx0,
+							ID:       "c1",
+							Type:     "function",
+							Function: dto.FunctionResponse{Name: "search", Arguments: "{"},
+						},
+					},
+				},
+			},
+		},
+	}
+	ev := ChatCompletionsStreamToResponsesEvents(c1, state)
+	added := false
+	delta := false
+	for _, e := range ev {
+		if e.Type == "response.output_item.added" {
+			added = true
+			m := unmarshalEvent(t, e)
+			if item, ok := m["item"].(map[string]any); ok {
+				if item["type"] != "function_call" {
+					t.Errorf("output_item.added.type=%v want function_call", item["type"])
+				}
+				if item["arguments"] != "" {
+					t.Errorf("initial arguments=%v want \"\"", item["arguments"])
+				}
+			}
+		}
+		if e.Type == "response.function_call_arguments.delta" {
+			delta = true
+		}
+	}
+	if !added || !delta {
+		t.Errorf("missing function_call events: added=%v delta=%v", added, delta)
+	}
+
+	// Flush should close with done events.
+	flush := ChatCompletionsStreamToResponsesEvents(nil, state)
+	hasArgsDone := false
+	hasItemDone := false
+	for _, e := range flush {
+		if e.Type == "response.function_call_arguments.done" {
+			hasArgsDone = true
+			m := unmarshalEvent(t, e)
+			if m["arguments"] != "{" {
+				t.Errorf("done args=%v want '{'", m["arguments"])
+			}
+		}
+		if e.Type == "response.output_item.done" {
+			hasItemDone = true
+		}
+	}
+	if !hasArgsDone || !hasItemDone {
+		t.Errorf("missing close events: args.done=%v item.done=%v", hasArgsDone, hasItemDone)
+	}
+}
+
+func TestStreamToResponses_FunctionCallEmptyArgsDefaultsCurly(t *testing.T) {
+	state := NewResponsesStreamState()
+	idx0 := 0
+	c1 := &dto.ChatCompletionsStreamResponse{
+		Id:    "x",
+		Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{
+				Delta: dto.ChatCompletionsStreamResponseChoiceDelta{
+					ToolCalls: []dto.ToolCallResponse{
+						{
+							Index:    &idx0,
+							ID:       "c1",
+							Type:     "function",
+							Function: dto.FunctionResponse{Name: "f"},
+						},
+					},
+				},
+			},
+		},
+	}
+	_ = ChatCompletionsStreamToResponsesEvents(c1, state)
+	flush := ChatCompletionsStreamToResponsesEvents(nil, state)
+	for _, e := range flush {
+		if e.Type == "response.function_call_arguments.done" {
+			m := unmarshalEvent(t, e)
+			if m["arguments"] != "{}" {
+				t.Errorf("empty args default=%v want {}", m["arguments"])
+			}
+		}
+	}
+}
+
+func TestStreamToResponses_InlineThinkTag(t *testing.T) {
+	state := NewResponsesStreamState()
+	text := "intro<think>step"
+	c1 := &dto.ChatCompletionsStreamResponse{
+		Id:    "x",
+		Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{Content: &text}},
+		},
+	}
+	ev := ChatCompletionsStreamToResponsesEvents(c1, state)
+	gotText := false
+	gotReasoning := false
+	for _, e := range ev {
+		if e.Type == "response.output_text.delta" {
+			gotText = true
+		}
+		if e.Type == "response.reasoning_summary_text.delta" {
+			gotReasoning = true
+		}
+	}
+	if !gotText || !gotReasoning {
+		t.Errorf("inline marker: text=%v reasoning=%v", gotText, gotReasoning)
+	}
+}
+
+func TestStreamToResponses_InlineThinkClose(t *testing.T) {
+	state := NewResponsesStreamState()
+	t1 := "intro<think>step"
+	c1 := &dto.ChatCompletionsStreamResponse{
+		Id: "x", Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{Content: &t1}},
+		},
+	}
+	_ = ChatCompletionsStreamToResponsesEvents(c1, state)
+	t2 := "more</think>answer"
+	c2 := &dto.ChatCompletionsStreamResponse{
+		Id: "x", Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{Content: &t2}},
+		},
+	}
+	ev2 := ChatCompletionsStreamToResponsesEvents(c2, state)
+	// Must close reasoning then open message and emit text "answer".
+	hasReasoningClose := false
+	hasTextOpen := false
+	hasTextDeltaAnswer := false
+	for _, e := range ev2 {
+		if e.Type == "response.reasoning_summary_text.done" {
+			hasReasoningClose = true
+		}
+		if e.Type == "response.content_part.added" {
+			hasTextOpen = true
+		}
+		if e.Type == "response.output_text.delta" {
+			m := unmarshalEvent(t, e)
+			if s, _ := m["delta"].(string); strings.Contains(s, "answer") {
+				hasTextDeltaAnswer = true
+			}
+		}
+	}
+	if !hasReasoningClose || !hasTextOpen || !hasTextDeltaAnswer {
+		t.Errorf("close path missing: reasoningClose=%v textOpen=%v ans=%v", hasReasoningClose, hasTextOpen, hasTextDeltaAnswer)
+	}
+}
+
+func TestStreamToResponses_NullFlushIdempotent(t *testing.T) {
+	state := NewResponsesStreamState()
+	text := "hi"
+	c1 := &dto.ChatCompletionsStreamResponse{
+		Id: "x", Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{Content: &text}},
+		},
+	}
+	_ = ChatCompletionsStreamToResponsesEvents(c1, state)
+	f1 := ChatCompletionsStreamToResponsesEvents(nil, state)
+	f2 := ChatCompletionsStreamToResponsesEvents(nil, state)
+	count := 0
+	for _, e := range f1 {
+		if e.Type == "response.completed" {
+			count++
+		}
+	}
+	for _, e := range f2 {
+		if e.Type == "response.completed" {
+			count++
+		}
+	}
+	if count != 1 {
+		t.Errorf("response.completed emitted %d times, want 1", count)
+	}
+}
+
+func TestStreamToResponses_ErrorMappedOnce(t *testing.T) {
+	state := NewResponsesStreamState()
+	ev1 := EmitChatStreamErrorEvent(state, "boom")
+	ev2 := EmitChatStreamErrorEvent(state, "boom")
+	if len(ev2) != 0 {
+		t.Errorf("second emit returned %d events", len(ev2))
+	}
+	count := 0
+	for _, e := range ev1 {
+		if e.Type == "response.failed" {
+			count++
+		}
+	}
+	if count != 1 {
+		t.Errorf("response.failed count=%d want 1", count)
+	}
+}
+
+func TestStreamToResponses_UsagePropagation(t *testing.T) {
+	state := NewResponsesStreamState()
+	text := "hi"
+	c1 := &dto.ChatCompletionsStreamResponse{
+		Id: "x", Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{Content: &text}},
+		},
+		Usage: &dto.Usage{
+			PromptTokens:     100,
+			CompletionTokens: 50,
+			TotalTokens:      150,
+			PromptTokensDetails: dto.InputTokenDetails{
+				CachedTokens:         30,
+				CachedCreationTokens: 20,
+			},
+		},
+	}
+	_ = ChatCompletionsStreamToResponsesEvents(c1, state)
+	flush := ChatCompletionsStreamToResponsesEvents(nil, state)
+	var completed map[string]any
+	for _, e := range flush {
+		if e.Type == "response.completed" {
+			completed = unmarshalEvent(t, e)
+		}
+	}
+	require.NotNil(t, completed)
+	resp, _ := completed["response"].(map[string]any)
+	usage, _ := resp["usage"].(map[string]any)
+	// input_tokens = 100 - 30 - 20 = 50
+	if u, _ := usage["input_tokens"].(float64); int(u) != 50 {
+		t.Errorf("input_tokens=%v want 50", usage["input_tokens"])
+	}
+	if u, _ := usage["output_tokens"].(float64); int(u) != 50 {
+		t.Errorf("output_tokens=%v want 50", usage["output_tokens"])
+	}
+	det, _ := usage["input_tokens_details"].(map[string]any)
+	require.NotNil(t, det)
+	if c, _ := det["cached_tokens"].(float64); int(c) != 30 {
+		t.Errorf("cached_tokens=%v want 30", det["cached_tokens"])
+	}
+}
+
+func TestResponsesAPIEvent_MarshalJSON_PayloadCannotShadowDedicatedFields(t *testing.T) {
+	ev := ResponsesAPIEvent{
+		Type:           "response.completed",
+		SequenceNumber: 42,
+		Payload: map[string]any{
+			"type":            "ATTACKER_OVERRIDE",
+			"sequence_number": 9999,
+			"response":        map[string]any{"id": "resp_1"},
+		},
+	}
+	raw, err := ev.MarshalJSON()
+	require.NoError(t, err)
+	var got map[string]any
+	require.NoError(t, common.Unmarshal(raw, &got))
+	require.Equal(t, "response.completed", got["type"], "dedicated type must win over payload key")
+	require.EqualValues(t, 42, got["sequence_number"], "dedicated sequence_number must win over payload key")
+	require.NotNil(t, got["response"], "non-conflicting payload keys must still be present")
+}
+
+func TestStreamToResponses_ErrorPreventsSubsequentCompleted(t *testing.T) {
+	state := NewResponsesStreamState()
+	// Drive at least one usable chunk so state.Started is true.
+	text := "Hi"
+	finish := ""
+	chunk := &dto.ChatCompletionsStreamResponse{
+		Id:    "abc",
+		Model: "claude-test",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{
+				Delta:        dto.ChatCompletionsStreamResponseChoiceDelta{Content: &text},
+				FinishReason: &finish,
+			},
+		},
+	}
+	_ = ChatCompletionsStreamToResponsesEvents(chunk, state)
+
+	// Now emit a failure.
+	errEvents := EmitChatStreamErrorEvent(state, "upstream blew up")
+	require.NotEmpty(t, errEvents)
+
+	// The flush MUST be a no-op now: no response.completed must follow.
+	flushEvents := ChatCompletionsStreamToResponsesEvents(nil, state)
+	for _, ev := range flushEvents {
+		require.NotEqual(t, "response.completed", ev.Type,
+			"response.completed must NOT fire after response.failed")
+	}
+}
+
+func TestStreamToResponses_ToolCloseBeforeTextAndReverse(t *testing.T) {
+	// Open text first, then tool_call: text must close before tool opens.
+	state := NewResponsesStreamState()
+	tx := "hello"
+	c1 := &dto.ChatCompletionsStreamResponse{
+		Id: "x", Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{Delta: dto.ChatCompletionsStreamResponseChoiceDelta{Content: &tx}},
+		},
+	}
+	_ = ChatCompletionsStreamToResponsesEvents(c1, state)
+	idx0 := 0
+	c2 := &dto.ChatCompletionsStreamResponse{
+		Id: "x", Model: "m",
+		Choices: []dto.ChatCompletionsStreamResponseChoice{
+			{
+				Delta: dto.ChatCompletionsStreamResponseChoiceDelta{
+					ToolCalls: []dto.ToolCallResponse{
+						{
+							Index:    &idx0,
+							ID:       "c1",
+							Type:     "function",
+							Function: dto.FunctionResponse{Name: "x"},
+						},
+					},
+				},
+			},
+		},
+	}
+	ev := ChatCompletionsStreamToResponsesEvents(c2, state)
+	idxTextDone := -1
+	idxToolAdded := -1
+	for i, e := range ev {
+		if e.Type == "response.output_text.done" && idxTextDone == -1 {
+			idxTextDone = i
+		}
+		if e.Type == "response.output_item.added" {
+			m := unmarshalEvent(t, e)
+			if item, ok := m["item"].(map[string]any); ok && item["type"] == "function_call" {
+				idxToolAdded = i
+			}
+		}
+	}
+	if idxTextDone == -1 || idxToolAdded == -1 || idxTextDone >= idxToolAdded {
+		t.Errorf("ordering wrong: textDone=%d toolAdded=%d", idxTextDone, idxToolAdded)
+	}
+}
diff --git a/service/openaicompat/chat_to_responses.go b/service/openaicompat/chat_to_responses.go
index 16096b88f59..bf749754465 100644
--- a/service/openaicompat/chat_to_responses.go
+++ b/service/openaicompat/chat_to_responses.go
@@ -400,3 +400,155 @@ func ChatCompletionsRequestToResponsesRequest(req *dto.GeneralOpenAIRequest) (*d
 
 	return out, nil
 }
+
+// ChatCompletionsResponseToResponsesResponse converts a non-streaming
+// Chat-Completions response (typically the result of the Anthropic adaptor
+// going through ResponseClaude2OpenAI) into a Responses-API response shape.
+//
+// It satisfies spec §6 (non-streaming): builds an `output[]` array containing
+// (optionally) a reasoning item, a message item, and function_call items —
+// in stable order.
status="completed", id="resp_", +// created_at=resp.Created, model=requestModel. Usage propagates per §5.9 +// (canonical cache token decomposition). +func ChatCompletionsResponseToResponsesResponse(resp *dto.OpenAITextResponse, requestModel string) (*dto.OpenAIResponsesResponse, error) { + if resp == nil { + return nil, errors.New("response is nil") + } + + respID := strings.TrimSpace(resp.Id) + if respID == "" { + respID = "chat" + } + if !strings.HasPrefix(respID, "resp_") { + respID = "resp_" + respID + } + + createdAt := 0 + switch v := resp.Created.(type) { + case int64: + createdAt = int(v) + case int: + createdAt = v + case float64: + createdAt = int(v) + } + + out := &dto.OpenAIResponsesResponse{ + ID: respID, + Object: "response", + CreatedAt: createdAt, + Model: requestModel, + } + + statusRaw, _ := common.Marshal("completed") + out.Status = statusRaw + + // Choose first choice (Chat-Completions guarantees at least one if non-error). + if len(resp.Choices) == 0 { + return out, nil + } + ch := resp.Choices[0] + + output := make([]dto.ResponsesOutput, 0, 4) + idBase := ResponsesIDBase(respID) + + // Reasoning item. + if rc := ch.Message.GetReasoningContent(); rc != "" { + output = append(output, dto.ResponsesOutput{ + Type: "reasoning", + ID: "rs_" + idBase, + Status: "completed", + Content: []dto.ResponsesOutputContent{ + {Type: "summary_text", Text: rc}, + }, + }) + } + + // Message item with text content. + text := "" + if ch.Message.IsStringContent() { + text = ch.Message.StringContent() + } else { + // Best effort: concat any text parts. + for _, part := range ch.Message.ParseContent() { + if part.Type == dto.ContentTypeText && part.Text != "" { + text += part.Text + } + } + } + if text != "" { + output = append(output, dto.ResponsesOutput{ + Type: "message", + ID: "msg_" + idBase, + Status: "completed", + Role: "assistant", + Content: []dto.ResponsesOutputContent{ + {Type: "output_text", Text: text}, + }, + }) + } + + // Function call items. 
+ fcAuto := 0 + for _, tc := range ch.Message.ParseToolCalls() { + if strings.TrimSpace(tc.Function.Name) == "" { + continue + } + argsRaw, _ := common.Marshal(tc.Function.Arguments) + fcItemID := tc.ID + if strings.TrimSpace(fcItemID) == "" { + fcAuto++ + fcItemID = fmt.Sprintf("%s_%d", idBase, fcAuto) + } + if !strings.HasPrefix(fcItemID, "fc_") { + fcItemID = "fc_" + fcItemID + } + output = append(output, dto.ResponsesOutput{ + Type: "function_call", + ID: fcItemID, + Status: "completed", + CallId: tc.ID, + Name: tc.Function.Name, + Arguments: argsRaw, + }) + } + out.Output = output + + // Usage mapping per spec §5.9. + usage := &dto.Usage{} + usage.PromptTokens = resp.Usage.PromptTokens + usage.CompletionTokens = resp.Usage.CompletionTokens + usage.TotalTokens = resp.Usage.TotalTokens + if usage.TotalTokens == 0 { + usage.TotalTokens = usage.PromptTokens + usage.CompletionTokens + } + usage.InputTokens = resp.Usage.PromptTokens + usage.OutputTokens = resp.Usage.CompletionTokens + if resp.Usage.PromptTokensDetails.CachedTokens > 0 || resp.Usage.PromptTokensDetails.CachedCreationTokens > 0 { + usage.InputTokensDetails = &dto.InputTokenDetails{ + CachedTokens: resp.Usage.PromptTokensDetails.CachedTokens, + CachedCreationTokens: resp.Usage.PromptTokensDetails.CachedCreationTokens, + } + usage.PromptTokensDetails = resp.Usage.PromptTokensDetails + } + if resp.Usage.CompletionTokenDetails.ReasoningTokens > 0 { + usage.CompletionTokenDetails.ReasoningTokens = resp.Usage.CompletionTokenDetails.ReasoningTokens + } + // Canonical decomposition: input_tokens = max(0, prompt − cached − cache_creation). + cached := resp.Usage.PromptTokensDetails.CachedTokens + cacheCreation := resp.Usage.PromptTokensDetails.CachedCreationTokens + inputDecomp := usage.PromptTokens - cached - cacheCreation + if inputDecomp < 0 { + inputDecomp = 0 + } + usage.InputTokens = inputDecomp + out.Usage = usage + + // incomplete_details mapping per spec §6.4. 
+ switch ch.FinishReason { + case "length": + out.IncompleteDetails = &dto.IncompleteDetails{Reasoning: "max_output_tokens"} + } + + return out, nil +} diff --git a/service/openaicompat/chat_to_responses_test.go b/service/openaicompat/chat_to_responses_test.go new file mode 100644 index 00000000000..1047dae738d --- /dev/null +++ b/service/openaicompat/chat_to_responses_test.go @@ -0,0 +1,191 @@ +package openaicompat + +import ( + "testing" + + "github.com/QuantumNous/new-api/dto" + "github.com/stretchr/testify/require" +) + +func TestChatToResponses_TextOnly(t *testing.T) { + msg := dto.Message{Role: "assistant"} + msg.SetStringContent("answer") + resp := &dto.OpenAITextResponse{ + Id: "abc", + Object: "chat.completion", + Created: int64(123), + Model: "claude", + Choices: []dto.OpenAITextResponseChoice{ + {Index: 0, Message: msg, FinishReason: "stop"}, + }, + Usage: dto.Usage{PromptTokens: 10, CompletionTokens: 5, TotalTokens: 15}, + } + out, err := ChatCompletionsResponseToResponsesResponse(resp, "claude") + require.NoError(t, err) + if out.ID != "resp_abc" { + t.Errorf("id=%q", out.ID) + } + require.Len(t, out.Output, 1) + if out.Output[0].Type != "message" { + t.Errorf("output type=%q", out.Output[0].Type) + } + require.Len(t, out.Output[0].Content, 1) + if out.Output[0].Content[0].Text != "answer" { + t.Errorf("text=%q", out.Output[0].Content[0].Text) + } +} + +func TestChatToResponses_ToolCall(t *testing.T) { + msg := dto.Message{Role: "assistant"} + msg.SetToolCalls([]dto.ToolCallRequest{ + {ID: "c1", Type: "function", Function: dto.FunctionRequest{Name: "search", Arguments: `{"q":"x"}`}}, + }) + resp := &dto.OpenAITextResponse{ + Id: "abc", + Object: "chat.completion", + Created: int64(1), + Model: "m", + Choices: []dto.OpenAITextResponseChoice{ + {Index: 0, Message: msg, FinishReason: "tool_calls"}, + }, + } + out, err := ChatCompletionsResponseToResponsesResponse(resp, "m") + require.NoError(t, err) + hasFc := false + for _, o := range out.Output { + if 
o.Type == "function_call" { + hasFc = true + if o.Name != "search" { + t.Errorf("name=%q", o.Name) + } + if o.CallId != "c1" { + t.Errorf("call_id=%q", o.CallId) + } + } + } + if !hasFc { + t.Errorf("missing function_call: %+v", out.Output) + } +} + +func TestChatToResponses_ReasoningOnly(t *testing.T) { + reasoning := "thinking" + msg := dto.Message{Role: "assistant", ReasoningContent: &reasoning} + msg.SetStringContent("") + resp := &dto.OpenAITextResponse{ + Id: "abc", + Object: "chat.completion", + Created: int64(1), + Model: "m", + Choices: []dto.OpenAITextResponseChoice{ + {Index: 0, Message: msg, FinishReason: "stop"}, + }, + } + out, err := ChatCompletionsResponseToResponsesResponse(resp, "m") + require.NoError(t, err) + hasReasoning := false + for _, o := range out.Output { + if o.Type == "reasoning" { + hasReasoning = true + require.NotEmpty(t, o.Content) + if o.Content[0].Text != "thinking" { + t.Errorf("reasoning text=%q", o.Content[0].Text) + } + } + } + if !hasReasoning { + t.Errorf("missing reasoning: %+v", out.Output) + } +} + +func TestChatToResponses_LengthMarksIncomplete(t *testing.T) { + msg := dto.Message{Role: "assistant"} + msg.SetStringContent("abc") + resp := &dto.OpenAITextResponse{ + Id: "abc", + Object: "chat.completion", + Created: int64(1), + Model: "m", + Choices: []dto.OpenAITextResponseChoice{ + {Index: 0, Message: msg, FinishReason: "length"}, + }, + } + out, err := ChatCompletionsResponseToResponsesResponse(resp, "m") + require.NoError(t, err) + require.NotNil(t, out.IncompleteDetails) + if out.IncompleteDetails.Reasoning != "max_output_tokens" { + t.Errorf("incomplete reason=%q", out.IncompleteDetails.Reasoning) + } +} + +func TestChatToResponses_UsageDecomposition(t *testing.T) { + msg := dto.Message{Role: "assistant"} + msg.SetStringContent("ok") + resp := &dto.OpenAITextResponse{ + Id: "abc", + Object: "chat.completion", + Created: int64(1), + Model: "m", + Choices: []dto.OpenAITextResponseChoice{ + {Index: 0, Message: msg, 
FinishReason: "stop"}, + }, + Usage: dto.Usage{ + PromptTokens: 100, + CompletionTokens: 50, + TotalTokens: 150, + PromptTokensDetails: dto.InputTokenDetails{ + CachedTokens: 30, + CachedCreationTokens: 20, + }, + }, + } + out, err := ChatCompletionsResponseToResponsesResponse(resp, "m") + require.NoError(t, err) + require.NotNil(t, out.Usage) + // input_tokens = 100 - 30 - 20 = 50 + if out.Usage.InputTokens != 50 { + t.Errorf("input_tokens=%d want 50", out.Usage.InputTokens) + } + if out.Usage.OutputTokens != 50 { + t.Errorf("output_tokens=%d want 50", out.Usage.OutputTokens) + } + require.NotNil(t, out.Usage.InputTokensDetails) + if out.Usage.InputTokensDetails.CachedTokens != 30 { + t.Errorf("cached=%d want 30", out.Usage.InputTokensDetails.CachedTokens) + } +} + +func TestChatToResponses_MixedReasoningTextToolCall(t *testing.T) { + reasoning := "let me think" + msg := dto.Message{Role: "assistant", ReasoningContent: &reasoning} + msg.SetStringContent("partial") + msg.SetToolCalls([]dto.ToolCallRequest{ + {ID: "c1", Type: "function", Function: dto.FunctionRequest{Name: "f", Arguments: "{}"}}, + }) + resp := &dto.OpenAITextResponse{ + Id: "abc", Object: "chat.completion", Created: int64(1), Model: "m", + Choices: []dto.OpenAITextResponseChoice{ + {Index: 0, Message: msg, FinishReason: "tool_calls"}, + }, + } + out, err := ChatCompletionsResponseToResponsesResponse(resp, "m") + require.NoError(t, err) + types := make([]string, 0) + for _, o := range out.Output { + types = append(types, o.Type) + } + hasR, hasM, hasF := false, false, false + for _, t2 := range types { + switch t2 { + case "reasoning": + hasR = true + case "message": + hasM = true + case "function_call": + hasF = true + } + } + if !hasR || !hasM || !hasF { + t.Errorf("expected all three output items, got %v", types) + } +} diff --git a/service/openaicompat/responses_stream_state.go b/service/openaicompat/responses_stream_state.go new file mode 100644 index 00000000000..0531ae67526 --- /dev/null +++ 
// ResponsesStreamState holds the per-stream bookkeeping required by the
// ChatCompletions -> Responses streaming translator. It is intentionally
// agnostic of the SSE transport.
//
// A state is created with NewResponsesStreamState and then passed to every
// translator call for the same stream.
type ResponsesStreamState struct {
	// seq is the running sequence-number counter; NextSeq returns the next
	// value, starting from 1.
	seq int64

	// ResponseID is the Responses-API response.id ("resp_..." prefix).
	ResponseID string
	// CreatedAt is the Unix timestamp captured on the first usable chunk.
	CreatedAt int64

	// Started indicates we've already emitted response.created.
	Started bool
	// InProgressSent indicates we've already emitted response.in_progress.
	InProgressSent bool
	// CompletedSent indicates we've already emitted response.completed.
	CompletedSent bool

	// Message output_item lifecycle.
	// NOTE(review): the four fields below are not described in this file;
	// presumably the *Open flags track whether the message item / its content
	// part is currently open and the *Index fields hold the positions used in
	// emitted events. Confirm against the streaming translator.
	MessageItemOpen        bool
	MessageItemIndex       int
	MessageContentPartOpen bool
	MessageOutputIndex     int
	// MessageItemID is the id of the currently-open message item ("msg_..."),
	// referenced by all content_part.* and output_text.* events that belong to
	// it. Cleared when the message item closes.
	MessageItemID string
	// MessageItemCount tracks how many message items have been opened in this
	// stream, so that subsequent reopens (e.g. after an interleaved think tag)
	// get unique ids.
	MessageItemCount int

	// Reasoning output_item lifecycle.
	// NOTE(review): presumably these mirror the message lifecycle fields above
	// for the reasoning item and its summary part. Confirm against the
	// streaming translator.
	ReasoningItemOpen        bool
	ReasoningItemIndex       int
	ReasoningSummaryPartOpen bool
	// ReasoningItemID is the id of the currently-open reasoning item
	// ("rs_..."), referenced by all reasoning_summary_* events. Cleared when
	// the reasoning item closes.
	ReasoningItemID string
	// ReasoningItemCount mirrors MessageItemCount for reasoning items.
	ReasoningItemCount int

	// FuncCalls is keyed by the chunk tool_call index.
	FuncCalls map[int]*ResponsesStreamFuncCall

	// InThinkInlineTag is true while reasoning is being routed via the
	// inline think-tag marker (presumably <think>...</think>; the tag
	// literals are defined by the translator, not in this file).
	InThinkInlineTag bool

	// PendingTagBuffer holds a trailing chunk fragment that could still grow
	// into a complete opening or closing think tag once the next chunk
	// arrives. It is bounded by the longest possible partial-tag length so
	// memory growth is constant. Always flushed at EOS.
	PendingTagBuffer string

	// Usage accumulates the latest usage seen on stream completion.
	Usage *ResponsesUsageSnapshot

	// Model is the upstream model echoed back to the client.
	Model string

	// FinalFinishReason is the last finish_reason observed on the chat stream.
	FinalFinishReason string

	// ErrorEmitted ensures the error chunk path is idempotent.
	ErrorEmitted bool

	// ItemIndex is a running output_index counter for output_item.added/done.
	ItemIndex int
}
+func NewResponsesStreamState() *ResponsesStreamState { + return &ResponsesStreamState{ + FuncCalls: map[int]*ResponsesStreamFuncCall{}, + Usage: &ResponsesUsageSnapshot{}, + } +} + +// NextSeq increments the sequence counter and returns the new value. +func (s *ResponsesStreamState) NextSeq() int64 { + s.seq++ + return s.seq +} diff --git a/service/openaicompat/responses_to_chat.go b/service/openaicompat/responses_to_chat.go index d1c7473fe8a..844d6c819c1 100644 --- a/service/openaicompat/responses_to_chat.go +++ b/service/openaicompat/responses_to_chat.go @@ -2,8 +2,10 @@ package openaicompat import ( "errors" + "fmt" "strings" + "github.com/QuantumNous/new-api/common" "github.com/QuantumNous/new-api/dto" ) @@ -131,3 +133,540 @@ func ExtractOutputTextFromResponses(resp *dto.OpenAIResponsesResponse) string { } return sb.String() } + +// ResponsesRequestToChatCompletionsRequest translates the Responses-API shape +// into a Chat-Completions intermediate that can then be re-translated by the +// existing Chat -> Anthropic converter. +// +// It implements spec sections §3 through §10: +// - input-shape normalization (string / empty / array / non-string-non-array) +// - instructions lifting +// - role-only fallback for item type +// - message content normalization (input_text/output_text/input_image) +// - function_call buffering into assistant tool_calls +// - function_call_output -> role: "tool" with stringified non-string output +// - reasoning item buffering -> attached as reasoning_content to next assistant +// - tool declaration conversion (both Chat-Completions-shaped and Responses-flat) +// - Responses-only field cleanup +// - reasoning_effort carry +// - text.format -> response_format carry +// +// Any other input shape (number, object) returns an error so the caller can +// decide whether to fall back to the existing adaptor stub. 
+func ResponsesRequestToChatCompletionsRequest(req *dto.OpenAIResponsesRequest) (*dto.GeneralOpenAIRequest, error) { + if req == nil { + return nil, errors.New("request is nil") + } + + out := &dto.GeneralOpenAIRequest{ + Model: req.Model, + Stream: req.Stream, + Temperature: req.Temperature, + TopP: req.TopP, + User: req.User, + Metadata: req.Metadata, + Store: req.Store, + } + // max_output_tokens -> max_tokens (the field the Claude converter consumes). + if req.MaxOutputTokens != nil { + mt := *req.MaxOutputTokens + out.MaxTokens = &mt + } + + // reasoning.effort carry-through. + if req.Reasoning != nil && strings.TrimSpace(req.Reasoning.Effort) != "" { + out.ReasoningEffort = req.Reasoning.Effort + } + + // text.format -> response_format. text JSON shape can be either + // { "format": { "type": "json_object" } } + // or + // { "format": { "type": "json_schema", "json_schema": {...} } } + // or + // { "format": { "type": "json_schema", "name": ..., "schema": ... } } (flat) + if len(req.Text) > 0 { + var textObj map[string]any + if err := common.Unmarshal(req.Text, &textObj); err == nil { + if fmtAny, ok := textObj["format"]; ok { + if fmtMap, ok := fmtAny.(map[string]any); ok { + rf := &dto.ResponseFormat{} + if t, _ := fmtMap["type"].(string); t != "" { + rf.Type = t + } + if rf.Type == "json_schema" { + if schema, ok := fmtMap["json_schema"]; ok { + if b, err := common.Marshal(schema); err == nil { + rf.JsonSchema = b + } + } else { + // Flat shape: merge name/schema/strict/description into a json_schema object. 
+ flat := map[string]any{} + for k, v := range fmtMap { + if k == "type" { + continue + } + flat[k] = v + } + if len(flat) > 0 { + if b, err := common.Marshal(flat); err == nil { + rf.JsonSchema = b + } + } + } + } + if rf.Type != "" { + out.ResponseFormat = rf + } + } + } + } + } + + // ----- Tool declarations ----- + if len(req.Tools) > 0 { + var toolsRaw []map[string]any + if err := common.Unmarshal(req.Tools, &toolsRaw); err == nil { + converted := make([]dto.ToolCallRequest, 0, len(toolsRaw)) + for _, t := range toolsRaw { + if t == nil { + continue + } + toolType, _ := t["type"].(string) + if toolType == "" { + toolType = "function" + } + // Already Chat-Completions shape (has "function" key)? + if fnAny, ok := t["function"]; ok { + fnMap, _ := fnAny.(map[string]any) + name, _ := fnMap["name"].(string) + if strings.TrimSpace(name) == "" { + continue + } + params := normalizeToolParameters(fnMap["parameters"]) + desc, _ := fnMap["description"].(string) + converted = append(converted, dto.ToolCallRequest{ + Type: "function", + Function: dto.FunctionRequest{ + Name: name, + Description: desc, + Parameters: params, + }, + }) + continue + } + if toolType == "function" { + name, _ := t["name"].(string) + if strings.TrimSpace(name) == "" { + continue + } + params := normalizeToolParameters(t["parameters"]) + desc, _ := t["description"].(string) + converted = append(converted, dto.ToolCallRequest{ + Type: "function", + Function: dto.FunctionRequest{ + Name: name, + Description: desc, + Parameters: params, + }, + }) + continue + } + // Hosted / non-function tool with no name => drop silently. + if name, _ := t["name"].(string); strings.TrimSpace(name) == "" { + continue + } + // Preserve hosted tool with name as a custom tool stub. We + // pass-through here using the raw map; the downstream Claude + // converter only recognises `function` types and ignores + // others, which keeps backwards behavior intact. 
+ if b, err := common.Marshal(t); err == nil { + var stub dto.ToolCallRequest + _ = common.Unmarshal(b, &stub) + if stub.Type == "" { + stub.Type = toolType + } + converted = append(converted, stub) + } + } + if len(converted) > 0 { + out.Tools = converted + } + } + } + + // tool_choice pass-through (raw JSON -> any). + if len(req.ToolChoice) > 0 { + var any2 any + if err := common.Unmarshal(req.ToolChoice, &any2); err == nil { + // If the Responses-style {"type":"function","name":"x"} shape arrives, + // reshape to Chat-Completions {"type":"function","function":{"name":"x"}}. + if m, ok := any2.(map[string]any); ok { + if t, _ := m["type"].(string); t == "function" { + if _, has := m["function"]; !has { + if name, _ := m["name"].(string); name != "" { + any2 = map[string]any{ + "type": "function", + "function": map[string]any{"name": name}, + } + } + } + } + } + out.ToolChoice = any2 + } + } + + // parallel_tool_calls pass-through. + if len(req.ParallelToolCalls) > 0 { + var b bool + if err := common.Unmarshal(req.ParallelToolCalls, &b); err == nil { + out.ParallelTooCalls = &b + } + } + + // ----- Input normalization ----- + // instructions => leading system message. + if len(req.Instructions) > 0 { + var instr string + if err := common.Unmarshal(req.Instructions, &instr); err == nil { + if strings.TrimSpace(instr) != "" { + out.Messages = append(out.Messages, dto.Message{ + Role: "system", + Content: instr, + }) + } + } + } + + // Parse the input field. + var inputItems []map[string]any + if req.Input == nil || len(req.Input) == 0 { + // Treat absent input as empty -> placeholder user message. + inputItems = []map[string]any{ + { + "type": "message", + "role": "user", + "content": []map[string]any{{"type": "input_text", "text": "..."}}, + }, + } + } else { + switch common.GetJsonType(req.Input) { + case "string": + var s string + _ = common.Unmarshal(req.Input, &s) + if strings.TrimSpace(s) == "" { + s = "..." 
+ } + inputItems = []map[string]any{ + { + "type": "message", + "role": "user", + "content": []map[string]any{{"type": "input_text", "text": s}}, + }, + } + case "array": + if err := common.Unmarshal(req.Input, &inputItems); err != nil { + return nil, fmt.Errorf("input array unmarshal: %w", err) + } + if len(inputItems) == 0 { + inputItems = []map[string]any{ + { + "type": "message", + "role": "user", + "content": []map[string]any{{"type": "input_text", "text": "..."}}, + }, + } + } + default: + // Per spec §3, return error so caller can fall through. + return nil, fmt.Errorf("unsupported input shape: %s", common.GetJsonType(req.Input)) + } + } + + // Convert items, with buffering for reasoning and consecutive function_calls. + var reasoningBuf []string + flushReasoningInto := func(msg *dto.Message) { + if len(reasoningBuf) == 0 { + return + } + s := strings.Join(reasoningBuf, "\n") + reasoningBuf = nil + msg.ReasoningContent = &s + } + + // Pending assistant tool_calls accumulator (so consecutive function_calls + // collapse into one assistant message). + var pendingAssistantToolCalls []dto.ToolCallRequest + flushAssistantToolCalls := func() { + if len(pendingAssistantToolCalls) == 0 { + return + } + msg := dto.Message{ + Role: "assistant", + } + msg.SetNullContent() + flushReasoningInto(&msg) + msg.SetToolCalls(pendingAssistantToolCalls) + out.Messages = append(out.Messages, msg) + pendingAssistantToolCalls = nil + } + + for _, item := range inputItems { + if item == nil { + continue + } + itemType, _ := item["type"].(string) + role, _ := item["role"].(string) + if itemType == "" && role != "" { + itemType = "message" + } + if itemType == "" { + // Neither type nor role -> skip per spec §5. + continue + } + + switch itemType { + case "message": + flushAssistantToolCalls() + msg := dto.Message{Role: role} + if msg.Role == "" { + msg.Role = "user" + } + // Content can be string or array. 
+ contentAny, hasContent := item["content"] + if !hasContent { + msg.Content = "" + } else { + // Normalize to []any so we can walk it uniformly regardless of + // whether it came from JSON unmarshal ([]any) or from in-process + // construction ([]map[string]any). + var parts []any + switch cv := contentAny.(type) { + case string: + msg.Content = cv + parts = nil + case []any: + parts = cv + case []map[string]any: + parts = make([]any, len(cv)) + for i := range cv { + parts[i] = cv[i] + } + } + if parts != nil { + mc := convertResponsesContentParts(parts) + if len(mc) == 0 { + msg.Content = "" + } else if len(mc) == 1 && mc[0].Type == dto.ContentTypeText { + msg.Content = mc[0].Text + } else { + out2 := make([]any, 0, len(mc)) + for _, p := range mc { + pm := map[string]any{"type": p.Type} + switch p.Type { + case dto.ContentTypeText: + pm["text"] = p.Text + case dto.ContentTypeImageURL: + pm["image_url"] = p.ImageUrl + } + out2 = append(out2, pm) + } + msg.Content = out2 + } + } + } + if msg.Role == "assistant" { + flushReasoningInto(&msg) + } + out.Messages = append(out.Messages, msg) + + case "function_call": + name, _ := item["name"].(string) + if strings.TrimSpace(name) == "" { + continue + } + callID, _ := item["call_id"].(string) + argsStr := "" + if raw, ok := item["arguments"]; ok { + switch av := raw.(type) { + case string: + argsStr = av + default: + if b, err := common.Marshal(av); err == nil { + argsStr = string(b) + } + } + } + pendingAssistantToolCalls = append(pendingAssistantToolCalls, dto.ToolCallRequest{ + ID: callID, + Type: "function", + Function: dto.FunctionRequest{ + Name: name, + Arguments: argsStr, + }, + }) + + case "function_call_output": + flushAssistantToolCalls() + callID, _ := item["call_id"].(string) + outputAny := item["output"] + var output string + switch ov := outputAny.(type) { + case string: + output = ov + default: + if b, err := common.Marshal(ov); err == nil { + output = string(b) + } else { + output = fmt.Sprintf("%v", ov) 
// normalizeToolParameters returns a tool parameter schema that is safe to
// forward: a nil schema becomes an empty object schema, and a map schema whose
// "type" is "object" (compared case-insensitively) gains an empty "properties"
// map when it has none. Map schemas are updated in place and returned; any
// other value is returned untouched.
func normalizeToolParameters(params any) any {
	schema, isMap := params.(map[string]any)
	switch {
	case params == nil:
		return map[string]any{
			"type":       "object",
			"properties": map[string]any{},
		}
	case !isMap:
		return params
	}

	typeName, _ := schema["type"].(string)
	if !strings.EqualFold(typeName, "object") {
		return schema
	}
	if _, present := schema["properties"]; !present {
		schema["properties"] = map[string]any{}
	}
	return schema
}
// extractReasoningItemText returns the text carried by a reasoning input item.
// The non-empty "text" fields of the item's "summary" entries are joined with
// "\n"; when that yields nothing, the same is done for its "content" entries.
// A nil item, or one with no usable text in either list, yields "".
func extractReasoningItemText(item map[string]any) string {
	if item == nil {
		return ""
	}

	// joinTexts collects the non-empty "text" strings of a []any of objects,
	// skipping entries that are not objects, and joins them with newlines.
	joinTexts := func(raw any) string {
		entries, isList := raw.([]any)
		if !isList {
			return ""
		}
		texts := make([]string, 0, len(entries))
		for _, entry := range entries {
			obj, isObj := entry.(map[string]any)
			if !isObj {
				continue
			}
			if s, _ := obj["text"].(string); s != "" {
				texts = append(texts, s)
			}
		}
		return strings.Join(texts, "\n")
	}

	// "summary" takes priority over "content".
	for _, key := range [...]string{"summary", "content"} {
		if joined := joinTexts(item[key]); joined != "" {
			return joined
		}
	}
	return ""
}
return &req +} + +func TestResponsesToChat_StringInputWrapsAsUserMessage(t *testing.T) { + req := newResponsesReq(t, map[string]any{"model": "claude-3", "input": "hello"}) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + if chat.Messages[0].Role != "user" { + t.Errorf("role=%q want user", chat.Messages[0].Role) + } + if chat.Messages[0].StringContent() != "hello" { + t.Errorf("content=%q want hello", chat.Messages[0].StringContent()) + } +} + +func TestResponsesToChat_EmptyStringInputPlaceholder(t *testing.T) { + req := newResponsesReq(t, map[string]any{"model": "x", "input": ""}) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + if chat.Messages[0].StringContent() != "..." { + t.Errorf("placeholder=%q want ...", chat.Messages[0].StringContent()) + } +} + +func TestResponsesToChat_EmptyArrayPlaceholder(t *testing.T) { + req := newResponsesReq(t, map[string]any{"model": "x", "input": []any{}}) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + if chat.Messages[0].StringContent() != "..." 
{ + t.Errorf("placeholder=%q want ...", chat.Messages[0].StringContent()) + } +} + +func TestResponsesToChat_NonStringNonArrayReturnsError(t *testing.T) { + req := newResponsesReq(t, map[string]any{"model": "x", "input": 42}) + _, err := ResponsesRequestToChatCompletionsRequest(req) + if err == nil { + t.Errorf("expected error for numeric input") + } +} + +func TestResponsesToChat_InstructionsLifted(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": "hi", + "instructions": "You are helpful.", + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.GreaterOrEqual(t, len(chat.Messages), 2) + if chat.Messages[0].Role != "system" { + t.Errorf("first role=%q want system", chat.Messages[0].Role) + } + if chat.Messages[0].StringContent() != "You are helpful." { + t.Errorf("system content=%q", chat.Messages[0].StringContent()) + } +} + +func TestResponsesToChat_EmptyInstructionsSkipped(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": "hi", + "instructions": "", + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + for _, m := range chat.Messages { + if m.Role == "system" { + t.Errorf("system message present when instructions empty") + } + } +} + +func TestResponsesToChat_RoleOnlyFallbackAndSkipUnknown(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{"role": "user", "content": []any{ + map[string]any{"type": "input_text", "text": "hi"}, + }}, + map[string]any{"foo": "bar"}, // skipped + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + if chat.Messages[0].StringContent() != "hi" { + t.Errorf("got=%q want hi", chat.Messages[0].StringContent()) + } +} + +func TestResponsesToChat_OutputTextBecomesText(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + 
"input": []any{ + map[string]any{"role": "assistant", "content": []any{ + map[string]any{"type": "output_text", "text": "answer"}, + }}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + if chat.Messages[0].StringContent() != "answer" { + t.Errorf("got=%q want answer", chat.Messages[0].StringContent()) + } +} + +func TestResponsesToChat_InputImageWithURL(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{"role": "user", "content": []any{ + map[string]any{ + "type": "input_image", + "image_url": "https://example.com/a.png", + "detail": "high", + }, + }}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + parts := chat.Messages[0].ParseContent() + require.Len(t, parts, 1) + if parts[0].Type != dto.ContentTypeImageURL { + t.Errorf("type=%q want image_url", parts[0].Type) + } +} + +func TestResponsesToChat_InputImageWithFileID(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{"role": "user", "content": []any{ + map[string]any{"type": "input_image", "file_id": "file_abc"}, + }}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + parts := chat.Messages[0].ParseContent() + require.Len(t, parts, 1) + if parts[0].Type != dto.ContentTypeImageURL { + t.Errorf("type=%q want image_url", parts[0].Type) + } +} + +// MINOR-2: input_image with neither image_url nor file_id should still be +// emitted as an image_url part (with empty url and detail="auto") so the +// downstream converter can decide how to handle it. 
+func TestResponsesToChat_InputImageWithNeitherURLNorFileID(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{"role": "user", "content": []any{ + map[string]any{"type": "input_image"}, + }}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + parts := chat.Messages[0].ParseContent() + require.Len(t, parts, 1) + require.Equal(t, dto.ContentTypeImageURL, parts[0].Type) + + imageURL := parts[0].GetImageMedia() + require.NotNil(t, imageURL, "expected image_url to be parseable") + require.Equal(t, "", imageURL.Url) + require.Equal(t, "auto", imageURL.Detail) +} + +func TestResponsesToChat_FunctionCallBecomesAssistantToolCalls(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{ + "type": "function_call", + "call_id": "c1", + "name": "search", + "arguments": `{"q":"x"}`, + }, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + if chat.Messages[0].Role != "assistant" { + t.Errorf("role=%q want assistant", chat.Messages[0].Role) + } + calls := chat.Messages[0].ParseToolCalls() + require.Len(t, calls, 1) + if calls[0].ID != "c1" || calls[0].Function.Name != "search" { + t.Errorf("call mismatch: id=%q name=%q", calls[0].ID, calls[0].Function.Name) + } + if calls[0].Function.Arguments != `{"q":"x"}` { + t.Errorf("args=%q", calls[0].Function.Arguments) + } +} + +func TestResponsesToChat_FunctionCallEmptyNameDropped(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{"type": "function_call", "call_id": "c1", "name": "", "arguments": "{}"}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Empty(t, chat.Messages) +} + +func TestResponsesToChat_FunctionCallOutputBecomesToolMessage(t 
*testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{"type": "function_call_output", "call_id": "c1", "output": "result text"}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + if chat.Messages[0].Role != "tool" || chat.Messages[0].ToolCallId != "c1" { + t.Errorf("tool msg mismatch: role=%q id=%q", chat.Messages[0].Role, chat.Messages[0].ToolCallId) + } + if chat.Messages[0].StringContent() != "result text" { + t.Errorf("content=%q", chat.Messages[0].StringContent()) + } +} + +func TestResponsesToChat_FunctionCallOutputObjectStringified(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{"type": "function_call_output", "call_id": "c1", "output": map[string]any{"ok": true, "n": 7}}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + c := chat.Messages[0].StringContent() + if !strings.Contains(c, `"ok":true`) || !strings.Contains(c, `"n":7`) { + t.Errorf("content=%q want JSON-stringified", c) + } +} + +func TestResponsesToChat_FunctionCallFlushesBeforeOutput(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{ + "type": "function_call", + "call_id": "c1", + "name": "search", + "arguments": "{}", + }, + map[string]any{"type": "function_call_output", "call_id": "c1", "output": "r"}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 2) + if chat.Messages[0].Role != "assistant" { + t.Errorf("first role=%q", chat.Messages[0].Role) + } + if chat.Messages[1].Role != "tool" { + t.Errorf("second role=%q", chat.Messages[1].Role) + } +} + +func TestResponsesToChat_ReasoningAttachedToNextAssistant(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + 
"model": "x", + "input": []any{ + map[string]any{"type": "reasoning", "summary": []any{ + map[string]any{"text": "thinking step 1"}, + }}, + map[string]any{"type": "message", "role": "assistant", "content": []any{ + map[string]any{"type": "output_text", "text": "answer"}, + }}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + m := chat.Messages[0] + if m.GetReasoningContent() != "thinking step 1" { + t.Errorf("reasoning=%q", m.GetReasoningContent()) + } +} + +func TestResponsesToChat_ReasoningContentFallback(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{"type": "reasoning", "content": []any{ + map[string]any{"text": "alt thinking"}, + }}, + map[string]any{"type": "message", "role": "assistant", "content": "ok"}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + if chat.Messages[0].GetReasoningContent() != "alt thinking" { + t.Errorf("reasoning=%q", chat.Messages[0].GetReasoningContent()) + } +} + +func TestResponsesToChat_MultipleReasoningJoined(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{"type": "reasoning", "summary": []any{map[string]any{"text": "a"}}}, + map[string]any{"type": "reasoning", "summary": []any{map[string]any{"text": "b"}}}, + map[string]any{"type": "message", "role": "assistant", "content": "ok"}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 1) + if chat.Messages[0].GetReasoningContent() != "a\nb" { + t.Errorf("reasoning=%q want a\\nb", chat.Messages[0].GetReasoningContent()) + } +} + +func TestResponsesToChat_ReasoningBufferCleared(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": []any{ + map[string]any{"type": "reasoning", 
"summary": []any{map[string]any{"text": "r"}}}, + map[string]any{"type": "message", "role": "assistant", "content": "first"}, + map[string]any{"type": "message", "role": "assistant", "content": "second"}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Messages, 2) + if chat.Messages[0].GetReasoningContent() == "" { + t.Errorf("first message should carry reasoning") + } + if chat.Messages[1].GetReasoningContent() != "" { + t.Errorf("second message should not have reasoning, got=%q", chat.Messages[1].GetReasoningContent()) + } +} + +func TestResponsesToChat_ToolDeclarationFlatConverted(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": "hi", + "tools": []any{ + map[string]any{ + "type": "function", + "name": "search", + "description": "find", + "parameters": map[string]any{"type": "object", "properties": map[string]any{}}, + }, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Tools, 1) + if chat.Tools[0].Function.Name != "search" { + t.Errorf("name=%q", chat.Tools[0].Function.Name) + } +} + +func TestResponsesToChat_ToolDeclarationChatShapePassThrough(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": "hi", + "tools": []any{ + map[string]any{ + "type": "function", + "function": map[string]any{ + "name": "search", + "description": "find", + "parameters": map[string]any{"type": "object"}, + }, + }, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Len(t, chat.Tools, 1) + if chat.Tools[0].Function.Name != "search" { + t.Errorf("name=%q", chat.Tools[0].Function.Name) + } + // Parameters should have been normalized. 
+ m, ok := chat.Tools[0].Function.Parameters.(map[string]any) + require.True(t, ok) + if _, has := m["properties"]; !has { + t.Errorf("properties not normalized: %+v", m) + } +} + +func TestResponsesToChat_NamelessToolDropped(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": "hi", + "tools": []any{ + map[string]any{"type": "request_user_input"}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.Empty(t, chat.Tools) +} + +func TestResponsesToChat_ReasoningEffortCarry(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": "hi", + "reasoning": map[string]any{"effort": "high"}, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + if chat.ReasoningEffort != "high" { + t.Errorf("reasoning_effort=%q", chat.ReasoningEffort) + } +} + +func TestResponsesToChat_ResponseFormatJSONObject(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": "hi", + "text": map[string]any{ + "format": map[string]any{"type": "json_object"}, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.NotNil(t, chat.ResponseFormat) + if chat.ResponseFormat.Type != "json_object" { + t.Errorf("response_format.type=%q", chat.ResponseFormat.Type) + } +} + +func TestResponsesToChat_ResponseFormatJSONSchema(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": "hi", + "text": map[string]any{ + "format": map[string]any{ + "type": "json_schema", + "json_schema": map[string]any{"schema": map[string]any{"type": "object"}}, + }, + }, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.NotNil(t, chat.ResponseFormat) + if chat.ResponseFormat.Type != "json_schema" { + t.Errorf("response_format.type=%q", chat.ResponseFormat.Type) + } + var got map[string]any + require.NoError(t, 
json.Unmarshal(chat.ResponseFormat.JsonSchema, &got)) + if _, has := got["schema"]; !has { + t.Errorf("schema not preserved: %+v", got) + } +} + +func TestResponsesToChat_ToolChoiceFlatToChatShape(t *testing.T) { + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": "hi", + "tool_choice": map[string]any{"type": "function", "name": "search"}, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + require.NotNil(t, chat.ToolChoice) + m, ok := chat.ToolChoice.(map[string]any) + require.True(t, ok) + if fn, ok := m["function"].(map[string]any); !ok || fn["name"] != "search" { + t.Errorf("tool_choice did not reshape: %+v", m) + } +} + +func TestResponsesToChat_StoreAndOtherFieldsStripped(t *testing.T) { + // Spec §10 — Responses-only fields removed from result. + req := newResponsesReq(t, map[string]any{ + "model": "x", + "input": "hi", + "store": false, + }) + chat, err := ResponsesRequestToChatCompletionsRequest(req) + require.NoError(t, err) + if chat.Store != nil { + t.Errorf("store should be stripped: %v", chat.Store) + } +} diff --git a/service/openaicompat/tool_call_ids.go b/service/openaicompat/tool_call_ids.go new file mode 100644 index 00000000000..956b91afd9f --- /dev/null +++ b/service/openaicompat/tool_call_ids.go @@ -0,0 +1,108 @@ +package openaicompat + +import ( + "regexp" + "strings" + + "github.com/QuantumNous/new-api/common" + "github.com/QuantumNous/new-api/dto" +) + +// anthropicToolIDPattern matches Anthropic's allowed tool_use.id regex. +var anthropicToolIDPattern = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`) + +const maxAnthropicToolIDLen = 64 + +// sanitizeOneToolID applies the three-tier policy: +// 1. pass-through if valid AND <= 64 chars, +// 2. strip non-[a-zA-Z0-9_-] characters and keep if non-empty AND <= 64, +// 3. otherwise generate a fresh UUID (dashes removed). 
+func sanitizeOneToolID(id string) string { + if id != "" && len(id) <= maxAnthropicToolIDLen && anthropicToolIDPattern.MatchString(id) { + return id + } + // Strip-and-keep. + var b strings.Builder + b.Grow(len(id)) + for _, r := range id { + switch { + case r >= 'a' && r <= 'z', + r >= 'A' && r <= 'Z', + r >= '0' && r <= '9', + r == '_', r == '-': + b.WriteRune(r) + } + } + residue := b.String() + if residue != "" && len(residue) <= maxAnthropicToolIDLen { + return residue + } + // UUID fallback (dashes stripped per common.GetUUID()). + return common.GetUUID() +} + +// SanitizeToolCallIDs walks the request messages and rewrites every tool-call +// ID (assistant.tool_calls[].id) and any matching tool_call_id on the next +// tool messages so the upstream Anthropic API receives a consistent mapping +// that satisfies its regex and length constraints. +// +// It also defaults a missing tool_call.type to "function" and stringifies +// any object-valued tool_call.function.arguments. +func SanitizeToolCallIDs(req *dto.GeneralOpenAIRequest) { + if req == nil || len(req.Messages) == 0 { + return + } + + // idMap tracks original-ID -> sanitized-ID rewrites so we can also patch + // downstream tool_result references. + idMap := map[string]string{} + + for mi := range req.Messages { + msg := &req.Messages[mi] + if msg.Role == "assistant" && msg.ToolCalls != nil { + calls := msg.ParseToolCalls() + if len(calls) == 0 { + continue + } + for ci := range calls { + tc := &calls[ci] + // Default missing type to "function". + if strings.TrimSpace(tc.Type) == "" { + tc.Type = "function" + } + // Sanitize ID. Reuse an existing remap before generating a new + // one so repeated invalid originals (e.g. multiple `"::::"`) all + // get the same sanitized id — matching the tool_result remap + // that only retains the last write. 
+ origID := tc.ID + if remapped, ok := idMap[origID]; ok { + tc.ID = remapped + continue + } + newID := sanitizeOneToolID(origID) + if newID != origID { + idMap[origID] = newID + tc.ID = newID + } + } + msg.SetToolCalls(calls) + } + } + + // Remap tool messages' tool_call_id references. + if len(idMap) == 0 { + return + } + for mi := range req.Messages { + msg := &req.Messages[mi] + if msg.Role != "tool" && msg.Role != "function" { + continue + } + if msg.ToolCallId == "" { + continue + } + if remap, ok := idMap[msg.ToolCallId]; ok { + msg.ToolCallId = remap + } + } +} diff --git a/service/openaicompat/tool_call_ids_test.go b/service/openaicompat/tool_call_ids_test.go new file mode 100644 index 00000000000..a5372fa7cb9 --- /dev/null +++ b/service/openaicompat/tool_call_ids_test.go @@ -0,0 +1,133 @@ +package openaicompat + +import ( + "strings" + "testing" + + "github.com/QuantumNous/new-api/dto" + "github.com/stretchr/testify/require" +) + +func TestSanitizeToolCallIDs_PassThroughValid(t *testing.T) { + req := &dto.GeneralOpenAIRequest{ + Messages: []dto.Message{ + { + Role: "assistant", + }, + }, + } + req.Messages[0].SetToolCalls([]dto.ToolCallRequest{ + {ID: "call_abc-123", Type: "function", Function: dto.FunctionRequest{Name: "x", Arguments: "{}"}}, + }) + SanitizeToolCallIDs(req) + calls := req.Messages[0].ParseToolCalls() + require.Len(t, calls, 1) + if calls[0].ID != "call_abc-123" { + t.Errorf("id changed: %q", calls[0].ID) + } +} + +func TestSanitizeToolCallIDs_StripAndKeep(t *testing.T) { + req := &dto.GeneralOpenAIRequest{ + Messages: []dto.Message{{Role: "assistant"}}, + } + req.Messages[0].SetToolCalls([]dto.ToolCallRequest{ + {ID: "call:abc/123", Type: "function", Function: dto.FunctionRequest{Name: "x", Arguments: "{}"}}, + }) + SanitizeToolCallIDs(req) + calls := req.Messages[0].ParseToolCalls() + require.Len(t, calls, 1) + if calls[0].ID != "callabc123" { + t.Errorf("got %q want callabc123", calls[0].ID) + } +} + +func 
TestSanitizeToolCallIDs_UUIDFallbackEmptyResidue(t *testing.T) { + req := &dto.GeneralOpenAIRequest{ + Messages: []dto.Message{{Role: "assistant"}}, + } + req.Messages[0].SetToolCalls([]dto.ToolCallRequest{ + {ID: "::::", Type: "function", Function: dto.FunctionRequest{Name: "x", Arguments: "{}"}}, + }) + SanitizeToolCallIDs(req) + calls := req.Messages[0].ParseToolCalls() + require.Len(t, calls, 1) + // 32-char dash-stripped UUID; must be alphanumeric. + id := calls[0].ID + if len(id) < 16 { + t.Errorf("uuid too short: %q", id) + } + for _, r := range id { + if !((r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9')) { + t.Errorf("uuid has bad char: %q", id) + } + } +} + +func TestSanitizeToolCallIDs_UUIDFallbackOver64(t *testing.T) { + long := strings.Repeat("a", 70) + req := &dto.GeneralOpenAIRequest{ + Messages: []dto.Message{{Role: "assistant"}}, + } + req.Messages[0].SetToolCalls([]dto.ToolCallRequest{ + {ID: long, Type: "function", Function: dto.FunctionRequest{Name: "x", Arguments: "{}"}}, + }) + SanitizeToolCallIDs(req) + calls := req.Messages[0].ParseToolCalls() + require.Len(t, calls, 1) + if calls[0].ID == long { + t.Errorf("70-char id should have been replaced") + } + if len(calls[0].ID) > 64 { + t.Errorf("replacement too long: %d", len(calls[0].ID)) + } +} + +func TestSanitizeToolCallIDs_ConsistentRemap(t *testing.T) { + req := &dto.GeneralOpenAIRequest{ + Messages: []dto.Message{ + {Role: "assistant"}, + {Role: "tool", Content: "ok", ToolCallId: "::::"}, + }, + } + req.Messages[0].SetToolCalls([]dto.ToolCallRequest{ + {ID: "::::", Type: "function", Function: dto.FunctionRequest{Name: "x", Arguments: "{}"}}, + }) + SanitizeToolCallIDs(req) + calls := req.Messages[0].ParseToolCalls() + require.Len(t, calls, 1) + newID := calls[0].ID + if req.Messages[1].ToolCallId != newID { + t.Errorf("tool message id not remapped: got=%q want=%q", req.Messages[1].ToolCallId, newID) + } +} + +func TestSanitizeToolCallIDs_TypeDefaulted(t 
*testing.T) { + req := &dto.GeneralOpenAIRequest{ + Messages: []dto.Message{{Role: "assistant"}}, + } + req.Messages[0].SetToolCalls([]dto.ToolCallRequest{ + {ID: "ok", Function: dto.FunctionRequest{Name: "x", Arguments: "{}"}}, + }) + SanitizeToolCallIDs(req) + calls := req.Messages[0].ParseToolCalls() + require.Len(t, calls, 1) + if calls[0].Type != "function" { + t.Errorf("type=%q want function", calls[0].Type) + } +} + +func TestSanitizeToolCallIDs_NoToolCallsNoOp(t *testing.T) { + req := &dto.GeneralOpenAIRequest{ + Messages: []dto.Message{{Role: "user", Content: "hello"}}, + } + // Should not panic and should not mutate the message. + SanitizeToolCallIDs(req) + if req.Messages[0].StringContent() != "hello" { + t.Errorf("content changed: %q", req.Messages[0].StringContent()) + } +} + +func TestSanitizeToolCallIDs_NilRequest(t *testing.T) { + SanitizeToolCallIDs(nil) // must not panic +}