diff --git a/extension/llm/server/__init__.py b/extension/llm/server/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/extension/llm/server/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/extension/llm/server/python/__init__.py b/extension/llm/server/python/__init__.py new file mode 100644 index 00000000000..00b6274c01f --- /dev/null +++ b/extension/llm/server/python/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""OpenAI-compatible server for ExecuTorch LLMs (Python implementation).""" diff --git a/extension/llm/server/python/chat_template.py b/extension/llm/server/python/chat_template.py new file mode 100644 index 00000000000..52a7ef21243 --- /dev/null +++ b/extension/llm/server/python/chat_template.py @@ -0,0 +1,144 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Render OpenAI chat messages into a single prompt string. + +The ExecuTorch runner tokenizes a plain prompt; chat formatting is the server's +job (control plane). We require the model's own Hugging Face ``chat_template`` +(via ``--hf-tokenizer``) for correct, tool-aware, reasoning-aware formatting. +The generic ChatML fallback is opt-in only (``allow_fallback``): it is +approximate and cannot reproduce model-specific controls (e.g. enable_thinking), +so it must be a deliberate choice rather than a silent default. 
+""" + +import json +import logging +from typing import Any, Optional + +from .protocol import ChatMessage + +logger = logging.getLogger(__name__) + + +_DEFAULT_SPECIAL_TOKENS = ["<|im_end|>", "<|endoftext|>", "<|eot_id|>", "<|end|>"] + + +def _decode_tool_call_arguments(messages: list[dict[str, Any]]) -> None: + """In-place: parse each tool call's ``function.arguments`` from a JSON string + into an object. + + OpenAI sends assistant tool-call arguments as a JSON-encoded string, but HF + chat templates expect a mapping (e.g. Qwen renders ``arguments|items`` into + ```` tags). Without this, a multi-turn tool conversation makes + the template raise "Can only get item pairs from a mapping". Left as-is if + the value isn't valid JSON, so a template that wants the raw string still works. + """ + for m in messages: + for tc in m.get("tool_calls") or []: + fn = tc.get("function") + if not isinstance(fn, dict): + continue + args = fn.get("arguments") + if isinstance(args, str): + try: + fn["arguments"] = json.loads(args) + except (ValueError, TypeError): + pass + + +class ChatTemplate: + def __init__( + self, + hf_tokenizer_path: Optional[str] = None, + default_template_kwargs: Optional[dict[str, Any]] = None, + allow_fallback: bool = False, + ): + # Server-level defaults (e.g. {"enable_thinking": False}); per-request + # chat_template_kwargs override these. + self._defaults = default_template_kwargs or {} + self._hf = None + if hf_tokenizer_path: + from transformers import AutoTokenizer + + self._hf = AutoTokenizer.from_pretrained(hf_tokenizer_path) + if self._hf.chat_template is None: + self._hf = None + if not allow_fallback: + raise ValueError( + f"HF tokenizer at {hf_tokenizer_path} has no chat_template; " + "pass an explicit fallback flag to use approximate ChatML." 
+ ) + logger.warning( + "No chat_template at %s; using approximate ChatML.", + hf_tokenizer_path, + ) + elif not allow_fallback: + raise ValueError( + "A chat template is required: pass --hf-tokenizer for the model's own " + "template, or opt into approximate ChatML with --allow-chatml-fallback." + ) + else: + logger.warning( + "No --hf-tokenizer; using approximate ChatML (no thinking control)." + ) + + def render( + self, + messages: list[ChatMessage], + tools: Optional[list[dict[str, Any]]] = None, + template_kwargs: Optional[dict[str, Any]] = None, + ) -> str: + kwargs = {**self._defaults, **(template_kwargs or {})} + if self._hf is not None: + dumped = [m.model_dump(exclude_none=True) for m in messages] + _decode_tool_call_arguments(dumped) + return self._hf.apply_chat_template( + dumped, + tools=tools, + add_generation_prompt=True, + tokenize=False, + **kwargs, + ) + return self._fallback(messages) + + def chat_template_str(self) -> Optional[str]: + """Raw chat-template string (for tool-format auto-detection), if available.""" + return ( + getattr(self._hf, "chat_template", None) if self._hf is not None else None + ) + + def count_tokens(self, prompt: str) -> Optional[int]: + """Token count for the rendered prompt, or None if no tokenizer is available.""" + if self._hf is not None: + # The prompt is already rendered (apply_chat_template includes the + # control tokens), so encode without re-adding BOS/EOS — matching the + # session/prefix-cache paths, so the count isn't inflated and + # near-limit requests aren't falsely rejected under --max-context. + return len(self._hf.encode(prompt, add_special_tokens=False)) + return None + + def special_tokens(self) -> list[str]: + """Special-token strings whose appearance ends the visible content. + + From the HF tokenizer when available (model-accurate), else a default set + covering common chat models. 
+ """ + if self._hf is not None: + toks = list(getattr(self._hf, "all_special_tokens", []) or []) + return [t for t in toks if isinstance(t, str) and t] + return list(_DEFAULT_SPECIAL_TOKENS) + + @staticmethod + def _fallback(messages: list[ChatMessage]) -> str: + # Approximate ChatML. Provide --hf-tokenizer for model-correct formatting + # (including reasoning controls like enable_thinking, which the fallback + # cannot reproduce). + parts = [] + for m in messages: + content = m.content if isinstance(m.content, str) else str(m.content or "") + parts.append(f"<|im_start|>{m.role}\n{content}<|im_end|>") + parts.append("<|im_start|>assistant\n") + return "\n".join(parts) diff --git a/extension/llm/server/python/errors.py b/extension/llm/server/python/errors.py new file mode 100644 index 00000000000..f24df43f2e8 --- /dev/null +++ b/extension/llm/server/python/errors.py @@ -0,0 +1,62 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""OpenAI-shaped API errors. + +Raising these lets the server return a structured `{"error": {...}}` body with +the right HTTP status instead of dropping the connection. +""" + +from typing import Optional + + +class APIError(Exception): + def __init__( + self, status: int, message: str, err_type: str, code: Optional[str] = None + ): + super().__init__(message) + self.status = status + self.message = message + self.err_type = err_type + self.code = code + + def body(self) -> dict: + return { + "error": {"message": self.message, "type": self.err_type, "code": self.code} + } + + +class ContextLengthExceeded(APIError): + def __init__(self, num_tokens: int, max_context: int, completion_tokens: int = 0): + # completion_tokens > 0: the prompt fits but prompt + requested + # max_tokens would run past the window — reject up front rather than + # fail (or truncate) mid-generation. 
+ if completion_tokens > 0: + message = ( + f"This model's maximum context length is {max_context} tokens. " + f"However, you requested {num_tokens + completion_tokens} tokens " + f"({num_tokens} in the messages, {completion_tokens} in the " + f"completion). Please reduce the length of the messages or " + f"completion." + ) + else: + message = ( + f"This model's maximum context length is {max_context} tokens, " + f"but the request has {num_tokens} prompt tokens." + ) + super().__init__( + status=400, + message=message, + err_type="invalid_request_error", + code="context_length_exceeded", + ) + + +class GenerationError(APIError): + def __init__(self, detail: str): + super().__init__( + status=500, message=f"Generation failed: {detail}", err_type="server_error" + ) diff --git a/extension/llm/server/python/protocol.py b/extension/llm/server/python/protocol.py new file mode 100644 index 00000000000..2d73d2d7f64 --- /dev/null +++ b/extension/llm/server/python/protocol.py @@ -0,0 +1,148 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""OpenAI-compatible request/response schemas for the ExecuTorch LLM server. + +This is the Python view of the contract defined in ``extension/llm/server/spec``. +Any language server must serialize to the same shapes; the conformance suite in +``extension/llm/server/conformance`` validates them. 
+""" + +import time +import uuid +from typing import Any, Literal, Optional, Union + +from pydantic import BaseModel, Field + + +def _new_id(prefix: str) -> str: + return f"{prefix}-{uuid.uuid4().hex}" + + +class FunctionCall(BaseModel): + name: Optional[str] = None + arguments: Optional[str] = None + + +class ToolCall(BaseModel): + index: Optional[int] = None + id: Optional[str] = None + type: Literal["function"] = "function" + function: FunctionCall + + +class ChatMessage(BaseModel): + role: str + content: Optional[Union[str, list[dict[str, Any]]]] = None + name: Optional[str] = None + tool_calls: Optional[list[ToolCall]] = None + tool_call_id: Optional[str] = None + + +class StreamOptions(BaseModel): + include_usage: bool = False + + +class ChatCompletionRequest(BaseModel): + model: Optional[str] = None + messages: list[ChatMessage] + stream: bool = False + stream_options: Optional[StreamOptions] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + max_tokens: Optional[int] = None + max_completion_tokens: Optional[int] = None + stop: Optional[Union[str, list[str]]] = None + n: int = 1 + seed: Optional[int] = None + # Sampling knobs that change generation output. We don't plumb these, so they + # are modeled (not dropped) in order to be rejected with a clear error rather + # than silently ignored — see serving_chat's unsupported-parameter check. + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + top_k: Optional[int] = None + logit_bias: Optional[dict[str, float]] = None + # Output-contract fields: modeled (not dropped) so we reject the ones we + # can't honor rather than returning an output that violates what was asked. + response_format: Optional[dict[str, Any]] = None + logprobs: Optional[bool] = None + top_logprobs: Optional[int] = None + parallel_tool_calls: Optional[bool] = None + # Per-request chat-template controls, e.g. {"enable_thinking": false} for Qwen3. 
+ chat_template_kwargs: Optional[dict[str, Any]] = None + # Accepted now so the contract is stable; parsing/enforcement land in M2/M5. + tools: Optional[list[dict[str, Any]]] = None + tool_choice: Optional[Union[str, dict[str, Any]]] = None + reasoning_effort: Optional[str] = None + + def resolved_max_tokens(self) -> int: + # `is not None` (not `or`): an explicit 0 must not be treated as unset. + # Callers validate positivity; -1 means "unset / auto". + if self.max_completion_tokens is not None: + return self.max_completion_tokens + if self.max_tokens is not None: + return self.max_tokens + return -1 + + +class Usage(BaseModel): + prompt_tokens: int = 0 + completion_tokens: int = 0 + total_tokens: int = 0 + + +class ResponseMessage(BaseModel): + role: str = "assistant" + content: Optional[str] = None + tool_calls: Optional[list[ToolCall]] = None + + +class Choice(BaseModel): + index: int = 0 + message: ResponseMessage + finish_reason: Optional[str] = None + + +class ChatCompletionResponse(BaseModel): + id: str = Field(default_factory=lambda: _new_id("chatcmpl")) + object: Literal["chat.completion"] = "chat.completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[Choice] + usage: Usage = Field(default_factory=Usage) + + +class DeltaMessage(BaseModel): + role: Optional[str] = None + content: Optional[str] = None + tool_calls: Optional[list[ToolCall]] = None + + +class ChunkChoice(BaseModel): + index: int = 0 + delta: DeltaMessage + finish_reason: Optional[str] = None + + +class ChatCompletionChunk(BaseModel): + id: str + object: Literal["chat.completion.chunk"] = "chat.completion.chunk" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[ChunkChoice] + usage: Optional[Usage] = None + + +class ModelCard(BaseModel): + id: str + object: Literal["model"] = "model" + created: int = Field(default_factory=lambda: int(time.time())) + owned_by: str = "executorch" + + +class 
ModelList(BaseModel): + object: Literal["list"] = "list" + data: list[ModelCard] diff --git a/extension/llm/server/python/requirements.txt b/extension/llm/server/python/requirements.txt new file mode 100644 index 00000000000..70ad7ccb4dd --- /dev/null +++ b/extension/llm/server/python/requirements.txt @@ -0,0 +1,5 @@ +fastapi>=0.110 +uvicorn[standard]>=0.27 +pydantic>=2.0 +# Optional but recommended for model-correct chat templating (--hf-tokenizer): +# transformers>=4.40 diff --git a/extension/llm/server/python/tests/test_qwen_tool_parser.py b/extension/llm/server/python/tests/test_qwen_tool_parser.py new file mode 100644 index 00000000000..f7c01bfa6d8 --- /dev/null +++ b/extension/llm/server/python/tests/test_qwen_tool_parser.py @@ -0,0 +1,126 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Tests for QwenFunctionCallDetector (Qwen XML tool format).""" + +import json + +from executorch.extension.llm.server.python.tool_parsers import QwenFunctionCallDetector + +# name -> JSON-schema `parameters` (as the server passes it to the detector). +_TOOLS = { + "get_weather": {"type": "object", "properties": {"city": {"type": "string"}}}, + "add": { + "type": "object", + "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}}, + }, +} + + +def _parse(text, tools=_TOOLS): + return QwenFunctionCallDetector().detect_and_parse(text, tools) + + +def test_basic_call(): + text = ( + "\n\n\nParis\n" + "\n\n" + ) + r = _parse(text) + assert len(r.calls) == 1 + assert r.calls[0].name == "get_weather" + assert json.loads(r.calls[0].arguments) == {"city": "Paris"} + assert r.normal_text == "" + + +def test_observed_model_output(): + # The exact shape seen from Qwen3.5-MoE during the live smoke. 
+ text = ( + "\n\n\nParis\n" + "\n\n" + ) + r = _parse(text) + assert [c.name for c in r.calls] == ["get_weather"] + + +def test_numeric_and_multi_param_coercion(): + text = ( + "2" + "3" + ) + r = _parse(text) + assert json.loads(r.calls[0].arguments) == {"a": 2, "b": 3} + + +def test_multiple_calls(): + text = ( + "Paris" + "1" + ) + r = _parse(text) + assert [c.name for c in r.calls] == ["get_weather", "add"] + assert [c.tool_index for c in r.calls] == [0, 1] + + +def test_leading_text_preserved(): + text = "Let me check.Paris" + r = _parse(text) + assert r.normal_text == "Let me check." + assert len(r.calls) == 1 + + +def test_no_tool_call_is_plain_text(): + text = "The capital of France is Paris." + r = _parse(text) + assert r.calls == [] + assert r.normal_text == text + + +def test_undefined_tool_degrades_to_text(): + # A call to a tool not in the request -> whole response kept as visible text. + text = "1" + r = _parse(text) + assert r.calls == [] + assert r.normal_text == text + + +def test_missing_tool_call_wrapper_still_parses(): + # Tolerate a truncated/absent wrapper as long as the function + # block is complete. + text = "Paris" + r = _parse(text) + assert len(r.calls) == 1 + assert json.loads(r.calls[0].arguments) == {"city": "Paris"} + + +# Schema-aware coercion: the XML format is stringly-typed, so values must be cast +# to the declared schema type (the cause of several BFCL function-calling misses). +def test_boolean_value_coerced_by_schema(): + tools = {"f": {"properties": {"flag": {"type": "boolean"}}}} + # The model writes a non-JSON capitalized "True"; the schema says boolean. + text = "True" + r = _parse(text, tools) + assert json.loads(r.calls[0].arguments) == {"flag": True} + + +def test_string_schema_keeps_numeric_literal_as_string(): + tools = {"f": {"properties": {"id": {"type": "string"}}}} + # A numeric-looking value the schema declares as a string must NOT become int. 
+ text = "1234" + r = _parse(text, tools) + args = json.loads(r.calls[0].arguments) + assert args == {"id": "1234"} and isinstance(args["id"], str) + + +def test_untyped_param_falls_back_to_json_guess(): + # No declared type -> best-effort JSON guess (so loosely-typed tools still work). + tools = {"f": {"properties": {}}} + text = ( + "42" + "[1, 2]" + ) + r = _parse(text, tools) + assert json.loads(r.calls[0].arguments) == {"n": 42, "items": [1, 2]} diff --git a/extension/llm/server/python/tool_parsers/__init__.py b/extension/llm/server/python/tool_parsers/__init__.py new file mode 100644 index 00000000000..c890dec3888 --- /dev/null +++ b/extension/llm/server/python/tool_parsers/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Tool-call parsing. Two formats, pick the one matching your model: + +- HermesDetector: JSON inside (Qwen2.5/3, Hermes). +- QwenFunctionCallDetector: Qwen XML (Qwen3.5-MoE / + Qwen3-Coder). + +The server buffers the model's full output and parses it once into complete +OpenAI tool_calls; parse failures degrade to visible text. +""" + +from .hermes import HermesDetector +from .qwen import QwenFunctionCallDetector +from .types import ParseResult, ToolCallItem + +__all__ = [ + "HermesDetector", + "QwenFunctionCallDetector", + "ParseResult", + "ToolCallItem", +] diff --git a/extension/llm/server/python/tool_parsers/hermes.py b/extension/llm/server/python/tool_parsers/hermes.py new file mode 100644 index 00000000000..94e5f747508 --- /dev/null +++ b/extension/llm/server/python/tool_parsers/hermes.py @@ -0,0 +1,93 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +"""Hermes-style tool calls: {"name": ..., "arguments": {...}}. + +Used by Qwen2.5/Qwen3 (and Hermes models); the Qwen XML format is handled +separately by QwenFunctionCallDetector. The server buffers a model's full output +and parses it once into complete OpenAI tool_calls (no partial-fragment +streaming). Parse failures fall back to visible text — never a crash or a silent +drop. +""" + +import json +import logging +import re +from typing import Any, Optional + +from .types import ParseResult, ToolCallItem + +logger = logging.getLogger(__name__) + +_CALL_RE = re.compile(r"\s*(.*?)\s*", re.DOTALL) + + +class _UndefinedToolCall(Exception): + """A named a tool not in the request's `tools`. v1 degrades the + WHOLE response to visible text rather than emitting a partial set — never + silently drop an undefined call while keeping its siblings (spec).""" + + +class HermesDetector: + """Parses Hermes/Qwen tool calls. Create a fresh instance per request (it + holds the per-request tool-call index); never share across requests.""" + + bot_token = "" + + def __init__(self): + self._next_index = 0 + + def detect_and_parse(self, text: str, tools: dict[str, dict]) -> ParseResult: + """Return leading text + any complete tool calls. On no call or a parse + failure, return the original text unchanged (kept visible to the client).""" + if self.bot_token not in text: + return ParseResult(normal_text=text) + normal = text[: text.find(self.bot_token)].strip() + try: + calls = self._parse_calls(text, tools) + except _UndefinedToolCall as e: + # Degrade the whole response to visible text so the undefined call + # isn't silently dropped (and its valid siblings aren't executed in + # isolation, losing the model's full intent). 
+ logger.debug("undefined tool %s; returning raw text (no partial calls)", e) + return ParseResult(normal_text=text) + except Exception as e: # noqa: BLE001 - never crash; fall back to visible text + logger.debug("tool parse failed (%s); returning raw text", e) + return ParseResult(normal_text=text) + if not calls: + return ParseResult(normal_text=text) + return ParseResult(normal_text=normal, calls=calls) + + def _parse_calls(self, text: str, tools: dict[str, dict]) -> list[ToolCallItem]: + calls = [] + for raw in _CALL_RE.findall(text): + if not raw.strip(): + continue + obj = json.loads(raw.strip()) + for entry in obj if isinstance(obj, list) else [obj]: + calls.append( + self._make_item( + entry.get("name"), + entry.get("arguments", entry.get("parameters")), + tools, + ) + ) + return calls + + def _make_item( + self, name: Optional[str], arguments: Any, tools: dict[str, dict] + ) -> ToolCallItem: + if not name or name not in tools: + raise _UndefinedToolCall(repr(name)) + item = ToolCallItem( + tool_index=self._next_index, + name=name, + arguments=json.dumps( + arguments if arguments is not None else {}, ensure_ascii=False + ), + ) + self._next_index += 1 + return item diff --git a/extension/llm/server/python/tool_parsers/qwen.py b/extension/llm/server/python/tool_parsers/qwen.py new file mode 100644 index 00000000000..01e7a884c77 --- /dev/null +++ b/extension/llm/server/python/tool_parsers/qwen.py @@ -0,0 +1,138 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Qwen XML-style tool calls: V. + +Emitted by Qwen3.5-MoE / Qwen3-Coder (typically wrapped in … +), e.g.: + + + + + Paris + + + + +This is a DIFFERENT format from HermesDetector (JSON inside ); pick the +detector that matches your model. Detection triggers only on the unambiguous +`` marker so ordinary prose is not misclassified. 
Parse failures fall +back to visible text — never a crash or a silent drop. +""" + +import json +import logging +import re +from typing import Any, Optional + +from .types import ParseResult, ToolCallItem + +logger = logging.getLogger(__name__) + +_FUNCTION_RE = re.compile(r"\s]+)\s*>(.*?)", re.DOTALL) +_PARAMETER_RE = re.compile(r"\s]+)\s*>(.*?)", re.DOTALL) + + +class _UndefinedToolCall(Exception): + """A call named a tool not in the request's `tools`. v1 degrades the WHOLE + response to visible text rather than emitting a partial set (spec).""" + + +def _coerce(value: str, declared_type: Optional[str]) -> Any: + """Cast a raw XML parameter string to the type declared in the tool's JSON + schema. + + The Qwen XML format is stringly-typed (`v`), so + without the schema we'd have to guess. A bare `json.loads` guess mistypes two + common ways: a value the schema wants as a string but that looks numeric + (`"1234"`) becomes an int, and a value the schema wants as a bool but that the + model didn't write as valid JSON (`True`) stays a string. Coercing to the + declared type keeps the emitted OpenAI tool_call schema-valid. Falls back to a + JSON guess (then the raw string) when the type is unknown or coercion fails, + so untyped/loosely-typed params keep working. + """ + if declared_type == "string": + return value + if declared_type == "boolean": + low = value.strip().lower() + if low in ("true", "false"): + return low == "true" + elif declared_type == "integer": + try: + return int(value.strip()) + except (ValueError, TypeError): + pass + elif declared_type == "number": + try: + return float(value.strip()) + except (ValueError, TypeError): + pass + try: + return json.loads(value) + except (ValueError, TypeError): + return value + + +class QwenFunctionCallDetector: + """Parses Qwen's XML tool-call format. 
Create a fresh instance per request + (it holds the per-request tool-call index); never share across requests.""" + + bot_token = "" + + def __init__(self): + self._next_index = 0 + + def detect_and_parse(self, text: str, tools: dict[str, dict]) -> ParseResult: + """Return leading text + any complete tool calls. On no call or a parse + failure, return the original text unchanged (kept visible to the client). + + `tools` maps each defined tool name to its JSON-schema ``parameters`` + object; the schema is used to coerce stringly-typed XML values to their + declared types (and the key set validates names).""" + first = _FUNCTION_RE.search(text) + if first is None: + return ParseResult(normal_text=text) + # Leading text ends at the wrapper if present, else at the + # first tag. + cut = text.find(self.bot_token) + if cut == -1 or cut > first.start(): + cut = first.start() + normal = text[:cut].strip() + try: + calls = self._parse_calls(text, tools) + except _UndefinedToolCall as e: + logger.debug("undefined tool %s; returning raw text (no partial calls)", e) + return ParseResult(normal_text=text) + except Exception as e: # noqa: BLE001 - never crash; fall back to visible text + logger.debug("tool parse failed (%s); returning raw text", e) + return ParseResult(normal_text=text) + if not calls: + return ParseResult(normal_text=text) + return ParseResult(normal_text=normal, calls=calls) + + def _parse_calls(self, text: str, tools: dict[str, dict]) -> list[ToolCallItem]: + calls = [] + for name, body in _FUNCTION_RE.findall(text): + props = (tools.get(name) or {}).get("properties", {}) + args = { + key: _coerce(value.strip(), props.get(key, {}).get("type")) + for key, value in _PARAMETER_RE.findall(body) + } + calls.append(self._make_item(name, args, tools)) + return calls + + def _make_item( + self, name: Optional[str], arguments: dict, tools: dict[str, dict] + ) -> ToolCallItem: + if not name or name not in tools: + raise _UndefinedToolCall(repr(name)) + item = 
ToolCallItem( + tool_index=self._next_index, + name=name, + arguments=json.dumps(arguments, ensure_ascii=False), + ) + self._next_index += 1 + return item diff --git a/extension/llm/server/python/tool_parsers/types.py b/extension/llm/server/python/tool_parsers/types.py new file mode 100644 index 00000000000..2dae5c79458 --- /dev/null +++ b/extension/llm/server/python/tool_parsers/types.py @@ -0,0 +1,33 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Protocol-agnostic tool-parsing types. + +Kept independent of the OpenAI wire schema so the parser package is reusable; +serving_chat translates these into OpenAI tool_calls / deltas at the edge. +Design adapted from SGLang's core_types, with explicit per-request state. +""" + +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class ToolCallItem: + """A parsed tool call. `arguments` is a JSON string (the full arguments — + this server emits complete calls, not fragments).""" + + tool_index: int + name: Optional[str] = None + arguments: str = "" + + +@dataclass +class ParseResult: + """Outcome of a parse: free text plus any tool calls found.""" + + normal_text: str = "" + calls: list[ToolCallItem] = field(default_factory=list) diff --git a/extension/llm/server/spec/README.md b/extension/llm/server/spec/README.md new file mode 100644 index 00000000000..58e0e46ef57 --- /dev/null +++ b/extension/llm/server/spec/README.md @@ -0,0 +1,73 @@ +# ExecuTorch LLM Server — Contract Spec + +The language-neutral contract every ExecuTorch LLM server (Python today, C++ +later) implements. The conformance suite in `../conformance` validates an +implementation against this spec by hitting a live server, so it is independent +of language and engine. 
+ +## Supported endpoints + +| Endpoint | Status | +|----------|--------| +| `GET /v1/models` | implemented | +| `POST /v1/chat/completions` (stream + non-stream) | implemented | +| `GET /health` | implemented | +| `POST /v1/completions` | planned | + +## `POST /v1/chat/completions` + +OpenAI Chat Completions subset. **Honored** request fields: `model`, `messages`, +`stream`, `temperature`, `max_tokens` / `max_completion_tokens`, `stop`, `tools`, +`tool_choice` (only `"none"` to disable tools, or `"auto"`/unset for default +parsing), `stream_options.include_usage`, and `chat_template_kwargs` (e.g. +`enable_thinking`). + +**Rejected** with `400 invalid_request_error` (`code: "unsupported_parameter"`) +rather than silently ignored — a client relying on them would otherwise get +wrong behavior: `top_p` (anything other than `1.0`), `seed`, `n` (> 1), +`reasoning_effort`, `frequency_penalty`/`presence_penalty` (nonzero), `top_k`, +`logit_bias`, `tool_choice` = `"required"` or a specific-function choice +(forcing/restricting a call needs constrained decoding, which v1 lacks), +`response_format` other than `{"type": "text"}` (no constrained JSON), +`logprobs`/`top_logprobs` (not returned), and `parallel_tool_calls: false` +(a single call can't be guaranteed without constrained decoding). Unknown fields that +don't affect the output (e.g. `user`, `store`, `metadata`) are accepted and +ignored. + +Non-streaming response: `chat.completion` with one `choice` +(`message.role = "assistant"`, string `content` or `tool_calls`, `finish_reason` +∈ `stop` | `length` | `tool_calls`) and a `usage` block. + +Streaming response: `text/event-stream` of `chat.completion.chunk` objects — +first chunk carries `delta.role = "assistant"`, subsequent chunks carry +`delta.content` (or buffered `delta.tool_calls`), a final chunk carries +`finish_reason`, optionally a usage-only chunk (with +`stream_options.include_usage`), terminated by `data: [DONE]`.
+ +### Tool calling + +Two output formats are accepted: Hermes-style JSON +(`<tool_call>{"name":...,"arguments":{...}}</tool_call>`, used by Qwen2.5/Qwen3) +and Qwen XML-style (`<function=NAME><parameter=KEY>V</parameter></function>`, +typically wrapped in `<tool_call>`, used by Qwen3.5-MoE / Qwen3-Coder). The +server buffers the model's full output and emits **complete** OpenAI +`tool_calls` (no partial-argument fragments). Calls to tools absent from the +request, and malformed tool calls, degrade to visible text — never a crash or +silent drop. `tool_choice="none"` disables tool parsing. + +### Errors & cancellation + +Errors return `{"error": {"message", "type", "code"}}` with an appropriate +status (e.g. `400 context_length_exceeded` when `--max-context` is set and the +prompt exceeds it). A mid-stream failure emits an `error` SSE event then +`[DONE]` rather than dropping the socket. Cancellation is best-effort: on a +client disconnect the control plane stops consuming the stream (`stop()`), but +the worker runs the in-flight request to completion — V1 has no mid-generation +interrupt protocol. + +### Prefix cache + +Not in V1 serving. The control plane holds no KV state and does no prefix-reuse +routing; each request is an independent prompt to the worker. If turn-to-turn KV +prefix reuse returns, it will live inside the worker/session (where the KV cache +is), not in the control plane.