diff --git a/extension/llm/server/__init__.py b/extension/llm/server/__init__.py
new file mode 100644
index 00000000000..2e41cd717f6
--- /dev/null
+++ b/extension/llm/server/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/extension/llm/server/python/__init__.py b/extension/llm/server/python/__init__.py
new file mode 100644
index 00000000000..00b6274c01f
--- /dev/null
+++ b/extension/llm/server/python/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""OpenAI-compatible server for ExecuTorch LLMs (Python implementation)."""
diff --git a/extension/llm/server/python/chat_template.py b/extension/llm/server/python/chat_template.py
new file mode 100644
index 00000000000..52a7ef21243
--- /dev/null
+++ b/extension/llm/server/python/chat_template.py
@@ -0,0 +1,144 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Render OpenAI chat messages into a single prompt string.
+
+The ExecuTorch runner tokenizes a plain prompt; chat formatting is the server's
+job (control plane). We require the model's own Hugging Face ``chat_template``
+(via ``--hf-tokenizer``) for correct, tool-aware, reasoning-aware formatting.
+The generic ChatML fallback is opt-in only (``allow_fallback``): it is
+approximate and cannot reproduce model-specific controls (e.g. enable_thinking),
+so it must be a deliberate choice rather than a silent default.
+"""
+
+import json
+import logging
+from typing import Any, Optional
+
+from .protocol import ChatMessage
+
+logger = logging.getLogger(__name__)
+
+
+_DEFAULT_SPECIAL_TOKENS = ["<|im_end|>", "<|endoftext|>", "<|eot_id|>", "<|end|>"]
+
+
+def _decode_tool_call_arguments(messages: list[dict[str, Any]]) -> None:
+    """In-place: decode each tool call's ``function.arguments`` JSON string into
+    a mapping.
+
+    OpenAI sends assistant tool-call arguments as a JSON-encoded string, but HF
+    chat templates expect a mapping (e.g. Qwen renders ``arguments|items``).
+    The string is replaced only when it decodes to a JSON object. Invalid JSON
+    and non-object JSON (``42``, ``null``, ``[1]``) stay the raw string: handing
+    the template a scalar or list would break ``|items`` just the same.
+    """
+    for m in messages:
+        for tc in m.get("tool_calls") or []:
+            fn = tc.get("function")
+            if not isinstance(fn, dict) or not isinstance(fn.get("arguments"), str):
+                continue
+            try:
+                parsed = json.loads(fn["arguments"])
+            except (ValueError, TypeError):
+                continue  # not JSON: leave the raw string for the template
+            if isinstance(parsed, dict):
+                fn["arguments"] = parsed
+
+
+class ChatTemplate:
+    """Renders chat messages into a prompt string (HF template or ChatML fallback)."""
+
+    def __init__(
+        self,
+        hf_tokenizer_path: Optional[str] = None,
+        default_template_kwargs: Optional[dict[str, Any]] = None,
+        allow_fallback: bool = False,
+    ):
+        # Server-level defaults (e.g. {"enable_thinking": False}); per-request
+        # chat_template_kwargs override these.
+        self._defaults = default_template_kwargs or {}
+        self._hf = None
+        if hf_tokenizer_path:
+            from transformers import AutoTokenizer
+
+            self._hf = AutoTokenizer.from_pretrained(hf_tokenizer_path)
+            if self._hf.chat_template is None:
+                self._hf = None
+                if not allow_fallback:
+                    raise ValueError(
+                        f"HF tokenizer at {hf_tokenizer_path} has no chat_template; "
+                        "pass --allow-chatml-fallback to use approximate ChatML."
+                    )
+                logger.warning(
+                    "No chat_template at %s; using approximate ChatML.",
+                    hf_tokenizer_path,
+                )
+        elif not allow_fallback:
+            raise ValueError(
+                "A chat template is required: pass --hf-tokenizer for the model's own "
+                "template, or opt into approximate ChatML with --allow-chatml-fallback."
+            )
+        else:
+            logger.warning(
+                "No --hf-tokenizer; using approximate ChatML (no thinking control)."
+            )
+
+    def render(
+        self,
+        messages: list[ChatMessage],
+        tools: Optional[list[dict[str, Any]]] = None,
+        template_kwargs: Optional[dict[str, Any]] = None,
+    ) -> str:
+        """Return the prompt for ``messages`` with the generation prompt appended."""
+        if self._hf is None:
+            return self._fallback(messages)
+        # Server-controlled arguments win; a duplicate keyword would raise TypeError.
+        reserved = {"tools", "add_generation_prompt", "tokenize"}
+        merged = {**self._defaults, **(template_kwargs or {})}
+        dropped = sorted(merged.keys() & reserved)
+        if dropped:
+            logger.warning("Ignoring reserved chat-template kwargs: %s", dropped)
+        kwargs = {k: v for k, v in merged.items() if k not in reserved}
+        dumped = [m.model_dump(exclude_none=True) for m in messages]
+        _decode_tool_call_arguments(dumped)
+        return self._hf.apply_chat_template(
+            dumped, tools=tools, add_generation_prompt=True, tokenize=False, **kwargs
+        )
+
+    def chat_template_str(self) -> Optional[str]:
+        """Raw chat-template string (for tool-format auto-detection), if available."""
+        return getattr(self._hf, "chat_template", None)
+
+    def count_tokens(self, prompt: str) -> Optional[int]:
+        """Token count for the rendered prompt, or None if no tokenizer is available."""
+        if self._hf is None:
+            return None
+        # Encode the already-rendered prompt without re-adding BOS/EOS; an
+        # inflated count would falsely reject near-limit requests (--max-context).
+        return len(self._hf.encode(prompt, add_special_tokens=False))
+
+    def special_tokens(self) -> list[str]:
+        """Special-token strings whose appearance ends the visible content: the HF
+        tokenizer's own when available, else defaults covering common chat models."""
+        if self._hf is None:
+            return list(_DEFAULT_SPECIAL_TOKENS)
+        toks = getattr(self._hf, "all_special_tokens", None) or []
+        return [t for t in toks if isinstance(t, str) and t]
+
+    @staticmethod
+    def _fallback(messages: list[ChatMessage]) -> str:
+        """Approximate ChatML; a content-part list contributes only its text parts."""
+        parts = []
+        for m in messages:
+            content = m.content or ""
+            if not isinstance(content, str):
+                # Keep the parts' text rather than the list's Python repr.
+                texts = [p.get("text") for p in content]
+                content = "".join(t for t in texts if isinstance(t, str))
+            parts.append(f"<|im_start|>{m.role}\n{content}<|im_end|>")
+        parts.append("<|im_start|>assistant\n")
+        return "\n".join(parts)
diff --git a/extension/llm/server/python/errors.py b/extension/llm/server/python/errors.py
new file mode 100644
index 00000000000..f24df43f2e8
--- /dev/null
+++ b/extension/llm/server/python/errors.py
@@ -0,0 +1,62 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""OpenAI-shaped API errors.
+
+Raising these lets the server return a structured `{"error": {...}}` body with
+the right HTTP status instead of dropping the connection.
+"""
+
+from typing import Optional
+
+
+class APIError(Exception):
+    """Exception carrying an HTTP status plus the fields of an OpenAI error body."""
+
+    def __init__(
+        self, status: int, message: str, err_type: str, code: Optional[str] = None
+    ):
+        super().__init__(message)
+        self.status, self.message = status, message
+        self.err_type, self.code = err_type, code
+
+    def body(self) -> dict:
+        """Return the ``{"error": {...}}`` payload for the response body."""
+        detail = {"message": self.message, "type": self.err_type, "code": self.code}
+        return {"error": detail}
+
+
+class ContextLengthExceeded(APIError):
+    """400 ``context_length_exceeded``: the request overruns ``max_context``."""
+
+    def __init__(self, num_tokens: int, max_context: int, completion_tokens: int = 0):
+        limit = f"This model's maximum context length is {max_context} tokens"
+        if completion_tokens <= 0:
+            # The prompt alone is reported against the limit.
+            text = f"{limit}, but the request has {num_tokens} prompt tokens."
+        else:
+            # The prompt fits, but prompt + requested max_tokens would overrun the
+            # window: reject up front rather than fail (or truncate) mid-generation.
+            requested = num_tokens + completion_tokens
+            text = (
+                f"{limit}. However, you requested {requested} tokens "
+                f"({num_tokens} in the messages, {completion_tokens} in the "
+                "completion). Please reduce the length of the messages or "
+                "completion."
+            )
+        super().__init__(
+            status=400,
+            message=text,
+            err_type="invalid_request_error",
+            code="context_length_exceeded",
+        )
+
+
+class GenerationError(APIError):
+    """500 ``server_error`` whose message is ``Generation failed: <detail>``."""
+
+    def __init__(self, detail: str):
+        super().__init__(500, f"Generation failed: {detail}", "server_error")
diff --git a/extension/llm/server/python/protocol.py b/extension/llm/server/python/protocol.py
new file mode 100644
index 00000000000..2d73d2d7f64
--- /dev/null
+++ b/extension/llm/server/python/protocol.py
@@ -0,0 +1,148 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""OpenAI-compatible request/response schemas for the ExecuTorch LLM server.
+
+This is the Python view of the contract defined in ``extension/llm/server/spec``.
+Any language server must serialize to the same shapes; the conformance suite in
+``extension/llm/server/conformance`` validates them.
+"""
+
+import time
+import uuid
+from typing import Any, Literal, Optional, Union
+
+from pydantic import BaseModel, Field
+
+
+def _new_id(prefix: str) -> str:
+    return "-".join((prefix, uuid.uuid4().hex))  # "<prefix>-<32 hex chars>"
+
+
+class FunctionCall(BaseModel):  # the `function` part of a tool call
+    name: Optional[str] = None
+    arguments: Optional[str] = None  # JSON-encoded arguments string, not an object
+
+
+class ToolCall(BaseModel):  # used by request, response and delta messages alike
+    index: Optional[int] = None
+    id: Optional[str] = None
+    type: Literal["function"] = "function"  # the only accepted value
+    function: FunctionCall  # required: no default
+
+
+class ChatMessage(BaseModel):  # a request message; content is text or part dicts
+    role: str  # any string: not restricted to a fixed set by this model
+    content: Optional[Union[str, list[dict[str, Any]]]] = None
+    name: Optional[str] = None
+    tool_calls: Optional[list[ToolCall]] = None
+    tool_call_id: Optional[str] = None
+
+
+class StreamOptions(BaseModel):  # the request's `stream_options` object
+    include_usage: bool = False  # spec: True adds a usage-only chunk to the stream
+
+
+class ChatCompletionRequest(BaseModel):  # body of POST /v1/chat/completions
+    model: Optional[str] = None
+    messages: list[ChatMessage]  # the only required field
+    stream: bool = False
+    stream_options: Optional[StreamOptions] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    max_tokens: Optional[int] = None
+    max_completion_tokens: Optional[int] = None  # wins over max_tokens when both set
+    stop: Optional[Union[str, list[str]]] = None  # one stop string or several
+    n: int = 1
+    seed: Optional[int] = None
+    # Sampling knobs that change generation output. We don't plumb these, so they
+    # are modeled (not dropped) in order to be rejected with a clear error rather
+    # than silently ignored — see serving_chat's unsupported-parameter check.
+    frequency_penalty: Optional[float] = None
+    presence_penalty: Optional[float] = None
+    top_k: Optional[int] = None
+    logit_bias: Optional[dict[str, float]] = None
+    # Output-contract fields: modeled (not dropped) so we reject the ones we
+    # can't honor rather than returning an output that violates what was asked.
+    response_format: Optional[dict[str, Any]] = None
+    logprobs: Optional[bool] = None
+    top_logprobs: Optional[int] = None
+    parallel_tool_calls: Optional[bool] = None
+    # Per-request chat-template controls, e.g. {"enable_thinking": false} for Qwen3.
+    chat_template_kwargs: Optional[dict[str, Any]] = None
+    # Accepted now so the contract is stable; parsing/enforcement land in M2/M5.
+    tools: Optional[list[dict[str, Any]]] = None
+    tool_choice: Optional[Union[str, dict[str, Any]]] = None
+    reasoning_effort: Optional[str] = None
+
+    def resolved_max_tokens(self) -> int:
+        # Prefer max_completion_tokens, then max_tokens; -1 means "unset / auto".
+        # `is not None` (not `or`): an explicit 0 is a set value; callers validate it.
+        if self.max_completion_tokens is not None:
+            return self.max_completion_tokens
+        if self.max_tokens is not None:
+            return self.max_tokens
+        return -1
+
+
+class Usage(BaseModel):  # token accounting; every counter defaults to 0
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    total_tokens: int = 0  # a plain field: not computed from the other two here
+
+
+class ResponseMessage(BaseModel):  # `message` of a non-streaming choice
+    role: str = "assistant"
+    content: Optional[str] = None  # spec: string content or tool_calls
+    tool_calls: Optional[list[ToolCall]] = None
+
+
+class Choice(BaseModel):  # one entry of ChatCompletionResponse.choices
+    index: int = 0
+    message: ResponseMessage
+    finish_reason: Optional[str] = None  # spec: stop | length | tool_calls
+
+
+class ChatCompletionResponse(BaseModel):  # non-streaming response body
+    id: str = Field(default_factory=lambda: _new_id("chatcmpl"))
+    object: Literal["chat.completion"] = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))  # Unix seconds
+    model: str
+    choices: list[Choice]
+    usage: Usage = Field(default_factory=Usage)  # all-zero Usage unless supplied
+
+
+class DeltaMessage(BaseModel):  # incremental `delta` of a streamed choice
+    role: Optional[str] = None  # spec: "assistant" on the first chunk
+    content: Optional[str] = None
+    tool_calls: Optional[list[ToolCall]] = None  # spec: sent buffered, as whole calls
+
+
+class ChunkChoice(BaseModel):  # one entry of ChatCompletionChunk.choices
+    index: int = 0
+    delta: DeltaMessage
+    finish_reason: Optional[str] = None  # spec: carried by the final chunk
+
+
+class ChatCompletionChunk(BaseModel):  # one streamed chunk object
+    id: str  # required, no default (unlike ChatCompletionResponse.id)
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))  # Unix seconds
+    model: str
+    choices: list[ChunkChoice]
+    usage: Optional[Usage] = None  # unset unless the caller provides it
+
+
+class ModelCard(BaseModel):  # one entry of ModelList.data
+    id: str
+    object: Literal["model"] = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))  # Unix seconds
+    owned_by: str = "executorch"
+
+
+class ModelList(BaseModel):  # list wrapper around ModelCard entries
+    object: Literal["list"] = "list"
+    data: list[ModelCard]
diff --git a/extension/llm/server/python/requirements.txt b/extension/llm/server/python/requirements.txt
new file mode 100644
index 00000000000..70ad7ccb4dd
--- /dev/null
+++ b/extension/llm/server/python/requirements.txt
@@ -0,0 +1,5 @@
+fastapi>=0.110
+uvicorn[standard]>=0.27
+pydantic>=2.0
+# Optional but recommended for model-correct chat templating (--hf-tokenizer):
+# transformers>=4.40
diff --git a/extension/llm/server/python/tests/test_qwen_tool_parser.py b/extension/llm/server/python/tests/test_qwen_tool_parser.py
new file mode 100644
index 00000000000..f7c01bfa6d8
--- /dev/null
+++ b/extension/llm/server/python/tests/test_qwen_tool_parser.py
@@ -0,0 +1,126 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for QwenFunctionCallDetector (Qwen XML <function=…> tool format)."""
+
+import json
+
+from executorch.extension.llm.server.python.tool_parsers import QwenFunctionCallDetector
+
+# name -> JSON-schema `parameters` (as the server passes it to the detector).
+_TOOLS = {
+    "get_weather": {"type": "object", "properties": {"city": {"type": "string"}}},
+    "add": {
+        "type": "object",
+        "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
+    },
+}
+
+
+def _parse(text, tools=_TOOLS):
+    return QwenFunctionCallDetector().detect_and_parse(text=text, tools=tools)
+
+
+def test_basic_call():
+    # Wrapped call whose tags and value are newline-separated.
+    lines = ["<tool_call>", "<function=get_weather>", "<parameter=city>", "Paris"]
+    lines += ["</parameter>", "</function>", "</tool_call>"]
+    result = _parse("\n".join(lines))
+    assert len(result.calls) == 1
+    call = result.calls[0]
+    assert call.name == "get_weather"
+    assert json.loads(call.arguments) == {"city": "Paris"}
+    assert result.normal_text == ""
+
+
+def test_observed_model_output():
+    # The exact shape seen from Qwen3.5-MoE during the live smoke.
+    observed = "\n".join(
+        ["<tool_call>", "<function=get_weather>", "<parameter=city>", "Paris"]
+        + ["</parameter>", "</function>", "</tool_call>"]
+    )
+    names = [call.name for call in _parse(observed).calls]
+    assert names == ["get_weather"]
+
+
+def test_numeric_and_multi_param_coercion():
+    # _TOOLS declares both "a" and "b" as integers, so the values must decode
+    # as ints.
+    params = "<parameter=a>2</parameter><parameter=b>3</parameter>"
+    result = _parse(f"<function=add>{params}</function>")
+    decoded = json.loads(result.calls[0].arguments)
+    assert decoded == {"a": 2, "b": 3}
+
+
+def test_multiple_calls():
+    # Two back-to-back blocks keep their order; indices count up from 0.
+    weather = "<function=get_weather><parameter=city>Paris</parameter></function>"
+    addition = "<function=add><parameter=a>1</parameter></function>"
+    calls = _parse(weather + addition).calls
+    seen = [(c.name, c.tool_index) for c in calls]
+    assert [name for name, _ in seen] == ["get_weather", "add"]
+    assert [index for _, index in seen] == [0, 1]
+
+
+def test_leading_text_preserved():
+    call = "<function=get_weather><parameter=city>Paris</parameter></function>"
+    result = _parse("Let me check." + call)
+    assert result.normal_text == "Let me check."
+    assert len(result.calls) == 1
+
+
+def test_no_tool_call_is_plain_text():
+    # Without a <function=…> marker the prose comes back untouched.
+    prose = "The capital of France is Paris."
+    result = _parse(prose)
+    assert (result.calls, result.normal_text) == ([], prose)
+
+
+def test_undefined_tool_degrades_to_text():
+    # "delete_everything" is not in _TOOLS: nothing is parsed and the raw
+    # response stays visible as text.
+    raw = "<function=delete_everything><parameter=x>1</parameter></function>"
+    result = _parse(raw)
+    assert (result.calls, result.normal_text) == ([], raw)
+
+
+def test_missing_tool_call_wrapper_still_parses():
+    # A complete function block parses even when the <tool_call> wrapper is
+    # truncated or absent.
+    bare = "<function=get_weather><parameter=city>Paris</parameter></function>"
+    calls = _parse(bare).calls
+    assert len(calls) == 1
+    assert json.loads(calls[0].arguments) == {"city": "Paris"}
+
+
+# Schema-aware coercion: the XML format is stringly-typed, so values must be cast
+# to the declared schema type (the cause of several BFCL function-calling misses).
+def test_boolean_value_coerced_by_schema():
+    # Capitalized "True" is not valid JSON, but the schema declares a boolean.
+    schema = {"f": {"properties": {"flag": {"type": "boolean"}}}}
+    call_text = "<function=f><parameter=flag>True</parameter></function>"
+    call = _parse(call_text, schema).calls[0]
+    assert json.loads(call.arguments) == {"flag": True}
+
+
+def test_string_schema_keeps_numeric_literal_as_string():
+    # The schema declares a string, so the numeric-looking "1234" must stay a
+    # str instead of becoming an int.
+    schema = {"f": {"properties": {"id": {"type": "string"}}}}
+    parsed = _parse("<function=f><parameter=id>1234</parameter></function>", schema)
+    decoded = json.loads(parsed.calls[0].arguments)
+    assert decoded == {"id": "1234"} and isinstance(decoded["id"], str)
+
+
+def test_untyped_param_falls_back_to_json_guess():
+    # With no declared type each value gets a best-effort JSON guess, which
+    # keeps loosely-typed tools working.
+    untyped = {"f": {"properties": {}}}
+    params = "<parameter=n>42</parameter><parameter=items>[1, 2]</parameter>"
+    parsed = _parse(f"<function=f>{params}</function>", untyped)
+    decoded = json.loads(parsed.calls[0].arguments)
+    expected = {"n": 42, "items": [1, 2]}
+    assert decoded == expected
diff --git a/extension/llm/server/python/tool_parsers/__init__.py b/extension/llm/server/python/tool_parsers/__init__.py
new file mode 100644
index 00000000000..c890dec3888
--- /dev/null
+++ b/extension/llm/server/python/tool_parsers/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tool-call parsing. Two formats, pick the one matching your model:
+
+- HermesDetector: JSON inside <tool_call>…</tool_call> (Qwen2.5/3, Hermes).
+- QwenFunctionCallDetector: Qwen XML <function=…><parameter=…> (Qwen3.5-MoE /
+  Qwen3-Coder).
+
+The server buffers the model's full output and parses it once into complete
+OpenAI tool_calls; parse failures degrade to visible text.
+"""
+
+from .hermes import HermesDetector
+from .qwen import QwenFunctionCallDetector
+from .types import ParseResult, ToolCallItem
+
+__all__ = [
+    "HermesDetector",
+    "QwenFunctionCallDetector",
+    "ParseResult",
+    "ToolCallItem",
+]
diff --git a/extension/llm/server/python/tool_parsers/hermes.py b/extension/llm/server/python/tool_parsers/hermes.py
new file mode 100644
index 00000000000..94e5f747508
--- /dev/null
+++ b/extension/llm/server/python/tool_parsers/hermes.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Hermes-style tool calls: <tool_call>{"name": ..., "arguments": {...}}</tool_call>.
+
+Used by Qwen2.5/Qwen3 (and Hermes models); the Qwen XML format is handled
+separately by QwenFunctionCallDetector. The server buffers a model's full output
+and parses it once into complete OpenAI tool_calls (no partial-fragment
+streaming). Parse failures fall back to visible text — never a crash or a silent
+drop.
+"""
+
+import json
+import logging
+import re
+from typing import Any, Optional
+
+from .types import ParseResult, ToolCallItem
+
+logger = logging.getLogger(__name__)
+
+_CALL_RE = re.compile(r"<tool_call>\s*(.*?)\s*</tool_call>", re.DOTALL)
+
+
+class _UndefinedToolCall(Exception):
+    """Raised when a <tool_call> names a tool missing from the request's `tools`.
+    v1 then degrades the WHOLE response to visible text instead of emitting a
+    partial set: an undefined call is never dropped while its siblings run (spec)."""
+
+
+class HermesDetector:
+    """Detector for Hermes/Qwen JSON tool calls. It carries the per-request
+    tool-call index: build one instance per request and never share it."""
+
+    bot_token = "<tool_call>"
+
+    def __init__(self):
+        self._next_index = 0
+
+    def detect_and_parse(self, text: str, tools: dict[str, dict]) -> ParseResult:
+        """Split ``text`` into leading prose plus its complete tool calls. With no
+        call, an undefined tool, or a parse failure, the original text is returned
+        unchanged as ``normal_text`` so it stays visible to the client."""
+        marker_at = text.find(self.bot_token)
+        if marker_at == -1:
+            return ParseResult(normal_text=text)
+        calls: list[ToolCallItem] = []
+        try:
+            calls = self._parse_calls(text, tools)
+        except _UndefinedToolCall as e:
+            # Degrade the whole response: the undefined call is not dropped
+            # silently, and its valid siblings are not returned without it.
+            logger.debug("undefined tool %s; returning raw text (no partial calls)", e)
+        except Exception as e:  # noqa: BLE001 - never crash; fall back to visible text
+            logger.debug("tool parse failed (%s); returning raw text", e)
+        if not calls:
+            return ParseResult(normal_text=text)
+        return ParseResult(normal_text=text[:marker_at].strip(), calls=calls)
+
+    def _parse_calls(self, text: str, tools: dict[str, dict]) -> list[ToolCallItem]:
+        """Build one item per JSON entry of every non-empty <tool_call> body."""
+        items = []
+        for match in _CALL_RE.finditer(text):
+            payload = match.group(1).strip()
+            if not payload:
+                continue
+            decoded = json.loads(payload)
+            # One body may hold a single call object or a list of them.
+            for entry in decoded if isinstance(decoded, list) else [decoded]:
+                name = entry.get("name")
+                # "parameters" is the fallback when "arguments" is absent.
+                args = entry.get("arguments", entry.get("parameters"))
+                items.append(self._make_item(name, args, tools))
+        return items
+
+    def _make_item(
+        self, name: Optional[str], arguments: Any, tools: dict[str, dict]
+    ) -> ToolCallItem:
+        """Wrap one call under the next index. A missing or unknown ``name``
+        raises ``_UndefinedToolCall``; ``None`` arguments become ``{}``."""
+        if not (name and name in tools):
+            raise _UndefinedToolCall(repr(name))
+        payload = {} if arguments is None else arguments
+        encoded = json.dumps(payload, ensure_ascii=False)
+        item = ToolCallItem(tool_index=self._next_index, name=name, arguments=encoded)
+        # Advance the index only after the item was built.
+        self._next_index += 1
+        return item
diff --git a/extension/llm/server/python/tool_parsers/qwen.py b/extension/llm/server/python/tool_parsers/qwen.py
new file mode 100644
index 00000000000..01e7a884c77
--- /dev/null
+++ b/extension/llm/server/python/tool_parsers/qwen.py
@@ -0,0 +1,138 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Qwen XML-style tool calls: <function=NAME><parameter=K>V</parameter></function>.
+
+Emitted by Qwen3.5-MoE / Qwen3-Coder (typically wrapped in <tool_call>…
+</tool_call>), e.g.:
+
+    <tool_call>
+    <function=get_weather>
+    <parameter=city>
+    Paris
+    </parameter>
+    </function>
+    </tool_call>
+
+This is a DIFFERENT format from HermesDetector (JSON inside <tool_call>); pick the
+detector that matches your model. Detection triggers only on the unambiguous
+`<function=…>` marker so ordinary prose is not misclassified. Parse failures fall
+back to visible text — never a crash or a silent drop.
+"""
+
+import json
+import logging
+import re
+from typing import Any, Optional
+
+from .types import ParseResult, ToolCallItem
+
+logger = logging.getLogger(__name__)
+
+_FUNCTION_RE = re.compile(r"<function=([^>\s]+)\s*>(.*?)</function>", re.DOTALL)
+_PARAMETER_RE = re.compile(r"<parameter=([^>\s]+)\s*>(.*?)</parameter>", re.DOTALL)
+
+
+class _UndefinedToolCall(Exception):
+    """Raised when a call names a tool missing from the request's `tools`. v1 then
+    degrades the WHOLE response to visible text, not a partial set (spec)."""
+
+
+def _coerce(value: str, declared_type: Any) -> Any:
+    """Cast a raw XML parameter string to its JSON-schema declared type.
+
+    The Qwen XML format is stringly-typed (`<parameter=k>v</parameter>`), so
+    without the schema we'd have to guess. A bare `json.loads` guess mistypes two
+    common ways: a value the schema wants as a string but that looks numeric
+    (`1234`) becomes an int, and a value the schema wants as a bool but that the
+    model didn't write as valid JSON (`True`) stays a string.
+
+    `declared_type` is the schema's `type`: one name, a list of names for a
+    union such as `["string", "null"]` (tried in order; a literal `null` maps to
+    None when "null" is allowed), or None. Falls back to a JSON guess (then the
+    raw string) when the type is unknown or coercion fails, so untyped or
+    loosely-typed params keep working.
+    """
+    types = declared_type if isinstance(declared_type, list) else [declared_type]
+    text = value.strip()
+    if "null" in types and text == "null":
+        return None
+    for name in types:
+        if name == "string":
+            return value
+        if name == "boolean" and text.lower() in ("true", "false"):
+            return text.lower() == "true"
+        if name in ("integer", "number"):
+            try:
+                return int(text) if name == "integer" else float(text)
+            except (ValueError, TypeError):
+                pass  # not a numeric literal: try the next type, then guess
+    try:
+        return json.loads(value)
+    except (ValueError, TypeError):
+        return value
+
+
+class QwenFunctionCallDetector:
+    """Parses Qwen's XML tool-call format. Create a fresh instance per request
+    (it holds the per-request tool-call index); never share across requests."""
+
+    bot_token = "<tool_call>"
+
+    def __init__(self):
+        self._next_index = 0
+
+    def detect_and_parse(self, text: str, tools: dict[str, dict]) -> ParseResult:
+        """Return leading text + any complete tool calls. On no call or a parse
+        failure, return the original text unchanged (kept visible to the client).
+
+        `tools` maps each defined tool name to its JSON-schema ``parameters``
+        object; the schema is used to coerce stringly-typed XML values to their
+        declared types (and the key set validates names)."""
+        first = _FUNCTION_RE.search(text)
+        if first is None:
+            return ParseResult(normal_text=text)
+        try:
+            # `first` matched, so this yields at least one call or raises.
+            calls = self._parse_calls(text, tools)
+        except _UndefinedToolCall as e:
+            logger.debug("undefined tool %s; returning raw text (no partial calls)", e)
+            return ParseResult(normal_text=text)
+        except Exception as e:  # noqa: BLE001 - never crash; fall back to visible text
+            logger.debug("tool parse failed (%s); returning raw text", e)
+            return ParseResult(normal_text=text)
+        # Leading text ends at <tool_call> if it precedes the first <function=…> tag.
+        cut = text.find(self.bot_token)
+        if cut == -1 or cut > first.start():
+            cut = first.start()
+        return ParseResult(normal_text=text[:cut].strip(), calls=calls)
+
+    def _parse_calls(self, text: str, tools: dict[str, dict]) -> list[ToolCallItem]:
+        """Build one item per <function=…> block, coercing values by schema."""
+        calls = []
+        for name, body in _FUNCTION_RE.findall(text):
+            # A null/odd schema ("properties": null, a boolean property schema)
+            # only loses the declared type; it must not void a valid call.
+            schema = tools.get(name)
+            props = schema.get("properties") if isinstance(schema, dict) else None
+            args = {}
+            for key, value in _PARAMETER_RE.findall(body):
+                spec = props.get(key) if isinstance(props, dict) else None
+                declared = spec.get("type") if isinstance(spec, dict) else None
+                args[key] = _coerce(value.strip(), declared)
+            calls.append(self._make_item(name, args, tools))
+        return calls
+
+    def _make_item(
+        self, name: Optional[str], arguments: dict, tools: dict[str, dict]
+    ) -> ToolCallItem:
+        """Wrap one call under the next index; raise for a name not in ``tools``."""
+        if not name or name not in tools:
+            raise _UndefinedToolCall(repr(name))
+        encoded = json.dumps(arguments, ensure_ascii=False)
+        item = ToolCallItem(tool_index=self._next_index, name=name, arguments=encoded)
+        self._next_index += 1
+        return item
diff --git a/extension/llm/server/python/tool_parsers/types.py b/extension/llm/server/python/tool_parsers/types.py
new file mode 100644
index 00000000000..2dae5c79458
--- /dev/null
+++ b/extension/llm/server/python/tool_parsers/types.py
@@ -0,0 +1,33 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Protocol-agnostic tool-parsing types.
+
+Kept independent of the OpenAI wire schema so the parser package is reusable;
+serving_chat translates these into OpenAI tool_calls / deltas at the edge.
+Design adapted from SGLang's core_types, with explicit per-request state.
+"""
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+@dataclass
+class ToolCallItem:
+    """A parsed tool call. `arguments` is a JSON string (the full arguments —
+    this server emits complete calls, not fragments)."""
+
+    tool_index: int  # required; the detectors number calls from 0 per request
+    name: Optional[str] = None
+    arguments: str = ""  # JSON text; "" only when left at the default
+
+
+@dataclass
+class ParseResult:
+    """Outcome of a parse: free text plus any tool calls found."""
+
+    normal_text: str = ""  # text left visible to the client
+    calls: list[ToolCallItem] = field(default_factory=list)  # empty when none parsed
diff --git a/extension/llm/server/spec/README.md b/extension/llm/server/spec/README.md
new file mode 100644
index 00000000000..58e0e46ef57
--- /dev/null
+++ b/extension/llm/server/spec/README.md
@@ -0,0 +1,73 @@
+# ExecuTorch LLM Server — Contract Spec
+
+The language-neutral contract every ExecuTorch LLM server (Python today, C++
+later) implements. The conformance suite in `../conformance` validates an
+implementation against this spec by hitting a live server, so it is independent
+of language and engine.
+
+## Supported endpoints
+
+| Endpoint | Status |
+|----------|--------|
+| `GET /v1/models` | implemented |
+| `POST /v1/chat/completions` (stream + non-stream) | implemented |
+| `GET /health` | implemented |
+| `POST /v1/completions` | planned |
+
+## `POST /v1/chat/completions`
+
+OpenAI Chat Completions subset. **Honored** request fields: `model`, `messages`,
+`stream`, `temperature`, `max_tokens` / `max_completion_tokens`, `stop`, `tools`,
+`tool_choice` (only `"none"` to disable tools, or `"auto"`/unset for default
+parsing), `stream_options.include_usage`, and `chat_template_kwargs` (e.g.
+`enable_thinking`).
+
+**Rejected** with `400 invalid_request_error` (`code: "unsupported_parameter"`)
+rather than silently ignored — a client relying on them would otherwise get
+wrong behavior: `top_p` (anything other than `1.0`), `seed`, `n` (> 1),
+`reasoning_effort`, `frequency_penalty`/`presence_penalty` (nonzero), `top_k`,
+`logit_bias`, `tool_choice` = `"required"` or a specific-function choice
+(forcing/restricting a call needs constrained decoding, which v1 lacks),
+`response_format` other than `{"type": "text"}` (no constrained JSON),
+`logprobs`/`top_logprobs` (not returned), and `parallel_tool_calls: false`
+(single-call can't be guaranteed without constraining). Unknown fields that
+don't affect the output (e.g. `user`, `store`, `metadata`) are accepted and
+ignored.
+
+Non-streaming response: `chat.completion` with one `choice`
+(`message.role = "assistant"`, string `content` or `tool_calls`, `finish_reason`
+∈ `stop` | `length` | `tool_calls`) and a `usage` block.
+
+Streaming response: `text/event-stream` of `chat.completion.chunk` objects —
+first chunk carries `delta.role = "assistant"`, subsequent chunks carry
+`delta.content` (or buffered `delta.tool_calls`), a final chunk carries
+`finish_reason`, optionally a usage-only chunk (with
+`stream_options.include_usage`), terminated by `data: [DONE]`.
+
+### Tool calling
+
+Two output formats are accepted: Hermes-style JSON
+(`<tool_call>{"name":...,"arguments":{...}}</tool_call>`, used by Qwen2.5/Qwen3)
+and Qwen XML-style (`<function=NAME><parameter=K>V</parameter></function>`,
+typically wrapped in `<tool_call>`, used by Qwen3.5-MoE / Qwen3-Coder). The
+server buffers the model's full output and emits **complete** OpenAI
+`tool_calls` (no partial-argument fragments). Calls to tools absent from the
+request, and malformed tool calls, degrade to visible text — never a crash or
+silent drop. `tool_choice="none"` disables tool parsing.
+
+### Errors & cancellation
+
+Errors return `{"error": {"message", "type", "code"}}` with an appropriate
+status (e.g. `400 context_length_exceeded` when `--max-context` is set and the
+prompt, alone or plus the requested completion tokens, exceeds it). A mid-stream
+failure emits an `error` SSE event then `[DONE]` rather than dropping the
+socket. Cancellation is best-effort: on a client disconnect the control plane
+stops consuming the stream (`stop()`), but the worker runs the in-flight request
+to completion — v1 has no mid-generation interrupt protocol.
+
+### Prefix cache
+
+Prefix caching is not part of v1 serving. The control plane holds no KV state
+and does no prefix-reuse routing; each request is an independent prompt to the
+worker. If turn-to-turn KV prefix reuse returns, it will live inside the
+worker/session (where the KV cache is), not in the control plane.