Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions extension/llm/server/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# ExecuTorch LLM Server

OpenAI-compatible serving for ExecuTorch LLMs, so any OpenAI-compatible agent
harness (pi, opencode, ...) can use ExecuTorch as a local backend.

```
extension/llm/server/
  spec/         # language-neutral OpenAI contract ExecuTorch targets
  conformance/  # one test suite every language server must pass
  python/       # Python server implementation (current)
  # cpp/        # future: no-Python single-binary server
```

Why this layout: the OpenAI contract is identical across languages, so the
**spec** and **conformance** suite are shared, and each language gets its own
implementation directory. The real cross-language reuse comes from the C++
`LLMEngine`/`LLMSession` primitives underneath, packaged as a process-isolated
**worker binary** (`text_llm_worker`) that any control plane drives over a small
JSONL protocol — the server is a thin protocol shell that spawns and talks to
that worker. See `python/README.md` to run it.

Status: experimental, reliability-first and deliberately narrow. Implemented:
`/health`, `/v1/models`, `/v1/chat/completions` (streaming + non-streaming),
Hugging Face chat templates (`--hf-tokenizer`), `temperature` / `max_tokens` /
`max_completion_tokens` / `stop`, Hermes tool calling by default
(`<tool_call>...</tool_call>` JSON, complete calls only; model-specific launchers
may select the Qwen XML format) with `tool_choice="none"`,
structured API errors, and best-effort cancellation. V1 serving is single-slot
(one worker, one session) with no prefix cache; KV prefix reuse, if it returns,
lives inside the worker/session, not the control plane. Unsupported params (including `top_p`,
`seed`, `n>1`, `reasoning_effort`, penalties, `logit_bias`, `response_format`,
`logprobs`, and `tool_choice="required"`) are rejected with a structured 400
rather than silently ignored. See `python/README.md` to run it and
`spec/README.md` for the exact contract.
207 changes: 207 additions & 0 deletions extension/llm/server/conformance/test_openai_contract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Language-neutral OpenAI-contract conformance tests.

Runs against any base URL (ExecuTorch, llama.cpp, mlx-lm, ...) so every server
implementation is validated against one shared spec. Point it at a running
server:

OPENAI_BASE_URL=http://127.0.0.1:8000/v1 pytest test_openai_contract.py

Skips automatically if no server is reachable.
"""

import json
import os
import urllib.error
import urllib.request

import pytest

# Root of the OpenAI-compatible API under test (including the /v1 prefix).
# Trailing slashes are dropped so request paths can be appended directly.
_DEFAULT_BASE_URL = "http://127.0.0.1:8000/v1"
BASE_URL = os.getenv("OPENAI_BASE_URL", _DEFAULT_BASE_URL).rstrip("/")

# Model identifier sent in the "model" field of every chat request.
MODEL = os.getenv("OPENAI_MODEL", "executorch")


def _post(path: str, body: dict, stream: bool = False):
    """POST ``body`` as JSON to ``BASE_URL + path`` and return the open response.

    Args:
        path: Request path appended verbatim to ``BASE_URL`` (e.g.
            ``"/chat/completions"``).
        body: JSON-serialisable request payload.
        stream: Not read by this helper; callers pass it to mark streaming
            call sites and iterate the returned response themselves.

    Returns:
        The response object from ``urllib.request.urlopen`` (120 s timeout).
        The caller is responsible for closing it, typically via ``with``.
    """
    url = f"{BASE_URL}{path}"
    encoded_body = json.dumps(body).encode()
    request = urllib.request.Request(
        url,
        data=encoded_body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    return urllib.request.urlopen(request, timeout=120)


def _server_up() -> bool:
    """Return True if ``GET {BASE_URL}/models`` succeeds within 5 seconds.

    Any exception (connection refused, timeout, HTTP error status, ...) is
    deliberately treated as "server not reachable" so the suite skips instead
    of erroring at collection time.
    """
    try:
        # Use the response as a context manager so the probe's socket is
        # closed immediately instead of being left to the garbage collector.
        with urllib.request.urlopen(f"{BASE_URL}/models", timeout=5):
            return True
    except Exception:
        return False


# Probe the server once, when this module is imported for collection; every
# test in the file is skipped if the probe fails.
_NO_SERVER = not _server_up()
pytestmark = pytest.mark.skipif(
    _NO_SERVER,
    reason="no OpenAI server at OPENAI_BASE_URL",
)


def test_models_listing():
    """GET /models returns a non-empty list in which every entry has an ``id``."""
    with urllib.request.urlopen(f"{BASE_URL}/models", timeout=10) as r:
        data = json.loads(r.read())
    assert data["object"] == "list"
    models = data["data"]
    assert models, "model list is empty"
    # Every model object must be identifiable; a single well-formed entry must
    # not mask malformed ones.
    assert all("id" in m for m in models)


def test_chat_completion_nonstreaming():
    """A non-streaming chat request yields one assistant message with text."""
    request = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hello in one word."}],
        "max_tokens": 16,
        "temperature": 0.0,
    }
    with _post("/chat/completions", request) as resp:
        reply = json.loads(resp.read())

    assert reply["object"] == "chat.completion"
    first_choice = reply["choices"][0]
    message = first_choice["message"]
    assert message["role"] == "assistant"
    assert isinstance(message["content"], str)
    assert first_choice["finish_reason"] is not None


def test_chat_completion_streaming():
    """A streamed chat request emits role, content, and a final ``[DONE]``."""
    request = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Count to three."}],
        "max_tokens": 32,
        "stream": True,
    }
    role_seen = False
    content_seen = False
    done_seen = False
    with _post("/chat/completions", request, stream=True) as resp:
        for raw_line in resp:
            text = raw_line.decode().strip()
            # Only SSE "data:" lines carry payloads; skip blanks and comments.
            if text.startswith("data:"):
                payload = text[len("data:") :].strip()
                if payload == "[DONE]":
                    done_seen = True
                    break
                event = json.loads(payload)
                assert event["object"] == "chat.completion.chunk"
                delta = event["choices"][0]["delta"]
                if delta.get("role") == "assistant":
                    role_seen = True
                if delta.get("content"):
                    content_seen = True
    assert all((role_seen, content_seen, done_seen))


def test_multibyte_streaming_integrity():
    """Streaming a multi-byte reply completes and yields non-empty text."""
    # Byte-level BPE can split a multi-byte character across tokens; the stream
    # must reassemble it, not abort with a UTF-8 decode error.
    body = {
        "model": MODEL,
        "messages": [
            {"role": "user", "content": "Reply with exactly: 你好世界 🌍 café"}
        ],
        "max_tokens": 32,
        "temperature": 0.0,
        "stream": True,
    }
    content, saw_done, saw_error = "", False, False
    with _post("/chat/completions", body, stream=True) as r:
        for raw in r:
            line = raw.decode().strip()
            if not line.startswith("data:"):
                continue
            payload = line[len("data:") :].strip()
            if payload == "[DONE]":
                saw_done = True
                break
            chunk = json.loads(payload)
            if "error" in chunk:
                saw_error = True
            choices = chunk.get("choices")
            if choices:
                # A delta may omit `content` or send it as an explicit null
                # (e.g. a role-only or final chunk); treat both as "no text"
                # rather than failing on `str + None`.
                content += choices[0]["delta"].get("content") or ""
    assert saw_done and not saw_error
    assert isinstance(content, str) and content  # reassembled, valid UTF-8


def test_usage_chunk_in_stream():
    """With ``include_usage`` set, the stream carries a consistent usage chunk."""
    request = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Say hi."}],
        "max_tokens": 16,
        "stream": True,
        "stream_options": {"include_usage": True},
    }
    usage = None
    with _post("/chat/completions", request, stream=True) as resp:
        for raw_line in resp:
            text = raw_line.decode().strip()
            if text.startswith("data:"):
                payload = text[len("data:") :].strip()
                if payload == "[DONE]":
                    break
                event = json.loads(payload)
                # Keep the most recent non-empty usage object seen.
                if event.get("usage"):
                    usage = event["usage"]

    assert usage is not None, "no usage chunk emitted with include_usage"
    assert usage["prompt_tokens"] > 0 and usage["completion_tokens"] > 0
    expected_total = usage["prompt_tokens"] + usage["completion_tokens"]
    assert usage["total_tokens"] == expected_total


# JSON Schema for the single required string argument of the sample tool.
_WEATHER_PARAMETERS = {
    "type": "object",
    "properties": {"city": {"type": "string"}},
    "required": ["city"],
}

# Function-tool definition passed in the "tools" field of tool-calling requests.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": _WEATHER_PARAMETERS,
    },
}


def test_tool_call_response_shape():
    """A tool-eliciting prompt returns a well-formed ``tool_calls`` entry."""
    request = {
        "model": MODEL,
        "messages": [
            {"role": "user", "content": "What is the weather in Paris? Use the tool."}
        ],
        "tools": [WEATHER_TOOL],
        "max_tokens": 128,
        "temperature": 0.0,
    }
    with _post("/chat/completions", request) as resp:
        reply = json.loads(resp.read())

    tool_calls = reply["choices"][0]["message"].get("tool_calls")
    assert tool_calls, "expected tool_calls in response"

    first_call = tool_calls[0]
    assert first_call["type"] == "function"
    assert first_call["id"]
    assert first_call["function"]["name"] == "get_weather"
    # `arguments` must be a JSON-encoded string; parsing raises if it is not.
    json.loads(first_call["function"]["arguments"])
    assert reply["choices"][0]["finish_reason"] == "tool_calls"


def test_error_body_shape():
    """An over-long prompt is rejected with a 4xx and an ``error`` envelope."""
    # Over-long prompt -> structured 400 (OpenAI error envelope), not a 500/drop.
    oversized_prompt = "word " * 40000
    request = {
        "model": MODEL,
        "messages": [{"role": "user", "content": oversized_prompt}],
        "max_tokens": 8,
    }
    try:
        _post("/chat/completions", request)
    except urllib.error.HTTPError as http_err:
        assert 400 <= http_err.code < 500
        error_obj = json.loads(http_err.read())["error"]
        assert error_obj["message"] and error_obj["type"]
    else:
        # The request was accepted, which is itself the failure.
        raise AssertionError("expected an HTTP error for over-long prompt")
88 changes: 88 additions & 0 deletions extension/llm/server/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Generic model-execution worker for standard .pte TextLLM models. One binary,
# no registry/factory: it constructs TextLLMEngine/TextLLMSession directly and
# speaks the JSONL worker protocol (worker_client.py). Model execution is C++
# only — the Python server is HTTP/control plane.
#
# Build like the example runners (standalone), e.g. from this directory:
#
#   cmake -S . -B <executorch-cmake-out>/extension/llm/server/cpp \
#     -DCMAKE_PREFIX_PATH=<executorch-cmake-out> -DEXECUTORCH_BUILD_XNNPACK=ON
#   cmake --build <executorch-cmake-out>/extension/llm/server/cpp \
#     --target text_llm_worker

cmake_minimum_required(VERSION 3.24)
project(llm_server_workers)

# Compile as C++17 and fail configuration if the compiler cannot provide it.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# ExecuTorch source root, four directory levels above this file.
set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../../..")

# Shared ExecuTorch CMake helpers (presumably the source of the
# executorch_target_link_options_shared_lib / target_link_options_gc_sections
# commands called later in this file -- confirm).
include("${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake")

# Directory that contains the ExecuTorch checkout, added to the include path.
set(_common_include_directories "${EXECUTORCH_ROOT}/..")

# Vendored single-include nlohmann/json for the worker protocol (no new dep).
set(_json_include
    "${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include"
)

# gflags: use the package config produced inside the ExecuTorch build tree
# (this directory's binary dir is expected to sit four levels below it).
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../../third-party/gflags)
find_package(gflags REQUIRED)

# executorch: make the ExecuTorch build tree searchable, then import its
# targets.
# NOTE(review): FIND_ROOT_PATH_BOTH is not a documented find_package() keyword
# (the documented spelling is CMAKE_FIND_ROOT_PATH_BOTH); kept unchanged to
# avoid altering the package search -- confirm intent.
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../../..)
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
executorch_target_link_options_shared_lib(executorch)

# Libraries linked into the worker. Entries are appended in link order; do not
# reorder without checking static-link resolution.
set(link_libraries executorch gflags)

# CPU ops
list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)

# Custom + quantized kernels that export_llm models need, whole-archived so the
# static op registrations survive the linker: llama::custom_sdpa (from
# use_sdpa_with_kv_cache) and quantized_decomposed ops (from quantized exports).
# Without these the model loads but execution fails with "Missing operator".
# Both are optional: they are linked only when the ExecuTorch build provides
# them.
if(TARGET custom_ops)
  executorch_target_link_options_shared_lib(custom_ops)
  list(APPEND link_libraries custom_ops)
endif()
if(TARGET quantized_ops_lib)
  list(APPEND link_libraries quantized_kernels quantized_ops_lib)
  executorch_target_link_options_shared_lib(quantized_ops_lib)
endif()

# Extensions (Engine/Session lives in extension_llm_runner)
list(
  APPEND
  link_libraries
  extension_llm_runner
  extension_module
  extension_data_loader
  extension_tensor
  extension_flat_tensor
)

# XNNPACK: the standard CPU backend for normal .pte TextLLM models. It is a
# hard requirement here, so fail with an actionable message instead of an
# obscure "not a target" / unresolved-library error further down.
if(NOT TARGET xnnpack_backend)
  message(
    FATAL_ERROR
      "text_llm_worker requires the xnnpack_backend target, which the "
      "ExecuTorch package found above does not provide. Rebuild ExecuTorch "
      "with -DEXECUTORCH_BUILD_XNNPACK=ON."
  )
endif()
list(APPEND link_libraries xnnpack_backend)
executorch_target_link_options_shared_lib(xnnpack_backend)

# Tokenizer
list(APPEND link_libraries tokenizers::tokenizers)

# The worker binary. Nothing links against an executable, so its include
# directories and link dependencies are PRIVATE usage requirements.
add_executable(text_llm_worker text_llm_worker.cpp)
target_include_directories(
  text_llm_worker PRIVATE ${_common_include_directories} ${_json_include}
)
target_link_libraries(text_llm_worker PRIVATE ${link_libraries})

# Size optimisations for every build type except Debug. The comparison is
# quoted so the value is compared as a plain string.
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
  target_link_options_gc_sections(text_llm_worker)
  # "-s" (strip all symbols) is a GNU-style linker flag: Apple's ld treats it
  # as obsolete and MSVC's link.exe does not recognise it, so only pass it
  # where it is supported.
  if(NOT APPLE AND NOT MSVC)
    target_link_options(text_llm_worker PRIVATE "LINKER:-s")
  endif()
endif()
Loading
Loading