pytorch · mergennachin · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/Makefile b/Makefile
@@ -129,7 +129,7 @@ help:
 	@echo "  gemma3-cpu          - Build Gemma3 runner with CPU backend"
 	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner with CUDA backend"
 	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
-	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
+	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner + OpenAI serving worker (CUDA)"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
 	@echo "  clean               - Clean build artifacts"
 
@@ -431,11 +431,13 @@ voxtral_tts-cuda:
 qwen3_5_moe-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
 	cmake --workflow --preset llm-release-cuda
-	@echo "==> Building Qwen3.5 MoE runner with CUDA..."
+	@echo "==> Building Qwen3.5 MoE runner + serving worker with CUDA..."
 	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda
 	@echo ""
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
+	@echo "  Serving worker: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
+	@echo "  Launch: see examples/models/qwen3_5_moe/README.md (Serving)"
 
 gemma4_31b-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."

@@ -15,6 +15,11 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+# Header-only nlohmann/json already vendored under the tokenizers extension;
+# used by the worker for its JSONL protocol, so no new dependency is added.
+set(_json_include
+    ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
+)
 
 # gflags
 set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
@@ -60,7 +65,7 @@ endif()
 # Tokenizer
 list(APPEND link_libraries tokenizers::tokenizers)
 
-add_executable(qwen3_5_moe_runner main.cpp)
+add_executable(qwen3_5_moe_runner main.cpp qwen35_moe_engine.cpp)
 target_include_directories(
   qwen3_5_moe_runner PUBLIC ${_common_include_directories}
 )
@@ -70,3 +75,18 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(qwen3_5_moe_runner)
   target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
 endif()
+
+# qwen3_5_moe_worker: process-isolated serving worker. It constructs
+# Qwen35MoEEngine directly and speaks the JSONL worker protocol that the Python
+# control plane drives via WorkerClient (no pybind, no Python model code). The
+# qwen3-5-moe-cuda build preset builds it together with qwen3_5_moe_runner.
+add_executable(qwen3_5_moe_worker qwen35_moe_worker.cpp qwen35_moe_engine.cpp)
+target_include_directories(
+  qwen3_5_moe_worker PUBLIC ${_common_include_directories} ${_json_include}
+)
+target_link_libraries(qwen3_5_moe_worker PUBLIC ${link_libraries})
+
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(qwen3_5_moe_worker)
+  target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s")
+endif()
@@ -13,7 +13,7 @@
         },
         {
             "name": "qwen3-5-moe-cuda",
-            "displayName": "Qwen3.5 MoE runner (CUDA)",
+            "displayName": "Qwen3.5 MoE runner + serving worker (CUDA)",
             "inherits": ["qwen3-5-moe-base"],
             "cacheVariables": {
                 "EXECUTORCH_BUILD_CUDA": "ON"
@@ -41,9 +41,9 @@
     "buildPresets": [
         {
             "name": "qwen3-5-moe-cuda",
-            "displayName": "Build Qwen3.5 MoE runner (CUDA)",
+            "displayName": "Build Qwen3.5 MoE runner + serving worker (CUDA)",
             "configurePreset": "qwen3-5-moe-cuda",
-            "targets": ["qwen3_5_moe_runner"]
+            "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
         },
         {
             "name": "qwen3-5-moe-metal",
@@ -55,7 +55,7 @@
     "workflowPresets": [
         {
             "name": "qwen3-5-moe-cuda",
-            "displayName": "Configure and build Qwen3.5 MoE runner (CUDA)",
+            "displayName": "Configure and build Qwen3.5 MoE runner + serving worker (CUDA)",
             "steps": [
                 {
                     "type": "configure",

diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
@@ -100,14 +100,16 @@ It can be uploaded to HuggingFace Hub for easy sharing.
 
 ExecuTorch must be installed from source first (see
 [Prerequisites](#prerequisites)). The `make` target handles building
-core libraries and the runner binary.
+core libraries and the runner and serving-worker binaries.
 
 ```bash
 make qwen3_5_moe-cuda
 ```
 
 This builds ExecuTorch with CUDA backend support, then the runner binary
-at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner`.
+at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner` and the
+serving worker at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker`
+(see [Serving](#serving-openai-compatible)).
 
 ## Run
 
@@ -133,11 +135,95 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
 | `--data_path` | (none) | Path to `.ptd` delegate data file (required for CUDA) |
 | `--tokenizer_path` | (required) | Path to HuggingFace `tokenizer.json` |
 | `--prompt` | `"Hello"` | Input prompt text |
+| `--prompt_file` | (none) | Path to a file with the prompt (overrides `--prompt`) |
 | `--temperature` | `0.8` | Sampling temperature (0 = greedy) |
 | `--max_new_tokens` | `128` | Maximum tokens to generate |
+| `--cuda_graph` | off | Capture/replay the decode method as a CUDA graph (CUDA only). See the `--cuda_graph` entry under [Troubleshooting](#troubleshooting). |
+| `--warmup` | `0` | Warmup iterations to discard before timing (one model load; the session is reset between iterations) |
+| `--num_iters` | `1` | Timed iterations to average, after warmup |
+
+## Serving (OpenAI-compatible)
+
+Run an OpenAI-compatible HTTP server so an agent harness (pi, opencode, …) can
+use the model for local tool-use. Point your client at `http://<host>:<port>/v1`.
+
+The CUDA build produces the runner **and** the serving worker:
+
+```bash
+make qwen3_5_moe-cuda
+```
+
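+Launch the server (the `LD_LIBRARY_PATH` prefix is forwarded to the worker process, which needs the conda `libstdc++` to load the AOTI CUDA blob; see [Troubleshooting](#troubleshooting)):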
+
+```bash
+LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH \
+  python -m executorch.examples.models.qwen3_5_moe.serve \
+    --model-path  qwen35_moe_exports/model.pte \
+    --data-path   qwen35_moe_exports/aoti_cuda_blob.ptd \
+    --tokenizer-path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
+    --hf-tokenizer   ~/models/Qwen3.5-35B-A3B \
+    --model-id qwen3.5-moe --no-think
+```
+
+### Architecture (process isolation)
+
+Two processes, one model load:
+
+```
+serve.py            (control plane: FastAPI/asyncio, OpenAI protocol, chat
+                     templating, tool parsing, validation — NO CUDA, NO pybind)
+   │  JSONL over stdin/stdout
+   ▼
+qwen3_5_moe_worker  (C++ binary: one Qwen35MoEEngine + one session, synchronous
+                     loop — the CUDA model; NO asyncio server)
+```
+
+The model runs in a **separate worker process** because executing the AOTI CUDA
+model inside a live asyncio server process segfaults in the int4 matmul. The
+crash is reproducible and was narrowed down, by elimination, to the interaction
+between the asyncio event loop and CUDA. The worker instead runs the model the
+way the CLI runner does — a plain synchronous loop — which is reliable. The
+control plane only does blocking pipe I/O (no CUDA), which is safe under asyncio.
+
+### Serve Options
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--model-path` | (required) | Path to exported `.pte` model |
+| `--data-path` | (none) | Path to `.ptd` delegate data file (required for CUDA) |
+| `--tokenizer-path` | (required) | Path to HuggingFace `tokenizer.json` |
+| `--hf-tokenizer` | (required) | HF tokenizer id/dir for the chat template + encoding |
+| `--model-id` | `qwen3.5-moe` | Model id reported on `/v1/models` |
+| `--host` / `--port` | `127.0.0.1` / `8000` | Bind address |
+| `--max-context` | (none) | Reject prompts that exceed it with 400 |
+| `--no-think` | off | Default reasoning off (`enable_thinking=False`) |
+
+### V1 limitations
+
+- **Single-slot** (`serving_capacity=1`): one worker, one session, one model
+  load. `--num-runners > 1` is rejected; concurrent requests queue on the worker.
+- **No prefix cache**: the recurrent/conv state cannot be rewound by position
+  (`seek()` is NotSupported), so turn-to-turn KV reuse is off.
+- Supports the chat-completions contract of the generic server; `top_p != 1`,
+  `seed`, `top_k`, `logprobs`, etc. are rejected (only temperature is plumbed).
 
 ## Troubleshooting
 
+- **Runner exits silently right after `Loading methods...`**: the AOTI CUDA blob
+  is compiled with the conda toolchain's `libstdc++`, which is newer than the
+  system one (it needs e.g. `GLIBCXX_3.4.34`). Prepend the conda lib dir so the
+  runner loads the matching `libstdc++`:
+
+  ```bash
+  LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH \
+    cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner ...
+  ```
+- **`aoti_torch_cuda_sort_stable ... API call failed` when re-running prefill
+  with `--cuda_graph`**: capturing the decode CUDA graph and then running another
+  prefill in the same process currently fails (allocator interaction). Use
+  `--cuda_graph` for single prefill+decode runs; omit it when looping with
+  `--warmup`/`--num_iters`.
+
 - **OOM during export**: The model requires significant GPU memory even
   with int4 quantization. Try reducing `--max-seq-len` or using a GPU
   with more VRAM.