Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ help:
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner with CUDA backend"
@echo " gemma4_31b-mlx - Build Gemma 4 31B runner with MLX backend"
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner + OpenAI serving worker (CUDA)"
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
@echo " clean - Clean build artifacts"

Expand Down Expand Up @@ -431,11 +431,13 @@ voxtral_tts-cuda:
qwen3_5_moe-cuda:
@echo "==> Building and installing ExecuTorch with CUDA..."
cmake --workflow --preset llm-release-cuda
@echo "==> Building Qwen3.5 MoE runner with CUDA..."
@echo "==> Building Qwen3.5 MoE runner + serving worker with CUDA..."
cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
@echo " Serving worker: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
@echo " Launch: see examples/models/qwen3_5_moe/README.md (Serving)"

gemma4_31b-cuda:
@echo "==> Building and installing ExecuTorch with CUDA..."
Expand Down
22 changes: 21 additions & 1 deletion examples/models/qwen3_5_moe/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

set(_common_include_directories ${EXECUTORCH_ROOT}/..)
# Vendored single-include nlohmann/json for the worker JSONL protocol (no new
# dependency).
set(_json_include
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
)

# gflags
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
Expand Down Expand Up @@ -60,7 +65,7 @@ endif()
# Tokenizer
list(APPEND link_libraries tokenizers::tokenizers)

add_executable(qwen3_5_moe_runner main.cpp)
add_executable(qwen3_5_moe_runner main.cpp qwen35_moe_engine.cpp)
target_include_directories(
qwen3_5_moe_runner PUBLIC ${_common_include_directories}
)
Expand All @@ -70,3 +75,18 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_options_gc_sections(qwen3_5_moe_runner)
target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
endif()

# Process-isolated serving worker (qwen3_5_moe_worker): constructs
# Qwen35MoEEngine directly and speaks the JSONL worker protocol that the Python
# control plane drives via WorkerClient (no pybind, no Python model code). Built
# alongside the runner by the qwen3-5-moe-cuda preset.
#
# qwen35_moe_engine.cpp is compiled directly into this executable; the same
# source file is also listed in the qwen3_5_moe_runner target.
add_executable(qwen3_5_moe_worker qwen35_moe_worker.cpp qwen35_moe_engine.cpp)
# In addition to the common include root, the worker adds ${_json_include} (the
# vendored single-include nlohmann/json directory) for its JSONL protocol code.
target_include_directories(
qwen3_5_moe_worker PUBLIC ${_common_include_directories} ${_json_include}
)
# link_libraries is the list accumulated earlier in this file (it includes
# tokenizers::tokenizers, among others).
target_link_libraries(qwen3_5_moe_worker PUBLIC ${link_libraries})

# Non-Debug builds get the same link-time treatment as qwen3_5_moe_runner:
# target_link_options_gc_sections() is a project helper (presumably from
# tools/cmake/Utils.cmake, which is included above; confirm there), and
# "LINKER:-s" passes -s to the linker, which on GNU-style linkers strips symbols
# from the output binary.
# NOTE(review): CMAKE_BUILD_TYPE is empty under multi-config generators, so this
# branch is then always taken.
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_options_gc_sections(qwen3_5_moe_worker)
target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s")
endif()
8 changes: 4 additions & 4 deletions examples/models/qwen3_5_moe/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
},
{
"name": "qwen3-5-moe-cuda",
"displayName": "Qwen3.5 MoE runner (CUDA)",
"displayName": "Qwen3.5 MoE runner + serving worker (CUDA)",
"inherits": ["qwen3-5-moe-base"],
"cacheVariables": {
"EXECUTORCH_BUILD_CUDA": "ON"
Expand Down Expand Up @@ -41,9 +41,9 @@
"buildPresets": [
{
"name": "qwen3-5-moe-cuda",
"displayName": "Build Qwen3.5 MoE runner (CUDA)",
"displayName": "Build Qwen3.5 MoE runner + serving worker (CUDA)",
"configurePreset": "qwen3-5-moe-cuda",
"targets": ["qwen3_5_moe_runner"]
"targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
},
{
"name": "qwen3-5-moe-metal",
Expand All @@ -55,7 +55,7 @@
"workflowPresets": [
{
"name": "qwen3-5-moe-cuda",
"displayName": "Configure and build Qwen3.5 MoE runner (CUDA)",
"displayName": "Configure and build Qwen3.5 MoE runner + serving worker (CUDA)",
"steps": [
{
"type": "configure",
Expand Down
90 changes: 88 additions & 2 deletions examples/models/qwen3_5_moe/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,16 @@ It can be uploaded to HuggingFace Hub for easy sharing.

ExecuTorch must be installed from source first (see
[Prerequisites](#prerequisites)). The `make` target handles building
core libraries and the runner binary.
core libraries and the binaries.

```bash
make qwen3_5_moe-cuda
```

This builds ExecuTorch with CUDA backend support, then the runner binary
at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner`.
at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner` and the
serving worker at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker`
(see [Serving](#serving-openai-compatible)).

## Run

Expand All @@ -133,11 +135,95 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
| `--data_path` | (none) | Path to `.ptd` delegate data file (required for CUDA) |
| `--tokenizer_path` | (required) | Path to HuggingFace `tokenizer.json` |
| `--prompt` | `"Hello"` | Input prompt text |
| `--prompt_file` | (none) | Path to a file with the prompt (overrides `--prompt`) |
| `--temperature` | `0.8` | Sampling temperature (0 = greedy) |
| `--max_new_tokens` | `128` | Maximum tokens to generate |
| `--cuda_graph` | off | Capture/replay the decode method as a CUDA graph (CUDA only). See the caveat below. |
| `--warmup` | `0` | Warmup iterations to discard before timing (one model load; the session is reset between iterations) |
| `--num_iters` | `1` | Timed iterations to average, after warmup |

## Serving (OpenAI-compatible)

Run an OpenAI-compatible HTTP server so an agent harness (pi, opencode, …) can
use the model for local tool-use. Point your client at `http://<host>:<port>/v1`.

The CUDA build produces the runner **and** the serving worker:

```bash
make qwen3_5_moe-cuda
```

Launch the server (the `LD_LIBRARY_PATH` prefix is forwarded to the worker so that the AOTI CUDA blob loads the matching `libstdc++`; see [Troubleshooting](#troubleshooting)):

```bash
LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH \
python -m executorch.examples.models.qwen3_5_moe.serve \
--model-path qwen35_moe_exports/model.pte \
--data-path qwen35_moe_exports/aoti_cuda_blob.ptd \
--tokenizer-path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
--hf-tokenizer ~/models/Qwen3.5-35B-A3B \
--model-id qwen3.5-moe --no-think
```

### Architecture (process isolation)

Two processes, one model load:

```
serve.py (control plane: FastAPI/asyncio, OpenAI protocol, chat
templating, tool parsing, validation — NO CUDA, NO pybind)
│ JSONL over stdin/stdout
qwen3_5_moe_worker (C++ binary: one Qwen35MoEEngine + one session, synchronous
loop — the CUDA model; NO asyncio server)
```

The model runs in a **separate worker process** because executing the AOTI CUDA
model inside a live asyncio server process segfaults in the int4 matmul. The
crash is reproducible and was isolated, by elimination, to the interaction
between the asyncio event loop and CUDA. The worker instead runs the model the
same way the CLI runner does — in a plain synchronous loop — which is reliable.
The control plane never touches CUDA; it only does blocking pipe I/O to the
worker, which is safe under asyncio.

### Serve Options

| Flag | Default | Description |
|------|---------|-------------|
| `--model-path` | (required) | Path to exported `.pte` model |
| `--data-path` | (none) | Path to `.ptd` delegate data file (required for CUDA) |
| `--tokenizer-path` | (required) | Path to HuggingFace `tokenizer.json` |
| `--hf-tokenizer` | (required) | HF tokenizer id/dir for the chat template + encoding |
| `--model-id` | `qwen3.5-moe` | Model id reported on `/v1/models` |
| `--host` / `--port` | `127.0.0.1` / `8000` | Bind address and port |
| `--max-context` | (none) | Maximum context length; prompts that exceed it are rejected with HTTP 400 |
| `--no-think` | off | Turn reasoning off by default (`enable_thinking=False`) |

### V1 limitations

- **Single-slot** (`serving_capacity=1`): one worker, one session, one model
load. `--num-runners > 1` is rejected; concurrent requests queue on the worker.
- **No prefix cache**: the recurrent/conv state cannot be rewound by position
(`seek()` is NotSupported), so turn-to-turn KV reuse is off.
- Supports the chat-completions contract of the generic server, but only
`temperature` is plumbed through: requests that set `top_p != 1`, `seed`,
`top_k`, `logprobs`, etc. are rejected.

## Troubleshooting

- **Runner exits silently right after `Loading methods...`**: the AOTI CUDA blob
is compiled with the conda toolchain's `libstdc++`, which is newer than the
system one (it needs e.g. `GLIBCXX_3.4.34`). Prepend the conda lib dir so the
runner loads the matching `libstdc++`:

```bash
LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH \
cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner ...
```
- **`aoti_torch_cuda_sort_stable ... API call failed` when re-running prefill
with `--cuda_graph`**: capturing the decode CUDA graph and then running another
prefill in the same process currently fails (allocator interaction). Use
`--cuda_graph` for single prefill+decode runs; omit it when looping with
`--warmup`/`--num_iters`.

- **OOM during export**: The model requires significant GPU memory even
with int4 quantization. Try reducing `--max-seq-len` or using a GPU
with more VRAM.
Expand Down
Loading
Loading