From 688f924feda2d6d72496b7d83bddf3c894c3a32c Mon Sep 17 00:00:00 2001 From: ereid7 Date: Sat, 28 Feb 2026 13:10:18 -0500 Subject: [PATCH 1/3] fix: support Apple Silicon via vllm-metal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Always call .cpu() before .numpy() in _to_numpy() — MPS tensors are not on CPU and the previous CUDA-only check missed them. .cpu() is a no-op on CPU tensors so this is safe for all devices. Add Apple Silicon setup docs to README with vllm-metal install steps. --- README.md | 44 +++++++++++++++++++++++++++++++++++++ src/qr_sampler/processor.py | 7 ++---- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a604ccc..5e293aa 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,50 @@ export QR_GRPC_SERVER_ADDRESS=localhost:50051 vllm serve Qwen/Qwen2.5-1.5B-Instruct --dtype half --max-model-len 8096 --gpu-memory-utilization 0.80 ``` +### Apple Silicon (macOS) + +qr-sampler works on Apple Silicon via [vllm-metal](https://github.com/vllm-project/vllm-metal), a community-maintained vLLM plugin under the official `vllm-project` GitHub org. It uses MLX under the hood but exposes the same vLLM API and plugin system — same entry points, same endpoints, same `curl` commands. + +#### 1. Install vllm-metal + +```bash +curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm-metal/main/install.sh | bash +``` + +This creates a virtual environment at `~/.venv-vllm-metal` with vLLM and all dependencies. Requires Python 3.12+. + +#### 2. Install qr-sampler + +```bash +source ~/.venv-vllm-metal/bin/activate +pip install qr-sampler +``` + +#### 3. Start the server + +```bash +source ~/.venv-vllm-metal/bin/activate +vllm serve Qwen/Qwen2.5-1.5B-Instruct +``` + +qr-sampler registers automatically via the same `vllm.logits_processors` entry point — no additional configuration needed. The plugin is discovered at startup and processes every token. + +#### 4. 
Send a request + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "prompt": "The nature of consciousness is", + "max_tokens": 100 + }' +``` + +All configuration (entropy sources, temperature strategies, per-request overrides) works identically to the NVIDIA setup. The only difference is how vLLM itself is installed. + +> **Note:** The Docker deployment profiles use NVIDIA GPU images and are not compatible with Apple Silicon. Use the bare-metal install above instead. + ### System entropy fallback Without an external entropy source, qr-sampler falls back to `os.urandom()`. This is useful for development and testing but does not provide the quantum randomness needed for consciousness-research experiments. To use system entropy, set `QR_ENTROPY_SOURCE_TYPE=system` (this is the default). diff --git a/src/qr_sampler/processor.py b/src/qr_sampler/processor.py index 7d0d7f6..80a1e6c 100644 --- a/src/qr_sampler/processor.py +++ b/src/qr_sampler/processor.py @@ -497,12 +497,9 @@ def _to_numpy(tensor: Any) -> np.ndarray: """ if isinstance(tensor, np.ndarray): return tensor - # torch.Tensor — use .numpy() for zero-copy on CPU. + # torch.Tensor — .cpu() is a no-op on CPU tensors, required for MPS/CUDA. 
try: - if tensor.is_cuda: - result: np.ndarray = tensor.detach().cpu().numpy() - else: - result = tensor.detach().numpy() + result: np.ndarray = tensor.detach().cpu().numpy() return result except AttributeError: return np.asarray(tensor) From 8f013d9c0833f023f24f5e304cc18fee7e5558cf Mon Sep 17 00:00:00 2001 From: ereid7 Date: Sat, 28 Feb 2026 20:43:25 -0500 Subject: [PATCH 2/3] docs: improve Apple Silicon setup with MLX models, verification, and PR #124 note --- .gitignore | 1 + README.md | 45 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 7a4c0f1..e5cd29c 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,4 @@ node_modules/ # Setuptools SCM src/qr_sampler/_version.py +.webui_secret_key diff --git a/README.md b/README.md index 5e293aa..50d5c47 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,10 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct --dtype half --max-model-len 8096 --gpu-me qr-sampler works on Apple Silicon via [vllm-metal](https://github.com/vllm-project/vllm-metal), a community-maintained vLLM plugin under the official `vllm-project` GitHub org. It uses MLX under the hood but exposes the same vLLM API and plugin system — same entry points, same endpoints, same `curl` commands. +vllm-metal works with MLX-format models from the [mlx-community](https://huggingface.co/mlx-community) collection on Hugging Face. These are pre-converted and quantized for Apple Silicon — pick one that fits your available memory. + +> **Prerequisite:** vllm-metal currently does not load custom logits processors registered via entry points — it creates an empty `LogitsProcessors()` instead of calling `build_logitsprocs()`. [PR #124](https://github.com/vllm-project/vllm-metal/pull/124) fixes this with a 9-line patch that mirrors `GPUModelRunner`'s pattern. Until it is merged, you will need to apply the patch manually or install from the PR branch. 
Without it, qr-sampler's plugin will be silently skipped. + #### 1. Install vllm-metal ```bash @@ -142,26 +146,40 @@ pip install qr-sampler ```bash source ~/.venv-vllm-metal/bin/activate -vllm serve Qwen/Qwen2.5-1.5B-Instruct +vllm serve mlx-community/Qwen3-0.6B-4bit ``` -qr-sampler registers automatically via the same `vllm.logits_processors` entry point — no additional configuration needed. The plugin is discovered at startup and processes every token. +qr-sampler registers automatically via the same `vllm.logits_processors` entry point — no additional configuration needed once the prerequisite patch above is applied. Look for this line in the server logs to confirm the plugin is active: + +``` +QRSamplerLogitsProcessor initialized: vocab_size=..., entropy_source=system+system, amplifier=zscore_mean, temperature=fixed +``` #### 4. Send a request ```bash +# Completions curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Qwen/Qwen2.5-1.5B-Instruct", + "model": "mlx-community/Qwen3-0.6B-4bit", "prompt": "The nature of consciousness is", "max_tokens": 100 }' + +# Chat completions +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mlx-community/Qwen3-0.6B-4bit", + "messages": [{"role": "user", "content": "Tell me about quantum randomness"}], + "max_tokens": 100 + }' ``` All configuration (entropy sources, temperature strategies, per-request overrides) works identically to the NVIDIA setup. The only difference is how vLLM itself is installed. -> **Note:** The Docker deployment profiles use NVIDIA GPU images and are not compatible with Apple Silicon. Use the bare-metal install above instead. +> **Note:** The Docker deployment profiles use NVIDIA GPU images and are not compatible with Apple Silicon. Use the bare-metal install above instead. To use Open WebUI on Apple Silicon, see the [Web UI](#web-ui) section. 
### System entropy fallback @@ -193,7 +211,9 @@ curl http://localhost:8000/v1/completions \ qr-sampler works with [Open WebUI](https://github.com/open-webui/open-webui), a self-hosted ChatGPT-style interface that connects to vLLM's OpenAI-compatible -API. Every deployment profile includes it as an optional service — add +API. + +**NVIDIA / Linux:** Every deployment profile includes Open WebUI as an optional service — add `--profile ui` to start it alongside vLLM: ```bash @@ -201,8 +221,19 @@ cd deployments/urandom docker compose --profile ui up --build ``` -Then open http://localhost:3000 to start chatting. Without `--profile ui`, Open -WebUI does not start and nothing changes. +**Apple Silicon:** The deployment profiles use NVIDIA GPU images, but Open WebUI itself is just a web app. Run it standalone in Docker and point it at your vllm-metal server: + +```bash +docker run -d -p 3000:8080 \ + --add-host=host.docker.internal:host-gateway \ + -e OPENAI_API_BASE_URL=http://host.docker.internal:8000/v1 \ + -e OPENAI_API_KEY=not-needed \ + -e WEBUI_AUTH=false \ + --name open-webui \ + ghcr.io/open-webui/open-webui:main +``` + +Then open http://localhost:3000 to start chatting. ### Controlling qr-sampler from the UI From c4053fbaec5a611e8cd4cdc9dc84b86644825164 Mon Sep 17 00:00:00 2001 From: ereid7 Date: Sat, 28 Feb 2026 21:10:39 -0500 Subject: [PATCH 3/3] docs: clarify Docker limitation on Apple Silicon and improve .cpu() comment --- README.md | 2 +- src/qr_sampler/processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 50d5c47..857e12b 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ curl http://localhost:8000/v1/chat/completions \ All configuration (entropy sources, temperature strategies, per-request overrides) works identically to the NVIDIA setup. The only difference is how vLLM itself is installed. 
-> **Note:** The Docker deployment profiles use NVIDIA GPU images and are not compatible with Apple Silicon. Use the bare-metal install above instead. To use Open WebUI on Apple Silicon, see the [Web UI](#web-ui) section. +> **Note:** The Docker deployment profiles are not compatible with Apple Silicon. Docker on macOS runs a Linux VM with no Metal GPU passthrough, so vllm-metal must run natively. To use Open WebUI on Apple Silicon, see the [Web UI](#web-ui) section. ### System entropy fallback diff --git a/src/qr_sampler/processor.py b/src/qr_sampler/processor.py index 80a1e6c..42ea7d3 100644 --- a/src/qr_sampler/processor.py +++ b/src/qr_sampler/processor.py @@ -497,7 +497,7 @@ def _to_numpy(tensor: Any) -> np.ndarray: """ if isinstance(tensor, np.ndarray): return tensor - # torch.Tensor — .cpu() is a no-op on CPU tensors, required for MPS/CUDA. + # .cpu() moves GPU tensors (CUDA/MPS) to host memory; no-op on CPU. try: result: np.ndarray = tensor.detach().cpu().numpy() return result