IBM · starpit · Feb 15, 2026
diff --git a/.github/scripts/free-up-disk-space-fast.sh b/.github/scripts/free-up-disk-space-fast.sh
@@ -1,5 +1,7 @@
 #!/bin/sh
 
+# Disk space cleanup; see https://dev.to/mathio/squeezing-disk-space-from-github-actions-runners-an-engineers-guide-3pjg
+
 # Remove Java (JDKs)
 sudo rm -rf /usr/lib/jvm
 

diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml
@@ -59,33 +59,23 @@ jobs:
       - name: verify feature consistency
         run: .github/scripts/verify-feature-consistency.sh
 
-      - name: install ollama
-        run: |
-          # Disk space cleanup # https://dev.to/mathio/squeezing-disk-space-from-github-actions-runners-an-engineers-guide-3pjg
-          ./.github/scripts/free-up-disk-space-fast.sh &
-
-          if [ "$RUNNER_OS" = "Linux" ]; then
-            curl -fsSL https://ollama.com/install.sh | bash
-            # note: install.sh will start ollama as an systemd service, no need to start it ourselves (actively harmful -- port conflict)
-          elif [ "$RUNNER_OS" = "macOS" ]; then
-            brew update
-            brew install ollama
-            brew services start ollama
-          fi
-          wait # for disk space cleanup
+      - name: free up disk space
+        run: ./.github/scripts/free-up-disk-space-fast.sh
 
       - name: cargo test
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           if [ "$RUNNER_OS" = "macOS" ]; then
             # IMPORTANT: Cannot use --all-features because mistralrs (used by local/metal/cuda features)
             # unconditionally depends on candle-core, which depends on cudarc (CUDA).
             #
             # When adding new features to spnl/Cargo.toml, update this list!
             # Current features tested (from spnl/Cargo.toml):
-            # - Included: cli_support, print, lisp, run, ollama, openai, gemini, pull, yaml, metal, rag, rag-deep-debug, spnl-api, vllm, k8s, gce, ffi, pypi, run_py, tok, openssl-vendored
+            # - Included: cli_support, print, lisp, run, ollama, openai, gemini, yaml, metal, rag, rag-deep-debug, spnl-api, vllm, k8s, gce, ffi, pypi, run_py, tok, openssl-vendored
             # - Excluded: local (CPU inference - pulls in CUDA deps via mistralrs)
             # - Excluded: cuda, cuda-flash-attn, cuda-flash-attn-v3 (CUDA features)
-            cargo test -p spnl --features cli_support,print,lisp,run,ollama,openai,gemini,pull,yaml,metal,rag,rag-deep-debug,spnl-api,vllm,k8s,gce,ffi,pypi,run_py,tok,openssl-vendored -- --nocapture
+            cargo test -p spnl --features cli_support,print,lisp,run,ollama,openai,gemini,yaml,metal,rag,rag-deep-debug,spnl-api,vllm,k8s,gce,ffi,pypi,run_py,tok,openssl-vendored -- --nocapture
             cargo test -p spnl-cli --features rag,spnl-api,vllm,k8s,gce,local,metal -- --nocapture
           else
             # Test default features on Linux (no GPU features)
@@ -101,7 +91,7 @@ jobs:
         # When adding new features, update the feature list above in cargo test!
         if: runner.os == 'macOS'
         run: |
-          cargo clippy -p spnl --features cli_support,print,lisp,run,ollama,openai,gemini,pull,yaml,metal,rag,rag-deep-debug,spnl-api,vllm,k8s,gce,ffi,pypi,run_py,tok,openssl-vendored --tests --no-deps -- -D warnings
+          cargo clippy -p spnl --features cli_support,print,lisp,run,ollama,openai,gemini,yaml,metal,rag,rag-deep-debug,spnl-api,vllm,k8s,gce,ffi,pypi,run_py,tok,openssl-vendored --tests --no-deps -- -D warnings
           cargo clippy -p spnl-cli --features rag,spnl-api,vllm,k8s,gce,local,metal --tests --no-deps -- -D warnings
 
       - name: rustfmt

diff --git a/.github/workflows/release-cli.yml b/.github/workflows/release-cli.yml
@@ -73,15 +73,15 @@ jobs:
             local_feature: local
             name: Windows x86_64
 
-          - runner: windows-latest
-            target: aarch64-pc-windows-msvc
-            platform: windows
-            arch: aarch64
-            libc: ""
-            # local: very slow to build on aarch64 windows (60+ minutes); disabled for now. maybe we can restore if we figure out rustc caching
-            #local_feature: local
-            #rustflags: "-Ctarget-feature=+fp16,+fhm" # https://github.com/sarah-quinones/gemm/issues/31 avoid "error: instruction requires: fullfp16"
-            name: Windows ARM64
+          # TODO: building the "local" feature on aarch64 Windows is very slow (60+ minutes), so the Windows ARM64 target is disabled for now. Maybe we can restore it if we figure out rustc caching.
+          # - runner: windows-latest
+          #   target: aarch64-pc-windows-msvc
+          #   platform: windows
+          #   arch: aarch64
+          #   libc: ""
+          #   local_feature: local
+          #   rustflags: "-Ctarget-feature=+fp16,+fhm" # https://github.com/sarah-quinones/gemm/issues/31 avoid "error: instruction requires: fullfp16"
+          #   name: Windows ARM64
 
     env:
       TARGET: ${{ matrix.platform.target }}

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/cli/src/args.rs b/cli/src/args.rs
@@ -43,19 +43,14 @@ pub struct Args {
     pub builtin: Option<Builtin>,
 
     /// Generative Model
-    #[arg(
-        short,
-        long,
-        default_value = "ollama/granite3.3:2b",
-        env = "SPNL_MODEL"
-    )]
+    #[arg(short, long, default_value = "llama3.2:1b", env = "SPNL_MODEL")]
     pub model: String,
 
     /// Embedding Model
     #[arg(
         short,
         long,
-        default_value = "ollama/mxbai-embed-large:335m",
+        default_value = "local/google/embeddinggemma-300m",
         env = "SPNL_EMBEDDING_MODEL"
     )]
     pub embedding_model: String,

diff --git a/docker/Containerfile.hostbuild b/docker/Containerfile.hostbuild
diff --git a/docker/Containerfile.hostbuild.ollama b/docker/Containerfile.hostbuild.ollama
diff --git a/docker/gce/vllm/create-vllm-gce-image.sh b/docker/gce/vllm/create-vllm-gce-image.sh
@@ -143,7 +143,7 @@ if [[ -f /etc/environment ]]; then
 fi
 
 echo "=== Disabling unnecessary services ==="
-# Disable services not needed for vLLM/ollama
+# Disable services not needed for vLLM
 sudo systemctl disable snapd.service || true
 sudo systemctl disable snapd.socket || true
 sudo systemctl disable unattended-upgrades.service || true
@@ -176,9 +176,6 @@ uv venv --seed
 source .venv/bin/activate
 VLLM_USE_PRECOMPILED=1 uv pip install --editable .
 
-echo "=== Installing ollama ==="
-curl -fsSL https://ollama.com/install.sh | sh
-
 echo "=== Creating systemd service for vLLM ==="
 # Create directory for vLLM configuration
 sudo mkdir -p /etc/vllm
@@ -219,10 +216,6 @@ StandardError=journal
 WantedBy=multi-user.target
 VLLM_SERVICE_EOF
 
-echo "=== Creating systemd service for Ollama ==="
-# Create Ollama systemd service (ollama install.sh already creates one, but we ensure it's enabled)
-sudo systemctl enable ollama.service
-
 echo "=== Enabling services to start at boot ==="
 sudo systemctl enable vllm.service
 

diff --git a/docker/gce/vllm/setup-dev.sh b/docker/gce/vllm/setup-dev.sh
@@ -137,22 +137,16 @@ VLLM_ATTENTION_BACKEND=TRITON_ATTN \
     VLLM_SERVER_DEV_MODE=1 \
     nohup vllm serve $MODEL --enforce-eager &
 
-# Install ollama (for embedding)
-(curl -fsSL https://ollama.com/install.sh | sh && ollama serve) &
-
 # Wait till vllm is ready
 timeout 5m bash -c 'until curl --output /dev/null --silent --fail http://localhost:8000/health; do sleep 3; done'
 echo "vllm is ready"
 
-# Wait till ollama is ready
-timeout 5m bash -c 'until ollama ps; do sleep 3; done'
-echo "ollama is ready"
-
 # Run tests
 # Here are the variables we will allow to be used in the test.d/* scripts
 declare -x GCS_BUCKET
 declare -x RUN_ID
 declare -x MODEL
+declare -x HF_TOKEN
 declare -x OPENAI_API_BASE=http://localhost:8000/v1
 
 cd $HOME

diff --git a/docker/gce/vllm/test.d/spnl-speedup.sh b/docker/gce/vllm/test.d/spnl-speedup.sh
@@ -8,8 +8,6 @@ SCRIPTDIR=$(cd $(dirname "$0") && pwd)
 
 #set -x # debug
 
-export SPNL_EMBEDDING_MODEL=ollama/qwen3-embedding:0.6b
-
 # TODO: make at least the inner-most loop bound a parameter rather than hard-coded
 for b in email2 rag
 do

diff --git a/docs/feature-flags.md b/docs/feature-flags.md
@@ -13,30 +13,78 @@ messages from a filesystem or from stdin. Or you may wish to have your
 server side also support fetching message content from a
 filesystem. The choice is yours.
 
-- **rag**: This allows a span query to express that a given message
-  should be augmented with fragments from a given set of
-  documents. The query process, with this feature flag enabled,
-  handles the fragmentation, indexing, etc.
-
-- **run**: This allows for execution of a query. Without this flag
-  enabled, the compiled code will only be able to parse
-
-- **ollama**: This allows the query execution to direct `g` (generate)
-  at a local Ollama model server.
-
-- **openai**: This allows the query execution to direct `g` (generate)
-  at an OpenAI compatible model server. By default, this will talk to
-  `http://localhost:8000`, but this can be changed via the
-  `OPENAI_BASE_URL` environment variable.
-
-- **pull**: This allows the query execution to pull down Ollama models
-  specified in a query.
-
-- **tok**: This adds an API for both parsing and then tokenizing the
-  messages in a query.
-
-- **python_bindings**: This adds python bindings to the span query
-  APIs (currently only the tokenization APIs are supported).
-
-- **lisp**: A highly experimental effort to allow for [static
-  compilation](./lisp) of a query into a shrinkwrapped executable.
+## Core Features
+
+- **run**: Enables execution of span queries. Without this flag enabled, the compiled code will only be able to parse queries.
+
+- **print**: Enables printing/display functionality for span queries.
+
+- **cli_support**: Enables CLI-specific support features. Depends on `print` and includes pretty-printing with `ptree`.
+
+## Model Backend Features
+
+- **ollama**: Enables support for directing `g` (generate) operations to a local Ollama model server. Depends on `openai` for API compatibility.
+
+- **openai**: Enables support for directing `g` (generate) operations to an OpenAI-compatible model server. By default, this will talk to `http://localhost:8000`, but this can be changed via the `OPENAI_BASE_URL` environment variable.
+
+- **gemini**: Enables support for Google's Gemini API. Depends on `openai` for API compatibility.
+
+- **local**: Enables local model inference using mistral.rs. Supports running models directly on your machine without external API calls. Depends on `run` and includes mistralrs, tokio, and related dependencies.
+
+- **metal**: Enables Metal GPU acceleration for local inference on macOS. Depends on `local` and enables mistralrs Metal backend.
+
+- **cuda**: Enables CUDA GPU acceleration for local inference on NVIDIA GPUs. Depends on `local` and enables mistralrs CUDA backend.
+
+- **cuda-flash-attn**: Enables Flash Attention optimization for CUDA. Depends on `cuda`.
+
+- **cuda-flash-attn-v3**: Enables Flash Attention v3 optimization for CUDA. Depends on `cuda` and uses mistralrs-core directly.
+
+## RAG (Retrieval-Augmented Generation) Features
+
+- **rag**: Enables RAG capabilities, allowing span queries to augment messages with fragments from a given set of documents. The query process handles fragmentation, indexing, embedding, and retrieval. Depends on `run` and includes LanceDB, PDF extraction, and vector operations.
+
+- **rag-deep-debug**: Enables deep debugging output for RAG operations.
+
+## Language & Format Features
+
+- **lisp**: A highly experimental effort to allow for [static compilation](./lisp) of a query into a shrinkwrapped executable.
+
+- **yaml**: Enables YAML parsing and serialization support.
+
+## Tokenization & Python Features
+
+- **tok**: Adds an API for both parsing and then tokenizing the messages in a query.
+
+- **ffi**: Enables Foreign Function Interface support for calling spnl from other languages.
+
+- **pypi**: Enables Python bindings for spnl. Depends on `ffi` and `tok`. Includes PyO3 for Python interop.
+
+- **run_py**: Enables running span queries from Python with async support. Depends on `run`, `pypi`, and model backends.
+
+## Cloud & Infrastructure Features
+
+- **spnl-api**: Enables the spnl API client for communicating with spnl services.
+
+- **vllm**: Enables support for vLLM model serving. Depends on `yaml`.
+
+- **k8s**: Enables Kubernetes integration for deploying and managing vLLM instances. Depends on `vllm` and includes kube client libraries.
+
+- **gce**: Enables Google Compute Engine integration for deploying and managing vLLM instances. Depends on `vllm` and includes GCP client libraries.
+
+## Utility Features
+
+- **openssl-vendored**: Uses a vendored (statically linked) version of OpenSSL instead of the system version. Useful for portable builds.
+
+## Default Features
+
+The following features are enabled by default:
+- `cli_support`
+- `lisp`
+- `run`
+- `ollama`
+- `openai`
+- `gemini`
+- `yaml`
+- `local`
+
+This provides a full-featured experience with CLI support, multiple model backends (local and remote), and common format support.
diff --git a/spnl/Cargo.toml b/spnl/Cargo.toml
@@ -14,20 +14,19 @@ include = [
 ]
 
 [features]
-default = ["cli_support","lisp","run","ollama","openai","gemini","pull","yaml"]
+default = ["cli_support","lisp","run","ollama","openai","gemini","yaml","local"]
 cli_support = ["print","dep:ptree"]
 print = []
 lisp = ["dep:serde-lexpr"]
 ollama = ["openai"]
 openai = ["dep:async-openai","dep:tokio"]
 gemini = ["openai"]
-local = ["run","dep:mistralrs","dep:tokio","dep:uuid","dep:indexmap","dep:either"]
+local = ["run","dep:mistralrs","dep:tokio","dep:uuid","dep:indexmap","dep:either","dep:reqwest"]
 metal = ["local","mistralrs/metal"]
 cuda = ["local","mistralrs/cuda"]
 cuda-flash-attn = ["cuda","mistralrs/flash-attn"]
 cuda-flash-attn-v3 = ["cuda","dep:mistralrs-core","mistralrs-core/flash-attn-v3"]
 openssl-vendored = ["dep:openssl"]
-pull = ["dep:reqwest","dep:tokio-util"]
 ffi = []
 pypi = ["ffi","tok","dep:pyo3","pyo3/extension-module","dep:thiserror"]
 rag = ["run","dep:sha2", "dep:lancedb","dep:tracing","dep:arrow-schema","dep:arrow-array","dep:itertools","dep:pdf-extract","dep:async-recursion","dep:regex","dep:rand"]
@@ -71,7 +70,6 @@ futures = { version = "0.3.31", optional = true }
 indicatif = { version = "0.18.0", optional = true }
 tokio = { version = "1.44.1", features = ["io-std", "io-util", "signal"], optional = true }
 tokio-stream = { version = "0.1.18", features = ["net"], optional = true }
-tokio-util = { version = "0.7.16", optional = true }
 anyhow = { version = "1.0.98" }
 lancedb = { version = "0.26.0", default-features = false, optional = true }
 tracing = { version = "0.1.41", optional = true }

diff --git a/spnl/benches/mt_rag.rs b/spnl/benches/mt_rag.rs
@@ -113,8 +113,8 @@ async fn run_rag_benchmark(
 fn mt_rag_benchmark(c: &mut Criterion) {
     let runtime = tokio::runtime::Runtime::new().unwrap();
 
-    let model = "ollama/granite3.3:2b";
-    let embedding_model = "ollama/mxbai-embed-large:335m";
+    let model = "llama3.2:3b";
+    let embedding_model = "local/google/embeddinggemma-300m";
     let temperature = 0.0;
     let max_tokens = 100; // Use small token limit for faster benchmarking