Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/scripts/free-up-disk-space-fast.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/sh

# Disk space cleanup # https://dev.to/mathio/squeezing-disk-space-from-github-actions-runners-an-engineers-guide-3pjg

# Remove Java (JDKs)
sudo rm -rf /usr/lib/jvm

Expand Down
24 changes: 7 additions & 17 deletions .github/workflows/core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,33 +59,23 @@ jobs:
- name: verify feature consistency
run: .github/scripts/verify-feature-consistency.sh

- name: install ollama
run: |
# Disk space cleanup # https://dev.to/mathio/squeezing-disk-space-from-github-actions-runners-an-engineers-guide-3pjg
./.github/scripts/free-up-disk-space-fast.sh &

if [ "$RUNNER_OS" = "Linux" ]; then
curl -fsSL https://ollama.com/install.sh | bash
# note: install.sh will start ollama as a systemd service, so there is no need to start it ourselves (doing so is actively harmful -- port conflict)
elif [ "$RUNNER_OS" = "macOS" ]; then
brew update
brew install ollama
brew services start ollama
fi
wait # for disk space cleanup
- name: free up disk space
run: ./.github/scripts/free-up-disk-space-fast.sh

- name: cargo test
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
if [ "$RUNNER_OS" = "macOS" ]; then
# IMPORTANT: Cannot use --all-features because mistralrs (used by local/metal/cuda features)
# unconditionally depends on candle-core, which depends on cudarc (CUDA).
#
# When adding new features to spnl/Cargo.toml, update this list!
# Current features tested (from spnl/Cargo.toml):
# - Included: cli_support, print, lisp, run, ollama, openai, gemini, pull, yaml, metal, rag, rag-deep-debug, spnl-api, vllm, k8s, gce, ffi, pypi, run_py, tok, openssl-vendored
# - Included: cli_support, print, lisp, run, ollama, openai, gemini, yaml, metal, rag, rag-deep-debug, spnl-api, vllm, k8s, gce, ffi, pypi, run_py, tok, openssl-vendored
# - Excluded: local (CPU inference - pulls in CUDA deps via mistralrs)
# - Excluded: cuda, cuda-flash-attn, cuda-flash-attn-v3 (CUDA features)
cargo test -p spnl --features cli_support,print,lisp,run,ollama,openai,gemini,pull,yaml,metal,rag,rag-deep-debug,spnl-api,vllm,k8s,gce,ffi,pypi,run_py,tok,openssl-vendored -- --nocapture
cargo test -p spnl --features cli_support,print,lisp,run,ollama,openai,gemini,yaml,metal,rag,rag-deep-debug,spnl-api,vllm,k8s,gce,ffi,pypi,run_py,tok,openssl-vendored -- --nocapture
cargo test -p spnl-cli --features rag,spnl-api,vllm,k8s,gce,local,metal -- --nocapture
else
# Test default features on Linux (no GPU features)
Expand All @@ -101,7 +91,7 @@ jobs:
# When adding new features, update the feature list above in cargo test!
if: runner.os == 'macOS'
run: |
cargo clippy -p spnl --features cli_support,print,lisp,run,ollama,openai,gemini,pull,yaml,metal,rag,rag-deep-debug,spnl-api,vllm,k8s,gce,ffi,pypi,run_py,tok,openssl-vendored --tests --no-deps -- -D warnings
cargo clippy -p spnl --features cli_support,print,lisp,run,ollama,openai,gemini,yaml,metal,rag,rag-deep-debug,spnl-api,vllm,k8s,gce,ffi,pypi,run_py,tok,openssl-vendored --tests --no-deps -- -D warnings
cargo clippy -p spnl-cli --features rag,spnl-api,vllm,k8s,gce,local,metal --tests --no-deps -- -D warnings

- name: rustfmt
Expand Down
18 changes: 9 additions & 9 deletions .github/workflows/release-cli.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,15 @@ jobs:
local_feature: local
name: Windows x86_64

- runner: windows-latest
target: aarch64-pc-windows-msvc
platform: windows
arch: aarch64
libc: ""
# local: very slow to build on aarch64 windows (60+ minutes); disabled for now. maybe we can restore if we figure out rustc caching
#local_feature: local
#rustflags: "-Ctarget-feature=+fp16,+fhm" # https://github.com/sarah-quinones/gemm/issues/31 avoid "error: instruction requires: fullfp16"
name: Windows ARM64
# TODO: the local feature is very slow to build on aarch64 Windows (60+ minutes), so this target is disabled for now. Maybe we can restore it if we figure out rustc caching.
# - runner: windows-latest
# target: aarch64-pc-windows-msvc
# platform: windows
# arch: aarch64
# libc: ""
# local_feature: local
# rustflags: "-Ctarget-feature=+fp16,+fhm" # https://github.com/sarah-quinones/gemm/issues/31 avoid "error: instruction requires: fullfp16"
# name: Windows ARM64

env:
TARGET: ${{ matrix.platform.target }}
Expand Down
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 2 additions & 7 deletions cli/src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,14 @@ pub struct Args {
pub builtin: Option<Builtin>,

/// Generative Model
#[arg(
short,
long,
default_value = "ollama/granite3.3:2b",
env = "SPNL_MODEL"
)]
#[arg(short, long, default_value = "llama3.2:1b", env = "SPNL_MODEL")]
pub model: String,

/// Embedding Model
#[arg(
short,
long,
default_value = "ollama/mxbai-embed-large:335m",
default_value = "local/google/embeddinggemma-300m",
env = "SPNL_EMBEDDING_MODEL"
)]
pub embedding_model: String,
Expand Down
14 changes: 0 additions & 14 deletions docker/Containerfile.hostbuild

This file was deleted.

30 changes: 0 additions & 30 deletions docker/Containerfile.hostbuild.ollama

This file was deleted.

9 changes: 1 addition & 8 deletions docker/gce/vllm/create-vllm-gce-image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ if [[ -f /etc/environment ]]; then
fi

echo "=== Disabling unnecessary services ==="
# Disable services not needed for vLLM/ollama
# Disable services not needed for vLLM
sudo systemctl disable snapd.service || true
sudo systemctl disable snapd.socket || true
sudo systemctl disable unattended-upgrades.service || true
Expand Down Expand Up @@ -176,9 +176,6 @@ uv venv --seed
source .venv/bin/activate
VLLM_USE_PRECOMPILED=1 uv pip install --editable .

echo "=== Installing ollama ==="
curl -fsSL https://ollama.com/install.sh | sh

echo "=== Creating systemd service for vLLM ==="
# Create directory for vLLM configuration
sudo mkdir -p /etc/vllm
Expand Down Expand Up @@ -219,10 +216,6 @@ StandardError=journal
WantedBy=multi-user.target
VLLM_SERVICE_EOF

echo "=== Creating systemd service for Ollama ==="
# Create Ollama systemd service (ollama install.sh already creates one, but we ensure it's enabled)
sudo systemctl enable ollama.service

echo "=== Enabling services to start at boot ==="
sudo systemctl enable vllm.service

Expand Down
8 changes: 1 addition & 7 deletions docker/gce/vllm/setup-dev.sh
Original file line number Diff line number Diff line change
Expand Up @@ -137,22 +137,16 @@ VLLM_ATTENTION_BACKEND=TRITON_ATTN \
VLLM_SERVER_DEV_MODE=1 \
nohup vllm serve $MODEL --enforce-eager &

# Install ollama (for embedding)
(curl -fsSL https://ollama.com/install.sh | sh && ollama serve) &

# Wait till vllm is ready
timeout 5m bash -c 'until curl --output /dev/null --silent --fail http://localhost:8000/health; do sleep 3; done'
echo "vllm is ready"

# Wait till ollama is ready
timeout 5m bash -c 'until ollama ps; do sleep 3; done'
echo "ollama is ready"

# Run tests
# Here are the variables we will allow to be used in the test.d/* scripts
declare -x GCS_BUCKET
declare -x RUN_ID
declare -x MODEL
declare -x HF_TOKEN
declare -x OPENAI_API_BASE=http://localhost:8000/v1

cd $HOME
Expand Down
2 changes: 0 additions & 2 deletions docker/gce/vllm/test.d/spnl-speedup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ SCRIPTDIR=$(cd $(dirname "$0") && pwd)

#set -x # debug

export SPNL_EMBEDDING_MODEL=ollama/qwen3-embedding:0.6b

# TODO: make at least the inner-most loop bound a parameter rather than hard-coded
for b in email2 rag
do
Expand Down
102 changes: 75 additions & 27 deletions docs/feature-flags.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,78 @@ messages from a filesystem or from stdin. Or you may wish to have your
server side also support fetching message content from a
filesystem. The choice is yours.

- **rag**: This allows a span query to express that a given message
should be augmented with fragments from a given set of
documents. The query process, with this feature flag enabled,
handles the fragmentation, indexing, etc.

- **run**: This allows for execution of a query. Without this flag
enabled, the compiled code will only be able to parse queries.

- **ollama**: This allows the query execution to direct `g` (generate)
at a local Ollama model server.

- **openai**: This allows the query execution to direct `g` (generate)
at an OpenAI compatible model server. By default, this will talk to
`http://localhost:8000`, but this can be changed via the
`OPENAI_BASE_URL` environment variable.

- **pull**: This allows the query execution to pull down Ollama models
specified in a query.

- **tok**: This adds an API for both parsing and then tokenizing the
messages in a query.

- **python_bindings**: This adds python bindings to the span query
APIs (currently only the tokenization APIs are supported).

- **lisp**: A highly experimental effort to allow for [static
compilation](./lisp) of a query into a shrinkwrapped executable.
## Core Features

- **run**: Enables execution of span queries. Without this flag enabled, the compiled code will only be able to parse queries.

- **print**: Enables printing/display functionality for span queries.

- **cli_support**: Enables CLI-specific support features. Depends on `print` and includes pretty-printing with `ptree`.

## Model Backend Features

- **ollama**: Enables support for directing `g` (generate) operations to a local Ollama model server. Depends on `openai` for API compatibility.

- **openai**: Enables support for directing `g` (generate) operations to an OpenAI-compatible model server. By default, this will talk to `http://localhost:8000`, but this can be changed via the `OPENAI_BASE_URL` environment variable.

- **gemini**: Enables support for Google's Gemini API. Depends on `openai` for API compatibility.

- **local**: Enables local model inference using mistral.rs. Supports running models directly on your machine without external API calls. Depends on `run` and includes mistralrs, tokio, and related dependencies.

- **metal**: Enables Metal GPU acceleration for local inference on macOS. Depends on `local` and enables mistralrs Metal backend.

- **cuda**: Enables CUDA GPU acceleration for local inference on NVIDIA GPUs. Depends on `local` and enables mistralrs CUDA backend.

- **cuda-flash-attn**: Enables Flash Attention optimization for CUDA. Depends on `cuda`.

- **cuda-flash-attn-v3**: Enables Flash Attention v3 optimization for CUDA. Depends on `cuda` and uses mistralrs-core directly.

## RAG (Retrieval-Augmented Generation) Features

- **rag**: Enables RAG capabilities, allowing span queries to augment messages with fragments from a given set of documents. The query process handles fragmentation, indexing, embedding, and retrieval. Depends on `run` and includes LanceDB, PDF extraction, and vector operations.

- **rag-deep-debug**: Enables deep debugging output for RAG operations.

## Language & Format Features

- **lisp**: A highly experimental effort to allow for [static compilation](./lisp) of a query into a shrinkwrapped executable.

- **yaml**: Enables YAML parsing and serialization support.

## Tokenization & Python Features

- **tok**: Adds an API for both parsing and then tokenizing the messages in a query.

- **ffi**: Enables Foreign Function Interface support for calling spnl from other languages.

- **pypi**: Enables Python bindings for spnl. Depends on `ffi` and `tok`. Includes PyO3 for Python interop.

- **run_py**: Enables running span queries from Python with async support. Depends on `run`, `pypi`, and model backends.

## Cloud & Infrastructure Features

- **spnl-api**: Enables the spnl API client for communicating with spnl services.

- **vllm**: Enables support for vLLM model serving. Depends on `yaml`.

- **k8s**: Enables Kubernetes integration for deploying and managing vLLM instances. Depends on `vllm` and includes kube client libraries.

- **gce**: Enables Google Compute Engine integration for deploying and managing vLLM instances. Depends on `vllm` and includes GCP client libraries.

## Utility Features

- **openssl-vendored**: Uses a vendored (statically linked) version of OpenSSL instead of the system version. Useful for portable builds.

## Default Features

The following features are enabled by default:
- `cli_support`
- `lisp`
- `run`
- `ollama`
- `openai`
- `gemini`
- `yaml`
- `local`

This provides a full-featured experience with CLI support, multiple model backends (local and remote), and common format support.
6 changes: 2 additions & 4 deletions spnl/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,19 @@ include = [
]

[features]
default = ["cli_support","lisp","run","ollama","openai","gemini","pull","yaml"]
default = ["cli_support","lisp","run","ollama","openai","gemini","yaml","local"]
cli_support = ["print","dep:ptree"]
print = []
lisp = ["dep:serde-lexpr"]
ollama = ["openai"]
openai = ["dep:async-openai","dep:tokio"]
gemini = ["openai"]
local = ["run","dep:mistralrs","dep:tokio","dep:uuid","dep:indexmap","dep:either"]
local = ["run","dep:mistralrs","dep:tokio","dep:uuid","dep:indexmap","dep:either","dep:reqwest"]
metal = ["local","mistralrs/metal"]
cuda = ["local","mistralrs/cuda"]
cuda-flash-attn = ["cuda","mistralrs/flash-attn"]
cuda-flash-attn-v3 = ["cuda","dep:mistralrs-core","mistralrs-core/flash-attn-v3"]
openssl-vendored = ["dep:openssl"]
pull = ["dep:reqwest","dep:tokio-util"]
ffi = []
pypi = ["ffi","tok","dep:pyo3","pyo3/extension-module","dep:thiserror"]
rag = ["run","dep:sha2", "dep:lancedb","dep:tracing","dep:arrow-schema","dep:arrow-array","dep:itertools","dep:pdf-extract","dep:async-recursion","dep:regex","dep:rand"]
Expand Down Expand Up @@ -71,7 +70,6 @@ futures = { version = "0.3.31", optional = true }
indicatif = { version = "0.18.0", optional = true }
tokio = { version = "1.44.1", features = ["io-std", "io-util", "signal"], optional = true }
tokio-stream = { version = "0.1.18", features = ["net"], optional = true }
tokio-util = { version = "0.7.16", optional = true }
anyhow = { version = "1.0.98" }
lancedb = { version = "0.26.0", default-features = false, optional = true }
tracing = { version = "0.1.41", optional = true }
Expand Down
4 changes: 2 additions & 2 deletions spnl/benches/mt_rag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ async fn run_rag_benchmark(
fn mt_rag_benchmark(c: &mut Criterion) {
let runtime = tokio::runtime::Runtime::new().unwrap();

let model = "ollama/granite3.3:2b";
let embedding_model = "ollama/mxbai-embed-large:335m";
let model = "llama3.2:3b";
let embedding_model = "local/google/embeddinggemma-300m";
let temperature = 0.0;
let max_tokens = 100; // Use small token limit for faster benchmarking

Expand Down
Loading
Loading