diff --git a/.github/workflows/test-llm-d-patch.yml b/.github/workflows/test-llm-d-patch.yml index 01170d9a..b730c35f 100644 --- a/.github/workflows/test-llm-d-patch.yml +++ b/.github/workflows/test-llm-d-patch.yml @@ -31,9 +31,10 @@ jobs: git config --global user.email "foo@bar.com" git config --global user.name "Spnl Dev" - - name: Generate patch - working-directory: docker/vllm/llm-d - run: ./genpatch.sh + # llmd 0.5.0 required AI help to rebase... We can re-enable this if we ever rebase our vllm span query branch to avoid the conflicts that come up when running genpatch + #- name: Generate patch + # working-directory: docker/vllm/llm-d + # run: ./genpatch.sh - name: Apply patch working-directory: docker/vllm/llm-d diff --git a/.github/workflows/vllm-gce.yml b/.github/workflows/vllm-gce.yml index 280ec115..4f3e87b7 100644 --- a/.github/workflows/vllm-gce.yml +++ b/.github/workflows/vllm-gce.yml @@ -20,9 +20,9 @@ jobs: name: Test in GCE VM env: # Adjust these as needed - VLLM_ORG: neuralmagic + VLLM_ORG: vllm-project VLLM_REPO: vllm - VLLM_BRANCH: llm-d-release-0.4 + VLLM_SHA: d7de043d55d1dd629554467e23874097e1c48993 MODEL: ibm-granite/granite-3.3-2b-instruct # You probably won't need to change this diff --git a/cli/src/args.rs b/cli/src/args.rs index 5648b55a..69b23fb7 100644 --- a/cli/src/args.rs +++ b/cli/src/args.rs @@ -226,7 +226,7 @@ pub enum ImageCommands { image_family: String, /// LLM-D version for patch file - #[arg(long, default_value = "0.4.0")] + #[arg(long, default_value = "0.5.0")] llmd_version: String, /// GCE configuration diff --git a/cli/src/main.rs b/cli/src/main.rs index c2a0da6c..4c154cd7 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -114,7 +114,7 @@ async fn main() -> Result<(), SpnlError> { .llmd_version(llmd_version.clone()) .vllm_org(gce_config.vllm_org.clone()) .vllm_repo(gce_config.vllm_repo.clone()) - .vllm_branch(gce_config.vllm_branch.clone()) + .vllm_sha(gce_config.vllm_sha.clone()) .config(gce_config.clone()) .build()?, ) diff --git a/docker/gce/vllm/cloud-config.yaml b/docker/gce/vllm/cloud-config.yaml index fe1db798..9e37f038 100644 --- a/docker/gce/vllm/cloud-config.yaml +++ b/docker/gce/vllm/cloud-config.yaml @@ -23,7 +23,8 @@ write_files: SPNL_RELEASE=${spnl_release} VLLM_ORG=${vllm_org} VLLM_REPO=${vllm_repo} - VLLM_BRANCH=${vllm_branch} + VLLM_SHA=${vllm_sha} + VLLM_PRECOMPILED_WHEEL_COMMIT=${vllm_precompiled_wheel_commit} MODEL=${model} VLLM_PATCHFILE=/tmp/vllm.patch ${vllm_config_section} diff --git a/docker/gce/vllm/create-vllm-gce-image.sh b/docker/gce/vllm/create-vllm-gce-image.sh index c332cfe2..8da56de4 100755 --- a/docker/gce/vllm/create-vllm-gce-image.sh +++ b/docker/gce/vllm/create-vllm-gce-image.sh @@ -2,7 +2,7 @@ # # Create a custom GCE image with vLLM pre-installed -# This script creates a reusable image based on the setup.sh logic +# This script creates a reusable image # set -euo pipefail diff --git a/docker/gce/vllm/setup-dev.sh b/docker/gce/vllm/setup-dev.sh index 8410317b..742b9feb 100644 --- a/docker/gce/vllm/setup-dev.sh +++ b/docker/gce/vllm/setup-dev.sh @@ -69,11 +69,51 @@ fi # Install vLLM curl -LsSf https://astral.sh/uv/install.sh | sh source $HOME/.local/bin/env -git clone https://github.com/$VLLM_ORG/$VLLM_REPO.git vllm -b $VLLM_BRANCH +git clone https://github.com/$VLLM_ORG/$VLLM_REPO.git vllm cd vllm +git fetch origin $VLLM_SHA +git checkout $VLLM_SHA uv venv --seed source .venv/bin/activate -VLLM_USE_PRECOMPILED=1 uv pip install --editable . + +# Default VLLM_PRECOMPILED_WHEEL_COMMIT to VLLM_SHA if not set +# This allows using precompiled binaries from a different commit (e.g., main) while checking out a specific source commit +VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_PRECOMPILED_WHEEL_COMMIT:-${VLLM_SHA}}" + +# Detect if precompiled wheel exists (following llm-d approach) +MACHINE=$(uname -m) +case "${MACHINE}" in + x86_64|amd64) PLATFORM_TAG="manylinux_2_31_x86_64" ;; + aarch64|arm64) PLATFORM_TAG="manylinux_2_31_aarch64" ;; + *) echo "Unsupported architecture: ${MACHINE}"; PLATFORM_TAG="" ;; +esac + +WHEEL_URL="" +if [ -n "${PLATFORM_TAG}" ]; then + echo "Looking for precompiled wheel at: https://wheels.vllm.ai/${VLLM_PRECOMPILED_WHEEL_COMMIT}/vllm/" + WHEEL_INDEX_HTML=$(curl -sf "https://wheels.vllm.ai/${VLLM_PRECOMPILED_WHEEL_COMMIT}/vllm/" 2>/dev/null || echo "") + if [ -n "${WHEEL_INDEX_HTML}" ]; then + WHEEL_FILENAME=$(echo "${WHEEL_INDEX_HTML}" | grep -oE "vllm-[^\"]+${PLATFORM_TAG}\.whl" | head -1) + if [ -n "${WHEEL_FILENAME}" ]; then + # URL-encode the + sign in the wheel filename + WHEEL_URL="https://wheels.vllm.ai/${VLLM_PRECOMPILED_WHEEL_COMMIT}/${WHEEL_FILENAME}" + WHEEL_URL=$(echo "${WHEEL_URL}" | sed -E 's/\+/%2B/g') + echo "Found precompiled wheel: ${WHEEL_URL}" + fi + fi +fi + +# Install vLLM with or without precompiled binaries +if [ -n "${WHEEL_URL}" ]; then + echo "Using precompiled binaries from commit: ${VLLM_PRECOMPILED_WHEEL_COMMIT} (source: ${VLLM_SHA})" + export VLLM_USE_PRECOMPILED=1 + export VLLM_PRECOMPILED_WHEEL_LOCATION="${WHEEL_URL}" + uv pip install --editable . +else + echo "Compiling vLLM from source (no precompiled wheel found or unsupported platform)" + unset VLLM_USE_PRECOMPILED VLLM_PRECOMPILED_WHEEL_LOCATION 2>/dev/null || true + uv pip install --editable . +fi # Wait for spnl build to complete echo "Waiting for spnl build to complete..." diff --git a/docker/gce/vllm/setup.sh b/docker/gce/vllm/setup.sh deleted file mode 100644 index 6ce3a520..00000000 --- a/docker/gce/vllm/setup.sh +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env bash - -# -# Note: this script is executed inside the VM, via a cloud-init runcmd. See ./cloud-config.yaml -# - -set -eo pipefail - -# DEBUG -#set -x - -cleanup() { - rc=$? - echo "Exiting with exit_code=$rc" - gsutil cp <(echo $rc) gs://$GCS_BUCKET/runs/$RUN_ID/status/exit_code -} -trap "cleanup" EXIT - -export HOME=/root -cd $HOME - -# TODO: i was expecting this to be loaded automatically. Apparently not if this is run via a cloud-init runcmd. -. /etc/environment - -export SCCACHE_GCS_BUCKET=$GCS_BUCKET -SCCACHE_VERSION=$(curl -s "https://api.github.com/repos/mozilla/sccache/releases/latest" | grep -Po '"tag_name": "v\K[0-9.]+') -wget -qO sccache.tar.gz https://github.com/mozilla/sccache/releases/latest/download/sccache-v$SCCACHE_VERSION-x86_64-unknown-linux-musl.tar.gz -mkdir sccache-temp -tar xf sccache.tar.gz --strip-components=1 -C sccache-temp -sudo mv sccache-temp/sccache /usr/local/bin -sudo chmod a+x /usr/local/bin/sccache -rm -rf sccache.tar.gz sccache-temp -export RUSTC_WRAPPER=/usr/local/bin/sccache -export SCCACHE_GCS_RW_MODE=READ_WRITE -export SCCACHE_GCS_KEY_PREFIX=sccache - -# Install and build spnl -export CARGO_INCREMENTAL=0 # Disable incremental compilation for faster from-scratch builds -export CARGO_PROFILE_TEST_DEBUG=0 - -if [[ -n "$SPNL_RELEASE" ]] -then - echo "Downloading spnl release $SPNL_RELEASE" - - # Detect OS and architecture - OS=$(uname -s | tr '[:upper:]' '[:lower:]') - ARCH=$(uname -m) - - # Map architecture names to match GitHub release naming - case "$ARCH" in - x86_64) - ARCH="x86_64" - ;; - aarch64|arm64) - ARCH="aarch64" - ;; - *) - echo "Unsupported architecture: $ARCH" - exit 1 - ;; - esac - - # Map OS and ABI to match GitHub release naming - # Format: spnl-{version}-{os}-{arch}-{abi}.tar.gz - case "$OS" in - linux) - OS="linux" - ABI="gnu" - ;; - darwin) - OS="macos" - ABI="" - ;; - *) - echo "Unsupported OS: $OS" - exit 1 - ;; - esac - - # Construct the asset name - if [[ -n "$ABI" ]]; then - ASSET_NAME="spnl-${SPNL_RELEASE}-${OS}-${ARCH}-${ABI}.tar.gz" - else - ASSET_NAME="spnl-${SPNL_RELEASE}-${OS}-${ARCH}.tar.gz" - fi - - # Extract repo owner and name from SPNL_GITHUB (e.g., https://github.com/owner/repo) - REPO_PATH=$(echo "$SPNL_GITHUB" | sed -E 's|https?://github.com/||' | sed 's|\.git$||') - - # Download the release asset - DOWNLOAD_URL="https://github.com/${REPO_PATH}/releases/download/${SPNL_RELEASE}/${ASSET_NAME}" - echo "Downloading from: $DOWNLOAD_URL" - - wget -q "$DOWNLOAD_URL" -O spnl-release.tar.gz || { - echo "Failed to download release asset. Falling back to building from source." - exit 1 - } - - # Extract and install - tar xzf spnl-release.tar.gz - sudo cp spnl /usr/local/bin/ - sudo chmod a+rX /usr/local/bin/spnl - rm spnl-release.tar.gz spnl - - # No need to clone repo or build - we'll install Python package from PyPI later - spnl_pid=0 -elif [[ -n "$GITHUB_SHA" ]] && [[ -n "$GITHUB_REF" ]] -then - echo "Cloning spnl from GITHUB_SHA=$GITHUB_SHA GITHUB_REF=$GITHUB_REF" - ( - curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \ - source $HOME/.cargo/env && \ - mkdir spnl && \ - cd spnl && \ - git init && \ - git remote add origin $SPNL_GITHUB && \ - git fetch --prune --no-recurse-submodules --depth=1 origin +$GITHUB_SHA:$GITHUB_REF && \ - git checkout --progress --force $GITHUB_REF && \ - cargo build -F rag,spnl-api,vllm && sudo cp target/debug/spnl /usr/local/bin && sudo chmod a+rX /usr/local/bin/spnl \ - ) & - spnl_pid=$! -else - echo "Cloning spnl from repo=$SPNL_GITHUB" - ( - curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \ - source $HOME/.cargo/env && \ - git clone $SPNL_GITHUB spnl && \ - cd spnl && \ - cargo build -F rag,spnl-api,vllm && sudo cp target/debug/spnl /usr/local/bin && sudo chmod a+rX /usr/local/bin/spnl \ - ) & - spnl_pid=$! -fi - -# Install vLLM -curl -LsSf https://astral.sh/uv/install.sh | sh -source $HOME/.local/bin/env -git clone https://github.com/$VLLM_ORG/$VLLM_REPO.git vllm -b $VLLM_BRANCH -cd vllm -uv venv --seed -source .venv/bin/activate -VLLM_USE_PRECOMPILED=1 uv pip install --editable . - -# Wait for spnl build to complete (if building from source) -if [[ $spnl_pid -ne 0 ]]; then - echo "Waiting for spnl build to complete..." - wait $spnl_pid - echo "spnl build completed" -fi - -# Patch the vllm code and install spnl Python package -spnl vllm patchfile | git apply - -if [[ -n "$SPNL_RELEASE" ]] -then - # Install spnl from PyPI (strip 'v' prefix if present) - SPNL_VERSION="${SPNL_RELEASE#v}" - echo "Installing spnl==$SPNL_VERSION from PyPI" - uv pip install "spnl==$SPNL_VERSION" -else - # Build the cloned version of spnl into vLLM, via maturin - uv pip install maturin[patchelf] - source $HOME/.cargo/env # to get rustc on path - (cd $HOME/spnl && maturin develop -F tok,run_py -m spnl/Cargo.toml) -fi - -# Start vLLM -VLLM_ATTENTION_BACKEND=TRITON_ATTN \ - VLLM_USE_V1=1 \ - VLLM_V1_SPANS_ENABLED=True \ - VLLM_V1_SPANS_TOKEN_PLUS=10 \ - VLLM_V1_SPANS_TOKEN_CROSS=13 \ - VLLM_SERVER_DEV_MODE=1 \ - nohup vllm serve $MODEL --enforce-eager & - -# Install ollama (for embedding) -(curl -fsSL https://ollama.com/install.sh | sh && ollama serve) & - -# Wait till vllm is ready -timeout 5m bash -c 'until curl --output /dev/null --silent --fail http://localhost:8000/health; do sleep 3; done' -echo "vllm is ready" - -# Wait till ollama is ready -#timeout 5m bash -c 'until curl --output /dev/null --silent --fail http://localhost:11434; do sleep 3; done' -timeout 5m bash -c 'until ollama ps; do sleep 3; done' -echo "ollama is ready" - -# Run tests only if not using a release (releases are for production, not testing) -if [[ -z "$SPNL_RELEASE" ]] -then - # Here are the variables we will allow to be used in the test.d/* scripts - declare -x GCS_BUCKET - declare -x RUN_ID - declare -x MODEL - declare -x OPENAI_API_BASE=http://localhost:8000/v1 - - cd $HOME - TESTS_DIR=$HOME/spnl/docker/gce/vllm/test.d - if [ -d "$TESTS_DIR" ] - then - n_tests=$(ls "$TESTS_DIR" | wc -l | xargs) - echo "Starting $n_tests tests" - find "$TESTS_DIR" -type f -name '*.sh' -print0 | xargs -0L1 -I{} bash -c 'echo "Executing {} at $(date -u)"; "{}"' - else echo "No tests found in $TESTS_DIR" - fi -else - echo "Skipping tests (SPNL_RELEASE is set)" -fi diff --git a/docker/vllm/llm-d/Containerfile.cuda b/docker/vllm/llm-d/Containerfile.cuda index 6f8f4812..438c52bb 100644 --- a/docker/vllm/llm-d/Containerfile.cuda +++ b/docker/vllm/llm-d/Containerfile.cuda @@ -1,4 +1,4 @@ -ARG LLMD_VERSION=0.4.0 +ARG LLMD_VERSION=0.5.0 ARG MANYLINUX_VERSION=2_34 # use 2_39 for aarch64; TODO is it possible to infer this from the build platform? # Python version extractor @@ -29,7 +29,7 @@ LABEL org.opencontainers.image.source=https://github.com/IBM/spnl LABEL org.opencontainers.image.description="Span Query support for llm-d's vLLM" LABEL org.opencontainers.image.licenses="Apache-2.0" -ARG LLMD_VERSION=0.4.0 # sigh, we need to repeat this if we want to use it inside of the FROM +ARG LLMD_VERSION=0.5.0 # sigh, we need to repeat this if we want to use it inside of the FROM COPY --from=builder target/wheels/ /tmp/wheels COPY docker/vllm/llm-d/patches/$LLMD_VERSION/ /tmp/patches diff --git a/docker/vllm/llm-d/clone.sh b/docker/vllm/llm-d/clone.sh index 4625c864..9bc22aa9 100755 --- a/docker/vllm/llm-d/clone.sh +++ b/docker/vllm/llm-d/clone.sh @@ -2,12 +2,14 @@ set -e -LLMD_VERSION=0.4.0 -BASE_VLLM_FORK=https://github.com/neuralmagic/vllm.git -BASE_VLLM_BRANCH=llm-d-release-0.4 +LLMD_VERSION=0.5.0 +BASE_VLLM_FORK=https://github.com/vllm-project/vllm.git +BASE_VLLM_COMMIT_SHA=d7de043d55d1dd629554467e23874097e1c48993 -git clone $BASE_VLLM_FORK -b $BASE_VLLM_BRANCH --depth 1 +git clone $BASE_VLLM_FORK vllm cd vllm +git fetch --depth=1 origin $BASE_VLLM_COMMIT_SHA +git checkout -q $BASE_VLLM_COMMIT_SHA for patchfile in ../patches/$LLMD_VERSION/*.patch.gz do git apply <(gunzip -c $patchfile) --reject diff --git a/docker/vllm/llm-d/genpatch.sh b/docker/vllm/llm-d/genpatch.sh index 74d4449d..ff9918e9 100755 --- a/docker/vllm/llm-d/genpatch.sh +++ b/docker/vllm/llm-d/genpatch.sh @@ -7,25 +7,24 @@ SCRIPTDIR=$(cd $(dirname "$0") && pwd) SPANS_VLLM_FORK=https://github.com/starpit/vllm-ibm.git SPANS_VLLM_BRANCH=spnl-ibm -LLMD_VERSION=0.4.0 -BASE_VLLM_FORK=https://github.com/neuralmagic/vllm.git -BASE_VLLM_BRANCH=llm-d-release-0.4 +LLMD_VERSION=0.5.0 +BASE_VLLM_FORK=https://github.com/vllm-project/vllm.git +BASE_VLLM_COMMIT_SHA=d7de043d55d1dd629554467e23874097e1c48993 -T=$(mktemp -d) -trap "rm -rf $T" EXIT +T=vllm +#trap "rm -rf $T" EXIT -git clone $BASE_VLLM_FORK $T/vllm-llmd -b $BASE_VLLM_BRANCH +git clone $BASE_VLLM_FORK $T/vllm-llmd cd $T/vllm-llmd -BASE_VLLM_REVISION=$(git rev-parse --verify HEAD) +git fetch origin $BASE_VLLM_COMMIT_SHA +git checkout -q $BASE_VLLM_COMMIT_SHA +BASE_VLLM_REVISION=$BASE_VLLM_COMMIT_SHA git remote add spans $SPANS_VLLM_FORK git fetch spans $SPANS_VLLM_BRANCH -git checkout $SPANS_VLLM_BRANCH -SPANS_VLLM_REVISION=$(git rev-parse --verify HEAD) -git checkout $BASE_VLLM_BRANCH -git rebase spans/$SPANS_VLLM_BRANCH -C0 +git rebase spans/$SPANS_VLLM_BRANCH -C0 # Notes: gzip --no-name ensures deterministic output (gzip won't save mtime in the file); this helps with git sanity -mkdir -p "$SCRIPTDIR"/patches +mkdir -p "$SCRIPTDIR"/patches/$LLMD_VERSION git diff $BASE_VLLM_REVISION | gzip --no-name -c > "$SCRIPTDIR"/patches/$LLMD_VERSION/01-spans-llmd-vllm.patch.gz diff --git a/docker/vllm/llm-d/patches/0.5.0/01-spans-llmd-vllm.patch.gz b/docker/vllm/llm-d/patches/0.5.0/01-spans-llmd-vllm.patch.gz new file mode 100644 index 00000000..e7fa2e27 Binary files /dev/null and b/docker/vllm/llm-d/patches/0.5.0/01-spans-llmd-vllm.patch.gz differ diff --git a/spnl/src/vllm/gce/args.rs b/spnl/src/vllm/gce/args.rs index 1d5ffcda..2ffc2f2e 100644 --- a/spnl/src/vllm/gce/args.rs +++ b/spnl/src/vllm/gce/args.rs @@ -52,9 +52,19 @@ pub struct GceConfig { #[arg(long, env = "VLLM_REPO", default_value = "vllm")] pub vllm_repo: String, - /// vLLM branch to use - #[arg(long, env = "VLLM_BRANCH", default_value = "llm-d-release-0.4")] - pub vllm_branch: String, + /// vLLM commit SHA to use + #[arg(long, env = "VLLM_SHA", default_value = "a1b2c3d4e5f6")] + pub vllm_sha: String, + + /// vLLM commit SHA to use for precompiled wheel lookup (defaults to vllm_sha) + /// This allows using stable precompiled binaries from a known commit (e.g., main) + /// while checking out a different source commit for testing + #[arg( + long, + env = "VLLM_PRECOMPILED_WHEEL_COMMIT", + default_value = "d7de043d55d1dd629554467e23874097e1c48993" + )] + pub vllm_precompiled_wheel_commit: String, } impl GceConfig { @@ -72,7 +82,8 @@ impl GceConfig { github_ref: None, vllm_org: "neuralmagic".to_string(), vllm_repo: "vllm".to_string(), - vllm_branch: "llm-d-release-0.4".to_string(), + vllm_sha: "a1b2c3d4e5f6".to_string(), + vllm_precompiled_wheel_commit: "d7de043d55d1dd629554467e23874097e1c48993".to_string(), } } @@ -126,7 +137,11 @@ mod tests { assert_eq!(config.spnl_github, None); assert_eq!(config.vllm_org, "neuralmagic"); assert_eq!(config.vllm_repo, "vllm"); - assert_eq!(config.vllm_branch, "llm-d-release-0.4"); + assert_eq!(config.vllm_sha, "a1b2c3d4e5f6"); + assert_eq!( + config.vllm_precompiled_wheel_commit, + "d7de043d55d1dd629554467e23874097e1c48993" + ); } #[test] diff --git a/spnl/src/vllm/gce/image.rs b/spnl/src/vllm/gce/image.rs index 8756ed01..d9a0f947 100644 --- a/spnl/src/vllm/gce/image.rs +++ b/spnl/src/vllm/gce/image.rs @@ -1,11 +1,11 @@ use super::args::GceConfig; /// Generate image name from patch content hash and vLLM source identifier -fn generate_image_name( +pub fn generate_image_name( patch_content: &[u8], vllm_org: &str, vllm_repo: &str, - vllm_branch: &str, + vllm_sha: &str, ) -> String { use sha2::{Digest, Sha256}; @@ -13,7 +13,7 @@ fn generate_image_name( let patch_hash = format!("{:x}", Sha256::digest(patch_content)); // Create vLLM source identifier - let vllm_source_id = format!("{}/{}@{}", vllm_org, vllm_repo, vllm_branch); + let vllm_source_id = format!("{}/{}@{}", vllm_org, vllm_repo, vllm_sha); // Combine and hash (GCE image names have 63 char limit, format is "vllm-spnl-{hash}") let combined = format!("{}{}", patch_hash, vllm_source_id); @@ -41,12 +41,12 @@ pub struct ImageCreateArgs { #[builder(setter(into), default = "vllm".to_string())] pub(crate) vllm_repo: String, - /// vLLM branch to use - #[builder(setter(into), default = "llm-d-release-0.4".to_string())] - pub(crate) vllm_branch: String, + /// vLLM commit SHA to use + #[builder(setter(into), default = "a1b2c3d4e5f6".to_string())] + pub(crate) vllm_sha: String, /// LLM-D version for patch file - #[builder(setter(into), default = "0.4.0".to_string())] + #[builder(setter(into), default = "0.5.0".to_string())] pub(crate) llmd_version: String, /// Custom image name (defaults to auto-generated from hash) @@ -66,7 +66,7 @@ pub struct ImageCreateArgs { fn generate_startup_script( vllm_org: &str, vllm_repo: &str, - vllm_branch: &str, + vllm_sha: &str, patch_content_b64: &str, ) -> String { format!( @@ -97,8 +97,10 @@ sudo resize2fs /dev/sda1 2>/dev/null || true echo "=== Installing vLLM ===" curl -LsSf https://astral.sh/uv/install.sh | sh source $HOME/.local/bin/env -git clone https://github.com/{}/{}.git vllm -b {} +git clone https://github.com/{}/{}.git vllm cd vllm +git fetch origin {} +git checkout {} echo "=== Applying vLLM patch ===" # Decode the embedded patch file @@ -179,7 +181,7 @@ sudo rm -rf /var/lib/apt/lists/* echo "=== Image preparation complete ===" "#, - vllm_org, vllm_repo, vllm_branch, patch_content_b64 + vllm_org, vllm_repo, vllm_sha, vllm_sha, patch_content_b64 ) } @@ -423,7 +425,7 @@ struct ImageCreationParams<'a> { project: &'a str, vllm_org: &'a str, vllm_repo: &'a str, - vllm_branch: &'a str, + vllm_sha: &'a str, llmd_version: &'a str, } @@ -435,8 +437,8 @@ async fn create_image_from_disk(params: ImageCreationParams<'_>) -> anyhow::Resu let client = Images::builder().build().await?; let description = format!( - "vLLM custom image with VLLM_ORG={}, VLLM_REPO={}, VLLM_BRANCH={}, LLMD_VERSION={}", - params.vllm_org, params.vllm_repo, params.vllm_branch, params.llmd_version + "vLLM custom image with VLLM_ORG={}, VLLM_REPO={}, VLLM_SHA={}, LLMD_VERSION={}", + params.vllm_org, params.vllm_repo, params.vllm_sha, params.llmd_version ); eprintln!("Creating custom image: {}", params.image_name); @@ -560,7 +562,7 @@ pub async fn create_image(args: ImageCreateArgs) -> anyhow::Result { eprintln!("Configuration:"); eprintln!(" VLLM_ORG: {}", args.vllm_org); eprintln!(" VLLM_REPO: {}", args.vllm_repo); - eprintln!(" VLLM_BRANCH: {}", args.vllm_branch); + eprintln!(" VLLM_SHA: {}", args.vllm_sha); eprintln!(" LLMD_VERSION: {}", args.llmd_version); eprintln!(" IMAGE_FAMILY: {}", args.image_family); eprintln!(" IMAGE_PROJECT: {}", project); @@ -568,12 +570,12 @@ pub async fn create_image(args: ImageCreateArgs) -> anyhow::Result { // Embed patch file at compile time based on LLMD version let patch_content = match args.llmd_version.as_str() { - "0.4.0" => { - include_bytes!("../../../docker/vllm/llm-d/patches/0.4.0/01-spans-llmd-vllm.patch.gz") + "0.5.0" => { + include_bytes!("../../../docker/vllm/llm-d/patches/0.5.0/01-spans-llmd-vllm.patch.gz") } _ => { return Err(anyhow::anyhow!( - "Unsupported LLMD version: {}. Only 0.4.0 is currently supported.", + "Unsupported LLMD version: {}. Only 0.5 is currently supported.", args.llmd_version )); } @@ -592,7 +594,7 @@ pub async fn create_image(args: ImageCreateArgs) -> anyhow::Result { patch_content, &args.vllm_org, &args.vllm_repo, - &args.vllm_branch, + &args.vllm_sha, ) }; @@ -626,7 +628,7 @@ pub async fn create_image(args: ImageCreateArgs) -> anyhow::Result { let startup_script = generate_startup_script( &args.vllm_org, &args.vllm_repo, - &args.vllm_branch, + &args.vllm_sha, &patch_content_b64, ); @@ -654,7 +656,7 @@ pub async fn create_image(args: ImageCreateArgs) -> anyhow::Result { project: &project, vllm_org: &args.vllm_org, vllm_repo: &args.vllm_repo, - vllm_branch: &args.vllm_branch, + vllm_sha: &args.vllm_sha, llmd_version: &args.llmd_version, }) .await?; @@ -701,8 +703,8 @@ mod tests { .force_overwrite(true) .vllm_org("test-org") .vllm_repo("test-repo") - .vllm_branch("test-branch") - .llmd_version("0.4.0") + .vllm_sha("abc123def456") + .llmd_version("0.5.0") .image_family("test-family") .build() .unwrap(); @@ -710,8 +712,8 @@ mod tests { assert!(args.force_overwrite); assert_eq!(args.vllm_org, "test-org"); assert_eq!(args.vllm_repo, "test-repo"); - assert_eq!(args.vllm_branch, "test-branch"); - assert_eq!(args.llmd_version, "0.4.0"); + assert_eq!(args.vllm_sha, "abc123def456"); + assert_eq!(args.llmd_version, "0.5.0"); assert_eq!(args.image_family, "test-family"); } } diff --git a/spnl/src/vllm/gce/up.rs b/spnl/src/vllm/gce/up.rs index e55df6d1..fd1e5e93 100644 --- a/spnl/src/vllm/gce/up.rs +++ b/spnl/src/vllm/gce/up.rs @@ -71,7 +71,8 @@ fn load_cloud_config(args: &UpArgs) -> anyhow::Result { }; let vllm_org = &args.config.vllm_org; let vllm_repo = &args.config.vllm_repo; - let vllm_branch = &args.config.vllm_branch; + let vllm_sha = &args.config.vllm_sha; + let vllm_precompiled_wheel_commit = &args.config.vllm_precompiled_wheel_commit; let model = args .model .clone() @@ -187,7 +188,11 @@ cloud_final_modules: []"# substitutions.insert("spnl_release", spnl_release.as_str()); substitutions.insert("vllm_org", vllm_org.as_str()); substitutions.insert("vllm_repo", vllm_repo.as_str()); - substitutions.insert("vllm_branch", vllm_branch.as_str()); + substitutions.insert("vllm_sha", vllm_sha.as_str()); + substitutions.insert( + "vllm_precompiled_wheel_commit", + vllm_precompiled_wheel_commit.as_str(), + ); substitutions.insert("model", model.as_str()); substitutions.insert("packages_section", &packages_section); substitutions.insert("setup_dev_script", &setup_dev_script); @@ -248,11 +253,19 @@ pub async fn up(args: UpArgs) -> anyhow::Result<()> { // Determine which image to use let source_image = if is_dev_mode { // Dev mode: use standard Ubuntu accelerator image - "projects/ubuntu-os-accelerator-images/global/images/ubuntu-accelerator-2404-amd64-with-nvidia-580-v20251210".to_string() + "projects/ubuntu-os-accelerator-images/global/images/ubuntu-accelerator-2404-amd64-with-nvidia-580-v20260203".to_string() } else { // Production mode: use custom image based on vLLM configuration - // Use the image family to get the latest image - format!("projects/{}/global/images/family/vllm-spnl", project) + // Generate the exact image name using the same logic as image creation + let patch_content = + include_bytes!("../../../docker/vllm/llm-d/patches/0.5.0/01-spans-llmd-vllm.patch.gz"); + let image_name = super::image::generate_image_name( + patch_content, + &args.config.vllm_org, + &args.config.vllm_repo, + &args.config.vllm_sha, + ); + format!("projects/{}/global/images/{}", project, image_name) }; #[derive(Tabled)] @@ -314,6 +327,15 @@ pub async fn up(args: UpArgs) -> anyhow::Result<()> { "v2-x86-template-1-4-0".to_string(), ); + // Configure disk based on mode + // Dev mode needs more space for compilation (300GB pd-balanced) + // Production mode uses smaller, faster disk (100GB pd-ssd) + let (disk_size_gb, disk_type) = if is_dev_mode { + (300, format!("zones/{}/diskTypes/pd-balanced", zone)) + } else { + (100, format!("zones/{}/diskTypes/pd-ssd", zone)) + }; + // Create the instance configuration matching the terraform file let instance = Instance::new() .set_name(&instance_name) @@ -325,8 +347,8 @@ pub async fn up(args: UpArgs) -> anyhow::Result<()> { .set_initialize_params( AttachedDiskInitializeParams::new() .set_source_image(&source_image) - .set_disk_size_gb(100) - .set_disk_type(format!("zones/{}/diskTypes/pd-ssd", zone)), + .set_disk_size_gb(disk_size_gb) + .set_disk_type(disk_type), ) .set_mode("READ_WRITE")]) .set_network_interfaces([NetworkInterface::new() diff --git a/spnl/src/vllm/patch.rs b/spnl/src/vllm/patch.rs index 2d3a7424..d424d660 100644 --- a/spnl/src/vllm/patch.rs +++ b/spnl/src/vllm/patch.rs @@ -1,7 +1,7 @@ use std::io::{Read, Write}; const PATCH_DATA: &[u8] = - include_bytes!("../../docker/vllm/llm-d/patches/0.4.0/01-spans-llmd-vllm.patch.gz"); + include_bytes!("../../docker/vllm/llm-d/patches/0.5.0/01-spans-llmd-vllm.patch.gz"); /// Emit the vLLM patchfile to stdout ///