name: Build llama-server
on:
  # Manual trigger — enter a specific version
  workflow_dispatch:
    inputs:
      version:
        description: 'llama.cpp version tag (e.g. b8416). Leave empty to auto-detect the latest.'
        required: false
        default: ''
        type: string
      cuda_architectures:
        description: 'CUDA SM targets (semicolon-separated)'
        required: false
        default: '75;80;86;89;90;100;120'
        type: string
  # Automatic trigger — check for a new upstream release every Monday
  schedule:
    - cron: '0 4 * * 1' # Monday 04:00 UTC (Sunday 9pm PDT)
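  # A manual run can also be started from the CLI, e.g. (assumes an
  # authenticated GitHub CLI):
  #   gh workflow run 'Build llama-server' -f version=b8416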
permissions:
  contents: write
jobs:
  # ── Step 1: Detect the version to build ──────────────────────────────────────
  detect-version:
    runs-on: ubuntu-22.04
    outputs:
      version: ${{ steps.resolve.outputs.version }}
      should_build: ${{ steps.resolve.outputs.should_build }}
      cuda_architectures: ${{ steps.resolve.outputs.cuda_architectures }}
    steps:
      - name: Resolve version
        id: resolve
        env:
          INPUT_VERSION: ${{ inputs.version }}
          INPUT_CUDA_ARCHS: ${{ inputs.cuda_architectures }}
          GH_TOKEN: ${{ github.token }}
        run: |
          CUDA_ARCHS="${INPUT_CUDA_ARCHS:-75;80;86;89;90;100;120}"
          echo "cuda_architectures=${CUDA_ARCHS}" >> $GITHUB_OUTPUT
          # If a version was provided manually, use it
          if [ -n "$INPUT_VERSION" ]; then
            echo "version=${INPUT_VERSION}" >> $GITHUB_OUTPUT
            echo "should_build=true" >> $GITHUB_OUTPUT
            echo "✅ Manual trigger: building ${INPUT_VERSION}"
            exit 0
          fi
          # Auto-detect the latest upstream version
          LATEST=$(curl -sf https://api.github.com/repos/ggml-org/llama.cpp/releases/latest \
            -H "Accept: application/vnd.github.v3+json" | jq -r '.tag_name')
          if [ -z "$LATEST" ] || [ "$LATEST" = "null" ]; then
            echo "❌ Failed to fetch latest llama.cpp version"
            echo "should_build=false" >> $GITHUB_OUTPUT
            exit 0
          fi
          echo "🔍 Latest upstream version: ${LATEST}"
          # Check whether we already have a release for this version
          EXISTING=$(curl -sf \
            -H "Accept: application/vnd.github.v3+json" \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${{ github.repository }}/releases/tags/${LATEST}" \
            | jq -r '.tag_name // empty')
          if [ "$EXISTING" = "$LATEST" ]; then
            echo "⏭️ Release ${LATEST} already exists — skipping"
            echo "should_build=false" >> $GITHUB_OUTPUT
          else
            echo "🆕 New version detected: ${LATEST} — triggering build"
            echo "version=${LATEST}" >> $GITHUB_OUTPUT
            echo "should_build=true" >> $GITHUB_OUTPUT
          fi
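  # To reproduce the auto-detect logic from a shell (a sketch; assumes jq and an
  # authenticated `gh` CLI, run inside a checkout of this repo):
  #   LATEST=$(curl -sf https://api.github.com/repos/ggml-org/llama.cpp/releases/latest | jq -r '.tag_name')
  #   gh release view "$LATEST" >/dev/null 2>&1 && echo "skip: ${LATEST} exists" || echo "build: ${LATEST}"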
  # ── Step 2: Build all variants ───────────────────────────────────────────────
  build:
    needs: detect-version
    if: needs.detect-version.outputs.should_build == 'true'
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        include:
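          # Note: the CUDA 12 variants are pinned to pre-Blackwell targets
          # (SM 75-90); the Blackwell-era SM 100/120 targets ship only in the
          # CUDA 13 variants.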
          # ── Linux x64 ──
          - name: linux-x64-cpu
            runner: ubuntu-22.04
            os: linux
            acceleration: cpu
            cuda_version: ''
            cuda_architectures: ''
          - name: linux-x64-cuda-12
            runner: ubuntu-22.04
            os: linux
            acceleration: cuda
            cuda_version: '12-8'
            cuda_pkg: 'cuda-toolkit-12-8'
            cuda_home: '/usr/local/cuda-12.8'
            cuda_architectures: '75;80;86;89;90'
          - name: linux-x64-cuda-13
            runner: ubuntu-22.04
            os: linux
            acceleration: cuda
            cuda_version: '13-1'
            cuda_pkg: 'cuda-toolkit-13-1'
            cuda_home: '/usr/local/cuda-13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: linux-x64-vulkan
            runner: ubuntu-22.04
            os: linux
            acceleration: vulkan
            cuda_version: ''
            cuda_architectures: ''
          # ── Linux arm64 ──
          # All arm64 builds use ggml_native: 'OFF' + -march=armv8-a for a
          # portable binary that runs on Jetson Orin (A78AE), Raspberry Pi 4
          # (A72), Pi 5 (A76), Rockchip RK3399/RK3588, and any aarch64 device.
          # Without this, Graviton runners bake in SVE/i8mm/bf16 instructions
          # that SIGILL on those targets.
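          # A cheap portability check for a produced arm64 binary (a sketch, run
          # on the weakest target device):
          #   ./llama-server --version   # should complete normally instead of dying with SIGILL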
          - name: linux-arm64-cpu
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cpu
            ggml_native: 'OFF'
            cuda_version: ''
            cuda_architectures: ''
          - name: linux-arm64-cuda-12
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cuda
            ggml_native: 'OFF'
            cuda_version: '12-8'
            cuda_pkg: 'cuda-toolkit-12-8'
            cuda_home: '/usr/local/cuda-12.8'
            cuda_architectures: '75;80;86;89;90'
          - name: linux-arm64-cuda-13
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cuda
            ggml_native: 'OFF'
            cuda_version: '13-1'
            cuda_pkg: 'cuda-toolkit-13-1'
            cuda_home: '/usr/local/cuda-13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: linux-arm64-vulkan
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: vulkan
            ggml_native: 'OFF'
            cuda_version: ''
            cuda_architectures: ''
          # ── Windows x64 ──
          - name: windows-x64-cpu
            runner: windows-2022
            os: windows
            acceleration: cpu
            cuda_version: ''
            cuda_architectures: ''
          - name: windows-x64-cuda-12
            runner: windows-2022
            os: windows
            acceleration: cuda
            cuda_version: '12.4'
            cuda_architectures: '75;80;86;89;90'
          - name: windows-x64-cuda-13
            runner: windows-2022
            os: windows
            acceleration: cuda
            cuda_version: '13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: windows-x64-vulkan
            runner: windows-2022
            os: windows
            acceleration: vulkan
            cuda_version: ''
            cuda_architectures: ''
          # ── macOS ──
          - name: macos-arm64-metal
            runner: macos-14
            os: macos
            acceleration: metal
            cuda_version: ''
            cuda_architectures: ''
          - name: macos-x64-cpu
            runner: macos-14
            os: macos
            acceleration: cpu
            cross_arch: x86_64
            cuda_version: ''
            cuda_architectures: ''
    name: ${{ matrix.name }}
    runs-on: ${{ matrix.runner }}
    steps:
      - name: Checkout build repo
        uses: actions/checkout@v4
      # ── Linux dependencies ──
      - name: Install build dependencies (Linux)
        if: matrix.os == 'linux'
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential cmake git
      - name: Install Vulkan SDK (Linux)
        if: matrix.os == 'linux' && matrix.acceleration == 'vulkan'
        run: |
          ARCH=$(dpkg --print-architecture)
          if [ "$ARCH" = "amd64" ]; then
            # LunarG full SDK — x86_64 only
            wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
            sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
            sudo apt-get update
            sudo apt-get install -y vulkan-sdk
          else
            # ARM64: use Ubuntu's built-in Vulkan packages (LunarG SDK is amd64-only)
            sudo apt-get install -y libvulkan-dev glslang-tools spirv-tools
          fi
      - name: Install CUDA toolkit (Linux)
        if: matrix.os == 'linux' && matrix.acceleration == 'cuda'
        run: |
          ARCH=$(dpkg --print-architecture)
          if [ "$ARCH" = "amd64" ]; then
            CUDA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64"
          elif [ "$ARCH" = "arm64" ]; then
CUDA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/sbsa"
else
echo "❌ Unsupported architecture: $ARCH"
exit 1
fi
wget -qO- "${CUDA_REPO}/3bf863cc.pub" | sudo gpg --dearmor -o /usr/share/keyrings/cuda-archive-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] ${CUDA_REPO} /" | sudo tee /etc/apt/sources.list.d/cuda.list
sudo apt-get update
sudo apt-get install -y ${{ matrix.cuda_pkg }}
echo "CUDA_HOME=${{ matrix.cuda_home }}" >> $GITHUB_ENV
echo "${{ matrix.cuda_home }}/bin" >> $GITHUB_PATH
      # ── Windows dependencies ──
      - name: Install Ninja (Windows)
        if: matrix.os == 'windows'
        run: choco install ninja -y
      - name: Install CUDA toolkit (Windows)
        if: matrix.os == 'windows' && matrix.acceleration == 'cuda'
        uses: ./.github/actions/windows-setup-cuda
        with:
          cuda_version: ${{ matrix.cuda_version }}
      - name: Install Vulkan SDK (Windows)
        if: matrix.os == 'windows' && matrix.acceleration == 'vulkan'
        env:
          VULKAN_VERSION: 1.4.313.0
        run: |
          curl -L -o vulkan-sdk.exe "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
          .\vulkan-sdk.exe --accept-licenses --default-answer --confirm-command install
          echo "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" >> $env:GITHUB_ENV
          echo "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" >> $env:GITHUB_PATH
# ── Build (Linux & macOS) ──
- name: Build llama-server (Unix)
if: matrix.os != 'windows'
env:
CROSS_ARCH: ${{ matrix.cross_arch }}
# GGML_NATIVE=OFF forces -march=armv8-a on arm64 runners, producing
# a portable binary that won't SIGILL on Jetson/Pi/Rockchip targets.
GGML_NATIVE: ${{ matrix.ggml_native || 'ON' }}
run: |
chmod +x scripts/build.sh
bash scripts/build.sh \
"${{ needs.detect-version.outputs.version }}" \
"${{ matrix.acceleration }}" \
"${{ matrix.cuda_architectures || needs.detect-version.outputs.cuda_architectures }}"
      # ── Build (Windows) ──
      - name: Build llama-server (Windows)
        if: matrix.os == 'windows'
        shell: cmd
        run: |
          set VERSION=${{ needs.detect-version.outputs.version }}
          set ACCELERATION=${{ matrix.acceleration }}
          set CUDA_ARCHS=${{ matrix.cuda_architectures || needs.detect-version.outputs.cuda_architectures }}
          rem Clone source
          git clone --depth 1 --branch %VERSION% https://github.com/ggml-org/llama.cpp.git C:\llama-source
          rem Set up the MSVC environment
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          rem Configure cmake with Ninja
          set CMAKE_ARGS=-B C:\llama-build -S C:\llama-source -G "Ninja Multi-Config" -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF
          if "%ACCELERATION%"=="cuda" set CMAKE_ARGS=%CMAKE_ARGS% -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=%CUDA_ARCHS%
          if "%ACCELERATION%"=="vulkan" set CMAKE_ARGS=%CMAKE_ARGS% -DGGML_VULKAN=ON
          cmake %CMAKE_ARGS%
          rem Build
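          rem Use one job fewer than the logical core count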
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build C:\llama-build --config Release -j %NINJA_JOBS%
      - name: Package (Windows)
        if: matrix.os == 'windows'
        shell: pwsh
        run: |
          $version = "${{ needs.detect-version.outputs.version }}"
          $artifactName = "llama-server-${version}-${{ matrix.name }}"
          $stagingDir = "C:\llama-staging\$artifactName"
          New-Item -ItemType Directory -Force -Path $stagingDir | Out-Null
          New-Item -ItemType Directory -Force -Path "dist" | Out-Null
          # Copy binary and DLLs
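          # "Ninja Multi-Config" places binaries under bin\Release; a
          # single-config generator would use bin\ directly, so both
          # locations are tried.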
Copy-Item "C:\llama-build\bin\Release\llama-server.exe" "$stagingDir\" -ErrorAction SilentlyContinue
Copy-Item "C:\llama-build\bin\llama-server.exe" "$stagingDir\" -ErrorAction SilentlyContinue
Get-ChildItem "C:\llama-build" -Recurse -Include "*.dll" | Copy-Item -Destination "$stagingDir\" -ErrorAction SilentlyContinue
Compress-Archive -Path "$stagingDir" -DestinationPath "dist\${artifactName}.zip"
Write-Host "✅ Built: dist\${artifactName}.zip"
      # ── Smoke test: verify the binary runs ──
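      # The macOS x64 cross-build is excluded from the Unix smoke test: the
      # arm64 runner can't be assumed to execute a cross-compiled x86_64 binary.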
      - name: Smoke test (Unix)
        if: matrix.os != 'windows' && matrix.cross_arch == ''
        run: |
          BINARY=$(find dist/ -name '*.tar.gz' | head -1)
          if [ -z "$BINARY" ]; then
            echo "❌ No tarball found in dist/"
            exit 1
          fi
          echo "📦 Testing: $BINARY"
          mkdir -p /tmp/smoke-test
          tar -xzf "$BINARY" -C /tmp/smoke-test
          SERVER=$(find /tmp/smoke-test -name 'llama-server' -type f | head -1)
          if [ -z "$SERVER" ]; then
            echo "❌ llama-server binary not found in archive"
            exit 1
          fi
          chmod +x "$SERVER"
          echo "🔍 Version:" && "$SERVER" --version
          echo "✅ Smoke test passed"
      - name: Smoke test (Windows)
        if: matrix.os == 'windows'
        shell: pwsh
        run: |
          $zip = Get-ChildItem dist\*.zip | Select-Object -First 1
          if (-not $zip) { Write-Error "No zip found"; exit 1 }
          Write-Host "📦 Testing: $($zip.Name)"
          Expand-Archive -Path $zip.FullName -DestinationPath C:\smoke-test -Force
          $exe = Get-ChildItem C:\smoke-test -Recurse -Filter "llama-server.exe" | Select-Object -First 1
          if (-not $exe) { Write-Error "llama-server.exe not found"; exit 1 }
          Write-Host "🔍 Version:"
          & $exe.FullName --version
          if ($LASTEXITCODE -ne 0) { Write-Error "Smoke test failed"; exit 1 }
          Write-Host "✅ Smoke test passed"
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.name }}
          path: |
            dist/*.tar.gz
            dist/*.zip
          retention-days: 7
  # ── Step 3: Publish release ──────────────────────────────────────────────────
  release:
    needs: [detect-version, build]
    runs-on: ubuntu-22.04
    steps:
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts/
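      # download-artifact@v4 unpacks each artifact into artifacts/<artifact-name>/,
      # hence the ** glob in `files:` below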
      - name: List artifacts
        run: find artifacts/ -type f \( -name '*.tar.gz' -o -name '*.zip' \) | sort
      - name: Create GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: ${{ needs.detect-version.outputs.version }}
          name: "llama-server ${{ needs.detect-version.outputs.version }}"
          body: |
            Pre-built `llama-server` binaries from [llama.cpp ${{ needs.detect-version.outputs.version }}](https://github.com/ggml-org/llama.cpp/releases/tag/${{ needs.detect-version.outputs.version }}).

            ## Downloads — Linux

            | Variant | File |
            |---------|------|
            | x64 CPU | `llama-server-*-linux-x64-cpu.tar.gz` |
            | x64 CUDA 12.8 | `llama-server-*-linux-x64-cuda-12.tar.gz` |
            | x64 CUDA 13.1 | `llama-server-*-linux-x64-cuda-13.tar.gz` |
            | x64 Vulkan | `llama-server-*-linux-x64-vulkan.tar.gz` |
            | arm64 CPU | `llama-server-*-linux-arm64-cpu.tar.gz` |
            | arm64 CUDA 12.8 | `llama-server-*-linux-arm64-cuda-12.tar.gz` |
            | arm64 CUDA 13.1 | `llama-server-*-linux-arm64-cuda-13.tar.gz` |
            | arm64 Vulkan | `llama-server-*-linux-arm64-vulkan.tar.gz` |

            ## Downloads — Windows

            | Variant | File |
            |---------|------|
            | x64 CPU | `llama-server-*-windows-x64-cpu.zip` |
            | x64 CUDA 12.4 | `llama-server-*-windows-x64-cuda-12.zip` |
            | x64 CUDA 13.1 | `llama-server-*-windows-x64-cuda-13.zip` |
            | x64 Vulkan | `llama-server-*-windows-x64-vulkan.zip` |

            ## Downloads — macOS

            | Variant | File |
            |---------|------|
            | arm64 Metal | `llama-server-*-macos-arm64-metal.tar.gz` |
            | x64 CPU | `llama-server-*-macos-x64-cpu.tar.gz` |

            **CUDA SM targets:** `${{ needs.detect-version.outputs.cuda_architectures }}`
          files: artifacts/**/*
          fail_on_unmatched_files: false
          generate_release_notes: false