Build llama-server #14
name: Build llama-server

on:
  # Manual trigger — enter a specific version
  workflow_dispatch:
    inputs:
      version:
        description: 'llama.cpp version tag (e.g. b8416). Leave empty to auto-detect latest.'
        required: false
        default: ''
        type: string
      cuda_architectures:
        description: 'CUDA SM targets (semicolon-separated)'
        required: false
        default: '75;80;86;89;90;100;120'
        type: string
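      # Note: the CUDA matrix entries in the build job pin their own
      # per-toolkit arch lists, so this input only reaches variants that
      # don't set cuda_architectures themselves.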
  # Automatic trigger — check for new upstream release every Monday
  schedule:
    - cron: '0 4 * * 1' # Monday 04:00 UTC (Sunday 9pm PDT)
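  # On scheduled runs the `inputs` context is empty, so the resolve step
  # below falls back to auto-detection and the default CUDA arch list.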
permissions:
  contents: write

jobs:
  # ── Step 1: Detect the version to build ──────────────────────────────────
  detect-version:
    runs-on: ubuntu-22.04
    outputs:
      version: ${{ steps.resolve.outputs.version }}
      should_build: ${{ steps.resolve.outputs.should_build }}
      cuda_architectures: ${{ steps.resolve.outputs.cuda_architectures }}
    steps:
      - name: Resolve version
        id: resolve
        env:
          INPUT_VERSION: ${{ inputs.version }}
          INPUT_CUDA_ARCHS: ${{ inputs.cuda_architectures }}
          GH_TOKEN: ${{ github.token }}
        run: |
          CUDA_ARCHS="${INPUT_CUDA_ARCHS:-75;80;86;89;90;100;120}"
          echo "cuda_architectures=${CUDA_ARCHS}" >> $GITHUB_OUTPUT

          # If version was provided manually, use it
          if [ -n "$INPUT_VERSION" ]; then
            echo "version=${INPUT_VERSION}" >> $GITHUB_OUTPUT
            echo "should_build=true" >> $GITHUB_OUTPUT
            echo "✅ Manual trigger: building ${INPUT_VERSION}"
            exit 0
          fi

          # Auto-detect latest upstream version
          # (authenticated: unauthenticated API calls hit the shared-IP rate
          # limit on hosted runners, and GH_TOKEN is already in env)
          LATEST=$(curl -sf https://api.github.com/repos/ggml-org/llama.cpp/releases/latest \
            -H "Accept: application/vnd.github.v3+json" \
            -H "Authorization: Bearer ${GH_TOKEN}" | jq -r '.tag_name')
| if [ -z "$LATEST" ] || [ "$LATEST" = "null" ]; then | |
| echo "❌ Failed to fetch latest llama.cpp version" | |
| echo "should_build=false" >> $GITHUB_OUTPUT | |
| exit 0 | |
| fi | |
| echo "🔍 Latest upstream version: ${LATEST}" | |
| # Check if we already have a release for this version | |
| EXISTING=$(curl -sf \ | |
| -H "Accept: application/vnd.github.v3+json" \ | |
| -H "Authorization: Bearer ${GH_TOKEN}" \ | |
| "https://api.github.com/repos/${{ github.repository }}/releases/tags/${LATEST}" \ | |
| | jq -r '.tag_name // empty') | |
| if [ "$EXISTING" = "$LATEST" ]; then | |
| echo "⏭️ Release ${LATEST} already exists — skipping" | |
| echo "should_build=false" >> $GITHUB_OUTPUT | |
| else | |
| echo "🆕 New version detected: ${LATEST} — triggering build" | |
| echo "version=${LATEST}" >> $GITHUB_OUTPUT | |
| echo "should_build=true" >> $GITHUB_OUTPUT | |
| fi | |
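      # Local sanity check (a sketch, not part of the workflow): the same
      # lookup can be reproduced from any shell to verify the jq parsing:
      #   curl -sf https://api.github.com/repos/ggml-org/llama.cpp/releases/latest \
      #     | jq -r '.tag_name'
      # which prints a tag such as "b8416" when the API is reachable.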
  # ── Step 2: Build all variants ───────────────────────────────────────────
  build:
    needs: detect-version
    if: needs.detect-version.outputs.should_build == 'true'
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        include:
          # ── Linux x64 ──
          - name: linux-x64-cpu
            runner: ubuntu-22.04
            os: linux
            acceleration: cpu
            cuda_version: ''
            cuda_architectures: ''
          - name: linux-x64-cuda-12
            runner: ubuntu-22.04
            os: linux
            acceleration: cuda
            cuda_version: '12-8'
            cuda_pkg: 'cuda-toolkit-12-8'
            cuda_home: '/usr/local/cuda-12.8'
            cuda_architectures: '75;80;86;89;90'
          - name: linux-x64-cuda-13
            runner: ubuntu-22.04
            os: linux
            acceleration: cuda
            cuda_version: '13-1'
            cuda_pkg: 'cuda-toolkit-13-1'
            cuda_home: '/usr/local/cuda-13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: linux-x64-vulkan
            runner: ubuntu-22.04
            os: linux
            acceleration: vulkan
            cuda_version: ''
            cuda_architectures: ''

          # ── Linux arm64 ──
          # All arm64 builds use ggml_native: 'OFF' + -march=armv8-a for a
          # portable binary that runs on Jetson Orin (A78AE), Raspberry Pi 4
          # (A72), Pi 5 (A76), Rockchip RK3399/RK3588, and any aarch64 device.
          # Without this, Graviton runners bake in SVE/i8mm/bf16 instructions
          # that SIGILL on those targets.
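          # Spot-check (sketch): running `llama-server --version` on the
          # oldest target, e.g. a Pi 4's Cortex-A72, is the quickest SIGILL
          # check for a published arm64 build.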
          - name: linux-arm64-cpu
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cpu
            ggml_native: 'OFF'
            cuda_version: ''
            cuda_architectures: ''
          - name: linux-arm64-cuda-12
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cuda
            ggml_native: 'OFF'
            cuda_version: '12-8'
            cuda_pkg: 'cuda-toolkit-12-8'
            cuda_home: '/usr/local/cuda-12.8'
            cuda_architectures: '75;80;86;89;90'
          - name: linux-arm64-cuda-13
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cuda
            ggml_native: 'OFF'
            cuda_version: '13-1'
            cuda_pkg: 'cuda-toolkit-13-1'
            cuda_home: '/usr/local/cuda-13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: linux-arm64-vulkan
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: vulkan
            ggml_native: 'OFF'
            cuda_version: ''
            cuda_architectures: ''

          # ── Windows x64 ──
          - name: windows-x64-cpu
            runner: windows-2022
            os: windows
            acceleration: cpu
            cuda_version: ''
            cuda_architectures: ''
          - name: windows-x64-cuda-12
            runner: windows-2022
            os: windows
            acceleration: cuda
            cuda_version: '12.4'
            cuda_architectures: '75;80;86;89;90'
          - name: windows-x64-cuda-13
            runner: windows-2022
            os: windows
            acceleration: cuda
            cuda_version: '13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: windows-x64-vulkan
            runner: windows-2022
            os: windows
            acceleration: vulkan
            cuda_version: ''
            cuda_architectures: ''

          # ── macOS ──
          - name: macos-arm64-metal
            runner: macos-14
            os: macos
            acceleration: metal
            cuda_version: ''
            cuda_architectures: ''
          - name: macos-x64-cpu
            runner: macos-14
            os: macos
            acceleration: cpu
            cross_arch: x86_64
            cuda_version: ''
            cuda_architectures: ''
    name: ${{ matrix.name }}
    runs-on: ${{ matrix.runner }}
    steps:
      - name: Checkout build repo
        uses: actions/checkout@v4

      # ── Linux dependencies ──
      - name: Install build dependencies (Linux)
        if: matrix.os == 'linux'
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential cmake git
      - name: Install Vulkan SDK (Linux)
        if: matrix.os == 'linux' && matrix.acceleration == 'vulkan'
        run: |
          ARCH=$(dpkg --print-architecture)
          if [ "$ARCH" = "amd64" ]; then
            # LunarG full SDK — x86_64 only
            wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
            sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
            sudo apt-get update
            sudo apt-get install -y vulkan-sdk
          else
            # ARM64: use Ubuntu's built-in Vulkan packages (LunarG SDK is amd64-only)
            sudo apt-get install -y libvulkan-dev glslang-tools spirv-tools
          fi
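      # Quick check (sketch): `command -v glslangValidator` should resolve on
      # both branches above, since both vulkan-sdk and glslang-tools ship the
      # shader toolchain the Vulkan build needs.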
      - name: Install CUDA toolkit (Linux)
        if: matrix.os == 'linux' && matrix.acceleration == 'cuda'
        run: |
          ARCH=$(dpkg --print-architecture)
          if [ "$ARCH" = "amd64" ]; then
            CUDA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64"
          elif [ "$ARCH" = "arm64" ]; then
            CUDA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/sbsa"
          else
            echo "❌ Unsupported architecture: $ARCH"
            exit 1
          fi
          wget -qO- "${CUDA_REPO}/3bf863cc.pub" | sudo gpg --dearmor -o /usr/share/keyrings/cuda-archive-keyring.gpg
          echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] ${CUDA_REPO} /" | sudo tee /etc/apt/sources.list.d/cuda.list
          sudo apt-get update
          sudo apt-get install -y ${{ matrix.cuda_pkg }}
          echo "CUDA_HOME=${{ matrix.cuda_home }}" >> $GITHUB_ENV
          echo "${{ matrix.cuda_home }}/bin" >> $GITHUB_PATH
      # ── Windows dependencies ──
      - name: Install Ninja (Windows)
        if: matrix.os == 'windows'
        run: choco install ninja -y
      - name: Install CUDA toolkit (Windows)
        if: matrix.os == 'windows' && matrix.acceleration == 'cuda'
        uses: ./.github/actions/windows-setup-cuda
        with:
          cuda_version: ${{ matrix.cuda_version }}
      - name: Install Vulkan SDK (Windows)
        if: matrix.os == 'windows' && matrix.acceleration == 'vulkan'
        env:
          VULKAN_VERSION: 1.4.313.0
        run: |
          curl -L -o vulkan-sdk.exe "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
          .\vulkan-sdk.exe --accept-licenses --default-answer --confirm-command install
          echo "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" >> $env:GITHUB_ENV
          echo "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" >> $env:GITHUB_PATH

      # ── Build (Linux & macOS) ──
      - name: Build llama-server (Unix)
        if: matrix.os != 'windows'
        env:
          CROSS_ARCH: ${{ matrix.cross_arch }}
          # GGML_NATIVE=OFF forces -march=armv8-a on arm64 runners, producing
          # a portable binary that won't SIGILL on Jetson/Pi/Rockchip targets.
          GGML_NATIVE: ${{ matrix.ggml_native || 'ON' }}
        run: |
          chmod +x scripts/build.sh
          bash scripts/build.sh \
            "${{ needs.detect-version.outputs.version }}" \
            "${{ matrix.acceleration }}" \
            "${{ matrix.cuda_architectures || needs.detect-version.outputs.cuda_architectures }}"
      # ── Build (Windows) ──
      - name: Build llama-server (Windows)
        if: matrix.os == 'windows'
        shell: cmd
        run: |
          set VERSION=${{ needs.detect-version.outputs.version }}
          set ACCELERATION=${{ matrix.acceleration }}
          set CUDA_ARCHS=${{ matrix.cuda_architectures || needs.detect-version.outputs.cuda_architectures }}

          rem Clone source
          git clone --depth 1 --branch %VERSION% https://github.com/ggml-org/llama.cpp.git C:\llama-source

          rem Setup MSVC environment
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64

          rem Configure cmake with Ninja
          set CMAKE_ARGS=-B C:\llama-build -S C:\llama-source -G "Ninja Multi-Config" -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF
          if "%ACCELERATION%"=="cuda" set CMAKE_ARGS=%CMAKE_ARGS% -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=%CUDA_ARCHS%
          if "%ACCELERATION%"=="vulkan" set CMAKE_ARGS=%CMAKE_ARGS% -DGGML_VULKAN=ON
          cmake %CMAKE_ARGS%

          rem Build
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build C:\llama-build --config Release -j %NINJA_JOBS%
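      # Design note: NINJA_JOBS is cores - 1, leaving one core of headroom
      # for the runner agent during long nvcc compiles.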
      - name: Package (Windows)
        if: matrix.os == 'windows'
        shell: pwsh
        run: |
          $version = "${{ needs.detect-version.outputs.version }}"
          $artifactName = "llama-server-${version}-${{ matrix.name }}"
          $stagingDir = "C:\llama-staging\$artifactName"
          New-Item -ItemType Directory -Force -Path $stagingDir | Out-Null
          New-Item -ItemType Directory -Force -Path "dist" | Out-Null

          # Copy binary and DLLs
          Copy-Item "C:\llama-build\bin\Release\llama-server.exe" "$stagingDir\" -ErrorAction SilentlyContinue
          Copy-Item "C:\llama-build\bin\llama-server.exe" "$stagingDir\" -ErrorAction SilentlyContinue
          Get-ChildItem "C:\llama-build" -Recurse -Include "*.dll" | Copy-Item -Destination "$stagingDir\" -ErrorAction SilentlyContinue

          Compress-Archive -Path "$stagingDir" -DestinationPath "dist\${artifactName}.zip"
          Write-Host "✅ Built: dist\${artifactName}.zip"
      # ── Smoke test: verify the binary runs ──
      - name: Smoke test (Unix)
        if: matrix.os != 'windows' && matrix.cross_arch == ''
        run: |
          BINARY=$(find dist/ -name '*.tar.gz' | head -1)
          if [ -z "$BINARY" ]; then
            echo "❌ No tarball found in dist/"
            exit 1
          fi
          echo "📦 Testing: $BINARY"
          mkdir -p /tmp/smoke-test
          tar -xzf "$BINARY" -C /tmp/smoke-test
          SERVER=$(find /tmp/smoke-test -name 'llama-server' -type f | head -1)
          if [ -z "$SERVER" ]; then
            echo "❌ llama-server binary not found in archive"
            exit 1
          fi
          chmod +x "$SERVER"
          echo "🔍 Version:" && "$SERVER" --version
          echo "✅ Smoke test passed"
      - name: Smoke test (Windows)
        if: matrix.os == 'windows'
        shell: pwsh
        run: |
          $zip = Get-ChildItem dist\*.zip | Select-Object -First 1
          if (-not $zip) { Write-Error "No zip found"; exit 1 }
          Write-Host "📦 Testing: $($zip.Name)"
          Expand-Archive -Path $zip.FullName -DestinationPath C:\smoke-test -Force
          $exe = Get-ChildItem C:\smoke-test -Recurse -Filter "llama-server.exe" | Select-Object -First 1
          if (-not $exe) { Write-Error "llama-server.exe not found"; exit 1 }
          Write-Host "🔍 Version:"
          & $exe.FullName --version
          if ($LASTEXITCODE -ne 0) { Write-Error "Smoke test failed"; exit 1 }
          Write-Host "✅ Smoke test passed"
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.name }}
          path: |
            dist/*.tar.gz
            dist/*.zip
          retention-days: 7

  # ── Step 3: Publish release ──────────────────────────────────────────────
  release:
    needs: [detect-version, build]
    runs-on: ubuntu-22.04
    steps:
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts/
      - name: List artifacts
        run: find artifacts/ -type f \( -name '*.tar.gz' -o -name '*.zip' \) | sort
      - name: Create GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: ${{ needs.detect-version.outputs.version }}
          name: "llama-server ${{ needs.detect-version.outputs.version }}"
          body: |
            Pre-built `llama-server` binaries from [llama.cpp ${{ needs.detect-version.outputs.version }}](https://github.com/ggml-org/llama.cpp/releases/tag/${{ needs.detect-version.outputs.version }}).

            ## Downloads — Linux

            | Variant | File |
            |---------|------|
            | x64 CPU | `llama-server-*-linux-x64-cpu.tar.gz` |
            | x64 CUDA 12.8 | `llama-server-*-linux-x64-cuda-12.tar.gz` |
            | x64 CUDA 13.1 | `llama-server-*-linux-x64-cuda-13.tar.gz` |
            | x64 Vulkan | `llama-server-*-linux-x64-vulkan.tar.gz` |
            | arm64 CPU | `llama-server-*-linux-arm64-cpu.tar.gz` |
            | arm64 CUDA 12.8 | `llama-server-*-linux-arm64-cuda-12.tar.gz` |
            | arm64 CUDA 13.1 | `llama-server-*-linux-arm64-cuda-13.tar.gz` |
            | arm64 Vulkan | `llama-server-*-linux-arm64-vulkan.tar.gz` |

            ## Downloads — Windows

            | Variant | File |
            |---------|------|
            | x64 CPU | `llama-server-*-windows-x64-cpu.zip` |
            | x64 CUDA 12.4 | `llama-server-*-windows-x64-cuda-12.zip` |
            | x64 CUDA 13.1 | `llama-server-*-windows-x64-cuda-13.zip` |
            | x64 Vulkan | `llama-server-*-windows-x64-vulkan.zip` |

            ## Downloads — macOS

            | Variant | File |
            |---------|------|
            | arm64 Metal | `llama-server-*-macos-arm64-metal.tar.gz` |
            | x64 CPU | `llama-server-*-macos-x64-cpu.tar.gz` |

            ### CUDA SM targets: `${{ needs.detect-version.outputs.cuda_architectures }}`
          files: artifacts/**/*
          fail_on_unmatched_files: false
          generate_release_notes: false
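# Consumer sketch (OWNER/REPO is a placeholder, adjust to this repository):
# once a release is published, a binary can be fetched with the GitHub CLI:
#   gh release download b8416 --repo OWNER/REPO --pattern '*linux-x64-cuda-13*'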