name: Build llama-server
on:
  # Manual trigger — enter a specific version
  workflow_dispatch:
    inputs:
      version:
        description: 'llama.cpp version tag (e.g. b8416). Leave empty to auto-detect the latest.'
        required: false
        default: ''
        type: string
      cuda_architectures:
        description: 'CUDA SM targets (semicolon-separated)'
        required: false
        default: '75;80;86;89;90;100;120'
        type: string
  # Automatic trigger — check for a new upstream release every Monday
  schedule:
    - cron: '0 4 * * 1' # Monday 04:00 UTC (Sunday 9pm PDT)
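  # A manual run can also be started from the CLI, e.g. (assumes an
  # authenticated GitHub CLI):
  #   gh workflow run 'Build llama-server' -f version=b8416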
permissions:
  contents: write
jobs:
  # ── Step 1: Detect the version to build ──────────────────────────────────────
  detect-version:
    runs-on: ubuntu-22.04
    outputs:
      version: ${{ steps.resolve.outputs.version }}
      should_build: ${{ steps.resolve.outputs.should_build }}
      cuda_architectures: ${{ steps.resolve.outputs.cuda_architectures }}
    steps:
      - name: Resolve version
        id: resolve
        env:
          INPUT_VERSION: ${{ inputs.version }}
          INPUT_CUDA_ARCHS: ${{ inputs.cuda_architectures }}
          GH_TOKEN: ${{ github.token }}
        run: |
          CUDA_ARCHS="${INPUT_CUDA_ARCHS:-75;80;86;89;90;100;120}"
          echo "cuda_architectures=${CUDA_ARCHS}" >> $GITHUB_OUTPUT
          # If a version was provided manually, use it
          if [ -n "$INPUT_VERSION" ]; then
            echo "version=${INPUT_VERSION}" >> $GITHUB_OUTPUT
            echo "should_build=true" >> $GITHUB_OUTPUT
            echo "✅ Manual trigger: building ${INPUT_VERSION}"
            exit 0
          fi
          # Auto-detect the latest upstream version
          LATEST=$(curl -sf https://api.github.com/repos/ggml-org/llama.cpp/releases/latest \
            -H "Accept: application/vnd.github.v3+json" | jq -r '.tag_name')
          if [ -z "$LATEST" ] || [ "$LATEST" = "null" ]; then
            echo "❌ Failed to fetch latest llama.cpp version"
            echo "should_build=false" >> $GITHUB_OUTPUT
            exit 0
          fi
          echo "🔍 Latest upstream version: ${LATEST}"
          # Check whether we already have a release for this version
          EXISTING=$(curl -sf \
            -H "Accept: application/vnd.github.v3+json" \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${{ github.repository }}/releases/tags/${LATEST}" \
            | jq -r '.tag_name // empty')
          if [ "$EXISTING" = "$LATEST" ]; then
            echo "⏭️ Release ${LATEST} already exists — skipping"
            echo "should_build=false" >> $GITHUB_OUTPUT
          else
            echo "🆕 New version detected: ${LATEST} — triggering build"
            echo "version=${LATEST}" >> $GITHUB_OUTPUT
            echo "should_build=true" >> $GITHUB_OUTPUT
          fi
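  # To reproduce the auto-detect logic from a shell (a sketch; assumes jq and an
  # authenticated `gh` CLI, run inside a checkout of this repo):
  #   LATEST=$(curl -sf https://api.github.com/repos/ggml-org/llama.cpp/releases/latest | jq -r '.tag_name')
  #   gh release view "$LATEST" >/dev/null 2>&1 && echo "skip: ${LATEST} exists" || echo "build: ${LATEST}"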
  # ── Step 2: Build all variants ───────────────────────────────────────────────
  build:
    needs: detect-version
    if: needs.detect-version.outputs.should_build == 'true'
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        include:
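          # Note: the CUDA 12 variants are pinned to pre-Blackwell targets
          # (SM 75-90); the Blackwell-era SM 100/120 targets ship only in the
          # CUDA 13 variants.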
          # ── Linux x64 ──
          - name: linux-x64-cpu
            runner: ubuntu-22.04
            os: linux
            acceleration: cpu
            cuda_version: ''
            cuda_architectures: ''
          - name: linux-x64-cuda-12
            runner: ubuntu-22.04
            os: linux
            acceleration: cuda
            cuda_version: '12-8'
            cuda_pkg: 'cuda-toolkit-12-8'
            cuda_home: '/usr/local/cuda-12.8'
            cuda_architectures: '75;80;86;89;90'
          - name: linux-x64-cuda-13
            runner: ubuntu-22.04
            os: linux
            acceleration: cuda
            cuda_version: '13-1'
            cuda_pkg: 'cuda-toolkit-13-1'
            cuda_home: '/usr/local/cuda-13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: linux-x64-vulkan
            runner: ubuntu-22.04
            os: linux
            acceleration: vulkan
            cuda_version: ''
            cuda_architectures: ''
          # ── Linux arm64 ──
          # All arm64 builds use ggml_native: 'OFF' + -march=armv8-a for a
          # portable binary that runs on Jetson Orin (A78AE), Raspberry Pi 4
          # (A72), Pi 5 (A76), Rockchip RK3399/RK3588, and any aarch64 device.
          # Without this, Graviton runners bake in SVE/i8mm/bf16 instructions
          # that SIGILL on those targets.
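          # A cheap portability check for a produced arm64 binary (a sketch, run
          # on the weakest target device):
          #   ./llama-server --version   # should complete normally instead of dying with SIGILL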
          - name: linux-arm64-cpu
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cpu
            ggml_native: 'OFF'
            cuda_version: ''
            cuda_architectures: ''
          - name: linux-arm64-cuda-12
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cuda
            ggml_native: 'OFF'
            cuda_version: '12-8'
            cuda_pkg: 'cuda-toolkit-12-8'
            cuda_home: '/usr/local/cuda-12.8'
            cuda_architectures: '75;80;86;89;90'
          - name: linux-arm64-cuda-13
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cuda
            ggml_native: 'OFF'
            cuda_version: '13-1'
            cuda_pkg: 'cuda-toolkit-13-1'
            cuda_home: '/usr/local/cuda-13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: linux-arm64-vulkan
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: vulkan
            ggml_native: 'OFF'
            cuda_version: ''
            cuda_architectures: ''
          # ── Windows x64 ──
          - name: windows-x64-cpu
            runner: windows-2022
            os: windows
            acceleration: cpu
            cuda_version: ''
            cuda_architectures: ''
          - name: windows-x64-cuda-12
            runner: windows-2022
            os: windows
            acceleration: cuda
            cuda_version: '12.4'
            cuda_architectures: '75;80;86;89;90'
          - name: windows-x64-cuda-13
            runner: windows-2022
            os: windows
            acceleration: cuda
            cuda_version: '13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: windows-x64-vulkan
            runner: windows-2022
            os: windows
            acceleration: vulkan
            cuda_version: ''
            cuda_architectures: ''
          # ── macOS ──
          - name: macos-arm64-metal
            runner: macos-14
            os: macos
            acceleration: metal
            cuda_version: ''
            cuda_architectures: ''
          - name: macos-x64-cpu
            runner: macos-14
            os: macos
            acceleration: cpu
            cross_arch: x86_64
            cuda_version: ''
            cuda_architectures: ''
    name: ${{ matrix.name }}
    runs-on: ${{ matrix.runner }}
    steps:
      - name: Checkout build repo
        uses: actions/checkout@v4
      # ── Linux dependencies ──
      - name: Install build dependencies (Linux)
        if: matrix.os == 'linux'
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential cmake git
      - name: Install Vulkan SDK (Linux)
        if: matrix.os == 'linux' && matrix.acceleration == 'vulkan'
        run: |
          ARCH=$(dpkg --print-architecture)
          if [ "$ARCH" = "amd64" ]; then
            # LunarG full SDK — x86_64 only
            wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
            sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
            sudo apt-get update
            sudo apt-get install -y vulkan-sdk
          else
            # ARM64: use Ubuntu's built-in Vulkan packages (LunarG SDK is amd64-only)
            sudo apt-get install -y libvulkan-dev glslang-tools spirv-tools
          fi
      - name: Install CUDA toolkit (Linux)
        if: matrix.os == 'linux' && matrix.acceleration == 'cuda'
        run: |
          ARCH=$(dpkg --print-architecture)
          if [ "$ARCH" = "amd64" ]; then
            CUDA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64"
          elif [ "$ARCH" = "arm64" ]; then
CUDA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/sbsa"
else
echo "❌ Unsupported architecture: $ARCH"
exit 1
fi
wget -qO- "${CUDA_REPO}/3bf863cc.pub" | sudo gpg --dearmor -o /usr/share/keyrings/cuda-archive-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] ${CUDA_REPO} /" | sudo tee /etc/apt/sources.list.d/cuda.list
sudo apt-get update
sudo apt-get install -y ${{ matrix.cuda_pkg }}
echo "CUDA_HOME=${{ matrix.cuda_home }}" >> $GITHUB_ENV
echo "${{ matrix.cuda_home }}/bin" >> $GITHUB_PATH
      # ── Windows dependencies ──
      - name: Install Ninja (Windows)
        if: matrix.os == 'windows'
        run: choco install ninja -y
      - name: Install CUDA toolkit (Windows)
        if: matrix.os == 'windows' && matrix.acceleration == 'cuda'
        uses: ./.github/actions/windows-setup-cuda
        with:
          cuda_version: ${{ matrix.cuda_version }}
      - name: Install Vulkan SDK (Windows)
        if: matrix.os == 'windows' && matrix.acceleration == 'vulkan'
        env:
          VULKAN_VERSION: 1.4.313.0
        run: |
          curl -L -o vulkan-sdk.exe "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
          .\vulkan-sdk.exe --accept-licenses --default-answer --confirm-command install
          echo "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" >> $env:GITHUB_ENV
          echo "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" >> $env:GITHUB_PATH
# ── Build (Linux & macOS) ──
- name: Build llama-server (Unix)
if: matrix.os != 'windows'
env:
CROSS_ARCH: ${{ matrix.cross_arch }}
# GGML_NATIVE=OFF forces -march=armv8-a on arm64 runners, producing
# a portable binary that won't SIGILL on Jetson/Pi/Rockchip targets.
GGML_NATIVE: ${{ matrix.ggml_native || 'ON' }}
run: |
chmod +x scripts/build.sh
bash scripts/build.sh \
"${{ needs.detect-version.outputs.version }}" \
"${{ matrix.acceleration }}" \
"${{ matrix.cuda_architectures || needs.detect-version.outputs.cuda_architectures }}"
      # ── Build (Windows) ──
      - name: Build llama-server (Windows)
        if: matrix.os == 'windows'
        shell: cmd
        run: |
          set VERSION=${{ needs.detect-version.outputs.version }}
          set ACCELERATION=${{ matrix.acceleration }}
          set CUDA_ARCHS=${{ matrix.cuda_architectures || needs.detect-version.outputs.cuda_architectures }}
          rem Clone source
          git clone --depth 1 --branch %VERSION% https://github.com/ggml-org/llama.cpp.git C:\llama-source
          rem Set up the MSVC environment
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          rem Configure cmake with Ninja
          set CMAKE_ARGS=-B C:\llama-build -S C:\llama-source -G "Ninja Multi-Config" -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF
          if "%ACCELERATION%"=="cuda" set CMAKE_ARGS=%CMAKE_ARGS% -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=%CUDA_ARCHS%
          if "%ACCELERATION%"=="vulkan" set CMAKE_ARGS=%CMAKE_ARGS% -DGGML_VULKAN=ON
          cmake %CMAKE_ARGS%
          rem Build
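          rem Use one job fewer than the logical core count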
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build C:\llama-build --config Release -j %NINJA_JOBS%
      - name: Package (Windows)
        if: matrix.os == 'windows'
        shell: pwsh
        run: |
          $version = "${{ needs.detect-version.outputs.version }}"
          $artifactName = "llama-server-${version}-${{ matrix.name }}"
          $stagingDir = "C:\llama-staging\$artifactName"
          New-Item -ItemType Directory -Force -Path $stagingDir | Out-Null
          New-Item -ItemType Directory -Force -Path "dist" | Out-Null
          # Copy binary and DLLs
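          # "Ninja Multi-Config" places binaries under bin\Release; a
          # single-config generator would use bin\ directly, so both
          # locations are tried.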
Copy-Item "C:\llama-build\bin\Release\llama-server.exe" "$stagingDir\" -ErrorAction SilentlyContinue
Copy-Item "C:\llama-build\bin\llama-server.exe" "$stagingDir\" -ErrorAction SilentlyContinue
Get-ChildItem "C:\llama-build" -Recurse -Include "*.dll" | Copy-Item -Destination "$stagingDir\" -ErrorAction SilentlyContinue
Compress-Archive -Path "$stagingDir" -DestinationPath "dist\${artifactName}.zip"
Write-Host "✅ Built: dist\${artifactName}.zip"
      # ── Smoke test: verify the binary runs ──
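      # The macOS x64 cross-build is excluded from the Unix smoke test: the
      # arm64 runner can't be assumed to execute a cross-compiled x86_64 binary.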
      - name: Smoke test (Unix)
        if: matrix.os != 'windows' && matrix.cross_arch == ''
        run: |
          BINARY=$(find dist/ -name '*.tar.gz' | head -1)
          if [ -z "$BINARY" ]; then
            echo "❌ No tarball found in dist/"
            exit 1
          fi
          echo "📦 Testing: $BINARY"
          mkdir -p /tmp/smoke-test
          tar -xzf "$BINARY" -C /tmp/smoke-test
          SERVER=$(find /tmp/smoke-test -name 'llama-server' -type f | head -1)
          if [ -z "$SERVER" ]; then
            echo "❌ llama-server binary not found in archive"
            exit 1
          fi
          chmod +x "$SERVER"
          echo "🔍 Version:" && "$SERVER" --version
          echo "✅ Smoke test passed"
      - name: Smoke test (Windows)
        if: matrix.os == 'windows'
        shell: pwsh
        run: |
          $zip = Get-ChildItem dist\*.zip | Select-Object -First 1
          if (-not $zip) { Write-Error "No zip found"; exit 1 }
          Write-Host "📦 Testing: $($zip.Name)"
          Expand-Archive -Path $zip.FullName -DestinationPath C:\smoke-test -Force
          $exe = Get-ChildItem C:\smoke-test -Recurse -Filter "llama-server.exe" | Select-Object -First 1
          if (-not $exe) { Write-Error "llama-server.exe not found"; exit 1 }
          Write-Host "🔍 Version:"
          & $exe.FullName --version
          if ($LASTEXITCODE -ne 0) { Write-Error "Smoke test failed"; exit 1 }
          Write-Host "✅ Smoke test passed"
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.name }}
          path: |
            dist/*.tar.gz
            dist/*.zip
          retention-days: 7
  # ── Step 3: Publish release ──────────────────────────────────────────────────
  release:
    needs: [detect-version, build]
    runs-on: ubuntu-22.04
    steps:
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts/
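      # download-artifact@v4 unpacks each artifact into artifacts/<artifact-name>/,
      # hence the ** glob in `files:` below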
      - name: List artifacts
        run: find artifacts/ -type f \( -name '*.tar.gz' -o -name '*.zip' \) | sort
      - name: Create GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: ${{ needs.detect-version.outputs.version }}
          name: "llama-server ${{ needs.detect-version.outputs.version }}"
          body: |
            Pre-built `llama-server` binaries from [llama.cpp ${{ needs.detect-version.outputs.version }}](https://github.com/ggml-org/llama.cpp/releases/tag/${{ needs.detect-version.outputs.version }}).

            ## Downloads — Linux

            | Variant | File |
            |---------|------|
            | x64 CPU | `llama-server-*-linux-x64-cpu.tar.gz` |
            | x64 CUDA 12.8 | `llama-server-*-linux-x64-cuda-12.tar.gz` |
            | x64 CUDA 13.1 | `llama-server-*-linux-x64-cuda-13.tar.gz` |
            | x64 Vulkan | `llama-server-*-linux-x64-vulkan.tar.gz` |
            | arm64 CPU | `llama-server-*-linux-arm64-cpu.tar.gz` |
            | arm64 CUDA 12.8 | `llama-server-*-linux-arm64-cuda-12.tar.gz` |
            | arm64 CUDA 13.1 | `llama-server-*-linux-arm64-cuda-13.tar.gz` |
            | arm64 Vulkan | `llama-server-*-linux-arm64-vulkan.tar.gz` |

            ## Downloads — Windows

            | Variant | File |
            |---------|------|
            | x64 CPU | `llama-server-*-windows-x64-cpu.zip` |
            | x64 CUDA 12.4 | `llama-server-*-windows-x64-cuda-12.zip` |
            | x64 CUDA 13.1 | `llama-server-*-windows-x64-cuda-13.zip` |
            | x64 Vulkan | `llama-server-*-windows-x64-vulkan.zip` |

            ## Downloads — macOS

            | Variant | File |
            |---------|------|
            | arm64 Metal | `llama-server-*-macos-arm64-metal.tar.gz` |
            | x64 CPU | `llama-server-*-macos-x64-cpu.tar.gz` |

            **CUDA SM targets:** `${{ needs.detect-version.outputs.cuda_architectures }}`
          files: artifacts/**/*
          fail_on_unmatched_files: false
          generate_release_notes: false