From ad4f275622d4e8e0f86041a00e5bc5811de695dc Mon Sep 17 00:00:00 2001 From: abetlen Date: Mon, 1 Jun 2026 02:41:56 -0700 Subject: [PATCH 1/6] feat(ci): add CUDA 11.8 wheel builds --- .github/workflows/build-wheels-cuda.yaml | 2 +- CHANGELOG.md | 1 + README.md | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 87bd365957..066a229960 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -24,7 +24,7 @@ jobs: # wheel.py-api = "py3" makes the CUDA wheel interpreter-agnostic, # so one builder per toolkit version is sufficient. 'pyver' = @("3.9") - 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1") + 'cuda' = @("11.8.0", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1") 'releasetag' = @("basic") 'exclude' = @( @{ 'os' = 'windows-2022'; 'cuda' = '12.1.1' }, diff --git a/CHANGELOG.md b/CHANGELOG.md index 9144353983..e2fb8e951c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat(ci): add CUDA 11.8 wheel builds by @abetlen in #2238 - fix(ci): add Pascal compute capability targets to CUDA wheel builds by @abetlen in #2237 ## [0.3.24] diff --git a/README.md b/README.md index 7e02b174d4..c6a7b3ca45 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python It is also possible to install a pre-built wheel with CUDA support. 
As long as your system meets some requirements: -- CUDA Version is 12.1, 12.2, 12.3, 12.4 or 12.5 +- CUDA Version is 11.8, 12.1, 12.2, 12.3, 12.4 or 12.5 - NVIDIA GPU compute capability is 6.0 or newer - Python Version is 3.10, 3.11 or 3.12 @@ -135,6 +135,7 @@ pip install llama-cpp-python \ ``` Where `<cuda-version>` is one of the following: +- `cu118`: CUDA 11.8 - `cu121`: CUDA 12.1 - `cu122`: CUDA 12.2 - `cu123`: CUDA 12.3 From d74eb9633bd414b36bf089fe6f439ac426c68441 Mon Sep 17 00:00:00 2001 From: abetlen Date: Mon, 1 Jun 2026 02:58:53 -0700 Subject: [PATCH 2/6] fix(ci): make CUDA 11.8 wheel builds version-consistent --- .github/workflows/build-wheels-cuda.yaml | 36 ++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 066a229960..5ec31e7a89 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -20,16 +20,22 @@ jobs: id: set-matrix run: | $matrix = @{ - 'os' = @('ubuntu-22.04', 'windows-2022') + 'os' = @('ubuntu-22.04', 'windows-2022', 'windows-2019') # wheel.py-api = "py3" makes the CUDA wheel interpreter-agnostic, # so one builder per toolkit version is sufficient. 
'pyver' = @("3.9") 'cuda' = @("11.8.0", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1") 'releasetag' = @("basic") 'exclude' = @( + @{ 'os' = 'windows-2022'; 'cuda' = '11.8.0' }, @{ 'os' = 'windows-2022'; 'cuda' = '12.1.1' }, @{ 'os' = 'windows-2022'; 'cuda' = '12.2.2' }, - @{ 'os' = 'windows-2022'; 'cuda' = '12.3.2' } + @{ 'os' = 'windows-2022'; 'cuda' = '12.3.2' }, + @{ 'os' = 'windows-2019'; 'cuda' = '12.1.1' }, + @{ 'os' = 'windows-2019'; 'cuda' = '12.2.2' }, + @{ 'os' = 'windows-2019'; 'cuda' = '12.3.2' }, + @{ 'os' = 'windows-2019'; 'cuda' = '12.4.1' }, + @{ 'os' = 'windows-2019'; 'cuda' = '12.5.1' } ) } @@ -81,7 +87,31 @@ jobs: run: | $cudaVersion = $env:CUDAVER $cudaChannel = "nvidia/label/cuda-$cudaVersion" - if ($IsLinux) { + if ($cudaVersion -eq '11.8.0') { + if ($IsLinux) { + $cudaPackages = @( + "${cudaChannel}::cuda-nvcc_linux-64=11.8.0", + "${cudaChannel}::cuda-cccl=11.8.89", + "${cudaChannel}::cuda-cudart=11.8.89", + "${cudaChannel}::cuda-cudart-dev=11.8.89", + "${cudaChannel}::cuda-driver-dev=11.8.89", + "${cudaChannel}::libcublas=11.11.3.6", + "${cudaChannel}::libcublas-dev=11.11.3.6" + ) + } elseif ($IsWindows) { + $cudaPackages = @( + "${cudaChannel}::cuda-nvcc_win-64=11.8.0", + "${cudaChannel}::cuda-cccl=11.8.89", + "${cudaChannel}::cuda-cudart=11.8.89", + "${cudaChannel}::cuda-cudart-dev=11.8.89", + "${cudaChannel}::libcublas=11.11.3.6", + "${cudaChannel}::libcublas-dev=11.11.3.6" + ) + } else { + throw 'Unsupported CUDA wheel build platform' + } + mamba install -y --channel-priority strict --override-channels -c $cudaChannel $cudaPackages + } elseif ($IsLinux) { mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" } elseif ($IsWindows) { if ($cudaVersion -like '12.5.*') { From 809be58805d88cb44d2454d7c4eb021f17874e5e Mon Sep 17 00:00:00 2001 
From: abetlen Date: Mon, 1 Jun 2026 03:00:53 -0700 Subject: [PATCH 3/6] fix(ci): allow non-CUDA dependencies for CUDA 11.8 wheels --- .github/workflows/build-wheels-cuda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 5ec31e7a89..524f4497ba 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -110,7 +110,7 @@ jobs: } else { throw 'Unsupported CUDA wheel build platform' } - mamba install -y --channel-priority strict --override-channels -c $cudaChannel $cudaPackages + mamba install -y --channel-priority flexible --override-channels -c $cudaChannel $cudaPackages } elseif ($IsLinux) { mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" } elseif ($IsWindows) { From 372045fa0d5f8cfa7050bd6df785755a134581f2 Mon Sep 17 00:00:00 2001 From: abetlen Date: Mon, 1 Jun 2026 03:05:04 -0700 Subject: [PATCH 4/6] fix(ci): omit Hopper targets from CUDA 11.8 wheels --- .github/workflows/build-wheels-cuda.yaml | 7 ++++++- README.md | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 524f4497ba..5ca21b1577 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -199,10 +199,15 @@ jobs: } $cudaTagVersion = $nvccVersion.Replace('.','') $env:VERBOSE = '1' + $cudaArchs = "60-real;61-real;70-real;75-real;80-real;86-real;89-real;90-real;90-virtual" + if ([version]$nvccVersion -lt [version]"12.0") { + # CUDA 11.8 cannot compile llama.cpp's Hopper PDL device calls. 
+ $cudaArchs = "60-real;61-real;70-real;75-real;80-real;86-real;89-real" + } # Build real cubins for the supported GPUs, including Pascal, and keep # one forward-compatible PTX target instead of embedding PTX for every # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit. - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=60-real;61-real;70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler $env:CMAKE_ARGS" + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=$cudaArchs -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler $env:CMAKE_ARGS" $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' python -m build --wheel # Publish tags that reflect the actual installed toolkit version. diff --git a/README.md b/README.md index c6a7b3ca45..2b7a7d98c9 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python It is also possible to install a pre-built wheel with CUDA support. 
As long as your system meets some requirements: - CUDA Version is 11.8, 12.1, 12.2, 12.3, 12.4 or 12.5 -- NVIDIA GPU compute capability is 6.0 or newer +- NVIDIA GPU compute capability is 6.0 through 8.9 for CUDA 11.8 wheels, or 6.0 or newer for CUDA 12 wheels - Python Version is 3.10, 3.11 or 3.12 ```bash From d6eb46a62b3232b3972c394806e1c268edb27bae Mon Sep 17 00:00:00 2001 From: abetlen Date: Mon, 1 Jun 2026 03:13:35 -0700 Subject: [PATCH 5/6] fix(ci): use GCC 11 for CUDA 11.8 Linux wheels --- .github/workflows/build-wheels-cuda.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 5ca21b1577..83200ecdfc 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -152,7 +152,12 @@ jobs: $cudaRootCmake = $cudaRoot.Replace('\', '/') $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$cudaRootCmake -DCUDA_TOOLKIT_ROOT_DIR=$cudaRootCmake" if ($IsLinux) { - if (Test-Path '/usr/bin/g++-12') { + if ([version]$env:CUDAVER -lt [version]"12.0" -and (Test-Path '/usr/bin/g++-11')) { + $env:CC = '/usr/bin/gcc-11' + $env:CXX = '/usr/bin/g++-11' + $env:CUDAHOSTCXX = '/usr/bin/g++-11' + $cudaHostCompilerArg = " -DCMAKE_CUDA_HOST_COMPILER=$env:CUDAHOSTCXX" + } elseif (Test-Path '/usr/bin/g++-12') { $env:CC = '/usr/bin/gcc-12' $env:CXX = '/usr/bin/g++-12' $env:CUDAHOSTCXX = '/usr/bin/g++-12' From b7bae926058bd6e041034a54eff75339e3cabcf9 Mon Sep 17 00:00:00 2001 From: abetlen Date: Mon, 1 Jun 2026 03:29:20 -0700 Subject: [PATCH 6/6] fix(ci): use MSVC 14.29 for CUDA 11.8 Windows wheels --- .github/workflows/build-wheels-cuda.yaml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 83200ecdfc..723236ca82 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -20,22 
+20,16 @@ jobs: id: set-matrix run: | $matrix = @{ - 'os' = @('ubuntu-22.04', 'windows-2022', 'windows-2019') + 'os' = @('ubuntu-22.04', 'windows-2022') # wheel.py-api = "py3" makes the CUDA wheel interpreter-agnostic, # so one builder per toolkit version is sufficient. 'pyver' = @("3.9") 'cuda' = @("11.8.0", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1") 'releasetag' = @("basic") 'exclude' = @( - @{ 'os' = 'windows-2022'; 'cuda' = '11.8.0' }, @{ 'os' = 'windows-2022'; 'cuda' = '12.1.1' }, @{ 'os' = 'windows-2022'; 'cuda' = '12.2.2' }, - @{ 'os' = 'windows-2022'; 'cuda' = '12.3.2' }, - @{ 'os' = 'windows-2019'; 'cuda' = '12.1.1' }, - @{ 'os' = 'windows-2019'; 'cuda' = '12.2.2' }, - @{ 'os' = 'windows-2019'; 'cuda' = '12.3.2' }, - @{ 'os' = 'windows-2019'; 'cuda' = '12.4.1' }, - @{ 'os' = 'windows-2019'; 'cuda' = '12.5.1' } + @{ 'os' = 'windows-2022'; 'cuda' = '12.3.2' } ) } @@ -56,8 +50,15 @@ jobs: AVXVER: ${{ matrix.releasetag }} steps: + - name: Set up MSVC for CUDA 11.8 + if: runner.os == 'Windows' && matrix.cuda == '11.8.0' + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 + toolset: 14.29 + - name: Set up MSVC - if: runner.os == 'Windows' + if: runner.os == 'Windows' && matrix.cuda != '11.8.0' uses: ilammy/msvc-dev-cmd@v1 with: arch: x64