From 2fab52e506672fec7e2747c0aab4427ac2ad095b Mon Sep 17 00:00:00 2001
From: Christopher Maher <chris@mahercode.io>
Date: Thu, 2 Apr 2026 19:19:46 -0700
Subject: [PATCH] feat!: update default CUDA image to server-cuda13 for Qwen3.5
 and Blackwell support

BREAKING CHANGE: The default GPU inference image is now server-cuda13
(CUDA 13) instead of server-cuda (CUDA 12). The new image requires
NVIDIA driver 590+ (CUDA 13.1). Users on older drivers must pass
--image ghcr.io/ggml-org/llama.cpp:server-cuda to keep using the
CUDA 12 image.

This enables:
- Qwen3.5 model architecture support (previously rejected as an unknown
  architecture)
- Native SM 120 (Blackwell) GPU support for RTX 50-series
- Latest llama.cpp optimizations and model architecture support

Updated the image reference across the CLI, benchmark tool, sample
manifests, docs, and tests.

Fixes #261

Signed-off-by: Christopher Maher <chris@mahercode.io>
---
 config/samples/gpu-llama-3b-model.yaml        |  2 +-
 config/samples/multi-gpu-azure-spot.yaml      |  2 +-
 config/samples/multi-gpu-eks-spot.yaml        |  2 +-
 config/samples/multi-gpu-gke-spot.yaml        |  2 +-
 config/samples/multi-gpu-llama-13b-model.yaml |  2 +-
 config/samples/multi-gpu-llama-70b-model.yaml |  2 +-
 docs/air-gapped-quickstart.md                 |  4 +-
 .../inferenceservice_controller_test.go       | 54 +++++++++----------
 pkg/cli/benchmark.go                          |  2 +-
 pkg/cli/deploy.go                             |  6 ++-
 pkg/cli/deploy_test.go                        |  6 +--
 11 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/config/samples/gpu-llama-3b-model.yaml b/config/samples/gpu-llama-3b-model.yaml
index 1ee8e89..9951f41 100644
--- a/config/samples/gpu-llama-3b-model.yaml
+++ b/config/samples/gpu-llama-3b-model.yaml
@@ -52,7 +52,7 @@ spec:
   replicas: 1
 
   # Use llama.cpp with CUDA support
-  image: ghcr.io/ggml-org/llama.cpp:server-cuda
+  image: ghcr.io/ggml-org/llama.cpp:server-cuda13
 
   # Endpoint configuration (OpenAI-compatible API)
   endpoint:
diff --git a/config/samples/multi-gpu-azure-spot.yaml b/config/samples/multi-gpu-azure-spot.yaml
index 0c4b19e..da2807a 100644
--- a/config/samples/multi-gpu-azure-spot.yaml
+++ b/config/samples/multi-gpu-azure-spot.yaml
@@ -25,7 +25,7 @@ metadata:
 spec:
   modelRef: llama-13b-multi-gpu
   replicas: 1
-  image: ghcr.io/ggml-org/llama.cpp:server-cuda
+  image: ghcr.io/ggml-org/llama.cpp:server-cuda13
   resources:
     gpu: 2              # Request 2 GPUs
     gpuMemory: "16Gi"
diff --git a/config/samples/multi-gpu-eks-spot.yaml b/config/samples/multi-gpu-eks-spot.yaml
index 8a7e3f7..f37e6bc 100644
--- a/config/samples/multi-gpu-eks-spot.yaml
+++ b/config/samples/multi-gpu-eks-spot.yaml
@@ -25,7 +25,7 @@ metadata:
 spec:
   modelRef: llama-13b-multi-gpu
   replicas: 1
-  image: ghcr.io/ggml-org/llama.cpp:server-cuda
+  image: ghcr.io/ggml-org/llama.cpp:server-cuda13
   resources:
     gpu: 2              # Request 2 GPUs
     gpuMemory: "16Gi"
diff --git a/config/samples/multi-gpu-gke-spot.yaml b/config/samples/multi-gpu-gke-spot.yaml
index 07bced8..cc50ed4 100644
--- a/config/samples/multi-gpu-gke-spot.yaml
+++ b/config/samples/multi-gpu-gke-spot.yaml
@@ -25,7 +25,7 @@ metadata:
 spec:
   modelRef: llama-13b-multi-gpu
   replicas: 1
-  image: ghcr.io/ggml-org/llama.cpp:server-cuda
+  image: ghcr.io/ggml-org/llama.cpp:server-cuda13
   resources:
     gpu: 2              # Request 2 GPUs
     gpuMemory: "16Gi"
diff --git a/config/samples/multi-gpu-llama-13b-model.yaml b/config/samples/multi-gpu-llama-13b-model.yaml
index 15f3294..50ff28d 100644
--- a/config/samples/multi-gpu-llama-13b-model.yaml
+++ b/config/samples/multi-gpu-llama-13b-model.yaml
@@ -49,7 +49,7 @@ spec:
   replicas: 1
 
   # Use CUDA-enabled llama.cpp image
-  image: ghcr.io/ggml-org/llama.cpp:server-cuda
+  image: ghcr.io/ggml-org/llama.cpp:server-cuda13
 
   resources:
     gpu: 2              # Request 2 GPUs per pod
diff --git a/config/samples/multi-gpu-llama-70b-model.yaml b/config/samples/multi-gpu-llama-70b-model.yaml
index 6f48085..21e3e21 100644
--- a/config/samples/multi-gpu-llama-70b-model.yaml
+++ b/config/samples/multi-gpu-llama-70b-model.yaml
@@ -41,7 +41,7 @@ spec:
   replicas: 1
 
   # Use CUDA-enabled llama.cpp image
-  image: ghcr.io/ggml-org/llama.cpp:server-cuda
+  image: ghcr.io/ggml-org/llama.cpp:server-cuda13
 
   resources:
     gpu: 4              # Request 4 GPUs per pod
diff --git a/docs/air-gapped-quickstart.md b/docs/air-gapped-quickstart.md
index 3ec55fd..9e1dcd3 100644
--- a/docs/air-gapped-quickstart.md
+++ b/docs/air-gapped-quickstart.md
@@ -181,11 +181,11 @@ python3 -m http.server 8080
 ```bash
 # Pull images
 docker pull ghcr.io/defilantech/llmkube:v0.4.9
-docker pull ghcr.io/ggml-org/llama.cpp:server-cuda
+docker pull ghcr.io/ggml-org/llama.cpp:server-cuda13
 
 # Save to tar files
 docker save ghcr.io/defilantech/llmkube:v0.4.9 > llmkube-controller.tar
-docker save ghcr.io/ggml-org/llama.cpp:server-cuda > llama-server-cuda.tar
+docker save ghcr.io/ggml-org/llama.cpp:server-cuda13 > llama-server-cuda.tar
 ```
 
 2. Transfer tar files to the air-gapped environment
diff --git a/internal/controller/inferenceservice_controller_test.go b/internal/controller/inferenceservice_controller_test.go
index 8109c58..a2b2cfb 100644
--- a/internal/controller/inferenceservice_controller_test.go
+++ b/internal/controller/inferenceservice_controller_test.go
@@ -227,7 +227,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "multi-gpu-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 2,
 					},
@@ -296,7 +296,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "quad-gpu-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 				},
 			}
 
@@ -347,7 +347,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "single-gpu-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
 					},
@@ -453,7 +453,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "model-gpu-precedence",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 2, // InferenceService says 2 GPUs
 					},
@@ -565,7 +565,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "toleration-test-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 				},
 			}
 
@@ -621,7 +621,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "nodeselector-test-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					NodeSelector: map[string]string{
 						"cloud.google.com/gke-nodepool": "gpu-pool",
 						"nvidia.com/gpu.product":        "NVIDIA-L4",
@@ -690,7 +690,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:    "context-size-model",
 					Replicas:    &replicas,
-					Image:       "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:       "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					ContextSize: &contextSize,
 				},
 			}
@@ -714,7 +714,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:    "context-size-model",
 					Replicas:    &replicas,
-					Image:       "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:       "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					ContextSize: &contextSize,
 				},
 			}
@@ -737,7 +737,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "context-size-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					// ContextSize not specified
 				},
 			}
@@ -760,7 +760,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:    "context-size-model",
 					Replicas:    &replicas,
-					Image:       "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:       "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					ContextSize: &contextSize,
 				},
 			}
@@ -783,7 +783,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:    "context-size-model",
 					Replicas:    &replicas,
-					Image:       "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:       "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					ContextSize: &contextSize,
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
@@ -849,7 +849,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:      "parallel-slots-model",
 					Replicas:      &replicas,
-					Image:         "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:         "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					ParallelSlots: &parallelSlots,
 				},
 			}
@@ -871,7 +871,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "parallel-slots-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 				},
 			}
 
@@ -892,7 +892,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:      "parallel-slots-model",
 					Replicas:      &replicas,
-					Image:         "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:         "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					ParallelSlots: &parallelSlots,
 				},
 			}
@@ -949,7 +949,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:       "flash-attn-model",
 					Replicas:       &replicas,
-					Image:          "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:          "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					FlashAttention: &flashAttn,
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
@@ -973,7 +973,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "flash-attn-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
 					},
@@ -997,7 +997,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:       "flash-attn-model",
 					Replicas:       &replicas,
-					Image:          "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:          "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					FlashAttention: &flashAttn,
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
@@ -1093,7 +1093,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "jinja-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					Jinja:    &jinja,
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
@@ -1117,7 +1117,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "jinja-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
 					},
@@ -1141,7 +1141,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "jinja-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					Jinja:    &jinja,
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
@@ -1200,7 +1200,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:   "cache-type-model",
 					Replicas:   &replicas,
-					Image:      "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:      "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					CacheTypeK: "q4_0",
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
@@ -1224,7 +1224,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:   "cache-type-model",
 					Replicas:   &replicas,
-					Image:      "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:      "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					CacheTypeV: "q8_0",
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
@@ -1248,7 +1248,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:   "cache-type-model",
 					Replicas:   &replicas,
-					Image:      "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:      "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					CacheTypeK: "q4_0",
 					CacheTypeV: "q8_0",
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
@@ -1274,7 +1274,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "cache-type-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
 					},
@@ -1333,7 +1333,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef:  "extra-args-model",
 					Replicas:  &replicas,
-					Image:     "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:     "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					ExtraArgs: []string{"--seed", "42", "--batch-size", "2048"},
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
@@ -1358,7 +1358,7 @@ var _ = Describe("Context Size Configuration", func() {
 				Spec: inferencev1alpha1.InferenceServiceSpec{
 					ModelRef: "extra-args-model",
 					Replicas: &replicas,
-					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+					Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 					Resources: &inferencev1alpha1.InferenceResourceRequirements{
 						GPU: 1,
 					},
@@ -1438,7 +1438,7 @@ var _ = Describe("Multi-GPU End-to-End Reconciliation", func() {
 					Spec: inferencev1alpha1.InferenceServiceSpec{
 						ModelRef: multiGPUModelName,
 						Replicas: &replicas,
-						Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda",
+						Image:    "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 						Resources: &inferencev1alpha1.InferenceResourceRequirements{
 							GPU:       2,
 							GPUMemory: "16Gi",
diff --git a/pkg/cli/benchmark.go b/pkg/cli/benchmark.go
index 30d3e25..2e0c581 100644
--- a/pkg/cli/benchmark.go
+++ b/pkg/cli/benchmark.go
@@ -284,7 +284,7 @@ const (
 
 const (
 	imageLlamaCppServer     = "ghcr.io/ggml-org/llama.cpp:server"
-	imageLlamaCppServerCUDA = "ghcr.io/ggml-org/llama.cpp:server-cuda"
+	imageLlamaCppServerCUDA = "ghcr.io/ggml-org/llama.cpp:server-cuda13"
 	imageLlamaCppServerROCm = "ghcr.io/ggml-org/llama.cpp:server-rocm"
 )
 
diff --git a/pkg/cli/deploy.go b/pkg/cli/deploy.go
index 62c1f71..d457dd6 100644
--- a/pkg/cli/deploy.go
+++ b/pkg/cli/deploy.go
@@ -181,7 +181,9 @@ Examples:
 
 	cmd.Flags().StringVar(&opts.cpu, "cpu", "2", "CPU request (e.g., '2' or '2000m')")
 	cmd.Flags().StringVar(&opts.memory, "memory", "4Gi", "Memory request (e.g., '4Gi')")
-	cmd.Flags().StringVar(&opts.image, "image", "", "Custom llama.cpp server image (auto-detected based on --gpu)")
+	cmd.Flags().StringVar(&opts.image, "image", "",
+		"Custom llama.cpp server image. Default: server-cuda13 for GPU, server for CPU.\n"+
+			"Use this to override with an older image (e.g., ghcr.io/ggml-org/llama.cpp:server-cuda for CUDA 12).")
 
 	cmd.Flags().BoolVarP(&opts.wait, "wait", "w", true, "Wait for deployment to be ready")
 	cmd.Flags().DurationVar(&opts.timeout, "timeout", 10*time.Minute, "Timeout for waiting")
@@ -524,7 +526,7 @@ func resolveAcceleratorAndImage(opts *deployOptions) {
 			fmt.Printf("ℹ️  Ensure Metal agent is installed: make install-metal-agent\n")
 		} else {
 			if opts.image == "" {
-				opts.image = "ghcr.io/ggml-org/llama.cpp:server-cuda"
+				opts.image = "ghcr.io/ggml-org/llama.cpp:server-cuda13"
 				fmt.Printf("ℹ️  Auto-detected image: %s\n", opts.image)
 			}
 		}
diff --git a/pkg/cli/deploy_test.go b/pkg/cli/deploy_test.go
index 2d42426..2d6d1f4 100644
--- a/pkg/cli/deploy_test.go
+++ b/pkg/cli/deploy_test.go
@@ -54,7 +54,7 @@ func TestBuildInferenceService(t *testing.T) {
 				name:      "gpu-model",
 				namespace: "production",
 				replicas:  2,
-				image:     "ghcr.io/ggml-org/llama.cpp:server-cuda",
+				image:     "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 				cpu:       "4",
 				memory:    "8Gi",
 				gpu:       true,
@@ -70,7 +70,7 @@ func TestBuildInferenceService(t *testing.T) {
 				name:      "gpu-model",
 				namespace: testDefaultNamespace,
 				replicas:  1,
-				image:     "ghcr.io/ggml-org/llama.cpp:server-cuda",
+				image:     "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 				cpu:       "2",
 				memory:    "4Gi",
 				gpu:       true,
@@ -585,7 +585,7 @@ func TestResolveAcceleratorAndImage(t *testing.T) {
 			},
 			wantAccel:  "cuda",
 			wantVendor: defaultGPUVendor,
-			wantImage:  "ghcr.io/ggml-org/llama.cpp:server-cuda",
+			wantImage:  "ghcr.io/ggml-org/llama.cpp:server-cuda13",
 		},
 		{
 			name: "metal with explicit amd vendor is preserved",