From 2fab52e506672fec7e2747c0aab4427ac2ad095b Mon Sep 17 00:00:00 2001 From: Christopher Maher Date: Thu, 2 Apr 2026 19:19:46 -0700 Subject: [PATCH] feat!: update default CUDA image to server-cuda13 for Qwen3.5 and Blackwell support BREAKING CHANGE: The default GPU inference image is now server-cuda13 (CUDA 13) instead of server-cuda (CUDA 12). This requires NVIDIA driver 590+ (CUDA 13.1). Users on older drivers must specify --image ghcr.io/ggml-org/llama.cpp:server-cuda to use the CUDA 12 image. This enables: - Qwen3.5 model architecture support (previously unknown architecture) - Native SM 120 (Blackwell) GPU support for RTX 50-series - Latest llama.cpp optimizations and model architecture support Updated across CLI, benchmark tool, sample manifests, and docs. Fixes #261 Signed-off-by: Christopher Maher --- config/samples/gpu-llama-3b-model.yaml | 2 +- config/samples/multi-gpu-azure-spot.yaml | 2 +- config/samples/multi-gpu-eks-spot.yaml | 2 +- config/samples/multi-gpu-gke-spot.yaml | 2 +- config/samples/multi-gpu-llama-13b-model.yaml | 2 +- config/samples/multi-gpu-llama-70b-model.yaml | 2 +- docs/air-gapped-quickstart.md | 4 +- .../inferenceservice_controller_test.go | 54 +++++++++---------- pkg/cli/benchmark.go | 2 +- pkg/cli/deploy.go | 6 ++- pkg/cli/deploy_test.go | 6 +-- 11 files changed, 43 insertions(+), 41 deletions(-) diff --git a/config/samples/gpu-llama-3b-model.yaml b/config/samples/gpu-llama-3b-model.yaml index 1ee8e89..9951f41 100644 --- a/config/samples/gpu-llama-3b-model.yaml +++ b/config/samples/gpu-llama-3b-model.yaml @@ -52,7 +52,7 @@ spec: replicas: 1 # Use llama.cpp with CUDA support - image: ghcr.io/ggml-org/llama.cpp:server-cuda + image: ghcr.io/ggml-org/llama.cpp:server-cuda13 # Endpoint configuration (OpenAI-compatible API) endpoint: diff --git a/config/samples/multi-gpu-azure-spot.yaml b/config/samples/multi-gpu-azure-spot.yaml index 0c4b19e..da2807a 100644 --- a/config/samples/multi-gpu-azure-spot.yaml +++ b/config/samples/multi-gpu-azure-spot.yaml @@ -25,7 +25,7 @@ metadata: spec: modelRef: llama-13b-multi-gpu replicas: 1 - image: ghcr.io/ggml-org/llama.cpp:server-cuda + image: ghcr.io/ggml-org/llama.cpp:server-cuda13 resources: gpu: 2 # Request 2 GPUs gpuMemory: "16Gi" diff --git a/config/samples/multi-gpu-eks-spot.yaml b/config/samples/multi-gpu-eks-spot.yaml index 8a7e3f7..f37e6bc 100644 --- a/config/samples/multi-gpu-eks-spot.yaml +++ b/config/samples/multi-gpu-eks-spot.yaml @@ -25,7 +25,7 @@ metadata: spec: modelRef: llama-13b-multi-gpu replicas: 1 - image: ghcr.io/ggml-org/llama.cpp:server-cuda + image: ghcr.io/ggml-org/llama.cpp:server-cuda13 resources: gpu: 2 # Request 2 GPUs gpuMemory: "16Gi" diff --git a/config/samples/multi-gpu-gke-spot.yaml b/config/samples/multi-gpu-gke-spot.yaml index 07bced8..cc50ed4 100644 --- a/config/samples/multi-gpu-gke-spot.yaml +++ b/config/samples/multi-gpu-gke-spot.yaml @@ -25,7 +25,7 @@ metadata: spec: modelRef: llama-13b-multi-gpu replicas: 1 - image: ghcr.io/ggml-org/llama.cpp:server-cuda + image: ghcr.io/ggml-org/llama.cpp:server-cuda13 resources: gpu: 2 # Request 2 GPUs gpuMemory: "16Gi" diff --git a/config/samples/multi-gpu-llama-13b-model.yaml b/config/samples/multi-gpu-llama-13b-model.yaml index 15f3294..50ff28d 100644 --- a/config/samples/multi-gpu-llama-13b-model.yaml +++ b/config/samples/multi-gpu-llama-13b-model.yaml @@ -49,7 +49,7 @@ spec: replicas: 1 # Use CUDA-enabled llama.cpp image - image: ghcr.io/ggml-org/llama.cpp:server-cuda + image: ghcr.io/ggml-org/llama.cpp:server-cuda13 resources: gpu: 2 # Request 2 GPUs per pod diff --git a/config/samples/multi-gpu-llama-70b-model.yaml b/config/samples/multi-gpu-llama-70b-model.yaml index 6f48085..21e3e21 100644 --- a/config/samples/multi-gpu-llama-70b-model.yaml +++ b/config/samples/multi-gpu-llama-70b-model.yaml @@ -41,7 +41,7 @@ spec: replicas: 1 # Use CUDA-enabled llama.cpp image - image: ghcr.io/ggml-org/llama.cpp:server-cuda + image: ghcr.io/ggml-org/llama.cpp:server-cuda13 resources: gpu: 4 # Request 4 GPUs per pod diff --git a/docs/air-gapped-quickstart.md b/docs/air-gapped-quickstart.md index 3ec55fd..9e1dcd3 100644 --- a/docs/air-gapped-quickstart.md +++ b/docs/air-gapped-quickstart.md @@ -181,11 +181,11 @@ python3 -m http.server 8080 ```bash # Pull images docker pull ghcr.io/defilantech/llmkube:v0.4.9 -docker pull ghcr.io/ggml-org/llama.cpp:server-cuda +docker pull ghcr.io/ggml-org/llama.cpp:server-cuda13 # Save to tar files docker save ghcr.io/defilantech/llmkube:v0.4.9 > llmkube-controller.tar -docker save ghcr.io/ggml-org/llama.cpp:server-cuda > llama-server-cuda.tar +docker save ghcr.io/ggml-org/llama.cpp:server-cuda13 > llama-server-cuda.tar ``` 2. Transfer tar files to the air-gapped environment diff --git a/internal/controller/inferenceservice_controller_test.go b/internal/controller/inferenceservice_controller_test.go index 8109c58..a2b2cfb 100644 --- a/internal/controller/inferenceservice_controller_test.go +++ b/internal/controller/inferenceservice_controller_test.go @@ -227,7 +227,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "multi-gpu-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 2, }, @@ -296,7 +296,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "quad-gpu-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", }, } @@ -347,7 +347,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "single-gpu-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, }, @@ -453,7 +453,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "model-gpu-precedence", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 2, // InferenceService says 2 GPUs }, @@ -565,7 +565,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "toleration-test-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", }, } @@ -621,7 +621,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "nodeselector-test-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", NodeSelector: map[string]string{ "cloud.google.com/gke-nodepool": "gpu-pool", "nvidia.com/gpu.product": "NVIDIA-L4", @@ -690,7 +690,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "context-size-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", ContextSize: &contextSize, }, } @@ -714,7 +714,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "context-size-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", ContextSize: &contextSize, }, } @@ -737,7 +737,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "context-size-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", // ContextSize not specified }, } @@ -760,7 +760,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "context-size-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", ContextSize: &contextSize, }, } @@ -783,7 +783,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "context-size-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", ContextSize: &contextSize, Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, @@ -849,7 +849,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "parallel-slots-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", ParallelSlots: ¶llelSlots, }, } @@ -871,7 +871,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "parallel-slots-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", }, } @@ -892,7 +892,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "parallel-slots-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", ParallelSlots: ¶llelSlots, }, } @@ -949,7 +949,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "flash-attn-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", FlashAttention: &flashAttn, Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, @@ -973,7 +973,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "flash-attn-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, }, @@ -997,7 +997,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "flash-attn-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", FlashAttention: &flashAttn, Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, @@ -1093,7 +1093,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "jinja-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", Jinja: &jinja, Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, @@ -1117,7 +1117,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "jinja-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, }, @@ -1141,7 +1141,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "jinja-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", Jinja: &jinja, Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, @@ -1200,7 +1200,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "cache-type-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", CacheTypeK: "q4_0", Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, @@ -1224,7 +1224,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "cache-type-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", CacheTypeV: "q8_0", Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, @@ -1248,7 +1248,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "cache-type-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", CacheTypeK: "q4_0", CacheTypeV: "q8_0", Resources: &inferencev1alpha1.InferenceResourceRequirements{ @@ -1274,7 +1274,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "cache-type-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, }, @@ -1333,7 +1333,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "extra-args-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", ExtraArgs: []string{"--seed", "42", "--batch-size", "2048"}, Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, @@ -1358,7 +1358,7 @@ var _ = Describe("Context Size Configuration", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: "extra-args-model", Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 1, }, @@ -1438,7 +1438,7 @@ var _ = Describe("Multi-GPU End-to-End Reconciliation", func() { Spec: inferencev1alpha1.InferenceServiceSpec{ ModelRef: multiGPUModelName, Replicas: &replicas, - Image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", Resources: &inferencev1alpha1.InferenceResourceRequirements{ GPU: 2, GPUMemory: "16Gi", diff --git a/pkg/cli/benchmark.go b/pkg/cli/benchmark.go index 30d3e25..2e0c581 100644 --- a/pkg/cli/benchmark.go +++ b/pkg/cli/benchmark.go @@ -284,7 +284,7 @@ const ( const ( imageLlamaCppServer = "ghcr.io/ggml-org/llama.cpp:server" - imageLlamaCppServerCUDA = "ghcr.io/ggml-org/llama.cpp:server-cuda" + imageLlamaCppServerCUDA = "ghcr.io/ggml-org/llama.cpp:server-cuda13" imageLlamaCppServerROCm = "ghcr.io/ggml-org/llama.cpp:server-rocm" ) diff --git a/pkg/cli/deploy.go b/pkg/cli/deploy.go index 62c1f71..d457dd6 100644 --- a/pkg/cli/deploy.go +++ b/pkg/cli/deploy.go @@ -181,7 +181,9 @@ Examples: cmd.Flags().StringVar(&opts.cpu, "cpu", "2", "CPU request (e.g., '2' or '2000m')") cmd.Flags().StringVar(&opts.memory, "memory", "4Gi", "Memory request (e.g., '4Gi')") - cmd.Flags().StringVar(&opts.image, "image", "", "Custom llama.cpp server image (auto-detected based on --gpu)") + cmd.Flags().StringVar(&opts.image, "image", "", + "Custom llama.cpp server image. Default: server-cuda13 for GPU, server for CPU.\n"+ + "Use this to override with an older image (e.g., ghcr.io/ggml-org/llama.cpp:server-cuda for CUDA 12).") cmd.Flags().BoolVarP(&opts.wait, "wait", "w", true, "Wait for deployment to be ready") cmd.Flags().DurationVar(&opts.timeout, "timeout", 10*time.Minute, "Timeout for waiting") @@ -524,7 +526,7 @@ func resolveAcceleratorAndImage(opts *deployOptions) { fmt.Printf("ℹ️ Ensure Metal agent is installed: make install-metal-agent\n") } else { if opts.image == "" { - opts.image = "ghcr.io/ggml-org/llama.cpp:server-cuda" + opts.image = "ghcr.io/ggml-org/llama.cpp:server-cuda13" fmt.Printf("ℹ️ Auto-detected image: %s\n", opts.image) } } diff --git a/pkg/cli/deploy_test.go b/pkg/cli/deploy_test.go index 2d42426..2d6d1f4 100644 --- a/pkg/cli/deploy_test.go +++ b/pkg/cli/deploy_test.go @@ -54,7 +54,7 @@ func TestBuildInferenceService(t *testing.T) { name: "gpu-model", namespace: "production", replicas: 2, - image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", cpu: "4", memory: "8Gi", gpu: true, @@ -70,7 +70,7 @@ func TestBuildInferenceService(t *testing.T) { name: "gpu-model", namespace: testDefaultNamespace, replicas: 1, - image: "ghcr.io/ggml-org/llama.cpp:server-cuda", + image: "ghcr.io/ggml-org/llama.cpp:server-cuda13", cpu: "2", memory: "4Gi", gpu: true, @@ -585,7 +585,7 @@ func TestResolveAcceleratorAndImage(t *testing.T) { }, wantAccel: "cuda", wantVendor: defaultGPUVendor, - wantImage: "ghcr.io/ggml-org/llama.cpp:server-cuda", + wantImage: "ghcr.io/ggml-org/llama.cpp:server-cuda13", }, { name: "metal with explicit amd vendor is preserved",