Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config/samples/gpu-llama-3b-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ spec:
replicas: 1

# Use llama.cpp with CUDA support
image: ghcr.io/ggml-org/llama.cpp:server-cuda
image: ghcr.io/ggml-org/llama.cpp:server-cuda13

# Endpoint configuration (OpenAI-compatible API)
endpoint:
Expand Down
2 changes: 1 addition & 1 deletion config/samples/multi-gpu-azure-spot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ metadata:
spec:
modelRef: llama-13b-multi-gpu
replicas: 1
image: ghcr.io/ggml-org/llama.cpp:server-cuda
image: ghcr.io/ggml-org/llama.cpp:server-cuda13
resources:
gpu: 2 # Request 2 GPUs
gpuMemory: "16Gi"
Expand Down
2 changes: 1 addition & 1 deletion config/samples/multi-gpu-eks-spot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ metadata:
spec:
modelRef: llama-13b-multi-gpu
replicas: 1
image: ghcr.io/ggml-org/llama.cpp:server-cuda
image: ghcr.io/ggml-org/llama.cpp:server-cuda13
resources:
gpu: 2 # Request 2 GPUs
gpuMemory: "16Gi"
Expand Down
2 changes: 1 addition & 1 deletion config/samples/multi-gpu-gke-spot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ metadata:
spec:
modelRef: llama-13b-multi-gpu
replicas: 1
image: ghcr.io/ggml-org/llama.cpp:server-cuda
image: ghcr.io/ggml-org/llama.cpp:server-cuda13
resources:
gpu: 2 # Request 2 GPUs
gpuMemory: "16Gi"
Expand Down
2 changes: 1 addition & 1 deletion config/samples/multi-gpu-llama-13b-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ spec:
replicas: 1

# Use CUDA-enabled llama.cpp image
image: ghcr.io/ggml-org/llama.cpp:server-cuda
image: ghcr.io/ggml-org/llama.cpp:server-cuda13

resources:
gpu: 2 # Request 2 GPUs per pod
Expand Down
2 changes: 1 addition & 1 deletion config/samples/multi-gpu-llama-70b-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ spec:
replicas: 1

# Use CUDA-enabled llama.cpp image
image: ghcr.io/ggml-org/llama.cpp:server-cuda
image: ghcr.io/ggml-org/llama.cpp:server-cuda13

resources:
gpu: 4 # Request 4 GPUs per pod
Expand Down
4 changes: 2 additions & 2 deletions docs/air-gapped-quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,11 @@ python3 -m http.server 8080
```bash
# Pull images
docker pull ghcr.io/defilantech/llmkube:v0.4.9
docker pull ghcr.io/ggml-org/llama.cpp:server-cuda
docker pull ghcr.io/ggml-org/llama.cpp:server-cuda13

# Save to tar files
docker save ghcr.io/defilantech/llmkube:v0.4.9 > llmkube-controller.tar
docker save ghcr.io/ggml-org/llama.cpp:server-cuda > llama-server-cuda.tar
docker save ghcr.io/ggml-org/llama.cpp:server-cuda13 > llama-server-cuda.tar
```

2. Transfer tar files to the air-gapped environment
Expand Down
54 changes: 27 additions & 27 deletions internal/controller/inferenceservice_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "multi-gpu-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 2,
},
Expand Down Expand Up @@ -296,7 +296,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "quad-gpu-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
},
}

Expand Down Expand Up @@ -347,7 +347,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "single-gpu-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
},
Expand Down Expand Up @@ -453,7 +453,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "model-gpu-precedence",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 2, // InferenceService says 2 GPUs
},
Expand Down Expand Up @@ -565,7 +565,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "toleration-test-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
},
}

Expand Down Expand Up @@ -621,7 +621,7 @@ var _ = Describe("Multi-GPU Deployment Construction", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "nodeselector-test-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
NodeSelector: map[string]string{
"cloud.google.com/gke-nodepool": "gpu-pool",
"nvidia.com/gpu.product": "NVIDIA-L4",
Expand Down Expand Up @@ -690,7 +690,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "context-size-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
ContextSize: &contextSize,
},
}
Expand All @@ -714,7 +714,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "context-size-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
ContextSize: &contextSize,
},
}
Expand All @@ -737,7 +737,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "context-size-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
// ContextSize not specified
},
}
Expand All @@ -760,7 +760,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "context-size-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
ContextSize: &contextSize,
},
}
Expand All @@ -783,7 +783,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "context-size-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
ContextSize: &contextSize,
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
Expand Down Expand Up @@ -849,7 +849,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "parallel-slots-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
ParallelSlots: &parallelSlots,
},
}
Expand All @@ -871,7 +871,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "parallel-slots-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
},
}

Expand All @@ -892,7 +892,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "parallel-slots-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
ParallelSlots: &parallelSlots,
},
}
Expand Down Expand Up @@ -949,7 +949,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "flash-attn-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
FlashAttention: &flashAttn,
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
Expand All @@ -973,7 +973,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "flash-attn-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
},
Expand All @@ -997,7 +997,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "flash-attn-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
FlashAttention: &flashAttn,
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
Expand Down Expand Up @@ -1093,7 +1093,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "jinja-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
Jinja: &jinja,
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
Expand All @@ -1117,7 +1117,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "jinja-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
},
Expand All @@ -1141,7 +1141,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "jinja-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
Jinja: &jinja,
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
Expand Down Expand Up @@ -1200,7 +1200,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "cache-type-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
CacheTypeK: "q4_0",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
Expand All @@ -1224,7 +1224,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "cache-type-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
CacheTypeV: "q8_0",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
Expand All @@ -1248,7 +1248,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "cache-type-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
CacheTypeK: "q4_0",
CacheTypeV: "q8_0",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
Expand All @@ -1274,7 +1274,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "cache-type-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
},
Expand Down Expand Up @@ -1333,7 +1333,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "extra-args-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
ExtraArgs: []string{"--seed", "42", "--batch-size", "2048"},
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
Expand All @@ -1358,7 +1358,7 @@ var _ = Describe("Context Size Configuration", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: "extra-args-model",
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 1,
},
Expand Down Expand Up @@ -1438,7 +1438,7 @@ var _ = Describe("Multi-GPU End-to-End Reconciliation", func() {
Spec: inferencev1alpha1.InferenceServiceSpec{
ModelRef: multiGPUModelName,
Replicas: &replicas,
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
Image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
Resources: &inferencev1alpha1.InferenceResourceRequirements{
GPU: 2,
GPUMemory: "16Gi",
Expand Down
2 changes: 1 addition & 1 deletion pkg/cli/benchmark.go
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ const (

const (
imageLlamaCppServer = "ghcr.io/ggml-org/llama.cpp:server"
imageLlamaCppServerCUDA = "ghcr.io/ggml-org/llama.cpp:server-cuda"
imageLlamaCppServerCUDA = "ghcr.io/ggml-org/llama.cpp:server-cuda13"
imageLlamaCppServerROCm = "ghcr.io/ggml-org/llama.cpp:server-rocm"
)

Expand Down
6 changes: 4 additions & 2 deletions pkg/cli/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,9 @@ Examples:

cmd.Flags().StringVar(&opts.cpu, "cpu", "2", "CPU request (e.g., '2' or '2000m')")
cmd.Flags().StringVar(&opts.memory, "memory", "4Gi", "Memory request (e.g., '4Gi')")
cmd.Flags().StringVar(&opts.image, "image", "", "Custom llama.cpp server image (auto-detected based on --gpu)")
cmd.Flags().StringVar(&opts.image, "image", "",
"Custom llama.cpp server image. Default: server-cuda13 for GPU, server for CPU.\n"+
"Use this to override with an older image (e.g., ghcr.io/ggml-org/llama.cpp:server-cuda for CUDA 12).")

cmd.Flags().BoolVarP(&opts.wait, "wait", "w", true, "Wait for deployment to be ready")
cmd.Flags().DurationVar(&opts.timeout, "timeout", 10*time.Minute, "Timeout for waiting")
Expand Down Expand Up @@ -524,7 +526,7 @@ func resolveAcceleratorAndImage(opts *deployOptions) {
fmt.Printf("ℹ️ Ensure Metal agent is installed: make install-metal-agent\n")
} else {
if opts.image == "" {
opts.image = "ghcr.io/ggml-org/llama.cpp:server-cuda"
opts.image = "ghcr.io/ggml-org/llama.cpp:server-cuda13"
fmt.Printf("ℹ️ Auto-detected image: %s\n", opts.image)
}
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/cli/deploy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func TestBuildInferenceService(t *testing.T) {
name: "gpu-model",
namespace: "production",
replicas: 2,
image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
cpu: "4",
memory: "8Gi",
gpu: true,
Expand All @@ -70,7 +70,7 @@ func TestBuildInferenceService(t *testing.T) {
name: "gpu-model",
namespace: testDefaultNamespace,
replicas: 1,
image: "ghcr.io/ggml-org/llama.cpp:server-cuda",
image: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
cpu: "2",
memory: "4Gi",
gpu: true,
Expand Down Expand Up @@ -585,7 +585,7 @@ func TestResolveAcceleratorAndImage(t *testing.T) {
},
wantAccel: "cuda",
wantVendor: defaultGPUVendor,
wantImage: "ghcr.io/ggml-org/llama.cpp:server-cuda",
wantImage: "ghcr.io/ggml-org/llama.cpp:server-cuda13",
},
{
name: "metal with explicit amd vendor is preserved",
Expand Down