diff --git a/api/v1alpha1/inferenceservice_types.go b/api/v1alpha1/inferenceservice_types.go index 6b48f994..b448c352 100644 --- a/api/v1alpha1/inferenceservice_types.go +++ b/api/v1alpha1/inferenceservice_types.go @@ -61,7 +61,8 @@ type InferenceServiceSpec struct { // "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server. // "vllm": vLLM OpenAI-compatible server with PagedAttention. // "tgi": HuggingFace Text Generation Inference server. - // +kubebuilder:validation:Enum=llamacpp;personaplex;vllm;tgi;generic + // "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server. + // +kubebuilder:validation:Enum=llamacpp;personaplex;vllm;tgi;whisper;generic // +kubebuilder:default=llamacpp // +optional Runtime string `json:"runtime,omitempty"` @@ -338,6 +339,11 @@ type InferenceServiceSpec struct { // +optional TGIConfig *TGIConfig `json:"tgiConfig,omitempty"` + // WhisperConfig holds configuration for the whisper (speaches) runtime. + // Only used when Runtime is "whisper". + // +optional + WhisperConfig *WhisperConfig `json:"whisperConfig,omitempty"` + // ImagePullSecrets for pulling container images from private registries. // +optional ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"` @@ -386,8 +392,10 @@ type EndpointSpec struct { // +optional Port int32 `json:"port,omitempty"` - // Path is the HTTP path for the inference endpoint - // +kubebuilder:default="/v1/chat/completions" + // Path is the HTTP path for the inference endpoint. When unset, the + // effective default is the runtime's OpenAI-compatible path + // (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the + // whisper runtime), resolved when the status endpoint is constructed. 
// +optional Path string `json:"path,omitempty"` @@ -667,6 +675,45 @@ type TGIConfig struct { HFTokenSecretRef *corev1.SecretKeySelector `json:"hfTokenSecretRef,omitempty"` } +// WhisperConfig holds deploy-time server settings for the whisper (speaches) +// runtime. speaches selects the model, language, and task per request, so those +// are NOT server config; the model id clients request comes from the referenced +// Model's spec.source. +type WhisperConfig struct { + // ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE). + // When unset, falls back to a recognized Model spec.quantization, else the speaches default. + // +kubebuilder:validation:Enum=int8;int8_float16;int8_bfloat16;int8_float32;int16;float16;bfloat16;float32;default + // +optional + ComputeType string `json:"computeType,omitempty"` + + // InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE). + // When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda, + // cpu/metal -> cpu), defaulting to auto. + // +kubebuilder:validation:Enum=auto;cuda;cpu + // +optional + InferenceDevice string `json:"inferenceDevice,omitempty"` + + // ModelTTLSeconds is how long an idle model stays loaded before being unloaded + // (speaches WHISPER__TTL). -1 keeps models loaded indefinitely. + // +kubebuilder:validation:Minimum=-1 + // +optional + ModelTTLSeconds *int32 `json:"modelTTLSeconds,omitempty"` + + // EnableUI exposes the speaches Gradio web UI. Defaults to false. + // +optional + EnableUI *bool `json:"enableUI,omitempty"` + + // HFTokenSecretRef references a Secret containing a HuggingFace token, used to + // download gated CTranslate2 models. + // +optional + HFTokenSecretRef *corev1.SecretKeySelector `json:"hfTokenSecretRef,omitempty"` + + // APIKeySecretRef references a Secret containing an API key speaches will require + // on requests (sets the speaches API_KEY). 
+ // +optional + APIKeySecretRef *corev1.SecretKeySelector `json:"apiKeySecretRef,omitempty"` +} + // InferenceServiceStatus defines the observed state of InferenceService. type InferenceServiceStatus struct { // Phase represents the current lifecycle phase of the InferenceService. diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index d3486d39..276a1b0d 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -524,6 +524,11 @@ func (in *InferenceServiceSpec) DeepCopyInto(out *InferenceServiceSpec) { *out = new(TGIConfig) (*in).DeepCopyInto(*out) } + if in.WhisperConfig != nil { + in, out := &in.WhisperConfig, &out.WhisperConfig + *out = new(WhisperConfig) + (*in).DeepCopyInto(*out) + } if in.ImagePullSecrets != nil { in, out := &in.ImagePullSecrets, &out.ImagePullSecrets *out = make([]v1.LocalObjectReference, len(*in)) @@ -1327,3 +1332,38 @@ func (in *VLLMConfig) DeepCopy() *VLLMConfig { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WhisperConfig) DeepCopyInto(out *WhisperConfig) { + *out = *in + if in.ModelTTLSeconds != nil { + in, out := &in.ModelTTLSeconds, &out.ModelTTLSeconds + *out = new(int32) + **out = **in + } + if in.EnableUI != nil { + in, out := &in.EnableUI, &out.EnableUI + *out = new(bool) + **out = **in + } + if in.HFTokenSecretRef != nil { + in, out := &in.HFTokenSecretRef, &out.HFTokenSecretRef + *out = new(v1.SecretKeySelector) + (*in).DeepCopyInto(*out) + } + if in.APIKeySecretRef != nil { + in, out := &in.APIKeySecretRef, &out.APIKeySecretRef + *out = new(v1.SecretKeySelector) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WhisperConfig. 
+func (in *WhisperConfig) DeepCopy() *WhisperConfig { + if in == nil { + return nil + } + out := new(WhisperConfig) + in.DeepCopyInto(out) + return out +} diff --git a/charts/llmkube/templates/crds/inferenceservices.yaml b/charts/llmkube/templates/crds/inferenceservices.yaml index 8a56f713..7e8a0dda 100644 --- a/charts/llmkube/templates/crds/inferenceservices.yaml +++ b/charts/llmkube/templates/crds/inferenceservices.yaml @@ -224,8 +224,11 @@ spec: description: Endpoint defines the service endpoint configuration properties: path: - default: /v1/chat/completions - description: Path is the HTTP path for the inference endpoint + description: |- + Path is the HTTP path for the inference endpoint. When unset, the + effective default is the runtime's OpenAI-compatible path + (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the + whisper runtime), resolved when the status endpoint is constructed. type: string port: default: 8080 @@ -1398,11 +1401,13 @@ spec: "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server. "vllm": vLLM OpenAI-compatible server with PagedAttention. "tgi": HuggingFace Text Generation Inference server. + "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server. enum: - llamacpp - personaplex - vllm - tgi + - whisper - generic type: string runtimeClassName: @@ -1910,6 +1915,100 @@ spec: format: int32 type: integer type: object + whisperConfig: + description: |- + WhisperConfig holds configuration for the whisper (speaches) runtime. + Only used when Runtime is "whisper". + properties: + apiKeySecretRef: + description: |- + APIKeySecretRef references a Secret containing an API key speaches will require + on requests (sets the speaches API_KEY). + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. 
+ This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + computeType: + description: |- + ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE). + When unset, falls back to a recognized Model spec.quantization, else the speaches default. + enum: + - int8 + - int8_float16 + - int8_bfloat16 + - int8_float32 + - int16 + - float16 + - bfloat16 + - float32 + - default + type: string + enableUI: + description: EnableUI exposes the speaches Gradio web UI. Defaults + to false. + type: boolean + hfTokenSecretRef: + description: |- + HFTokenSecretRef references a Secret containing a HuggingFace token, used to + download gated CTranslate2 models. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + inferenceDevice: + description: |- + InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE). + When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda, + cpu/metal -> cpu), defaulting to auto. 
+ enum: + - auto + - cuda + - cpu + type: string + modelTTLSeconds: + description: |- + ModelTTLSeconds is how long an idle model stays loaded before being unloaded + (speaches WHISPER__TTL). -1 keeps models loaded indefinitely. + format: int32 + minimum: -1 + type: integer + type: object required: - modelRef type: object diff --git a/charts/llmkube/templates/crds/modelrouters.yaml b/charts/llmkube/templates/crds/modelrouters.yaml index d2c168a6..b7a330c9 100644 --- a/charts/llmkube/templates/crds/modelrouters.yaml +++ b/charts/llmkube/templates/crds/modelrouters.yaml @@ -238,8 +238,11 @@ spec: through. Mirrors the shape used by InferenceService. properties: path: - default: /v1/chat/completions - description: Path is the HTTP path for the inference endpoint + description: |- + Path is the HTTP path for the inference endpoint. When unset, the + effective default is the runtime's OpenAI-compatible path + (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the + whisper runtime), resolved when the status endpoint is constructed. type: string port: default: 8080 diff --git a/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml b/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml index 60e12422..1825997a 100644 --- a/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml +++ b/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml @@ -220,8 +220,11 @@ spec: description: Endpoint defines the service endpoint configuration properties: path: - default: /v1/chat/completions - description: Path is the HTTP path for the inference endpoint + description: |- + Path is the HTTP path for the inference endpoint. When unset, the + effective default is the runtime's OpenAI-compatible path + (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the + whisper runtime), resolved when the status endpoint is constructed. 
type: string port: default: 8080 @@ -1394,11 +1397,13 @@ spec: "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server. "vllm": vLLM OpenAI-compatible server with PagedAttention. "tgi": HuggingFace Text Generation Inference server. + "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server. enum: - llamacpp - personaplex - vllm - tgi + - whisper - generic type: string runtimeClassName: @@ -1906,6 +1911,100 @@ spec: format: int32 type: integer type: object + whisperConfig: + description: |- + WhisperConfig holds configuration for the whisper (speaches) runtime. + Only used when Runtime is "whisper". + properties: + apiKeySecretRef: + description: |- + APIKeySecretRef references a Secret containing an API key speaches will require + on requests (sets the speaches API_KEY). + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + computeType: + description: |- + ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE). + When unset, falls back to a recognized Model spec.quantization, else the speaches default. + enum: + - int8 + - int8_float16 + - int8_bfloat16 + - int8_float32 + - int16 + - float16 + - bfloat16 + - float32 + - default + type: string + enableUI: + description: EnableUI exposes the speaches Gradio web UI. Defaults + to false. 
+ type: boolean + hfTokenSecretRef: + description: |- + HFTokenSecretRef references a Secret containing a HuggingFace token, used to + download gated CTranslate2 models. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + inferenceDevice: + description: |- + InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE). + When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda, + cpu/metal -> cpu), defaulting to auto. + enum: + - auto + - cuda + - cpu + type: string + modelTTLSeconds: + description: |- + ModelTTLSeconds is how long an idle model stays loaded before being unloaded + (speaches WHISPER__TTL). -1 keeps models loaded indefinitely. + format: int32 + minimum: -1 + type: integer + type: object required: - modelRef type: object diff --git a/config/crd/bases/inference.llmkube.dev_modelrouters.yaml b/config/crd/bases/inference.llmkube.dev_modelrouters.yaml index ca5c3945..42bb22e7 100644 --- a/config/crd/bases/inference.llmkube.dev_modelrouters.yaml +++ b/config/crd/bases/inference.llmkube.dev_modelrouters.yaml @@ -234,8 +234,11 @@ spec: through. Mirrors the shape used by InferenceService. properties: path: - default: /v1/chat/completions - description: Path is the HTTP path for the inference endpoint + description: |- + Path is the HTTP path for the inference endpoint. 
When unset, the + effective default is the runtime's OpenAI-compatible path + (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the + whisper runtime), resolved when the status endpoint is constructed. type: string port: default: 8080 diff --git a/docs/contributors/adding-a-runtime.md b/docs/contributors/adding-a-runtime.md index ff435777..58124f97 100644 --- a/docs/contributors/adding-a-runtime.md +++ b/docs/contributors/adding-a-runtime.md @@ -104,4 +104,11 @@ Add `--runtime yourengine` handling in `pkg/cli/deploy.go`. | `personaplex` | PersonaPlex/Moshi | 8998 | TCP socket | No | — | | `vllm` | vLLM | 8000 | HTTP /health | Yes | vllm:num_requests_running | | `tgi` | TGI | 80 | HTTP /health | No (HF download) | tgi:queue_size | +| `whisper` | speaches (faster-whisper) | 8000 | HTTP /health | No (HF download) | — | | `generic` | Any container | 8080 | TCP socket | No | — | + +The `whisper` runtime serves the OpenAI audio API (`/v1/audio/transcriptions`) +rather than `/v1/chat/completions`; it declares this via the optional +`EndpointPathProvider` interface, which `constructEndpoint` consults for the +default status endpoint path. It is configured entirely through env vars +(`EnvBuilder`), not CLI args. diff --git a/examples/whisper-quickstart/README.md b/examples/whisper-quickstart/README.md new file mode 100644 index 00000000..a4101627 --- /dev/null +++ b/examples/whisper-quickstart/README.md @@ -0,0 +1,54 @@ +# Whisper (speaches) audio transcription quickstart + +Deploys an OpenAI-compatible audio transcription service using the `whisper` +runtime, backed by [speaches](https://speaches.ai) (faster-whisper / CTranslate2). + +## What you get + +- A `Model` referencing a faster-whisper CTranslate2 HuggingFace repo. +- An `InferenceService` with `runtime: whisper` that serves + `POST /v1/audio/transcriptions` (and `/v1/audio/translations`) on a ClusterIP, + port 8000. 
+ +The operator manages the Deployment, Service, probes (`/health`), GPU +scheduling, and scaling. It also preloads the model into speaches via a +postStart hook (speaches does not auto-download on the first request), so the +pod reports Ready only once the model is installed and transcription will +succeed. + +## Apply + +```bash +kubectl apply -f model.yaml +kubectl apply -f inferenceservice.yaml +kubectl get inferenceservice whisper -o jsonpath='{.status.endpoint}' +# -> http://whisper.default.svc.cluster.local:8000/v1/audio/transcriptions +``` + +## Try it + +From a pod in the cluster, or via `kubectl port-forward svc/whisper 8000:8000`: + +```bash +curl -s http://localhost:8000/v1/audio/transcriptions \ + -F file=@sample.wav \ + -F model=Systran/faster-whisper-large-v3 +``` + +The response is OpenAI-compatible JSON (`{"text": "..."}`). The model id in the +`model` field must match the `Model`'s `spec.source`. + +## Notes and limitations (v1) + +- **The operator preloads the model.** A postStart hook installs it via + `POST /v1/models/{id}` once speaches is healthy; the pod becomes Ready only + after that completes. There is no persistent cache yet, so the model + is re-downloaded on each pod start. This runtime therefore requires network + access to HuggingFace and cannot yet run air-gapped (persistent cache and + air-gapped support are a tracked follow-up). +- **No Prometheus metrics.** speaches exposes none, so the cluster PodMonitor + will see 404s scraping `/metrics` for these pods. This is benign. +- **CPU-only:** drop the `gpu` resources from `inferenceservice.yaml` and set + `image: ghcr.io/speaches-ai/speaches:0.8.3-cpu`. +- **Gated models / auth:** set `whisperConfig.hfTokenSecretRef` to download gated + repos, and `whisperConfig.apiKeySecretRef` to require an API key on requests.
diff --git a/examples/whisper-quickstart/inferenceservice.yaml b/examples/whisper-quickstart/inferenceservice.yaml new file mode 100644 index 00000000..9a50e489 --- /dev/null +++ b/examples/whisper-quickstart/inferenceservice.yaml @@ -0,0 +1,28 @@ +apiVersion: inference.llmkube.dev/v1alpha1 +kind: InferenceService +metadata: + name: whisper + namespace: default +spec: + modelRef: whisper-large-v3 + runtime: whisper + replicas: 1 + # No endpoint block needed: the whisper runtime defaults the Service/endpoint + # port to 8000 (speaches) and the path to /v1/audio/transcriptions. Set + # spec.endpoint only to override (custom port, NodePort/LoadBalancer, etc.). + whisperConfig: + # CTranslate2 compute type. float16 suits most NVIDIA GPUs; use int8 or + # int8_float16 to trade a little accuracy for memory/speed. Omit to inherit + # the model's quantization (when recognized) or the speaches default. + computeType: float16 + # Optional: derived from the Model accelerator when omitted (cuda here). + # inferenceDevice: cuda + # Optional: keep idle models loaded (-1) instead of unloading after 300s. + # modelTTLSeconds: -1 + resources: + gpu: 1 + gpuMemory: "8Gi" + cpu: "2" + memory: "4Gi" + # CPU-only deployments: drop the gpu resources above and override the image: + # image: ghcr.io/speaches-ai/speaches:0.8.3-cpu diff --git a/examples/whisper-quickstart/model.yaml b/examples/whisper-quickstart/model.yaml new file mode 100644 index 00000000..28aa042c --- /dev/null +++ b/examples/whisper-quickstart/model.yaml @@ -0,0 +1,23 @@ +apiVersion: inference.llmkube.dev/v1alpha1 +kind: Model +metadata: + name: whisper-large-v3 + namespace: default +spec: + # speaches uses CTranslate2 / faster-whisper models, referenced by HuggingFace + # repo id. This is the model clients pass in the OpenAI `model` field, and it + # drives the runtime device/compute defaults. 
The operator preloads it into + # speaches via a postStart hook (speaches does not auto-download on first + # request), so the pod becomes Ready only once the model is installed. + source: Systran/faster-whisper-large-v3 + format: custom + hardware: + accelerator: cuda + gpu: + enabled: true + count: 1 + vendor: nvidia + memory: "8Gi" + resources: + cpu: "2" + memory: "4Gi" diff --git a/internal/controller/deployment_builder.go b/internal/controller/deployment_builder.go index 1001a370..5f7c0491 100644 --- a/internal/controller/deployment_builder.go +++ b/internal/controller/deployment_builder.go @@ -163,12 +163,7 @@ func (r *InferenceServiceReconciler) constructDeployment( image = isvc.Spec.Image } - port := backend.DefaultPort() - if isvc.Spec.ContainerPort != nil { - port = *isvc.Spec.ContainerPort - } else if isvc.Spec.Endpoint != nil && isvc.Spec.Endpoint.Port > 0 { - port = isvc.Spec.Endpoint.Port - } + port := resolveServicePort(isvc) skipInit := isvc.Spec.SkipModelInit != nil && *isvc.Spec.SkipModelInit @@ -222,9 +217,14 @@ func (r *InferenceServiceReconciler) constructDeployment( container.Args = args } + // Optional container lifecycle hook (e.g. whisper preloads its model via postStart). + if lp, ok := backend.(LifecycleProvider); ok { + container.Lifecycle = lp.BuildLifecycle(isvc, model, port) + } + // Add runtime-generated env vars, then user-specified env vars (user wins on conflict) if eb, ok := backend.(EnvBuilder); ok { - container.Env = append(container.Env, eb.BuildEnv(isvc)...) + container.Env = append(container.Env, eb.BuildEnv(isvc, model)...) } if len(isvc.Spec.Env) > 0 { container.Env = append(container.Env, isvc.Spec.Env...) 
diff --git a/internal/controller/inferenceservice_deployment_test.go b/internal/controller/inferenceservice_deployment_test.go index fae2f907..8c29a7da 100644 --- a/internal/controller/inferenceservice_deployment_test.go +++ b/internal/controller/inferenceservice_deployment_test.go @@ -3720,7 +3720,7 @@ var _ = Describe("RuntimeBackend interface", func() { }, }, } - env := backend.BuildEnv(isvc) + env := backend.BuildEnv(isvc, nil) Expect(env).To(HaveLen(2)) Expect(env[0].Name).To(Equal("HF_TOKEN")) Expect(env[0].ValueFrom.SecretKeyRef.Name).To(Equal("hf-token")) diff --git a/internal/controller/inferenceservice_reconcile_test.go b/internal/controller/inferenceservice_reconcile_test.go index 5659fc45..5f577d23 100644 --- a/internal/controller/inferenceservice_reconcile_test.go +++ b/internal/controller/inferenceservice_reconcile_test.go @@ -504,6 +504,90 @@ var _ = Describe("Reconcile lifecycle", func() { Expect(updated.Status.Endpoint).NotTo(BeEmpty()) }) + It("should create a speaches Deployment for the whisper runtime", func() { + modelName := "whisper-model-ready" + isvcName := "whisper-isvc" + + model := &inferencev1alpha1.Model{ + ObjectMeta: metav1.ObjectMeta{Name: modelName, Namespace: "default"}, + Spec: inferencev1alpha1.ModelSpec{ + Source: "Systran/faster-whisper-large-v3", + Hardware: &inferencev1alpha1.HardwareSpec{Accelerator: "cuda"}, + }, + } + Expect(k8sClient.Create(ctx, model)).To(Succeed()) + defer func() { _ = k8sClient.Delete(ctx, model) }() + + model.Status.Phase = PhaseReady + Expect(k8sClient.Status().Update(ctx, model)).To(Succeed()) + + replicas := int32(1) + isvc := &inferencev1alpha1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{Name: isvcName, Namespace: "default"}, + Spec: inferencev1alpha1.InferenceServiceSpec{ + ModelRef: modelName, + Runtime: "whisper", + Replicas: &replicas, + WhisperConfig: &inferencev1alpha1.WhisperConfig{ + ComputeType: "float16", + }, + }, + } + Expect(k8sClient.Create(ctx, isvc)).To(Succeed()) + defer 
func() { + _ = k8sClient.Delete(ctx, isvc) + dep := &appsv1.Deployment{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, dep); err == nil { + _ = k8sClient.Delete(ctx, dep) + } + svc := &corev1.Service{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, svc); err == nil { + _ = k8sClient.Delete(ctx, svc) + } + }() + + reconciler := &InferenceServiceReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + InitContainerImage: "docker.io/curlimages/curl:8.18.0", + } + _, err := reconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: types.NamespacedName{Name: isvcName, Namespace: "default"}, + }) + Expect(err).NotTo(HaveOccurred()) + + By("verifying the speaches container, port, env, and probes") + dep := &appsv1.Deployment{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, dep)).To(Succeed()) + Expect(dep.Spec.Template.Spec.InitContainers).To(BeEmpty(), "whisper runtime needs no model-init container") + containers := dep.Spec.Template.Spec.Containers + Expect(containers).To(HaveLen(1)) + c := containers[0] + Expect(c.Name).To(Equal("speaches")) + Expect(c.Ports[0].ContainerPort).To(Equal(int32(8000))) + Expect(c.ReadinessProbe.HTTPGet.Path).To(Equal("/health")) + By("verifying the postStart model-preload hook is wired") + Expect(c.Lifecycle).NotTo(BeNil()) + Expect(c.Lifecycle.PostStart.Exec.Command[2]).To(ContainSubstring("/v1/models/$LLMKUBE_WHISPER_MODEL")) + By("verifying the Service port matches the speaches container port (8000)") + svc := &corev1.Service{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, svc)).To(Succeed()) + Expect(svc.Spec.Ports[0].Port).To(Equal(int32(8000))) + + envNames := map[string]string{} + for _, e := range c.Env { + envNames[e.Name] = e.Value + } + Expect(envNames).To(HaveKeyWithValue("WHISPER__INFERENCE_DEVICE", "cuda")) + 
Expect(envNames).To(HaveKeyWithValue("WHISPER__COMPUTE_TYPE", "float16")) + Expect(envNames).To(HaveKey("HF_HOME")) + + By("verifying the status endpoint advertises the audio transcription path") + updated := &inferencev1alpha1.InferenceService{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, updated)).To(Succeed()) + Expect(updated.Status.Endpoint).To(HaveSuffix("/v1/audio/transcriptions")) + }) + It("should skip Deployment for Metal accelerator", func() { modelName := "metal-model" isvcName := "isvc-metal" diff --git a/internal/controller/runtime.go b/internal/controller/runtime.go index 98a9efcf..f443d2f4 100644 --- a/internal/controller/runtime.go +++ b/internal/controller/runtime.go @@ -38,8 +38,18 @@ type CommandBuilder interface { } // EnvBuilder is optionally implemented by backends that generate runtime-specific env vars. +// It receives the Model so backends can derive env values (e.g. device, model id) from the +// Model spec, consistent with BuildArgs. type EnvBuilder interface { - BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar + BuildEnv(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model) []corev1.EnvVar +} + +// EndpointPathProvider is optionally implemented by backends whose OpenAI-compatible API path +// differs from the default /v1/chat/completions (e.g. the whisper runtime serves +// /v1/audio/transcriptions). constructEndpoint consults it for the default status endpoint path +// when the user has not set spec.endpoint.path explicitly. +type EndpointPathProvider interface { + DefaultEndpointPath() string } // HPAMetricProvider is optionally implemented by backends that have a default autoscaling metric. @@ -47,6 +57,13 @@ type HPAMetricProvider interface { DefaultHPAMetric() string } +// LifecycleProvider is optionally implemented by backends that need a container +// lifecycle hook (e.g. 
the whisper runtime preloads its model via a postStart +// hook because speaches does not download models on the first request). +type LifecycleProvider interface { + BuildLifecycle(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model, port int32) *corev1.Lifecycle +} + // ServiceLinksOptOut is optionally implemented by backends that should run with // the legacy Kubernetes service-link env-var injection disabled. Returning true // sets `enableServiceLinks: false` on the Pod spec, which suppresses the @@ -68,6 +85,8 @@ func resolveBackend(isvc *inferencev1alpha1.InferenceService) RuntimeBackend { return &VLLMBackend{} case "tgi": return &TGIBackend{} + case "whisper": + return &WhisperBackend{} case "generic": return &GenericBackend{} default: @@ -87,6 +106,21 @@ func runtimeNameLabel(isvc *inferencev1alpha1.InferenceService) string { return isvc.Spec.Runtime } +// resolveServicePort returns the port the inference container listens on, +// which the Service and the advertised endpoint must match. Precedence: +// spec.containerPort, then spec.endpoint.port, then the backend's DefaultPort. +// This keeps the Service/endpoint aligned with the container for runtimes whose +// default port is not 8080 (e.g. vllm/whisper on 8000, tgi on 80). +func resolveServicePort(isvc *inferencev1alpha1.InferenceService) int32 { + port := resolveBackend(isvc).DefaultPort() + if isvc.Spec.ContainerPort != nil { + port = *isvc.Spec.ContainerPort + } else if isvc.Spec.Endpoint != nil && isvc.Spec.Endpoint.Port > 0 { + port = isvc.Spec.Endpoint.Port + } + return port +} + // resolveGPUCount determines the GPU count from Model spec or InferenceService spec. 
func resolveGPUCount(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model) int32 { if model.Spec.Hardware != nil && model.Spec.Hardware.GPU != nil && model.Spec.Hardware.GPU.Count > 0 { diff --git a/internal/controller/runtime_personaplex.go b/internal/controller/runtime_personaplex.go index 5c1d612f..7fa6f19a 100644 --- a/internal/controller/runtime_personaplex.go +++ b/internal/controller/runtime_personaplex.go @@ -86,7 +86,7 @@ func (b *PersonaPlexBackend) BuildProbes(port int32) (startup, liveness, readine // BuildEnv returns environment variables for the PersonaPlex container, // including HF_TOKEN from a Secret reference if configured. -func (b *PersonaPlexBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar { +func (b *PersonaPlexBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model) []corev1.EnvVar { var env []corev1.EnvVar cfg := isvc.Spec.PersonaPlexConfig diff --git a/internal/controller/runtime_test.go b/internal/controller/runtime_test.go index ba8ffe14..9b0b5ece 100644 --- a/internal/controller/runtime_test.go +++ b/internal/controller/runtime_test.go @@ -3,6 +3,8 @@ package controller import ( "testing" + corev1 "k8s.io/api/core/v1" + inferencev1alpha1 "github.com/defilantech/llmkube/api/v1alpha1" ) @@ -25,6 +27,32 @@ func containsArg(args []string, flag, value string) bool { return false } +// containsEnv reports whether env contains a var named name. When value is +// non-empty, it also requires the literal .Value to equal value. For +// secret-backed vars, pass value == "" and assert ValueFrom via envSecretRef. +func containsEnv(env []corev1.EnvVar, name, value string) bool { + for _, e := range env { + if e.Name != name { + continue + } + if value == "" { + return true + } + return e.Value == value + } + return false +} + +// envSecretRef returns the SecretKeySelector backing the named env var, or nil. 
+func envSecretRef(env []corev1.EnvVar, name string) *corev1.SecretKeySelector { + for _, e := range env { + if e.Name == name && e.ValueFrom != nil { + return e.ValueFrom.SecretKeyRef + } + } + return nil +} + // ptrString, ptrBool, ptrInt32 are local helpers so tests read naturally. func ptrBool(b bool) *bool { return &b } func ptrFloat64(f float64) *float64 { return &f } @@ -45,6 +73,7 @@ func TestRuntimeNameLabel(t *testing.T) { {name: "empty runtime defaults to llamacpp", runtime: "", expected: "llamacpp"}, {name: "vllm passes through", runtime: "vllm", expected: "vllm"}, {name: "tgi passes through", runtime: "tgi", expected: "tgi"}, + {name: "whisper passes through", runtime: "whisper", expected: "whisper"}, {name: "personaplex passes through", runtime: "personaplex", expected: "personaplex"}, {name: "generic passes through", runtime: "generic", expected: "generic"}, // Future runtimes (vllm-swift on metal, etc.) pass through diff --git a/internal/controller/runtime_tgi.go b/internal/controller/runtime_tgi.go index 7b9408c0..4e652f04 100644 --- a/internal/controller/runtime_tgi.go +++ b/internal/controller/runtime_tgi.go @@ -92,7 +92,7 @@ func (b *TGIBackend) BuildProbes(port int32) (*corev1.Probe, *corev1.Probe, *cor return startup, liveness, readiness } -func (b *TGIBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar { +func (b *TGIBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model) []corev1.EnvVar { cfg := isvc.Spec.TGIConfig if cfg != nil && cfg.HFTokenSecretRef != nil { return []corev1.EnvVar{{ diff --git a/internal/controller/runtime_vllm.go b/internal/controller/runtime_vllm.go index c4500559..e2d567f8 100644 --- a/internal/controller/runtime_vllm.go +++ b/internal/controller/runtime_vllm.go @@ -190,7 +190,7 @@ func (b *VLLMBackend) BuildProbes(port int32) (*corev1.Probe, *corev1.Probe, *co return startup, liveness, readiness } -func (b *VLLMBackend) BuildEnv(isvc 
// WhisperBackend is the runtime backend for speaches (https://speaches.ai), the
// faster-whisper based, OpenAI-compatible audio transcription server. It
// defaults to port 8000 and the /v1/audio/transcriptions path, and hands its
// settings to the server through environment variables (see BuildEnv) rather
// than CLI flags. No model-downloader init container is used: the model named
// by the referenced Model's spec.source is installed into the running server
// by the postStart hook returned from BuildLifecycle.
type WhisperBackend struct{}

const (
	// whisperImage is the pinned default speaches image. It is the CUDA
	// build; CPU-only deployments should override spec.image with the
	// corresponding ...-cpu tag.
	whisperImage = "ghcr.io/speaches-ai/speaches:0.8.3-cuda"

	// whisperHFHome is exported as HF_HOME, the cache directory used by the
	// huggingface_hub library underneath speaches. The image runs as the
	// non-root "ubuntu" user with HOME=/home/ubuntu.
	whisperHFHome = "/home/ubuntu/.cache/huggingface"
)

// whisperComputeTypes lists the CTranslate2 compute types accepted for
// WHISPER__COMPUTE_TYPE. It is consulted to decide whether a Model's
// quantization string may be forwarded as a compute type.
var whisperComputeTypes = map[string]struct{}{
	"int8":          {},
	"int8_float16":  {},
	"int8_bfloat16": {},
	"int8_float32":  {},
	"int16":         {},
	"float16":       {},
	"bfloat16":      {},
	"float32":       {},
	"default":       {},
}

// ContainerName returns the name given to the inference container.
func (b *WhisperBackend) ContainerName() string {
	return "speaches"
}

// DefaultImage returns the pinned speaches image used when spec.image is unset.
func (b *WhisperBackend) DefaultImage() string {
	return whisperImage
}

// DefaultPort returns the port the speaches container listens on by default.
func (b *WhisperBackend) DefaultPort() int32 {
	return 8000
}

// NeedsModelInit reports false: this runtime does not use a model-downloader
// init container. The model is installed by the postStart hook that
// BuildLifecycle returns.
func (b *WhisperBackend) NeedsModelInit() bool {
	return false
}

// DefaultHPAMetric returns the empty string: the backend declares no default
// autoscaling metric, because speaches exposes no Prometheus queue metric to
// scale on.
func (b *WhisperBackend) DefaultHPAMetric() string {
	return ""
}

// DefaultEndpointPath returns the OpenAI audio transcription route so that the
// advertised status endpoint sends clients to the right path.
func (b *WhisperBackend) DefaultEndpointPath() string {
	return "/v1/audio/transcriptions"
}

// BuildArgs returns only the user's extra args: speaches is configured via env
// vars, not CLI flags (see BuildEnv).
+func (b *WhisperBackend) BuildArgs(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model, _ string, _ int32) []string { + return isvc.Spec.ExtraArgs +} + +func (b *WhisperBackend) BuildProbes(port int32) (startup, liveness, readiness *corev1.Probe) { + healthGet := func() corev1.ProbeHandler { + return corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/health", + Port: intstr.FromInt32(port), + }, + } + } + startup = &corev1.Probe{ + ProbeHandler: healthGet(), + PeriodSeconds: 10, + TimeoutSeconds: 5, + FailureThreshold: 180, + } + liveness = &corev1.Probe{ + ProbeHandler: healthGet(), + PeriodSeconds: 15, + TimeoutSeconds: 5, + FailureThreshold: 3, + } + readiness = &corev1.Probe{ + ProbeHandler: healthGet(), + PeriodSeconds: 10, + TimeoutSeconds: 5, + FailureThreshold: 3, + } + return startup, liveness, readiness +} + +// BuildEnv translates the Model and WhisperConfig into speaches environment +// variables. Emitted in a stable order so Deployment specs are deterministic. +func (b *WhisperBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model) []corev1.EnvVar { + cfg := isvc.Spec.WhisperConfig + + env := []corev1.EnvVar{ + {Name: "HF_HOME", Value: whisperHFHome}, + {Name: "ENABLE_UI", Value: whisperEnableUI(cfg)}, + {Name: "WHISPER__INFERENCE_DEVICE", Value: whisperDevice(cfg, model)}, + } + + // LLMKUBE_WHISPER_MODEL is consumed by the postStart preload hook (BuildLifecycle), + // not by speaches itself. speaches does not download models on first request, so the + // hook installs this model id once the server is up. 
+ if model != nil && model.Spec.Source != "" { + env = append(env, corev1.EnvVar{Name: "LLMKUBE_WHISPER_MODEL", Value: model.Spec.Source}) + } + + if ct := whisperComputeType(cfg, model); ct != "" { + env = append(env, corev1.EnvVar{Name: "WHISPER__COMPUTE_TYPE", Value: ct}) + } + if cfg != nil && cfg.ModelTTLSeconds != nil { + env = append(env, corev1.EnvVar{Name: "WHISPER__TTL", Value: fmt.Sprintf("%d", *cfg.ModelTTLSeconds)}) + } + if cfg != nil && cfg.HFTokenSecretRef != nil { + env = append(env, corev1.EnvVar{ + Name: "HF_TOKEN", + ValueFrom: &corev1.EnvVarSource{SecretKeyRef: cfg.HFTokenSecretRef}, + }) + } + if cfg != nil && cfg.APIKeySecretRef != nil { + env = append(env, corev1.EnvVar{ + Name: "API_KEY", + ValueFrom: &corev1.EnvVarSource{SecretKeyRef: cfg.APIKeySecretRef}, + }) + } + + return env +} + +// BuildLifecycle returns a postStart hook that installs the model into speaches +// once the server is healthy. speaches (v0.8.x) does not download models on the +// first transcription request: it returns 400 until the model is installed via +// POST /v1/models/{id}. The hook blocks the container from reporting Running +// (and therefore Ready) until the model is installed, so the Service only +// receives traffic once transcription will succeed. Returns nil when there is no +// model source to preload. +func (b *WhisperBackend) BuildLifecycle(_ *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model, port int32) *corev1.Lifecycle { + if model == nil || model.Spec.Source == "" { + return nil + } + // curl is present in the speaches image (its own healthcheck uses it). The + // model id is read from the LLMKUBE_WHISPER_MODEL env var (set by BuildEnv) to + // avoid interpolating CR data into the shell script. 
+ script := fmt.Sprintf(`for i in $(seq 1 90); do curl -sf -m 5 http://localhost:%d/health >/dev/null 2>&1 && break; sleep 2; done +curl -sf -m 1800 -X POST "http://localhost:%d/v1/models/$LLMKUBE_WHISPER_MODEL" >/dev/null 2>&1 || true`, port, port) + return &corev1.Lifecycle{ + PostStart: &corev1.LifecycleHandler{ + Exec: &corev1.ExecAction{Command: []string{"sh", "-c", script}}, + }, + } +} + +func whisperEnableUI(cfg *inferencev1alpha1.WhisperConfig) string { + if cfg != nil && cfg.EnableUI != nil && *cfg.EnableUI { + return "true" + } + return "false" +} + +// whisperDevice resolves the speaches inference device: explicit config wins, +// otherwise it is derived from the Model accelerator, defaulting to "auto". +func whisperDevice(cfg *inferencev1alpha1.WhisperConfig, model *inferencev1alpha1.Model) string { + if cfg != nil && cfg.InferenceDevice != "" { + return cfg.InferenceDevice + } + if model != nil && model.Spec.Hardware != nil { + switch strings.ToLower(model.Spec.Hardware.Accelerator) { + case "cuda": + return "cuda" + case "cpu", "metal": + // CTranslate2 has no Metal backend; fall back to CPU. + return "cpu" + } + } + return "auto" +} + +// whisperComputeType resolves WHISPER__COMPUTE_TYPE: explicit config wins, +// otherwise a Model quantization string is passed through only if speaches +// recognizes it as a compute type. Returns "" to use the speaches default. 
+func whisperComputeType(cfg *inferencev1alpha1.WhisperConfig, model *inferencev1alpha1.Model) string { + if cfg != nil && cfg.ComputeType != "" { + return cfg.ComputeType + } + if model != nil { + q := strings.ToLower(strings.TrimSpace(model.Spec.Quantization)) + if _, ok := whisperComputeTypes[q]; ok { + return q + } + } + return "" +} diff --git a/internal/controller/runtime_whisper_test.go b/internal/controller/runtime_whisper_test.go new file mode 100644 index 00000000..42f39a16 --- /dev/null +++ b/internal/controller/runtime_whisper_test.go @@ -0,0 +1,305 @@ +package controller + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + + inferencev1alpha1 "github.com/defilantech/llmkube/api/v1alpha1" +) + +func whisperModel(accelerator, quantization string) *inferencev1alpha1.Model { + m := &inferencev1alpha1.Model{ + Spec: inferencev1alpha1.ModelSpec{ + Source: "Systran/faster-whisper-large-v3", + Quantization: quantization, + }, + } + if accelerator != "" { + m.Spec.Hardware = &inferencev1alpha1.HardwareSpec{Accelerator: accelerator} + } + return m +} + +func TestWhisperBackendBasics(t *testing.T) { + b := &WhisperBackend{} + + if b.ContainerName() != "speaches" { + t.Errorf("ContainerName() = %q, want speaches", b.ContainerName()) + } + if b.DefaultPort() != 8000 { + t.Errorf("DefaultPort() = %d, want 8000", b.DefaultPort()) + } + if b.NeedsModelInit() { + t.Error("NeedsModelInit() = true, want false (speaches fetches from HF at runtime)") + } + if b.DefaultHPAMetric() != "" { + t.Errorf("DefaultHPAMetric() = %q, want empty (speaches exposes no scrapeable queue metric)", b.DefaultHPAMetric()) + } + if got := b.DefaultEndpointPath(); got != "/v1/audio/transcriptions" { + t.Errorf("DefaultEndpointPath() = %q, want /v1/audio/transcriptions", got) + } + if img := b.DefaultImage(); img == "" || !containsSubstr(img, "speaches") { + t.Errorf("DefaultImage() = %q, want a pinned speaches image", img) + } +} + +func containsSubstr(s, sub string) bool { + return 
len(s) >= len(sub) && (s == sub || indexOf(s, sub) >= 0) +} + +func indexOf(s, sub string) int { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return i + } + } + return -1 +} + +func TestWhisperBuildProbes(t *testing.T) { + b := &WhisperBackend{} + startup, liveness, readiness := b.BuildProbes(8000) + for name, p := range map[string]*corev1.Probe{"startup": startup, "liveness": liveness, "readiness": readiness} { + if p == nil || p.HTTPGet == nil { + t.Fatalf("%s probe should be an HTTP GET", name) + continue + } + if p.HTTPGet.Path != "/health" { + t.Errorf("%s probe path = %q, want /health", name, p.HTTPGet.Path) + } + if p.HTTPGet.Port.IntValue() != 8000 { + t.Errorf("%s probe port = %v, want 8000", name, p.HTTPGet.Port) + } + } +} + +func TestWhisperBuildEnv(t *testing.T) { + secretRef := &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "hf"}, + Key: "token", + } + apiRef := &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "api"}, + Key: "key", + } + + tests := []struct { + name string + cfg *inferencev1alpha1.WhisperConfig + model *inferencev1alpha1.Model + wantEnv map[string]string // name -> exact .Value + wantAbsent []string + wantHFSecret bool + wantAPISecret bool + }{ + { + name: "minimal cpu model: HF_HOME, UI off, device cpu, no compute/ttl", + model: whisperModel("cpu", ""), + wantEnv: map[string]string{ + "HF_HOME": "/home/ubuntu/.cache/huggingface", + "ENABLE_UI": "false", + "WHISPER__INFERENCE_DEVICE": "cpu", + "LLMKUBE_WHISPER_MODEL": "Systran/faster-whisper-large-v3", + }, + wantAbsent: []string{"WHISPER__COMPUTE_TYPE", "WHISPER__TTL", "HF_TOKEN", "API_KEY"}, + }, + { + name: "cuda accelerator maps to cuda device", + model: whisperModel("cuda", ""), + wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "cuda"}, + }, + { + name: "metal accelerator maps to cpu", + model: whisperModel("metal", ""), + wantEnv: 
map[string]string{"WHISPER__INFERENCE_DEVICE": "cpu"}, + }, + { + name: "nil hardware defaults device to auto", + model: whisperModel("", ""), + wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "auto"}, + }, + { + name: "config device overrides model accelerator", + cfg: &inferencev1alpha1.WhisperConfig{InferenceDevice: "auto"}, + model: whisperModel("cuda", ""), + wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "auto"}, + }, + { + name: "explicit compute type wins", + cfg: &inferencev1alpha1.WhisperConfig{ComputeType: "int8_float16"}, + model: whisperModel("cuda", "float16"), + wantEnv: map[string]string{"WHISPER__COMPUTE_TYPE": "int8_float16"}, + }, + { + name: "recognized model quantization becomes compute type", + model: whisperModel("cuda", "float16"), + wantEnv: map[string]string{"WHISPER__COMPUTE_TYPE": "float16"}, + }, + { + name: "unrecognized quantization omits compute type", + model: whisperModel("cuda", "Q4_K_M"), + wantAbsent: []string{"WHISPER__COMPUTE_TYPE"}, + }, + { + name: "model ttl -1 keeps loaded", + cfg: &inferencev1alpha1.WhisperConfig{ModelTTLSeconds: ptrInt32(-1)}, + model: whisperModel("cuda", ""), + wantEnv: map[string]string{"WHISPER__TTL": "-1"}, + }, + { + name: "enable UI true", + cfg: &inferencev1alpha1.WhisperConfig{EnableUI: ptrBool(true)}, + model: whisperModel("cuda", ""), + wantEnv: map[string]string{"ENABLE_UI": "true"}, + }, + { + name: "HF token secret ref", + cfg: &inferencev1alpha1.WhisperConfig{HFTokenSecretRef: secretRef}, + model: whisperModel("cuda", ""), + wantHFSecret: true, + }, + { + name: "API key secret ref", + cfg: &inferencev1alpha1.WhisperConfig{APIKeySecretRef: apiRef}, + model: whisperModel("cuda", ""), + wantAPISecret: true, + }, + } + + b := &WhisperBackend{} + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + isvc := &inferencev1alpha1.InferenceService{ + Spec: inferencev1alpha1.InferenceServiceSpec{ + Runtime: "whisper", + WhisperConfig: tc.cfg, + }, + } + env := 
b.BuildEnv(isvc, tc.model) + + for name, want := range tc.wantEnv { + if !containsEnv(env, name, want) { + t.Errorf("env %s = %q not found; got %+v", name, want, env) + } + } + for _, name := range tc.wantAbsent { + if containsEnv(env, name, "") { + t.Errorf("env %s should be absent; got %+v", name, env) + } + } + if tc.wantHFSecret && envSecretRef(env, "HF_TOKEN") == nil { + t.Errorf("HF_TOKEN should be backed by a secret ref; got %+v", env) + } + if tc.wantAPISecret && envSecretRef(env, "API_KEY") == nil { + t.Errorf("API_KEY should be backed by a secret ref; got %+v", env) + } + }) + } +} + +func TestWhisperBuildLifecycle(t *testing.T) { + b := &WhisperBackend{} + + t.Run("postStart preloads the model", func(t *testing.T) { + isvc := &inferencev1alpha1.InferenceService{ + Spec: inferencev1alpha1.InferenceServiceSpec{Runtime: "whisper"}, + } + lc := b.BuildLifecycle(isvc, whisperModel("cuda", ""), 8000) + if lc == nil || lc.PostStart == nil || lc.PostStart.Exec == nil { + t.Fatal("expected a postStart exec hook") + } + cmd := lc.PostStart.Exec.Command + if len(cmd) != 3 || cmd[0] != "sh" || cmd[1] != "-c" { + t.Fatalf("expected sh -c