From 761c6488af514a9ac8788975d64822d734193905 Mon Sep 17 00:00:00 2001 From: Christopher Maher Date: Tue, 2 Jun 2026 14:38:59 -0700 Subject: [PATCH 1/2] feat(controller): add first-class whisper (speaches) audio transcription runtime Add a `whisper` runtime backed by speaches (faster-whisper, CTranslate2) that serves the OpenAI-compatible audio API (/v1/audio/transcriptions) on port 8000. - New WhisperBackend: port 8000, /health probes, NeedsModelInit=false (speaches fetches CTranslate2 models from HuggingFace at request time), env-driven config. - Widen the optional EnvBuilder interface to BuildEnv(isvc, model) so backends can derive env values from the Model spec; update vllm/tgi/personaplex implementers and the deployment_builder call site. - Add an optional EndpointPathProvider interface and drop the EndpointSpec.Path CRD default so the whisper runtime resolves /v1/audio/transcriptions automatically while text runtimes keep /v1/chat/completions. constructEndpoint and routerProxyEndpoint already default an empty path, so this is backward compatible. - New typed WhisperConfig (compute type, device, model TTL, UI, HF/API token secret refs); add `whisper` to the Runtime enum; regenerate CRDs + chart CRDs. - Unit + reconcile tests, examples/whisper-quickstart, and a runtime-table doc update. v1 targets connected clusters (model downloads on first request); persistent model cache + air-gapped support are deferred to a follow-up volume hook. 
Fixes #612 Signed-off-by: Christopher Maher --- api/v1alpha1/inferenceservice_types.go | 53 +++- api/v1alpha1/zz_generated.deepcopy.go | 40 +++ .../templates/crds/inferenceservices.yaml | 103 +++++++- .../llmkube/templates/crds/modelrouters.yaml | 7 +- ...ference.llmkube.dev_inferenceservices.yaml | 103 +++++++- .../inference.llmkube.dev_modelrouters.yaml | 7 +- docs/contributors/adding-a-runtime.md | 7 + examples/whisper-quickstart/README.md | 50 ++++ .../whisper-quickstart/inferenceservice.yaml | 31 +++ examples/whisper-quickstart/model.yaml | 22 ++ internal/controller/deployment_builder.go | 2 +- .../inferenceservice_deployment_test.go | 2 +- .../inferenceservice_reconcile_test.go | 77 ++++++ internal/controller/runtime.go | 14 +- internal/controller/runtime_personaplex.go | 2 +- internal/controller/runtime_test.go | 29 +++ internal/controller/runtime_tgi.go | 2 +- internal/controller/runtime_vllm.go | 2 +- internal/controller/runtime_whisper.go | 161 ++++++++++++ internal/controller/runtime_whisper_test.go | 238 ++++++++++++++++++ internal/controller/status_builder.go | 9 + 21 files changed, 944 insertions(+), 17 deletions(-) create mode 100644 examples/whisper-quickstart/README.md create mode 100644 examples/whisper-quickstart/inferenceservice.yaml create mode 100644 examples/whisper-quickstart/model.yaml create mode 100644 internal/controller/runtime_whisper.go create mode 100644 internal/controller/runtime_whisper_test.go diff --git a/api/v1alpha1/inferenceservice_types.go b/api/v1alpha1/inferenceservice_types.go index 6b48f994..b448c352 100644 --- a/api/v1alpha1/inferenceservice_types.go +++ b/api/v1alpha1/inferenceservice_types.go @@ -61,7 +61,8 @@ type InferenceServiceSpec struct { // "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server. // "vllm": vLLM OpenAI-compatible server with PagedAttention. // "tgi": HuggingFace Text Generation Inference server. 
- // +kubebuilder:validation:Enum=llamacpp;personaplex;vllm;tgi;generic + // "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server. + // +kubebuilder:validation:Enum=llamacpp;personaplex;vllm;tgi;whisper;generic // +kubebuilder:default=llamacpp // +optional Runtime string `json:"runtime,omitempty"` @@ -338,6 +339,11 @@ type InferenceServiceSpec struct { // +optional TGIConfig *TGIConfig `json:"tgiConfig,omitempty"` + // WhisperConfig holds configuration for the whisper (speaches) runtime. + // Only used when Runtime is "whisper". + // +optional + WhisperConfig *WhisperConfig `json:"whisperConfig,omitempty"` + // ImagePullSecrets for pulling container images from private registries. // +optional ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"` @@ -386,8 +392,10 @@ type EndpointSpec struct { // +optional Port int32 `json:"port,omitempty"` - // Path is the HTTP path for the inference endpoint - // +kubebuilder:default="/v1/chat/completions" + // Path is the HTTP path for the inference endpoint. When unset, the + // effective default is the runtime's OpenAI-compatible path + // (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the + // whisper runtime), resolved when the status endpoint is constructed. // +optional Path string `json:"path,omitempty"` @@ -667,6 +675,45 @@ type TGIConfig struct { HFTokenSecretRef *corev1.SecretKeySelector `json:"hfTokenSecretRef,omitempty"` } +// WhisperConfig holds deploy-time server settings for the whisper (speaches) +// runtime. speaches selects the model, language, and task per request, so those +// are NOT server config; the model id clients request comes from the referenced +// Model's spec.source. +type WhisperConfig struct { + // ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE). + // When unset, falls back to a recognized Model spec.quantization, else the speaches default. 
+ // +kubebuilder:validation:Enum=int8;int8_float16;int8_bfloat16;int8_float32;int16;float16;bfloat16;float32;default + // +optional + ComputeType string `json:"computeType,omitempty"` + + // InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE). + // When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda, + // cpu/metal -> cpu), defaulting to auto. + // +kubebuilder:validation:Enum=auto;cuda;cpu + // +optional + InferenceDevice string `json:"inferenceDevice,omitempty"` + + // ModelTTLSeconds is how long an idle model stays loaded before being unloaded + // (speaches WHISPER__TTL). -1 keeps models loaded indefinitely. + // +kubebuilder:validation:Minimum=-1 + // +optional + ModelTTLSeconds *int32 `json:"modelTTLSeconds,omitempty"` + + // EnableUI exposes the speaches Gradio web UI. Defaults to false. + // +optional + EnableUI *bool `json:"enableUI,omitempty"` + + // HFTokenSecretRef references a Secret containing a HuggingFace token, used to + // download gated CTranslate2 models. + // +optional + HFTokenSecretRef *corev1.SecretKeySelector `json:"hfTokenSecretRef,omitempty"` + + // APIKeySecretRef references a Secret containing an API key speaches will require + // on requests (sets the speaches API_KEY). + // +optional + APIKeySecretRef *corev1.SecretKeySelector `json:"apiKeySecretRef,omitempty"` +} + // InferenceServiceStatus defines the observed state of InferenceService. type InferenceServiceStatus struct { // Phase represents the current lifecycle phase of the InferenceService. 
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index d3486d39..276a1b0d 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -524,6 +524,11 @@ func (in *InferenceServiceSpec) DeepCopyInto(out *InferenceServiceSpec) { *out = new(TGIConfig) (*in).DeepCopyInto(*out) } + if in.WhisperConfig != nil { + in, out := &in.WhisperConfig, &out.WhisperConfig + *out = new(WhisperConfig) + (*in).DeepCopyInto(*out) + } if in.ImagePullSecrets != nil { in, out := &in.ImagePullSecrets, &out.ImagePullSecrets *out = make([]v1.LocalObjectReference, len(*in)) @@ -1327,3 +1332,38 @@ func (in *VLLMConfig) DeepCopy() *VLLMConfig { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WhisperConfig) DeepCopyInto(out *WhisperConfig) { + *out = *in + if in.ModelTTLSeconds != nil { + in, out := &in.ModelTTLSeconds, &out.ModelTTLSeconds + *out = new(int32) + **out = **in + } + if in.EnableUI != nil { + in, out := &in.EnableUI, &out.EnableUI + *out = new(bool) + **out = **in + } + if in.HFTokenSecretRef != nil { + in, out := &in.HFTokenSecretRef, &out.HFTokenSecretRef + *out = new(v1.SecretKeySelector) + (*in).DeepCopyInto(*out) + } + if in.APIKeySecretRef != nil { + in, out := &in.APIKeySecretRef, &out.APIKeySecretRef + *out = new(v1.SecretKeySelector) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WhisperConfig. 
+func (in *WhisperConfig) DeepCopy() *WhisperConfig { + if in == nil { + return nil + } + out := new(WhisperConfig) + in.DeepCopyInto(out) + return out +} diff --git a/charts/llmkube/templates/crds/inferenceservices.yaml b/charts/llmkube/templates/crds/inferenceservices.yaml index 8a56f713..7e8a0dda 100644 --- a/charts/llmkube/templates/crds/inferenceservices.yaml +++ b/charts/llmkube/templates/crds/inferenceservices.yaml @@ -224,8 +224,11 @@ spec: description: Endpoint defines the service endpoint configuration properties: path: - default: /v1/chat/completions - description: Path is the HTTP path for the inference endpoint + description: |- + Path is the HTTP path for the inference endpoint. When unset, the + effective default is the runtime's OpenAI-compatible path + (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the + whisper runtime), resolved when the status endpoint is constructed. type: string port: default: 8080 @@ -1398,11 +1401,13 @@ spec: "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server. "vllm": vLLM OpenAI-compatible server with PagedAttention. "tgi": HuggingFace Text Generation Inference server. + "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server. enum: - llamacpp - personaplex - vllm - tgi + - whisper - generic type: string runtimeClassName: @@ -1910,6 +1915,100 @@ spec: format: int32 type: integer type: object + whisperConfig: + description: |- + WhisperConfig holds configuration for the whisper (speaches) runtime. + Only used when Runtime is "whisper". + properties: + apiKeySecretRef: + description: |- + APIKeySecretRef references a Secret containing an API key speaches will require + on requests (sets the speaches API_KEY). + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. 
+ This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + computeType: + description: |- + ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE). + When unset, falls back to a recognized Model spec.quantization, else the speaches default. + enum: + - int8 + - int8_float16 + - int8_bfloat16 + - int8_float32 + - int16 + - float16 + - bfloat16 + - float32 + - default + type: string + enableUI: + description: EnableUI exposes the speaches Gradio web UI. Defaults + to false. + type: boolean + hfTokenSecretRef: + description: |- + HFTokenSecretRef references a Secret containing a HuggingFace token, used to + download gated CTranslate2 models. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + inferenceDevice: + description: |- + InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE). + When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda, + cpu/metal -> cpu), defaulting to auto. 
+ enum: + - auto + - cuda + - cpu + type: string + modelTTLSeconds: + description: |- + ModelTTLSeconds is how long an idle model stays loaded before being unloaded + (speaches WHISPER__TTL). -1 keeps models loaded indefinitely. + format: int32 + minimum: -1 + type: integer + type: object required: - modelRef type: object diff --git a/charts/llmkube/templates/crds/modelrouters.yaml b/charts/llmkube/templates/crds/modelrouters.yaml index d2c168a6..b7a330c9 100644 --- a/charts/llmkube/templates/crds/modelrouters.yaml +++ b/charts/llmkube/templates/crds/modelrouters.yaml @@ -238,8 +238,11 @@ spec: through. Mirrors the shape used by InferenceService. properties: path: - default: /v1/chat/completions - description: Path is the HTTP path for the inference endpoint + description: |- + Path is the HTTP path for the inference endpoint. When unset, the + effective default is the runtime's OpenAI-compatible path + (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the + whisper runtime), resolved when the status endpoint is constructed. type: string port: default: 8080 diff --git a/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml b/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml index 60e12422..1825997a 100644 --- a/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml +++ b/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml @@ -220,8 +220,11 @@ spec: description: Endpoint defines the service endpoint configuration properties: path: - default: /v1/chat/completions - description: Path is the HTTP path for the inference endpoint + description: |- + Path is the HTTP path for the inference endpoint. When unset, the + effective default is the runtime's OpenAI-compatible path + (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the + whisper runtime), resolved when the status endpoint is constructed. 
type: string port: default: 8080 @@ -1394,11 +1397,13 @@ spec: "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server. "vllm": vLLM OpenAI-compatible server with PagedAttention. "tgi": HuggingFace Text Generation Inference server. + "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server. enum: - llamacpp - personaplex - vllm - tgi + - whisper - generic type: string runtimeClassName: @@ -1906,6 +1911,100 @@ spec: format: int32 type: integer type: object + whisperConfig: + description: |- + WhisperConfig holds configuration for the whisper (speaches) runtime. + Only used when Runtime is "whisper". + properties: + apiKeySecretRef: + description: |- + APIKeySecretRef references a Secret containing an API key speaches will require + on requests (sets the speaches API_KEY). + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + computeType: + description: |- + ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE). + When unset, falls back to a recognized Model spec.quantization, else the speaches default. + enum: + - int8 + - int8_float16 + - int8_bfloat16 + - int8_float32 + - int16 + - float16 + - bfloat16 + - float32 + - default + type: string + enableUI: + description: EnableUI exposes the speaches Gradio web UI. Defaults + to false. 
+ type: boolean + hfTokenSecretRef: + description: |- + HFTokenSecretRef references a Secret containing a HuggingFace token, used to + download gated CTranslate2 models. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + inferenceDevice: + description: |- + InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE). + When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda, + cpu/metal -> cpu), defaulting to auto. + enum: + - auto + - cuda + - cpu + type: string + modelTTLSeconds: + description: |- + ModelTTLSeconds is how long an idle model stays loaded before being unloaded + (speaches WHISPER__TTL). -1 keeps models loaded indefinitely. + format: int32 + minimum: -1 + type: integer + type: object required: - modelRef type: object diff --git a/config/crd/bases/inference.llmkube.dev_modelrouters.yaml b/config/crd/bases/inference.llmkube.dev_modelrouters.yaml index ca5c3945..42bb22e7 100644 --- a/config/crd/bases/inference.llmkube.dev_modelrouters.yaml +++ b/config/crd/bases/inference.llmkube.dev_modelrouters.yaml @@ -234,8 +234,11 @@ spec: through. Mirrors the shape used by InferenceService. properties: path: - default: /v1/chat/completions - description: Path is the HTTP path for the inference endpoint + description: |- + Path is the HTTP path for the inference endpoint. 
When unset, the + effective default is the runtime's OpenAI-compatible path + (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the + whisper runtime), resolved when the status endpoint is constructed. type: string port: default: 8080 diff --git a/docs/contributors/adding-a-runtime.md b/docs/contributors/adding-a-runtime.md index ff435777..58124f97 100644 --- a/docs/contributors/adding-a-runtime.md +++ b/docs/contributors/adding-a-runtime.md @@ -104,4 +104,11 @@ Add `--runtime yourengine` handling in `pkg/cli/deploy.go`. | `personaplex` | PersonaPlex/Moshi | 8998 | TCP socket | No | — | | `vllm` | vLLM | 8000 | HTTP /health | Yes | vllm:num_requests_running | | `tgi` | TGI | 80 | HTTP /health | No (HF download) | tgi:queue_size | +| `whisper` | speaches (faster-whisper) | 8000 | HTTP /health | No (HF download) | — | | `generic` | Any container | 8080 | TCP socket | No | — | + +The `whisper` runtime serves the OpenAI audio API (`/v1/audio/transcriptions`) +rather than `/v1/chat/completions`; it declares this via the optional +`EndpointPathProvider` interface, which `constructEndpoint` consults for the +default status endpoint path. It is configured entirely through env vars +(`EnvBuilder`), not CLI args. diff --git a/examples/whisper-quickstart/README.md b/examples/whisper-quickstart/README.md new file mode 100644 index 00000000..f8f10537 --- /dev/null +++ b/examples/whisper-quickstart/README.md @@ -0,0 +1,50 @@ +# Whisper (speaches) audio transcription quickstart + +Deploys an OpenAI-compatible audio transcription service using the `whisper` +runtime, backed by [speaches](https://speaches.ai) (faster-whisper / CTranslate2). + +## What you get + +- A `Model` referencing a faster-whisper CTranslate2 HuggingFace repo. +- An `InferenceService` with `runtime: whisper` that serves + `POST /v1/audio/transcriptions` (and `/v1/audio/translations`) on a ClusterIP, + port 8000. 
+ +The operator manages the Deployment, Service, probes (`/health`), GPU +scheduling, and scaling. speaches downloads the CTranslate2 model from +HuggingFace on first request. + +## Apply + +```bash +kubectl apply -f model.yaml +kubectl apply -f inferenceservice.yaml +kubectl get inferenceservice whisper -o jsonpath='{.status.endpoint}' +# -> http://whisper.default.svc.cluster.local:8000/v1/audio/transcriptions +``` + +## Try it + +From a pod in the cluster, or via `kubectl port-forward svc/whisper 8000:8000`: + +```bash +curl -s http://localhost:8000/v1/audio/transcriptions \ + -F file=@sample.wav \ + -F model=Systran/faster-whisper-large-v3 +``` + +The response is OpenAI-compatible JSON (`{"text": "..."}`). The model id in the +`model` field must match the `Model`'s `spec.source`. + +## Notes and limitations (v1) + +- **First request downloads the model.** speaches fetches the CTranslate2 model + from HuggingFace on demand. Until a persistent cache volume lands (planned as a follow-up), + the model re-downloads on each pod start, so this runtime currently requires + HuggingFace reachability and is not yet air-gapped. +- **No Prometheus metrics.** speaches exposes none, so the cluster PodMonitor + will see 404s scraping `/metrics` for these pods. This is benign. +- **CPU-only:** drop the `gpu` resources from `inferenceservice.yaml` and set + `image: ghcr.io/speaches-ai/speaches:0.8.3-cpu`. +- **Gated models / auth:** set `whisperConfig.hfTokenSecretRef` to download gated + repos, and `whisperConfig.apiKeySecretRef` to require an API key on requests. 
diff --git a/examples/whisper-quickstart/inferenceservice.yaml b/examples/whisper-quickstart/inferenceservice.yaml new file mode 100644 index 00000000..0ec53629 --- /dev/null +++ b/examples/whisper-quickstart/inferenceservice.yaml @@ -0,0 +1,31 @@ +apiVersion: inference.llmkube.dev/v1alpha1 +kind: InferenceService +metadata: + name: whisper + namespace: default +spec: + modelRef: whisper-large-v3 + runtime: whisper + replicas: 1 + # Endpoint path is optional: the whisper runtime defaults it to + # /v1/audio/transcriptions automatically. Override only if you front it + # differently. + endpoint: + port: 8000 + type: ClusterIP + whisperConfig: + # CTranslate2 compute type. float16 suits most NVIDIA GPUs; use int8 or + # int8_float16 to trade a little accuracy for memory/speed. Omit to inherit + # the model's quantization (when recognized) or the speaches default. + computeType: float16 + # Optional: derived from the Model accelerator when omitted (cuda here). + # inferenceDevice: cuda + # Optional: keep idle models loaded (-1) instead of unloading after 300s. + # modelTTLSeconds: -1 + resources: + gpu: 1 + gpuMemory: "8Gi" + cpu: "2" + memory: "4Gi" + # CPU-only deployments: drop the gpu resources above and override the image: + # image: ghcr.io/speaches-ai/speaches:0.8.3-cpu diff --git a/examples/whisper-quickstart/model.yaml b/examples/whisper-quickstart/model.yaml new file mode 100644 index 00000000..844d737f --- /dev/null +++ b/examples/whisper-quickstart/model.yaml @@ -0,0 +1,22 @@ +apiVersion: inference.llmkube.dev/v1alpha1 +kind: Model +metadata: + name: whisper-large-v3 + namespace: default +spec: + # speaches loads CTranslate2 / faster-whisper models from HuggingFace by repo + # id. This is the model clients pass in the OpenAI `model` field, and it drives + # the runtime device/compute defaults. The operator does not download it: the + # speaches server fetches it from HuggingFace on first request. 
+ source: Systran/faster-whisper-large-v3 + format: custom + hardware: + accelerator: cuda + gpu: + enabled: true + count: 1 + vendor: nvidia + memory: "8Gi" + resources: + cpu: "2" + memory: "4Gi" diff --git a/internal/controller/deployment_builder.go b/internal/controller/deployment_builder.go index 1001a370..963b1da1 100644 --- a/internal/controller/deployment_builder.go +++ b/internal/controller/deployment_builder.go @@ -224,7 +224,7 @@ func (r *InferenceServiceReconciler) constructDeployment( // Add runtime-generated env vars, then user-specified env vars (user wins on conflict) if eb, ok := backend.(EnvBuilder); ok { - container.Env = append(container.Env, eb.BuildEnv(isvc)...) + container.Env = append(container.Env, eb.BuildEnv(isvc, model)...) } if len(isvc.Spec.Env) > 0 { container.Env = append(container.Env, isvc.Spec.Env...) diff --git a/internal/controller/inferenceservice_deployment_test.go b/internal/controller/inferenceservice_deployment_test.go index fae2f907..8c29a7da 100644 --- a/internal/controller/inferenceservice_deployment_test.go +++ b/internal/controller/inferenceservice_deployment_test.go @@ -3720,7 +3720,7 @@ var _ = Describe("RuntimeBackend interface", func() { }, }, } - env := backend.BuildEnv(isvc) + env := backend.BuildEnv(isvc, nil) Expect(env).To(HaveLen(2)) Expect(env[0].Name).To(Equal("HF_TOKEN")) Expect(env[0].ValueFrom.SecretKeyRef.Name).To(Equal("hf-token")) diff --git a/internal/controller/inferenceservice_reconcile_test.go b/internal/controller/inferenceservice_reconcile_test.go index 5659fc45..eebe8edc 100644 --- a/internal/controller/inferenceservice_reconcile_test.go +++ b/internal/controller/inferenceservice_reconcile_test.go @@ -504,6 +504,83 @@ var _ = Describe("Reconcile lifecycle", func() { Expect(updated.Status.Endpoint).NotTo(BeEmpty()) }) + It("should create a speaches Deployment for the whisper runtime", func() { + modelName := "whisper-model-ready" + isvcName := "whisper-isvc" + + model := 
&inferencev1alpha1.Model{ + ObjectMeta: metav1.ObjectMeta{Name: modelName, Namespace: "default"}, + Spec: inferencev1alpha1.ModelSpec{ + Source: "Systran/faster-whisper-large-v3", + Hardware: &inferencev1alpha1.HardwareSpec{Accelerator: "cuda"}, + }, + } + Expect(k8sClient.Create(ctx, model)).To(Succeed()) + defer func() { _ = k8sClient.Delete(ctx, model) }() + + model.Status.Phase = PhaseReady + Expect(k8sClient.Status().Update(ctx, model)).To(Succeed()) + + replicas := int32(1) + isvc := &inferencev1alpha1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{Name: isvcName, Namespace: "default"}, + Spec: inferencev1alpha1.InferenceServiceSpec{ + ModelRef: modelName, + Runtime: "whisper", + Replicas: &replicas, + WhisperConfig: &inferencev1alpha1.WhisperConfig{ + ComputeType: "float16", + }, + }, + } + Expect(k8sClient.Create(ctx, isvc)).To(Succeed()) + defer func() { + _ = k8sClient.Delete(ctx, isvc) + dep := &appsv1.Deployment{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, dep); err == nil { + _ = k8sClient.Delete(ctx, dep) + } + svc := &corev1.Service{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, svc); err == nil { + _ = k8sClient.Delete(ctx, svc) + } + }() + + reconciler := &InferenceServiceReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + InitContainerImage: "docker.io/curlimages/curl:8.18.0", + } + _, err := reconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: types.NamespacedName{Name: isvcName, Namespace: "default"}, + }) + Expect(err).NotTo(HaveOccurred()) + + By("verifying the speaches container, port, env, and probes") + dep := &appsv1.Deployment{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, dep)).To(Succeed()) + Expect(dep.Spec.Template.Spec.InitContainers).To(BeEmpty(), "whisper runtime needs no model-init container") + containers := dep.Spec.Template.Spec.Containers + 
Expect(containers).To(HaveLen(1)) + c := containers[0] + Expect(c.Name).To(Equal("speaches")) + Expect(c.Ports[0].ContainerPort).To(Equal(int32(8000))) + Expect(c.ReadinessProbe.HTTPGet.Path).To(Equal("/health")) + + envNames := map[string]string{} + for _, e := range c.Env { + envNames[e.Name] = e.Value + } + Expect(envNames).To(HaveKeyWithValue("WHISPER__INFERENCE_DEVICE", "cuda")) + Expect(envNames).To(HaveKeyWithValue("WHISPER__COMPUTE_TYPE", "float16")) + Expect(envNames).To(HaveKey("HF_HOME")) + + By("verifying the status endpoint advertises the audio transcription path") + updated := &inferencev1alpha1.InferenceService{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, updated)).To(Succeed()) + Expect(updated.Status.Endpoint).To(HaveSuffix("/v1/audio/transcriptions")) + }) + It("should skip Deployment for Metal accelerator", func() { modelName := "metal-model" isvcName := "isvc-metal" diff --git a/internal/controller/runtime.go b/internal/controller/runtime.go index 98a9efcf..1504c656 100644 --- a/internal/controller/runtime.go +++ b/internal/controller/runtime.go @@ -38,8 +38,18 @@ type CommandBuilder interface { } // EnvBuilder is optionally implemented by backends that generate runtime-specific env vars. +// It receives the Model so backends can derive env values (e.g. device, model id) from the +// Model spec, consistent with BuildArgs. type EnvBuilder interface { - BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar + BuildEnv(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model) []corev1.EnvVar +} + +// EndpointPathProvider is optionally implemented by backends whose OpenAI-compatible API path +// differs from the default /v1/chat/completions (e.g. the whisper runtime serves +// /v1/audio/transcriptions). constructEndpoint consults it for the default status endpoint path +// when the user has not set spec.endpoint.path explicitly. 
+type EndpointPathProvider interface { + DefaultEndpointPath() string } // HPAMetricProvider is optionally implemented by backends that have a default autoscaling metric. @@ -68,6 +78,8 @@ func resolveBackend(isvc *inferencev1alpha1.InferenceService) RuntimeBackend { return &VLLMBackend{} case "tgi": return &TGIBackend{} + case "whisper": + return &WhisperBackend{} case "generic": return &GenericBackend{} default: diff --git a/internal/controller/runtime_personaplex.go b/internal/controller/runtime_personaplex.go index 5c1d612f..7fa6f19a 100644 --- a/internal/controller/runtime_personaplex.go +++ b/internal/controller/runtime_personaplex.go @@ -86,7 +86,7 @@ func (b *PersonaPlexBackend) BuildProbes(port int32) (startup, liveness, readine // BuildEnv returns environment variables for the PersonaPlex container, // including HF_TOKEN from a Secret reference if configured. -func (b *PersonaPlexBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar { +func (b *PersonaPlexBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model) []corev1.EnvVar { var env []corev1.EnvVar cfg := isvc.Spec.PersonaPlexConfig diff --git a/internal/controller/runtime_test.go b/internal/controller/runtime_test.go index ba8ffe14..9b0b5ece 100644 --- a/internal/controller/runtime_test.go +++ b/internal/controller/runtime_test.go @@ -3,6 +3,8 @@ package controller import ( "testing" + corev1 "k8s.io/api/core/v1" + inferencev1alpha1 "github.com/defilantech/llmkube/api/v1alpha1" ) @@ -25,6 +27,32 @@ func containsArg(args []string, flag, value string) bool { return false } +// containsEnv reports whether env contains a var named name. When value is +// non-empty, it also requires the literal .Value to equal value. For +// secret-backed vars, pass value == "" and assert ValueFrom via envSecretRef. 
+func containsEnv(env []corev1.EnvVar, name, value string) bool { + for _, e := range env { + if e.Name != name { + continue + } + if value == "" { + return true + } + return e.Value == value + } + return false +} + +// envSecretRef returns the SecretKeySelector backing the named env var, or nil. +func envSecretRef(env []corev1.EnvVar, name string) *corev1.SecretKeySelector { + for _, e := range env { + if e.Name == name && e.ValueFrom != nil { + return e.ValueFrom.SecretKeyRef + } + } + return nil +} + // ptrString, ptrBool, ptrInt32 are local helpers so tests read naturally. func ptrBool(b bool) *bool { return &b } func ptrFloat64(f float64) *float64 { return &f } @@ -45,6 +73,7 @@ func TestRuntimeNameLabel(t *testing.T) { {name: "empty runtime defaults to llamacpp", runtime: "", expected: "llamacpp"}, {name: "vllm passes through", runtime: "vllm", expected: "vllm"}, {name: "tgi passes through", runtime: "tgi", expected: "tgi"}, + {name: "whisper passes through", runtime: "whisper", expected: "whisper"}, {name: "personaplex passes through", runtime: "personaplex", expected: "personaplex"}, {name: "generic passes through", runtime: "generic", expected: "generic"}, // Future runtimes (vllm-swift on metal, etc.) 
pass through diff --git a/internal/controller/runtime_tgi.go b/internal/controller/runtime_tgi.go index 7b9408c0..4e652f04 100644 --- a/internal/controller/runtime_tgi.go +++ b/internal/controller/runtime_tgi.go @@ -92,7 +92,7 @@ func (b *TGIBackend) BuildProbes(port int32) (*corev1.Probe, *corev1.Probe, *cor return startup, liveness, readiness } -func (b *TGIBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar { +func (b *TGIBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model) []corev1.EnvVar { cfg := isvc.Spec.TGIConfig if cfg != nil && cfg.HFTokenSecretRef != nil { return []corev1.EnvVar{{ diff --git a/internal/controller/runtime_vllm.go b/internal/controller/runtime_vllm.go index c4500559..e2d567f8 100644 --- a/internal/controller/runtime_vllm.go +++ b/internal/controller/runtime_vllm.go @@ -190,7 +190,7 @@ func (b *VLLMBackend) BuildProbes(port int32) (*corev1.Probe, *corev1.Probe, *co return startup, liveness, readiness } -func (b *VLLMBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar { +func (b *VLLMBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model) []corev1.EnvVar { cfg := isvc.Spec.VLLMConfig if cfg != nil && cfg.HFTokenSecretRef != nil { return []corev1.EnvVar{{ diff --git a/internal/controller/runtime_whisper.go b/internal/controller/runtime_whisper.go new file mode 100644 index 00000000..e6fc7413 --- /dev/null +++ b/internal/controller/runtime_whisper.go @@ -0,0 +1,161 @@ +package controller + +import ( + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/intstr" + + inferencev1alpha1 "github.com/defilantech/llmkube/api/v1alpha1" +) + +// WhisperBackend generates container configuration for speaches +// (https://speaches.ai), the faster-whisper OpenAI-compatible audio +// transcription server. 
speaches serves /v1/audio/transcriptions on port 8000, +// is configured entirely via environment variables, and lazy-loads CTranslate2 +// models from HuggingFace per request (so there is no model-init step and the +// model id clients request comes from the referenced Model's spec.source). +type WhisperBackend struct{} + +// whisperImage is the pinned default speaches image. CUDA by default; CPU-only +// deployments should override spec.image with the ...-cpu tag. +const whisperImage = "ghcr.io/speaches-ai/speaches:0.8.3-cuda" + +// whisperHFHome is where speaches' underlying huggingface_hub caches models. +// The image runs as the non-root "ubuntu" user with HOME=/home/ubuntu. +const whisperHFHome = "/home/ubuntu/.cache/huggingface" + +// whisperComputeTypes is the set of CTranslate2 compute types speaches accepts +// (WHISPER__COMPUTE_TYPE). Used to decide whether a Model's quantization string +// can be passed through as a compute type. +var whisperComputeTypes = map[string]struct{}{ + "int8": {}, "int8_float16": {}, "int8_bfloat16": {}, "int8_float32": {}, + "int16": {}, "float16": {}, "bfloat16": {}, "float32": {}, "default": {}, +} + +func (b *WhisperBackend) ContainerName() string { return "speaches" } +func (b *WhisperBackend) DefaultImage() string { return whisperImage } +func (b *WhisperBackend) DefaultPort() int32 { return 8000 } + +// NeedsModelInit is false: speaches downloads the CTranslate2 model from +// HuggingFace at request time, so no model-downloader init container is needed. +func (b *WhisperBackend) NeedsModelInit() bool { return false } + +// DefaultHPAMetric returns "" because speaches exposes no Prometheus queue +// metric to autoscale on. +func (b *WhisperBackend) DefaultHPAMetric() string { return "" } + +// DefaultEndpointPath advertises the OpenAI audio transcription path so the +// status endpoint points clients at the right route. 
+func (b *WhisperBackend) DefaultEndpointPath() string { return "/v1/audio/transcriptions" } + +// BuildArgs returns only the user's extra args: speaches is configured via env +// vars, not CLI flags (see BuildEnv). +func (b *WhisperBackend) BuildArgs(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model, _ string, _ int32) []string { + return isvc.Spec.ExtraArgs +} + +func (b *WhisperBackend) BuildProbes(port int32) (startup, liveness, readiness *corev1.Probe) { + healthGet := func() corev1.ProbeHandler { + return corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/health", + Port: intstr.FromInt32(port), + }, + } + } + startup = &corev1.Probe{ + ProbeHandler: healthGet(), + PeriodSeconds: 10, + TimeoutSeconds: 5, + FailureThreshold: 180, + } + liveness = &corev1.Probe{ + ProbeHandler: healthGet(), + PeriodSeconds: 15, + TimeoutSeconds: 5, + FailureThreshold: 3, + } + readiness = &corev1.Probe{ + ProbeHandler: healthGet(), + PeriodSeconds: 10, + TimeoutSeconds: 5, + FailureThreshold: 3, + } + return startup, liveness, readiness +} + +// BuildEnv translates the Model and WhisperConfig into speaches environment +// variables. Emitted in a stable order so Deployment specs are deterministic. 
+func (b *WhisperBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model) []corev1.EnvVar { + cfg := isvc.Spec.WhisperConfig + + env := []corev1.EnvVar{ + {Name: "HF_HOME", Value: whisperHFHome}, + {Name: "ENABLE_UI", Value: whisperEnableUI(cfg)}, + {Name: "WHISPER__INFERENCE_DEVICE", Value: whisperDevice(cfg, model)}, + } + + if ct := whisperComputeType(cfg, model); ct != "" { + env = append(env, corev1.EnvVar{Name: "WHISPER__COMPUTE_TYPE", Value: ct}) + } + if cfg != nil && cfg.ModelTTLSeconds != nil { + env = append(env, corev1.EnvVar{Name: "WHISPER__TTL", Value: fmt.Sprintf("%d", *cfg.ModelTTLSeconds)}) + } + if cfg != nil && cfg.HFTokenSecretRef != nil { + env = append(env, corev1.EnvVar{ + Name: "HF_TOKEN", + ValueFrom: &corev1.EnvVarSource{SecretKeyRef: cfg.HFTokenSecretRef}, + }) + } + if cfg != nil && cfg.APIKeySecretRef != nil { + env = append(env, corev1.EnvVar{ + Name: "API_KEY", + ValueFrom: &corev1.EnvVarSource{SecretKeyRef: cfg.APIKeySecretRef}, + }) + } + + return env +} + +func whisperEnableUI(cfg *inferencev1alpha1.WhisperConfig) string { + if cfg != nil && cfg.EnableUI != nil && *cfg.EnableUI { + return "true" + } + return "false" +} + +// whisperDevice resolves the speaches inference device: explicit config wins, +// otherwise it is derived from the Model accelerator, defaulting to "auto". +func whisperDevice(cfg *inferencev1alpha1.WhisperConfig, model *inferencev1alpha1.Model) string { + if cfg != nil && cfg.InferenceDevice != "" { + return cfg.InferenceDevice + } + if model != nil && model.Spec.Hardware != nil { + switch strings.ToLower(model.Spec.Hardware.Accelerator) { + case "cuda": + return "cuda" + case "cpu", "metal": + // CTranslate2 has no Metal backend; fall back to CPU. 
+ return "cpu" + } + } + return "auto" +} + +// whisperComputeType resolves WHISPER__COMPUTE_TYPE: explicit config wins, +// otherwise a Model quantization string is passed through only if speaches +// recognizes it as a compute type. Returns "" to use the speaches default. +func whisperComputeType(cfg *inferencev1alpha1.WhisperConfig, model *inferencev1alpha1.Model) string { + if cfg != nil && cfg.ComputeType != "" { + return cfg.ComputeType + } + if model != nil { + q := strings.ToLower(strings.TrimSpace(model.Spec.Quantization)) + if _, ok := whisperComputeTypes[q]; ok { + return q + } + } + return "" +} diff --git a/internal/controller/runtime_whisper_test.go b/internal/controller/runtime_whisper_test.go new file mode 100644 index 00000000..8b1f6b17 --- /dev/null +++ b/internal/controller/runtime_whisper_test.go @@ -0,0 +1,238 @@ +package controller + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + + inferencev1alpha1 "github.com/defilantech/llmkube/api/v1alpha1" +) + +func whisperModel(accelerator, quantization string) *inferencev1alpha1.Model { + m := &inferencev1alpha1.Model{ + Spec: inferencev1alpha1.ModelSpec{ + Source: "Systran/faster-whisper-large-v3", + Quantization: quantization, + }, + } + if accelerator != "" { + m.Spec.Hardware = &inferencev1alpha1.HardwareSpec{Accelerator: accelerator} + } + return m +} + +func TestWhisperBackendBasics(t *testing.T) { + b := &WhisperBackend{} + + if b.ContainerName() != "speaches" { + t.Errorf("ContainerName() = %q, want speaches", b.ContainerName()) + } + if b.DefaultPort() != 8000 { + t.Errorf("DefaultPort() = %d, want 8000", b.DefaultPort()) + } + if b.NeedsModelInit() { + t.Error("NeedsModelInit() = true, want false (speaches fetches from HF at runtime)") + } + if b.DefaultHPAMetric() != "" { + t.Errorf("DefaultHPAMetric() = %q, want empty (speaches exposes no scrapeable queue metric)", b.DefaultHPAMetric()) + } + if got := b.DefaultEndpointPath(); got != "/v1/audio/transcriptions" { + 
t.Errorf("DefaultEndpointPath() = %q, want /v1/audio/transcriptions", got) + } + if img := b.DefaultImage(); img == "" || !containsSubstr(img, "speaches") { + t.Errorf("DefaultImage() = %q, want a pinned speaches image", img) + } +} + +func containsSubstr(s, sub string) bool { + return len(s) >= len(sub) && (s == sub || indexOf(s, sub) >= 0) +} + +func indexOf(s, sub string) int { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return i + } + } + return -1 +} + +func TestWhisperBuildProbes(t *testing.T) { + b := &WhisperBackend{} + startup, liveness, readiness := b.BuildProbes(8000) + for name, p := range map[string]*corev1.Probe{"startup": startup, "liveness": liveness, "readiness": readiness} { + if p == nil || p.HTTPGet == nil { + t.Fatalf("%s probe should be an HTTP GET", name) + continue + } + if p.HTTPGet.Path != "/health" { + t.Errorf("%s probe path = %q, want /health", name, p.HTTPGet.Path) + } + if p.HTTPGet.Port.IntValue() != 8000 { + t.Errorf("%s probe port = %v, want 8000", name, p.HTTPGet.Port) + } + } +} + +func TestWhisperBuildEnv(t *testing.T) { + secretRef := &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "hf"}, + Key: "token", + } + apiRef := &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "api"}, + Key: "key", + } + + tests := []struct { + name string + cfg *inferencev1alpha1.WhisperConfig + model *inferencev1alpha1.Model + wantEnv map[string]string // name -> exact .Value + wantAbsent []string + wantHFSecret bool + wantAPISecret bool + }{ + { + name: "minimal cpu model: HF_HOME, UI off, device cpu, no compute/ttl", + model: whisperModel("cpu", ""), + wantEnv: map[string]string{ + "HF_HOME": "/home/ubuntu/.cache/huggingface", + "ENABLE_UI": "false", + "WHISPER__INFERENCE_DEVICE": "cpu", + }, + wantAbsent: []string{"WHISPER__COMPUTE_TYPE", "WHISPER__TTL", "HF_TOKEN", "API_KEY"}, + }, + { + name: "cuda accelerator maps to cuda device", + model: 
whisperModel("cuda", ""), + wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "cuda"}, + }, + { + name: "metal accelerator maps to cpu", + model: whisperModel("metal", ""), + wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "cpu"}, + }, + { + name: "nil hardware defaults device to auto", + model: whisperModel("", ""), + wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "auto"}, + }, + { + name: "config device overrides model accelerator", + cfg: &inferencev1alpha1.WhisperConfig{InferenceDevice: "auto"}, + model: whisperModel("cuda", ""), + wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "auto"}, + }, + { + name: "explicit compute type wins", + cfg: &inferencev1alpha1.WhisperConfig{ComputeType: "int8_float16"}, + model: whisperModel("cuda", "float16"), + wantEnv: map[string]string{"WHISPER__COMPUTE_TYPE": "int8_float16"}, + }, + { + name: "recognized model quantization becomes compute type", + model: whisperModel("cuda", "float16"), + wantEnv: map[string]string{"WHISPER__COMPUTE_TYPE": "float16"}, + }, + { + name: "unrecognized quantization omits compute type", + model: whisperModel("cuda", "Q4_K_M"), + wantAbsent: []string{"WHISPER__COMPUTE_TYPE"}, + }, + { + name: "model ttl -1 keeps loaded", + cfg: &inferencev1alpha1.WhisperConfig{ModelTTLSeconds: ptrInt32(-1)}, + model: whisperModel("cuda", ""), + wantEnv: map[string]string{"WHISPER__TTL": "-1"}, + }, + { + name: "enable UI true", + cfg: &inferencev1alpha1.WhisperConfig{EnableUI: ptrBool(true)}, + model: whisperModel("cuda", ""), + wantEnv: map[string]string{"ENABLE_UI": "true"}, + }, + { + name: "HF token secret ref", + cfg: &inferencev1alpha1.WhisperConfig{HFTokenSecretRef: secretRef}, + model: whisperModel("cuda", ""), + wantHFSecret: true, + }, + { + name: "API key secret ref", + cfg: &inferencev1alpha1.WhisperConfig{APIKeySecretRef: apiRef}, + model: whisperModel("cuda", ""), + wantAPISecret: true, + }, + } + + b := &WhisperBackend{} + for _, tc := range tests { + t.Run(tc.name, 
func(t *testing.T) { + isvc := &inferencev1alpha1.InferenceService{ + Spec: inferencev1alpha1.InferenceServiceSpec{ + Runtime: "whisper", + WhisperConfig: tc.cfg, + }, + } + env := b.BuildEnv(isvc, tc.model) + + for name, want := range tc.wantEnv { + if !containsEnv(env, name, want) { + t.Errorf("env %s = %q not found; got %+v", name, want, env) + } + } + for _, name := range tc.wantAbsent { + if containsEnv(env, name, "") { + t.Errorf("env %s should be absent; got %+v", name, env) + } + } + if tc.wantHFSecret && envSecretRef(env, "HF_TOKEN") == nil { + t.Errorf("HF_TOKEN should be backed by a secret ref; got %+v", env) + } + if tc.wantAPISecret && envSecretRef(env, "API_KEY") == nil { + t.Errorf("API_KEY should be backed by a secret ref; got %+v", env) + } + }) + } +} + +// TestConstructEndpointRuntimeAwareDefault verifies the runtime-aware default path: +// whisper resolves to the audio endpoint, other runtimes keep the chat endpoint, +// and an explicit spec.endpoint.path always wins. 
+func TestConstructEndpointRuntimeAwareDefault(t *testing.T) { + r := &InferenceServiceReconciler{} + svc := &corev1.Service{} + svc.Name = "demo" + svc.Namespace = "default" + + cases := []struct { + name string + runtime string + path string + wantEnds string + }{ + {name: "whisper default", runtime: "whisper", wantEnds: "/v1/audio/transcriptions"}, + {name: "llamacpp default", runtime: "", wantEnds: "/v1/chat/completions"}, + {name: "vllm default", runtime: "vllm", wantEnds: "/v1/chat/completions"}, + {name: "explicit path wins on whisper", runtime: "whisper", path: "/custom", wantEnds: "/custom"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + isvc := &inferencev1alpha1.InferenceService{ + Spec: inferencev1alpha1.InferenceServiceSpec{Runtime: tc.runtime}, + } + if tc.path != "" { + isvc.Spec.Endpoint = &inferencev1alpha1.EndpointSpec{Path: tc.path} + } + got := r.constructEndpoint(isvc, svc) + if !endsWith(got, tc.wantEnds) { + t.Errorf("constructEndpoint() = %q, want suffix %q", got, tc.wantEnds) + } + }) + } +} + +func endsWith(s, suffix string) bool { + return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix +} diff --git a/internal/controller/status_builder.go b/internal/controller/status_builder.go index 0ce67a83..e7f5cbaa 100644 --- a/internal/controller/status_builder.go +++ b/internal/controller/status_builder.go @@ -80,6 +80,15 @@ func (r *InferenceServiceReconciler) constructEndpoint(isvc *inferencev1alpha1.I port := int32(8080) path := "/v1/chat/completions" + // A backend may declare a different default OpenAI-compatible path (e.g. the + // whisper runtime serves /v1/audio/transcriptions). A user-set + // spec.endpoint.path still wins, checked below. 
+ if ep, ok := resolveBackend(isvc).(EndpointPathProvider); ok { + if p := ep.DefaultEndpointPath(); p != "" { + path = p + } + } + if isvc.Spec.Endpoint != nil { if isvc.Spec.Endpoint.Port > 0 { port = isvc.Spec.Endpoint.Port From dea97cdab7885a26b65b0674947bfa4d90bfa3a4 Mon Sep 17 00:00:00 2001 From: Christopher Maher Date: Tue, 2 Jun 2026 16:29:05 -0700 Subject: [PATCH 2/2] feat(controller): preload whisper model and align service port with backend Two fixes surfaced by live testing the whisper runtime on a GPU cluster: - Service/endpoint port no longer hardcoded to 8080. Add resolveServicePort (containerPort -> endpoint.port -> backend.DefaultPort) and use it in constructService, constructEndpoint, and the deployment builder. Fixes the Service targetPort / container port mismatch for runtimes whose default port is not 8080 (whisper/vllm 8000, tgi 80); llamacpp (8080) is unchanged. - Preload the whisper model. speaches does not download models on the first transcription request (it returns 400 until POST /v1/models/{id}). Add an optional LifecycleProvider interface; WhisperBackend injects a postStart hook that installs model.Spec.Source once the server is healthy, gating Ready on the model being present. The model id is passed via the LLMKUBE_WHISPER_MODEL env var to avoid interpolating CR data into the shell script. Updated the quickstart example (no endpoint block needed) and docs to reflect that the operator preloads the model rather than relying on lazy download. 
Signed-off-by: Christopher Maher --- examples/whisper-quickstart/README.md | 16 ++-- .../whisper-quickstart/inferenceservice.yaml | 9 +-- examples/whisper-quickstart/model.yaml | 9 ++- internal/controller/deployment_builder.go | 12 +-- .../inferenceservice_reconcile_test.go | 7 ++ internal/controller/runtime.go | 22 ++++++ internal/controller/runtime_whisper.go | 30 ++++++++ internal/controller/runtime_whisper_test.go | 75 ++++++++++++++++++- internal/controller/service_builder.go | 5 +- internal/controller/status_builder.go | 11 +-- 10 files changed, 158 insertions(+), 38 deletions(-) diff --git a/examples/whisper-quickstart/README.md b/examples/whisper-quickstart/README.md index f8f10537..a4101627 100644 --- a/examples/whisper-quickstart/README.md +++ b/examples/whisper-quickstart/README.md @@ -11,8 +11,10 @@ runtime, backed by [speaches](https://speaches.ai) (faster-whisper / CTranslate2 port 8000. The operator manages the Deployment, Service, probes (`/health`), GPU -scheduling, and scaling. speaches downloads the CTranslate2 model from -HuggingFace on first request. +scheduling, and scaling. It also preloads the model into speaches via a +postStart hook (speaches does not auto-download on the first request), so the +pod reports Ready only after the preload attempt finishes. The preload is +best-effort: if it fails, transcription returns 400 until the model is installed. ## Apply @@ -38,10 +40,12 @@ The response is OpenAI-compatible JSON (`{"text": "..."}`). The model id in the ## Notes and limitations (v1) -- **First request downloads the model.** speaches fetches the CTranslate2 model - from HuggingFace on demand. Until a persistent cache volume lands (see below), - the model re-downloads on each pod start, so this runtime currently requires - HuggingFace reachability and is not yet air-gapped. +- **The operator preloads the model.** A postStart hook installs it via + `POST /v1/models/{id}` once speaches is healthy; the pod becomes Ready only + after that completes. 
There is no persistent cache yet, so the model + re-downloads on each pod start. This runtime therefore requires HuggingFace + reachability and is not yet air-gapped (persistent cache + air-gapped support + are a tracked follow-up). - **No Prometheus metrics.** speaches exposes none, so the cluster PodMonitor will see 404s scraping `/metrics` for these pods. This is benign. - **CPU-only:** drop the `gpu` resources from `inferenceservice.yaml` and set diff --git a/examples/whisper-quickstart/inferenceservice.yaml b/examples/whisper-quickstart/inferenceservice.yaml index 0ec53629..9a50e489 100644 --- a/examples/whisper-quickstart/inferenceservice.yaml +++ b/examples/whisper-quickstart/inferenceservice.yaml @@ -7,12 +7,9 @@ spec: modelRef: whisper-large-v3 runtime: whisper replicas: 1 - # Endpoint path is optional: the whisper runtime defaults it to - # /v1/audio/transcriptions automatically. Override only if you front it - # differently. - endpoint: - port: 8000 - type: ClusterIP + # No endpoint block needed: the whisper runtime defaults the Service/endpoint + # port to 8000 (speaches) and the path to /v1/audio/transcriptions. Set + # spec.endpoint only to override (custom port, NodePort/LoadBalancer, etc.). whisperConfig: # CTranslate2 compute type. float16 suits most NVIDIA GPUs; use int8 or # int8_float16 to trade a little accuracy for memory/speed. Omit to inherit diff --git a/examples/whisper-quickstart/model.yaml b/examples/whisper-quickstart/model.yaml index 844d737f..28aa042c 100644 --- a/examples/whisper-quickstart/model.yaml +++ b/examples/whisper-quickstart/model.yaml @@ -4,10 +4,11 @@ metadata: name: whisper-large-v3 namespace: default spec: - # speaches loads CTranslate2 / faster-whisper models from HuggingFace by repo - # id. This is the model clients pass in the OpenAI `model` field, and it drives - # the runtime device/compute defaults. The operator does not download it: the - # speaches server fetches it from HuggingFace on first request. 
+ # speaches uses CTranslate2 / faster-whisper models, referenced by HuggingFace + # repo id. This is the model clients pass in the OpenAI `model` field, and it + # drives the runtime device/compute defaults. The operator preloads it into + # speaches via a postStart hook (speaches does not auto-download on first + # request); the pod becomes Ready only after that preload attempt finishes. source: Systran/faster-whisper-large-v3 format: custom hardware: diff --git a/internal/controller/deployment_builder.go b/internal/controller/deployment_builder.go index 963b1da1..5f7c0491 100644 --- a/internal/controller/deployment_builder.go +++ b/internal/controller/deployment_builder.go @@ -163,12 +163,7 @@ func (r *InferenceServiceReconciler) constructDeployment( image = isvc.Spec.Image } - port := backend.DefaultPort() - if isvc.Spec.ContainerPort != nil { - port = *isvc.Spec.ContainerPort - } else if isvc.Spec.Endpoint != nil && isvc.Spec.Endpoint.Port > 0 { - port = isvc.Spec.Endpoint.Port - } + port := resolveServicePort(isvc) skipInit := isvc.Spec.SkipModelInit != nil && *isvc.Spec.SkipModelInit @@ -222,6 +217,11 @@ func (r *InferenceServiceReconciler) constructDeployment( container.Args = args } + // Optional container lifecycle hook (e.g. whisper preloads its model via postStart). + if lp, ok := backend.(LifecycleProvider); ok { + container.Lifecycle = lp.BuildLifecycle(isvc, model, port) + } + // Add runtime-generated env vars, then user-specified env vars (user wins on conflict) if eb, ok := backend.(EnvBuilder); ok { container.Env = append(container.Env, eb.BuildEnv(isvc, model)...) 
diff --git a/internal/controller/inferenceservice_reconcile_test.go b/internal/controller/inferenceservice_reconcile_test.go index eebe8edc..5f577d23 100644 --- a/internal/controller/inferenceservice_reconcile_test.go +++ b/internal/controller/inferenceservice_reconcile_test.go @@ -566,6 +566,13 @@ var _ = Describe("Reconcile lifecycle", func() { Expect(c.Name).To(Equal("speaches")) Expect(c.Ports[0].ContainerPort).To(Equal(int32(8000))) Expect(c.ReadinessProbe.HTTPGet.Path).To(Equal("/health")) + By("verifying the postStart model-preload hook is wired") + Expect(c.Lifecycle).NotTo(BeNil()) + Expect(c.Lifecycle.PostStart.Exec.Command[2]).To(ContainSubstring("/v1/models/$LLMKUBE_WHISPER_MODEL")) + By("verifying the Service port matches the speaches container port (8000)") + svc := &corev1.Service{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, svc)).To(Succeed()) + Expect(svc.Spec.Ports[0].Port).To(Equal(int32(8000))) envNames := map[string]string{} for _, e := range c.Env { diff --git a/internal/controller/runtime.go b/internal/controller/runtime.go index 1504c656..f443d2f4 100644 --- a/internal/controller/runtime.go +++ b/internal/controller/runtime.go @@ -57,6 +57,13 @@ type HPAMetricProvider interface { DefaultHPAMetric() string } +// LifecycleProvider is optionally implemented by backends that need a container +// lifecycle hook (e.g. the whisper runtime preloads its model via a postStart +// hook because speaches does not download models on the first request). +type LifecycleProvider interface { + BuildLifecycle(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model, port int32) *corev1.Lifecycle +} + // ServiceLinksOptOut is optionally implemented by backends that should run with // the legacy Kubernetes service-link env-var injection disabled. 
Returning true // sets `enableServiceLinks: false` on the Pod spec, which suppresses the @@ -99,6 +106,21 @@ func runtimeNameLabel(isvc *inferencev1alpha1.InferenceService) string { return isvc.Spec.Runtime } +// resolveServicePort returns the port the inference container listens on, +// which the Service and the advertised endpoint must match. Precedence: +// spec.containerPort, then spec.endpoint.port, then the backend's DefaultPort. +// This keeps the Service/endpoint aligned with the container for runtimes whose +// default port is not 8080 (e.g. vllm/whisper on 8000, tgi on 80). +func resolveServicePort(isvc *inferencev1alpha1.InferenceService) int32 { + port := resolveBackend(isvc).DefaultPort() + if isvc.Spec.ContainerPort != nil { + port = *isvc.Spec.ContainerPort + } else if isvc.Spec.Endpoint != nil && isvc.Spec.Endpoint.Port > 0 { + port = isvc.Spec.Endpoint.Port + } + return port +} + // resolveGPUCount determines the GPU count from Model spec or InferenceService spec. func resolveGPUCount(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model) int32 { if model.Spec.Hardware != nil && model.Spec.Hardware.GPU != nil && model.Spec.Hardware.GPU.Count > 0 { diff --git a/internal/controller/runtime_whisper.go b/internal/controller/runtime_whisper.go index e6fc7413..23c9d43b 100644 --- a/internal/controller/runtime_whisper.go +++ b/internal/controller/runtime_whisper.go @@ -97,6 +97,13 @@ func (b *WhisperBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, mode {Name: "WHISPER__INFERENCE_DEVICE", Value: whisperDevice(cfg, model)}, } + // LLMKUBE_WHISPER_MODEL is consumed by the postStart preload hook (BuildLifecycle), + // not by speaches itself. speaches does not download models on first request, so the + // hook installs this model id once the server is up. 
+ if model != nil && model.Spec.Source != "" { + env = append(env, corev1.EnvVar{Name: "LLMKUBE_WHISPER_MODEL", Value: model.Spec.Source}) + } + if ct := whisperComputeType(cfg, model); ct != "" { env = append(env, corev1.EnvVar{Name: "WHISPER__COMPUTE_TYPE", Value: ct}) } @@ -119,6 +126,29 @@ func (b *WhisperBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, mode return env } +// BuildLifecycle returns a postStart hook that installs the model into speaches +// once the server is healthy. speaches (v0.8.x) does not download models on the +// first transcription request: it returns 400 until the model is installed via +// POST /v1/models/{id}. The hook keeps the container from reporting Running +// (and therefore Ready) until the install attempt finishes. It is best-effort: +// a failed install is ignored, so the pod can become Ready without the model. +// Returns nil when there is no model source to preload. +func (b *WhisperBackend) BuildLifecycle(_ *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model, port int32) *corev1.Lifecycle { + if model == nil || model.Spec.Source == "" { + return nil + } + // curl is present in the speaches image (its own healthcheck uses it). The + // model id is read from the LLMKUBE_WHISPER_MODEL env var (set by BuildEnv) to + // avoid interpolating CR data into the shell script. 
+ script := fmt.Sprintf(`for i in $(seq 1 90); do curl -sf -m 5 http://localhost:%d/health >/dev/null 2>&1 && break; sleep 2; done +curl -sf -m 1800 -X POST "http://localhost:%d/v1/models/$LLMKUBE_WHISPER_MODEL" >/dev/null 2>&1 || true`, port, port) + return &corev1.Lifecycle{ + PostStart: &corev1.LifecycleHandler{ + Exec: &corev1.ExecAction{Command: []string{"sh", "-c", script}}, + }, + } +} + func whisperEnableUI(cfg *inferencev1alpha1.WhisperConfig) string { if cfg != nil && cfg.EnableUI != nil && *cfg.EnableUI { return "true" diff --git a/internal/controller/runtime_whisper_test.go b/internal/controller/runtime_whisper_test.go index 8b1f6b17..42f39a16 100644 --- a/internal/controller/runtime_whisper_test.go +++ b/internal/controller/runtime_whisper_test.go @@ -100,6 +100,7 @@ func TestWhisperBuildEnv(t *testing.T) { "HF_HOME": "/home/ubuntu/.cache/huggingface", "ENABLE_UI": "false", "WHISPER__INFERENCE_DEVICE": "cpu", + "LLMKUBE_WHISPER_MODEL": "Systran/faster-whisper-large-v3", }, wantAbsent: []string{"WHISPER__COMPUTE_TYPE", "WHISPER__TTL", "HF_TOKEN", "API_KEY"}, }, @@ -197,6 +198,37 @@ func TestWhisperBuildEnv(t *testing.T) { } } +func TestWhisperBuildLifecycle(t *testing.T) { + b := &WhisperBackend{} + + t.Run("postStart preloads the model", func(t *testing.T) { + isvc := &inferencev1alpha1.InferenceService{ + Spec: inferencev1alpha1.InferenceServiceSpec{Runtime: "whisper"}, + } + lc := b.BuildLifecycle(isvc, whisperModel("cuda", ""), 8000) + if lc == nil || lc.PostStart == nil || lc.PostStart.Exec == nil { + t.Fatal("expected a postStart exec hook") + } + cmd := lc.PostStart.Exec.Command + if len(cmd) != 3 || cmd[0] != "sh" || cmd[1] != "-c" { + t.Fatalf("expected sh -c