defilantech · Defilan · Jun 2, 2026 · Jun 2, 2026
@@ -61,7 +61,8 @@ type InferenceServiceSpec struct {
 	// "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server.
 	// "vllm": vLLM OpenAI-compatible server with PagedAttention.
 	// "tgi": HuggingFace Text Generation Inference server.
-	// +kubebuilder:validation:Enum=llamacpp;personaplex;vllm;tgi;generic
+	// "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server.
+	// +kubebuilder:validation:Enum=llamacpp;personaplex;vllm;tgi;whisper;generic
 	// +kubebuilder:default=llamacpp
 	// +optional
 	Runtime string `json:"runtime,omitempty"`
@@ -338,6 +339,11 @@ type InferenceServiceSpec struct {
 	// +optional
 	TGIConfig *TGIConfig `json:"tgiConfig,omitempty"`
 
+	// WhisperConfig holds configuration for the whisper (speaches) runtime.
+	// Only used when Runtime is "whisper".
+	// +optional
+	WhisperConfig *WhisperConfig `json:"whisperConfig,omitempty"`
+
 	// ImagePullSecrets for pulling container images from private registries.
 	// +optional
 	ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"`
@@ -386,8 +392,10 @@ type EndpointSpec struct {
 	// +optional
 	Port int32 `json:"port,omitempty"`
 
-	// Path is the HTTP path for the inference endpoint
-	// +kubebuilder:default="/v1/chat/completions"
+	// Path is the HTTP path for the inference endpoint. When unset, the
+	// effective default is the runtime's OpenAI-compatible path
+	// (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
+	// whisper runtime), resolved when the status endpoint is constructed.
 	// +optional
 	Path string `json:"path,omitempty"`
 
@@ -667,6 +675,45 @@ type TGIConfig struct {
 	HFTokenSecretRef *corev1.SecretKeySelector `json:"hfTokenSecretRef,omitempty"`
 }
 
+// WhisperConfig holds deploy-time server settings for the whisper (speaches)
+// runtime and is only used when spec.runtime is "whisper". speaches selects the
+// model, language, and task per request, so those are NOT server config; the
+// model id clients request comes from the referenced Model's spec.source.
+type WhisperConfig struct {
+	// ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE).
+	// When unset, falls back to a recognized Model spec.quantization, else the speaches default.
+	// +kubebuilder:validation:Enum=int8;int8_float16;int8_bfloat16;int8_float32;int16;float16;bfloat16;float32;default
+	// +optional
+	ComputeType string `json:"computeType,omitempty"`
+
+	// InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE).
+	// When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda,
+	// cpu/metal -> cpu), defaulting to auto.
+	// +kubebuilder:validation:Enum=auto;cuda;cpu
+	// +optional
+	InferenceDevice string `json:"inferenceDevice,omitempty"`
+
+	// ModelTTLSeconds is how long an idle model stays loaded before being unloaded
+	// (speaches WHISPER__TTL). -1 keeps models loaded indefinitely.
+	// +kubebuilder:validation:Minimum=-1
+	// +optional
+	ModelTTLSeconds *int32 `json:"modelTTLSeconds,omitempty"`
+
+	// EnableUI exposes the speaches Gradio web UI. Defaults to false.
+	// +optional
+	EnableUI *bool `json:"enableUI,omitempty"`
+
+	// HFTokenSecretRef references a Secret containing a HuggingFace token, used to
+	// download gated CTranslate2 models.
+	// +optional
+	HFTokenSecretRef *corev1.SecretKeySelector `json:"hfTokenSecretRef,omitempty"`
+
+	// APIKeySecretRef references a Secret containing an API key speaches will require
+	// on requests (sets the speaches API_KEY).
+	// +optional
+	APIKeySecretRef *corev1.SecretKeySelector `json:"apiKeySecretRef,omitempty"`
+}
+
 // InferenceServiceStatus defines the observed state of InferenceService.
 type InferenceServiceStatus struct {
 	// Phase represents the current lifecycle phase of the InferenceService.

@@ -224,8 +224,11 @@ spec:
                 description: Endpoint defines the service endpoint configuration
                 properties:
                   path:
-                    default: /v1/chat/completions
-                    description: Path is the HTTP path for the inference endpoint
+                    description: |-
+                      Path is the HTTP path for the inference endpoint. When unset, the
+                      effective default is the runtime's OpenAI-compatible path
+                      (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
+                      whisper runtime), resolved when the status endpoint is constructed.
                     type: string
                   port:
                     default: 8080
@@ -1398,11 +1401,13 @@ spec:
                   "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server.
                   "vllm": vLLM OpenAI-compatible server with PagedAttention.
                   "tgi": HuggingFace Text Generation Inference server.
+                  "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server.
                 enum:
                 - llamacpp
                 - personaplex
                 - vllm
                 - tgi
+                - whisper
                 - generic
                 type: string
               runtimeClassName:
@@ -1910,6 +1915,100 @@ spec:
                     format: int32
                     type: integer
                 type: object
+              whisperConfig:
+                description: |-
+                  WhisperConfig holds configuration for the whisper (speaches) runtime.
+                  Only used when Runtime is "whisper".
+                properties:
+                  apiKeySecretRef:
+                    description: |-
+                      APIKeySecretRef references a Secret containing an API key speaches will require
+                      on requests (sets the speaches API_KEY).
+                    properties:
+                      key:
+                        description: The key of the secret to select from.  Must be
+                          a valid secret key.
+                        type: string
+                      name:
+                        default: ""
+                        description: |-
+                          Name of the referent.
+                          This field is effectively required, but due to backwards compatibility is
+                          allowed to be empty. Instances of this type with an empty value here are
+                          almost certainly wrong.
+                          More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                        type: string
+                      optional:
+                        description: Specify whether the Secret or its key must be
+                          defined
+                        type: boolean
+                    required:
+                    - key
+                    type: object
+                    x-kubernetes-map-type: atomic
+                  computeType:
+                    description: |-
+                      ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE).
+                      When unset, falls back to a recognized Model spec.quantization, else the speaches default.
+                    enum:
+                    - int8
+                    - int8_float16
+                    - int8_bfloat16
+                    - int8_float32
+                    - int16
+                    - float16
+                    - bfloat16
+                    - float32
+                    - default
+                    type: string
+                  enableUI:
+                    description: EnableUI exposes the speaches Gradio web UI. Defaults
+                      to false.
+                    type: boolean
+                  hfTokenSecretRef:
+                    description: |-
+                      HFTokenSecretRef references a Secret containing a HuggingFace token, used to
+                      download gated CTranslate2 models.
+                    properties:
+                      key:
+                        description: The key of the secret to select from.  Must be
+                          a valid secret key.
+                        type: string
+                      name:
+                        default: ""
+                        description: |-
+                          Name of the referent.
+                          This field is effectively required, but due to backwards compatibility is
+                          allowed to be empty. Instances of this type with an empty value here are
+                          almost certainly wrong.
+                          More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                        type: string
+                      optional:
+                        description: Specify whether the Secret or its key must be
+                          defined
+                        type: boolean
+                    required:
+                    - key
+                    type: object
+                    x-kubernetes-map-type: atomic
+                  inferenceDevice:
+                    description: |-
+                      InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE).
+                      When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda,
+                      cpu/metal -> cpu), defaulting to auto.
+                    enum:
+                    - auto
+                    - cuda
+                    - cpu
+                    type: string
+                  modelTTLSeconds:
+                    description: |-
+                      ModelTTLSeconds is how long an idle model stays loaded before being unloaded
+                      (speaches WHISPER__TTL). -1 keeps models loaded indefinitely.
+                    format: int32
+                    minimum: -1
+                    type: integer
+                type: object
             required:
             - modelRef
             type: object

@@ -238,8 +238,11 @@ spec:
                   through. Mirrors the shape used by InferenceService.
                 properties:
                   path:
-                    default: /v1/chat/completions
-                    description: Path is the HTTP path for the inference endpoint
+                    description: |-
+                      Path is the HTTP path for the inference endpoint. When unset, the
+                      effective default is the runtime's OpenAI-compatible path
+                      (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
+                      whisper runtime), resolved when the status endpoint is constructed.
                     type: string
                   port:
                     default: 8080