diff --git a/api/v1alpha1/inferenceservice_types.go b/api/v1alpha1/inferenceservice_types.go
index 6b48f994..b448c352 100644
--- a/api/v1alpha1/inferenceservice_types.go
+++ b/api/v1alpha1/inferenceservice_types.go
@@ -61,7 +61,8 @@ type InferenceServiceSpec struct {
 	// "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server.
 	// "vllm": vLLM OpenAI-compatible server with PagedAttention.
 	// "tgi": HuggingFace Text Generation Inference server.
-	// +kubebuilder:validation:Enum=llamacpp;personaplex;vllm;tgi;generic
+	// "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server.
+	// +kubebuilder:validation:Enum=llamacpp;personaplex;vllm;tgi;whisper;generic
 	// +kubebuilder:default=llamacpp
 	// +optional
 	Runtime string `json:"runtime,omitempty"`
@@ -338,6 +339,11 @@ type InferenceServiceSpec struct {
 	// +optional
 	TGIConfig *TGIConfig `json:"tgiConfig,omitempty"`
 
+	// WhisperConfig holds configuration for the whisper (speaches) runtime.
+	// Only used when Runtime is "whisper".
+	// +optional
+	WhisperConfig *WhisperConfig `json:"whisperConfig,omitempty"`
+
 	// ImagePullSecrets for pulling container images from private registries.
 	// +optional
 	ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"`
@@ -386,8 +392,10 @@ type EndpointSpec struct {
 	// +optional
 	Port int32 `json:"port,omitempty"`
 
-	// Path is the HTTP path for the inference endpoint
-	// +kubebuilder:default="/v1/chat/completions"
+	// Path is the HTTP path for the inference endpoint. When unset, the
+	// effective default is the runtime's OpenAI-compatible path
+	// (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
+	// whisper runtime), resolved when the status endpoint is constructed.
 	// +optional
 	Path string `json:"path,omitempty"`
 
@@ -667,6 +675,45 @@ type TGIConfig struct {
 	HFTokenSecretRef *corev1.SecretKeySelector `json:"hfTokenSecretRef,omitempty"`
 }
 
+// WhisperConfig holds deploy-time server settings for the whisper (speaches) runtime; every
+// field is optional and is handed to speaches through env vars rather than CLI args. speaches
+// selects the model, language, and task per request, so those are NOT server config; the
+// model id clients request comes from the referenced Model's spec.source.
+type WhisperConfig struct {
+	// ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE).
+	// When unset, falls back to a recognized Model spec.quantization, else the speaches default.
+	// +kubebuilder:validation:Enum=int8;int8_float16;int8_bfloat16;int8_float32;int16;float16;bfloat16;float32;default
+	// +optional
+	ComputeType string `json:"computeType,omitempty"`
+
+	// InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE).
+	// When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda,
+	// cpu/metal -> cpu), defaulting to auto.
+	// +kubebuilder:validation:Enum=auto;cuda;cpu
+	// +optional
+	InferenceDevice string `json:"inferenceDevice,omitempty"`
+
+	// ModelTTLSeconds is how long an idle model stays loaded before being unloaded
+	// (speaches WHISPER__TTL). -1 keeps models loaded indefinitely.
+	// +kubebuilder:validation:Minimum=-1
+	// +optional
+	ModelTTLSeconds *int32 `json:"modelTTLSeconds,omitempty"`
+
+	// EnableUI exposes the speaches Gradio web UI. Defaults to false.
+	// +optional
+	EnableUI *bool `json:"enableUI,omitempty"`
+
+	// HFTokenSecretRef references a Secret containing a HuggingFace token, used to
+	// download gated CTranslate2 models.
+	// +optional
+	HFTokenSecretRef *corev1.SecretKeySelector `json:"hfTokenSecretRef,omitempty"`
+
+	// APIKeySecretRef references a Secret containing an API key speaches will require
+	// on requests (sets the speaches API_KEY).
+	// +optional
+	APIKeySecretRef *corev1.SecretKeySelector `json:"apiKeySecretRef,omitempty"`
+}
+
 // InferenceServiceStatus defines the observed state of InferenceService.
 type InferenceServiceStatus struct {
 	// Phase represents the current lifecycle phase of the InferenceService.
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index d3486d39..276a1b0d 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -524,6 +524,11 @@ func (in *InferenceServiceSpec) DeepCopyInto(out *InferenceServiceSpec) {
 		*out = new(TGIConfig)
 		(*in).DeepCopyInto(*out)
 	}
+	if in.WhisperConfig != nil {
+		in, out := &in.WhisperConfig, &out.WhisperConfig
+		*out = new(WhisperConfig)
+		(*in).DeepCopyInto(*out)
+	}
 	if in.ImagePullSecrets != nil {
 		in, out := &in.ImagePullSecrets, &out.ImagePullSecrets
 		*out = make([]v1.LocalObjectReference, len(*in))
@@ -1327,3 +1332,38 @@ func (in *VLLMConfig) DeepCopy() *VLLMConfig {
 	in.DeepCopyInto(out)
 	return out
 }
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *WhisperConfig) DeepCopyInto(out *WhisperConfig) {
+	*out = *in
+	if in.ModelTTLSeconds != nil {
+		in, out := &in.ModelTTLSeconds, &out.ModelTTLSeconds
+		*out = new(int32)
+		**out = **in
+	}
+	if in.EnableUI != nil {
+		in, out := &in.EnableUI, &out.EnableUI
+		*out = new(bool)
+		**out = **in
+	}
+	if in.HFTokenSecretRef != nil {
+		in, out := &in.HFTokenSecretRef, &out.HFTokenSecretRef
+		*out = new(v1.SecretKeySelector)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.APIKeySecretRef != nil {
+		in, out := &in.APIKeySecretRef, &out.APIKeySecretRef
+		*out = new(v1.SecretKeySelector)
+		(*in).DeepCopyInto(*out)
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WhisperConfig.
+func (in *WhisperConfig) DeepCopy() *WhisperConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(WhisperConfig)
+	in.DeepCopyInto(out)
+	return out
+}
diff --git a/charts/llmkube/templates/crds/inferenceservices.yaml b/charts/llmkube/templates/crds/inferenceservices.yaml
index 8a56f713..7e8a0dda 100644
--- a/charts/llmkube/templates/crds/inferenceservices.yaml
+++ b/charts/llmkube/templates/crds/inferenceservices.yaml
@@ -224,8 +224,11 @@ spec:
                 description: Endpoint defines the service endpoint configuration
                 properties:
                   path:
-                    default: /v1/chat/completions
-                    description: Path is the HTTP path for the inference endpoint
+                    description: |-
+                      Path is the HTTP path for the inference endpoint. When unset, the
+                      effective default is the runtime's OpenAI-compatible path
+                      (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
+                      whisper runtime), resolved when the status endpoint is constructed.
                     type: string
                   port:
                     default: 8080
@@ -1398,11 +1401,13 @@ spec:
                   "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server.
                   "vllm": vLLM OpenAI-compatible server with PagedAttention.
                   "tgi": HuggingFace Text Generation Inference server.
+                  "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server.
                 enum:
                 - llamacpp
                 - personaplex
                 - vllm
                 - tgi
+                - whisper
                 - generic
                 type: string
               runtimeClassName:
@@ -1910,6 +1915,100 @@ spec:
                     format: int32
                     type: integer
                 type: object
+              whisperConfig:
+                description: |-
+                  WhisperConfig holds configuration for the whisper (speaches) runtime.
+                  Only used when Runtime is "whisper".
+                properties:
+                  apiKeySecretRef:
+                    description: |-
+                      APIKeySecretRef references a Secret containing an API key speaches will require
+                      on requests (sets the speaches API_KEY).
+                    properties:
+                      key:
+                        description: The key of the secret to select from.  Must be
+                          a valid secret key.
+                        type: string
+                      name:
+                        default: ""
+                        description: |-
+                          Name of the referent.
+                          This field is effectively required, but due to backwards compatibility is
+                          allowed to be empty. Instances of this type with an empty value here are
+                          almost certainly wrong.
+                          More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                        type: string
+                      optional:
+                        description: Specify whether the Secret or its key must be
+                          defined
+                        type: boolean
+                    required:
+                    - key
+                    type: object
+                    x-kubernetes-map-type: atomic
+                  computeType:
+                    description: |-
+                      ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE).
+                      When unset, falls back to a recognized Model spec.quantization, else the speaches default.
+                    enum:
+                    - int8
+                    - int8_float16
+                    - int8_bfloat16
+                    - int8_float32
+                    - int16
+                    - float16
+                    - bfloat16
+                    - float32
+                    - default
+                    type: string
+                  enableUI:
+                    description: EnableUI exposes the speaches Gradio web UI. Defaults
+                      to false.
+                    type: boolean
+                  hfTokenSecretRef:
+                    description: |-
+                      HFTokenSecretRef references a Secret containing a HuggingFace token, used to
+                      download gated CTranslate2 models.
+                    properties:
+                      key:
+                        description: The key of the secret to select from.  Must be
+                          a valid secret key.
+                        type: string
+                      name:
+                        default: ""
+                        description: |-
+                          Name of the referent.
+                          This field is effectively required, but due to backwards compatibility is
+                          allowed to be empty. Instances of this type with an empty value here are
+                          almost certainly wrong.
+                          More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                        type: string
+                      optional:
+                        description: Specify whether the Secret or its key must be
+                          defined
+                        type: boolean
+                    required:
+                    - key
+                    type: object
+                    x-kubernetes-map-type: atomic
+                  inferenceDevice:
+                    description: |-
+                      InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE).
+                      When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda,
+                      cpu/metal -> cpu), defaulting to auto.
+                    enum:
+                    - auto
+                    - cuda
+                    - cpu
+                    type: string
+                  modelTTLSeconds:
+                    description: |-
+                      ModelTTLSeconds is how long an idle model stays loaded before being unloaded
+                      (speaches WHISPER__TTL). -1 keeps models loaded indefinitely.
+                    format: int32
+                    minimum: -1
+                    type: integer
+                type: object
             required:
             - modelRef
             type: object
diff --git a/charts/llmkube/templates/crds/modelrouters.yaml b/charts/llmkube/templates/crds/modelrouters.yaml
index d2c168a6..b7a330c9 100644
--- a/charts/llmkube/templates/crds/modelrouters.yaml
+++ b/charts/llmkube/templates/crds/modelrouters.yaml
@@ -238,8 +238,11 @@ spec:
                   through. Mirrors the shape used by InferenceService.
                 properties:
                   path:
-                    default: /v1/chat/completions
-                    description: Path is the HTTP path for the inference endpoint
+                    description: |-
+                      Path is the HTTP path for the inference endpoint. When unset, the
+                      effective default is the runtime's OpenAI-compatible path
+                      (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
+                      whisper runtime), resolved when the status endpoint is constructed.
                     type: string
                   port:
                     default: 8080
diff --git a/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml b/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml
index 60e12422..1825997a 100644
--- a/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml
+++ b/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml
@@ -220,8 +220,11 @@ spec:
                 description: Endpoint defines the service endpoint configuration
                 properties:
                   path:
-                    default: /v1/chat/completions
-                    description: Path is the HTTP path for the inference endpoint
+                    description: |-
+                      Path is the HTTP path for the inference endpoint. When unset, the
+                      effective default is the runtime's OpenAI-compatible path
+                      (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
+                      whisper runtime), resolved when the status endpoint is constructed.
                     type: string
                   port:
                     default: 8080
@@ -1394,11 +1397,13 @@ spec:
                   "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server.
                   "vllm": vLLM OpenAI-compatible server with PagedAttention.
                   "tgi": HuggingFace Text Generation Inference server.
+                  "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server.
                 enum:
                 - llamacpp
                 - personaplex
                 - vllm
                 - tgi
+                - whisper
                 - generic
                 type: string
               runtimeClassName:
@@ -1906,6 +1911,100 @@ spec:
                     format: int32
                     type: integer
                 type: object
+              whisperConfig:
+                description: |-
+                  WhisperConfig holds configuration for the whisper (speaches) runtime.
+                  Only used when Runtime is "whisper".
+                properties:
+                  apiKeySecretRef:
+                    description: |-
+                      APIKeySecretRef references a Secret containing an API key speaches will require
+                      on requests (sets the speaches API_KEY).
+                    properties:
+                      key:
+                        description: The key of the secret to select from.  Must be
+                          a valid secret key.
+                        type: string
+                      name:
+                        default: ""
+                        description: |-
+                          Name of the referent.
+                          This field is effectively required, but due to backwards compatibility is
+                          allowed to be empty. Instances of this type with an empty value here are
+                          almost certainly wrong.
+                          More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                        type: string
+                      optional:
+                        description: Specify whether the Secret or its key must be
+                          defined
+                        type: boolean
+                    required:
+                    - key
+                    type: object
+                    x-kubernetes-map-type: atomic
+                  computeType:
+                    description: |-
+                      ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE).
+                      When unset, falls back to a recognized Model spec.quantization, else the speaches default.
+                    enum:
+                    - int8
+                    - int8_float16
+                    - int8_bfloat16
+                    - int8_float32
+                    - int16
+                    - float16
+                    - bfloat16
+                    - float32
+                    - default
+                    type: string
+                  enableUI:
+                    description: EnableUI exposes the speaches Gradio web UI. Defaults
+                      to false.
+                    type: boolean
+                  hfTokenSecretRef:
+                    description: |-
+                      HFTokenSecretRef references a Secret containing a HuggingFace token, used to
+                      download gated CTranslate2 models.
+                    properties:
+                      key:
+                        description: The key of the secret to select from.  Must be
+                          a valid secret key.
+                        type: string
+                      name:
+                        default: ""
+                        description: |-
+                          Name of the referent.
+                          This field is effectively required, but due to backwards compatibility is
+                          allowed to be empty. Instances of this type with an empty value here are
+                          almost certainly wrong.
+                          More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                        type: string
+                      optional:
+                        description: Specify whether the Secret or its key must be
+                          defined
+                        type: boolean
+                    required:
+                    - key
+                    type: object
+                    x-kubernetes-map-type: atomic
+                  inferenceDevice:
+                    description: |-
+                      InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE).
+                      When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda,
+                      cpu/metal -> cpu), defaulting to auto.
+                    enum:
+                    - auto
+                    - cuda
+                    - cpu
+                    type: string
+                  modelTTLSeconds:
+                    description: |-
+                      ModelTTLSeconds is how long an idle model stays loaded before being unloaded
+                      (speaches WHISPER__TTL). -1 keeps models loaded indefinitely.
+                    format: int32
+                    minimum: -1
+                    type: integer
+                type: object
             required:
             - modelRef
             type: object
diff --git a/config/crd/bases/inference.llmkube.dev_modelrouters.yaml b/config/crd/bases/inference.llmkube.dev_modelrouters.yaml
index ca5c3945..42bb22e7 100644
--- a/config/crd/bases/inference.llmkube.dev_modelrouters.yaml
+++ b/config/crd/bases/inference.llmkube.dev_modelrouters.yaml
@@ -234,8 +234,11 @@ spec:
                   through. Mirrors the shape used by InferenceService.
                 properties:
                   path:
-                    default: /v1/chat/completions
-                    description: Path is the HTTP path for the inference endpoint
+                    description: |-
+                      Path is the HTTP path for the inference endpoint. When unset, the
+                      effective default is the runtime's OpenAI-compatible path
+                      (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
+                      whisper runtime), resolved when the status endpoint is constructed.
                     type: string
                   port:
                     default: 8080
diff --git a/docs/contributors/adding-a-runtime.md b/docs/contributors/adding-a-runtime.md
index ff435777..58124f97 100644
--- a/docs/contributors/adding-a-runtime.md
+++ b/docs/contributors/adding-a-runtime.md
@@ -104,4 +104,11 @@ Add `--runtime yourengine` handling in `pkg/cli/deploy.go`.
 | `personaplex` | PersonaPlex/Moshi | 8998 | TCP socket | No | — |
 | `vllm` | vLLM | 8000 | HTTP /health | Yes | vllm:num_requests_running |
 | `tgi` | TGI | 80 | HTTP /health | No (HF download) | tgi:queue_size |
+| `whisper` | speaches (faster-whisper) | 8000 | HTTP /health | No (HF download) | — |
 | `generic` | Any container | 8080 | TCP socket | No | — |
+
+The `whisper` runtime serves the OpenAI audio API (`/v1/audio/transcriptions`)
+rather than `/v1/chat/completions`; it declares this via the optional
+`EndpointPathProvider` interface, which `constructEndpoint` consults for the
+default status endpoint path. It is configured entirely through env vars
+(`EnvBuilder`), not CLI args.
diff --git a/examples/whisper-quickstart/README.md b/examples/whisper-quickstart/README.md
new file mode 100644
index 00000000..a4101627
--- /dev/null
+++ b/examples/whisper-quickstart/README.md
@@ -0,0 +1,54 @@
+# Whisper (speaches) audio transcription quickstart
+
+Deploys an OpenAI-compatible audio transcription service using the `whisper`
+runtime, backed by [speaches](https://speaches.ai) (faster-whisper / CTranslate2).
+
+## What you get
+
+- A `Model` referencing a faster-whisper CTranslate2 HuggingFace repo.
+- An `InferenceService` with `runtime: whisper` that serves
+  `POST /v1/audio/transcriptions` (and `/v1/audio/translations`) on a ClusterIP,
+  port 8000.
+
+The operator manages the Deployment, Service, probes (`/health`), GPU
+scheduling, and scaling. It also preloads the model into speaches via a
+postStart hook (speaches does not auto-download on the first request), so the
+pod reports Ready only once the model is installed and transcription will
+succeed.
+
+## Apply
+
+```bash
+kubectl apply -f model.yaml
+kubectl apply -f inferenceservice.yaml
+kubectl get inferenceservice whisper -o jsonpath='{.status.endpoint}'
+# -> http://whisper.default.svc.cluster.local:8000/v1/audio/transcriptions
+```
+
+## Try it
+
+Via `kubectl port-forward svc/whisper 8000:8000` (from a pod in the cluster, replace `localhost:8000` with `whisper.default.svc.cluster.local:8000`):
+
+```bash
+curl -s http://localhost:8000/v1/audio/transcriptions \
+  -F file=@sample.wav \
+  -F model=Systran/faster-whisper-large-v3
+```
+
+The response is OpenAI-compatible JSON (`{"text": "..."}`). The model id in the
+`model` field must match the `Model`'s `spec.source`.
+
+## Notes and limitations (v1)
+
+- **The operator preloads the model.** A postStart hook installs it via
+  `POST /v1/models/{id}` once speaches is healthy; the pod becomes Ready only
+  after that completes. There is no persistent cache yet, so the model
+  re-downloads on each pod start. This runtime therefore requires HuggingFace
+  reachability and is not yet air-gapped (persistent cache + air-gapped support
+  are a tracked follow-up).
+- **No Prometheus metrics.** speaches exposes none, so the cluster PodMonitor
+  will see 404s scraping `/metrics` for these pods. This is benign.
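+- **CPU-only:** drop the `gpu` resources from `inferenceservice.yaml`, set
+  `image: ghcr.io/speaches-ai/speaches:0.8.3-cpu`, and set `whisperConfig.inferenceDevice: cpu` (otherwise the example Model's `accelerator: cuda` selects `cuda`).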
+- **Gated models / auth:** set `whisperConfig.hfTokenSecretRef` to download gated
+  repos, and `whisperConfig.apiKeySecretRef` to require an API key on requests.
diff --git a/examples/whisper-quickstart/inferenceservice.yaml b/examples/whisper-quickstart/inferenceservice.yaml
new file mode 100644
index 00000000..9a50e489
--- /dev/null
+++ b/examples/whisper-quickstart/inferenceservice.yaml
@@ -0,0 +1,28 @@
+apiVersion: inference.llmkube.dev/v1alpha1
+kind: InferenceService
+metadata:
+  name: whisper
+  namespace: default
+spec:
+  modelRef: whisper-large-v3
+  runtime: whisper
+  replicas: 1
+  # No endpoint block needed: the whisper runtime defaults the Service/endpoint
+  # port to 8000 (speaches) and the path to /v1/audio/transcriptions. Set
+  # spec.endpoint only to override (custom port, NodePort/LoadBalancer, etc.).
+  whisperConfig:
+    # CTranslate2 compute type. float16 suits most NVIDIA GPUs; use int8 or
+    # int8_float16 to trade a little accuracy for memory/speed. Omit to inherit
+    # the model's quantization (when recognized) or the speaches default.
+    computeType: float16
+    # Optional: derived from the Model accelerator when omitted (cuda here).
+    # inferenceDevice: cuda
+    # Optional: keep idle models loaded (-1) instead of unloading after 300s.
+    # modelTTLSeconds: -1
+  resources:
+    gpu: 1
+    gpuMemory: "8Gi"
+    cpu: "2"
+    memory: "4Gi"
+  # CPU-only deployments: drop the gpu resources above, set whisperConfig.inferenceDevice: cpu, and override the image:
+  # image: ghcr.io/speaches-ai/speaches:0.8.3-cpu
diff --git a/examples/whisper-quickstart/model.yaml b/examples/whisper-quickstart/model.yaml
new file mode 100644
index 00000000..28aa042c
--- /dev/null
+++ b/examples/whisper-quickstart/model.yaml
@@ -0,0 +1,23 @@
+apiVersion: inference.llmkube.dev/v1alpha1
+kind: Model
+metadata:
+  name: whisper-large-v3
+  namespace: default
+spec:
+  # speaches uses CTranslate2 / faster-whisper models, referenced by HuggingFace
+  # repo id. This is the model clients pass in the OpenAI `model` field, and it
+  # drives the runtime device/compute defaults. The operator preloads it into
+  # speaches via a postStart hook (speaches does not auto-download on first
+  # request), so the pod becomes Ready only once the model is installed.
+  source: Systran/faster-whisper-large-v3
+  format: custom
+  hardware:
+    accelerator: cuda
+    gpu:
+      enabled: true
+      count: 1
+      vendor: nvidia
+      memory: "8Gi"
+  resources:
+    cpu: "2"
+    memory: "4Gi"
diff --git a/internal/controller/deployment_builder.go b/internal/controller/deployment_builder.go
index 1001a370..5f7c0491 100644
--- a/internal/controller/deployment_builder.go
+++ b/internal/controller/deployment_builder.go
@@ -163,12 +163,7 @@ func (r *InferenceServiceReconciler) constructDeployment(
 		image = isvc.Spec.Image
 	}
 
-	port := backend.DefaultPort()
-	if isvc.Spec.ContainerPort != nil {
-		port = *isvc.Spec.ContainerPort
-	} else if isvc.Spec.Endpoint != nil && isvc.Spec.Endpoint.Port > 0 {
-		port = isvc.Spec.Endpoint.Port
-	}
+	port := resolveServicePort(isvc)
 
 	skipInit := isvc.Spec.SkipModelInit != nil && *isvc.Spec.SkipModelInit
 
@@ -222,9 +217,14 @@ func (r *InferenceServiceReconciler) constructDeployment(
 		container.Args = args
 	}
 
+	// Optional container lifecycle hook (e.g. whisper preloads its model via postStart).
+	if lp, ok := backend.(LifecycleProvider); ok {
+		container.Lifecycle = lp.BuildLifecycle(isvc, model, port)
+	}
+
 	// Add runtime-generated env vars, then user-specified env vars (user wins on conflict)
 	if eb, ok := backend.(EnvBuilder); ok {
-		container.Env = append(container.Env, eb.BuildEnv(isvc)...)
+		container.Env = append(container.Env, eb.BuildEnv(isvc, model)...)
 	}
 	if len(isvc.Spec.Env) > 0 {
 		container.Env = append(container.Env, isvc.Spec.Env...)
diff --git a/internal/controller/inferenceservice_deployment_test.go b/internal/controller/inferenceservice_deployment_test.go
index fae2f907..8c29a7da 100644
--- a/internal/controller/inferenceservice_deployment_test.go
+++ b/internal/controller/inferenceservice_deployment_test.go
@@ -3720,7 +3720,7 @@ var _ = Describe("RuntimeBackend interface", func() {
 					},
 				},
 			}
-			env := backend.BuildEnv(isvc)
+			env := backend.BuildEnv(isvc, nil)
 			Expect(env).To(HaveLen(2))
 			Expect(env[0].Name).To(Equal("HF_TOKEN"))
 			Expect(env[0].ValueFrom.SecretKeyRef.Name).To(Equal("hf-token"))
diff --git a/internal/controller/inferenceservice_reconcile_test.go b/internal/controller/inferenceservice_reconcile_test.go
index 5659fc45..5f577d23 100644
--- a/internal/controller/inferenceservice_reconcile_test.go
+++ b/internal/controller/inferenceservice_reconcile_test.go
@@ -504,6 +504,90 @@ var _ = Describe("Reconcile lifecycle", func() {
 			Expect(updated.Status.Endpoint).NotTo(BeEmpty())
 		})
 
+		It("should create a speaches Deployment for the whisper runtime", func() {
+			modelName := "whisper-model-ready"
+			isvcName := "whisper-isvc"
+
+			model := &inferencev1alpha1.Model{
+				ObjectMeta: metav1.ObjectMeta{Name: modelName, Namespace: "default"},
+				Spec: inferencev1alpha1.ModelSpec{
+					Source:   "Systran/faster-whisper-large-v3",
+					Hardware: &inferencev1alpha1.HardwareSpec{Accelerator: "cuda"},
+				},
+			}
+			Expect(k8sClient.Create(ctx, model)).To(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, model) }()
+
+			model.Status.Phase = PhaseReady
+			Expect(k8sClient.Status().Update(ctx, model)).To(Succeed())
+
+			replicas := int32(1)
+			isvc := &inferencev1alpha1.InferenceService{
+				ObjectMeta: metav1.ObjectMeta{Name: isvcName, Namespace: "default"},
+				Spec: inferencev1alpha1.InferenceServiceSpec{
+					ModelRef: modelName,
+					Runtime:  "whisper",
+					Replicas: &replicas,
+					WhisperConfig: &inferencev1alpha1.WhisperConfig{
+						ComputeType: "float16",
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, isvc)).To(Succeed())
+			defer func() {
+				_ = k8sClient.Delete(ctx, isvc)
+				dep := &appsv1.Deployment{}
+				if err := k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, dep); err == nil {
+					_ = k8sClient.Delete(ctx, dep)
+				}
+				svc := &corev1.Service{}
+				if err := k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, svc); err == nil {
+					_ = k8sClient.Delete(ctx, svc)
+				}
+			}()
+
+			reconciler := &InferenceServiceReconciler{
+				Client:             k8sClient,
+				Scheme:             k8sClient.Scheme(),
+				InitContainerImage: "docker.io/curlimages/curl:8.18.0",
+			}
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: types.NamespacedName{Name: isvcName, Namespace: "default"},
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			By("verifying the speaches container, port, env, and probes")
+			dep := &appsv1.Deployment{}
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, dep)).To(Succeed())
+			Expect(dep.Spec.Template.Spec.InitContainers).To(BeEmpty(), "whisper runtime needs no model-init container")
+			containers := dep.Spec.Template.Spec.Containers
+			Expect(containers).To(HaveLen(1))
+			c := containers[0]
+			Expect(c.Name).To(Equal("speaches"))
+			Expect(c.Ports[0].ContainerPort).To(Equal(int32(8000)))
+			Expect(c.ReadinessProbe.HTTPGet.Path).To(Equal("/health"))
+			By("verifying the postStart model-preload hook is wired")
+			Expect(c.Lifecycle).NotTo(BeNil())
+			Expect(c.Lifecycle.PostStart.Exec.Command[2]).To(ContainSubstring("/v1/models/$LLMKUBE_WHISPER_MODEL"))
+			By("verifying the Service port matches the speaches container port (8000)")
+			svc := &corev1.Service{}
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, svc)).To(Succeed())
+			Expect(svc.Spec.Ports[0].Port).To(Equal(int32(8000)))
+
+			envNames := map[string]string{}
+			for _, e := range c.Env {
+				envNames[e.Name] = e.Value
+			}
+			Expect(envNames).To(HaveKeyWithValue("WHISPER__INFERENCE_DEVICE", "cuda"))
+			Expect(envNames).To(HaveKeyWithValue("WHISPER__COMPUTE_TYPE", "float16"))
+			Expect(envNames).To(HaveKey("HF_HOME"))
+
+			By("verifying the status endpoint advertises the audio transcription path")
+			updated := &inferencev1alpha1.InferenceService{}
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: isvcName, Namespace: "default"}, updated)).To(Succeed())
+			Expect(updated.Status.Endpoint).To(HaveSuffix("/v1/audio/transcriptions"))
+		})
+
 		It("should skip Deployment for Metal accelerator", func() {
 			modelName := "metal-model"
 			isvcName := "isvc-metal"
diff --git a/internal/controller/runtime.go b/internal/controller/runtime.go
index 98a9efcf..f443d2f4 100644
--- a/internal/controller/runtime.go
+++ b/internal/controller/runtime.go
@@ -38,8 +38,18 @@ type CommandBuilder interface {
 }
 
 // EnvBuilder is optionally implemented by backends that generate runtime-specific env vars.
+// It receives the Model so backends can derive env values (e.g. device, model id) from the
+// Model spec, consistent with BuildArgs.
 type EnvBuilder interface {
-	BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar
+	BuildEnv(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model) []corev1.EnvVar
+}
+
+// EndpointPathProvider is optionally implemented by backends whose OpenAI-compatible API path
+// differs from the default /v1/chat/completions (e.g. the whisper runtime serves
+// /v1/audio/transcriptions). constructEndpoint consults it for the default status endpoint path
+// when the user has not set spec.endpoint.path explicitly.
+type EndpointPathProvider interface {
+	DefaultEndpointPath() string
 }
 
 // HPAMetricProvider is optionally implemented by backends that have a default autoscaling metric.
@@ -47,6 +57,13 @@ type HPAMetricProvider interface {
 	DefaultHPAMetric() string
 }
 
+// LifecycleProvider is optionally implemented by backends that need a container
+// lifecycle hook (e.g. the whisper runtime preloads its model via a postStart
+// hook because speaches does not download models on the first request).
+type LifecycleProvider interface {
+	BuildLifecycle(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model, port int32) *corev1.Lifecycle
+}
+
 // ServiceLinksOptOut is optionally implemented by backends that should run with
 // the legacy Kubernetes service-link env-var injection disabled. Returning true
 // sets `enableServiceLinks: false` on the Pod spec, which suppresses the
@@ -68,6 +85,8 @@ func resolveBackend(isvc *inferencev1alpha1.InferenceService) RuntimeBackend {
 		return &VLLMBackend{}
 	case "tgi":
 		return &TGIBackend{}
+	case "whisper":
+		return &WhisperBackend{}
 	case "generic":
 		return &GenericBackend{}
 	default:
@@ -87,6 +106,21 @@ func runtimeNameLabel(isvc *inferencev1alpha1.InferenceService) string {
 	return isvc.Spec.Runtime
 }
 
+// resolveServicePort picks the port shared by the Service and the advertised
+// status endpoint: an explicit spec.containerPort wins, then a positive
+// spec.endpoint.port, and finally the runtime backend's DefaultPort, so runtimes
+// that do not listen on 8080 (e.g. vllm/whisper on 8000, tgi on 80) stay reachable.
+func resolveServicePort(isvc *inferencev1alpha1.InferenceService) int32 {
+	fallback := resolveBackend(isvc).DefaultPort()
+	switch {
+	case isvc.Spec.ContainerPort != nil:
+		return *isvc.Spec.ContainerPort
+	case isvc.Spec.Endpoint != nil && isvc.Spec.Endpoint.Port > 0:
+		return isvc.Spec.Endpoint.Port
+	}
+	return fallback
+}
+
 // resolveGPUCount determines the GPU count from Model spec or InferenceService spec.
 func resolveGPUCount(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model) int32 {
 	if model.Spec.Hardware != nil && model.Spec.Hardware.GPU != nil && model.Spec.Hardware.GPU.Count > 0 {
diff --git a/internal/controller/runtime_personaplex.go b/internal/controller/runtime_personaplex.go
index 5c1d612f..7fa6f19a 100644
--- a/internal/controller/runtime_personaplex.go
+++ b/internal/controller/runtime_personaplex.go
@@ -86,7 +86,7 @@ func (b *PersonaPlexBackend) BuildProbes(port int32) (startup, liveness, readine
 
 // BuildEnv returns environment variables for the PersonaPlex container,
 // including HF_TOKEN from a Secret reference if configured.
-func (b *PersonaPlexBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar {
+func (b *PersonaPlexBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model) []corev1.EnvVar {
 	var env []corev1.EnvVar
 
 	cfg := isvc.Spec.PersonaPlexConfig
diff --git a/internal/controller/runtime_test.go b/internal/controller/runtime_test.go
index ba8ffe14..9b0b5ece 100644
--- a/internal/controller/runtime_test.go
+++ b/internal/controller/runtime_test.go
@@ -3,6 +3,8 @@ package controller
 import (
 	"testing"
 
+	corev1 "k8s.io/api/core/v1"
+
 	inferencev1alpha1 "github.com/defilantech/llmkube/api/v1alpha1"
 )
 
@@ -25,6 +27,32 @@ func containsArg(args []string, flag, value string) bool {
 	return false
 }
 
+// containsEnv reports whether env has a var called name. Only the first var
+// with that name is inspected: an empty value matches on presence alone, any
+// other value must equal the var's literal .Value. For secret-backed vars,
+// pass value == "" and assert ValueFrom via envSecretRef.
+func containsEnv(env []corev1.EnvVar, name, value string) bool {
+	for i := range env {
+		candidate := &env[i]
+		if candidate.Name != name {
+			continue
+		}
+		// Stop at the first match; later duplicates are never consulted.
+		return value == "" || candidate.Value == value
+	}
+	return false
+}
+
+// envSecretRef returns the SecretKeyRef of the first var called name that sets ValueFrom, or nil.
+func envSecretRef(env []corev1.EnvVar, name string) *corev1.SecretKeySelector {
+	for i := range env {
+		if src := env[i].ValueFrom; src != nil && env[i].Name == name {
+			return src.SecretKeyRef
+		}
+	}
+	return nil
+}
+
 // ptrString, ptrBool, ptrInt32 are local helpers so tests read naturally.
 func ptrBool(b bool) *bool          { return &b }
 func ptrFloat64(f float64) *float64 { return &f }
@@ -45,6 +73,7 @@ func TestRuntimeNameLabel(t *testing.T) {
 		{name: "empty runtime defaults to llamacpp", runtime: "", expected: "llamacpp"},
 		{name: "vllm passes through", runtime: "vllm", expected: "vllm"},
 		{name: "tgi passes through", runtime: "tgi", expected: "tgi"},
+		{name: "whisper passes through", runtime: "whisper", expected: "whisper"},
 		{name: "personaplex passes through", runtime: "personaplex", expected: "personaplex"},
 		{name: "generic passes through", runtime: "generic", expected: "generic"},
 		// Future runtimes (vllm-swift on metal, etc.) pass through
diff --git a/internal/controller/runtime_tgi.go b/internal/controller/runtime_tgi.go
index 7b9408c0..4e652f04 100644
--- a/internal/controller/runtime_tgi.go
+++ b/internal/controller/runtime_tgi.go
@@ -92,7 +92,7 @@ func (b *TGIBackend) BuildProbes(port int32) (*corev1.Probe, *corev1.Probe, *cor
 	return startup, liveness, readiness
 }
 
-func (b *TGIBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar {
+func (b *TGIBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model) []corev1.EnvVar {
 	cfg := isvc.Spec.TGIConfig
 	if cfg != nil && cfg.HFTokenSecretRef != nil {
 		return []corev1.EnvVar{{
diff --git a/internal/controller/runtime_vllm.go b/internal/controller/runtime_vllm.go
index c4500559..e2d567f8 100644
--- a/internal/controller/runtime_vllm.go
+++ b/internal/controller/runtime_vllm.go
@@ -190,7 +190,7 @@ func (b *VLLMBackend) BuildProbes(port int32) (*corev1.Probe, *corev1.Probe, *co
 	return startup, liveness, readiness
 }
 
-func (b *VLLMBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService) []corev1.EnvVar {
+func (b *VLLMBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model) []corev1.EnvVar {
 	cfg := isvc.Spec.VLLMConfig
 	if cfg != nil && cfg.HFTokenSecretRef != nil {
 		return []corev1.EnvVar{{
diff --git a/internal/controller/runtime_whisper.go b/internal/controller/runtime_whisper.go
new file mode 100644
index 00000000..23c9d43b
--- /dev/null
+++ b/internal/controller/runtime_whisper.go
@@ -0,0 +1,191 @@
+package controller
+
+import (
+	"fmt"
+	"strings"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
+
+	inferencev1alpha1 "github.com/defilantech/llmkube/api/v1alpha1"
+)
+
+// WhisperBackend generates container configuration for speaches
+// (https://speaches.ai), the faster-whisper OpenAI-compatible audio
+// transcription server. speaches serves /v1/audio/transcriptions on port 8000
+// and is configured via environment variables (see BuildEnv). There is no
+// model-init container: a postStart hook (see BuildLifecycle) installs the
+// model named by the referenced Model's spec.source once the server is healthy.
+type WhisperBackend struct{}
+
+// whisperImage is the pinned default speaches image. CUDA by default; CPU-only
+// deployments should override spec.image with the ...-cpu tag.
+const whisperImage = "ghcr.io/speaches-ai/speaches:0.8.3-cuda"
+
+// whisperHFHome is where speaches' underlying huggingface_hub caches models.
+// The image runs as the non-root "ubuntu" user with HOME=/home/ubuntu.
+const whisperHFHome = "/home/ubuntu/.cache/huggingface"
+
+// whisperComputeTypes is the set of CTranslate2 compute types speaches accepts
+// (WHISPER__COMPUTE_TYPE). Used to decide whether a Model's quantization string
+// can be passed through as a compute type.
+var whisperComputeTypes = map[string]struct{}{
+	"int8": {}, "int8_float16": {}, "int8_bfloat16": {}, "int8_float32": {},
+	"int16": {}, "float16": {}, "bfloat16": {}, "float32": {}, "default": {},
+}
+
+func (b *WhisperBackend) ContainerName() string { return "speaches" }
+func (b *WhisperBackend) DefaultImage() string  { return whisperImage }
+func (b *WhisperBackend) DefaultPort() int32    { return 8000 }
+
+// NeedsModelInit is false: no model-downloader init container is needed because
+// the model is installed into the running server by the postStart hook (BuildLifecycle).
+func (b *WhisperBackend) NeedsModelInit() bool { return false }
+
+// DefaultHPAMetric returns "" because speaches exposes no Prometheus queue
+// metric to autoscale on.
+func (b *WhisperBackend) DefaultHPAMetric() string { return "" }
+
+// DefaultEndpointPath advertises the OpenAI audio transcription path so the
+// status endpoint points clients at the right route.
+func (b *WhisperBackend) DefaultEndpointPath() string { return "/v1/audio/transcriptions" }
+
+// BuildArgs returns only the user's extra args: speaches is configured via env
+// vars, not CLI flags (see BuildEnv).
+func (b *WhisperBackend) BuildArgs(isvc *inferencev1alpha1.InferenceService, _ *inferencev1alpha1.Model, _ string, _ int32) []string {
+	return isvc.Spec.ExtraArgs
+}
+
+// BuildProbes returns startup, liveness and readiness probes that each issue an
+// HTTP GET to /health on port with a 5s timeout. Only cadence differs: startup
+// polls every 10s and tolerates 180 failures; liveness polls every 15s and
+// readiness every 10s, each tolerating 3 failures. Every probe gets its own
+// handler, so the returned probes share no HTTPGetAction.
+func (b *WhisperBackend) BuildProbes(port int32) (startup, liveness, readiness *corev1.Probe) {
+	// newProbe stamps out one probe; period and failure threshold are the only
+	// settings that vary between the three.
+	newProbe := func(periodSeconds, failureThreshold int32) *corev1.Probe {
+		return &corev1.Probe{
+			ProbeHandler: corev1.ProbeHandler{
+				HTTPGet: &corev1.HTTPGetAction{
+					Path: "/health",
+					Port: intstr.FromInt32(port),
+				},
+			},
+			PeriodSeconds:    periodSeconds,
+			TimeoutSeconds:   5,
+			FailureThreshold: failureThreshold,
+		}
+	}
+
+	// Startup allows 180 failures at a 10s period (a 30 minute budget).
+	startup = newProbe(10, 180)
+	// Liveness and readiness both tolerate 3 failures.
+	liveness = newProbe(15, 3)
+	readiness = newProbe(10, 3)
+	return startup, liveness, readiness
+}
+
+// BuildEnv renders the speaches environment from the Model and the optional
+// WhisperConfig. Vars are appended in a fixed order (base vars, preload model
+// id, compute type, TTL, HF token, API key) so Deployment specs are
+// deterministic.
+func (b *WhisperBackend) BuildEnv(isvc *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model) []corev1.EnvVar {
+	cfg := isvc.Spec.WhisperConfig
+
+	// Always emitted. cfg may still be nil here; the whisper* helpers accept that.
+	env := []corev1.EnvVar{
+		{Name: "HF_HOME", Value: whisperHFHome},
+		{Name: "ENABLE_UI", Value: whisperEnableUI(cfg)},
+		{Name: "WHISPER__INFERENCE_DEVICE", Value: whisperDevice(cfg, model)},
+	}
+
+	// LLMKUBE_WHISPER_MODEL is read by the postStart preload hook (BuildLifecycle),
+	// not by speaches itself, so it is only set when there is a source to preload.
+	if model != nil && model.Spec.Source != "" {
+		env = append(env, corev1.EnvVar{Name: "LLMKUBE_WHISPER_MODEL", Value: model.Spec.Source})
+	}
+	if computeType := whisperComputeType(cfg, model); computeType != "" {
+		env = append(env, corev1.EnvVar{Name: "WHISPER__COMPUTE_TYPE", Value: computeType})
+	}
+
+	// The remaining vars are opt-in through WhisperConfig.
+	if cfg == nil {
+		return env
+	}
+	if ttl := cfg.ModelTTLSeconds; ttl != nil {
+		env = append(env, corev1.EnvVar{Name: "WHISPER__TTL", Value: fmt.Sprintf("%d", *ttl)})
+	}
+	if ref := cfg.HFTokenSecretRef; ref != nil {
+		env = append(env, corev1.EnvVar{Name: "HF_TOKEN", ValueFrom: &corev1.EnvVarSource{SecretKeyRef: ref}})
+	}
+	if ref := cfg.APIKeySecretRef; ref != nil {
+		env = append(env, corev1.EnvVar{Name: "API_KEY", ValueFrom: &corev1.EnvVarSource{SecretKeyRef: ref}})
+	}
+
+	return env
+}
+
+// BuildLifecycle returns a postStart hook that installs the model into speaches
+// once the server is healthy. speaches (v0.8.x) does not download models on the
+// first transcription request: it returns 400 until the model is installed via
+// POST /v1/models/{id}. The container is not reported Running until the hook
+// exits, but the hook is best-effort: the health wait gives up after 90 tries and
+// the install ends in "|| true", so a failed preload still lets the Pod go Ready.
+// Returns nil when there is no model source to preload.
+func (b *WhisperBackend) BuildLifecycle(_ *inferencev1alpha1.InferenceService, model *inferencev1alpha1.Model, port int32) *corev1.Lifecycle {
+	if model == nil || model.Spec.Source == "" {
+		return nil
+	}
+	// curl is present in the speaches image (its own healthcheck uses it). The
+	// model id is read from the LLMKUBE_WHISPER_MODEL env var (set by BuildEnv) to
+	// avoid interpolating CR data into the shell script.
+	script := fmt.Sprintf(`for i in $(seq 1 90); do curl -sf -m 5 http://localhost:%d/health >/dev/null 2>&1 && break; sleep 2; done
+curl -sf -m 1800 -X POST "http://localhost:%d/v1/models/$LLMKUBE_WHISPER_MODEL" >/dev/null 2>&1 || true`, port, port)
+	return &corev1.Lifecycle{
+		PostStart: &corev1.LifecycleHandler{
+			Exec: &corev1.ExecAction{Command: []string{"sh", "-c", script}},
+		},
+	}
+}
+
+func whisperEnableUI(cfg *inferencev1alpha1.WhisperConfig) string {
+	if cfg == nil || cfg.EnableUI == nil || !*cfg.EnableUI {
+		return "false" // the UI stays off unless EnableUI is explicitly true
+	}
+	return "true"
+}
+
+// whisperDevice picks WHISPER__INFERENCE_DEVICE: a non-empty cfg.InferenceDevice is used
+// verbatim, else the Model accelerator is mapped (case-insensitively), else "auto".
+func whisperDevice(cfg *inferencev1alpha1.WhisperConfig, model *inferencev1alpha1.Model) string {
+	if cfg != nil && cfg.InferenceDevice != "" {
+		return cfg.InferenceDevice
+	}
+	if model == nil || model.Spec.Hardware == nil {
+		return "auto"
+	}
+	switch accel := strings.ToLower(model.Spec.Hardware.Accelerator); accel {
+	case "cuda":
+		return accel
+	case "cpu", "metal":
+		return "cpu" // CTranslate2 has no Metal backend; fall back to CPU.
+	}
+	return "auto"
+}
+
+// whisperComputeType picks WHISPER__COMPUTE_TYPE: cfg.ComputeType when set, else the Model's
+// quantization if it is a key of whisperComputeTypes; "" means the variable is left unset.
+func whisperComputeType(cfg *inferencev1alpha1.WhisperConfig, model *inferencev1alpha1.Model) string {
+	switch {
+	case cfg != nil && cfg.ComputeType != "":
+		return cfg.ComputeType
+	case model == nil:
+		return ""
+	}
+	quant := strings.ToLower(strings.TrimSpace(model.Spec.Quantization))
+	if _, known := whisperComputeTypes[quant]; known {
+		return quant
+	}
+	return ""
+}
diff --git a/internal/controller/runtime_whisper_test.go b/internal/controller/runtime_whisper_test.go
new file mode 100644
index 00000000..42f39a16
--- /dev/null
+++ b/internal/controller/runtime_whisper_test.go
@@ -0,0 +1,305 @@
+package controller
+
+import (
+	"testing"
+
+	corev1 "k8s.io/api/core/v1"
+
+	inferencev1alpha1 "github.com/defilantech/llmkube/api/v1alpha1"
+)
+
+// whisperModel builds a Model fixture with a fixed faster-whisper source;
+// Hardware stays nil when accelerator is empty.
+func whisperModel(accelerator, quantization string) *inferencev1alpha1.Model {
+	spec := inferencev1alpha1.ModelSpec{
+		Source:       "Systran/faster-whisper-large-v3",
+		Quantization: quantization,
+	}
+	if accelerator != "" {
+		spec.Hardware = &inferencev1alpha1.HardwareSpec{Accelerator: accelerator}
+	}
+	return &inferencev1alpha1.Model{Spec: spec}
+}
+
+func TestWhisperBackendBasics(t *testing.T) {
+	b := &WhisperBackend{}
+
+	if b.ContainerName() != "speaches" {
+		t.Errorf("ContainerName() = %q, want speaches", b.ContainerName())
+	}
+	if b.DefaultPort() != 8000 {
+		t.Errorf("DefaultPort() = %d, want 8000", b.DefaultPort())
+	}
+	if b.NeedsModelInit() {
+		t.Error("NeedsModelInit() = true, want false (speaches fetches from HF at runtime)")
+	}
+	if b.DefaultHPAMetric() != "" {
+		t.Errorf("DefaultHPAMetric() = %q, want empty (speaches exposes no scrapeable queue metric)", b.DefaultHPAMetric())
+	}
+	if got := b.DefaultEndpointPath(); got != "/v1/audio/transcriptions" {
+		t.Errorf("DefaultEndpointPath() = %q, want /v1/audio/transcriptions", got)
+	}
+	if img := b.DefaultImage(); img == "" || !containsSubstr(img, "speaches") {
+		t.Errorf("DefaultImage() = %q, want a pinned speaches image", img)
+	}
+}
+
+func containsSubstr(s, sub string) bool {
+	return indexOf(s, sub) >= 0 // indexOf yields -1 when sub is absent or longer than s
+}
+
+func indexOf(s, sub string) int {
+	for i := 0; i+len(sub) <= len(s); i++ {
+		if s[i:i+len(sub)] == sub {
+			return i
+		}
+	}
+	return -1
+}
+
+func TestWhisperBuildProbes(t *testing.T) {
+	b := &WhisperBackend{}
+	startup, liveness, readiness := b.BuildProbes(8000)
+	for name, p := range map[string]*corev1.Probe{"startup": startup, "liveness": liveness, "readiness": readiness} {
+		if p == nil || p.HTTPGet == nil {
+			t.Errorf("%s probe should be an HTTP GET", name) // Errorf, not Fatalf: map order is random, report every bad probe
+			continue                                         // nothing to dereference for this probe; still check the others
+		}
+		if p.HTTPGet.Path != "/health" {
+			t.Errorf("%s probe path = %q, want /health", name, p.HTTPGet.Path)
+		}
+		if p.HTTPGet.Port.IntValue() != 8000 {
+			t.Errorf("%s probe port = %v, want 8000", name, p.HTTPGet.Port)
+		}
+	}
+}
+
+func TestWhisperBuildEnv(t *testing.T) {
+	secretRef := &corev1.SecretKeySelector{
+		LocalObjectReference: corev1.LocalObjectReference{Name: "hf"},
+		Key:                  "token",
+	}
+	apiRef := &corev1.SecretKeySelector{
+		LocalObjectReference: corev1.LocalObjectReference{Name: "api"},
+		Key:                  "key",
+	}
+
+	tests := []struct {
+		name          string
+		cfg           *inferencev1alpha1.WhisperConfig
+		model         *inferencev1alpha1.Model
+		wantEnv       map[string]string // name -> exact .Value
+		wantAbsent    []string
+		wantHFSecret  bool
+		wantAPISecret bool
+	}{
+		{
+			name:  "minimal cpu model: HF_HOME, UI off, device cpu, no compute/ttl",
+			model: whisperModel("cpu", ""),
+			wantEnv: map[string]string{
+				"HF_HOME":                   "/home/ubuntu/.cache/huggingface",
+				"ENABLE_UI":                 "false",
+				"WHISPER__INFERENCE_DEVICE": "cpu",
+				"LLMKUBE_WHISPER_MODEL":     "Systran/faster-whisper-large-v3",
+			},
+			wantAbsent: []string{"WHISPER__COMPUTE_TYPE", "WHISPER__TTL", "HF_TOKEN", "API_KEY"},
+		},
+		{
+			name:    "cuda accelerator maps to cuda device",
+			model:   whisperModel("cuda", ""),
+			wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "cuda"},
+		},
+		{
+			name:    "metal accelerator maps to cpu",
+			model:   whisperModel("metal", ""),
+			wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "cpu"},
+		},
+		{
+			name:    "nil hardware defaults device to auto",
+			model:   whisperModel("", ""),
+			wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "auto"},
+		},
+		{
+			name:    "config device overrides model accelerator",
+			cfg:     &inferencev1alpha1.WhisperConfig{InferenceDevice: "auto"},
+			model:   whisperModel("cuda", ""),
+			wantEnv: map[string]string{"WHISPER__INFERENCE_DEVICE": "auto"},
+		},
+		{
+			name:    "explicit compute type wins",
+			cfg:     &inferencev1alpha1.WhisperConfig{ComputeType: "int8_float16"},
+			model:   whisperModel("cuda", "float16"),
+			wantEnv: map[string]string{"WHISPER__COMPUTE_TYPE": "int8_float16"},
+		},
+		{
+			name:    "recognized model quantization becomes compute type",
+			model:   whisperModel("cuda", "float16"),
+			wantEnv: map[string]string{"WHISPER__COMPUTE_TYPE": "float16"},
+		},
+		{
+			name:       "unrecognized quantization omits compute type",
+			model:      whisperModel("cuda", "Q4_K_M"),
+			wantAbsent: []string{"WHISPER__COMPUTE_TYPE"},
+		},
+		{
+			name:    "model ttl -1 keeps loaded",
+			cfg:     &inferencev1alpha1.WhisperConfig{ModelTTLSeconds: ptrInt32(-1)},
+			model:   whisperModel("cuda", ""),
+			wantEnv: map[string]string{"WHISPER__TTL": "-1"},
+		},
+		{
+			name:    "enable UI true",
+			cfg:     &inferencev1alpha1.WhisperConfig{EnableUI: ptrBool(true)},
+			model:   whisperModel("cuda", ""),
+			wantEnv: map[string]string{"ENABLE_UI": "true"},
+		},
+		{
+			name:         "HF token secret ref",
+			cfg:          &inferencev1alpha1.WhisperConfig{HFTokenSecretRef: secretRef},
+			model:        whisperModel("cuda", ""),
+			wantHFSecret: true,
+		},
+		{
+			name:          "API key secret ref",
+			cfg:           &inferencev1alpha1.WhisperConfig{APIKeySecretRef: apiRef},
+			model:         whisperModel("cuda", ""),
+			wantAPISecret: true,
+		},
+	}
+
+	b := &WhisperBackend{}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			isvc := &inferencev1alpha1.InferenceService{
+				Spec: inferencev1alpha1.InferenceServiceSpec{
+					Runtime:       "whisper",
+					WhisperConfig: tc.cfg,
+				},
+			}
+			env := b.BuildEnv(isvc, tc.model)
+
+			for name, want := range tc.wantEnv {
+				if !containsEnv(env, name, want) {
+					t.Errorf("env %s = %q not found; got %+v", name, want, env)
+				}
+			}
+			for _, name := range tc.wantAbsent {
+				if containsEnv(env, name, "") {
+					t.Errorf("env %s should be absent; got %+v", name, env)
+				}
+			}
+			if tc.wantHFSecret && envSecretRef(env, "HF_TOKEN") == nil {
+				t.Errorf("HF_TOKEN should be backed by a secret ref; got %+v", env)
+			}
+			if tc.wantAPISecret && envSecretRef(env, "API_KEY") == nil {
+				t.Errorf("API_KEY should be backed by a secret ref; got %+v", env)
+			}
+		})
+	}
+}
+
+func TestWhisperBuildLifecycle(t *testing.T) {
+	b := &WhisperBackend{}
+
+	t.Run("postStart preloads the model", func(t *testing.T) {
+		isvc := &inferencev1alpha1.InferenceService{
+			Spec: inferencev1alpha1.InferenceServiceSpec{Runtime: "whisper"},
+		}
+		lc := b.BuildLifecycle(isvc, whisperModel("cuda", ""), 8000)
+		if lc == nil || lc.PostStart == nil || lc.PostStart.Exec == nil {
+			t.Fatal("expected a postStart exec hook")
+		}
+		cmd := lc.PostStart.Exec.Command
+		if len(cmd) != 3 || cmd[0] != "sh" || cmd[1] != "-c" {
+			t.Fatalf("expected sh -c <script>, got %v", cmd)
+		}
+		script := cmd[2]
+		for _, want := range []string{"/v1/models/$LLMKUBE_WHISPER_MODEL", "localhost:8000/health", "-X POST"} {
+			if !containsSubstr(script, want) {
+				t.Errorf("postStart script missing %q; got:\n%s", want, script)
+			}
+		}
+	})
+
+	t.Run("nil when no model source", func(t *testing.T) {
+		isvc := &inferencev1alpha1.InferenceService{}
+		if lc := b.BuildLifecycle(isvc, &inferencev1alpha1.Model{}, 8000); lc != nil {
+			t.Errorf("expected nil lifecycle when model source empty, got %+v", lc)
+		}
+	})
+}
+
+// TestConstructEndpointRuntimeAwareDefault verifies the runtime-aware default path:
+// whisper resolves to the audio endpoint, other runtimes keep the chat endpoint,
+// and an explicit spec.endpoint.path always wins.
+func TestConstructEndpointRuntimeAwareDefault(t *testing.T) {
+	r := &InferenceServiceReconciler{}
+	svc := &corev1.Service{}
+	svc.Name = "demo"
+	svc.Namespace = "default"
+
+	cases := []struct {
+		name     string
+		runtime  string
+		path     string
+		wantEnds string
+	}{
+		// Port follows the backend DefaultPort: whisper/vllm on 8000, llamacpp on 8080.
+		{name: "whisper default", runtime: "whisper", wantEnds: ":8000/v1/audio/transcriptions"},
+		{name: "llamacpp default", runtime: "", wantEnds: ":8080/v1/chat/completions"},
+		{name: "vllm default", runtime: "vllm", wantEnds: ":8000/v1/chat/completions"},
+		{name: "explicit path wins on whisper", runtime: "whisper", path: "/custom", wantEnds: ":8000/custom"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			isvc := &inferencev1alpha1.InferenceService{
+				Spec: inferencev1alpha1.InferenceServiceSpec{Runtime: tc.runtime},
+			}
+			if tc.path != "" {
+				isvc.Spec.Endpoint = &inferencev1alpha1.EndpointSpec{Path: tc.path}
+			}
+			got := r.constructEndpoint(isvc, svc)
+			if !endsWith(got, tc.wantEnds) {
+				t.Errorf("constructEndpoint() = %q, want suffix %q", got, tc.wantEnds)
+			}
+		})
+	}
+}
+
+func endsWith(s, suffix string) bool {
+	return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix
+}
+
+// TestResolveServicePort verifies the Service/endpoint port follows the backend
+// default (so non-8080 runtimes route correctly) while explicit overrides win.
+func TestResolveServicePort(t *testing.T) {
+	cases := []struct {
+		name          string
+		runtime       string
+		containerPort *int32
+		endpointPort  int32
+		want          int32
+	}{
+		{name: "whisper defaults to 8000", runtime: "whisper", want: 8000},
+		{name: "llamacpp defaults to 8080", runtime: "", want: 8080},
+		{name: "tgi defaults to 80", runtime: "tgi", want: 80},
+		{name: "endpoint port overrides backend default", runtime: "whisper", endpointPort: 9000, want: 9000},
+		{name: "containerPort wins over endpoint port", runtime: "whisper", containerPort: ptrInt32(7000), endpointPort: 9000, want: 7000},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			isvc := &inferencev1alpha1.InferenceService{
+				Spec: inferencev1alpha1.InferenceServiceSpec{
+					Runtime:       tc.runtime,
+					ContainerPort: tc.containerPort,
+				},
+			}
+			if tc.endpointPort > 0 {
+				isvc.Spec.Endpoint = &inferencev1alpha1.EndpointSpec{Port: tc.endpointPort}
+			}
+			if got := resolveServicePort(isvc); got != tc.want {
+				t.Errorf("resolveServicePort() = %d, want %d", got, tc.want)
+			}
+		})
+	}
+}
diff --git a/internal/controller/service_builder.go b/internal/controller/service_builder.go
index 70c81423..8820b673 100644
--- a/internal/controller/service_builder.go
+++ b/internal/controller/service_builder.go
@@ -38,10 +38,7 @@ func (r *InferenceServiceReconciler) constructService(isvc *inferencev1alpha1.In
 		"inference.llmkube.dev/service": isvc.Name,
 	}
 
-	port := int32(8080)
-	if isvc.Spec.Endpoint != nil && isvc.Spec.Endpoint.Port > 0 {
-		port = isvc.Spec.Endpoint.Port
-	}
+	port := resolveServicePort(isvc)
 
 	serviceType := corev1.ServiceTypeClusterIP
 	if isvc.Spec.Endpoint != nil && isvc.Spec.Endpoint.Type != "" {
diff --git a/internal/controller/status_builder.go b/internal/controller/status_builder.go
index 0ce67a83..d4d2d23b 100644
--- a/internal/controller/status_builder.go
+++ b/internal/controller/status_builder.go
@@ -77,18 +77,22 @@ func (r *InferenceServiceReconciler) reconcileVLLMSpecCondition(isvc *inferencev
 }
 
 func (r *InferenceServiceReconciler) constructEndpoint(isvc *inferencev1alpha1.InferenceService, svc *corev1.Service) string {
-	port := int32(8080)
+	port := resolveServicePort(isvc)
 	path := "/v1/chat/completions"
 
-	if isvc.Spec.Endpoint != nil {
-		if isvc.Spec.Endpoint.Port > 0 {
-			port = isvc.Spec.Endpoint.Port
-		}
-		if isvc.Spec.Endpoint.Path != "" {
-			path = isvc.Spec.Endpoint.Path
+	// A backend may declare a different default OpenAI-compatible path (e.g. the
+	// whisper runtime serves /v1/audio/transcriptions). A user-set
+	// spec.endpoint.path still wins, checked below.
+	if ep, ok := resolveBackend(isvc).(EndpointPathProvider); ok {
+		if p := ep.DefaultEndpointPath(); p != "" {
+			path = p
 		}
 	}
 
+	if isvc.Spec.Endpoint != nil && isvc.Spec.Endpoint.Path != "" {
+		path = isvc.Spec.Endpoint.Path
+	}
+
 	return fmt.Sprintf("http://%s.%s.svc.cluster.local:%d%s", svc.Name, svc.Namespace, port, path)
 }