Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 50 additions & 3 deletions api/v1alpha1/inferenceservice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ type InferenceServiceSpec struct {
// "personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server.
// "vllm": vLLM OpenAI-compatible server with PagedAttention.
// "tgi": HuggingFace Text Generation Inference server.
// +kubebuilder:validation:Enum=llamacpp;personaplex;vllm;tgi;generic
// "whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server.
// +kubebuilder:validation:Enum=llamacpp;personaplex;vllm;tgi;whisper;generic
// +kubebuilder:default=llamacpp
// +optional
Runtime string `json:"runtime,omitempty"`
Expand Down Expand Up @@ -338,6 +339,11 @@ type InferenceServiceSpec struct {
// +optional
TGIConfig *TGIConfig `json:"tgiConfig,omitempty"`

// WhisperConfig holds configuration for the whisper (speaches) runtime.
// Only used when Runtime is "whisper".
// +optional
WhisperConfig *WhisperConfig `json:"whisperConfig,omitempty"`

// ImagePullSecrets for pulling container images from private registries.
// +optional
ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"`
Expand Down Expand Up @@ -386,8 +392,10 @@ type EndpointSpec struct {
// +optional
Port int32 `json:"port,omitempty"`

// Path is the HTTP path for the inference endpoint
// +kubebuilder:default="/v1/chat/completions"
// Path is the HTTP path for the inference endpoint. When unset, the
// effective default is the runtime's OpenAI-compatible path
// (/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
// whisper runtime), resolved when the status endpoint is constructed.
// +optional
Path string `json:"path,omitempty"`

Expand Down Expand Up @@ -667,6 +675,45 @@ type TGIConfig struct {
HFTokenSecretRef *corev1.SecretKeySelector `json:"hfTokenSecretRef,omitempty"`
}

// WhisperConfig holds deploy-time server settings for the whisper (speaches)
// runtime. speaches selects the model, language, and task per request, so those
// are NOT server config; the model id clients request comes from the referenced
// Model's spec.source.
//
// Every field is optional and is omitted from the serialized object when unset.
// ModelTTLSeconds and EnableUI are pointers so that an unset field stays
// distinguishable from an explicit 0 / false. ComputeType and InferenceDevice
// are plain strings restricted to the values listed in their Enum markers.
//
// The per-field comments below appear verbatim as the property descriptions in
// the InferenceService CRD schema shipped with the chart
// (charts/llmkube/templates/crds/inferenceservices.yaml), so keep the two in
// sync when rewording any of them.
//
// NOTE(review): what happens when ModelTTLSeconds is left unset is not stated
// on the field — presumably the speaches default applies; confirm against the
// controller before documenting it.
type WhisperConfig struct {
// ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE).
// When unset, falls back to a recognized Model spec.quantization, else the speaches default.
// +kubebuilder:validation:Enum=int8;int8_float16;int8_bfloat16;int8_float32;int16;float16;bfloat16;float32;default
// +optional
ComputeType string `json:"computeType,omitempty"`

// InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE).
// When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda,
// cpu/metal -> cpu), defaulting to auto.
// +kubebuilder:validation:Enum=auto;cuda;cpu
// +optional
InferenceDevice string `json:"inferenceDevice,omitempty"`

// ModelTTLSeconds is how long an idle model stays loaded before being unloaded
// (speaches WHISPER__TTL). -1 keeps models loaded indefinitely.
// +kubebuilder:validation:Minimum=-1
// +optional
ModelTTLSeconds *int32 `json:"modelTTLSeconds,omitempty"`

// EnableUI exposes the speaches Gradio web UI. Defaults to false.
// +optional
EnableUI *bool `json:"enableUI,omitempty"`

// HFTokenSecretRef references a Secret containing a HuggingFace token, used to
// download gated CTranslate2 models.
// +optional
HFTokenSecretRef *corev1.SecretKeySelector `json:"hfTokenSecretRef,omitempty"`

// APIKeySecretRef references a Secret containing an API key speaches will require
// on requests (sets the speaches API_KEY).
// +optional
APIKeySecretRef *corev1.SecretKeySelector `json:"apiKeySecretRef,omitempty"`
}

// InferenceServiceStatus defines the observed state of InferenceService.
type InferenceServiceStatus struct {
// Phase represents the current lifecycle phase of the InferenceService.
Expand Down
40 changes: 40 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

103 changes: 101 additions & 2 deletions charts/llmkube/templates/crds/inferenceservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,11 @@ spec:
description: Endpoint defines the service endpoint configuration
properties:
path:
default: /v1/chat/completions
description: Path is the HTTP path for the inference endpoint
description: |-
Path is the HTTP path for the inference endpoint. When unset, the
effective default is the runtime's OpenAI-compatible path
(/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
whisper runtime), resolved when the status endpoint is constructed.
type: string
port:
default: 8080
Expand Down Expand Up @@ -1398,11 +1401,13 @@ spec:
"personaplex": NVIDIA PersonaPlex (Moshi) speech-to-speech server.
"vllm": vLLM OpenAI-compatible server with PagedAttention.
"tgi": HuggingFace Text Generation Inference server.
"whisper": speaches (faster-whisper) OpenAI-compatible audio transcription server.
enum:
- llamacpp
- personaplex
- vllm
- tgi
- whisper
- generic
type: string
runtimeClassName:
Expand Down Expand Up @@ -1910,6 +1915,100 @@ spec:
format: int32
type: integer
type: object
whisperConfig:
description: |-
WhisperConfig holds configuration for the whisper (speaches) runtime.
Only used when Runtime is "whisper".
properties:
apiKeySecretRef:
description: |-
APIKeySecretRef references a Secret containing an API key speaches will require
on requests (sets the speaches API_KEY).
properties:
key:
description: The key of the secret to select from. Must be
a valid secret key.
type: string
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
optional:
description: Specify whether the Secret or its key must be
defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
computeType:
description: |-
ComputeType sets the CTranslate2 compute type (speaches WHISPER__COMPUTE_TYPE).
When unset, falls back to a recognized Model spec.quantization, else the speaches default.
enum:
- int8
- int8_float16
- int8_bfloat16
- int8_float32
- int16
- float16
- bfloat16
- float32
- default
type: string
enableUI:
description: EnableUI exposes the speaches Gradio web UI. Defaults
to false.
type: boolean
hfTokenSecretRef:
description: |-
HFTokenSecretRef references a Secret containing a HuggingFace token, used to
download gated CTranslate2 models.
properties:
key:
description: The key of the secret to select from. Must be
a valid secret key.
type: string
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
optional:
description: Specify whether the Secret or its key must be
defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
inferenceDevice:
description: |-
InferenceDevice sets the device speaches loads models on (WHISPER__INFERENCE_DEVICE).
When unset, derived from the referenced Model's hardware.accelerator (cuda/rocm/intel -> cuda,
cpu/metal -> cpu), defaulting to auto.
enum:
- auto
- cuda
- cpu
type: string
modelTTLSeconds:
description: |-
ModelTTLSeconds is how long an idle model stays loaded before being unloaded
(speaches WHISPER__TTL). -1 keeps models loaded indefinitely.
format: int32
minimum: -1
type: integer
type: object
required:
- modelRef
type: object
Expand Down
7 changes: 5 additions & 2 deletions charts/llmkube/templates/crds/modelrouters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,11 @@ spec:
through. Mirrors the shape used by InferenceService.
properties:
path:
default: /v1/chat/completions
description: Path is the HTTP path for the inference endpoint
description: |-
Path is the HTTP path for the inference endpoint. When unset, the
effective default is the runtime's OpenAI-compatible path
(/v1/chat/completions for text runtimes, /v1/audio/transcriptions for the
whisper runtime), resolved when the status endpoint is constructed.
type: string
port:
default: 8080
Expand Down
Loading
Loading