Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions api/v1alpha1/inferenceservice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ type InferenceServiceSpec struct {
// +optional
Replicas *int32 `json:"replicas,omitempty"`

// Autoscaling configures horizontal pod autoscaling for the inference service.
// When set, the controller creates and manages an HPA resource targeting the
// inference Deployment. Requires Prometheus Adapter for custom metrics.
// Mutually exclusive with manual replica management: when autoscaling is enabled,
// the Replicas field serves as the initial replica count only.
// +optional
Autoscaling *AutoscalingSpec `json:"autoscaling,omitempty"`

// Image is the container image for the llama.cpp runtime
// +kubebuilder:default="ghcr.io/ggml-org/llama.cpp:server"
// +optional
Expand Down Expand Up @@ -172,6 +180,44 @@ type InferenceResourceRequirements struct {
GPUMemory string `json:"gpuMemory,omitempty"`
}

// AutoscalingSpec configures Horizontal Pod Autoscaler for the inference service.
// Per the field documentation on InferenceServiceSpec.Autoscaling, the controller
// creates and manages an HPA resource targeting the inference Deployment when
// this spec is set.
//
// NOTE(review): there is no cross-field validation enforcing
// MinReplicas <= MaxReplicas (e.g. a CEL +kubebuilder:validation:XValidation
// rule) — presumably the controller or the HPA itself rejects an inverted
// range; confirm and consider adding schema-level validation.
type AutoscalingSpec struct {
	// MinReplicas is the lower limit for the number of replicas.
	// Optional; defaults to 1 and is capped at 10 by CRD validation.
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:validation:Maximum=10
	// +kubebuilder:default=1
	// +optional
	MinReplicas *int32 `json:"minReplicas,omitempty"`

	// MaxReplicas is the upper limit for the number of replicas.
	// Required; CRD validation restricts it to the range [1, 100].
	MaxReplicas int32 `json:"maxReplicas"`

	// Metrics defines the scaling metrics and target values.
	// If empty, defaults to llamacpp:requests_processing with target average value of 2.
	// +optional
	Metrics []MetricSpec `json:"metrics,omitempty"`
}

// MetricSpec defines a single metric for HPA scaling.
//
// NOTE(review): nothing in the schema ties Type to the target fields —
// presumably TargetAverageValue is honored only when Type is "Pods" and
// TargetAverageUtilization only when Type is "Resource" (mirroring the
// upstream HPA metric sources); confirm against the controller's HPA
// construction logic.
type MetricSpec struct {
	// Type is the metric source type.
	// CRD validation restricts it to "Pods" or "Resource".
	// +kubebuilder:validation:Enum=Pods;Resource
	Type string `json:"type"`

	// Name is the metric name (e.g., llamacpp:requests_processing).
	Name string `json:"name"`

	// TargetAverageValue is the target per-pod average for Pods-type metrics.
	// Expressed as a string — presumably parsed as a Kubernetes
	// resource.Quantity (e.g. "2") by the controller; verify.
	// +optional
	TargetAverageValue *string `json:"targetAverageValue,omitempty"`

	// TargetAverageUtilization is the target utilization percentage for Resource-type metrics.
	// +optional
	TargetAverageUtilization *int32 `json:"targetAverageUtilization,omitempty"`
}

// InferenceServiceStatus defines the observed state of InferenceService.
type InferenceServiceStatus struct {
// Phase represents the current lifecycle phase (Pending, Creating, Ready, Failed)
Expand Down
57 changes: 57 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions charts/llmkube/templates/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,18 @@ rules:
- patch
- update
- watch
- apiGroups:
- autoscaling
resources:
- horizontalpodautoscalers
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- inference.llmkube.dev
resources:
Expand Down
56 changes: 56 additions & 0 deletions config/crd/bases/inference.llmkube.dev_inferenceservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,62 @@ spec:
spec:
description: spec defines the desired state of InferenceService
properties:
autoscaling:
description: |-
Autoscaling configures horizontal pod autoscaling for the inference service.
When set, the controller creates and manages an HPA resource targeting the
inference Deployment. Requires Prometheus Adapter for custom metrics.
Mutually exclusive with manual replica management: when autoscaling is enabled,
the Replicas field serves as the initial replica count only.
properties:
maxReplicas:
description: MaxReplicas is the upper limit for the number of
replicas.
format: int32
maximum: 100
minimum: 1
type: integer
metrics:
description: |-
Metrics defines the scaling metrics and target values.
If empty, defaults to llamacpp:requests_processing with target average value of 2.
items:
description: MetricSpec defines a single metric for HPA scaling.
properties:
name:
description: Name is the metric name (e.g., llamacpp:requests_processing).
type: string
targetAverageUtilization:
description: TargetAverageUtilization is the target utilization
percentage for Resource-type metrics.
format: int32
type: integer
targetAverageValue:
description: TargetAverageValue is the target per-pod average
for Pods-type metrics.
type: string
type:
description: Type is the metric source type.
enum:
- Pods
- Resource
type: string
required:
- name
- type
type: object
type: array
minReplicas:
default: 1
description: MinReplicas is the lower limit for the number of
replicas.
format: int32
maximum: 10
minimum: 1
type: integer
required:
- maxReplicas
type: object
cacheTypeK:
description: |-
CacheTypeK sets the KV cache quantization type for keys.
Expand Down
12 changes: 12 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,18 @@ rules:
- patch
- update
- watch
- apiGroups:
- autoscaling
resources:
- horizontalpodautoscalers
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- inference.llmkube.dev
resources:
Expand Down
Loading