diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index f79497e64..8b88880cb 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -36,6 +36,8 @@ const ( ClusterPolicyCRDName = "ClusterPolicy" // DefaultDCGMJobMappingDir is the default directory for DCGM Exporter HPC job mapping files DefaultDCGMJobMappingDir = "/var/lib/dcgm-exporter/job-mapping" + // DefaultDCGMPodResourcesSocket is the default kubelet pod-resources socket path + DefaultDCGMPodResourcesSocket = "/var/lib/kubelet/pod-resources/kubelet.sock" ) // ClusterPolicySpec defines the desired state of ClusterPolicy @@ -969,6 +971,38 @@ type DCGMExporterSpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="HPC Job Mapping Configuration" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced" HPCJobMapping *DCGMExporterHPCJobMappingConfig `json:"hpcJobMapping,omitempty"` + + // Optional: Per-pod GPU utilization metrics for CUDA time-slicing workloads. + // When enabled, dcgm-exporter emits dcgm_fi_dev_sm_util_per_pod gauges that + // attribute SM utilization to individual pods sharing a GPU via time-slicing. + // Requires dcgm-exporter v3.4.0+ built with --enable-per-pod-gpu-util support. + // See: https://github.com/NVIDIA/dcgm-exporter/issues/587 + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Per-Pod GPU Utilization Metrics" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced" + PerPodGPUUtil *DCGMExporterPerPodGPUUtilConfig `json:"perPodGPUUtil,omitempty"` +} + +// DCGMExporterPerPodGPUUtilConfig configures per-pod GPU SM utilization metrics. 
+// This feature is useful when CUDA time-slicing is active and multiple pods share +// one physical GPU — standard per-device metrics lose per-workload attribution. +type DCGMExporterPerPodGPUUtilConfig struct { + // Enable per-pod GPU utilization collection via NVML process utilization API. + // Requires hostPID: true (automatically set when enabled). + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable Per-Pod GPU Utilization" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + Enabled *bool `json:"enabled,omitempty"` + + // PodResourcesSocketPath is the path to the kubelet pod-resources gRPC socket. + // Defaults to /var/lib/kubelet/pod-resources/kubelet.sock. + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Pod Resources Socket Path" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text" + PodResourcesSocketPath string `json:"podResourcesSocketPath,omitempty"` } // DCGMExporterHPCJobMappingConfig defines HPC job mapping configuration for NVIDIA DCGM Exporter @@ -2101,6 +2135,24 @@ func (e *DCGMExporterSpec) GetHPCJobMappingDirectory() string { return e.HPCJobMapping.Directory } +// IsPerPodGPUUtilEnabled returns true if per-pod GPU utilization metrics are enabled. +// This feature attributes SM utilization to individual pods when CUDA time-slicing is active. 
+func (e *DCGMExporterSpec) IsPerPodGPUUtilEnabled() bool { + if e.PerPodGPUUtil == nil || e.PerPodGPUUtil.Enabled == nil { + return false + } + return *e.PerPodGPUUtil.Enabled +} + +// GetPerPodGPUUtilSocketPath returns the kubelet pod-resources socket path for per-pod GPU util. +// Falls back to DefaultDCGMPodResourcesSocket if not explicitly configured. +func (e *DCGMExporterSpec) GetPerPodGPUUtilSocketPath() string { + if e.PerPodGPUUtil == nil || e.PerPodGPUUtil.PodResourcesSocketPath == "" { + return DefaultDCGMPodResourcesSocket + } + return e.PerPodGPUUtil.PodResourcesSocketPath +} + // IsEnabled returns true if gpu-feature-discovery is enabled(default) through gpu-operator func (g *GPUFeatureDiscoverySpec) IsEnabled() bool { if g.Enabled == nil { diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index f65e0648b..503d0077f 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -324,6 +324,26 @@ func (in *DCGMExporterHPCJobMappingConfig) DeepCopy() *DCGMExporterHPCJobMapping return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DCGMExporterPerPodGPUUtilConfig) DeepCopyInto(out *DCGMExporterPerPodGPUUtilConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DCGMExporterPerPodGPUUtilConfig. +func (in *DCGMExporterPerPodGPUUtilConfig) DeepCopy() *DCGMExporterPerPodGPUUtilConfig { + if in == nil { + return nil + } + out := new(DCGMExporterPerPodGPUUtilConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *DCGMExporterMetricsConfig) DeepCopyInto(out *DCGMExporterMetricsConfig) { *out = *in @@ -460,6 +480,11 @@ func (in *DCGMExporterSpec) DeepCopyInto(out *DCGMExporterSpec) { *out = new(DCGMExporterHPCJobMappingConfig) (*in).DeepCopyInto(*out) } + if in.PerPodGPUUtil != nil { + in, out := &in.PerPodGPUUtil, &out.PerPodGPUUtil + *out = new(DCGMExporterPerPodGPUUtilConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DCGMExporterSpec. diff --git a/controllers/object_controls.go b/controllers/object_controls.go index b436bcab1..90ed1edd7 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -1785,6 +1785,43 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec + // configure per-pod GPU utilization metrics when enabled (for CUDA time-slicing workloads) + // See: https://github.com/NVIDIA/dcgm-exporter/issues/587 + if config.DCGMExporter.IsPerPodGPUUtilEnabled() { + // enable the feature flag in dcgm-exporter + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL", "true") + + // resolve pod→GPU mapping via kubelet pod-resources gRPC API + socketPath := config.DCGMExporter.GetPerPodGPUUtilSocketPath() + // guard: strings.LastIndex returns -1 for a path with no separator, and a negative slice index would panic the operator + socketDir := socketPath + if idx := strings.LastIndex(socketPath, "/"); idx >= 0 { + socketDir = socketPath[:idx+1] + } + + podResourcesVolMount := corev1.VolumeMount{ + Name: "pod-resources", + ReadOnly: true, + MountPath: socketDir, + } + obj.Spec.Template.Spec.Containers[0].VolumeMounts = append( + obj.Spec.Template.Spec.Containers[0].VolumeMounts, podResourcesVolMount) + + podResourcesVol := corev1.Volume{ + Name: "pod-resources", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: socketDir, + Type: ptr.To(corev1.HostPathDirectory), + }, + }, + } + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, 
podResourcesVol) + + // per-pod attribution requires resolving PIDs via /proc/<pid>/cgroup + obj.Spec.Template.Spec.HostPID = true + } + // mount configmap for custom metrics if provided by user + if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" { + metricsConfigVolMount := corev1.VolumeMount{Name: "metrics-config", ReadOnly: true, MountPath: MetricsConfigMountPath, SubPath: MetricsConfigFileName} diff --git a/docs/dcgm-exporter-per-pod-gpu-metrics.md b/docs/dcgm-exporter-per-pod-gpu-metrics.md new file mode 100644 index 000000000..4a61c12d0 --- /dev/null +++ b/docs/dcgm-exporter-per-pod-gpu-metrics.md @@ -0,0 +1,142 @@ +# Per-Pod GPU Utilization with DCGM Exporter (Time-Slicing) + +## Overview + +When GPU time-slicing is enabled via `ClusterPolicy`, multiple pods share a +single physical GPU device. Standard DCGM metrics report aggregate utilization +for the whole device — `DCGM_FI_DEV_GPU_UTIL` cannot distinguish how much of +the GPU the proxy, embeddings, or inference pods are each using. + +GPU Operator v24.x+ integrates with dcgm-exporter's per-pod GPU utilization +feature to restore workload-level attribution without requiring MIG. + +## Prerequisite: dcgm-exporter v3.4.0+ + +This feature requires dcgm-exporter v3.4.0 or later, which adds the +`--enable-per-pod-gpu-util` flag and `dcgm_fi_dev_sm_util_per_pod` metric. + +See: [NVIDIA/dcgm-exporter#587](https://github.com/NVIDIA/dcgm-exporter/issues/587) + +## Enabling Time-Slicing + Per-Pod Metrics + +A complete `ClusterPolicy` for a T4 cluster running three shared workloads: + +```yaml +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + # 1. Configure time-slicing: 3 virtual slices per physical GPU + devicePlugin: + config: + name: time-slicing-config + default: any + + # 2.
Enable per-pod GPU utilization metrics in dcgm-exporter + dcgmExporter: + perPodGPUUtil: + enabled: true + # Optional: custom path (default: /var/lib/kubelet/pod-resources/kubelet.sock) + # podResourcesSocketPath: /var/lib/kubelet/pod-resources/kubelet.sock +``` + +The time-slicing ConfigMap referenced above must be deployed separately: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: time-slicing-config + namespace: gpu-operator +data: + any: |- + version: v1 + flags: + migStrategy: none + sharing: + timeSlicing: + replicas: 3 + renameByDefault: false + resources: + - name: nvidia.com/gpu + replicas: 3 +``` + +## What GPU Operator does automatically + +When `dcgmExporter.perPodGPUUtil.enabled: true` is set, GPU Operator: + +1. Sets `DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL=true` in the dcgm-exporter + DaemonSet environment. +2. Mounts `/var/lib/kubelet/pod-resources/` as a read-only `hostPath` volume + so dcgm-exporter can reach the kubelet pod-resources gRPC socket. +3. Sets `hostPID: true` on the DaemonSet so dcgm-exporter can read + `/proc/<pid>/cgroup` to resolve NVML PIDs to containers. 
+ +## Emitted metric + +``` +# HELP dcgm_fi_dev_sm_util_per_pod SM utilization attributed to a pod (time-slicing) +# TYPE dcgm_fi_dev_sm_util_per_pod gauge +dcgm_fi_dev_sm_util_per_pod{ + gpu="0", + uuid="GPU-abc123", + pod="synapse-proxy-7f9d4b-xkz2p", + namespace="synapse-staging", + container="proxy" +} 42 +dcgm_fi_dev_sm_util_per_pod{...,pod="synapse-jina-...",container="jina"} 18 +dcgm_fi_dev_sm_util_per_pod{...,pod="synapse-vllm-...",container="vllm"} 35 +``` + +## Example Prometheus alert + +```yaml +groups: + - name: per-pod-gpu + rules: + - alert: PodGPUHighUtilization + expr: dcgm_fi_dev_sm_util_per_pod > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "{{ $labels.namespace }}/{{ $labels.pod }} using >80% GPU SM" +``` + +## Cost model (example: g4dn.xlarge T4) + +| Setup | Nodes | Cost/day | +|-------|-------|----------| +| 3 workloads, no time-slicing | 3 × g4dn.xlarge | ~$38/day | +| 3 workloads, time-slicing (3 replicas) | 1 × g4dn.xlarge | ~$13/day | +| **Savings** | | **~$25/day (~$9,000/year)** | + +Time-slicing is appropriate for inference + embedding workloads that do not +fully saturate the GPU. For compute-bound training workloads, MIG or dedicated +GPUs remain the right choice. + +## Security considerations + +Enabling `perPodGPUUtil` grants dcgm-exporter: +- Read access to `/var/lib/kubelet/pod-resources/` (lists all GPU-using pods) +- Host PID namespace access (to read `/proc/<pid>/cgroup`) + +These are the same permissions used by other node-level monitoring agents +(e.g., node-exporter, cAdvisor). Review your security policy before enabling +in sensitive environments. 
+ +## Compatibility + +| GPU Operator | dcgm-exporter | Feature available | +|-------------|---------------|-------------------| +| < v24.x | any | No | +| ≥ v24.x | < v3.4.0 | Field accepted but no-op | +| ≥ v24.x | ≥ v3.4.0 | Yes | + +## Related + +- dcgm-exporter feature: [docs/per-pod-gpu-metrics.md](https://github.com/NVIDIA/dcgm-exporter/blob/main/docs/per-pod-gpu-metrics.md) +- Time-slicing setup: [GPU Sharing with Time-Slicing](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html) +- Issue: [NVIDIA/dcgm-exporter#587](https://github.com/NVIDIA/dcgm-exporter/issues/587)