Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 35 additions & 21 deletions internal/exporter/converter/otlp.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"time"

apiv1 "github.com/NVIDIA/fleet-intelligence-sdk/api/v1"
pkgmetrics "github.com/NVIDIA/fleet-intelligence-sdk/pkg/metrics"
commonv1 "go.opentelemetry.io/proto/otlp/common/v1"
logsv1 "go.opentelemetry.io/proto/otlp/logs/v1"
metricsv1 "go.opentelemetry.io/proto/otlp/metrics/v1"
Expand Down Expand Up @@ -152,33 +153,14 @@ func (c *otlpConverter) convertMetricsToOTLP(data *collector.HealthData) []*metr
// Convert regular metrics if available
if len(data.Metrics) > 0 {
for _, metric := range data.Metrics {
otlpMetric := &metricsv1.Metric{
Name: metric.Name,
Description: fmt.Sprintf("Metric from component %s", metric.Component),
Unit: "1",
Data: &metricsv1.Metric_Gauge{
Gauge: &metricsv1.Gauge{
DataPoints: []*metricsv1.NumberDataPoint{
{
TimeUnixNano: uint64(metric.UnixMilliseconds) * 1_000_000,
Value: &metricsv1.NumberDataPoint_AsDouble{
AsDouble: metric.Value,
},
Attributes: c.convertLabelsToOTLPAttributes(metric.Labels, gpuUUIDToIndex),
},
},
},
},
}
otlpMetrics = append(otlpMetrics, otlpMetric)
otlpMetrics = append(otlpMetrics, c.convertMetricToOTLP(metric, gpuUUIDToIndex))
}
}

// Add a summary metric with collection info
summaryMetric := &metricsv1.Metric{
Name: "fleetint_agent_collection_summary",
Description: "Summary of Fleet Intelligence data collection including counts of metrics, events, and components",
Unit: "1",
Data: &metricsv1.Metric_Gauge{
Gauge: &metricsv1.Gauge{
DataPoints: []*metricsv1.NumberDataPoint{
Expand Down Expand Up @@ -217,7 +199,6 @@ func (c *otlpConverter) convertMetricsToOTLP(data *collector.HealthData) []*metr
upMetric := &metricsv1.Metric{
Name: "fleetint_agent_up",
Description: "Fleet Intelligence agent liveness. A value of 1 indicates the agent was running when telemetry was exported.",
Unit: "1",
Data: &metricsv1.Metric_Gauge{
Gauge: &metricsv1.Gauge{
DataPoints: []*metricsv1.NumberDataPoint{
Expand All @@ -236,6 +217,39 @@ func (c *otlpConverter) convertMetricsToOTLP(data *collector.HealthData) []*metr
return otlpMetrics
}

// convertMetricToOTLP translates one collected metric into an OTLP metric.
//
// The result always carries exactly one data point built from the metric's
// timestamp, value, and labels (labels go through
// convertLabelsToOTLPAttributes together with gpuUUIDToIndex). Metrics whose
// type is pkgmetrics.MetricTypeCounter are emitted as monotonic, cumulative
// sums; every other type is emitted as a gauge. No unit is set on the result.
func (c *otlpConverter) convertMetricToOTLP(metric pkgmetrics.Metric, gpuUUIDToIndex map[string]string) *metricsv1.Metric {
	points := []*metricsv1.NumberDataPoint{{
		// The collected timestamp is in milliseconds; scale it to nanoseconds.
		TimeUnixNano: uint64(metric.UnixMilliseconds) * 1_000_000,
		Value:        &metricsv1.NumberDataPoint_AsDouble{AsDouble: metric.Value},
		Attributes:   c.convertLabelsToOTLPAttributes(metric.Labels, gpuUUIDToIndex),
	}}

	result := &metricsv1.Metric{
		Name:        metric.Name,
		Description: fmt.Sprintf("Metric from component %s", metric.Component),
	}

	switch metric.Type {
	case pkgmetrics.MetricTypeCounter:
		result.Data = &metricsv1.Metric_Sum{
			Sum: &metricsv1.Sum{
				AggregationTemporality: metricsv1.AggregationTemporality_AGGREGATION_TEMPORALITY_CUMULATIVE,
				IsMonotonic:            true,
				DataPoints:             points,
			},
		}
	default:
		// Anything that is not a counter keeps the gauge representation.
		result.Data = &metricsv1.Metric_Gauge{
			Gauge: &metricsv1.Gauge{DataPoints: points},
		}
	}

	return result
}

// convertLabelsToOTLPAttributes converts metric labels to OTLP attributes.
// If the labels contain a "uuid" key but no "gpu" key, it enriches the
// attributes with the GPU index looked up from the machine info mapping.
Expand Down
58 changes: 57 additions & 1 deletion internal/exporter/converter/otlp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,53 @@ func TestOTLPConverter_Convert_WithMetrics(t *testing.T) {
assert.Contains(t, metrics[0].Description, "gpu")
}

// TestOTLPConverter_Convert_CounterMetricsBecomeCumulativeSums checks that a
// counter-typed metric is converted to a monotonic cumulative OTLP sum, that a
// gauge-typed metric in the same batch is still converted to an OTLP gauge,
// and that neither converted metric has a unit set.
func TestOTLPConverter_Convert_CounterMetricsBecomeCumulativeSums(t *testing.T) {
	const (
		counterName = "dcgm_fi_dev_pcie_replay_counter"
		gaugeName   = "dcgm_fi_dev_gpu_temp"
	)

	input := &collector.HealthData{
		Timestamp: time.Now(),
		MachineID: "test-machine",
		Metrics: metrics.Metrics{
			{
				Component:        "gpu",
				Name:             counterName,
				Type:             metrics.MetricTypeCounter,
				UnixMilliseconds: 1699200000000,
				Value:            42,
				Labels:           map[string]string{"uuid": "GPU-0", "gpu": "0"},
			},
			{
				Component:        "gpu",
				Name:             gaugeName,
				Type:             metrics.MetricTypeGauge,
				UnixMilliseconds: 1699200001000,
				Value:            65,
				Labels:           map[string]string{"uuid": "GPU-0", "gpu": "0"},
			},
		},
	}

	exported := NewOTLPConverter().Convert(input)
	scopeMetrics := exported.Metrics.ResourceMetrics[0].ScopeMetrics[0].Metrics

	// The counter must surface as a monotonic cumulative sum with one point.
	counter := findOTLPMetric(scopeMetrics, counterName)
	require.NotNil(t, counter)
	assert.Empty(t, counter.Unit)
	counterSum := counter.GetSum()
	require.NotNil(t, counterSum)
	assert.True(t, counterSum.IsMonotonic)
	assert.Equal(t, metricsv1.AggregationTemporality_AGGREGATION_TEMPORALITY_CUMULATIVE, counterSum.AggregationTemporality)
	require.Len(t, counterSum.DataPoints, 1)
	assert.Equal(t, 42.0, counterSum.DataPoints[0].GetAsDouble())

	// The gauge must keep its gauge representation with one point.
	temp := findOTLPMetric(scopeMetrics, gaugeName)
	require.NotNil(t, temp)
	assert.Empty(t, temp.Unit)
	tempGauge := temp.GetGauge()
	require.NotNil(t, tempGauge)
	require.Len(t, tempGauge.DataPoints, 1)
	assert.Equal(t, 65.0, tempGauge.DataPoints[0].GetAsDouble())
}

func TestOTLPConverter_Convert_WithEvents(t *testing.T) {
data := &collector.HealthData{
Timestamp: time.Now(),
Expand Down Expand Up @@ -733,7 +780,7 @@ func TestOTLPConverter_UpMetric(t *testing.T) {
}

require.NotNil(t, upMetric, "Should have fleetint_agent_up metric")
assert.Equal(t, "1", upMetric.Unit)
assert.Empty(t, upMetric.Unit)
assert.Contains(t, upMetric.Description, "liveness")

gauge := upMetric.Data.(*metricsv1.Metric_Gauge).Gauge
Expand All @@ -745,6 +792,15 @@ func TestOTLPConverter_UpMetric(t *testing.T) {
assert.Empty(t, point.Attributes)
}

// findOTLPMetric returns the first entry in metrics whose Name equals name,
// or nil when no entry matches.
func findOTLPMetric(metrics []*metricsv1.Metric, name string) *metricsv1.Metric {
	for i := range metrics {
		if candidate := metrics[i]; candidate.Name == name {
			return candidate
		}
	}
	return nil
}

func TestOTLPConverter_ResourceAttributes(t *testing.T) {
data := &collector.HealthData{
Timestamp: time.Now(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ var (
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
).MustCurryWith(componentLabel)

metricDCGMFIDevUncorrectableRemappedRows = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
metricDCGMFIDevUncorrectableRemappedRows = pkgmetrics.NewSettableCounterVec(
prometheus.CounterOpts{
Namespace: "",
Subsystem: "",
Name: "dcgm_fi_dev_uncorrectable_remapped_rows",
Expand All @@ -93,8 +93,8 @@ var (
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
).MustCurryWith(componentLabel)

metricDCGMFIDevCorrectableRemappedRows = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
metricDCGMFIDevCorrectableRemappedRows = pkgmetrics.NewSettableCounterVec(
prometheus.CounterOpts{
Namespace: "",
Subsystem: "",
Name: "dcgm_fi_dev_correctable_remapped_rows",
Expand Down Expand Up @@ -133,8 +133,8 @@ var (
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
).MustCurryWith(componentLabel)

metricDCGMFIDevECCSBEVolTotal = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
metricDCGMFIDevECCSBEVolTotal = pkgmetrics.NewSettableCounterVec(
prometheus.CounterOpts{
Namespace: "",
Subsystem: "",
Name: "dcgm_fi_dev_ecc_sbe_vol_total",
Expand All @@ -143,8 +143,8 @@ var (
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
).MustCurryWith(componentLabel)

metricDCGMFIDevECCDBEVolTotal = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
metricDCGMFIDevECCDBEVolTotal = pkgmetrics.NewSettableCounterVec(
prometheus.CounterOpts{
Namespace: "",
Subsystem: "",
Name: "dcgm_fi_dev_ecc_dbe_vol_total",
Expand All @@ -153,8 +153,8 @@ var (
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
).MustCurryWith(componentLabel)

metricDCGMFIDevECCSBEAggTotal = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
metricDCGMFIDevECCSBEAggTotal = pkgmetrics.NewSettableCounterVec(
prometheus.CounterOpts{
Namespace: "",
Subsystem: "",
Name: "dcgm_fi_dev_ecc_sbe_agg_total",
Expand All @@ -163,8 +163,8 @@ var (
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
).MustCurryWith(componentLabel)

metricDCGMFIDevECCDBAggTotal = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
metricDCGMFIDevECCDBAggTotal = pkgmetrics.NewSettableCounterVec(
prometheus.CounterOpts{
Namespace: "",
Subsystem: "",
Name: "dcgm_fi_dev_ecc_dbe_agg_total",
Expand All @@ -173,32 +173,32 @@ var (
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
).MustCurryWith(componentLabel)

metricDCGMFIDevECCSBEVolDev = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
metricDCGMFIDevECCSBEVolDev = pkgmetrics.NewSettableCounterVec(
prometheus.CounterOpts{
Name: "dcgm_fi_dev_ecc_sbe_vol_dev",
Help: "Device memory single bit volatile ECC errors.",
},
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
).MustCurryWith(componentLabel)

metricDCGMFIDevECCDBEVolDev = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
metricDCGMFIDevECCDBEVolDev = pkgmetrics.NewSettableCounterVec(
prometheus.CounterOpts{
Name: "dcgm_fi_dev_ecc_dbe_vol_dev",
Help: "Device memory double bit volatile ECC errors.",
},
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
).MustCurryWith(componentLabel)

metricDCGMFIDevECCSBEAggDev = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
metricDCGMFIDevECCSBEAggDev = pkgmetrics.NewSettableCounterVec(
prometheus.CounterOpts{
Name: "dcgm_fi_dev_ecc_sbe_agg_dev",
Help: "Device memory single bit aggregate (persistent) ECC errors. Note: monotonically increasing.",
},
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
).MustCurryWith(componentLabel)

metricDCGMFIDevECCDBEAggDev = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
metricDCGMFIDevECCDBEAggDev = pkgmetrics.NewSettableCounterVec(
prometheus.CounterOpts{
Name: "dcgm_fi_dev_ecc_dbe_agg_dev",
Help: "Device memory double bit aggregate (persistent) ECC errors. Note: monotonically increasing.",
},
Expand Down
Loading
Loading