Skip to content

Commit c724fff

Browse files
committed
feat(ethtool): add metrics for SFP module performance and status
Signed-off-by: tdakkota <tdakkota@yandex.ru>
1 parent 0f5c158 commit c724fff

5 files changed

Lines changed: 732 additions & 9 deletions

File tree

collector/ethtool_linux.go

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"os"
2727
"regexp"
2828
"sort"
29+
"strconv"
2930
"strings"
3031
"sync"
3132
"syscall"
@@ -49,6 +50,7 @@ type Ethtool interface {
4950
DriverInfo(string) (ethtool.DrvInfo, error)
5051
Stats(string) (map[string]uint64, error)
5152
LinkInfo(string) (ethtool.EthtoolCmd, error)
53+
ModuleEeprom(string) ([]byte, error)
5254
}
5355

5456
type ethtoolLibrary struct {
@@ -69,15 +71,24 @@ func (e *ethtoolLibrary) LinkInfo(intf string) (ethtool.EthtoolCmd, error) {
6971
return ethtoolCmd, err
7072
}
7173

74+
func (e *ethtoolLibrary) ModuleEeprom(intf string) ([]byte, error) {
75+
return e.ethtool.ModuleEeprom(intf)
76+
}
77+
7278
// ethtoolCollector bundles the dependencies and metric descriptors used by the
// ethtool collector. The module*Desc fields describe the transceiver DOM/DDM
// gauges that makeEthtoolCollector builds and Update emits.
type ethtoolCollector struct {
73-
fs sysfs.FS
74-
entries map[string]*prometheus.Desc
75-
entriesMutex sync.Mutex
76-
ethtool Ethtool
77-
deviceFilter deviceFilter
78-
infoDesc *prometheus.Desc
79-
metricsPattern *regexp.Regexp
80-
logger *slog.Logger
79+
fs sysfs.FS
80+
entries map[string]*prometheus.Desc
81+
entriesMutex sync.Mutex
82+
ethtool Ethtool
83+
deviceFilter deviceFilter
84+
infoDesc *prometheus.Desc
85+
moduleTemperatureDesc *prometheus.Desc // module_temperature_celsius gauge, labelled by device
86+
moduleVoltageDesc *prometheus.Desc // module_voltage_volts gauge, labelled by device
87+
moduleTxBiasDesc *prometheus.Desc // module_tx_bias_milliamperes gauge, labelled by device and lane
88+
moduleTxPowerDesc *prometheus.Desc // module_tx_power_milliwatts gauge, labelled by device and lane
89+
moduleRxPowerDesc *prometheus.Desc // module_rx_power_milliwatts gauge, labelled by device and lane
90+
metricsPattern *regexp.Regexp
91+
logger *slog.Logger
8192
}
8293

8394
// makeEthtoolCollector is the internal constructor for EthtoolCollector.
@@ -111,6 +122,31 @@ func makeEthtoolCollector(logger *slog.Logger) (*ethtoolCollector, error) {
111122
deviceFilter: newDeviceFilter(*ethtoolDeviceExclude, *ethtoolDeviceInclude),
112123
metricsPattern: regexp.MustCompile(*ethtoolIncludedMetrics),
113124
logger: logger,
125+
moduleTemperatureDesc: prometheus.NewDesc(
126+
prometheus.BuildFQName(namespace, "ethtool", "module_temperature_celsius"),
127+
"Module temperature in degrees Celsius",
128+
[]string{"device"}, nil,
129+
),
130+
moduleVoltageDesc: prometheus.NewDesc(
131+
prometheus.BuildFQName(namespace, "ethtool", "module_voltage_volts"),
132+
"Module supply voltage in volts",
133+
[]string{"device"}, nil,
134+
),
135+
moduleTxBiasDesc: prometheus.NewDesc(
136+
prometheus.BuildFQName(namespace, "ethtool", "module_tx_bias_milliamperes"),
137+
"Module TX laser bias current in milliamperes",
138+
[]string{"device", "lane"}, nil,
139+
),
140+
moduleTxPowerDesc: prometheus.NewDesc(
141+
prometheus.BuildFQName(namespace, "ethtool", "module_tx_power_milliwatts"),
142+
"Module TX optical power in milliwatts",
143+
[]string{"device", "lane"}, nil,
144+
),
145+
moduleRxPowerDesc: prometheus.NewDesc(
146+
prometheus.BuildFQName(namespace, "ethtool", "module_rx_power_milliwatts"),
147+
"Module RX optical power in milliwatts",
148+
[]string{"device", "lane"}, nil,
149+
),
114150
entries: map[string]*prometheus.Desc{
115151
"rx_bytes": prometheus.NewDesc(
116152
prometheus.BuildFQName(namespace, "ethtool", "received_bytes_total"),
@@ -445,6 +481,27 @@ func (c *ethtoolCollector) Update(ch chan<- prometheus.Metric) error {
445481
}
446482
}
447483

484+
eepromData, err := c.ethtool.ModuleEeprom(device)
485+
if err == nil {
486+
modMetrics, parseErr := parseModuleEeprom(eepromData)
487+
if parseErr == nil {
488+
ch <- prometheus.MustNewConstMetric(c.moduleTemperatureDesc, prometheus.GaugeValue, modMetrics.temperature, device)
489+
ch <- prometheus.MustNewConstMetric(c.moduleVoltageDesc, prometheus.GaugeValue, modMetrics.voltage, device)
490+
for i, lane := range modMetrics.lanes {
491+
laneStr := strconv.Itoa(i + 1)
492+
ch <- prometheus.MustNewConstMetric(c.moduleTxBiasDesc, prometheus.GaugeValue, lane.txBias, device, laneStr)
493+
ch <- prometheus.MustNewConstMetric(c.moduleTxPowerDesc, prometheus.GaugeValue, lane.txPower, device, laneStr)
494+
ch <- prometheus.MustNewConstMetric(c.moduleRxPowerDesc, prometheus.GaugeValue, lane.rxPower, device, laneStr)
495+
}
496+
} else {
497+
c.logger.Debug("ethtool module EEPROM parse error", "err", parseErr, "device", device)
498+
}
499+
} else if err != unix.EOPNOTSUPP {
500+
c.logger.Error("ethtool module EEPROM error", "err", err, "device", device)
501+
} else {
502+
c.logger.Debug("ethtool module EEPROM error", "err", err, "device", device)
503+
}
504+
448505
if len(stats) == 0 {
449506
// No stats returned; device does not support ethtool stats.
450507
continue

collector/ethtool_linux_test.go

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package collector
1717

1818
import (
1919
"bufio"
20+
"errors"
2021
"fmt"
2122
"io"
2223
"log/slog"
@@ -257,6 +258,14 @@ func (e *EthtoolFixture) LinkInfo(intf string) (ethtool.EthtoolCmd, error) {
257258
return res, err
258259
}
259260

261+
func (e *EthtoolFixture) ModuleEeprom(intf string) ([]byte, error) {
262+
data, err := os.ReadFile(filepath.Join(e.fixturePath, intf, "module_eeprom"))
263+
if errors.Is(err, os.ErrNotExist) {
264+
return nil, unix.EOPNOTSUPP
265+
}
266+
return data, err
267+
}
268+
260269
func NewEthtoolTestCollector(logger *slog.Logger) (Collector, error) {
261270
collector, err := makeEthtoolCollector(logger)
262271
if err != nil {
@@ -288,7 +297,22 @@ func TestBuildEthtoolFQName(t *testing.T) {
288297
}
289298

290299
func TestEthToolCollector(t *testing.T) {
291-
testcase := `# HELP node_ethtool_align_errors Network interface align_errors
300+
testcase := `# HELP node_ethtool_module_rx_power_milliwatts Module RX optical power in milliwatts
301+
# TYPE node_ethtool_module_rx_power_milliwatts gauge
302+
node_ethtool_module_rx_power_milliwatts{device="eth0",lane="1"} 0.5
303+
# HELP node_ethtool_module_temperature_celsius Module temperature in degrees Celsius
304+
# TYPE node_ethtool_module_temperature_celsius gauge
305+
node_ethtool_module_temperature_celsius{device="eth0"} 25
306+
# HELP node_ethtool_module_tx_bias_milliamperes Module TX laser bias current in milliamperes
307+
# TYPE node_ethtool_module_tx_bias_milliamperes gauge
308+
node_ethtool_module_tx_bias_milliamperes{device="eth0",lane="1"} 20
309+
# HELP node_ethtool_module_tx_power_milliwatts Module TX optical power in milliwatts
310+
# TYPE node_ethtool_module_tx_power_milliwatts gauge
311+
node_ethtool_module_tx_power_milliwatts{device="eth0",lane="1"} 1
312+
# HELP node_ethtool_module_voltage_volts Module supply voltage in volts
313+
# TYPE node_ethtool_module_voltage_volts gauge
314+
node_ethtool_module_voltage_volts{device="eth0"} 3.2976
315+
# HELP node_ethtool_align_errors Network interface align_errors
292316
# TYPE node_ethtool_align_errors untyped
293317
node_ethtool_align_errors{device="eth0"} 0
294318
# HELP node_ethtool_info A metric with a constant '1' value labeled by bus_info, device, driver, expansion_rom_version, firmware_version, version.

collector/ethtool_sfp_linux.go

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
// Copyright 2021 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !noethtool
15+
16+
// SFP/QSFP module EEPROM parsing for Digital Optical Monitoring (DOM) /
17+
// Digital Diagnostic Monitoring (DDM) data.
18+
//
19+
// Standards:
20+
// - SFF-8472: SFP/SFP+ DDM (A0 + A2 EEPROM pages, 512 bytes total)
21+
// - SFF-8636: QSFP/QSFP28 DOM (page 0, 256 bytes)
22+
23+
package collector
24+
25+
import (
26+
"encoding/binary"
27+
"fmt"
28+
)
29+
30+
// SFP/QSFP module identifier values (EEPROM byte 0). parseModuleEeprom
// switches on these to choose between the SFF-8472 and SFF-8636 parsers.
31+
const (
32+
sfpIdentifierSFP = 0x03 // SFP/SFP+/SFP28 (SFF-8472)
33+
// NOTE(review): the SFF-8024 identifier table appears to assign 0x0B to
// "DWDM-SFP/SFP+ (not using SFF-8472)"; confirm that parsing it with the
// SFF-8472 layout is intended.
sfpIdentifierSFPAlt = 0x0B // SFP+ alternative identifier
34+
// NOTE(review): SFF-8024 appears to reference INF-8438 for 0x0C; confirm
// the specification name.
sfpIdentifierQSFP = 0x0C // QSFP (SFF-8436)
35+
sfpIdentifierQSFPP = 0x0D // QSFP+ (SFF-8436)
36+
sfpIdentifierQSFP28 = 0x11 // QSFP28 (SFF-8636)
37+
)
38+
39+
// sfpLaneMetrics holds per-lane optical monitoring values, in the units
// returned by parseSFPBias (milliamperes) and parseSFPPower (milliwatts).
40+
type sfpLaneMetrics struct {
41+
txBias float64 // TX laser bias current in milliamperes
42+
txPower float64 // TX optical power in milliwatts
43+
rxPower float64 // RX optical power in milliwatts
44+
}
45+
46+
// sfpMetrics holds parsed DOM/DDM values from a transceiver module. It is
// produced by parseSFF8472 (a single lane) or parseSFF8636 (four lanes).
47+
type sfpMetrics struct {
48+
temperature float64 // Module temperature in degrees Celsius
49+
voltage float64 // Module supply voltage in volts
50+
lanes []sfpLaneMetrics // Per-lane metrics (1 lane for SFP, 4 for QSFP)
51+
}
52+
53+
// parseModuleEeprom parses raw EEPROM bytes returned by ethtool GMODULEEEPROM
54+
// and extracts DOM/DDM values.
55+
//
56+
// Returns an error if the data is too short, the identifier is unrecognised, or DDM is not available.
57+
func parseModuleEeprom(data []byte) (sfpMetrics, error) {
58+
if len(data) < 1 {
59+
return sfpMetrics{}, fmt.Errorf("module EEPROM data too short (%d bytes)", len(data))
60+
}
61+
62+
switch data[0] {
63+
case sfpIdentifierSFP, sfpIdentifierSFPAlt:
64+
return parseSFF8472(data)
65+
case sfpIdentifierQSFP, sfpIdentifierQSFPP, sfpIdentifierQSFP28:
66+
return parseSFF8636(data)
67+
default:
68+
return sfpMetrics{}, fmt.Errorf("unsupported module identifier 0x%02x", data[0])
69+
}
70+
}
71+
72+
// parseSFF8472 parses SFP/SFP+ DDM data per SFF-8472.
73+
func parseSFF8472(data []byte) (sfpMetrics, error) {
74+
const (
75+
a0DiagnosticType = 92 // A0 page: diagnostic monitoring type byte
76+
ddmSupportBit = 0x40 // bit 6: DDM implemented
77+
78+
// Offsets within the full 512-byte dump (A2 page starts at 256).
79+
a2PageOffset = 256
80+
valuesOffset = a2PageOffset + 96
81+
82+
tempOffset = valuesOffset
83+
voltageOffset = tempOffset + 2
84+
txBiasOffset = voltageOffset + 2
85+
txPowerOffset = txBiasOffset + 2
86+
rxPowerOffset = txPowerOffset + 2
87+
minLen = rxPowerOffset + 2
88+
)
89+
90+
if len(data) < a0DiagnosticType+1 {
91+
return sfpMetrics{}, fmt.Errorf("SFF-8472 EEPROM too short for diagnostic type byte (%d bytes)", len(data))
92+
}
93+
if data[a0DiagnosticType]&ddmSupportBit == 0 {
94+
return sfpMetrics{}, fmt.Errorf("SFP module does not support DDM (diagnostic type byte: 0x%02x)", data[a0DiagnosticType])
95+
}
96+
if len(data) < minLen {
97+
return sfpMetrics{}, fmt.Errorf("SFF-8472 EEPROM too short for DDM values (%d bytes, need %d)", len(data), minLen)
98+
}
99+
100+
temp := parseSFPTemperature(data[tempOffset:])
101+
voltage := parseSFPVoltage(data[voltageOffset:])
102+
103+
txBias := parseSFPBias(data[txBiasOffset:])
104+
txPower := parseSFPPower(data[txPowerOffset:])
105+
rxPower := parseSFPPower(data[rxPowerOffset:])
106+
107+
return sfpMetrics{
108+
temperature: temp,
109+
voltage: voltage,
110+
lanes: []sfpLaneMetrics{
111+
{txBias: txBias, txPower: txPower, rxPower: rxPower},
112+
},
113+
}, nil
114+
}
115+
116+
// parseSFF8636 parses QSFP/QSFP28 DOM data per SFF-8636.
117+
func parseSFF8636(data []byte) (sfpMetrics, error) {
118+
// All real-time values are on Page 00h.
119+
const (
120+
// Table 6-8 Free Side Monitoring Values
121+
tempOffset = 22 // Temperature MSB
122+
voltageOffset = 26 // Supply voltage MSB
123+
124+
// Table 6-9 Channel Monitoring Values.
125+
numLanes = 4
126+
rxPowerOffset = 34 // RX power ch1 MSB
127+
txBiasOffset = rxPowerOffset + numLanes*2 // TX bias ch1 MSB
128+
txPowerOffset = txBiasOffset + numLanes*2 // TX power ch1 MSB
129+
130+
minLen = txPowerOffset + numLanes*2
131+
)
132+
133+
if len(data) < minLen {
134+
return sfpMetrics{}, fmt.Errorf("SFF-8636 EEPROM too short (%d bytes, need %d)", len(data), minLen)
135+
}
136+
137+
temp := parseSFPTemperature(data[tempOffset:])
138+
voltage := parseSFPVoltage(data[voltageOffset:])
139+
140+
lanes := make([]sfpLaneMetrics, numLanes)
141+
for i := range numLanes {
142+
lanes[i] = sfpLaneMetrics{
143+
rxPower: parseSFPPower(data[rxPowerOffset+i*2:]),
144+
txBias: parseSFPBias(data[txBiasOffset+i*2:]),
145+
txPower: parseSFPPower(data[txPowerOffset+i*2:]),
146+
}
147+
}
148+
149+
return sfpMetrics{
150+
temperature: temp,
151+
voltage: voltage,
152+
lanes: lanes,
153+
}, nil
154+
}
155+
156+
// parseSFPTemperature converts the 16-bit temperature register at the start of
// b into degrees Celsius. b must hold at least two bytes.
//
// Per SFF-8472 Table 9-1 (Bit Weights for Temperature Reporting Registers) the
// register is a big-endian signed value: the most significant byte carries the
// sign and the whole degrees (64 down to 1), the least significant byte the
// fraction (1/2 down to 1/256). One count is therefore 1/256 °C.
func parseSFPTemperature(b []byte) float64 {
	const countsPerDegree = 256.0
	signed := int16(binary.BigEndian.Uint16(b))
	return float64(signed) / countsPerDegree
}
172+
173+
// parseSFPVoltage converts the 16-bit supply-voltage register at the start of
// b into volts. b must hold at least two bytes.
//
// Per SFF-8472 section 9.2 (Internal Calibration), item 2, the internally
// measured transceiver supply voltage is a big-endian 16-bit unsigned integer
// with an LSB of 100 microvolts, giving a range of 0 V to +6.55 V.
func parseSFPVoltage(b []byte) float64 {
	counts := float64(binary.BigEndian.Uint16(b))
	// 100 µV counts -> millivolts -> volts, as two separate divisions.
	return counts / 10 / 1000
}
186+
187+
// parseSFPBias converts the 16-bit TX bias register at the start of b into
// milliamperes. b must hold at least two bytes.
//
// Per SFF-8472 section 9.2 (Internal Calibration), item 3, the measured TX
// bias current is a big-endian 16-bit unsigned integer with an LSB of
// 2 microamps, giving a range of 0 to 131 mA.
func parseSFPBias(b []byte) float64 {
	const countsPerMilliamp = 500 // 1 mA / 2 µA
	return float64(binary.BigEndian.Uint16(b)) / countsPerMilliamp
}
199+
200+
// parseSFPPower converts a 16-bit optical power register at the start of b
// into milliwatts. It serves both TX output power and RX received power.
// b must hold at least two bytes.
//
// Per SFF-8472 section 9.2 (Internal Calibration), items 4 and 5, both
// quantities are big-endian 16-bit unsigned integers with an LSB of
// 0.1 microwatts, giving a range of 0 to 6.5535 mW (-40 to +8.2 dBm). For RX
// the value is either average received power or OMA, depending on bit 3 of
// byte 92 (A0h); this function does not distinguish the two.
func parseSFPPower(b []byte) float64 {
	const countsPerMilliwatt = 10000 // 1 mW / 0.1 µW
	return float64(binary.BigEndian.Uint16(b)) / countsPerMilliwatt
}

0 commit comments

Comments
 (0)