Skip to content

Commit c340ce9

Browse files
author
manogna_grandhi
committed
Add per-channel and per-DIMM error counts to the edac collector (replacing the per-csrow counts)
Signed-off-by: manogna_grandhi <grandhi.manogna@flipkart.com>
1 parent 9fd21e8 commit c340ce9

4 files changed

Lines changed: 201 additions & 75 deletions

File tree

collector/edac_linux.go

Lines changed: 153 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@ package collector
1818
import (
1919
"fmt"
2020
"log/slog"
21+
"os"
2122
"path/filepath"
2223
"regexp"
24+
"strings"
2325

2426
"github.com/prometheus/client_golang/prometheus"
2527
)
@@ -30,115 +32,213 @@ const (
3032

3133
var (
3234
edacMemControllerRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]*)`)
33-
edacMemCsrowRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/csrow([0-9]*)`)
35+
edacMemDimmRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/dimm([0-9]*)`)
3436
)
3537

3638
type edacCollector struct {
37-
ceCount *prometheus.Desc
38-
ueCount *prometheus.Desc
39-
csRowCECount *prometheus.Desc
40-
csRowUECount *prometheus.Desc
41-
logger *slog.Logger
39+
ceCount *prometheus.Desc
40+
ueCount *prometheus.Desc
41+
channelCECount *prometheus.Desc
42+
channelUECount *prometheus.Desc
43+
dimmCECount *prometheus.Desc
44+
dimmUECount *prometheus.Desc
45+
logger *slog.Logger
4246
}
4347

4448
func init() {
4549
registerCollector("edac", defaultEnabled, NewEdacCollector)
4650
}
4751

48-
// NewEdacCollector returns a new Collector exposing edac stats.
4952
func NewEdacCollector(logger *slog.Logger) (Collector, error) {
53+
5054
return &edacCollector{
55+
5156
ceCount: prometheus.NewDesc(
5257
prometheus.BuildFQName(namespace, edacSubsystem, "correctable_errors_total"),
5358
"Total correctable memory errors.",
54-
[]string{"controller"}, nil,
59+
[]string{"controller"},
60+
nil,
5561
),
62+
5663
ueCount: prometheus.NewDesc(
5764
prometheus.BuildFQName(namespace, edacSubsystem, "uncorrectable_errors_total"),
5865
"Total uncorrectable memory errors.",
59-
[]string{"controller"}, nil,
66+
[]string{"controller"},
67+
nil,
68+
),
69+
70+
channelCECount: prometheus.NewDesc(
71+
prometheus.BuildFQName(namespace, edacSubsystem, "channel_correctable_errors_total"),
72+
"Total correctable memory errors for this channel.",
73+
[]string{"controller", "csrow", "channel", "dimm_label"},
74+
nil,
6075
),
61-
csRowCECount: prometheus.NewDesc(
62-
prometheus.BuildFQName(namespace, edacSubsystem, "csrow_correctable_errors_total"),
63-
"Total correctable memory errors for this csrow.",
64-
[]string{"controller", "csrow"}, nil,
76+
77+
channelUECount: prometheus.NewDesc(
78+
prometheus.BuildFQName(namespace, edacSubsystem, "channel_uncorrectable_errors_total"),
79+
"Total uncorrectable memory errors for this channel.",
80+
[]string{"controller", "csrow", "channel", "dimm_label"},
81+
nil,
6582
),
66-
csRowUECount: prometheus.NewDesc(
67-
prometheus.BuildFQName(namespace, edacSubsystem, "csrow_uncorrectable_errors_total"),
68-
"Total uncorrectable memory errors for this csrow.",
69-
[]string{"controller", "csrow"}, nil,
83+
84+
dimmCECount: prometheus.NewDesc(
85+
prometheus.BuildFQName(namespace, edacSubsystem, "dimm_correctable_errors_total"),
86+
"Total correctable memory errors for this dimm.",
87+
[]string{"controller", "dimm"},
88+
nil,
7089
),
90+
91+
dimmUECount: prometheus.NewDesc(
92+
prometheus.BuildFQName(namespace, edacSubsystem, "dimm_uncorrectable_errors_total"),
93+
"Total uncorrectable memory errors for this dimm.",
94+
[]string{"controller", "dimm"},
95+
nil,
96+
),
97+
7198
logger: logger,
7299
}, nil
73100
}
74101

75102
func (c *edacCollector) Update(ch chan<- prometheus.Metric) error {
103+
76104
memControllers, err := filepath.Glob(sysFilePath("devices/system/edac/mc/mc[0-9]*"))
77105
if err != nil {
78106
return err
79107
}
108+
80109
for _, controller := range memControllers {
110+
81111
controllerMatch := edacMemControllerRE.FindStringSubmatch(controller)
82112
if controllerMatch == nil {
83113
return fmt.Errorf("controller string didn't match regexp: %s", controller)
84114
}
115+
85116
controllerNumber := controllerMatch[1]
86117

87118
value, err := readUintFromFile(filepath.Join(controller, "ce_count"))
88-
if err != nil {
89-
return fmt.Errorf("couldn't get ce_count for controller %s: %w", controllerNumber, err)
119+
if err == nil {
120+
ch <- prometheus.MustNewConstMetric(
121+
c.ceCount,
122+
prometheus.CounterValue,
123+
float64(value),
124+
controllerNumber,
125+
)
90126
}
91-
ch <- prometheus.MustNewConstMetric(
92-
c.ceCount, prometheus.CounterValue, float64(value), controllerNumber)
93127

94-
value, err = readUintFromFile(filepath.Join(controller, "ce_noinfo_count"))
95-
if err != nil {
96-
return fmt.Errorf("couldn't get ce_noinfo_count for controller %s: %w", controllerNumber, err)
128+
value, err = readUintFromFile(filepath.Join(controller, "ue_count"))
129+
if err == nil {
130+
ch <- prometheus.MustNewConstMetric(
131+
c.ueCount,
132+
prometheus.CounterValue,
133+
float64(value),
134+
controllerNumber,
135+
)
97136
}
98-
ch <- prometheus.MustNewConstMetric(
99-
c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown")
100137

101-
value, err = readUintFromFile(filepath.Join(controller, "ue_count"))
138+
csrows, err := filepath.Glob(controller + "/csrow[0-9]*")
139+
102140
if err != nil {
103-
return fmt.Errorf("couldn't get ue_count for controller %s: %w", controllerNumber, err)
141+
return err
104142
}
105-
ch <- prometheus.MustNewConstMetric(
106-
c.ueCount, prometheus.CounterValue, float64(value), controllerNumber)
107143

108-
value, err = readUintFromFile(filepath.Join(controller, "ue_noinfo_count"))
109-
if err != nil {
110-
return fmt.Errorf("couldn't get ue_noinfo_count for controller %s: %w", controllerNumber, err)
144+
for _, csrow := range csrows {
145+
base := filepath.Base(csrow)
146+
147+
match := regexp.MustCompile(`csrow([0-9]+)`).FindStringSubmatch(base)
148+
if match == nil {
149+
continue
150+
}
151+
csrowNumber := match[1]
152+
153+
channelFiles, err := filepath.Glob(csrow + "/ch*_ce_count")
154+
if err != nil {
155+
return err
156+
}
157+
158+
for _, chFile := range channelFiles {
159+
160+
base := filepath.Base(chFile)
161+
162+
match := regexp.MustCompile(`ch([0-9]+)_ce_count`).FindStringSubmatch(base)
163+
if match == nil {
164+
continue
165+
}
166+
167+
channelNumber := match[1]
168+
label := "unknown"
169+
labelBytes, err := os.ReadFile(filepath.Join(csrow, "ch"+channelNumber+"_dimm_label"))
170+
if err == nil {
171+
label = strings.TrimSpace(string(labelBytes))
172+
// format label
173+
label = strings.ReplaceAll(label, "#", "")
174+
label = strings.ReplaceAll(label, "csrow", "_csrow")
175+
label = strings.ReplaceAll(label, "channel", "_channel")
176+
}
177+
value, err := readUintFromFile(chFile)
178+
if err == nil {
179+
ch <- prometheus.MustNewConstMetric(
180+
c.channelCECount,
181+
prometheus.CounterValue,
182+
float64(value),
183+
controllerNumber,
184+
csrowNumber,
185+
channelNumber,
186+
label,
187+
)
188+
}
189+
190+
value, err = readUintFromFile(filepath.Join(csrow, "ch"+channelNumber+"_ue_count"))
191+
if err == nil {
192+
ch <- prometheus.MustNewConstMetric(
193+
c.channelUECount,
194+
prometheus.CounterValue,
195+
float64(value),
196+
controllerNumber,
197+
csrowNumber,
198+
channelNumber,
199+
label,
200+
)
201+
}
202+
}
111203
}
112-
ch <- prometheus.MustNewConstMetric(
113-
c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown")
114204

115-
// For each controller, walk the csrow directories.
116-
csrows, err := filepath.Glob(controller + "/csrow[0-9]*")
205+
dimms, err := filepath.Glob(controller + "/dimm[0-9]*")
117206
if err != nil {
118207
return err
119208
}
120-
for _, csrow := range csrows {
121-
csrowMatch := edacMemCsrowRE.FindStringSubmatch(csrow)
122-
if csrowMatch == nil {
123-
return fmt.Errorf("csrow string didn't match regexp: %s", csrow)
209+
210+
for _, dimm := range dimms {
211+
212+
dimmMatch := edacMemDimmRE.FindStringSubmatch(dimm)
213+
if dimmMatch == nil || len(dimmMatch) < 2 {
214+
continue
124215
}
125-
csrowNumber := csrowMatch[1]
126216

127-
value, err = readUintFromFile(filepath.Join(csrow, "ce_count"))
128-
if err != nil {
129-
return fmt.Errorf("couldn't get ce_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err)
217+
dimmNumber := dimmMatch[1]
218+
219+
value, err := readUintFromFile(filepath.Join(dimm, "dimm_ce_count"))
220+
if err == nil {
221+
ch <- prometheus.MustNewConstMetric(
222+
c.dimmCECount,
223+
prometheus.CounterValue,
224+
float64(value),
225+
controllerNumber,
226+
dimmNumber,
227+
)
130228
}
131-
ch <- prometheus.MustNewConstMetric(
132-
c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber)
133229

134-
value, err = readUintFromFile(filepath.Join(csrow, "ue_count"))
135-
if err != nil {
136-
return fmt.Errorf("couldn't get ue_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err)
230+
value, err = readUintFromFile(filepath.Join(dimm, "dimm_ue_count"))
231+
if err == nil {
232+
ch <- prometheus.MustNewConstMetric(
233+
c.dimmUECount,
234+
prometheus.CounterValue,
235+
float64(value),
236+
controllerNumber,
237+
dimmNumber,
238+
)
137239
}
138-
ch <- prometheus.MustNewConstMetric(
139-
c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber)
140240
}
141241
}
142242

143-
return err
243+
return nil
144244
}

collector/fixtures/e2e-64k-page-output.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,18 @@ node_drbd_remote_pending{device="drbd1"} 12346
740740
# HELP node_drbd_remote_unacknowledged Number of requests received by the peer via the network connection, but that have not yet been answered.
741741
# TYPE node_drbd_remote_unacknowledged gauge
742742
node_drbd_remote_unacknowledged{device="drbd1"} 12347
743+
# HELP node_edac_channel_correctable_errors_total Total correctable memory errors for this channel.
744+
# TYPE node_edac_channel_correctable_errors_total counter
745+
node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 0
746+
node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 0
747+
node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 0
748+
node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 0
749+
# HELP node_edac_channel_uncorrectable_errors_total Total uncorrectable memory errors for this channel.
750+
# TYPE node_edac_channel_uncorrectable_errors_total counter
751+
node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 2
752+
node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 2
753+
node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 2
754+
node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 2
743755
# HELP node_edac_correctable_errors_total Total correctable memory errors.
744756
# TYPE node_edac_correctable_errors_total counter
745757
node_edac_correctable_errors_total{controller="0"} 1

collector/fixtures/e2e-output.txt

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -775,17 +775,21 @@ node_drbd_remote_unacknowledged{device="drbd1"} 12347
775775
# HELP node_edac_correctable_errors_total Total correctable memory errors.
776776
# TYPE node_edac_correctable_errors_total counter
777777
node_edac_correctable_errors_total{controller="0"} 1
778-
# HELP node_edac_csrow_correctable_errors_total Total correctable memory errors for this csrow.
779-
# TYPE node_edac_csrow_correctable_errors_total counter
780-
node_edac_csrow_correctable_errors_total{controller="0",csrow="0"} 3
781-
node_edac_csrow_correctable_errors_total{controller="0",csrow="unknown"} 2
782-
# HELP node_edac_csrow_uncorrectable_errors_total Total uncorrectable memory errors for this csrow.
783-
# TYPE node_edac_csrow_uncorrectable_errors_total counter
784-
node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="0"} 4
785-
node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="unknown"} 6
786778
# HELP node_edac_uncorrectable_errors_total Total uncorrectable memory errors.
787779
# TYPE node_edac_uncorrectable_errors_total counter
788780
node_edac_uncorrectable_errors_total{controller="0"} 5
781+
# HELP node_edac_channel_correctable_errors_total Total correctable memory errors for this channel.
782+
# TYPE node_edac_channel_correctable_errors_total counter
783+
node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 0
784+
node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 0
785+
node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 0
786+
node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 0
787+
# HELP node_edac_channel_uncorrectable_errors_total Total uncorrectable memory errors for this channel.
788+
# TYPE node_edac_channel_uncorrectable_errors_total counter
789+
node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 2
790+
node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 2
791+
node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 2
792+
node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 2
789793
# HELP node_entropy_available_bits Bits of available entropy.
790794
# TYPE node_entropy_available_bits gauge
791795
node_entropy_available_bits 1337

0 commit comments

Comments
 (0)