@@ -18,8 +18,10 @@ package collector
1818import (
1919 "fmt"
2020 "log/slog"
21+ "os"
2122 "path/filepath"
2223 "regexp"
24+ "strings"
2325
2426 "github.com/prometheus/client_golang/prometheus"
2527)
@@ -30,115 +32,213 @@ const (
3032
// Regular expressions applied to full sysfs paths under
// devices/system/edac/mc. Each one captures a single group: the numeric
// suffix of the memory controller directory (mcN) or of the dimm directory
// (dimmN) respectively.
var (
	edacMemControllerRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]*)`)
	edacMemDimmRE       = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/dimm([0-9]*)`)
)
3537
3638type edacCollector struct {
37- ceCount * prometheus.Desc
38- ueCount * prometheus.Desc
39- csRowCECount * prometheus.Desc
40- csRowUECount * prometheus.Desc
41- logger * slog.Logger
39+ ceCount * prometheus.Desc
40+ ueCount * prometheus.Desc
41+ channelCECount * prometheus.Desc
42+ channelUECount * prometheus.Desc
43+ dimmCECount * prometheus.Desc
44+ dimmUECount * prometheus.Desc
45+ logger * slog.Logger
4246}
4347
4448func init () {
4549 registerCollector ("edac" , defaultEnabled , NewEdacCollector )
4650}
4751
48- // NewEdacCollector returns a new Collector exposing edac stats.
4952func NewEdacCollector (logger * slog.Logger ) (Collector , error ) {
53+
5054 return & edacCollector {
55+
5156 ceCount : prometheus .NewDesc (
5257 prometheus .BuildFQName (namespace , edacSubsystem , "correctable_errors_total" ),
5358 "Total correctable memory errors." ,
54- []string {"controller" }, nil ,
59+ []string {"controller" },
60+ nil ,
5561 ),
62+
5663 ueCount : prometheus .NewDesc (
5764 prometheus .BuildFQName (namespace , edacSubsystem , "uncorrectable_errors_total" ),
5865 "Total uncorrectable memory errors." ,
59- []string {"controller" }, nil ,
66+ []string {"controller" },
67+ nil ,
68+ ),
69+
70+ channelCECount : prometheus .NewDesc (
71+ prometheus .BuildFQName (namespace , edacSubsystem , "channel_correctable_errors_total" ),
72+ "Total correctable memory errors for this channel." ,
73+ []string {"controller" , "csrow" , "channel" , "dimm_label" },
74+ nil ,
6075 ),
61- csRowCECount : prometheus .NewDesc (
62- prometheus .BuildFQName (namespace , edacSubsystem , "csrow_correctable_errors_total" ),
63- "Total correctable memory errors for this csrow." ,
64- []string {"controller" , "csrow" }, nil ,
76+
77+ channelUECount : prometheus .NewDesc (
78+ prometheus .BuildFQName (namespace , edacSubsystem , "channel_uncorrectable_errors_total" ),
79+ "Total uncorrectable memory errors for this channel." ,
80+ []string {"controller" , "csrow" , "channel" , "dimm_label" },
81+ nil ,
6582 ),
66- csRowUECount : prometheus .NewDesc (
67- prometheus .BuildFQName (namespace , edacSubsystem , "csrow_uncorrectable_errors_total" ),
68- "Total uncorrectable memory errors for this csrow." ,
69- []string {"controller" , "csrow" }, nil ,
83+
84+ dimmCECount : prometheus .NewDesc (
85+ prometheus .BuildFQName (namespace , edacSubsystem , "dimm_correctable_errors_total" ),
86+ "Total correctable memory errors for this dimm." ,
87+ []string {"controller" , "dimm" },
88+ nil ,
7089 ),
90+
91+ dimmUECount : prometheus .NewDesc (
92+ prometheus .BuildFQName (namespace , edacSubsystem , "dimm_uncorrectable_errors_total" ),
93+ "Total uncorrectable memory errors for this dimm." ,
94+ []string {"controller" , "dimm" },
95+ nil ,
96+ ),
97+
7198 logger : logger ,
7299 }, nil
73100}
74101
75102func (c * edacCollector ) Update (ch chan <- prometheus.Metric ) error {
103+
76104 memControllers , err := filepath .Glob (sysFilePath ("devices/system/edac/mc/mc[0-9]*" ))
77105 if err != nil {
78106 return err
79107 }
108+
80109 for _ , controller := range memControllers {
110+
81111 controllerMatch := edacMemControllerRE .FindStringSubmatch (controller )
82112 if controllerMatch == nil {
83113 return fmt .Errorf ("controller string didn't match regexp: %s" , controller )
84114 }
115+
85116 controllerNumber := controllerMatch [1 ]
86117
87118 value , err := readUintFromFile (filepath .Join (controller , "ce_count" ))
88- if err != nil {
89- return fmt .Errorf ("couldn't get ce_count for controller %s: %w" , controllerNumber , err )
119+ if err == nil {
120+ ch <- prometheus .MustNewConstMetric (
121+ c .ceCount ,
122+ prometheus .CounterValue ,
123+ float64 (value ),
124+ controllerNumber ,
125+ )
90126 }
91- ch <- prometheus .MustNewConstMetric (
92- c .ceCount , prometheus .CounterValue , float64 (value ), controllerNumber )
93127
94- value , err = readUintFromFile (filepath .Join (controller , "ce_noinfo_count" ))
95- if err != nil {
96- return fmt .Errorf ("couldn't get ce_noinfo_count for controller %s: %w" , controllerNumber , err )
128+ value , err = readUintFromFile (filepath .Join (controller , "ue_count" ))
129+ if err == nil {
130+ ch <- prometheus .MustNewConstMetric (
131+ c .ueCount ,
132+ prometheus .CounterValue ,
133+ float64 (value ),
134+ controllerNumber ,
135+ )
97136 }
98- ch <- prometheus .MustNewConstMetric (
99- c .csRowCECount , prometheus .CounterValue , float64 (value ), controllerNumber , "unknown" )
100137
101- value , err = readUintFromFile (filepath .Join (controller , "ue_count" ))
138+ csrows , err := filepath .Glob (controller + "/csrow[0-9]*" )
139+
102140 if err != nil {
103- return fmt . Errorf ( "couldn't get ue_count for controller %s: %w" , controllerNumber , err )
141+ return err
104142 }
105- ch <- prometheus .MustNewConstMetric (
106- c .ueCount , prometheus .CounterValue , float64 (value ), controllerNumber )
107143
108- value , err = readUintFromFile (filepath .Join (controller , "ue_noinfo_count" ))
109- if err != nil {
110- return fmt .Errorf ("couldn't get ue_noinfo_count for controller %s: %w" , controllerNumber , err )
144+ for _ , csrow := range csrows {
145+ base := filepath .Base (csrow )
146+
147+ match := regexp .MustCompile (`csrow([0-9]+)` ).FindStringSubmatch (base )
148+ if match == nil {
149+ continue
150+ }
151+ csrowNumber := match [1 ]
152+
153+ channelFiles , err := filepath .Glob (csrow + "/ch*_ce_count" )
154+ if err != nil {
155+ return err
156+ }
157+
158+ for _ , chFile := range channelFiles {
159+
160+ base := filepath .Base (chFile )
161+
162+ match := regexp .MustCompile (`ch([0-9]+)_ce_count` ).FindStringSubmatch (base )
163+ if match == nil {
164+ continue
165+ }
166+
167+ channelNumber := match [1 ]
168+ label := "unknown"
169+ labelBytes , err := os .ReadFile (filepath .Join (csrow , "ch" + channelNumber + "_dimm_label" ))
170+ if err == nil {
171+ label = strings .TrimSpace (string (labelBytes ))
172+ // format label
173+ label = strings .ReplaceAll (label , "#" , "" )
174+ label = strings .ReplaceAll (label , "csrow" , "_csrow" )
175+ label = strings .ReplaceAll (label , "channel" , "_channel" )
176+ }
177+ value , err := readUintFromFile (chFile )
178+ if err == nil {
179+ ch <- prometheus .MustNewConstMetric (
180+ c .channelCECount ,
181+ prometheus .CounterValue ,
182+ float64 (value ),
183+ controllerNumber ,
184+ csrowNumber ,
185+ channelNumber ,
186+ label ,
187+ )
188+ }
189+
190+ value , err = readUintFromFile (filepath .Join (csrow , "ch" + channelNumber + "_ue_count" ))
191+ if err == nil {
192+ ch <- prometheus .MustNewConstMetric (
193+ c .channelUECount ,
194+ prometheus .CounterValue ,
195+ float64 (value ),
196+ controllerNumber ,
197+ csrowNumber ,
198+ channelNumber ,
199+ label ,
200+ )
201+ }
202+ }
111203 }
112- ch <- prometheus .MustNewConstMetric (
113- c .csRowUECount , prometheus .CounterValue , float64 (value ), controllerNumber , "unknown" )
114204
115- // For each controller, walk the csrow directories.
116- csrows , err := filepath .Glob (controller + "/csrow[0-9]*" )
205+ dimms , err := filepath .Glob (controller + "/dimm[0-9]*" )
117206 if err != nil {
118207 return err
119208 }
120- for _ , csrow := range csrows {
121- csrowMatch := edacMemCsrowRE .FindStringSubmatch (csrow )
122- if csrowMatch == nil {
123- return fmt .Errorf ("csrow string didn't match regexp: %s" , csrow )
209+
210+ for _ , dimm := range dimms {
211+
212+ dimmMatch := edacMemDimmRE .FindStringSubmatch (dimm )
213+ if dimmMatch == nil || len (dimmMatch ) < 2 {
214+ continue
124215 }
125- csrowNumber := csrowMatch [1 ]
126216
127- value , err = readUintFromFile (filepath .Join (csrow , "ce_count" ))
128- if err != nil {
129- return fmt .Errorf ("couldn't get ce_count for controller/csrow %s/%s: %w" , controllerNumber , csrowNumber , err )
217+ dimmNumber := dimmMatch [1 ]
218+
219+ value , err := readUintFromFile (filepath .Join (dimm , "dimm_ce_count" ))
220+ if err == nil {
221+ ch <- prometheus .MustNewConstMetric (
222+ c .dimmCECount ,
223+ prometheus .CounterValue ,
224+ float64 (value ),
225+ controllerNumber ,
226+ dimmNumber ,
227+ )
130228 }
131- ch <- prometheus .MustNewConstMetric (
132- c .csRowCECount , prometheus .CounterValue , float64 (value ), controllerNumber , csrowNumber )
133229
134- value , err = readUintFromFile (filepath .Join (csrow , "ue_count" ))
135- if err != nil {
136- return fmt .Errorf ("couldn't get ue_count for controller/csrow %s/%s: %w" , controllerNumber , csrowNumber , err )
230+ value , err = readUintFromFile (filepath .Join (dimm , "dimm_ue_count" ))
231+ if err == nil {
232+ ch <- prometheus .MustNewConstMetric (
233+ c .dimmUECount ,
234+ prometheus .CounterValue ,
235+ float64 (value ),
236+ controllerNumber ,
237+ dimmNumber ,
238+ )
137239 }
138- ch <- prometheus .MustNewConstMetric (
139- c .csRowUECount , prometheus .CounterValue , float64 (value ), controllerNumber , csrowNumber )
140240 }
141241 }
142242
143- return err
243+ return nil
144244}
0 commit comments