99
1010#include " sysman/linux/os_sysman_imp.h"
1111
12- #include < regex>
1312namespace L0 {
1413static const std::map<zes_ras_error_cat_t , std::vector<std::string>> categoryToListOfEventsUncorrectable = {
1514 {ZES_RAS_ERROR_CAT_CACHE_ERRORS,
@@ -129,20 +128,28 @@ static uint64_t convertHexToUint64(std::string strVal) {
129128 return config;
130129}
131130
132- static bool isErrorTypeSupported (std::string pattern, std::vector<std::string> &eventList) {
133- std::regex pPattern (pattern);
134- for (const auto &entry : eventList) {
135- if (regex_match (entry, pPattern) == true ) {
136- return true ;
137- }
131+ static bool getErrorType (std::map<zes_ras_error_cat_t , std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_device_handle_t deviceHandle) {
132+ ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
133+ Device::fromHandle (deviceHandle)->getProperties (&deviceProperties);
134+ bool onSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE;
135+ uint32_t subDeviceId = deviceProperties.subdeviceId ;
136+ // Naming convention of files containing config values for errors
137+ // error--<Name of error> Ex:- error--engine-reset (config file with no subdevice)
138+ // error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset (config file with subdevices)
139+ // error--<Name of error> Ex:- error--driver-object-migration (config file for device level errors)
140+ std::string errorPrefix = " error--" ; // prefix string of the file containing config value for pmu counters
141+ if (onSubDevice == true ) {
142+ errorPrefix = " error-gt" + std::to_string (subDeviceId) + " --" ;
138143 }
139- return false ;
140- }
141-
142- static bool getErrorType (std::vector<std::string> errorPattern, std::vector<std::string> &eventList) {
143- for (auto &pattern : errorPattern) {
144- if (isErrorTypeSupported (pattern, eventList) == true ) {
145- return true ;
144+ for (auto const &rasErrorCatToListOfEvents : categoryToListOfEvents) {
145+ for (auto const &nameOfError : rasErrorCatToListOfEvents.second ) {
146+ std::string errorPrefixLocal = errorPrefix;
147+ if (nameOfError == " driver-object-migration" ) { // check for errors which occur at device level
148+ errorPrefixLocal = " error--" ;
149+ }
150+ if (std::find (eventList.begin (), eventList.end (), errorPrefixLocal + nameOfError) != eventList.end ()) {
151+ return true ;
152+ }
146153 }
147154 }
148155 return false ;
@@ -167,37 +174,10 @@ void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t>
167174 if (result != ZE_RESULT_SUCCESS) {
168175 return ;
169176 }
170- ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
171- Device::fromHandle (deviceHandle)->getProperties (&deviceProperties);
172- bool onSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE;
173- uint32_t subDeviceId = deviceProperties.subdeviceId ;
174- std::vector<std::string> uncorrectablePattern;
175- std::vector<std::string> correctablePattern;
176- // For device with no subDevice error entries are of form error--<Name of error type>
177- // and for device having subDevice error entries are of form error-gt<N>--<Name of error type>
178- uncorrectablePattern.push_back (" ^error--driver.*" );
179- if (onSubDevice == false ) {
180- correctablePattern.push_back (" ^error--correctable.*" );
181- correctablePattern.push_back (" ^error--soc-correctable.*" );
182- uncorrectablePattern.push_back (" ^error--engine-reset.*" );
183- uncorrectablePattern.push_back (" ^error--eu-attention.*" );
184- uncorrectablePattern.push_back (" ^error--fatal.*" );
185- uncorrectablePattern.push_back (" ^error--soc-fatal.*" );
186- uncorrectablePattern.push_back (" ^error--soc-nonfatal.*" );
187- } else {
188- correctablePattern.push_back (" ^error-gt" + std::to_string (subDeviceId) + " --correctable.*" );
189- correctablePattern.push_back (" ^error-gt" + std::to_string (subDeviceId) + " --soc-correctable.*" );
190- uncorrectablePattern.push_back (" ^error-gt" + std::to_string (subDeviceId) + " --driver.*" );
191- uncorrectablePattern.push_back (" ^error-gt" + std::to_string (subDeviceId) + " --fatal.*" );
192- uncorrectablePattern.push_back (" ^error-gt" + std::to_string (subDeviceId) + " --soc-fatal.*" );
193- uncorrectablePattern.push_back (" ^error-gt" + std::to_string (subDeviceId) + " --soc-nonfatal.*" );
194- uncorrectablePattern.push_back (" ^error-gt" + std::to_string (subDeviceId) + " --eu-attention.*" );
195- uncorrectablePattern.push_back (" ^error-gt" + std::to_string (subDeviceId) + " --engine-reset.*" );
196- }
197- if (getErrorType (correctablePattern, listOfEvents) == true ) {
177+ if (getErrorType (categoryToListOfEventsCorrectable, listOfEvents, deviceHandle) == true ) {
198178 errorType.insert (ZES_RAS_ERROR_TYPE_CORRECTABLE);
199179 }
200- if (getErrorType (uncorrectablePattern , listOfEvents) == true ) {
180+ if (getErrorType (categoryToListOfEventsUncorrectable , listOfEvents, deviceHandle ) == true ) {
201181 errorType.insert (ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
202182 }
203183}
0 commit comments