Skip to content

Commit c637903

Browse files
Modify getSupportedRasErrorTypes function for gt Ras errors
Related-To: LOCI-2934
Signed-off-by: Mayank Raghuwanshi <mayank.raghuwanshi@intel.com>
1 parent 8154444 commit c637903

File tree

1 file changed

+23
-43
lines changed

1 file changed

+23
-43
lines changed

level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp

Lines changed: 23 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99

1010
#include "sysman/linux/os_sysman_imp.h"
1111

12-
#include <regex>
1312
namespace L0 {
1413
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
1514
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
@@ -129,20 +128,28 @@ static uint64_t convertHexToUint64(std::string strVal) {
129128
return config;
130129
}
131130

132-
static bool isErrorTypeSupported(std::string pattern, std::vector<std::string> &eventList) {
133-
std::regex pPattern(pattern);
134-
for (const auto &entry : eventList) {
135-
if (regex_match(entry, pPattern) == true) {
136-
return true;
137-
}
131+
static bool getErrorType(std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_device_handle_t deviceHandle) {
132+
ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
133+
Device::fromHandle(deviceHandle)->getProperties(&deviceProperties);
134+
bool onSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE;
135+
uint32_t subDeviceId = deviceProperties.subdeviceId;
136+
// Naming convention of files containing config values for errors
137+
// error--<Name of error> Ex:- error--engine-reset (config file with no subdevice)
138+
// error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset (config file with subdevices)
139+
// error--<Name of error> Ex:- error--driver-object-migration (config file for device level errors)
140+
std::string errorPrefix = "error--"; // prefix string of the file containing config value for pmu counters
141+
if (onSubDevice == true) {
142+
errorPrefix = "error-gt" + std::to_string(subDeviceId) + "--";
138143
}
139-
return false;
140-
}
141-
142-
static bool getErrorType(std::vector<std::string> errorPattern, std::vector<std::string> &eventList) {
143-
for (auto &pattern : errorPattern) {
144-
if (isErrorTypeSupported(pattern, eventList) == true) {
145-
return true;
144+
for (auto const &rasErrorCatToListOfEvents : categoryToListOfEvents) {
145+
for (auto const &nameOfError : rasErrorCatToListOfEvents.second) {
146+
std::string errorPrefixLocal = errorPrefix;
147+
if (nameOfError == "driver-object-migration") { // check for errors which occur at device level
148+
errorPrefixLocal = "error--";
149+
}
150+
if (std::find(eventList.begin(), eventList.end(), errorPrefixLocal + nameOfError) != eventList.end()) {
151+
return true;
152+
}
146153
}
147154
}
148155
return false;
@@ -167,37 +174,10 @@ void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t>
167174
if (result != ZE_RESULT_SUCCESS) {
168175
return;
169176
}
170-
ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
171-
Device::fromHandle(deviceHandle)->getProperties(&deviceProperties);
172-
bool onSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE;
173-
uint32_t subDeviceId = deviceProperties.subdeviceId;
174-
std::vector<std::string> uncorrectablePattern;
175-
std::vector<std::string> correctablePattern;
176-
// For device with no subDevice error entries are of form error--<Name of error type>
177-
// and for device having subDevice error entries are of form error-gt<N>--<Name of error type>
178-
uncorrectablePattern.push_back("^error--driver.*");
179-
if (onSubDevice == false) {
180-
correctablePattern.push_back("^error--correctable.*");
181-
correctablePattern.push_back("^error--soc-correctable.*");
182-
uncorrectablePattern.push_back("^error--engine-reset.*");
183-
uncorrectablePattern.push_back("^error--eu-attention.*");
184-
uncorrectablePattern.push_back("^error--fatal.*");
185-
uncorrectablePattern.push_back("^error--soc-fatal.*");
186-
uncorrectablePattern.push_back("^error--soc-nonfatal.*");
187-
} else {
188-
correctablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--correctable.*");
189-
correctablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-correctable.*");
190-
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--driver.*");
191-
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--fatal.*");
192-
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-fatal.*");
193-
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-nonfatal.*");
194-
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--eu-attention.*");
195-
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--engine-reset.*");
196-
}
197-
if (getErrorType(correctablePattern, listOfEvents) == true) {
177+
if (getErrorType(categoryToListOfEventsCorrectable, listOfEvents, deviceHandle) == true) {
198178
errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
199179
}
200-
if (getErrorType(uncorrectablePattern, listOfEvents) == true) {
180+
if (getErrorType(categoryToListOfEventsUncorrectable, listOfEvents, deviceHandle) == true) {
201181
errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
202182
}
203183
}

0 commit comments

Comments
 (0)