diff --git a/resources/grafana/alert-rules/error_events.json b/resources/grafana/alert-rules/error_events.json new file mode 100644 index 0000000..5b3ba0a --- /dev/null +++ b/resources/grafana/alert-rules/error_events.json @@ -0,0 +1,127 @@ +{ + "orgID": 1, + "folderUID": "{{FOLDER_UID}}", + "ruleGroup": "30sEval", + "title": "Error events", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "datasourceUid": "{{DATASOURCE_UID}}", + "model": { + "datasource": { + "type": "prometheus", + "uid": "{{DATASOURCE_UID}}" + }, + "editorMode": "code", + "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"Error\"}[10m]) == 1", + "instant": false, + "interval": "", + "intervalMs": 15000, + "legendFormat": "{{pod}}", + "maxDataPoints": 43200, + "range": true, + "refId": "A" + } + }, + { + "refId": "B", + "queryType": "", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "reducer": "last", + "refId": "B", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "B", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "OK", + "execErrState": "Error", + "for": "5m", + "annotations": { + "__dashboardUid__": "a24365d3-fb9c-407b-99d6-5b0b14de5adc", + "__panelId__": "50", + "summary": "Triggered when a pod is restarted because of an error" + }, + "isPaused": false +} diff --git a/resources/grafana/alert-rules/oom_killed.json b/resources/grafana/alert-rules/oom_killed_events.json similarity index 93% rename from resources/grafana/alert-rules/oom_killed.json rename to resources/grafana/alert-rules/oom_killed_events.json index 46f968b..8297e3f 100644 --- a/resources/grafana/alert-rules/oom_killed.json +++ b/resources/grafana/alert-rules/oom_killed_events.json @@ -19,7 +19,7 @@ "uid": "{{DATASOURCE_UID}}" }, "editorMode": "code", - "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}[10m]) == 1", + "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}[10m]) == 1", "instant": false, "interval": "", "intervalMs": 15000, @@ -115,7 +115,7 @@ } } ], - "noDataState": "NoData", + "noDataState": "OK", "execErrState": "Error", "for": "5m", "annotations": {