From 93d9fd30ea36f2db9734bf8adefb590cd45d3beb Mon Sep 17 00:00:00 2001 From: BOUHOURS Antoine Date: Tue, 4 Jun 2024 14:57:09 +0200 Subject: [PATCH 1/4] Fix nodata error in alert --- resources/grafana/alert-rules/oom_killed.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/grafana/alert-rules/oom_killed.json b/resources/grafana/alert-rules/oom_killed.json index 46f968b..ce02549 100644 --- a/resources/grafana/alert-rules/oom_killed.json +++ b/resources/grafana/alert-rules/oom_killed.json @@ -19,7 +19,7 @@ "uid": "{{DATASOURCE_UID}}" }, "editorMode": "code", - "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}[10m]) == 1", + "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}[10m]) == 1 OR vector(0)", "instant": false, "interval": "", "intervalMs": 15000, From 9e9220710ac5ba4e38ed2a901179a91e6aad7cba Mon Sep 17 00:00:00 2001 From: BOUHOURS Antoine Date: Tue, 4 Jun 2024 15:15:58 +0200 Subject: [PATCH 2/4] Add error events alert --- .../grafana/alert-rules/error_events.json | 127 ++++++++++++++++++ ...oom_killed.json => oom_killed_events.json} | 2 +- 2 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 resources/grafana/alert-rules/error_events.json rename resources/grafana/alert-rules/{oom_killed.json => oom_killed_events.json} (97%) diff --git a/resources/grafana/alert-rules/error_events.json b/resources/grafana/alert-rules/error_events.json new file mode 100644 index 0000000..e663624 --- /dev/null +++ b/resources/grafana/alert-rules/error_events.json @@ -0,0 +1,127 @@ +{ + "orgID": 1, + "folderUID": "{{FOLDER_UID}}", + "ruleGroup": "30sEval", + "title": "Error events", + "condition": "C", + "data": [ + { + "refId": "A", + "queryType": "", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "datasourceUid": "{{DATASOURCE_UID}}", + "model": { + "datasource": { + "type": "prometheus", + "uid": "{{DATASOURCE_UID}}" + }, + "editorMode": "code", + "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"Error\"}[10m]) == 1", + "instant": false, + "interval": "", + "intervalMs": 15000, + "legendFormat": "{{pod}}", + "maxDataPoints": 43200, + "range": true, + "refId": "A" + } + }, + { + "refId": "B", + "queryType": "", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "reducer": "last", + "refId": "B", + "type": "reduce" + } + }, + { + "refId": "C", + "queryType": "", + "relativeTimeRange": { + "from": 1800, + "to": 0 + }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "B", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "NoData", + "execErrState": "Error", + "for": "5m", + "annotations": { + "__dashboardUid__": "a24365d3-fb9c-407b-99d6-5b0b14de5adc", + "__panelId__": "50", + "summary": "Triggered when a pod is OOMKilled by Kubernetes" + }, + "isPaused": false +} diff --git a/resources/grafana/alert-rules/oom_killed.json b/resources/grafana/alert-rules/oom_killed_events.json similarity index 97% rename from resources/grafana/alert-rules/oom_killed.json rename to resources/grafana/alert-rules/oom_killed_events.json index ce02549..46f968b 100644 --- a/resources/grafana/alert-rules/oom_killed.json +++ b/resources/grafana/alert-rules/oom_killed_events.json @@ -19,7 +19,7 @@ "uid": "{{DATASOURCE_UID}}" }, "editorMode": "code", - "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}[10m]) == 1 OR vector(0)", + "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}[10m]) == 1", "instant": false, "interval": "", "intervalMs": 15000, From 52c241be5d756a3102823426993d7224e53ac28d Mon Sep 17 00:00:00 2001 From: BOUHOURS Antoine Date: Tue, 4 Jun 2024 16:30:48 +0200 Subject: [PATCH 3/4] Change nodata state to OK --- resources/grafana/alert-rules/error_events.json | 2 +- resources/grafana/alert-rules/oom_killed_events.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/grafana/alert-rules/error_events.json b/resources/grafana/alert-rules/error_events.json index e663624..4f513ae 100644 --- a/resources/grafana/alert-rules/error_events.json +++ b/resources/grafana/alert-rules/error_events.json @@ -115,7 +115,7 @@ } } ], - "noDataState": "NoData", + "noDataState": "OK", "execErrState": "Error", "for": "5m", "annotations": { diff --git a/resources/grafana/alert-rules/oom_killed_events.json b/resources/grafana/alert-rules/oom_killed_events.json index 46f968b..1c8948e 100644 --- a/resources/grafana/alert-rules/oom_killed_events.json +++ b/resources/grafana/alert-rules/oom_killed_events.json @@ -115,7 +115,7 @@ } } ], - "noDataState": "NoData", + "noDataState": "OK", "execErrState": "Error", "for": "5m", "annotations": { From 79f3b1756f8c055d19ade35f1dad606d751cdf44 Mon Sep 17 00:00:00 2001 From: BOUHOURS Antoine Date: Tue, 4 Jun 2024 16:41:55 +0200 Subject: [PATCH 4/4] Fix error --- resources/grafana/alert-rules/error_events.json | 4 ++-- resources/grafana/alert-rules/oom_killed_events.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/resources/grafana/alert-rules/error_events.json b/resources/grafana/alert-rules/error_events.json index 4f513ae..5b3ba0a 100644 --- a/resources/grafana/alert-rules/error_events.json +++ b/resources/grafana/alert-rules/error_events.json @@ -19,7 +19,7 @@ "uid": "{{DATASOURCE_UID}}" }, "editorMode": "code", - "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"Error\"}[10m]) == 1", + "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"Error\"}[10m]) == 1", "instant": false, "interval": "", "intervalMs": 15000, @@ -121,7 +121,7 @@ "annotations": { "__dashboardUid__": "a24365d3-fb9c-407b-99d6-5b0b14de5adc", "__panelId__": "50", - "summary": "Triggered when a pod is OOMKilled by Kubernetes" + "summary": "Triggered when a pod is restarted because of an error" }, "isPaused": false } diff --git a/resources/grafana/alert-rules/oom_killed_events.json b/resources/grafana/alert-rules/oom_killed_events.json index 1c8948e..8297e3f 100644 --- a/resources/grafana/alert-rules/oom_killed_events.json +++ b/resources/grafana/alert-rules/oom_killed_events.json @@ -19,7 +19,7 @@ "uid": "{{DATASOURCE_UID}}" }, "editorMode": "code", - "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}[10m]) == 1", + "expr": "(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}[10m]) == 1", "instant": false, "interval": "", "intervalMs": 15000,