|
1 | 1 | { |
2 | | - prometheusAlerts+:: { |
| 2 | + new(this): { |
3 | 3 | groups+: [ |
4 | 4 | { |
5 | 5 | name: 'ApacheCouchDBAlerts', |
6 | 6 | rules: [ |
7 | 7 | { |
8 | 8 | alert: 'CouchDBUnhealthyCluster', |
9 | 9 | expr: ||| |
10 | | - min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable) < %(alertsCriticalClusterIsUnstable5m)s |
11 | | - ||| % $._config, |
| 10 | + min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable{%(filteringSelector)s}) < %(alertsCriticalClusterIsUnstable5m)s |
| 11 | + ||| % this.config, |
12 | 12 | 'for': '5m', |
13 | 13 | labels: { |
14 | 14 | severity: 'critical', |
|
19 | 19 | ( |
20 | 20 | '{{$labels.couchdb_cluster}} has reported a value of {{ printf "%%.0f" $value }} for its stability over the last 5 minutes, ' + |
21 | 21 | 'which is below the threshold of %(alertsCriticalClusterIsUnstable5m)s.' |
22 | | - ) % $._config, |
| 22 | + ) % this.config, |
23 | 23 | }, |
24 | 24 | }, |
25 | 25 | { |
26 | 26 | alert: 'CouchDBHigh4xxResponseCodes', |
27 | 27 | expr: ||| |
28 | | - sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"4.*"}[5m])) > %(alertsWarning4xxResponseCodes5m)s |
29 | | - ||| % $._config, |
| 28 | + sum by(job, instance) (increase(couchdb_httpd_status_codes{%(filteringSelector)s}[5m])) > %(alertsWarning4xxResponseCodes5m)s |
| 29 | + ||| % (this.config { filteringSelector: if this.config.filteringSelector != '' then this.config.filteringSelector + ',code=~"4.."' else 'code=~"4.."' }), |
30 | 30 | 'for': '5m', |
31 | 31 | labels: { |
32 | 32 | severity: 'warning', |
|
37 | 37 | ( |
38 | 38 | '{{ printf "%%.0f" $value }} 4xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' + |
39 | 39 | 'which is above the threshold of %(alertsWarning4xxResponseCodes5m)s.' |
40 | | - ) % $._config, |
| 40 | + ) % this.config, |
41 | 41 | }, |
42 | 42 | }, |
43 | 43 | { |
44 | 44 | alert: 'CouchDBHigh5xxResponseCodes', |
45 | 45 | expr: ||| |
46 | | - sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"5.*"}[5m])) > %(alertsCritical5xxResponseCodes5m)s |
47 | | - ||| % $._config, |
| 46 | + sum by(job, instance) (increase(couchdb_httpd_status_codes{%(filteringSelector)s}[5m])) > %(alertsCritical5xxResponseCodes5m)s |
| 47 | + ||| % (this.config { filteringSelector: if this.config.filteringSelector != '' then this.config.filteringSelector + ',code=~"5.."' else 'code=~"5.."' }), |
48 | 48 | 'for': '5m', |
49 | 49 | labels: { |
50 | 50 | severity: 'critical', |
|
55 | 55 | ( |
56 | 56 | '{{ printf "%%.0f" $value }} 5xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' + |
57 | 57 | 'which is above the threshold of %(alertsCritical5xxResponseCodes5m)s.' |
58 | | - ) % $._config, |
| 58 | + ) % this.config, |
59 | 59 | }, |
60 | 60 | }, |
61 | 61 | { |
62 | 62 | alert: 'CouchDBModerateRequestLatency', |
63 | 63 | expr: ||| |
64 | | - sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsWarningRequestLatency5m)s |
65 | | - ||| % $._config, |
| 64 | + sum by(job, instance) (rate(couchdb_request_time_seconds_sum{%(filteringSelector)s}[5m]) / rate(couchdb_request_time_seconds_count{%(filteringSelector)s}[5m])) * 1000 > %(alertsWarningRequestLatency5m)s |
| 65 | + ||| % this.config, |
66 | 66 | 'for': '5m', |
67 | 67 | labels: { |
68 | 68 | severity: 'warning', |
|
73 | 73 | ( |
74 | 74 | 'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' + |
75 | 75 | 'which is above the threshold of %(alertsWarningRequestLatency5m)sms. ' |
76 | | - ) % $._config, |
| 76 | + ) % this.config, |
77 | 77 | }, |
78 | 78 | }, |
79 | 79 | { |
80 | 80 | alert: 'CouchDBHighRequestLatency', |
81 | 81 | expr: ||| |
82 | | - sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsCriticalRequestLatency5m)s |
83 | | - ||| % $._config, |
| 82 | + sum by(job, instance) (rate(couchdb_request_time_seconds_sum{%(filteringSelector)s}[5m]) / rate(couchdb_request_time_seconds_count{%(filteringSelector)s}[5m])) * 1000 > %(alertsCriticalRequestLatency5m)s |
| 83 | + ||| % this.config, |
84 | 84 | 'for': '5m', |
85 | 85 | labels: { |
86 | 86 | severity: 'critical', |
|
91 | 91 | ( |
92 | 92 | 'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' + |
93 | 93 | 'which is above the threshold of %(alertsCriticalRequestLatency5m)sms. ' |
94 | | - ) % $._config, |
| 94 | + ) % this.config, |
95 | 95 | }, |
96 | 96 | }, |
97 | 97 | { |
98 | 98 | alert: 'CouchDBManyReplicatorJobsPending', |
99 | 99 | expr: ||| |
100 | | - sum by(job, instance) (couchdb_couch_replicator_jobs_pending) > %(alertsWarningPendingReplicatorJobs5m)s |
101 | | - ||| % $._config, |
| 100 | + sum by(job, instance) (couchdb_couch_replicator_jobs_pending{%(filteringSelector)s}) > %(alertsWarningPendingReplicatorJobs5m)s |
| 101 | + ||| % this.config, |
102 | 102 | 'for': '5m', |
103 | 103 | labels: { |
104 | 104 | severity: 'warning', |
|
109 | 109 | ( |
110 | 110 | '{{ printf "%%.0f" $value }} replicator jobs are pending on {{$labels.instance}}, ' + |
111 | 111 | 'which is above the threshold of %(alertsWarningPendingReplicatorJobs5m)s. ' |
112 | | - ) % $._config, |
| 112 | + ) % this.config, |
113 | 113 | }, |
114 | 114 | }, |
115 | 115 | { |
116 | 116 | alert: 'CouchDBReplicatorJobsCrashing', |
117 | 117 | expr: ||| |
118 | | - sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s |
119 | | - ||| % $._config, |
| 118 | + sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total{%(filteringSelector)s}[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s |
| 119 | + ||| % this.config, |
120 | 120 | 'for': '5m', |
121 | 121 | labels: { |
122 | 122 | severity: 'critical', |
|
127 | 127 | ( |
128 | 128 | '{{ printf "%%.0f" $value }} replicator jobs have crashed over the last 5 minutes on {{$labels.instance}}, ' + |
129 | 129 | 'which is above the threshold of %(alertsCriticalCrashingReplicatorJobs5m)s. ' |
130 | | - ) % $._config, |
| 130 | + ) % this.config, |
131 | 131 | }, |
132 | 132 | }, |
133 | 133 | { |
134 | 134 | alert: 'CouchDBReplicatorChangesQueuesDying', |
135 | 135 | expr: ||| |
136 | | - sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s |
137 | | - ||| % $._config, |
| 136 | + sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total{%(filteringSelector)s}[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s |
| 137 | + ||| % this.config, |
138 | 138 | 'for': '5m', |
139 | 139 | labels: { |
140 | 140 | severity: 'warning', |
|
145 | 145 | ( |
146 | 146 | '{{ printf "%%.0f" $value }} replicator changes queue processes have died over the last 5 minutes on {{$labels.instance}}, ' + |
147 | 147 | 'which is above the threshold of %(alertsWarningDyingReplicatorChangesQueues5m)s. ' |
148 | | - ) % $._config, |
| 148 | + ) % this.config, |
149 | 149 | }, |
150 | 150 | }, |
151 | 151 | { |
152 | | - alert: 'CouchDBReplicatorConnectionOwnersCrashing', |
| 152 | + alert: 'CouchDBReplicatorOwnersCrashing', |
153 | 153 | expr: ||| |
154 | | - sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s |
155 | | - ||| % $._config, |
| 154 | + sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total{%(filteringSelector)s}[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s |
| 155 | + ||| % this.config, |
156 | 156 | 'for': '5m', |
157 | 157 | labels: { |
158 | 158 | severity: 'warning', |
|
163 | 163 | ( |
164 | 164 | '{{ printf "%%.0f" $value }} replicator connection owner processes have crashed over the last 5 minutes on {{$labels.instance}}, ' + |
165 | 165 | 'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionOwners5m)s. ' |
166 | | - ) % $._config, |
| 166 | + ) % this.config, |
167 | 167 | }, |
168 | 168 | }, |
169 | 169 | { |
170 | | - alert: 'CouchDBReplicatorConnectionWorkersCrashing', |
| 170 | + alert: 'CouchDBReplicatorWorkersCrashing', |
171 | 171 | expr: ||| |
172 | | - sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s |
173 | | - ||| % $._config, |
| 172 | + sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total{%(filteringSelector)s}[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s |
| 173 | + ||| % this.config, |
174 | 174 | 'for': '5m', |
175 | 175 | labels: { |
176 | 176 | severity: 'warning', |
|
181 | 181 | ( |
182 | 182 | '{{ printf "%%.0f" $value }} replicator connection worker processes have crashed over the last 5 minutes on {{$labels.instance}}, ' + |
183 | 183 | 'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionWorkers5m)s. ' |
184 | | - ) % $._config, |
| 184 | + ) % this.config, |
185 | 185 | }, |
186 | 186 | }, |
187 | 187 | ], |
|
0 commit comments