-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathalerts.yml
More file actions
170 lines (156 loc) · 5.89 KB
/
alerts.yml
File metadata and controls
170 lines (156 loc) · 5.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# PATAS Production Alerting Rules
# Configure Alertmanager to route these alerts appropriately.
groups:
- name: patas_critical
  # Critical-severity rules; this group is evaluated every 30 seconds.
  interval: 30s
  rules:
  # Service availability: the scrape target for job "patas" reports as down.
  - alert: PATASServiceDown
    expr: up{job="patas"} == 0
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "PATAS service is down"
      description: "PATAS API endpoint {{ $labels.instance }} is not responding"
      runbook_url: "https://wiki/patas/runbooks/service-down"
  # Database connectivity: both the active and the idle pool gauges read 0.
  # `and` pairs series by their complete label set, and the two sides differ
  # in pool_type, so the match has to ignore that label; without it the
  # intersection is always empty and the alert can never fire.
  - alert: DatabaseConnectionFailed
    expr: |
      patas_db_pool_usage{pool_type="active"} == 0
      and ignoring(pool_type)
      patas_db_pool_usage{pool_type="idle"} == 0
    for: 2m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Database connection pool empty"
      description: "No database connections available - service may be unable to process requests"
      runbook_url: "https://wiki/patas/runbooks/database-connection"
  # Rule precision critical drop: conservative profile below 0.85 (85%).
  - alert: RulePrecisionCritical
    expr: patas_rule_precision_avg{profile="conservative"} < 0.85
    for: 10m
    labels:
      severity: critical
      team: ml
    annotations:
      summary: "Critical drop in rule precision"
      description: "Average rule precision for {{ $labels.profile }} profile dropped below 85%"
      runbook_url: "https://wiki/patas/runbooks/rule-precision"
- name: patas_warning
  # Warning-severity rules; this group is evaluated every 30 seconds.
  interval: 30s
  rules:
  # High error rate (>1%): errors divided by requests, both as 5m rates.
  # The result is a 0-1 ratio, so the description renders it with
  # humanizePercentage instead of printing the raw ratio next to a "%" sign.
  - alert: HighErrorRate
    expr: |
      sum(rate(patas_api_errors_total[5m]))
        / sum(rate(patas_api_requests_total[5m])) > 0.01
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "High API error rate"
      description: "Error rate exceeds 1% over the last 5 minutes ({{ $value | humanizePercentage }})"
      runbook_url: "https://wiki/patas/runbooks/high-error-rate"
  # High API latency (P95 > 500ms), computed from the latency histogram.
  - alert: HighAPILatency
    expr: |
      histogram_quantile(0.95, rate(patas_api_latency_seconds_bucket[5m])) > 0.5
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "High API latency (P95 > 500ms)"
      description: "95th percentile API latency is {{ $value | printf \"%.2f\" }}s"
      runbook_url: "https://wiki/patas/runbooks/high-latency"
  # Low rule precision (<90%) for the balanced profile.
  - alert: LowRulePrecision
    expr: patas_rule_precision_avg{profile="balanced"} < 0.90
    for: 15m
    labels:
      severity: warning
      team: ml
    annotations:
      summary: "Low rule precision"
      description: "Average rule precision for {{ $labels.profile }} profile is {{ $value | printf \"%.2f\" }}"
      runbook_url: "https://wiki/patas/runbooks/rule-precision"
  # High false positive rate.
  # NOTE(review): the 0.05 threshold is read as a 0-1 ratio (5%), in line with
  # the precision thresholds in this file, so the value is rendered with
  # humanizePercentage - confirm against the metric's definition.
  - alert: HighFalsePositiveRate
    expr: patas_false_positive_rate > 0.05
    for: 10m
    labels:
      severity: warning
      team: ml
    annotations:
      summary: "High false positive rate"
      description: "False positive rate for {{ $labels.profile }} is {{ $value | humanizePercentage }}"
      runbook_url: "https://wiki/patas/runbooks/false-positives"
  # Database connection pool exhaustion: active / (active + idle) above 0.8.
  # Binary operators pair series by their complete label set; the operands
  # differ in pool_type, so every operator ignores that label. Without it
  # both the sum and the division yield no series and the alert never fires.
  - alert: DatabasePoolExhaustion
    expr: |
      patas_db_pool_usage{pool_type="active"}
        / ignoring(pool_type)
      (
        patas_db_pool_usage{pool_type="active"}
          + ignoring(pool_type)
        patas_db_pool_usage{pool_type="idle"}
      ) > 0.8
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "Database connection pool near exhaustion"
      description: "Database connection pool is {{ $value | humanizePercentage }} utilized"
      runbook_url: "https://wiki/patas/runbooks/database-pool"
  # Low cache hit rate for the LLM cache.
  # NOTE(review): the threshold of 50 implies this metric is exported on a
  # 0-100 scale, unlike the ratio metrics above - confirm.
  - alert: LowCacheHitRate
    expr: patas_cache_hit_rate{cache_type="llm"} < 50
    for: 30m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "Low LLM cache hit rate"
      description: "LLM cache hit rate is {{ $value | printf \"%.0f\" }}%"
      runbook_url: "https://wiki/patas/runbooks/cache-performance"
  # Pattern mining failure: any error counted on the mining endpoint in 5m.
  - alert: PatternMiningFailure
    expr: increase(patas_api_errors_total{endpoint="/api/v1/patterns/mine"}[5m]) > 0
    for: 1m
    labels:
      severity: warning
      team: ml
    annotations:
      summary: "Pattern mining errors detected"
      description: "Pattern mining encountered errors in the last 5 minutes"
      runbook_url: "https://wiki/patas/runbooks/pattern-mining"
- name: patas_info
  # Informational rules; this group is evaluated every 60 seconds.
  interval: 60s
  rules:
  # Low pattern discovery rate: fewer than one new pattern per hour.
  # increase() gives the counter growth over the 1h window; rate() would be a
  # per-second average, so "rate(...) < 1" would mean "fewer than 3600/hour"
  # and contradict the description below.
  - alert: LowPatternDiscoveryRate
    expr: increase(patas_pattern_discovery_total[1h]) < 1
    for: 2h
    labels:
      severity: info
      team: ml
    annotations:
      summary: "Low pattern discovery rate"
      description: "Fewer than 1 pattern discovered per hour for the last 2 hours"
  # No messages processed: the processed counter did not move for an hour.
  - alert: NoMessagesProcessed
    expr: rate(patas_messages_processed_total[1h]) == 0
    for: 1h
    labels:
      severity: info
      team: platform
    annotations:
      summary: "No messages processed"
      description: "No messages have been processed in the last hour"
  # Rule count anomaly: fewer than 10 rules in the "active" status.
  - alert: ActiveRuleCountLow
    expr: patas_active_rules{status="active"} < 10
    for: 1h
    labels:
      severity: info
      team: ml
    annotations:
      summary: "Low number of active rules"
      description: "Only {{ $value }} active rules - consider running pattern mining"