PATAS/alerts.yml at main · KikuAI-Lab/PATAS · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# PATAS Production Alerting Rules
# Configure alertmanager to route these alerts appropriately

groups:
  # Critical-severity rules; this group is evaluated every 30 seconds.
  - name: patas_critical
    interval: 30s
    rules:
      # Availability: the `up` series for the patas job has been 0 for 1 minute
      - alert: PATASServiceDown
        expr: 'up{job="patas"} == 0'
        for: 1m
        labels:
          team: platform
          severity: critical
        annotations:
          summary: 'PATAS service is down'
          description: 'PATAS API endpoint {{ $labels.instance }} is not responding'
          runbook_url: 'https://wiki/patas/runbooks/service-down'

      # Database connectivity: fire when the pool reports zero active AND zero
      # idle connections for 2 minutes.
      # The two selectors carry different pool_type label values, so the `and`
      # must ignore that label. With default matching (every label must be
      # equal on both sides) the operands never pair up and the alert can
      # never fire.
      - alert: DatabaseConnectionFailed
        expr: |
          patas_db_pool_usage{pool_type="active"} == 0
            and ignoring(pool_type)
          patas_db_pool_usage{pool_type="idle"} == 0
        for: 2m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Database connection pool empty"
          description: "No database connections available - service may be unable to process requests"
          runbook_url: "https://wiki/patas/runbooks/database-connection"

      # Precision (conservative profile): average below 0.85 for 10 minutes
      - alert: RulePrecisionCritical
        expr: 'patas_rule_precision_avg{profile="conservative"} < 0.85'
        for: 10m
        labels:
          team: ml
          severity: critical
        annotations:
          summary: 'Critical drop in rule precision'
          description: 'Average rule precision for {{ $labels.profile }} profile dropped below 85%'
          runbook_url: 'https://wiki/patas/runbooks/rule-precision'

  # Warning-severity rules; this group is evaluated every 30 seconds.
  - name: patas_warning
    interval: 30s
    rules:
      # High error rate (>1%): errors / requests over a 5-minute window,
      # summed across all series.
      # The expression yields a 0-1 ratio, so the description renders it with
      # humanizePercentage; formatting the raw ratio with "%.2f" plus a "%"
      # suffix would show a 1.5% error rate as "0.01%".
      - alert: HighErrorRate
        expr: |
          sum(rate(patas_api_errors_total[5m])) / sum(rate(patas_api_requests_total[5m])) > 0.01
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High API error rate"
          description: "Error rate exceeds 1% over the last 5 minutes ({{ $value | humanizePercentage }})"
          runbook_url: "https://wiki/patas/runbooks/high-error-rate"

      # Latency: 95th percentile of the API latency histogram above 0.5 s
      # (computed from 5-minute bucket rates) for 5 minutes
      - alert: HighAPILatency
        expr: |
          histogram_quantile(0.95, rate(patas_api_latency_seconds_bucket[5m])) > 0.5
        for: 5m
        labels:
          team: platform
          severity: warning
        annotations:
          summary: 'High API latency (P95 > 500ms)'
          description: '95th percentile API latency is {{ $value | printf "%.2f" }}s'
          runbook_url: 'https://wiki/patas/runbooks/high-latency'

      # Precision (balanced profile): average below 0.90 for 15 minutes
      - alert: LowRulePrecision
        expr: 'patas_rule_precision_avg{profile="balanced"} < 0.90'
        for: 15m
        labels:
          team: ml
          severity: warning
        annotations:
          summary: 'Low rule precision'
          description: 'Average rule precision for {{ $labels.profile }} profile is {{ $value | printf "%.2f" }}'
          runbook_url: 'https://wiki/patas/runbooks/rule-precision'

      # High false positive rate: gauge above 0.05 for 10 minutes.
      # NOTE(review): the 0.05 threshold implies the metric is a 0-1 ratio
      # (i.e. 5%) - confirm against the exporter. On that assumption the value
      # is rendered with humanizePercentage; printing the raw ratio with
      # "%.2f" plus a "%" suffix would show 7% as "0.07%".
      - alert: HighFalsePositiveRate
        expr: patas_false_positive_rate > 0.05
        for: 10m
        labels:
          severity: warning
          team: ml
        annotations:
          summary: "High false positive rate"
          description: "False positive rate for {{ $labels.profile }} is {{ $value | humanizePercentage }}"
          runbook_url: "https://wiki/patas/runbooks/false-positives"

      # Database connection pool exhaustion: active / (active + idle) above
      # 0.8 for 5 minutes.
      # The active and idle series differ in their pool_type label, so they
      # cannot be combined with default (all-labels) vector matching - the
      # result would always be empty. The denominator is therefore aggregated
      # without pool_type and the division ignores that label.
      # The result is a 0-1 ratio, so it is rendered with humanizePercentage
      # ("%.0f" plus a "%" suffix would print 0.85 as "1%").
      - alert: DatabasePoolExhaustion
        expr: |
          patas_db_pool_usage{pool_type="active"}
            / ignoring(pool_type)
          sum without (pool_type) (patas_db_pool_usage{pool_type=~"active|idle"})
            > 0.8
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Database connection pool near exhaustion"
          description: "Database connection pool is {{ $value | humanizePercentage }} utilized"
          runbook_url: "https://wiki/patas/runbooks/database-pool"

      # Cache: LLM cache hit rate below 50 for 30 minutes
      - alert: LowCacheHitRate
        expr: 'patas_cache_hit_rate{cache_type="llm"} < 50'
        for: 30m
        labels:
          team: platform
          severity: warning
        annotations:
          summary: 'Low LLM cache hit rate'
          description: 'LLM cache hit rate is {{ $value | printf "%.0f" }}%'
          runbook_url: 'https://wiki/patas/runbooks/cache-performance'

      # Pattern mining: any increase in the error counter for the mining
      # endpoint within a 5-minute window, sustained for 1 minute
      - alert: PatternMiningFailure
        expr: 'increase(patas_api_errors_total{endpoint="/api/v1/patterns/mine"}[5m]) > 0'
        for: 1m
        labels:
          team: ml
          severity: warning
        annotations:
          summary: 'Pattern mining errors detected'
          description: 'Pattern mining encountered errors in the last 5 minutes'
          runbook_url: 'https://wiki/patas/runbooks/pattern-mining'

  # Informational rules; this group is evaluated every 60 seconds.
  - name: patas_info
    interval: 60s
    rules:
      # Low pattern discovery rate: fewer than 1 new pattern per hour,
      # sustained for 2 hours.
      # increase() gives the counter growth over the 1h window, which matches
      # the "per hour" wording. rate() is per-second, so `rate(...) < 1` would
      # be true for anything under 3600 patterns/hour and fire almost always.
      - alert: LowPatternDiscoveryRate
        expr: increase(patas_pattern_discovery_total[1h]) < 1
        for: 2h
        labels:
          severity: info
          team: ml
        annotations:
          summary: "Low pattern discovery rate"
          description: "Fewer than 1 pattern discovered per hour for the last 2 hours"

      # Throughput: the processed-messages counter shows a zero 1-hour rate
      # for 1 hour
      - alert: NoMessagesProcessed
        expr: 'rate(patas_messages_processed_total[1h]) == 0'
        for: 1h
        labels:
          team: platform
          severity: info
        annotations:
          summary: 'No messages processed'
          description: 'No messages have been processed in the last hour'

      # Rule inventory: fewer than 10 rules with status "active" for 1 hour
      - alert: ActiveRuleCountLow
        expr: 'patas_active_rules{status="active"} < 10'
        for: 1h
        labels:
          team: ml
          severity: info
        annotations:
          summary: 'Low number of active rules'
          description: 'Only {{ $value }} active rules - consider running pattern mining'