|
63 | 63 |
|
64 | 64 | ### 1.2 HTTP Client (Wukong) |
65 | 65 |
|
66 | | -| Metric Name                             | Type      | Labels                                             | Description                                  |
67 | | -| :-------------------------------------- | :-------- | :------------------------------------------------- | :------------------------------------------- |
68 | | -| `http_client_requests_inflight`         | Gauge     | `method`, `baseUrl`, `url`                          | Number of in-flight downstream HTTP requests |
69 | | -| `http_client_requests_total`            | Counter   | `method`, `baseUrl`, `url`, `statusCode`, `result`  | Total number of outbound HTTP requests       |
70 | | -| `http_client_request_duration_seconds`  | Histogram | `method`, `baseUrl`, `url`, `statusCode`, `result`  | Distribution of HTTP request durations       |
| 66 | +| Metric Name                             | Type      | Labels                                        | Description                                  |
| 67 | +| :-------------------------------------- | :-------- | :--------------------------------------------- | :------------------------------------------- |
| 68 | +| `http_client_requests_inflight`         | Gauge     | `method`, `baseUrl`, `url`                     | Number of in-flight downstream HTTP requests |
| 69 | +| `http_client_requests_total`            | Counter   | `method`, `baseUrl`, `url`, `status`, `error`  | Total number of outbound HTTP requests       |
| 70 | +| `http_client_request_duration_seconds`  | Histogram | `method`, `baseUrl`, `url`, `status`, `error`  | Distribution of HTTP request durations       |
71 | 71 |
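As a sketch of how the renamed labels can be consumed (a hypothetical rule, not part of this change; it assumes `status` carries the numeric HTTP status code, mirroring the server-side `http_server_requests_total` metric), a downstream error-ratio alert for the `rules:` list below might look like:

```yaml
# Hypothetical alert against the client metrics above. Assumes `status`
# holds the HTTP status code, as on the server-side metrics.
- alert: HighHttpClientErrorRate
  expr: |
    (sum by (baseUrl) (rate(http_client_requests_total{status=~"5.."}[5m]))
    /
    sum by (baseUrl) (rate(http_client_requests_total[5m]))) > 0.05
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "High downstream error rate to {{ $labels.baseUrl }}"
```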
|
72 | 72 | ### 1.3 gRPC Server |
73 | 73 |
|
|
342 | 342 |
|
343 | 343 | The following recommended Prometheus alerting rules cover availability, latency, error rate, resource saturation, and runtime anomalies. Note that the Apdex rule's weighting (0.5 × `le="0.25"` + 0.5 × `le="1"`) works because histogram buckets are cumulative, making it equivalent to the standard (satisfied + tolerating/2) / total.
344 | 344 |
|
345 | | -```yaml
346 | | -groups:
347 | | -  - name: box-server-alerts
348 | | -    rules:
349 | | -      # ==========================================================
350 | | -      # 1. Availability & Errors - Severity: Critical
351 | | -      # ==========================================================
352 | | -      - alert: HighHttpErrorRate
353 | | -        expr: |
354 | | -          (sum(rate(http_server_requests_total{status=~"5.."}[1m]))
355 | | -          /
356 | | -          sum(rate(http_server_requests_total[1m]))) > 0.05
357 | | -        for: 2m
358 | | -        labels:
359 | | -          severity: critical
360 | | -        annotations:
361 | | -          summary: "High HTTP Error Rate ({{ $value | humanizePercentage }})"
362 | | -          description: "HTTP 5xx error rate is above 5% for the last 2 minutes."
363 | | -
364 | | -      - alert: HighGrpcErrorRate
365 | | -        expr: |
366 | | -          (sum(rate(grpc_server_requests_total{code!="OK"}[1m]))
367 | | -          /
368 | | -          sum(rate(grpc_server_requests_total[1m]))) > 0.05
369 | | -        for: 2m
370 | | -        labels:
371 | | -          severity: critical
372 | | -        annotations:
373 | | -          summary: "High gRPC Error Rate ({{ $value | humanizePercentage }})"
374 | | -          description: "gRPC error rate is above 5% for the last 2 minutes."
375 | | -
376 | | -      - alert: HighDbErrorRate
377 | | -        expr: |
378 | | -          (sum(rate(db_client_request_duration_seconds_count{result="error"}[1m]))
379 | | -          /
380 | | -          sum(rate(db_client_request_duration_seconds_count[1m]))) > 0.05
381 | | -        for: 2m
382 | | -        labels:
383 | | -          severity: critical
384 | | -        annotations:
385 | | -          summary: "High DB Error Rate ({{ $value | humanizePercentage }})"
386 | | -          description: "Database query error rate is above 5%."
387 | | -
388 | | -      - alert: HighRedisErrorRate
389 | | -        expr: |
390 | | -          (sum(rate(redis_client_requests_total{result!="success"}[1m]))
391 | | -          /
392 | | -          sum(rate(redis_client_requests_total[1m]))) > 0.05
393 | | -        for: 2m
394 | | -        labels:
395 | | -          severity: critical
396 | | -        annotations:
397 | | -          summary: "High Redis Error Rate ({{ $value | humanizePercentage }})"
398 | | -          description: "Redis command error rate is above 5%."
399 | | -
400 | | -      - alert: HighMongoErrorRate
401 | | -        expr: |
402 | | -          (sum(rate(mongo_client_requests_total{result="error"}[1m]))
403 | | -          /
404 | | -          sum(rate(mongo_client_requests_total[1m]))) > 0.05
405 | | -        for: 2m
406 | | -        labels:
407 | | -          severity: critical
408 | | -        annotations:
409 | | -          summary: "High MongoDB Error Rate ({{ $value | humanizePercentage }})"
410 | | -          description: "MongoDB command error rate is above 5%."
411 | | -
412 | | -      - alert: GrpcServerPanic
413 | | -        expr: increase(grpc_server_panics_total[1m]) > 0
414 | | -        for: 0m
415 | | -        labels:
416 | | -          severity: critical
417 | | -        annotations:
418 | | -          summary: "gRPC Server Panic detected"
419 | | -          description: "gRPC service recovered from a panic."
420 | | -
421 | | -      - alert: ScheduleJobFailed
422 | | -        expr: increase(schedule_jobs_total{result!="success"}[1m]) > 0
423 | | -        for: 0m
424 | | -        labels:
425 | | -          severity: warning
426 | | -        annotations:
427 | | -          summary: "Schedule Job Failed"
428 | | -          description: "Scheduled job {{ $labels.task }} failed execution."
429 | | -
430 | | -      # ==========================================================
431 | | -      # 2. Latency & UX - Severity: Warning
432 | | -      # ==========================================================
433 | | -      - alert: LowApdexScore
434 | | -        expr: |
435 | | -          (
436 | | -            sum(rate(http_server_request_duration_seconds_bucket{le="0.25"}[5m])) * 0.5 +
437 | | -            sum(rate(http_server_request_duration_seconds_bucket{le="1"}[5m])) * 0.5
438 | | -          )
439 | | -          /
440 | | -          sum(rate(http_server_request_duration_seconds_count[5m])) < 0.7
441 | | -        for: 5m
442 | | -        labels:
443 | | -          severity: warning
444 | | -        annotations:
445 | | -          summary: 'Low Apdex Score ({{ $value | printf "%.2f" }})'
446 | | -          description: "User satisfaction score (Apdex) is below 0.7 (Fair)."
447 | | -
448 | | -      - alert: HighHttpLatency
449 | | -        expr: |
450 | | -          histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) > 1.0
451 | | -        for: 5m
452 | | -        labels:
453 | | -          severity: warning
454 | | -        annotations:
455 | | -          summary: "High HTTP Latency ({{ $value }}s)"
456 | | -          description: "HTTP P99 latency is above 1s for the last 5 minutes."
457 | | -
458 | | -      - alert: HighRedisLatency
459 | | -        expr: |
460 | | -          histogram_quantile(0.99, sum(rate(redis_client_request_duration_seconds_bucket[5m])) by (le)) > 0.1
461 | | -        for: 5m
462 | | -        labels:
463 | | -          severity: warning
464 | | -        annotations:
465 | | -          summary: "High Redis Latency ({{ $value }}s)"
466 | | -          description: "Redis P99 latency is above 100ms for the last 5 minutes."
467 | | -
468 | | -      - alert: HighDbLatency
469 | | -        expr: |
470 | | -          histogram_quantile(0.99, sum(rate(db_client_request_duration_seconds_bucket[5m])) by (le)) > 0.5
471 | | -        for: 5m
472 | | -        labels:
473 | | -          severity: warning
474 | | -        annotations:
475 | | -          summary: "High DB Latency ({{ $value }}s)"
476 | | -          description: "Database P99 latency is above 500ms for the last 5 minutes."
477 | | -
478 | | -      - alert: HighMongoLatency
479 | | -        expr: |
480 | | -          histogram_quantile(0.99, sum(rate(mongo_client_request_duration_seconds_bucket[5m])) by (le)) > 0.5
481 | | -        for: 5m
482 | | -        labels:
483 | | -          severity: warning
484 | | -        annotations:
485 | | -          summary: "High MongoDB Latency ({{ $value }}s)"
486 | | -          description: "MongoDB P99 latency is above 500ms for the last 5 minutes."
487 | | -
488 | | -      # ==========================================================
489 | | -      # 3. Saturation - Severity: Warning
490 | | -      # ==========================================================
491 | | -      - alert: DBConnectionPoolSaturation
492 | | -        expr: |
493 | | -          sum(db_client_connections_in_use) by (database)
494 | | -          /
495 | | -          sum(db_client_connections_max_open) by (database) > 0.8
496 | | -        for: 5m
497 | | -        labels:
498 | | -          severity: warning
499 | | -        annotations:
500 | | -          summary: "DB Pool Saturation ({{ $value | humanizePercentage }})"
501 | | -          description: "Database connection pool usage is above 80%."
502 | | -
503 | | -      # ==========================================================
504 | | -      # 4. Go Runtime Anomalies - Severity: Warning/Critical
505 | | -      # ==========================================================
506 | | -      - alert: HighGoroutineCount
507 | | -        expr: go_goroutines > 10000
508 | | -        for: 5m
509 | | -        labels:
510 | | -          severity: warning
511 | | -        annotations:
512 | | -          summary: "High Goroutine Count ({{ $value }})"
513 | | -          description: "Goroutine count exceeds 10,000."
514 | | -
515 | | -      - alert: GoroutineLeak
516 | | -        expr: deriv(go_goroutines[5m]) > 100
517 | | -        for: 10m
518 | | -        labels:
519 | | -          severity: critical
520 | | -        annotations:
521 | | -          summary: "Potential Goroutine Leak"
522 | | -          description: "Goroutine count is increasing rapidly (>100/s)."
523 | | -
524 | | -      - alert: HighThreadCount
525 | | -        expr: go_threads > 500
526 | | -        for: 5m
527 | | -        labels:
528 | | -          severity: warning
529 | | -        annotations:
530 | | -          summary: "High Thread Count ({{ $value }})"
531 | | -          description: "OS thread count is above 500, possible thread leak."
532 | | -
533 | | -      - alert: HighMemoryUsage
534 | | -        expr: go_memstats_heap_inuse_bytes > 1e9
535 | | -        for: 5m
536 | | -        labels:
537 | | -          severity: warning
538 | | -        annotations:
539 | | -          summary: "High Memory Usage ({{ $value | humanize1024 }})"
540 | | -          description: "Heap in-use memory is above 1GB."
541 | | -
542 | | -      - alert: MemoryLeak
543 | | -        expr: deriv(go_memstats_heap_alloc_bytes[5m]) > 1e6
544 | | -        for: 15m
545 | | -        labels:
546 | | -          severity: critical
547 | | -        annotations:
548 | | -          summary: "Potential Memory Leak"
549 | | -          description: "Heap allocation is growing rapidly (>1MB/s)."
550 | | -
551 | | -      - alert: HighGCDuration
552 | | -        expr: go_gc_duration_seconds{quantile="1"} > 1
553 | | -        for: 1m
554 | | -        labels:
555 | | -          severity: warning
556 | | -        annotations:
557 | | -          summary: "High GC Duration ({{ $value }}s)"
558 | | -          description: "Max GC duration is above 1s."
559 | | -
560 | | -      - alert: HighGCRate
561 | | -        expr: rate(go_gc_duration_seconds_count[1m]) > 5
562 | | -        for: 5m
563 | | -        labels:
564 | | -          severity: warning
565 | | -        annotations:
566 | | -          summary: "High GC Rate ({{ $value }}/s)"
567 | | -          description: "GC is running more than 5 times per second."
568 | | -
569 | | -      - alert: HighGCCPUFraction
570 | | -        expr: go_memstats_gc_cpu_fraction > 0.3
571 | | -        for: 5m
572 | | -        labels:
573 | | -          severity: warning
574 | | -        annotations:
575 | | -          summary: "High GC CPU Usage ({{ $value | humanizePercentage }})"
576 | | -          description: "GC is consuming more than 30% of CPU time."
577 | | -```
| 345 | +[prometheus_alerts_template](./prometheus_alerts_template.yaml) |
578 | 346 |
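The template loads like any other rules file via the standard `rule_files` section of the Prometheus config; the path below is an assumption, adjust it to your deployment layout:

```yaml
# prometheus.yml (excerpt): assumes the template sits next to the config file.
rule_files:
  - prometheus_alerts_template.yaml
```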
|
579 | 347 | --- |
580 | 348 |
|
|