|
63 | 63 |
|
64 | 64 | ### 1.2 HTTP Client (Wukong) |
65 | 65 |
|
66 | | -| Metric Name                             | Type      | Labels                                             | Description                                  |
67 | | -| :-------------------------------------- | :-------- | :------------------------------------------------- | :------------------------------------------- |
68 | | -| `http_client_requests_inflight`         | Gauge     | `method`, `baseUrl`, `url`                          | Number of in-flight downstream HTTP requests |
69 | | -| `http_client_requests_total`            | Counter   | `method`, `baseUrl`, `url`, `statusCode`, `result`  | Total number of outbound HTTP requests       |
70 | | -| `http_client_request_duration_seconds`  | Histogram | `method`, `baseUrl`, `url`, `statusCode`, `result`  | Distribution of HTTP request durations       |
| 66 | +| Metric Name                             | Type      | Labels                                        | Description                                  |
| 67 | +| :-------------------------------------- | :-------- | :--------------------------------------------- | :------------------------------------------- |
| 68 | +| `http_client_requests_inflight`         | Gauge     | `method`, `baseUrl`, `url`                     | Number of in-flight downstream HTTP requests |
| 69 | +| `http_client_requests_total`            | Counter   | `method`, `baseUrl`, `url`, `status`, `error`  | Total number of outbound HTTP requests       |
| 70 | +| `http_client_request_duration_seconds`  | Histogram | `method`, `baseUrl`, `url`, `status`, `error`  | Distribution of HTTP request durations       |
71 | 71 |
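As a sketch of how the renamed labels can be consumed (a hypothetical rule, not part of this change; it assumes `status` carries the numeric HTTP status code, mirroring the server-side `http_server_requests_total` metric), a downstream error-ratio alert for the `rules:` list below might look like:

```yaml
# Hypothetical alert against the client metrics above. Assumes `status`
# holds the HTTP status code, as on the server-side metrics.
- alert: HighHttpClientErrorRate
  expr: |
    (sum by (baseUrl) (rate(http_client_requests_total{status=~"5.."}[5m]))
    /
    sum by (baseUrl) (rate(http_client_requests_total[5m]))) > 0.05
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "High downstream error rate to {{ $labels.baseUrl }}"
```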
|
72 | 72 | ### 1.3 gRPC Server |
73 | 73 |
|
|
342 | 342 |
|
343 | 343 | The following recommended Prometheus alerting rules cover availability, latency, error rate, resource saturation, and runtime anomalies. Note that the Apdex rule's weighting (0.5 × `le="0.25"` + 0.5 × `le="1"`) works because histogram buckets are cumulative, making it equivalent to the standard (satisfied + tolerating/2) / total.
344 | 344 |
|
345 | | -```yaml
346 | | -groups:
347 | | -  - name: box-server-alerts
348 | | -    rules:
349 | | -      # ==========================================================
350 | | -      # 1. Availability & Errors - Severity: Critical
351 | | -      # ==========================================================
352 | | -      - alert: HighHttpErrorRate
353 | | -        expr: |
354 | | -          (sum(rate(http_server_requests_total{status=~"5.."}[1m]))
355 | | -          /
356 | | -          sum(rate(http_server_requests_total[1m]))) > 0.05
357 | | -        for: 2m
358 | | -        labels:
359 | | -          severity: critical
360 | | -        annotations:
361 | | -          summary: "High HTTP Error Rate ({{ $value | humanizePercentage }})"
362 | | -          description: "HTTP 5xx error rate is above 5% for the last 2 minutes."
363 | | -
364 | | -      - alert: HighGrpcErrorRate
365 | | -        expr: |
366 | | -          (sum(rate(grpc_server_requests_total{code!="OK"}[1m]))
367 | | -          /
368 | | -          sum(rate(grpc_server_requests_total[1m]))) > 0.05
369 | | -        for: 2m
370 | | -        labels:
371 | | -          severity: critical
372 | | -        annotations:
373 | | -          summary: "High gRPC Error Rate ({{ $value | humanizePercentage }})"
374 | | -          description: "gRPC error rate is above 5% for the last 2 minutes."
375 | | -
376 | | -      - alert: HighDbErrorRate
377 | | -        expr: |
378 | | -          (sum(rate(db_client_request_duration_seconds_count{result="error"}[1m]))
379 | | -          /
380 | | -          sum(rate(db_client_request_duration_seconds_count[1m]))) > 0.05
381 | | -        for: 2m
382 | | -        labels:
383 | | -          severity: critical
384 | | -        annotations:
385 | | -          summary: "High DB Error Rate ({{ $value | humanizePercentage }})"
386 | | -          description: "Database query error rate is above 5%."
387 | | -
388 | | -      - alert: HighRedisErrorRate
389 | | -        expr: |
390 | | -          (sum(rate(redis_client_requests_total{result!="success"}[1m]))
391 | | -          /
392 | | -          sum(rate(redis_client_requests_total[1m]))) > 0.05
393 | | -        for: 2m
394 | | -        labels:
395 | | -          severity: critical
396 | | -        annotations:
397 | | -          summary: "High Redis Error Rate ({{ $value | humanizePercentage }})"
398 | | -          description: "Redis command error rate is above 5%."
399 | | -
400 | | -      - alert: HighMongoErrorRate
401 | | -        expr: |
402 | | -          (sum(rate(mongo_client_requests_total{result="error"}[1m]))
403 | | -          /
404 | | -          sum(rate(mongo_client_requests_total[1m]))) > 0.05
405 | | -        for: 2m
406 | | -        labels:
407 | | -          severity: critical
408 | | -        annotations:
409 | | -          summary: "High MongoDB Error Rate ({{ $value | humanizePercentage }})"
410 | | -          description: "MongoDB command error rate is above 5%."
411 | | -
412 | | -      - alert: GrpcServerPanic
413 | | -        expr: increase(grpc_server_panics_total[1m]) > 0
414 | | -        for: 0m
415 | | -        labels:
416 | | -          severity: critical
417 | | -        annotations:
418 | | -          summary: "gRPC Server Panic detected"
419 | | -          description: "gRPC service recovered from a panic."
420 | | -
421 | | -      - alert: ScheduleJobFailed
422 | | -        expr: increase(schedule_jobs_total{result!="success"}[1m]) > 0
423 | | -        for: 0m
424 | | -        labels:
425 | | -          severity: warning
426 | | -        annotations:
427 | | -          summary: "Schedule Job Failed"
428 | | -          description: "Scheduled job {{ $labels.task }} failed execution."
429 | | -
430 | | -      # ==========================================================
431 | | -      # 2. Latency & UX - Severity: Warning
432 | | -      # ==========================================================
433 | | -      - alert: LowApdexScore
434 | | -        expr: |
435 | | -          (
436 | | -            sum(rate(http_server_request_duration_seconds_bucket{le="0.25"}[5m])) * 0.5 +
437 | | -            sum(rate(http_server_request_duration_seconds_bucket{le="1"}[5m])) * 0.5
438 | | -          )
439 | | -          /
440 | | -          sum(rate(http_server_request_duration_seconds_count[5m])) < 0.7
441 | | -        for: 5m
442 | | -        labels:
443 | | -          severity: warning
444 | | -        annotations:
445 | | -          summary: 'Low Apdex Score ({{ $value | printf "%.2f" }})'
446 | | -          description: "User satisfaction score (Apdex) is below 0.7 (Fair)."
447 | | -
448 | | -      - alert: HighHttpLatency
449 | | -        expr: |
450 | | -          histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) > 1.0
451 | | -        for: 5m
452 | | -        labels:
453 | | -          severity: warning
454 | | -        annotations:
455 | | -          summary: "High HTTP Latency ({{ $value }}s)"
456 | | -          description: "HTTP P99 latency is above 1s for the last 5 minutes."
457 | | -
458 | | -      - alert: HighRedisLatency
459 | | -        expr: |
460 | | -          histogram_quantile(0.99, sum(rate(redis_client_request_duration_seconds_bucket[5m])) by (le)) > 0.1
461 | | -        for: 5m
462 | | -        labels:
463 | | -          severity: warning
464 | | -        annotations:
465 | | -          summary: "High Redis Latency ({{ $value }}s)"
466 | | -          description: "Redis P99 latency is above 100ms for the last 5 minutes."
467 | | -
468 | | -      - alert: HighDbLatency
469 | | -        expr: |
470 | | -          histogram_quantile(0.99, sum(rate(db_client_request_duration_seconds_bucket[5m])) by (le)) > 0.5
471 | | -        for: 5m
472 | | -        labels:
473 | | -          severity: warning
474 | | -        annotations:
475 | | -          summary: "High DB Latency ({{ $value }}s)"
476 | | -          description: "Database P99 latency is above 500ms for the last 5 minutes."
477 | | -
478 | | -      - alert: HighMongoLatency
479 | | -        expr: |
480 | | -          histogram_quantile(0.99, sum(rate(mongo_client_request_duration_seconds_bucket[5m])) by (le)) > 0.5
481 | | -        for: 5m
482 | | -        labels:
483 | | -          severity: warning
484 | | -        annotations:
485 | | -          summary: "High MongoDB Latency ({{ $value }}s)"
486 | | -          description: "MongoDB P99 latency is above 500ms for the last 5 minutes."
487 | | -
488 | | -      # ==========================================================
489 | | -      # 3. Saturation - Severity: Warning
490 | | -      # ==========================================================
491 | | -      - alert: DBConnectionPoolSaturation
492 | | -        expr: |
493 | | -          sum(db_client_connections_in_use) by (database)
494 | | -          /
495 | | -          sum(db_client_connections_max_open) by (database) > 0.8
496 | | -        for: 5m
497 | | -        labels:
498 | | -          severity: warning
499 | | -        annotations:
500 | | -          summary: "DB Pool Saturation ({{ $value | humanizePercentage }})"
501 | | -          description: "Database connection pool usage is above 80%."
502 | | -
503 | | -      # ==========================================================
504 | | -      # 4. Go Runtime Anomalies - Severity: Warning/Critical
505 | | -      # ==========================================================
506 | | -      - alert: HighGoroutineCount
507 | | -        expr: go_goroutines > 10000
508 | | -        for: 5m
509 | | -        labels:
510 | | -          severity: warning
511 | | -        annotations:
512 | | -          summary: "High Goroutine Count ({{ $value }})"
513 | | -          description: "Goroutine count exceeds 10,000."
514 | | -
515 | | -      - alert: GoroutineLeak
516 | | -        expr: deriv(go_goroutines[5m]) > 100
517 | | -        for: 10m
518 | | -        labels:
519 | | -          severity: critical
520 | | -        annotations:
521 | | -          summary: "Potential Goroutine Leak"
522 | | -          description: "Goroutine count is increasing rapidly (>100/s)."
523 | | -
524 | | -      - alert: HighThreadCount
525 | | -        expr: go_threads > 500
526 | | -        for: 5m
527 | | -        labels:
528 | | -          severity: warning
529 | | -        annotations:
530 | | -          summary: "High Thread Count ({{ $value }})"
531 | | -          description: "OS thread count is above 500, possible thread leak."
532 | | -
533 | | -      - alert: HighMemoryUsage
534 | | -        expr: go_memstats_heap_inuse_bytes > 1e9
535 | | -        for: 5m
536 | | -        labels:
537 | | -          severity: warning
538 | | -        annotations:
539 | | -          summary: "High Memory Usage ({{ $value | humanize1024 }})"
540 | | -          description: "Heap in-use memory is above 1GB."
541 | | -
542 | | -      - alert: MemoryLeak
543 | | -        expr: deriv(go_memstats_heap_alloc_bytes[5m]) > 1e6
544 | | -        for: 15m
545 | | -        labels:
546 | | -          severity: critical
547 | | -        annotations:
548 | | -          summary: "Potential Memory Leak"
549 | | -          description: "Heap allocation is growing rapidly (>1MB/s)."
550 | | -
551 | | -      - alert: HighGCDuration
552 | | -        expr: go_gc_duration_seconds{quantile="1"} > 1
553 | | -        for: 1m
554 | | -        labels:
555 | | -          severity: warning
556 | | -        annotations:
557 | | -          summary: "High GC Duration ({{ $value }}s)"
558 | | -          description: "Max GC duration is above 1s."
559 | | -
560 | | -      - alert: HighGCRate
561 | | -        expr: rate(go_gc_duration_seconds_count[1m]) > 5
562 | | -        for: 5m
563 | | -        labels:
564 | | -          severity: warning
565 | | -        annotations:
566 | | -          summary: "High GC Rate ({{ $value }}/s)"
567 | | -          description: "GC is running more than 5 times per second."
568 | | -
569 | | -      - alert: HighGCCPUFraction
570 | | -        expr: go_memstats_gc_cpu_fraction > 0.3
571 | | -        for: 5m
572 | | -        labels:
573 | | -          severity: warning
574 | | -        annotations:
575 | | -          summary: "High GC CPU Usage ({{ $value | humanizePercentage }})"
576 | | -          description: "GC is consuming more than 30% of CPU time."
577 | | -```
| 345 | +[prometheus_alerts_template](./prometheus_alerts_template.yaml) |
578 | 346 |
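The template loads like any other rules file via the standard `rule_files` section of the Prometheus config; the path below is an assumption, adjust it to your deployment layout:

```yaml
# prometheus.yml (excerpt): assumes the template sits next to the config file.
rule_files:
  - prometheus_alerts_template.yaml
```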
|
579 | 347 | --- |
580 | 348 |
|
|