diff --git a/alertmanager/alertmanager.yml b/alertmanager/alertmanager.yml new file mode 100644 index 00000000..62e622a7 --- /dev/null +++ b/alertmanager/alertmanager.yml @@ -0,0 +1,31 @@ +global: + resolve_timeout: 5m + smtp_smarthost: 'SMTP_SMARTHOST_VALUE' + smtp_from: 'SMTP_FROM_VALUE' + smtp_auth_username: 'SMTP_AUTH_USERNAME_VALUE' + smtp_auth_password: 'SMTP_AUTH_PASSWORD_VALUE' + smtp_require_tls: true + +route: + group_by: ['alertname', 'job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 2h + receiver: email-only + routes: + - matchers: + - severity="critical" + receiver: email-and-sms + +receivers: + - name: email-only + email_configs: + - to: 'ALERT_EMAIL_TO_VALUE' + send_resolved: true + + - name: email-and-sms + email_configs: + - to: 'ALERT_EMAIL_TO_VALUE' + send_resolved: true + - to: 'ALERT_SMS_TO_VALUE' + send_resolved: true diff --git a/docker-compose.gold.yml b/docker-compose.gold.yml index b37fbaf0..5e6cae4b 100644 --- a/docker-compose.gold.yml +++ b/docker-compose.gold.yml @@ -54,6 +54,9 @@ services: DATABASE_PASSWORD: postgres REDIS_URL: redis://redis:6379 SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-1.log + volumes: + - ./logs/app-1:/app/logs depends_on: db: condition: service_healthy @@ -77,6 +80,9 @@ services: DATABASE_PASSWORD: postgres REDIS_URL: redis://redis:6379 SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-2.log + volumes: + - ./logs/app-2:/app/logs depends_on: db: condition: service_healthy @@ -102,6 +108,9 @@ services: DATABASE_PASSWORD: postgres REDIS_URL: redis://redis:6379 SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-3.log + volumes: + - ./logs/app-3:/app/logs depends_on: db: condition: service_healthy @@ -127,6 +136,9 @@ services: DATABASE_PASSWORD: postgres REDIS_URL: redis://redis:6379 SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-4.log + volumes: + - ./logs/app-4:/app/logs depends_on: db: condition: service_healthy diff --git a/docker-compose.yml 
b/docker-compose.yml index 83c26a5b..0a4aa5c5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,6 @@ name: url-shortner services: + # ── App infrastructure ────────────────────────────────────────────── db: image: postgres:16 environment: @@ -10,28 +11,71 @@ services: - "5433:5432" volumes: - postgres_data:/var/lib/postgresql/data + command: + - "postgres" + - "-c" + - "max_connections=200" + - "-c" + - "shared_buffers=256MB" + - "-c" + - "work_mem=4MB" + - "-c" + - "effective_cache_size=512MB" + - "-c" + - "synchronous_commit=off" healthcheck: test: ["CMD", "pg_isready", "-U", "postgres"] - interval: 10s - timeout: 5s + interval: 5s + timeout: 3s retries: 5 + deploy: + resources: + limits: + memory: 1G redis: image: redis:7 + command: redis-server --maxmemory 128mb --maxmemory-policy allkeys-lru --save "" healthcheck: test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 + deploy: + resources: + limits: + memory: 256M + + # ── App instances (gold: 4 instances + nginx LB) ──────────────────── + app-1: + build: . + environment: + DATABASE_NAME: hackathon_db + DATABASE_HOST: db + DATABASE_PORT: "5432" + DATABASE_USER: postgres + DATABASE_PASSWORD: postgres + REDIS_URL: redis://redis:6379 + SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-1.log + volumes: + - ./logs/app-1:/app/logs + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"] interval: 10s timeout: 5s + start_period: 15s retries: 5 + restart: always - app: + app-2: build: . 
- ports: - - "5000:5000" environment: - FLASK_DEBUG: "false" - FLASK_HOST: 0.0.0.0 - FLASK_PORT: "5000" DATABASE_NAME: hackathon_db DATABASE_HOST: db DATABASE_PORT: "5432" @@ -39,34 +83,145 @@ services: DATABASE_PASSWORD: postgres REDIS_URL: redis://redis:6379 SECRET_KEY: random_secret_key - LOG_FILE_PATH: ${LOG_FILE_PATH:-/app/logs/app.log} + LOG_FILE_PATH: /app/logs/app-2.log volumes: - - ./logs/app:/app/logs + - ./logs/app-2:/app/logs depends_on: db: condition: service_healthy redis: condition: service_healthy + app-1: + condition: service_healthy healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"] - interval: 15s + interval: 10s timeout: 5s + start_period: 15s retries: 5 restart: always + app-3: + build: . + environment: + DATABASE_NAME: hackathon_db + DATABASE_HOST: db + DATABASE_PORT: "5432" + DATABASE_USER: postgres + DATABASE_PASSWORD: postgres + REDIS_URL: redis://redis:6379 + SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-3.log + volumes: + - ./logs/app-3:/app/logs + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + app-1: + condition: service_healthy + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"] + interval: 10s + timeout: 5s + start_period: 15s + retries: 5 + restart: always + + app-4: + build: . 
+ environment: + DATABASE_NAME: hackathon_db + DATABASE_HOST: db + DATABASE_PORT: "5432" + DATABASE_USER: postgres + DATABASE_PASSWORD: postgres + REDIS_URL: redis://redis:6379 + SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-4.log + volumes: + - ./logs/app-4:/app/logs + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + app-1: + condition: service_healthy + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"] + interval: 10s + timeout: 5s + start_period: 15s + retries: 5 + restart: always + + nginx: + image: nginx:latest + ports: + - "80:80" + volumes: + - ./nginx/nginx.gold.conf:/etc/nginx/conf.d/default.conf:ro + depends_on: + app-1: + condition: service_healthy + app-2: + condition: service_healthy + app-3: + condition: service_healthy + app-4: + condition: service_healthy + restart: always + frontend: build: context: ./frontend dockerfile: docker/Dockerfile ports: - "3000:3000" - + + # ── Observability stack ───────────────────────────────────────────── prometheus: image: prom/prometheus:latest ports: - 9090:9090 volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro + + alertmanager: + image: prom/alertmanager:latest + entrypoint: /bin/sh + command: + - -c + - | + sed -e "s|SMTP_SMARTHOST_VALUE|$$SMTP_SMARTHOST|g" \ + -e "s|SMTP_FROM_VALUE|$$SMTP_FROM|g" \ + -e "s|SMTP_AUTH_USERNAME_VALUE|$$SMTP_AUTH_USERNAME|g" \ + -e "s|SMTP_AUTH_PASSWORD_VALUE|$$SMTP_AUTH_PASSWORD|g" \ + -e "s|ALERT_EMAIL_TO_VALUE|$$ALERT_EMAIL_TO|g" \ + -e "s|ALERT_SMS_TO_VALUE|$$ALERT_SMS_TO|g" \ + /etc/alertmanager/alertmanager.tmpl.yml > /tmp/alertmanager.yml \ + && exec /bin/alertmanager --config.file=/tmp/alertmanager.yml + environment: + SMTP_SMARTHOST: ${SMTP_SMARTHOST} + SMTP_FROM: ${SMTP_FROM} + SMTP_AUTH_USERNAME: ${SMTP_AUTH_USERNAME} + SMTP_AUTH_PASSWORD: ${SMTP_AUTH_PASSWORD} + 
ALERT_EMAIL_TO: ${ALERT_EMAIL_TO} + ALERT_SMS_TO: ${ALERT_SMS_TO} + volumes: + - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.tmpl.yml:ro + ports: + - 9093:9093 + + blackbox-exporter: + image: prom/blackbox-exporter:latest + volumes: + - ./prometheus/blackbox.yml:/etc/blackbox_exporter/config.yml:ro + ports: + - 9115:9115 node-exporter: image: prom/node-exporter:latest @@ -85,7 +240,7 @@ services: - ./prometheus/process-exporter.yml:/etc/process-exporter/config.yml:ro ports: - 9256:9256 - + otel: image: otel/opentelemetry-collector-contrib:latest volumes: @@ -128,6 +283,7 @@ services: - grafana-data:/var/lib/grafana - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro + - ./grafana/provisioning/alerting:/etc/grafana/provisioning/alerting:ro - ./grafana/dashboards:/var/lib/grafana/dashboards:ro volumes: diff --git a/grafana/dashboards/critical-alerts.json b/grafana/dashboards/critical-alerts.json new file mode 100644 index 00000000..2233a6ff --- /dev/null +++ b/grafana/dashboards/critical-alerts.json @@ -0,0 +1,94 @@ +{ + "uid": "meta-critical-alerts", + "title": "Meta Critical Alerts", + "schemaVersion": 38, + "version": 1, + "refresh": "15s", + "time": { + "from": "now-6h", + "to": "now" + }, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Firing Critical Alerts", + "gridPos": { "x": 0, "y": 0, "w": 8, "h": 5 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "refId": "A", + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"})" + } + ], + "options": { + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + 
{ "color": "red", "value": 1 } + ] + } + }, + "overrides": [] + } + }, + { + "id": 2, + "type": "alertlist", + "title": "Critical Alert List", + "gridPos": { "x": 8, "y": 0, "w": 16, "h": 12 }, + "options": { + "maxItems": 50, + "sortOrder": 1, + "stateFilter": { + "firing": true, + "pending": true, + "normal": false, + "inactive": false, + "nodata": true, + "error": true + }, + "viewMode": "list", + "groupMode": "default", + "showInstances": true, + "alertName": "", + "dashboardAlerts": false, + "folder": "", + "datasource": "-- Grafana --" + } + }, + { + "id": 3, + "type": "timeseries", + "title": "Critical Alerts Over Time", + "gridPos": { "x": 0, "y": 5, "w": 8, "h": 7 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "refId": "A", + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"})" + } + ], + "fieldConfig": { + "defaults": { + "min": 0 + }, + "overrides": [] + } + } + ] +} diff --git a/grafana/provisioning/alerting/loki-alert-rules.yml b/grafana/provisioning/alerting/loki-alert-rules.yml new file mode 100644 index 00000000..f16617e1 --- /dev/null +++ b/grafana/provisioning/alerting/loki-alert-rules.yml @@ -0,0 +1,190 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: app-log-alerts + folder: Observability + interval: 1m + rules: + - uid: app-error-logs + title: App Error Logs Detected + condition: C + data: + - refId: A + queryType: range + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: loki + model: + datasource: + type: loki + uid: loki + editorMode: code + expr: 'sum(count_over_time({service="app"} |= "\"level\": \"ERROR\"" [5m]))' + intervalMs: 1000 + maxDataPoints: 43200 + queryType: range + refId: A + - refId: C + queryType: "" + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + name: Expression + 
type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 2m + annotations: + summary: "Application emitted error logs" + description: "One or more ERROR level application logs detected in the past 5 minutes." + labels: + severity: warning + source: loki + + - uid: high-error-rate + title: High HTTP Error Rate + condition: C + data: + - refId: A + queryType: range + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: loki + model: + datasource: + type: loki + uid: loki + editorMode: code + expr: | + sum(count_over_time({service=~"app-.*"} |= "request_completed" | json | status_code >= 500 [5m])) + / + sum(count_over_time({service=~"app-.*"} |= "request_completed" [5m])) + intervalMs: 1000 + maxDataPoints: 43200 + queryType: range + refId: A + - refId: C + queryType: "" + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0.1 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 2m + annotations: + summary: "High HTTP error rate detected" + description: "More than 10% of HTTP requests are returning 5xx errors over the past 5 minutes." 
+ labels: + severity: critical + source: loki + + - uid: service-no-logs + title: Service Not Producing Logs + condition: C + data: + - refId: A + queryType: range + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: loki + model: + datasource: + type: loki + uid: loki + editorMode: code + expr: 'sum(count_over_time({service=~"app-.*"} [5m]))' + intervalMs: 1000 + maxDataPoints: 43200 + queryType: range + refId: A + - refId: C + queryType: "" + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 1 + type: lt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: Alerting + execErrState: Error + for: 5m + annotations: + summary: "Service stopped producing logs" + description: "No logs received from any app instance in the past 5 minutes. Service may be down." + labels: + severity: critical + source: loki diff --git a/prometheus/alerts.yml b/prometheus/alerts.yml new file mode 100644 index 00000000..f7b90971 --- /dev/null +++ b/prometheus/alerts.yml @@ -0,0 +1,138 @@ +groups: + - name: platform-health + interval: 30s + rules: + - alert: PrometheusTargetDown + expr: up == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Prometheus target down" + description: "Target {{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes." + + - alert: HostHighCpuUsage + expr: 100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 85 + for: 10m + labels: + severity: warning + annotations: + summary: "High CPU usage" + description: "Node {{ $labels.instance }} CPU usage is above 85% for 10 minutes." 
+ + - alert: HostLowDiskSpace + expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15 + for: 10m + labels: + severity: warning + annotations: + summary: "Low disk space" + description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% free space." + + - name: service-health + interval: 15s + rules: + - alert: ServiceDown + expr: probe_success{job="blackbox-health"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Service is down" + description: "Health probe {{ $labels.instance }} has been failing for more than 2 minutes. The application is unreachable." + + - alert: ServiceHighLatency + expr: probe_duration_seconds{job="blackbox-health"} > 5 + for: 2m + labels: + severity: warning + annotations: + summary: "Service high latency" + description: "Health probe {{ $labels.instance }} is taking more than 5 seconds to respond." + + - name: observability-stack + interval: 30s + rules: + - alert: LokiUnavailable + expr: up{job="loki"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Loki unavailable" + description: "Loki is down, log queries and ingestion may be impacted." + + - alert: PromtailUnavailable + expr: up{job="promtail"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Promtail unavailable" + description: "Promtail is down, new logs are not being shipped to Loki." + + - alert: PromtailNoNewLogs + expr: increase(promtail_read_lines_total[15m]) == 0 + for: 15m + labels: + severity: warning + annotations: + summary: "No new logs shipped by Promtail" + description: "Promtail has not read any new log lines in the last 15 minutes." 
+ + - name: process-health + interval: 30s + rules: + - alert: MainApplicationProcessMissing + expr: absent(namedprocess_namegroup_num_procs{groupname="backend-main"}) or namedprocess_namegroup_num_procs{groupname="backend-main"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Main application process missing" + description: "The backend main process (gunicorn/run.py) is not detected by process-exporter." + + - alert: FrontendProcessMissing + expr: namedprocess_namegroup_num_procs{groupname="frontend"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Frontend process missing" + description: "No frontend process detected by process-exporter." + + - alert: PostgresProcessMissing + expr: namedprocess_namegroup_num_procs{groupname="database-postgres"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Postgres process missing" + description: "No postgres process detected by process-exporter." + + - alert: PrometheusProcessMissing + expr: namedprocess_namegroup_num_procs{groupname="prometheus"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Prometheus process missing" + description: "No prometheus process detected by process-exporter." + + - alert: LokiProcessMissing + expr: namedprocess_namegroup_num_procs{groupname="loki"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Loki process missing" + description: "No loki process detected by process-exporter." + + - alert: PromtailProcessMissing + expr: namedprocess_namegroup_num_procs{groupname="promtail"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Promtail process missing" + description: "No promtail process detected by process-exporter." 
diff --git a/prometheus/blackbox.yml b/prometheus/blackbox.yml new file mode 100644 index 00000000..7ea82a6d --- /dev/null +++ b/prometheus/blackbox.yml @@ -0,0 +1,10 @@ +modules: + http_2xx: + prober: http + timeout: 5s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + valid_status_codes: [200] + method: GET + follow_redirects: true + fail_if_not_ssl: false diff --git a/prometheus/process-exporter.yml b/prometheus/process-exporter.yml index cad4f485..3bc1f048 100644 --- a/prometheus/process-exporter.yml +++ b/prometheus/process-exporter.yml @@ -1,14 +1,14 @@ process_names: - - name: "backend-flask" - comm: - - 'python' + - name: "backend-main" cmdline: - - '.*run\.py.*' + - '.*/gunicorn' + - '.*run:app.*' - name: "frontend" comm: - 'next-server (v' - - 'next-server' + cmdline: + - '.*next-server.*' - name: "database-postgres" cmdline: diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml index 3bdd34bd..1d26d60c 100644 --- a/prometheus/prometheus.yml +++ b/prometheus/prometheus.yml @@ -1,6 +1,14 @@ global: scrape_interval: 15s +alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + +rule_files: + - /etc/prometheus/alerts.yml + scrape_configs: - job_name: 'prometheus' static_configs: @@ -20,3 +28,37 @@ scrape_configs: static_configs: - targets: ['process-exporter:9256'] metrics_path: /metrics + + - job_name: 'blackbox-health' + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://nginx:80/health/live + - http://nginx:80/health/ready + labels: + service: app + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + scrape_interval: 15s + + - job_name: 'blackbox-exporter' + static_configs: + - targets: ['blackbox-exporter:9115'] + metrics_path: /metrics + + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + 
metrics_path: /metrics + + - job_name: 'promtail' + static_configs: + - targets: ['promtail:9080'] + metrics_path: /metrics diff --git a/scripts/test_tier2.sh b/scripts/test_tier2.sh new file mode 100644 index 00000000..3001de62 --- /dev/null +++ b/scripts/test_tier2.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Tier 2 verification script — tests logging + alertmanager config +set -e + +BASE="http://localhost" +AM="http://localhost:9093" + +echo "=== 1. Check app health through nginx ===" +HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/health/live" 2>/dev/null || echo "FAIL") +echo "GET /health/live => $HTTP_CODE" + +echo "" +echo "=== 2. Generate some traffic ===" +for i in $(seq 1 5); do + curl -s -o /dev/null -w "GET /users => %{http_code}\n" "$BASE/users" +done + +echo "" +echo "=== 3. Check log files exist on host ===" +for i in 1 2 3 4; do + LOG="./logs/app-$i/app-$i.log" + if [ -f "$LOG" ]; then + LINES=$(wc -l < "$LOG") + echo " app-$i.log: $LINES lines" + else + echo " app-$i.log: MISSING" + fi +done + +# Also check single-app log +if [ -f "./logs/app/app.log" ]; then + LINES=$(wc -l < "./logs/app/app.log") + echo " app.log (single): $LINES lines" +fi + +echo "" +echo "=== 4. Check Alertmanager status ===" +AM_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$AM/-/healthy" 2>/dev/null || echo "FAIL") +echo "Alertmanager health => $AM_STATUS" + +echo "" +echo "=== 5. 
Check Alertmanager config has real credentials ===" +AM_CONFIG=$(curl -s "$AM/api/v2/status" 2>/dev/null) +if echo "$AM_CONFIG" | grep -q "_VALUE"; then + echo " FAIL: Alertmanager still has placeholder credentials" +elif echo "$AM_CONFIG" | grep -q "example.com"; then + echo " FAIL: Alertmanager still has example.com addresses" +elif echo "$AM_CONFIG" | grep -q "smtp_smarthost"; then + echo " OK: Alertmanager config loaded (check emails below)" + echo "$AM_CONFIG" | python3 -m json.tool 2>/dev/null | grep -E "(smtp_from|smtp_smarthost|to:)" || true +else + echo " WARN: Could not fetch Alertmanager config" +fi + +echo "" +echo "=== 6. Check Prometheus alert rules ===" +PROM_RULES=$(curl -s "http://localhost:9090/api/v1/rules" 2>/dev/null) +ALERT_COUNT=$(echo "$PROM_RULES" | python3 -c "import sys,json; d=json.load(sys.stdin); print(sum(len(g.get('rules',[])) for g in d.get('data',{}).get('groups',[])))" 2>/dev/null || echo "?") +echo "Prometheus has $ALERT_COUNT alert rules loaded" + +echo "" +echo "=== 7. Check for any firing alerts ===" +FIRING=$(curl -s "$AM/api/v2/alerts" 2>/dev/null) +FIRING_COUNT=$(echo "$FIRING" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo "?") +echo "Currently firing alerts: $FIRING_COUNT" + +echo "" +echo "=== Done ==="