From 61f7df60073ef5314f4269a4bdd31e682d1670eb Mon Sep 17 00:00:00 2001
From: Victor
Date: Sat, 4 Apr 2026 20:07:30 -0400
Subject: [PATCH 1/5] added prometheus and grafana alerts

---
 app/__init__.py                          |  14 ++-
 docker-compose.yml                       |   2 +
 grafana/dashboards/critical-alerts.json  |  94 ++++++++++++
 .../alerting/loki-alert-rules.yml        |  67 ++++++++++
 prometheus/alerts.yml                    | 117 ++++++++++++++++++
 prometheus/process-exporter.yml          |   5 +
 prometheus/prometheus.yml                |  13 ++
 7 files changed, 311 insertions(+), 1 deletion(-)
 create mode 100644 grafana/dashboards/critical-alerts.json
 create mode 100644 grafana/provisioning/alerting/loki-alert-rules.yml
 create mode 100644 prometheus/alerts.yml

diff --git a/app/__init__.py b/app/__init__.py
index 62761323..4f8c7720 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -7,7 +7,7 @@
 from datetime import datetime, timezone
 
 from dotenv import load_dotenv
-from flask import Flask, current_app, jsonify, g, request
+from flask import Flask, Response, current_app, jsonify, g, request
 from flask_cors import CORS
 
 from app.database import init_db, db, check_db_connection
@@ -222,6 +222,18 @@ def health():
             cache=cache_status,
         ), status_code
 
+    @app.route("/metrics", methods=["GET"])
+    def metrics():
+        metrics_payload = (
+            "# HELP app_info Static metadata for the Flask service\n"
+            "# TYPE app_info gauge\n"
+            "app_info{service=\"url-shortener-api\",component=\"backend\"} 1\n"
+        )
+        return Response(
+            metrics_payload,
+            mimetype="text/plain; version=0.0.4; charset=utf-8",
+        )
+
     @app.errorhandler(404)
     def not_found(e):
         app.logger.warning(
diff --git a/docker-compose.yml b/docker-compose.yml
index 83c26a5b..6d1bf228 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -67,6 +67,7 @@ services:
       - 9090:9090
     volumes:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
 
   node-exporter:
     image: prom/node-exporter:latest
@@ -128,6 +129,7 @@ services:
       - grafana-data:/var/lib/grafana
       - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
       - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
+      - ./grafana/provisioning/alerting:/etc/grafana/provisioning/alerting:ro
       - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
 
 volumes:
diff --git a/grafana/dashboards/critical-alerts.json b/grafana/dashboards/critical-alerts.json
new file mode 100644
index 00000000..2233a6ff
--- /dev/null
+++ b/grafana/dashboards/critical-alerts.json
@@ -0,0 +1,94 @@
+{
+  "uid": "meta-critical-alerts",
+  "title": "Meta Critical Alerts",
+  "schemaVersion": 38,
+  "version": 1,
+  "refresh": "15s",
+  "time": {
+    "from": "now-6h",
+    "to": "now"
+  },
+  "panels": [
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "Firing Critical Alerts",
+      "gridPos": { "x": 0, "y": 0, "w": 8, "h": 5 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"})"
+        }
+      ],
+      "options": {
+        "reduceOptions": {
+          "values": false,
+          "calcs": ["lastNotNull"],
+          "fields": ""
+        },
+        "orientation": "auto",
+        "textMode": "auto",
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "center"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "red", "value": 1 }
+            ]
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 2,
+      "type": "alertlist",
+      "title": "Critical Alert List",
+      "gridPos": { "x": 8, "y": 0, "w": 16, "h": 12 },
+      "options": {
+        "maxItems": 50,
+        "sortOrder": 1,
+        "stateFilter": {
+          "firing": true,
+          "pending": true,
+          "normal": false,
+          "inactive": false,
+          "nodata": true,
+          "error": true
+        },
+        "viewMode": "list",
+        "groupMode": "default",
+        "showInstances": true,
+        "alertName": "",
+        "dashboardAlerts": false,
+        "folder": "",
+        "datasource": "-- Grafana --"
+      }
+    },
+    {
+      "id": 3,
+      "type": "timeseries",
+      "title": "Critical Alerts Over Time",
+      "gridPos": { "x": 0, "y": 5, "w": 8, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"})"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "min": 0
+        },
+        "overrides": []
+      }
+    }
+  ]
+}
diff --git a/grafana/provisioning/alerting/loki-alert-rules.yml b/grafana/provisioning/alerting/loki-alert-rules.yml
new file mode 100644
index 00000000..c82afccd
--- /dev/null
+++ b/grafana/provisioning/alerting/loki-alert-rules.yml
@@ -0,0 +1,67 @@
+apiVersion: 1
+
+groups:
+  - orgId: 1
+    name: app-log-alerts
+    folder: Observability
+    interval: 1m
+    rules:
+      - uid: app-error-logs
+        title: App Error Logs Detected
+        condition: C
+        data:
+          - refId: A
+            queryType: range
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: loki
+            model:
+              datasource:
+                type: loki
+                uid: loki
+              editorMode: code
+              expr: 'sum(count_over_time({service="app"} |= "\"level\": \"ERROR\"" [5m]))'
+              intervalMs: 1000
+              maxDataPoints: 43200
+              queryType: range
+              refId: A
+          - refId: C
+            queryType: ""
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - C
+                  reducer:
+                    params: []
+                    type: last
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              expression: A
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: C
+              type: threshold
+        noDataState: NoData
+        execErrState: Error
+        for: 2m
+        annotations:
+          summary: "Application emitted error logs"
+          description: "One or more ERROR level application logs detected in the past 5 minutes."
+        labels:
+          severity: warning
+          source: loki
diff --git a/prometheus/alerts.yml b/prometheus/alerts.yml
new file mode 100644
index 00000000..c5efe692
--- /dev/null
+++ b/prometheus/alerts.yml
@@ -0,0 +1,117 @@
+groups:
+  - name: platform-health
+    interval: 30s
+    rules:
+      - alert: PrometheusTargetDown
+        expr: up == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus target down"
+          description: "Target {{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes."
+
+      - alert: HostHighCpuUsage
+        expr: 100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 85
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage"
+          description: "Node {{ $labels.instance }} CPU usage is above 85% for 10 minutes."
+
+      - alert: HostLowDiskSpace
+        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low disk space"
+          description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% free space."
+
+  - name: observability-stack
+    interval: 30s
+    rules:
+      - alert: LokiUnavailable
+        expr: up{job="loki"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Loki unavailable"
+          description: "Loki is down; log queries and ingestion may be impacted."
+
+      - alert: PromtailUnavailable
+        expr: up{job="promtail"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Promtail unavailable"
+          description: "Promtail is down; new logs are not being shipped to Loki."
+
+      - alert: PromtailNoNewLogs
+        expr: increase(promtail_read_lines_total[15m]) == 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "No new logs shipped by Promtail"
+          description: "Promtail has not read any new log lines in the last 15 minutes."
+
+  - name: process-health
+    interval: 30s
+    rules:
+      - alert: MainApplicationProcessMissing
+        expr: absent(namedprocess_namegroup_num_procs{groupname="backend-main"}) or namedprocess_namegroup_num_procs{groupname="backend-main"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Main application process missing"
+          description: "The backend main process (gunicorn/run.py) is not detected by process-exporter."
+
+      - alert: FrontendProcessMissing
+        expr: namedprocess_namegroup_num_procs{groupname="frontend"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Frontend process missing"
+          description: "No frontend process detected by process-exporter."
+
+      - alert: PostgresProcessMissing
+        expr: namedprocess_namegroup_num_procs{groupname="database-postgres"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Postgres process missing"
+          description: "No postgres process detected by process-exporter."
+
+      - alert: PrometheusProcessMissing
+        expr: namedprocess_namegroup_num_procs{groupname="prometheus"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus process missing"
+          description: "No prometheus process detected by process-exporter."
+
+      - alert: LokiProcessMissing
+        expr: namedprocess_namegroup_num_procs{groupname="loki"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Loki process missing"
+          description: "No loki process detected by process-exporter."
+
+      - alert: PromtailProcessMissing
+        expr: namedprocess_namegroup_num_procs{groupname="promtail"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Promtail process missing"
+          description: "No promtail process detected by process-exporter."
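
NOTE: in PromQL, a bare comparison such as namedprocess_namegroup_num_procs{groupname="frontend"} < 1 only fires while the series still exists; if process-exporter drops the series entirely once no process matches, the expression returns no samples and the alert can never fire. Only the backend-main rule above guards against that case with absent(). A quick way to sanity-check the rule file before Prometheus loads it, as a sketch (promtool ships with Prometheus releases, including the prom/prometheus image used here):

    # validate syntax and expressions in the alert rules
    promtool check rules prometheus/alerts.yml
    # or against the copy mounted into the running container
    docker compose exec prometheus promtool check rules /etc/prometheus/alerts.yml
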
diff --git a/prometheus/process-exporter.yml b/prometheus/process-exporter.yml
index cad4f485..e34b1639 100644
--- a/prometheus/process-exporter.yml
+++ b/prometheus/process-exporter.yml
@@ -1,4 +1,9 @@
 process_names:
+  - name: "backend-main"
+    cmdline:
+      - '.*/gunicorn'
+      - '.*run:app.*'
+
   - name: "backend-flask"
     comm:
       - 'python'
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
index 3bdd34bd..c084eff8 100644
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -1,6 +1,9 @@
 global:
   scrape_interval: 15s
 
+rule_files:
+  - /etc/prometheus/alerts.yml
+
 scrape_configs:
   - job_name: 'prometheus'
     static_configs:
@@ -20,3 +23,13 @@ scrape_configs:
     static_configs:
       - targets: ['process-exporter:9256']
     metrics_path: /metrics
+
+  - job_name: 'loki'
+    static_configs:
+      - targets: ['loki:3100']
+    metrics_path: /metrics
+
+  - job_name: 'promtail'
+    static_configs:
+      - targets: ['promtail:9080']
+    metrics_path: /metrics

From f2dd248749c6ecf81456fde214d9bf6b85dd060d Mon Sep 17 00:00:00 2001
From: Victor
Date: Sat, 4 Apr 2026 20:15:03 -0400
Subject: [PATCH 2/5] oopsies

---
 app/__init__.py | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/app/__init__.py b/app/__init__.py
index 4f8c7720..62761323 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -7,7 +7,7 @@
 from datetime import datetime, timezone
 
 from dotenv import load_dotenv
-from flask import Flask, Response, current_app, jsonify, g, request
+from flask import Flask, current_app, jsonify, g, request
 from flask_cors import CORS
 
 from app.database import init_db, db, check_db_connection
@@ -222,18 +222,6 @@ def health():
             cache=cache_status,
         ), status_code
 
-    @app.route("/metrics", methods=["GET"])
-    def metrics():
-        metrics_payload = (
-            "# HELP app_info Static metadata for the Flask service\n"
-            "# TYPE app_info gauge\n"
-            "app_info{service=\"url-shortener-api\",component=\"backend\"} 1\n"
-        )
-        return Response(
-            metrics_payload,
-            mimetype="text/plain; version=0.0.4; charset=utf-8",
-        )
-
     @app.errorhandler(404)
     def not_found(e):
         app.logger.warning(

From 4210ae537966dca82ac2389db6763a381e75ab82 Mon Sep 17 00:00:00 2001
From: Victor
Date: Sat, 4 Apr 2026 20:32:35 -0400
Subject: [PATCH 3/5] auto notifications

---
 alertmanager/alertmanager.yml | 31 +++++++++++++++++++++++++++++++
 docker-compose.yml            |  9 +++++++++
 prometheus/prometheus.yml     |  5 +++++
 3 files changed, 45 insertions(+)
 create mode 100644 alertmanager/alertmanager.yml

diff --git a/alertmanager/alertmanager.yml b/alertmanager/alertmanager.yml
new file mode 100644
index 00000000..1cbe7d45
--- /dev/null
+++ b/alertmanager/alertmanager.yml
@@ -0,0 +1,31 @@
+global:
+  resolve_timeout: 5m
+  smtp_smarthost: 'smtp.gmail.com:587'
+  smtp_from: 'alerts@example.com'
+  smtp_auth_username: 'replace-with-email-username'
+  smtp_auth_password: 'replace-with-email-app-password'
+  smtp_require_tls: true
+
+route:
+  group_by: ['alertname', 'job']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 2h
+  receiver: email-only
+  routes:
+    - matchers:
+        - severity="critical"
+      receiver: email-and-sms
+
+receivers:
+  - name: email-only
+    email_configs:
+      - to: 'replace-with-your-email@example.com'
+        send_resolved: true
+
+  - name: email-and-sms
+    email_configs:
+      - to: 'replace-with-your-email@example.com'
+        send_resolved: true
+      - to: 'replace-with-your-phone-sms-gateway@example.com'
+        send_resolved: true
diff --git a/docker-compose.yml b/docker-compose.yml
index 6d1bf228..af6c4f75 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -69,6 +69,15 @@ services:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
       - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
 
+  alertmanager:
+    image: prom/alertmanager:latest
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+    ports:
+      - 9093:9093
+
   node-exporter:
     image: prom/node-exporter:latest
     command:
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
index c084eff8..22cb4590 100644
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -1,6 +1,11 @@
 global:
   scrape_interval: 15s
 
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']
+
 rule_files:
   - /etc/prometheus/alerts.yml
 

From faa6324f5ce3e0629f78499aabfd914b4bd9a134 Mon Sep 17 00:00:00 2001
From: Victor
Date: Sat, 4 Apr 2026 20:50:09 -0400
Subject: [PATCH 4/5] frontend missing from dashboard fix

---
 prometheus/process-exporter.yml | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/prometheus/process-exporter.yml b/prometheus/process-exporter.yml
index e34b1639..3bc1f048 100644
--- a/prometheus/process-exporter.yml
+++ b/prometheus/process-exporter.yml
@@ -4,16 +4,11 @@ process_names:
       - '.*/gunicorn'
       - '.*run:app.*'
 
-  - name: "backend-flask"
-    comm:
-      - 'python'
-    cmdline:
-      - '.*run\.py.*'
-
   - name: "frontend"
     comm:
       - 'next-server (v'
-      - 'next-server'
+    cmdline:
+      - '.*next-server.*'
 
   - name: "database-postgres"
     cmdline:

From 0025d5b082a52052609d0277d0ff947f3612cdd7 Mon Sep 17 00:00:00 2001
From: huzaifa <146299744+zaifnatra@users.noreply.github.com>
Date: Sat, 4 Apr 2026 21:26:11 -0400
Subject: [PATCH 5/5] tier 2 observability gmail alerts

---
 alertmanager/alertmanager.yml     |  14 +-
 docker-compose.gold.yml           |  12 ++
 docker-compose.yml                | 175 ++++++++++++++++--
 .../alerting/loki-alert-rules.yml | 123 ++++++++++++
 prometheus/alerts.yml             |  21 +++
 prometheus/blackbox.yml           |  10 +
 prometheus/prometheus.yml         |  24 +++
 scripts/test_tier2.sh             |  68 +++++++
 8 files changed, 425 insertions(+), 22 deletions(-)
 create mode 100644 prometheus/blackbox.yml
 create mode 100644 scripts/test_tier2.sh

diff --git a/alertmanager/alertmanager.yml b/alertmanager/alertmanager.yml
index 1cbe7d45..62e622a7 100644
--- a/alertmanager/alertmanager.yml
+++ b/alertmanager/alertmanager.yml
@@ -1,9 +1,9 @@
 global:
   resolve_timeout: 5m
-  smtp_smarthost: 'smtp.gmail.com:587'
-  smtp_from: 'alerts@example.com'
-  smtp_auth_username: 'replace-with-email-username'
-  smtp_auth_password: 'replace-with-email-app-password'
+  smtp_smarthost: 'SMTP_SMARTHOST_VALUE'
+  smtp_from: 'SMTP_FROM_VALUE'
+  smtp_auth_username: 'SMTP_AUTH_USERNAME_VALUE'
+  smtp_auth_password: 'SMTP_AUTH_PASSWORD_VALUE'
   smtp_require_tls: true
 
 route:
@@ -20,12 +20,12 @@ route:
 receivers:
   - name: email-only
     email_configs:
-      - to: 'replace-with-your-email@example.com'
+      - to: 'ALERT_EMAIL_TO_VALUE'
        send_resolved: true
 
   - name: email-and-sms
     email_configs:
-      - to: 'replace-with-your-email@example.com'
+      - to: 'ALERT_EMAIL_TO_VALUE'
         send_resolved: true
-      - to: 'replace-with-your-phone-sms-gateway@example.com'
+      - to: 'ALERT_SMS_TO_VALUE'
         send_resolved: true
diff --git a/docker-compose.gold.yml b/docker-compose.gold.yml
index b37fbaf0..5e6cae4b 100644
--- a/docker-compose.gold.yml
+++ b/docker-compose.gold.yml
@@ -54,6 +54,9 @@ services:
       DATABASE_PASSWORD: postgres
       REDIS_URL: redis://redis:6379
       SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-1.log
+    volumes:
+      - ./logs/app-1:/app/logs
     depends_on:
       db:
         condition: service_healthy
@@ -77,6 +80,9 @@ services:
       DATABASE_PASSWORD: postgres
       REDIS_URL: redis://redis:6379
       SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-2.log
+    volumes:
+      - ./logs/app-2:/app/logs
     depends_on:
       db:
         condition: service_healthy
@@ -102,6 +108,9 @@ services:
       DATABASE_PASSWORD: postgres
       REDIS_URL: redis://redis:6379
       SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-3.log
+    volumes:
+      - ./logs/app-3:/app/logs
     depends_on:
       db:
         condition: service_healthy
@@ -127,6 +136,9 @@ services:
       DATABASE_PASSWORD: postgres
       REDIS_URL: redis://redis:6379
       SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-4.log
+    volumes:
+      - ./logs/app-4:/app/logs
     depends_on:
       db:
         condition: service_healthy
diff --git a/docker-compose.yml b/docker-compose.yml
index af6c4f75..0a4aa5c5 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,6 @@
 name: url-shortner
 services:
+  # ── App infrastructure ──────────────────────────────────────────────
   db:
     image: postgres:16
     environment:
@@ -10,28 +11,71 @@ services:
       - "5433:5432"
     volumes:
       - postgres_data:/var/lib/postgresql/data
+    command:
+      - "postgres"
+      - "-c"
+      - "max_connections=200"
+      - "-c"
+      - "shared_buffers=256MB"
+      - "-c"
+      - "work_mem=4MB"
+      - "-c"
+      - "effective_cache_size=512MB"
+      - "-c"
+      - "synchronous_commit=off"
     healthcheck:
       test: ["CMD", "pg_isready", "-U", "postgres"]
-      interval: 10s
-      timeout: 5s
+      interval: 5s
+      timeout: 3s
       retries: 5
+    deploy:
+      resources:
+        limits:
+          memory: 1G
 
   redis:
     image: redis:7
+    command: redis-server --maxmemory 128mb --maxmemory-policy allkeys-lru --save ""
     healthcheck:
       test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 5
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+
+  # ── App instances (gold: 4 instances + nginx LB) ────────────────────
+  app-1:
+    build: .
+    environment:
+      DATABASE_NAME: hackathon_db
+      DATABASE_HOST: db
+      DATABASE_PORT: "5432"
+      DATABASE_USER: postgres
+      DATABASE_PASSWORD: postgres
+      REDIS_URL: redis://redis:6379
+      SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-1.log
+    volumes:
+      - ./logs/app-1:/app/logs
+    depends_on:
+      db:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"]
+      interval: 10s
+      timeout: 5s
+      start_period: 15s
+      retries: 5
+    restart: always
 
-  app:
+  app-2:
     build: .
-    ports:
-      - "5000:5000"
     environment:
-      FLASK_DEBUG: "false"
-      FLASK_HOST: 0.0.0.0
-      FLASK_PORT: "5000"
       DATABASE_NAME: hackathon_db
       DATABASE_HOST: db
       DATABASE_PORT: "5432"
@@ -39,28 +83,105 @@ services:
       DATABASE_PASSWORD: postgres
       REDIS_URL: redis://redis:6379
       SECRET_KEY: random_secret_key
-      LOG_FILE_PATH: ${LOG_FILE_PATH:-/app/logs/app.log}
+      LOG_FILE_PATH: /app/logs/app-2.log
     volumes:
-      - ./logs/app:/app/logs
+      - ./logs/app-2:/app/logs
     depends_on:
       db:
         condition: service_healthy
       redis:
         condition: service_healthy
+      app-1:
+        condition: service_healthy
     healthcheck:
       test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"]
-      interval: 15s
+      interval: 10s
       timeout: 5s
+      start_period: 15s
       retries: 5
     restart: always
 
+  app-3:
+    build: .
+    environment:
+      DATABASE_NAME: hackathon_db
+      DATABASE_HOST: db
+      DATABASE_PORT: "5432"
+      DATABASE_USER: postgres
+      DATABASE_PASSWORD: postgres
+      REDIS_URL: redis://redis:6379
+      SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-3.log
+    volumes:
+      - ./logs/app-3:/app/logs
+    depends_on:
+      db:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      app-1:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"]
+      interval: 10s
+      timeout: 5s
+      start_period: 15s
+      retries: 5
+    restart: always
+
+  app-4:
+    build: .
+    environment:
+      DATABASE_NAME: hackathon_db
+      DATABASE_HOST: db
+      DATABASE_PORT: "5432"
+      DATABASE_USER: postgres
+      DATABASE_PASSWORD: postgres
+      REDIS_URL: redis://redis:6379
+      SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-4.log
+    volumes:
+      - ./logs/app-4:/app/logs
+    depends_on:
+      db:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      app-1:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"]
+      interval: 10s
+      timeout: 5s
+      start_period: 15s
+      retries: 5
+    restart: always
+
+  nginx:
+    image: nginx:latest
+    ports:
+      - "80:80"
+    volumes:
+      - ./nginx/nginx.gold.conf:/etc/nginx/conf.d/default.conf:ro
+    depends_on:
+      app-1:
+        condition: service_healthy
+      app-2:
+        condition: service_healthy
+      app-3:
+        condition: service_healthy
+      app-4:
+        condition: service_healthy
+    restart: always
+
   frontend:
     build:
       context: ./frontend
       dockerfile: docker/Dockerfile
     ports:
       - "3000:3000"
-
+
+  # ── Observability stack ─────────────────────────────────────────────
   prometheus:
     image: prom/prometheus:latest
     ports:
@@ -71,13 +192,37 @@ services:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
       - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
 
   alertmanager:
     image: prom/alertmanager:latest
+    entrypoint: /bin/sh
     command:
-      - --config.file=/etc/alertmanager/alertmanager.yml
+      - -c
+      - |
+        sed -e "s|SMTP_SMARTHOST_VALUE|$$SMTP_SMARTHOST|g" \
+            -e "s|SMTP_FROM_VALUE|$$SMTP_FROM|g" \
+            -e "s|SMTP_AUTH_USERNAME_VALUE|$$SMTP_AUTH_USERNAME|g" \
+            -e "s|SMTP_AUTH_PASSWORD_VALUE|$$SMTP_AUTH_PASSWORD|g" \
+            -e "s|ALERT_EMAIL_TO_VALUE|$$ALERT_EMAIL_TO|g" \
+            -e "s|ALERT_SMS_TO_VALUE|$$ALERT_SMS_TO|g" \
+            /etc/alertmanager/alertmanager.tmpl.yml > /tmp/alertmanager.yml \
+        && exec /bin/alertmanager --config.file=/tmp/alertmanager.yml
+    environment:
+      SMTP_SMARTHOST: ${SMTP_SMARTHOST}
+      SMTP_FROM: ${SMTP_FROM}
+      SMTP_AUTH_USERNAME: ${SMTP_AUTH_USERNAME}
+      SMTP_AUTH_PASSWORD: ${SMTP_AUTH_PASSWORD}
+      ALERT_EMAIL_TO: ${ALERT_EMAIL_TO}
+      ALERT_SMS_TO: ${ALERT_SMS_TO}
     volumes:
-      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.tmpl.yml:ro
     ports:
       - 9093:9093
 
+  blackbox-exporter:
+    image: prom/blackbox-exporter:latest
+    volumes:
+      - ./prometheus/blackbox.yml:/etc/blackbox_exporter/config.yml:ro
+    ports:
+      - 9115:9115
+
   node-exporter:
     image: prom/node-exporter:latest
     command:
@@ -95,7 +240,7 @@ services:
       - ./prometheus/process-exporter.yml:/etc/process-exporter/config.yml:ro
     ports:
       - 9256:9256
-
+
   otel:
     image: otel/opentelemetry-collector-contrib:latest
     volumes:
diff --git a/grafana/provisioning/alerting/loki-alert-rules.yml b/grafana/provisioning/alerting/loki-alert-rules.yml
index c82afccd..f16617e1 100644
--- a/grafana/provisioning/alerting/loki-alert-rules.yml
+++ b/grafana/provisioning/alerting/loki-alert-rules.yml
@@ -65,3 +65,126 @@
         labels:
           severity: warning
           source: loki
+
+      - uid: high-error-rate
+        title: High HTTP Error Rate
+        condition: C
+        data:
+          - refId: A
+            queryType: range
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: loki
+            model:
+              datasource:
+                type: loki
+                uid: loki
+              editorMode: code
+              expr: |
+                sum(count_over_time({service=~"app-.*"} |= "request_completed" | json | status_code >= 500 [5m]))
+                /
+                sum(count_over_time({service=~"app-.*"} |= "request_completed" [5m]))
+              intervalMs: 1000
+              maxDataPoints: 43200
+              queryType: range
+              refId: A
+          - refId: C
+            queryType: ""
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.1
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - C
+                  reducer:
+                    params: []
+                    type: last
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              expression: A
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: C
+              type: threshold
+        noDataState: NoData
+        execErrState: Error
+        for: 2m
+        annotations:
+          summary: "High HTTP error rate detected"
+          description: "More than 10% of HTTP requests are returning 5xx errors over the past 5 minutes."
+        labels:
+          severity: critical
+          source: loki
+
+      - uid: service-no-logs
+        title: Service Not Producing Logs
+        condition: C
+        data:
+          - refId: A
+            queryType: range
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: loki
+            model:
+              datasource:
+                type: loki
+                uid: loki
+              editorMode: code
+              expr: 'sum(count_over_time({service=~"app-.*"} [5m]))'
+              intervalMs: 1000
+              maxDataPoints: 43200
+              queryType: range
+              refId: A
+          - refId: C
+            queryType: ""
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 1
+                    type: lt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - C
+                  reducer:
+                    params: []
+                    type: last
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              expression: A
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: C
+              type: threshold
+        noDataState: Alerting
+        execErrState: Error
+        for: 5m
+        annotations:
+          summary: "Service stopped producing logs"
+          description: "No logs received from any app instance in the past 5 minutes. Service may be down."
+        labels:
+          severity: critical
+          source: loki
diff --git a/prometheus/alerts.yml b/prometheus/alerts.yml
index c5efe692..f7b90971 100644
--- a/prometheus/alerts.yml
+++ b/prometheus/alerts.yml
@@ -29,6 +29,27 @@ groups:
           summary: "Low disk space"
           description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% free space."
 
+  - name: service-health
+    interval: 15s
+    rules:
+      - alert: ServiceDown
+        expr: probe_success{job="blackbox-health"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Service is down"
+          description: "Health probe {{ $labels.instance }} has been failing for more than 2 minutes. The application is unreachable."
+
+      - alert: ServiceHighLatency
+        expr: probe_duration_seconds{job="blackbox-health"} > 5
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Service high latency"
+          description: "Health probe {{ $labels.instance }} is taking more than 5 seconds to respond."
+
   - name: observability-stack
     interval: 30s
     rules:
diff --git a/prometheus/blackbox.yml b/prometheus/blackbox.yml
new file mode 100644
index 00000000..7ea82a6d
--- /dev/null
+++ b/prometheus/blackbox.yml
@@ -0,0 +1,10 @@
+modules:
+  http_2xx:
+    prober: http
+    timeout: 5s
+    http:
+      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+      valid_status_codes: [200]
+      method: GET
+      follow_redirects: true
+      fail_if_not_ssl: false
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
index 22cb4590..1d26d60c 100644
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -29,6 +29,30 @@ scrape_configs:
       - targets: ['process-exporter:9256']
     metrics_path: /metrics
 
+  - job_name: 'blackbox-health'
+    metrics_path: /probe
+    params:
+      module: [http_2xx]
+    static_configs:
+      - targets:
+          - http://nginx:80/health/live
+          - http://nginx:80/health/ready
+        labels:
+          service: app
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115
+    scrape_interval: 15s
+
+  - job_name: 'blackbox-exporter'
+    static_configs:
+      - targets: ['blackbox-exporter:9115']
+    metrics_path: /metrics
+
   - job_name: 'loki'
     static_configs:
       - targets: ['loki:3100']
diff --git a/scripts/test_tier2.sh b/scripts/test_tier2.sh
new file mode 100644
index 00000000..3001de62
--- /dev/null
+++ b/scripts/test_tier2.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Tier 2 verification script — tests logging + alertmanager config
+set -e
+
+BASE="http://localhost"
+AM="http://localhost:9093"
+
+echo "=== 1. Check app health through nginx ==="
+HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/health/live" 2>/dev/null || echo "FAIL")
+echo "GET /health/live => $HTTP_CODE"
+
+echo ""
+echo "=== 2. Generate some traffic ==="
+for i in $(seq 1 5); do
+  curl -s -o /dev/null -w "GET /users => %{http_code}\n" "$BASE/users"
+done
+
+echo ""
+echo "=== 3. Check log files exist on host ==="
+for i in 1 2 3 4; do
+  LOG="./logs/app-$i/app-$i.log"
+  if [ -f "$LOG" ]; then
+    LINES=$(wc -l < "$LOG")
+    echo "  app-$i.log: $LINES lines"
+  else
+    echo "  app-$i.log: MISSING"
+  fi
+done
+
+# Also check single-app log
+if [ -f "./logs/app/app.log" ]; then
+  LINES=$(wc -l < "./logs/app/app.log")
+  echo "  app.log (single): $LINES lines"
+fi
+
+echo ""
+echo "=== 4. Check Alertmanager status ==="
+AM_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$AM/-/healthy" 2>/dev/null || echo "FAIL")
+echo "Alertmanager health => $AM_STATUS"
+
+echo ""
+echo "=== 5. Check Alertmanager config has real credentials ==="
+AM_CONFIG=$(curl -s "$AM/api/v1/status" 2>/dev/null)
+if echo "$AM_CONFIG" | grep -q "replace-with"; then
+  echo "  FAIL: Alertmanager still has placeholder credentials"
+elif echo "$AM_CONFIG" | grep -q "example.com"; then
+  echo "  FAIL: Alertmanager still has example.com addresses"
+elif echo "$AM_CONFIG" | grep -q "smtp_smarthost"; then
+  echo "  OK: Alertmanager config loaded (check emails below)"
+  echo "$AM_CONFIG" | python -m json.tool 2>/dev/null | grep -E "(smtp_from|smtp_smarthost|to:)" || true
+else
+  echo "  WARN: Could not fetch Alertmanager config"
+fi
+
+echo ""
+echo "=== 6. Check Prometheus alert rules ==="
+PROM_RULES=$(curl -s "http://localhost:9090/api/v1/rules" 2>/dev/null)
+ALERT_COUNT=$(echo "$PROM_RULES" | python -c "import sys,json; d=json.load(sys.stdin); print(sum(len(g.get('rules',[])) for g in d.get('data',{}).get('groups',[])))" 2>/dev/null || echo "?")
+echo "Prometheus has $ALERT_COUNT alert rules loaded"
+
+echo ""
+echo "=== 7. Check for any firing alerts ==="
+FIRING=$(curl -s "$AM/api/v2/alerts" 2>/dev/null)
+FIRING_COUNT=$(echo "$FIRING" | python -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo "?")
+echo "Currently firing alerts: $FIRING_COUNT"
+
+echo ""
+echo "=== Done ==="
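
NOTE: once the stack is up, the notification path can be exercised end to end without waiting for a real outage. A sketch, assuming the templated config was rendered to /tmp/alertmanager.yml by the compose entrypoint above (amtool ships in the prom/alertmanager image):

    # verify the rendered config parses and the routing tree resolves
    docker compose exec alertmanager amtool check-config /tmp/alertmanager.yml
    # inject a synthetic critical alert to drive the email-and-sms receiver
    docker compose exec alertmanager amtool alert add TestCritical severity=critical \
        --annotation=summary="synthetic test alert" \
        --alertmanager.url=http://localhost:9093

If the SMTP credentials are valid, the synthetic alert should arrive by email within one group_wait interval (30s per the route config above).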