From 61f7df60073ef5314f4269a4bdd31e682d1670eb Mon Sep 17 00:00:00 2001
From: Victor
Date: Sat, 4 Apr 2026 20:07:30 -0400
Subject: [PATCH 1/5] added prometheus and grafana alerts

---
 app/__init__.py                          |  14 ++-
 docker-compose.yml                       |   2 +
 grafana/dashboards/critical-alerts.json  |  94 ++++++++++++
 .../alerting/loki-alert-rules.yml        |  67 ++++++++++
 prometheus/alerts.yml                    | 117 ++++++++++++++++++
 prometheus/process-exporter.yml          |   5 +
 prometheus/prometheus.yml                |  13 ++
 7 files changed, 311 insertions(+), 1 deletion(-)
 create mode 100644 grafana/dashboards/critical-alerts.json
 create mode 100644 grafana/provisioning/alerting/loki-alert-rules.yml
 create mode 100644 prometheus/alerts.yml

diff --git a/app/__init__.py b/app/__init__.py
index 62761323..4f8c7720 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -7,7 +7,7 @@
 from datetime import datetime, timezone
 
 from dotenv import load_dotenv
-from flask import Flask, current_app, jsonify, g, request
+from flask import Flask, Response, current_app, jsonify, g, request
 from flask_cors import CORS
 
 from app.database import init_db, db, check_db_connection
@@ -222,6 +222,18 @@ def health():
             cache=cache_status,
         ), status_code
 
+    @app.route("/metrics", methods=["GET"])
+    def metrics():
+        metrics_payload = (
+            "# HELP app_info Static metadata for the Flask service\n"
+            "# TYPE app_info gauge\n"
+            "app_info{service=\"url-shortener-api\",component=\"backend\"} 1\n"
+        )
+        return Response(
+            metrics_payload,
+            mimetype="text/plain; version=0.0.4; charset=utf-8",
+        )
+
     @app.errorhandler(404)
     def not_found(e):
         app.logger.warning(
diff --git a/docker-compose.yml b/docker-compose.yml
index 83c26a5b..6d1bf228 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -67,6 +67,7 @@ services:
       - 9090:9090
     volumes:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
 
   node-exporter:
     image: prom/node-exporter:latest
@@ -128,6 +129,7 @@ services:
       - grafana-data:/var/lib/grafana
       - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
       - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
+      - ./grafana/provisioning/alerting:/etc/grafana/provisioning/alerting:ro
       - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
 
 volumes:
diff --git a/grafana/dashboards/critical-alerts.json b/grafana/dashboards/critical-alerts.json
new file mode 100644
index 00000000..2233a6ff
--- /dev/null
+++ b/grafana/dashboards/critical-alerts.json
@@ -0,0 +1,94 @@
+{
+  "uid": "meta-critical-alerts",
+  "title": "Meta Critical Alerts",
+  "schemaVersion": 38,
+  "version": 1,
+  "refresh": "15s",
+  "time": {
+    "from": "now-6h",
+    "to": "now"
+  },
+  "panels": [
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "Firing Critical Alerts",
+      "gridPos": { "x": 0, "y": 0, "w": 8, "h": 5 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"})"
+        }
+      ],
+      "options": {
+        "reduceOptions": {
+          "values": false,
+          "calcs": ["lastNotNull"],
+          "fields": ""
+        },
+        "orientation": "auto",
+        "textMode": "auto",
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "center"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "red", "value": 1 }
+            ]
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 2,
+      "type": "alertlist",
+      "title": "Critical Alert List",
+      "gridPos": { "x": 8, "y": 0, "w": 16, "h": 12 },
+      "options": {
+        "maxItems": 50,
+        "sortOrder": 1,
+        "stateFilter": {
+          "firing": true,
+          "pending": true,
+          "normal": false,
+          "inactive": false,
+          "nodata": true,
+          "error": true
+        },
+        "viewMode": "list",
+        "groupMode": "default",
+        "showInstances": true,
+        "alertName": "",
+        "dashboardAlerts": false,
+        "folder": "",
+        "datasource": "-- Grafana --"
+      }
+    },
+    {
+      "id": 3,
+      "type": "timeseries",
+      "title": "Critical Alerts Over Time",
+      "gridPos": { "x": 0, "y": 5, "w": 8, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"})"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "min": 0
+        },
+        "overrides": []
+      }
+    }
+  ]
+}
diff --git a/grafana/provisioning/alerting/loki-alert-rules.yml b/grafana/provisioning/alerting/loki-alert-rules.yml
new file mode 100644
index 00000000..c82afccd
--- /dev/null
+++ b/grafana/provisioning/alerting/loki-alert-rules.yml
@@ -0,0 +1,67 @@
+apiVersion: 1
+
+groups:
+  - orgId: 1
+    name: app-log-alerts
+    folder: Observability
+    interval: 1m
+    rules:
+      - uid: app-error-logs
+        title: App Error Logs Detected
+        condition: C
+        data:
+          - refId: A
+            queryType: range
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: loki
+            model:
+              datasource:
+                type: loki
+                uid: loki
+              editorMode: code
+              expr: 'sum(count_over_time({service="app"} |= "\"level\": \"ERROR\"" [5m]))'
+              intervalMs: 1000
+              maxDataPoints: 43200
+              queryType: range
+              refId: A
+          - refId: C
+            queryType: ""
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - C
+                  reducer:
+                    params: []
+                    type: last
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              expression: A
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: C
+              type: threshold
+        noDataState: NoData
+        execErrState: Error
+        for: 2m
+        annotations:
+          summary: "Application emitted error logs"
+          description: "One or more ERROR level application logs detected in the past 5 minutes."
+        labels:
+          severity: warning
+          source: loki
diff --git a/prometheus/alerts.yml b/prometheus/alerts.yml
new file mode 100644
index 00000000..c5efe692
--- /dev/null
+++ b/prometheus/alerts.yml
@@ -0,0 +1,117 @@
+groups:
+  - name: platform-health
+    interval: 30s
+    rules:
+      - alert: PrometheusTargetDown
+        expr: up == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus target down"
+          description: "Target {{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes."
+
+      - alert: HostHighCpuUsage
+        expr: 100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 85
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage"
+          description: "Node {{ $labels.instance }} CPU usage is above 85% for 10 minutes."
+
+      - alert: HostLowDiskSpace
+        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low disk space"
+          description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% free space."
+
+  - name: observability-stack
+    interval: 30s
+    rules:
+      - alert: LokiUnavailable
+        expr: up{job="loki"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Loki unavailable"
+          description: "Loki is down; log queries and ingestion may be impacted."
+
+      - alert: PromtailUnavailable
+        expr: up{job="promtail"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Promtail unavailable"
+          description: "Promtail is down; new logs are not being shipped to Loki."
+
+      - alert: PromtailNoNewLogs
+        expr: increase(promtail_read_lines_total[15m]) == 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "No new logs shipped by Promtail"
+          description: "Promtail has not read any new log lines in the last 15 minutes."
+
+  - name: process-health
+    interval: 30s
+    rules:
+      - alert: MainApplicationProcessMissing
+        expr: absent(namedprocess_namegroup_num_procs{groupname="backend-main"}) or namedprocess_namegroup_num_procs{groupname="backend-main"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Main application process missing"
+          description: "The backend main process (gunicorn/run.py) is not detected by process-exporter."
+
+      - alert: FrontendProcessMissing
+        expr: namedprocess_namegroup_num_procs{groupname="frontend"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Frontend process missing"
+          description: "No frontend process detected by process-exporter."
+
+      - alert: PostgresProcessMissing
+        expr: namedprocess_namegroup_num_procs{groupname="database-postgres"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Postgres process missing"
+          description: "No postgres process detected by process-exporter."
+
+      - alert: PrometheusProcessMissing
+        expr: namedprocess_namegroup_num_procs{groupname="prometheus"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus process missing"
+          description: "No prometheus process detected by process-exporter."
+
+      - alert: LokiProcessMissing
+        expr: namedprocess_namegroup_num_procs{groupname="loki"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Loki process missing"
+          description: "No loki process detected by process-exporter."
+
+      - alert: PromtailProcessMissing
+        expr: namedprocess_namegroup_num_procs{groupname="promtail"} < 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Promtail process missing"
+          description: "No promtail process detected by process-exporter."
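
NOTE: in PromQL, a bare comparison such as namedprocess_namegroup_num_procs{groupname="frontend"} < 1 only fires while the series still exists; if process-exporter drops the series entirely once no process matches, the expression returns no samples and the alert can never fire. Only the backend-main rule above guards against that case with absent(). A quick way to sanity-check the rule file before Prometheus loads it, as a sketch (promtool ships with Prometheus releases, including the prom/prometheus image used here):

    # validate syntax and expressions in the alert rules
    promtool check rules prometheus/alerts.yml
    # or against the copy mounted into the running container
    docker compose exec prometheus promtool check rules /etc/prometheus/alerts.yml
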
diff --git a/prometheus/process-exporter.yml b/prometheus/process-exporter.yml
index cad4f485..e34b1639 100644
--- a/prometheus/process-exporter.yml
+++ b/prometheus/process-exporter.yml
@@ -1,4 +1,9 @@
 process_names:
+  - name: "backend-main"
+    cmdline:
+      - '.*/gunicorn'
+      - '.*run:app.*'
+
   - name: "backend-flask"
     comm:
       - 'python'
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
index 3bdd34bd..c084eff8 100644
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -1,6 +1,9 @@
 global:
   scrape_interval: 15s
 
+rule_files:
+  - /etc/prometheus/alerts.yml
+
 scrape_configs:
   - job_name: 'prometheus'
     static_configs:
@@ -20,3 +23,13 @@ scrape_configs:
     static_configs:
       - targets: ['process-exporter:9256']
     metrics_path: /metrics
+
+  - job_name: 'loki'
+    static_configs:
+      - targets: ['loki:3100']
+    metrics_path: /metrics
+
+  - job_name: 'promtail'
+    static_configs:
+      - targets: ['promtail:9080']
+    metrics_path: /metrics

From f2dd248749c6ecf81456fde214d9bf6b85dd060d Mon Sep 17 00:00:00 2001
From: Victor
Date: Sat, 4 Apr 2026 20:15:03 -0400
Subject: [PATCH 2/5] oopsies

---
 app/__init__.py | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/app/__init__.py b/app/__init__.py
index 4f8c7720..62761323 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -7,7 +7,7 @@
 from datetime import datetime, timezone
 
 from dotenv import load_dotenv
-from flask import Flask, Response, current_app, jsonify, g, request
+from flask import Flask, current_app, jsonify, g, request
 from flask_cors import CORS
 
 from app.database import init_db, db, check_db_connection
@@ -222,18 +222,6 @@ def health():
             cache=cache_status,
         ), status_code
 
-    @app.route("/metrics", methods=["GET"])
-    def metrics():
-        metrics_payload = (
-            "# HELP app_info Static metadata for the Flask service\n"
-            "# TYPE app_info gauge\n"
-            "app_info{service=\"url-shortener-api\",component=\"backend\"} 1\n"
-        )
-        return Response(
-            metrics_payload,
-            mimetype="text/plain; version=0.0.4; charset=utf-8",
-        )
-
     @app.errorhandler(404)
     def not_found(e):
         app.logger.warning(

From 4210ae537966dca82ac2389db6763a381e75ab82 Mon Sep 17 00:00:00 2001
From: Victor
Date: Sat, 4 Apr 2026 20:32:35 -0400
Subject: [PATCH 3/5] auto notifications

---
 alertmanager/alertmanager.yml | 31 +++++++++++++++++++++++++++++++
 docker-compose.yml            |  9 +++++++++
 prometheus/prometheus.yml     |  5 +++++
 3 files changed, 45 insertions(+)
 create mode 100644 alertmanager/alertmanager.yml

diff --git a/alertmanager/alertmanager.yml b/alertmanager/alertmanager.yml
new file mode 100644
index 00000000..1cbe7d45
--- /dev/null
+++ b/alertmanager/alertmanager.yml
@@ -0,0 +1,31 @@
+global:
+  resolve_timeout: 5m
+  smtp_smarthost: 'smtp.gmail.com:587'
+  smtp_from: 'alerts@example.com'
+  smtp_auth_username: 'replace-with-email-username'
+  smtp_auth_password: 'replace-with-email-app-password'
+  smtp_require_tls: true
+
+route:
+  group_by: ['alertname', 'job']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 2h
+  receiver: email-only
+  routes:
+    - matchers:
+        - severity="critical"
+      receiver: email-and-sms
+
+receivers:
+  - name: email-only
+    email_configs:
+      - to: 'replace-with-your-email@example.com'
+        send_resolved: true
+
+  - name: email-and-sms
+    email_configs:
+      - to: 'replace-with-your-email@example.com'
+        send_resolved: true
+      - to: 'replace-with-your-phone-sms-gateway@example.com'
+        send_resolved: true
diff --git a/docker-compose.yml b/docker-compose.yml
index 6d1bf228..af6c4f75 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -69,6 +69,15 @@ services:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
       - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
 
+  alertmanager:
+    image: prom/alertmanager:latest
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+    ports:
+      - 9093:9093
+
   node-exporter:
     image: prom/node-exporter:latest
     command:
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
index c084eff8..22cb4590 100644
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -1,6 +1,11 @@
 global:
   scrape_interval: 15s
 
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']
+
 rule_files:
   - /etc/prometheus/alerts.yml
 

From faa6324f5ce3e0629f78499aabfd914b4bd9a134 Mon Sep 17 00:00:00 2001
From: Victor
Date: Sat, 4 Apr 2026 20:50:09 -0400
Subject: [PATCH 4/5] frontend missing from dashboard fix

---
 prometheus/process-exporter.yml | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/prometheus/process-exporter.yml b/prometheus/process-exporter.yml
index e34b1639..3bc1f048 100644
--- a/prometheus/process-exporter.yml
+++ b/prometheus/process-exporter.yml
@@ -4,16 +4,11 @@ process_names:
       - '.*/gunicorn'
       - '.*run:app.*'
 
-  - name: "backend-flask"
-    comm:
-      - 'python'
-    cmdline:
-      - '.*run\.py.*'
-
   - name: "frontend"
     comm:
       - 'next-server (v'
-      - 'next-server'
+    cmdline:
+      - '.*next-server.*'
 
   - name: "database-postgres"
     cmdline:

From 0025d5b082a52052609d0277d0ff947f3612cdd7 Mon Sep 17 00:00:00 2001
From: huzaifa <146299744+zaifnatra@users.noreply.github.com>
Date: Sat, 4 Apr 2026 21:26:11 -0400
Subject: [PATCH 5/5] tier 2 observability gmail alerts

---
 alertmanager/alertmanager.yml     |  14 +-
 docker-compose.gold.yml           |  12 ++
 docker-compose.yml                | 175 ++++++++++++++++--
 .../alerting/loki-alert-rules.yml | 123 ++++++++++++
 prometheus/alerts.yml             |  21 +++
 prometheus/blackbox.yml           |  10 +
 prometheus/prometheus.yml         |  24 +++
 scripts/test_tier2.sh             |  68 +++++++
 8 files changed, 425 insertions(+), 22 deletions(-)
 create mode 100644 prometheus/blackbox.yml
 create mode 100644 scripts/test_tier2.sh

diff --git a/alertmanager/alertmanager.yml b/alertmanager/alertmanager.yml
index 1cbe7d45..62e622a7 100644
--- a/alertmanager/alertmanager.yml
+++ b/alertmanager/alertmanager.yml
@@ -1,9 +1,9 @@
 global:
   resolve_timeout: 5m
-  smtp_smarthost: 'smtp.gmail.com:587'
-  smtp_from: 'alerts@example.com'
-  smtp_auth_username: 'replace-with-email-username'
-  smtp_auth_password: 'replace-with-email-app-password'
+  smtp_smarthost: 'SMTP_SMARTHOST_VALUE'
+  smtp_from: 'SMTP_FROM_VALUE'
+  smtp_auth_username: 'SMTP_AUTH_USERNAME_VALUE'
+  smtp_auth_password: 'SMTP_AUTH_PASSWORD_VALUE'
   smtp_require_tls: true
 
 route:
@@ -20,12 +20,12 @@ route:
 receivers:
   - name: email-only
     email_configs:
-      - to: 'replace-with-your-email@example.com'
+      - to: 'ALERT_EMAIL_TO_VALUE'
        send_resolved: true
 
   - name: email-and-sms
     email_configs:
-      - to: 'replace-with-your-email@example.com'
+      - to: 'ALERT_EMAIL_TO_VALUE'
         send_resolved: true
-      - to: 'replace-with-your-phone-sms-gateway@example.com'
+      - to: 'ALERT_SMS_TO_VALUE'
         send_resolved: true
diff --git a/docker-compose.gold.yml b/docker-compose.gold.yml
index b37fbaf0..5e6cae4b 100644
--- a/docker-compose.gold.yml
+++ b/docker-compose.gold.yml
@@ -54,6 +54,9 @@ services:
       DATABASE_PASSWORD: postgres
       REDIS_URL: redis://redis:6379
       SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-1.log
+    volumes:
+      - ./logs/app-1:/app/logs
     depends_on:
       db:
         condition: service_healthy
@@ -77,6 +80,9 @@ services:
       DATABASE_PASSWORD: postgres
       REDIS_URL: redis://redis:6379
       SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-2.log
+    volumes:
+      - ./logs/app-2:/app/logs
     depends_on:
       db:
         condition: service_healthy
@@ -102,6 +108,9 @@ services:
       DATABASE_PASSWORD: postgres
       REDIS_URL: redis://redis:6379
       SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-3.log
+    volumes:
+      - ./logs/app-3:/app/logs
     depends_on:
       db:
         condition: service_healthy
@@ -127,6 +136,9 @@ services:
       DATABASE_PASSWORD: postgres
       REDIS_URL: redis://redis:6379
       SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-4.log
+    volumes:
+      - ./logs/app-4:/app/logs
     depends_on:
       db:
         condition: service_healthy
diff --git a/docker-compose.yml b/docker-compose.yml
index af6c4f75..0a4aa5c5 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,6 @@
 name: url-shortner
 services:
+  # ── App infrastructure ──────────────────────────────────────────────
   db:
     image: postgres:16
     environment:
@@ -10,28 +11,71 @@ services:
       - "5433:5432"
     volumes:
       - postgres_data:/var/lib/postgresql/data
+    command:
+      - "postgres"
+      - "-c"
+      - "max_connections=200"
+      - "-c"
+      - "shared_buffers=256MB"
+      - "-c"
+      - "work_mem=4MB"
+      - "-c"
+      - "effective_cache_size=512MB"
+      - "-c"
+      - "synchronous_commit=off"
     healthcheck:
       test: ["CMD", "pg_isready", "-U", "postgres"]
-      interval: 10s
-      timeout: 5s
+      interval: 5s
+      timeout: 3s
       retries: 5
+    deploy:
+      resources:
+        limits:
+          memory: 1G
 
   redis:
     image: redis:7
+    command: redis-server --maxmemory 128mb --maxmemory-policy allkeys-lru --save ""
     healthcheck:
       test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 5
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+
+  # ── App instances (gold: 4 instances + nginx LB) ────────────────────
+  app-1:
+    build: .
+    environment:
+      DATABASE_NAME: hackathon_db
+      DATABASE_HOST: db
+      DATABASE_PORT: "5432"
+      DATABASE_USER: postgres
+      DATABASE_PASSWORD: postgres
+      REDIS_URL: redis://redis:6379
+      SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-1.log
+    volumes:
+      - ./logs/app-1:/app/logs
+    depends_on:
+      db:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"]
+      interval: 10s
+      timeout: 5s
+      start_period: 15s
+      retries: 5
+    restart: always
 
-  app:
+  app-2:
     build: .
-    ports:
-      - "5000:5000"
     environment:
-      FLASK_DEBUG: "false"
-      FLASK_HOST: 0.0.0.0
-      FLASK_PORT: "5000"
       DATABASE_NAME: hackathon_db
       DATABASE_HOST: db
       DATABASE_PORT: "5432"
@@ -39,28 +83,105 @@ services:
       DATABASE_PASSWORD: postgres
       REDIS_URL: redis://redis:6379
       SECRET_KEY: random_secret_key
-      LOG_FILE_PATH: ${LOG_FILE_PATH:-/app/logs/app.log}
+      LOG_FILE_PATH: /app/logs/app-2.log
     volumes:
-      - ./logs/app:/app/logs
+      - ./logs/app-2:/app/logs
     depends_on:
       db:
         condition: service_healthy
       redis:
         condition: service_healthy
+      app-1:
+        condition: service_healthy
     healthcheck:
       test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"]
-      interval: 15s
+      interval: 10s
       timeout: 5s
+      start_period: 15s
       retries: 5
     restart: always
 
+  app-3:
+    build: .
+    environment:
+      DATABASE_NAME: hackathon_db
+      DATABASE_HOST: db
+      DATABASE_PORT: "5432"
+      DATABASE_USER: postgres
+      DATABASE_PASSWORD: postgres
+      REDIS_URL: redis://redis:6379
+      SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-3.log
+    volumes:
+      - ./logs/app-3:/app/logs
+    depends_on:
+      db:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      app-1:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"]
+      interval: 10s
+      timeout: 5s
+      start_period: 15s
+      retries: 5
+    restart: always
+
+  app-4:
+    build: .
+    environment:
+      DATABASE_NAME: hackathon_db
+      DATABASE_HOST: db
+      DATABASE_PORT: "5432"
+      DATABASE_USER: postgres
+      DATABASE_PASSWORD: postgres
+      REDIS_URL: redis://redis:6379
+      SECRET_KEY: random_secret_key
+      LOG_FILE_PATH: /app/logs/app-4.log
+    volumes:
+      - ./logs/app-4:/app/logs
+    depends_on:
+      db:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      app-1:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"]
+      interval: 10s
+      timeout: 5s
+      start_period: 15s
+      retries: 5
+    restart: always
+
+  nginx:
+    image: nginx:latest
+    ports:
+      - "80:80"
+    volumes:
+      - ./nginx/nginx.gold.conf:/etc/nginx/conf.d/default.conf:ro
+    depends_on:
+      app-1:
+        condition: service_healthy
+      app-2:
+        condition: service_healthy
+      app-3:
+        condition: service_healthy
+      app-4:
+        condition: service_healthy
+    restart: always
+
   frontend:
     build:
       context: ./frontend
       dockerfile: docker/Dockerfile
     ports:
       - "3000:3000"
-
+
+  # ── Observability stack ─────────────────────────────────────────────
   prometheus:
     image: prom/prometheus:latest
     ports:
@@ -71,13 +192,37 @@ services:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
       - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
 
   alertmanager:
     image: prom/alertmanager:latest
+    entrypoint: /bin/sh
     command:
-      - --config.file=/etc/alertmanager/alertmanager.yml
+      - -c
+      - |
+        sed -e "s|SMTP_SMARTHOST_VALUE|$$SMTP_SMARTHOST|g" \
+            -e "s|SMTP_FROM_VALUE|$$SMTP_FROM|g" \
+            -e "s|SMTP_AUTH_USERNAME_VALUE|$$SMTP_AUTH_USERNAME|g" \
+            -e "s|SMTP_AUTH_PASSWORD_VALUE|$$SMTP_AUTH_PASSWORD|g" \
+            -e "s|ALERT_EMAIL_TO_VALUE|$$ALERT_EMAIL_TO|g" \
+            -e "s|ALERT_SMS_TO_VALUE|$$ALERT_SMS_TO|g" \
+            /etc/alertmanager/alertmanager.tmpl.yml > /tmp/alertmanager.yml \
+        && exec /bin/alertmanager --config.file=/tmp/alertmanager.yml
+    environment:
+      SMTP_SMARTHOST: ${SMTP_SMARTHOST}
+      SMTP_FROM: ${SMTP_FROM}
+      SMTP_AUTH_USERNAME: ${SMTP_AUTH_USERNAME}
+      SMTP_AUTH_PASSWORD: ${SMTP_AUTH_PASSWORD}
+      ALERT_EMAIL_TO: ${ALERT_EMAIL_TO}
+      ALERT_SMS_TO: ${ALERT_SMS_TO}
     volumes:
-      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.tmpl.yml:ro
     ports:
       - 9093:9093
 
+  blackbox-exporter:
+    image: prom/blackbox-exporter:latest
+    volumes:
+      - ./prometheus/blackbox.yml:/etc/blackbox_exporter/config.yml:ro
+    ports:
+      - 9115:9115
+
   node-exporter:
     image: prom/node-exporter:latest
     command:
@@ -95,7 +240,7 @@ services:
       - ./prometheus/process-exporter.yml:/etc/process-exporter/config.yml:ro
     ports:
       - 9256:9256
-
+
   otel:
     image: otel/opentelemetry-collector-contrib:latest
     volumes:
diff --git a/grafana/provisioning/alerting/loki-alert-rules.yml b/grafana/provisioning/alerting/loki-alert-rules.yml
index c82afccd..f16617e1 100644
--- a/grafana/provisioning/alerting/loki-alert-rules.yml
+++ b/grafana/provisioning/alerting/loki-alert-rules.yml
@@ -65,3 +65,126 @@
         labels:
           severity: warning
           source: loki
+
+      - uid: high-error-rate
+        title: High HTTP Error Rate
+        condition: C
+        data:
+          - refId: A
+            queryType: range
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: loki
+            model:
+              datasource:
+                type: loki
+                uid: loki
+              editorMode: code
+              expr: |
+                sum(count_over_time({service=~"app-.*"} |= "request_completed" | json | status_code >= 500 [5m]))
+                /
+                sum(count_over_time({service=~"app-.*"} |= "request_completed" [5m]))
+              intervalMs: 1000
+              maxDataPoints: 43200
+              queryType: range
+              refId: A
+          - refId: C
+            queryType: ""
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.1
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - C
+                  reducer:
+                    params: []
+                    type: last
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              expression: A
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: C
+              type: threshold
+        noDataState: NoData
+        execErrState: Error
+        for: 2m
+        annotations:
+          summary: "High HTTP error rate detected"
+          description: "More than 10% of HTTP requests are returning 5xx errors over the past 5 minutes."
+        labels:
+          severity: critical
+          source: loki
+
+      - uid: service-no-logs
+        title: Service Not Producing Logs
+        condition: C
+        data:
+          - refId: A
+            queryType: range
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: loki
+            model:
+              datasource:
+                type: loki
+                uid: loki
+              editorMode: code
+              expr: 'sum(count_over_time({service=~"app-.*"} [5m]))'
+              intervalMs: 1000
+              maxDataPoints: 43200
+              queryType: range
+              refId: A
+          - refId: C
+            queryType: ""
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 1
+                    type: lt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - C
+                  reducer:
+                    params: []
+                    type: last
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              expression: A
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: C
+              type: threshold
+        noDataState: Alerting
+        execErrState: Error
+        for: 5m
+        annotations:
+          summary: "Service stopped producing logs"
+          description: "No logs received from any app instance in the past 5 minutes. Service may be down."
+        labels:
+          severity: critical
+          source: loki
diff --git a/prometheus/alerts.yml b/prometheus/alerts.yml
index c5efe692..f7b90971 100644
--- a/prometheus/alerts.yml
+++ b/prometheus/alerts.yml
@@ -29,6 +29,27 @@ groups:
           summary: "Low disk space"
           description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% free space."
 
+  - name: service-health
+    interval: 15s
+    rules:
+      - alert: ServiceDown
+        expr: probe_success{job="blackbox-health"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Service is down"
+          description: "Health probe {{ $labels.instance }} has been failing for more than 2 minutes. The application is unreachable."
+
+      - alert: ServiceHighLatency
+        expr: probe_duration_seconds{job="blackbox-health"} > 5
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Service high latency"
+          description: "Health probe {{ $labels.instance }} is taking more than 5 seconds to respond."
+
   - name: observability-stack
     interval: 30s
     rules:
diff --git a/prometheus/blackbox.yml b/prometheus/blackbox.yml
new file mode 100644
index 00000000..7ea82a6d
--- /dev/null
+++ b/prometheus/blackbox.yml
@@ -0,0 +1,10 @@
+modules:
+  http_2xx:
+    prober: http
+    timeout: 5s
+    http:
+      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+      valid_status_codes: [200]
+      method: GET
+      follow_redirects: true
+      fail_if_not_ssl: false
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
index 22cb4590..1d26d60c 100644
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -29,6 +29,30 @@ scrape_configs:
       - targets: ['process-exporter:9256']
     metrics_path: /metrics
 
+  - job_name: 'blackbox-health'
+    metrics_path: /probe
+    params:
+      module: [http_2xx]
+    static_configs:
+      - targets:
+          - http://nginx:80/health/live
+          - http://nginx:80/health/ready
+        labels:
+          service: app
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115
+    scrape_interval: 15s
+
+  - job_name: 'blackbox-exporter'
+    static_configs:
+      - targets: ['blackbox-exporter:9115']
+    metrics_path: /metrics
+
   - job_name: 'loki'
     static_configs:
       - targets: ['loki:3100']
diff --git a/scripts/test_tier2.sh b/scripts/test_tier2.sh
new file mode 100644
index 00000000..3001de62
--- /dev/null
+++ b/scripts/test_tier2.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Tier 2 verification script — tests logging + alertmanager config
+set -e
+
+BASE="http://localhost"
+AM="http://localhost:9093"
+
+echo "=== 1. Check app health through nginx ==="
+HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/health/live" 2>/dev/null || echo "FAIL")
+echo "GET /health/live => $HTTP_CODE"
+
+echo ""
+echo "=== 2. Generate some traffic ==="
+for i in $(seq 1 5); do
+  curl -s -o /dev/null -w "GET /users => %{http_code}\n" "$BASE/users"
+done
+
+echo ""
+echo "=== 3. Check log files exist on host ==="
+for i in 1 2 3 4; do
+  LOG="./logs/app-$i/app-$i.log"
+  if [ -f "$LOG" ]; then
+    LINES=$(wc -l < "$LOG")
+    echo "  app-$i.log: $LINES lines"
+  else
+    echo "  app-$i.log: MISSING"
+  fi
+done
+
+# Also check single-app log
+if [ -f "./logs/app/app.log" ]; then
+  LINES=$(wc -l < "./logs/app/app.log")
+  echo "  app.log (single): $LINES lines"
+fi
+
+echo ""
+echo "=== 4. Check Alertmanager status ==="
+AM_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$AM/-/healthy" 2>/dev/null || echo "FAIL")
+echo "Alertmanager health => $AM_STATUS"
+
+echo ""
+echo "=== 5. Check Alertmanager config has real credentials ==="
+AM_CONFIG=$(curl -s "$AM/api/v1/status" 2>/dev/null)
+if echo "$AM_CONFIG" | grep -q "replace-with"; then
+  echo "  FAIL: Alertmanager still has placeholder credentials"
+elif echo "$AM_CONFIG" | grep -q "example.com"; then
+  echo "  FAIL: Alertmanager still has example.com addresses"
+elif echo "$AM_CONFIG" | grep -q "smtp_smarthost"; then
+  echo "  OK: Alertmanager config loaded (check emails below)"
+  echo "$AM_CONFIG" | python -m json.tool 2>/dev/null | grep -E "(smtp_from|smtp_smarthost|to:)" || true
+else
+  echo "  WARN: Could not fetch Alertmanager config"
+fi
+
+echo ""
+echo "=== 6. Check Prometheus alert rules ==="
+PROM_RULES=$(curl -s "http://localhost:9090/api/v1/rules" 2>/dev/null)
+ALERT_COUNT=$(echo "$PROM_RULES" | python -c "import sys,json; d=json.load(sys.stdin); print(sum(len(g.get('rules',[])) for g in d.get('data',{}).get('groups',[])))" 2>/dev/null || echo "?")
+echo "Prometheus has $ALERT_COUNT alert rules loaded"
+
+echo ""
+echo "=== 7. Check for any firing alerts ==="
+FIRING=$(curl -s "$AM/api/v2/alerts" 2>/dev/null)
+FIRING_COUNT=$(echo "$FIRING" | python -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo "?")
+echo "Currently firing alerts: $FIRING_COUNT"
+
+echo ""
+echo "=== Done ==="
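
NOTE: once the stack is up, the notification path can be exercised end to end without waiting for a real outage. A sketch, assuming the templated config was rendered to /tmp/alertmanager.yml by the compose entrypoint above (amtool ships in the prom/alertmanager image):

    # verify the rendered config parses and the routing tree resolves
    docker compose exec alertmanager amtool check-config /tmp/alertmanager.yml
    # inject a synthetic critical alert to drive the email-and-sms receiver
    docker compose exec alertmanager amtool alert add TestCritical severity=critical \
        --annotation=summary="synthetic test alert" \
        --alertmanager.url=http://localhost:9093

If the SMTP credentials are valid, the synthetic alert should arrive by email within one group_wait interval (30s per the route config above).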