diff --git a/alertmanager/alertmanager.yml b/alertmanager/alertmanager.yml new file mode 100644 index 00000000..62e622a7 --- /dev/null +++ b/alertmanager/alertmanager.yml @@ -0,0 +1,31 @@ +global: + resolve_timeout: 5m + smtp_smarthost: 'SMTP_SMARTHOST_VALUE' + smtp_from: 'SMTP_FROM_VALUE' + smtp_auth_username: 'SMTP_AUTH_USERNAME_VALUE' + smtp_auth_password: 'SMTP_AUTH_PASSWORD_VALUE' + smtp_require_tls: true + +route: + group_by: ['alertname', 'job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 2h + receiver: email-only + routes: + - matchers: + - severity="critical" + receiver: email-and-sms + +receivers: + - name: email-only + email_configs: + - to: 'ALERT_EMAIL_TO_VALUE' + send_resolved: true + + - name: email-and-sms + email_configs: + - to: 'ALERT_EMAIL_TO_VALUE' + send_resolved: true + - to: 'ALERT_SMS_TO_VALUE' + send_resolved: true diff --git a/docker-compose.gold.yml b/docker-compose.gold.yml index b37fbaf0..5e6cae4b 100644 --- a/docker-compose.gold.yml +++ b/docker-compose.gold.yml @@ -54,6 +54,9 @@ services: DATABASE_PASSWORD: postgres REDIS_URL: redis://redis:6379 SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-1.log + volumes: + - ./logs/app-1:/app/logs depends_on: db: condition: service_healthy @@ -77,6 +80,9 @@ services: DATABASE_PASSWORD: postgres REDIS_URL: redis://redis:6379 SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-2.log + volumes: + - ./logs/app-2:/app/logs depends_on: db: condition: service_healthy @@ -102,6 +108,9 @@ services: DATABASE_PASSWORD: postgres REDIS_URL: redis://redis:6379 SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-3.log + volumes: + - ./logs/app-3:/app/logs depends_on: db: condition: service_healthy @@ -127,6 +136,9 @@ services: DATABASE_PASSWORD: postgres REDIS_URL: redis://redis:6379 SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-4.log + volumes: + - ./logs/app-4:/app/logs depends_on: db: condition: service_healthy diff --git a/docker-compose.yml 
b/docker-compose.yml index 83c26a5b..0a4aa5c5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,6 @@ name: url-shortner services: + # ── App infrastructure ────────────────────────────────────────────── db: image: postgres:16 environment: @@ -10,28 +11,71 @@ services: - "5433:5432" volumes: - postgres_data:/var/lib/postgresql/data + command: + - "postgres" + - "-c" + - "max_connections=200" + - "-c" + - "shared_buffers=256MB" + - "-c" + - "work_mem=4MB" + - "-c" + - "effective_cache_size=512MB" + - "-c" + - "synchronous_commit=off" healthcheck: test: ["CMD", "pg_isready", "-U", "postgres"] - interval: 10s - timeout: 5s + interval: 5s + timeout: 3s retries: 5 + deploy: + resources: + limits: + memory: 1G redis: image: redis:7 + command: redis-server --maxmemory 128mb --maxmemory-policy allkeys-lru --save "" healthcheck: test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 + deploy: + resources: + limits: + memory: 256M + + # ── App instances (gold: 4 instances + nginx LB) ──────────────────── + app-1: + build: . + environment: + DATABASE_NAME: hackathon_db + DATABASE_HOST: db + DATABASE_PORT: "5432" + DATABASE_USER: postgres + DATABASE_PASSWORD: postgres + REDIS_URL: redis://redis:6379 + SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-1.log + volumes: + - ./logs/app-1:/app/logs + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"] interval: 10s timeout: 5s + start_period: 15s retries: 5 + restart: always - app: + app-2: build: . 
- ports: - - "5000:5000" environment: - FLASK_DEBUG: "false" - FLASK_HOST: 0.0.0.0 - FLASK_PORT: "5000" DATABASE_NAME: hackathon_db DATABASE_HOST: db DATABASE_PORT: "5432" @@ -39,34 +83,145 @@ services: DATABASE_PASSWORD: postgres REDIS_URL: redis://redis:6379 SECRET_KEY: random_secret_key - LOG_FILE_PATH: ${LOG_FILE_PATH:-/app/logs/app.log} + LOG_FILE_PATH: /app/logs/app-2.log volumes: - - ./logs/app:/app/logs + - ./logs/app-2:/app/logs depends_on: db: condition: service_healthy redis: condition: service_healthy + app-1: + condition: service_healthy healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"] - interval: 15s + interval: 10s timeout: 5s + start_period: 15s retries: 5 restart: always + app-3: + build: . + environment: + DATABASE_NAME: hackathon_db + DATABASE_HOST: db + DATABASE_PORT: "5432" + DATABASE_USER: postgres + DATABASE_PASSWORD: postgres + REDIS_URL: redis://redis:6379 + SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-3.log + volumes: + - ./logs/app-3:/app/logs + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + app-1: + condition: service_healthy + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"] + interval: 10s + timeout: 5s + start_period: 15s + retries: 5 + restart: always + + app-4: + build: . 
+ environment: + DATABASE_NAME: hackathon_db + DATABASE_HOST: db + DATABASE_PORT: "5432" + DATABASE_USER: postgres + DATABASE_PASSWORD: postgres + REDIS_URL: redis://redis:6379 + SECRET_KEY: random_secret_key + LOG_FILE_PATH: /app/logs/app-4.log + volumes: + - ./logs/app-4:/app/logs + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + app-1: + condition: service_healthy + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health/live')"] + interval: 10s + timeout: 5s + start_period: 15s + retries: 5 + restart: always + + nginx: + image: nginx:latest + ports: + - "80:80" + volumes: + - ./nginx/nginx.gold.conf:/etc/nginx/conf.d/default.conf:ro + depends_on: + app-1: + condition: service_healthy + app-2: + condition: service_healthy + app-3: + condition: service_healthy + app-4: + condition: service_healthy + restart: always + frontend: build: context: ./frontend dockerfile: docker/Dockerfile ports: - "3000:3000" - + + # ── Observability stack ───────────────────────────────────────────── prometheus: image: prom/prometheus:latest ports: - 9090:9090 volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro + + alertmanager: + image: prom/alertmanager:latest + entrypoint: /bin/sh + command: + - -c + - | + sed -e "s|SMTP_SMARTHOST_VALUE|$$SMTP_SMARTHOST|g" \ + -e "s|SMTP_FROM_VALUE|$$SMTP_FROM|g" \ + -e "s|SMTP_AUTH_USERNAME_VALUE|$$SMTP_AUTH_USERNAME|g" \ + -e "s|SMTP_AUTH_PASSWORD_VALUE|$$SMTP_AUTH_PASSWORD|g" \ + -e "s|ALERT_EMAIL_TO_VALUE|$$ALERT_EMAIL_TO|g" \ + -e "s|ALERT_SMS_TO_VALUE|$$ALERT_SMS_TO|g" \ + /etc/alertmanager/alertmanager.tmpl.yml > /tmp/alertmanager.yml \ + && exec /bin/alertmanager --config.file=/tmp/alertmanager.yml + environment: + SMTP_SMARTHOST: ${SMTP_SMARTHOST} + SMTP_FROM: ${SMTP_FROM} + SMTP_AUTH_USERNAME: ${SMTP_AUTH_USERNAME} + SMTP_AUTH_PASSWORD: ${SMTP_AUTH_PASSWORD} + 
ALERT_EMAIL_TO: ${ALERT_EMAIL_TO} + ALERT_SMS_TO: ${ALERT_SMS_TO} + volumes: + - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.tmpl.yml:ro + ports: + - 9093:9093 + + blackbox-exporter: + image: prom/blackbox-exporter:latest + volumes: + - ./prometheus/blackbox.yml:/etc/blackbox_exporter/config.yml:ro + ports: + - 9115:9115 node-exporter: image: prom/node-exporter:latest @@ -85,7 +240,7 @@ services: - ./prometheus/process-exporter.yml:/etc/process-exporter/config.yml:ro ports: - 9256:9256 - + otel: image: otel/opentelemetry-collector-contrib:latest volumes: @@ -128,6 +283,7 @@ services: - grafana-data:/var/lib/grafana - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro + - ./grafana/provisioning/alerting:/etc/grafana/provisioning/alerting:ro - ./grafana/dashboards:/var/lib/grafana/dashboards:ro volumes: diff --git a/grafana/dashboards/critical-alerts.json b/grafana/dashboards/critical-alerts.json new file mode 100644 index 00000000..2233a6ff --- /dev/null +++ b/grafana/dashboards/critical-alerts.json @@ -0,0 +1,94 @@ +{ + "uid": "meta-critical-alerts", + "title": "Meta Critical Alerts", + "schemaVersion": 38, + "version": 1, + "refresh": "15s", + "time": { + "from": "now-6h", + "to": "now" + }, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Firing Critical Alerts", + "gridPos": { "x": 0, "y": 0, "w": 8, "h": 5 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "refId": "A", + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"})" + } + ], + "options": { + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + 
{ "color": "red", "value": 1 } + ] + } + }, + "overrides": [] + } + }, + { + "id": 2, + "type": "alertlist", + "title": "Critical Alert List", + "gridPos": { "x": 8, "y": 0, "w": 16, "h": 12 }, + "options": { + "maxItems": 50, + "sortOrder": 1, + "stateFilter": { + "firing": true, + "pending": true, + "normal": false, + "inactive": false, + "nodata": true, + "error": true + }, + "viewMode": "list", + "groupMode": "default", + "showInstances": true, + "alertName": "", + "dashboardAlerts": false, + "folder": "", + "datasource": "-- Grafana --" + } + }, + { + "id": 3, + "type": "timeseries", + "title": "Critical Alerts Over Time", + "gridPos": { "x": 0, "y": 5, "w": 8, "h": 7 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "refId": "A", + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"})" + } + ], + "fieldConfig": { + "defaults": { + "min": 0 + }, + "overrides": [] + } + } + ] +} diff --git a/grafana/provisioning/alerting/loki-alert-rules.yml b/grafana/provisioning/alerting/loki-alert-rules.yml new file mode 100644 index 00000000..f16617e1 --- /dev/null +++ b/grafana/provisioning/alerting/loki-alert-rules.yml @@ -0,0 +1,190 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: app-log-alerts + folder: Observability + interval: 1m + rules: + - uid: app-error-logs + title: App Error Logs Detected + condition: C + data: + - refId: A + queryType: range + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: loki + model: + datasource: + type: loki + uid: loki + editorMode: code + expr: 'sum(count_over_time({service="app"} |= "\"level\": \"ERROR\"" [5m]))' + intervalMs: 1000 + maxDataPoints: 43200 + queryType: range + refId: A + - refId: C + queryType: "" + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + name: Expression + 
type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 2m + annotations: + summary: "Application emitted error logs" + description: "One or more ERROR level application logs detected in the past 5 minutes." + labels: + severity: warning + source: loki + + - uid: high-error-rate + title: High HTTP Error Rate + condition: C + data: + - refId: A + queryType: range + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: loki + model: + datasource: + type: loki + uid: loki + editorMode: code + expr: | + sum(count_over_time({service=~"app-.*"} |= "request_completed" | json | status_code >= 500 [5m])) + / + sum(count_over_time({service=~"app-.*"} |= "request_completed" [5m])) + intervalMs: 1000 + maxDataPoints: 43200 + queryType: range + refId: A + - refId: C + queryType: "" + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0.1 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 2m + annotations: + summary: "High HTTP error rate detected" + description: "More than 10% of HTTP requests are returning 5xx errors over the past 5 minutes." 
+ labels: + severity: critical + source: loki + + - uid: service-no-logs + title: Service Not Producing Logs + condition: C + data: + - refId: A + queryType: range + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: loki + model: + datasource: + type: loki + uid: loki + editorMode: code + expr: 'sum(count_over_time({service=~"app-.*"} [5m]))' + intervalMs: 1000 + maxDataPoints: 43200 + queryType: range + refId: A + - refId: C + queryType: "" + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 1 + type: lt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: Alerting + execErrState: Error + for: 5m + annotations: + summary: "Service stopped producing logs" + description: "No logs received from any app instance in the past 5 minutes. Service may be down." + labels: + severity: critical + source: loki diff --git a/prometheus/alerts.yml b/prometheus/alerts.yml new file mode 100644 index 00000000..f7b90971 --- /dev/null +++ b/prometheus/alerts.yml @@ -0,0 +1,138 @@ +groups: + - name: platform-health + interval: 30s + rules: + - alert: PrometheusTargetDown + expr: up == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Prometheus target down" + description: "Target {{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes." + + - alert: HostHighCpuUsage + expr: 100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 85 + for: 10m + labels: + severity: warning + annotations: + summary: "High CPU usage" + description: "Node {{ $labels.instance }} CPU usage is above 85% for 10 minutes." 
+ + - alert: HostLowDiskSpace + expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15 + for: 10m + labels: + severity: warning + annotations: + summary: "Low disk space" + description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% free space." + + - name: service-health + interval: 15s + rules: + - alert: ServiceDown + expr: probe_success{job="blackbox-health"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Service is down" + description: "Health probe {{ $labels.instance }} has been failing for more than 2 minutes. The application is unreachable." + + - alert: ServiceHighLatency + expr: probe_duration_seconds{job="blackbox-health"} > 5 + for: 2m + labels: + severity: warning + annotations: + summary: "Service high latency" + description: "Health probe {{ $labels.instance }} is taking more than 5 seconds to respond." + + - name: observability-stack + interval: 30s + rules: + - alert: LokiUnavailable + expr: up{job="loki"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Loki unavailable" + description: "Loki is down, log queries and ingestion may be impacted." + + - alert: PromtailUnavailable + expr: up{job="promtail"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Promtail unavailable" + description: "Promtail is down, new logs are not being shipped to Loki." + + - alert: PromtailNoNewLogs + expr: increase(promtail_read_lines_total[15m]) == 0 + for: 15m + labels: + severity: warning + annotations: + summary: "No new logs shipped by Promtail" + description: "Promtail has not read any new log lines in the last 15 minutes." 
+ + - name: process-health + interval: 30s + rules: + - alert: MainApplicationProcessMissing + expr: absent(namedprocess_namegroup_num_procs{groupname="backend-main"}) or namedprocess_namegroup_num_procs{groupname="backend-main"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Main application process missing" + description: "The backend main process (gunicorn/run.py) is not detected by process-exporter." + + - alert: FrontendProcessMissing + expr: namedprocess_namegroup_num_procs{groupname="frontend"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Frontend process missing" + description: "No frontend process detected by process-exporter." + + - alert: PostgresProcessMissing + expr: namedprocess_namegroup_num_procs{groupname="database-postgres"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Postgres process missing" + description: "No postgres process detected by process-exporter." + + - alert: PrometheusProcessMissing + expr: namedprocess_namegroup_num_procs{groupname="prometheus"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Prometheus process missing" + description: "No prometheus process detected by process-exporter." + + - alert: LokiProcessMissing + expr: namedprocess_namegroup_num_procs{groupname="loki"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Loki process missing" + description: "No loki process detected by process-exporter." + + - alert: PromtailProcessMissing + expr: namedprocess_namegroup_num_procs{groupname="promtail"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "Promtail process missing" + description: "No promtail process detected by process-exporter." 
diff --git a/prometheus/blackbox.yml b/prometheus/blackbox.yml new file mode 100644 index 00000000..7ea82a6d --- /dev/null +++ b/prometheus/blackbox.yml @@ -0,0 +1,10 @@ +modules: + http_2xx: + prober: http + timeout: 5s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + valid_status_codes: [200] + method: GET + follow_redirects: true + fail_if_not_ssl: false diff --git a/prometheus/process-exporter.yml b/prometheus/process-exporter.yml index cad4f485..3bc1f048 100644 --- a/prometheus/process-exporter.yml +++ b/prometheus/process-exporter.yml @@ -1,14 +1,14 @@ process_names: - - name: "backend-flask" - comm: - - 'python' + - name: "backend-main" cmdline: - - '.*run\.py.*' + - '.*/gunicorn' + - '.*run:app.*' - name: "frontend" comm: - 'next-server (v' - - 'next-server' + cmdline: + - '.*next-server.*' - name: "database-postgres" cmdline: diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml index 3bdd34bd..1d26d60c 100644 --- a/prometheus/prometheus.yml +++ b/prometheus/prometheus.yml @@ -1,6 +1,14 @@ global: scrape_interval: 15s +alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + +rule_files: + - /etc/prometheus/alerts.yml + scrape_configs: - job_name: 'prometheus' static_configs: @@ -20,3 +28,37 @@ scrape_configs: static_configs: - targets: ['process-exporter:9256'] metrics_path: /metrics + + - job_name: 'blackbox-health' + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://nginx:80/health/live + - http://nginx:80/health/ready + labels: + service: app + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + scrape_interval: 15s + + - job_name: 'blackbox-exporter' + static_configs: + - targets: ['blackbox-exporter:9115'] + metrics_path: /metrics + + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + 
metrics_path: /metrics + + - job_name: 'promtail' + static_configs: + - targets: ['promtail:9080'] + metrics_path: /metrics diff --git a/scripts/test_tier2.sh b/scripts/test_tier2.sh new file mode 100644 index 00000000..3001de62 --- /dev/null +++ b/scripts/test_tier2.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Tier 2 verification script — tests logging + alertmanager config +set -e + +BASE="http://localhost" +AM="http://localhost:9093" + +echo "=== 1. Check app health through nginx ===" +HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/health/live" 2>/dev/null || echo "FAIL") +echo "GET /health/live => $HTTP_CODE" + +echo "" +echo "=== 2. Generate some traffic ===" +for i in $(seq 1 5); do + curl -s -o /dev/null -w "GET /users => %{http_code}\n" "$BASE/users" +done + +echo "" +echo "=== 3. Check log files exist on host ===" +for i in 1 2 3 4; do + LOG="./logs/app-$i/app-$i.log" + if [ -f "$LOG" ]; then + LINES=$(wc -l < "$LOG") + echo " app-$i.log: $LINES lines" + else + echo " app-$i.log: MISSING" + fi +done + +# Also check single-app log +if [ -f "./logs/app/app.log" ]; then + LINES=$(wc -l < "./logs/app/app.log") + echo " app.log (single): $LINES lines" +fi + +echo "" +echo "=== 4. Check Alertmanager status ===" +AM_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$AM/-/healthy" 2>/dev/null || echo "FAIL") +echo "Alertmanager health => $AM_STATUS" + +echo "" +echo "=== 5. 
Check Alertmanager config has real credentials ===" +AM_CONFIG=$(curl -s "$AM/api/v2/status" 2>/dev/null) +if echo "$AM_CONFIG" | grep -q "_VALUE"; then + echo " FAIL: Alertmanager still has placeholder credentials" +elif echo "$AM_CONFIG" | grep -q "example.com"; then + echo " FAIL: Alertmanager still has example.com addresses" +elif echo "$AM_CONFIG" | grep -q "smtp_smarthost"; then + echo " OK: Alertmanager config loaded (check emails below)" + echo "$AM_CONFIG" | python3 -m json.tool 2>/dev/null | grep -E "(smtp_from|smtp_smarthost|to:)" || true +else + echo " WARN: Could not fetch Alertmanager config" +fi + +echo "" +echo "=== 6. Check Prometheus alert rules ===" +PROM_RULES=$(curl -s "http://localhost:9090/api/v1/rules" 2>/dev/null) +ALERT_COUNT=$(echo "$PROM_RULES" | python3 -c "import sys,json; d=json.load(sys.stdin); print(sum(len(g.get('rules',[])) for g in d.get('data',{}).get('groups',[])))" 2>/dev/null || echo "?") +echo "Prometheus has $ALERT_COUNT alert rules loaded" + +echo "" +echo "=== 7. Check for any firing alerts ===" +FIRING=$(curl -s "$AM/api/v2/alerts" 2>/dev/null) +FIRING_COUNT=$(echo "$FIRING" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo "?") +echo "Currently firing alerts: $FIRING_COUNT" + +echo "" +echo "=== Done ==="