From d0741419cad57ed39125ae0da1a90e5d893afc88 Mon Sep 17 00:00:00 2001 From: Victor Date: Sun, 5 Apr 2026 11:04:00 -0400 Subject: [PATCH 1/4] observability for different droplets --- docker-compose.observability.prod.yml | 93 ++++++++++++++++ docker-compose.prod.yml | 105 +----------------- prometheus/prometheus.yml | 43 ++++--- .../k6/docker-compose.gold.yml | 0 .../k6/docker-compose.silver.yml | 0 5 files changed, 126 insertions(+), 115 deletions(-) create mode 100644 docker-compose.observability.prod.yml rename docker-compose.gold.yml => scripts/k6/docker-compose.gold.yml (100%) rename docker-compose.silver.yml => scripts/k6/docker-compose.silver.yml (100%) diff --git a/docker-compose.observability.prod.yml b/docker-compose.observability.prod.yml new file mode 100644 index 00000000..f85b30ad --- /dev/null +++ b/docker-compose.observability.prod.yml @@ -0,0 +1,93 @@ +services: + prometheus: + image: prom/prometheus:latest + restart: always + ports: + - "9090:9090" + volumes: + - ${PWD}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - ${PWD}/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro + - prometheus-data:/prometheus + + alertmanager: + image: prom/alertmanager:latest + restart: always + entrypoint: /bin/sh + command: + - -c + - | + sed -e "s|SMTP_SMARTHOST_VALUE|$$SMTP_SMARTHOST|g" \ + -e "s|SMTP_FROM_VALUE|$$SMTP_FROM|g" \ + -e "s|SMTP_AUTH_USERNAME_VALUE|$$SMTP_AUTH_USERNAME|g" \ + -e "s|SMTP_AUTH_PASSWORD_VALUE|$$SMTP_AUTH_PASSWORD|g" \ + -e "s|ALERT_EMAIL_TO_VALUE|$$ALERT_EMAIL_TO|g" \ + -e "s|SMTP_AUTH_PASSWORD_VALUE|$$SMTP_AUTH_PASSWORD|g" \ + -e "s|ALERT_SMS_TO_VALUE|$$ALERT_SMS_TO|g" \ + -e "s|DISCORD_WEBHOOK_URL_VALUE|$$DISCORD_WEBHOOK_URL|g" \ + /etc/alertmanager/alertmanager.tmpl.yml > /tmp/alertmanager.yml \ + && exec /bin/alertmanager --config.file=/tmp/alertmanager.yml + environment: + SMTP_SMARTHOST: ${SMTP_SMARTHOST:-localhost:25} + SMTP_FROM: ${SMTP_FROM:-alerts@example.com} + SMTP_AUTH_USERNAME: ${SMTP_AUTH_USERNAME:-} + SMTP_AUTH_PASSWORD: ${SMTP_AUTH_PASSWORD:-} + ALERT_EMAIL_TO: ${ALERT_EMAIL_TO:-admin@example.com} + ALERT_SMS_TO: ${ALERT_SMS_TO:-} + DISCORD_WEBHOOK_URL: ${DISCORD_WEBHOOK_URL:-} + volumes: + - ${PWD}/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.tmpl.yml:ro + ports: + - "9093:9093" + + alertmanager-discord: + image: benjojo/alertmanager-discord:latest + restart: always + environment: + DISCORD_WEBHOOK: ${DISCORD_WEBHOOK_URL:-} + + blackbox-exporter: + image: prom/blackbox-exporter:latest + restart: always + volumes: + - ${PWD}/prometheus/blackbox.yml:/etc/blackbox_exporter/config.yml:ro + ports: + - "9115:9115" + + otel: + image: otel/opentelemetry-collector-contrib:latest + restart: always + volumes: + - ${PWD}/otel/config.yaml:/etc/otelcol-contrib/config.yaml + ports: + - "8889:8889" # Prometheus scrape endpoint (metrics exposed by OTel) + - "4318:4318" # OTLP HTTP receiver (apps push to this) + + loki: + image: grafana/loki:3.0.0 + restart: always + command: -config.file=/etc/loki/config.yml + ports: + - "3100:3100" + volumes: + - ${PWD}/loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + + grafana: + image: grafana/grafana:latest + restart: always + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - grafana-data:/var/lib/grafana + - ${PWD}/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + - ${PWD}/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro + - ${PWD}/grafana/provisioning/alerting:/etc/grafana/provisioning/alerting:ro + - ${PWD}/grafana/dashboards:/var/lib/grafana/dashboards:ro + +volumes: + prometheus-data: + loki-data: + grafana-data: \ No newline at end of file diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index bbf0406d..bd96a6ce 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -12,7 +12,7 @@ services: REDIS_URL: ${REDIS_URL} SECRET_KEY: ${SECRET_KEY} LOG_FILE_PATH: /app/logs/app.log - OTEL_EXPORTER_OTLP_ENDPOINT: http://otel:4318 + OTEL_EXPORTER_OTLP_ENDPOINT: http://${DO_OBSERVABILITY}:4318 volumes: - app1_logs:/app/logs healthcheck: @@ -35,7 +35,7 @@ services: REDIS_URL: ${REDIS_URL} SECRET_KEY: ${SECRET_KEY} LOG_FILE_PATH: /app/logs/app.log - OTEL_EXPORTER_OTLP_ENDPOINT: http://otel:4318 + OTEL_EXPORTER_OTLP_ENDPOINT: http://${DO_OBSERVABILITY}:4318 volumes: - app2_logs:/app/logs healthcheck: @@ -58,7 +58,7 @@ services: REDIS_URL: ${REDIS_URL} SECRET_KEY: ${SECRET_KEY} LOG_FILE_PATH: /app/logs/app.log - OTEL_EXPORTER_OTLP_ENDPOINT: http://otel:4318 + OTEL_EXPORTER_OTLP_ENDPOINT: http://${DO_OBSERVABILITY}:4318 volumes: - app3_logs:/app/logs healthcheck: @@ -81,7 +81,7 @@ services: REDIS_URL: ${REDIS_URL} SECRET_KEY: ${SECRET_KEY} LOG_FILE_PATH: /app/logs/app.log - OTEL_EXPORTER_OTLP_ENDPOINT: http://otel:4318 + OTEL_EXPORTER_OTLP_ENDPOINT: http://${DO_OBSERVABILITY}:4318 volumes: - app4_logs:/app/logs healthcheck: @@ -105,86 +105,6 @@ services: ports: - "3000:3000" - prometheus: - image: prom/prometheus:latest - restart: always - ports: - - "9090:9090" - volumes: - - ${PWD}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml - - ${PWD}/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro - - alertmanager: - image: prom/alertmanager:latest - restart: always - entrypoint: /bin/sh - command: - - -c - - | - sed -e "s|SMTP_SMARTHOST_VALUE|$$SMTP_SMARTHOST|g" \ - -e "s|SMTP_FROM_VALUE|$$SMTP_FROM|g" \ - -e "s|SMTP_AUTH_USERNAME_VALUE|$$SMTP_AUTH_USERNAME|g" \ - -e "s|SMTP_AUTH_PASSWORD_VALUE|$$SMTP_AUTH_PASSWORD|g" \ - -e "s|ALERT_EMAIL_TO_VALUE|$$ALERT_EMAIL_TO|g" \ - -e "s|SMTP_AUTH_PASSWORD_VALUE|$$SMTP_AUTH_PASSWORD|g" \ - -e "s|ALERT_SMS_TO_VALUE|$$ALERT_SMS_TO|g" \ - -e "s|DISCORD_WEBHOOK_URL_VALUE|$$DISCORD_WEBHOOK_URL|g" \ - /etc/alertmanager/alertmanager.tmpl.yml > /tmp/alertmanager.yml \ - && exec /bin/alertmanager --config.file=/tmp/alertmanager.yml - environment: - SMTP_SMARTHOST: ${SMTP_SMARTHOST:-localhost:25} - SMTP_FROM: ${SMTP_FROM:-alerts@example.com} - SMTP_AUTH_USERNAME: ${SMTP_AUTH_USERNAME:-} - SMTP_AUTH_PASSWORD: ${SMTP_AUTH_PASSWORD:-} - ALERT_EMAIL_TO: ${ALERT_EMAIL_TO:-admin@example.com} - ALERT_SMS_TO: ${ALERT_SMS_TO:-} - DISCORD_WEBHOOK_URL: ${DISCORD_WEBHOOK_URL:-} - volumes: - - ${PWD}/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.tmpl.yml:ro - ports: - - "9093:9093" - - alertmanager-discord: - image: benjojo/alertmanager-discord:latest - restart: always - environment: - DISCORD_WEBHOOK: ${DISCORD_WEBHOOK_URL:-} - - blackbox-exporter: - image: prom/blackbox-exporter:latest - restart: always - volumes: - - ${PWD}/prometheus/blackbox.yml:/etc/blackbox_exporter/config.yml:ro - ports: - - "9115:9115" - - node-exporter: - image: prom/node-exporter:latest - restart: always - command: - - --no-collector.kernel_hung - ports: - - "9100:9100" - - otel: - image: otel/opentelemetry-collector-contrib:latest - restart: always - volumes: - - ${PWD}/otel/config.yaml:/etc/otelcol-contrib/config.yaml - ports: - - "8889:8889" - - "4318:4318" - - loki: - image: grafana/loki:3.0.0 - restart: always - command: -config.file=/etc/loki/config.yml - ports: - - "3100:3100" - volumes: - - ${PWD}/loki/config.yml:/etc/loki/config.yml:ro - - loki-data:/loki - promtail: image: grafana/promtail:3.0.0 restart: always @@ -195,24 +115,7 @@ services: - /var/lib/docker/containers:/var/lib/docker/containers:ro - promtail-data:/tmp - grafana: - image: grafana/grafana:latest - restart: always - ports: - - "3001:3000" - environment: - - GF_SECURITY_ADMIN_USER=admin - - GF_SECURITY_ADMIN_PASSWORD=admin - volumes: - - grafana-data:/var/lib/grafana - - ${PWD}/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro - - ${PWD}/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro - - ${PWD}/grafana/provisioning/alerting:/etc/grafana/provisioning/alerting:ro - - ${PWD}/grafana/dashboards:/var/lib/grafana/dashboards:ro - volumes: - grafana-data: - loki-data: promtail-data: app1_logs: app2_logs: diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml index 1d26d60c..47207da4 100644 --- a/prometheus/prometheus.yml +++ b/prometheus/prometheus.yml @@ -4,7 +4,7 @@ global: alerting: alertmanagers: - static_configs: - - targets: ['alertmanager:9093'] + - targets: ['${DO_OBSERVABILITY}:9093'] rule_files: - /etc/prometheus/alerts.yml @@ -12,21 +12,31 @@ rule_files: scrape_configs: - job_name: 'prometheus' static_configs: - - targets: ['localhost:9090'] # Prometheus server default port + - targets: ['${DO_OBSERVABILITY}:9090'] # Prometheus server on observability droplet - job_name: 'otel-collector' static_configs: - - targets: ['otel:8889'] # Common default for OTel Collector Prometheus exporter + - targets: ['${DO_OBSERVABILITY}:8889'] # OTel Collector on observability droplet metrics_path: /metrics - - job_name: 'node-exporter' + - job_name: 'node-exporter-droplet-1' static_configs: - - targets: ['node-exporter:9100'] + - targets: ['${DO_HOST_1}:9100'] metrics_path: /metrics - - job_name: 'process-exporter' + - job_name: 'node-exporter-droplet-2' static_configs: - - targets: ['process-exporter:9256'] + - targets: ['${DO_HOST_2}:9100'] + metrics_path: /metrics + + - job_name: 'process-exporter-droplet-1' + static_configs: + - targets: ['${DO_HOST_1}:9256'] + metrics_path: /metrics + + - job_name: 'process-exporter-droplet-2' + static_configs: + - targets: ['${DO_HOST_2}:9256'] metrics_path: /metrics - job_name: 'blackbox-health' @@ -35,30 +45,35 @@ scrape_configs: module: [http_2xx] static_configs: - targets: - - http://nginx:80/health/live - - http://nginx:80/health/ready + - http://${DO_HOST_1}:80/health/live + - http://${DO_HOST_1}:80/health/ready + labels: + service: app-droplet-1 + - targets: + - http://${DO_HOST_2}:80/health/live + - http://${DO_HOST_2}:80/health/ready labels: - service: app + service: app-droplet-2 relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ - replacement: blackbox-exporter:9115 + replacement: ${DO_OBSERVABILITY}:9115 scrape_interval: 15s - job_name: 'blackbox-exporter' static_configs: - - targets: ['blackbox-exporter:9115'] + - targets: ['${DO_OBSERVABILITY}:9115'] metrics_path: /metrics - job_name: 'loki' static_configs: - - targets: ['loki:3100'] + - targets: ['${DO_OBSERVABILITY}:3100'] metrics_path: /metrics - job_name: 'promtail' static_configs: - - targets: ['promtail:9080'] + - targets: ['${DO_OBSERVABILITY}:9080'] metrics_path: /metrics diff --git a/docker-compose.gold.yml b/scripts/k6/docker-compose.gold.yml similarity index 100% rename from docker-compose.gold.yml rename to scripts/k6/docker-compose.gold.yml diff --git a/docker-compose.silver.yml b/scripts/k6/docker-compose.silver.yml similarity index 100% rename from docker-compose.silver.yml rename to scripts/k6/docker-compose.silver.yml From cf0fc331c11584d2fab85d5787d661d778503e4e Mon Sep 17 00:00:00 2001 From: Victor Date: Sun, 5 Apr 2026 11:20:36 -0400 Subject: [PATCH 2/4] deploy observability --- .github/workflows/ci-cd.yml | 44 +++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml index fc1da89a..92446e0c 100644 --- a/.github/workflows/ci-cd.yml +++ b/.github/workflows/ci-cd.yml @@ -105,6 +105,9 @@ jobs: git pull origin main cat > .env << 'EOF' + DO_HOST_1=${{ secrets.DO_HOST_1 }} + DO_HOST_2=${{ secrets.DO_HOST_2 }} + DO_OBSERVABILITY=${{ secrets.DO_OBSERVABILITY }} DATABASE_NAME=${{ secrets.DATABASE_NAME }} DATABASE_HOST=${{ secrets.DATABASE_HOST }} DATABASE_PORT=${{ secrets.DATABASE_PORT }} @@ -119,6 +122,10 @@ jobs: SMTP_AUTH_PASSWORD=${{ secrets.SMTP_AUTH_PASSWORD }} ALERT_EMAIL_TO=${{ secrets.ALERT_EMAIL_TO }} DISCORD_WEBHOOK_URL=${{ secrets.DISCORD_WEBHOOK_URL }} + S3_KEY=${{ secrets.S3_KEY }} + SECRET_S3_KEY=${{ secrets.SECRET_S3_KEY }} + AWS_REGION=${{ secrets.AWS_REGION }} + LOKI_S3_BUCKET=${{ secrets.LOKI_S3_BUCKET }} EOF docker build -t url-shortner-app . @@ -149,6 +156,9 @@ jobs: git pull origin main cat > .env << 'EOF' + DO_HOST_1=${{ secrets.DO_HOST_1 }} + DO_HOST_2=${{ secrets.DO_HOST_2 }} + DO_OBSERVABILITY=${{ secrets.DO_OBSERVABILITY }} DATABASE_NAME=${{ secrets.DATABASE_NAME }} DATABASE_HOST=${{ secrets.DATABASE_HOST }} DATABASE_PORT=${{ secrets.DATABASE_PORT }} @@ -163,6 +173,10 @@ jobs: SMTP_AUTH_PASSWORD=${{ secrets.SMTP_AUTH_PASSWORD }} ALERT_EMAIL_TO=${{ secrets.ALERT_EMAIL_TO }} DISCORD_WEBHOOK_URL=${{ secrets.DISCORD_WEBHOOK_URL }} + S3_KEY=${{ secrets.S3_KEY }} + SECRET_S3_KEY=${{ secrets.SECRET_S3_KEY }} + AWS_REGION=${{ secrets.AWS_REGION }} + LOKI_S3_BUCKET=${{ secrets.LOKI_S3_BUCKET }} EOF docker build -t url-shortner-app . @@ -170,3 +184,33 @@ jobs: docker compose -f docker-compose.prod.yml up -d --remove-orphans docker compose -f docker-compose.prod.yml restart nginx + + - name: Deploy observability stack to observability droplet + uses: appleboy/ssh-action@v1.0.3 + with: + host: ${{ secrets.DO_OBSERVABILITY }} + username: root + key: ${{ secrets.DO_SSH_PRIVATE_KEY }} + command_timeout: 30m + script: | + cd ~/MetaHackathon + git pull origin main + + cat > .env << 'EOF' + DO_HOST_1=${{ secrets.DO_HOST_1 }} + DO_HOST_2=${{ secrets.DO_HOST_2 }} + DO_OBSERVABILITY=${{ secrets.DO_OBSERVABILITY }} + SMTP_SMARTHOST=${{ secrets.SMTP_SMARTHOST }} + SMTP_FROM=${{ secrets.SMTP_FROM }} + SMTP_AUTH_USERNAME=${{ secrets.SMTP_AUTH_USERNAME }} + SMTP_AUTH_PASSWORD=${{ secrets.SMTP_AUTH_PASSWORD }} + ALERT_EMAIL_TO=${{ secrets.ALERT_EMAIL_TO }} + ALERT_SMS_TO=${{ secrets.ALERT_SMS_TO }} + DISCORD_WEBHOOK_URL=${{ secrets.DISCORD_WEBHOOK_URL }} + S3_KEY=${{ secrets.S3_KEY }} + SECRET_S3_KEY=${{ secrets.SECRET_S3_KEY }} + AWS_REGION=${{ secrets.AWS_REGION }} + LOKI_S3_BUCKET=${{ secrets.LOKI_S3_BUCKET }} + EOF + + docker compose -f docker-compose.observability.prod.yml up -d --remove-orphans From 0000fcc645c88361b91a9d10c911e582d4095c65 Mon Sep 17 00:00:00 2001 From: Victor Date: Sun, 5 Apr 2026 12:29:15 -0400 Subject: [PATCH 3/4] runbook --- README.md | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/README.md b/README.md index e35200f4..c56cee5d 100644 --- a/README.md +++ b/README.md @@ -191,17 +191,7 @@ One of the problems we had was with malformed data and we would get a lone error ## Runbooks -### Incident Response - -- [Backend Outage](#) -- [Database Issues](#) -- [High Latency](#) - -### Operational Tasks - -- [Scaling the Backend](#) -- [Backup Procedures](#) -- [Log Access](#) +- [Runbook](docs/RUNBOOK.md) --- From 1bf6ab144827e0d337cc9d7480f52d4b57f0f091 Mon Sep 17 00:00:00 2001 From: Victor Date: Sun, 5 Apr 2026 12:37:34 -0400 Subject: [PATCH 4/4] cool readme stats --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index c56cee5d..e2dff2d0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,12 @@ # Meta Production Engineering Hackathon +![Python](https://img.shields.io/badge/Python-3.11%2B-3776AB?logo=python&logoColor=white) +![Flask](https://img.shields.io/badge/Backend-Flask-000000?logo=flask&logoColor=white) +![Frontend](https://img.shields.io/badge/Frontend-Next.js-000000?logo=nextdotjs&logoColor=white) +![Infra](https://img.shields.io/badge/Infra-Docker%20%7C%20Nginx-2496ED?logo=docker&logoColor=white) +![Observability](https://img.shields.io/badge/Observability-Prometheus%20%7C%20Grafana%20%7C%20Loki-E6522C) +![Scale Tested](https://img.shields.io/badge/Scale%20Tested-~7000%20Concurrent%20Users-success) + This is the most scalable, reliable and guaranteed to wake up the on-call engineer url-shortner of all time. Provided to you by 4 students from Canada, 2 from Waterloo and 2 from Concordia. ## Quick Links