diff --git a/.antigravity/rules.md b/.antigravity/rules.md index 60c3bea..ebf5984 100644 --- a/.antigravity/rules.md +++ b/.antigravity/rules.md @@ -62,4 +62,11 @@ To ensure the home lab remains secure and stable, the following standards apply: - In a "Service Down" situation, manual fixes may be applied to the live instance **ONLY after explicit USER permission**. - Once verified, changes **MUST** be immediately committed to Git. - **Idempotency**: All setup scripts must be safe to run multiple times. -- **Documentation**: Maintain `ReadMe.md` parity with cluster changes. + +## 6. Monitoring & Observability + +Every new service added MUST be onboarded to the monitoring stack using native tools to avoid sidecar vulnerabilities: + +1. **Uptime Kuma**: Add the public/internal URL for immediate up/down status tracking. +2. **Prometheus / Blackbox**: Add HTTP and/or DNS probes to the `website_and_http_checks` or `dns_service_checks` in `prometheus.yml.template`. +3. **Alerts**: Any new probes must be covered by the generic "EndpointDown" rule in `alert_rules.yml`. Do not add custom third-party exporters unless explicitly required. diff --git a/ReadMe.md b/ReadMe.md index c72adcf..24e9830 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -1,6 +1,6 @@ -[![Deploy Docker Stack to Prod](https://github.com/geeksbsmrt/RaspberryPi/actions/workflows/deploy-prod.yaml/badge.svg)](https://github.com/geeksbsmrt/RaspberryPi/actions/workflows/deploy-prod.yaml) +# geeksbsmrt/RaspberryPi Home Lab Configuration -# RaspberryPi Home Lab Configuration +[![Deploy Docker Stack to Prod](https://github.com/geeksbsmrt/RaspberryPi/actions/workflows/deploy-prod.yaml/badge.svg)](https://github.com/geeksbsmrt/RaspberryPi/actions/workflows/deploy-prod.yaml) This repository contains configuration files and resources for setting up and managing a Raspberry Pi-based home lab environment. 
It leverages tools like Docker, pre-commit hooks, and encrypted secrets management to ensure a secure and maintainable setup. @@ -21,3 +21,9 @@ This repository contains configuration files and resources for setting up and ma docker/ # Docker configurations and Dockerfiles secrets.sops.env # Encrypted environment variables ``` + +## Monitoring + +- **Uptime Kuma**: External and internal status page (`uptime.home`). +- **Prometheus & Grafana**: Time-series metrics & visualization (`grafana.home`). +- **Resource Scope**: Node Exporter (Host), cAdvisor (Containers), Blackbox (Endpoints), Caddy (Web Traffic). diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 84282d3..dcfc0e1 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -207,6 +207,7 @@ services: environment: - GF_SECURITY_ADMIN_USER=admin - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD} + - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/etc/grafana/provisioning/dashboards/json/homelab-overview.json - TZ="America/New_York" networks: macvlan: diff --git a/docker/grafana/provisioning/dashboards/dashboards.yaml b/docker/grafana/provisioning/dashboards/dashboards.yaml new file mode 100644 index 0000000..c0081e8 --- /dev/null +++ b/docker/grafana/provisioning/dashboards/dashboards.yaml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: "Homelab" + orgId: 1 + folder: "Homelab" + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 30 + options: + path: /etc/grafana/provisioning/dashboards/json + foldersFromFilesStructure: false diff --git a/docker/grafana/provisioning/dashboards/json/homelab-overview.json b/docker/grafana/provisioning/dashboards/json/homelab-overview.json new file mode 100644 index 0000000..94cd9f2 --- /dev/null +++ b/docker/grafana/provisioning/dashboards/json/homelab-overview.json @@ -0,0 +1,370 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + 
"iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "title": "System Status Overview", + "type": "row" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "sum(ALERTS{alertstate=\"firing\"})", + "refId": "A" + } + ], + "title": "Active Alerts", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { + "options": { + "0": { "color": "red", "index": 0, "text": "DOWN" }, + "1": { "color": "green", "index": 1, "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 18, "x": 6, "y": 1 }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "name" + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "probe_success{job=\"website_and_http_checks\"}", + "legendFormat": "{{instance}}", + "refId": 
"A" + } + ], + "title": "HTTP Endpoints (Blackbox)", + "type": "stat" + }, + { + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 10, + "title": "Host Hardware", + "type": "row" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "unit": "percent", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "showPoints": "auto", + "stacking": { "group": "A", "mode": "none" } + } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 11, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "legendFormat": "CPU Usage", + "refId": "A" + } + ], + "title": "Host CPU Utilization", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "showPoints": "auto", + "stacking": { "group": "A", "mode": "none" } + } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 12, + "options": { + "legend": { + "calcs": ["mean", "max", "last"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { "mode": "single" } + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes", + "legendFormat": "Used", + "refId": "A" + }, + { + "datasource": "Prometheus", + "expr": "node_memory_MemTotal_bytes", + "legendFormat": "Total", + "refId": "B" + } + ], + "title": "Host Memory Usage", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": 
"palette-classic" }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineWidth": 2, + "stacking": { "group": "A", "mode": "none" } + } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, + "id": 13, + "options": { + "legend": { + "calcs": ["last"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { "mode": "single" } + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"rootfs\"} - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"rootfs\"}", + "legendFormat": "Used", + "refId": "A" + }, + { + "datasource": "Prometheus", + "expr": "node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"rootfs\"}", + "legendFormat": "Total", + "refId": "B" + } + ], + "title": "Disk Space (/)", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "Bps", + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2 + } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "id": 14, + "options": { + "legend": { + "calcs": ["mean", "max", "last"], + "displayMode": "table", + "placement": "right", + "showLegend": true + } + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(node_network_receive_bytes_total{device=\"eth0\"}[5m])", + "legendFormat": "Inbound (Receive)", + "refId": "A" + }, + { + "datasource": "Prometheus", + "expr": "rate(node_network_transmit_bytes_total{device=\"eth0\"}[5m])", + "legendFormat": "Outbound (Transmit)", + "refId": "B" + } + ], + "title": "Network Traffic (eth0)", + "type": "timeseries" + }, + { + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "id": 20, + "title": "Docker Containers", + "type": "row" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + 
"lineWidth": 2 + } + } + }, + "gridPos": { "h": 10, "w": 12, "x": 0, "y": 23 }, + "id": 21, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + } + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_compose_service!=\"\"}[5m])) by (container_label_com_docker_compose_service) * 100", + "legendFormat": "{{container_label_com_docker_compose_service}}", + "refId": "A" + } + ], + "title": "Container CPU Usage", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2 + } + } + }, + "gridPos": { "h": 10, "w": 12, "x": 12, "y": 23 }, + "id": 22, + "options": { + "legend": { + "calcs": ["max", "last"], + "displayMode": "table", + "placement": "right", + "showLegend": true + } + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "sum(container_memory_working_set_bytes{container_label_com_docker_compose_service!=\"\"}) by (container_label_com_docker_compose_service)", + "legendFormat": "{{container_label_com_docker_compose_service}}", + "refId": "A" + } + ], + "title": "Container Memory Usage", + "type": "timeseries" + } + ], + "refresh": "1m", + "schemaVersion": 38, + "style": "dark", + "tags": ["homelab", "auto"], + "templating": { "list": [] }, + "time": { "from": "now-6h", "to": "now" }, + "timezone": "", + "title": "Homelab Overview", + "uid": "homelab-overview", + "version": 2 +} diff --git a/docker/prometheus/config/alert_rules.yml b/docker/prometheus/config/alert_rules.yml new file mode 100644 index 0000000..8de6c03 --- /dev/null +++ b/docker/prometheus/config/alert_rules.yml @@ -0,0 +1,60 @@ +groups: + - name: HostAlerts + rules: + - alert: HighCpuUsage + expr: 100 - (avg by (instance) 
(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{ $labels.instance }}" + description: "CPU usage is at {{ $value }}%" + + - alert: HighMemoryUsage + expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 90 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage on {{ $labels.instance }}" + description: "Memory usage is at {{ $value }}%" + + - alert: DiskSpaceLow + expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 15 + for: 5m + labels: + severity: critical + annotations: + summary: "Low disk space on {{ $labels.instance }}" + description: "Disk space available is less than 15% ({{ $value }}% left)" + + - name: ContainerAlerts + rules: + - alert: ContainerDown + expr: time() - container_last_seen{container_label_com_docker_compose_service!=""} > 60 + for: 2m + labels: + severity: critical + annotations: + summary: "Container {{ $labels.container_label_com_docker_compose_service }} is down" + description: "Container has not been seen for more than 2 minutes" + + - name: ProbeAlerts + rules: + - alert: EndpointDown + expr: probe_success == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Endpoint {{ $labels.instance }} is down" + description: "Blackbox probe failed for {{ $labels.instance }}" + + - alert: SSLCertExpiring + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 14 + for: 1h + labels: + severity: warning + annotations: + summary: "SSL certificate for {{ $labels.instance }} expires soon" + description: "Certificate expires in less than 14 days" diff --git a/docker/prometheus/config/prometheus.yml.template b/docker/prometheus/config/prometheus.yml.template index a7646e1..0513144 100644 --- a/docker/prometheus/config/prometheus.yml.template +++ b/docker/prometheus/config/prometheus.yml.template @@ -1,8 +1,10 @@ -# 
./docker/data/prometheus/config/prometheus.yml.template global: scrape_interval: 30s evaluation_interval: 30s +rule_files: + - "alert_rules.yml" + scrape_configs: - job_name: 'prometheus_self' static_configs: @@ -21,6 +23,12 @@ scrape_configs: - targets: ['caddy:2019'] metrics_path: /metrics + - job_name: 'uptime_kuma' + metrics_path: /metrics + basic_auth: { password: '${UPTIME_KUMA_API_KEY}' } + static_configs: + - targets: ['uptime_kuma:3001'] + - job_name: 'website_and_http_checks' metrics_path: /probe params: @@ -30,6 +38,11 @@ scrape_configs: - https://geeksbsmrt.com - https://smrtgeekdevs.com - https://pihole.smrtgeekdevs.com + - https://analytics.geeksbsmrt.com + - https://firefly.home + - https://grafana.home + - https://uptime.home + - https://firefly-import.home - http://pihole/admin/ relabel_configs: - source_labels: [__address__] diff --git a/secrets.sops.env b/secrets.sops.env index 0e4e893..72c566b 100644 --- a/secrets.sops.env +++ b/secrets.sops.env @@ -48,11 +48,13 @@ IP_BLACKBOX=ENC[AES256_GCM,data:5yNMIy2JeNDYk/U/2OCRqw0=,iv:pjeHF1+rv3I78mgfEXGE IP_UPTIME_KUMA=ENC[AES256_GCM,data:UQAhVIKOdIhnU0OmCXa+rIY=,iv:c2S/5FkHC31qaEab5sokvFN37DJtMeWwqSG4cKkVKxU=,tag:Fgc2KwoK3xDo0HFotgpTSg==,type:str] IP_FIREFLY_APP=ENC[AES256_GCM,data:VGRgFVBMeqm47xvV+Dk1TDk=,iv:6PIiBwrdWYJ7js8nNoS6lj35lQIRIl0/X4TA9iloIMY=,tag:++b6/H+F9DB5VtpEUWYt6A==,type:str] IP_FIREFLY_IMPORTER=ENC[AES256_GCM,data:T92SNAD8JriDaPQkkVpc2cU=,iv:s4gEts4D0URQmGstNHnW/Xv3XyZDbKu/2WVXN/fl70M=,tag:BcWNlyF8DNSf2FUINolFlw==,type:str] +#ENC[AES256_GCM,data:H3eEZNOu7M25ZVI=,iv:JRvKWB8uvKq7NwTQbrxZZwZjRvDrgdKDIDVEXVpSOj8=,tag:yHaX3ajIMWwkZoxSN8ZCXg==,type:comment] +UPTIME_KUMA_API_KEY=ENC[AES256_GCM,data:RvEPiEh7gkCs5FnWAeh3UAGPzgWU7GHEkV3i57MMJ0rA6vqnC8dYrQwe2qyK9A==,iv:WdsJVyHfnjGJZ2SADL/SacerZ+flIqw6CPFdfgfkQJA=,tag:1VIvxsmJuHUYjG5fq2bUow==,type:str] sops_age__list_0__map_enc=-----BEGIN AGE ENCRYPTED 
FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSAvSHMvMy8vdTVMWEdocndm\nZUpNandoNGFtTWxJblFDeEQ3Wjdnc1MreFZrClgxT2srQ1dIcXMzYkJyalV5bFFF\ncVNwK3pSckQ0djRIMS9kZGs2ZWFoQ3cKLS0tIEI4TytpSnZveUpqVnlJYVNKb0F2\nYXF4UGRKUk9aVG41Znh0NnM0U2hsMlkKkzorNRurt4xGaXLYumzQ9JXrDWugH+ga\naUS6R0eS58TVBgPywG0gPIxFOpYIRQnDIUgqYdznc4rv0F62CSWMFg==\n-----END AGE ENCRYPTED FILE-----\n sops_age__list_0__map_recipient=age13pmf2jna228g9n780j7tl63qdws4gdll36rtuvk74nld5pet543q5ch3wf sops_age__list_1__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSB3VDZzb0IvcXAvcWpRY2pK\nL0txc3ZNVFhSN0NJVCswL05LWHhudHl2eGdZCm9JLy9BNlJrdFVsNlM4WmxNNXdY\nVW1pK3AxOVJvTW1SRWNRUTVUT1BseUEKLS0tIGhXTDE3eG9ucnVuY3dsTW1tbUZq\nTWszMjNwVFA1dUxTaTNUbWRvTkJmWUEKr+qniEgTJ5mBQ0wHGxlMQnj3zNWBdkHZ\nlbpdEQbWsSKAdtPvnKvW//A4gUueemGrTHTBkpiAR8svW5JVlpEImw==\n-----END AGE ENCRYPTED FILE-----\n sops_age__list_1__map_recipient=age1ryc8jh6xye29kgaxqwy57n7qunxak5w8kdtpgk0lwwjuh3w2gywsha6ja2 -sops_lastmodified=2026-02-21T06:34:18Z -sops_mac=ENC[AES256_GCM,data:3ASjP57Av/8pst88LtpPJO5u2Z85EWDQv/KxRo2wmRZaeGLnXlIQUmqZiLZJAtfdnfaHxo8hmdYITWlKnC7EqycLmUJTB7FyO0x3FIGtvb3EYhWTcQgpLRSGn/KiuvO8LzxbFLQNds8wA9spbB/6jtgu7+gZ3RxR/klRXxhe3OY=,iv:ZFxv8b83YJbTpZUBmEGjTlcH22i7SCNKdTIgLxDmYV4=,tag:BkeoO5WqK8OxMI46Es7MuQ==,type:str] +sops_lastmodified=2026-02-21T18:17:03Z +sops_mac=ENC[AES256_GCM,data:LZswEKuRPQO71RfnshD37UB+ZbZkJvCj9gco02F+2bnqjcYYbFfCQP9ytdT7iNB1hxpJYJIQTr/BIUmsj/O1T84+LAqq4E2l+cA/+e3nS4U3SFaTXxGz3KOnPSPXR3cB6jzL2gxAPsWFvQyg92eDxIe3mzTDQ3i6t5IE79N/do8=,iv:Zefxi4UzrxAuoCzivxvV6n5Uz8Dqi0joVIyZvrkPK34=,tag:Fr3USr+F4iyePVgdiIkiog==,type:str] sops_unencrypted_suffix=_unencrypted -sops_version=3.11.0 +sops_version=3.12.0