diff --git a/images/flashbox-l1.conf b/images/flashbox-l1.conf
index 42f61ce8..a5d07dfd 100644
--- a/images/flashbox-l1.conf
+++ b/images/flashbox-l1.conf
@@ -2,6 +2,7 @@
 Include=shared/mkosi.conf
 Include=modules/flashbox/common/mkosi.conf
 Include=modules/flashbox/flashbox-l1/mkosi.conf
+Include=modules/flashbox/observability/mkosi.conf
 
 [Config]
 Profiles=azure,gcp
diff --git a/images/flashbox-l2.conf b/images/flashbox-l2.conf
index 96076496..7f127812 100644
--- a/images/flashbox-l2.conf
+++ b/images/flashbox-l2.conf
@@ -2,6 +2,7 @@
 Include=shared/mkosi.conf
 Include=modules/flashbox/common/mkosi.conf
 Include=modules/flashbox/flashbox-l2/mkosi.conf
+Include=modules/flashbox/observability/mkosi.conf
 
 [Config]
 Profiles=gcp
diff --git a/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh b/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh
index 8701b55e..ad966ea4 100755
--- a/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh
+++ b/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh
@@ -151,8 +151,13 @@ drop_dst_ip() {
 #
 # `source` is not supported in dash
 ###########################################################################
+
 . /etc/bob/firewall-config
 
+# Observability rule (sourced only if the observability module is included
+# in the image — it owns its own static egress rule for the metrics LB IP).
+[ -f /etc/bob/firewall-config-observability ] && . /etc/bob/firewall-config-observability
+
 ###########################################################################
 # (6) Start in Maintenance Mode
 ###########################################################################
diff --git a/modules/flashbox/observability/mkosi.conf b/modules/flashbox/observability/mkosi.conf
new file mode 100644
index 00000000..f45d7c01
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.conf
@@ -0,0 +1,8 @@
+[Content]
+ExtraTrees=modules/flashbox/observability/mkosi.extra
+PostInstallationScripts=modules/flashbox/observability/mkosi.postinst
+
+Packages=prometheus
+         prometheus-node-exporter
+         prometheus-process-exporter
+         gettext-base
diff --git a/modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability b/modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability
new file mode 100644
index 00000000..568b9b72
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability
@@ -0,0 +1,11 @@
+# Observability firewall drop-in, installed as /etc/bob/firewall-config-observability.
+#
+# Loaded by init-firewall.sh after the per-image firewall-config. Opens host
+# egress to AWS Managed Prometheus, reached via the internal TCP-proxy load
+# balancer at a fixed RFC1918 address (provisioned in terraform:
+# gcp/flashbots-l1-flashbox/us-east4/amp-proxy.tf -> amp_proxy_ip).
+#
+# Static IP -> the rule is measured into the image; no boot-time DNS
+# resolution and no env file. Placed in ALWAYS_OUT so it survives the
+# maintenance -> production mode toggle (production drops general egress).
+accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp 10.88.0.100 $HTTPS_PORT "Metrics endpoint (AMP via internal LB)"
diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/process-exporter.yml b/modules/flashbox/observability/mkosi.extra/etc/prometheus/process-exporter.yml
new file mode 100644
index 00000000..033f901d
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/process-exporter.yml
@@ -0,0 +1,5 @@
+process_names:
+  # Monitor the searcher container (conmon + all children via --children flag)
+  - name: "searcher-container"
+    cmdline:
+      - 'conmon.*searcher-container'
diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl
new file mode 100644
index 00000000..92634506
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl
@@ -0,0 +1,46 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  # Stamped onto every series sent via remote_write so the central
+  # Prometheus / AMP can distinguish samples coming from different
+  # flashbox VMs. Quoted so the substituted value is always a YAML string.
+  external_labels:
+    host: '${FLASHBOX_VM}'
+
+# Recording rules for aggregated metrics
+rule_files:
+  - /etc/prometheus/recording_rules.yml
+
+scrape_configs:
+  # Node exporter on localhost
+  - job_name: 'node'
+    static_configs:
+      - targets: ['localhost:9100']
+    metric_relabel_configs:
+      # Keep only these node_* metric families; all others are dropped at ingestion
+      - source_labels: [__name__]
+        regex: 'node_(cpu|memory|disk|filesystem|network|vmstat)_.*'
+        action: keep
+
+  # Process exporter for container monitoring
+  - job_name: 'process'
+    static_configs:
+      - targets: ['localhost:9256']
+
+# Remote write to AWS Managed Prometheus via the internal TCP-proxy LB.
+# Host is the fixed LB IP (10.88.0.100, see terraform amp-proxy.tf), so no DNS
+# resolution happens on the VM; TLS SNI + cert validation are overridden to
+# AMP's real regional hostname via tls_config.server_name.
+remote_write:
+  - url: https://10.88.0.100/workspaces/${METRICS_FLASHBOTS_WORKSPACE}/api/v1/remote_write
+    write_relabel_configs:
+      # Only send flashbox: prefixed metrics
+      - source_labels: [__name__]
+        regex: 'flashbox:.*'
+        action: keep
+    tls_config:
+      server_name: aps-workspaces.${METRICS_FLASHBOTS_REGION}.amazonaws.com
+    sigv4:
+      region: '${METRICS_FLASHBOTS_REGION}'
+      access_key: '${METRICS_FLASHBOTS_ACCESS_KEY}'
+      secret_key: '${METRICS_FLASHBOTS_SECRET_KEY}'
diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml b/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml
new file mode 100644
index 00000000..a4190ae0
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml
@@ -0,0 +1,46 @@
+groups:
+  # Base metrics — local: prefix means they stay inside the TEE
+  # (remote_write only forwards flashbox:*)
+  - name: local_container_metrics
+    interval: 30s
+    rules:
+      - record: local:container_cpu_percent
+        expr: sum(rate(namedprocess_namegroup_cpu_seconds_total{groupname=~".*searcher-container.*"}[5m])) * 100
+
+  # Forwarded metrics — flashbox: prefix, picked up by remote_write
+  - name: flashbox_health
+    interval: 30s
+    rules:
+      # Binary: 1 if process-exporter is up AND at least one process is in the
+      # searcher-container name group; 0 otherwise. The trailing
+      # `or on() vector(0)` supplies that 0 when the join yields no series at
+      # all (exporter down, or the group is absent), which would otherwise
+      # record nothing instead of 0.
+      - record: flashbox:container_alive
+        expr: >
+          ((up{job="process"} * on(instance) group_left(cgroup) namedprocess_namegroup_num_procs{groupname=~".*searcher-container.*"})
+          > bool 0)
+          or on() vector(0)
+
+      # Spike-guarded: current 15m avg must be under 80%,
+      # AND the 10m max ending 5m ago must have been under 70%
+      - record: flashbox:container_average_cpu_is_under_80_percent
+        expr: >
+          (avg_over_time(local:container_cpu_percent[15m]) < bool 80)
+          * (max_over_time(local:container_cpu_percent[10m] offset 5m) < bool 70)
+
+      - record: flashbox:container_oom_kills_count
+        expr: node_vmstat_oom_kill
+
+      - record: flashbox:disk_free_space_is_over_10_percent
+        expr: >
+          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) > bool 0.1
+
+      - record: flashbox:disk_free_space_is_over_128_gb
+        expr: >
+          (node_filesystem_avail_bytes{mountpoint="/persistent"}) > bool (128 * 1024 * 1024 * 1024)
+
+      - record: flashbox:network_is_up
+        expr: >
+          (sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m]))
+          + sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])))
+          > bool 0
diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/flashbox-observability-setup.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/flashbox-observability-setup.service
new file mode 100644
index 00000000..a8fcf4f7
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/flashbox-observability-setup.service
@@ -0,0 +1,14 @@
+[Unit]
+Description=Flashbox observability setup (fetch creds, render Prometheus config)
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=oneshot
+ExecStart=/usr/bin/flashbox-observability-setup
+RemainAfterExit=yes
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=minimal.target
diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/node-exporter.service
b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/node-exporter.service
new file mode 100644
index 00000000..1f6d2a3d
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/node-exporter.service
@@ -0,0 +1,59 @@
+[Unit]
+Description=Prometheus Node Exporter
+Documentation=https://github.com/prometheus/node_exporter
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+User=prometheus
+Group=prometheus
+# Loopback-only listener on port 9100 (the 'node' scrape job's target).
+# Explicitly enables cpu, meminfo, diskstats, filesystem, netdev, loadavg and
+# vmstat; each --no-collector.* flag switches that one collector off.
+ExecStart=/usr/bin/prometheus-node-exporter \
+    --web.listen-address=127.0.0.1:9100 \
+    --collector.cpu \
+    --collector.meminfo \
+    --collector.diskstats \
+    --collector.filesystem \
+    --collector.netdev \
+    --collector.loadavg \
+    --no-collector.arp \
+    --no-collector.bcache \
+    --no-collector.bonding \
+    --no-collector.conntrack \
+    --no-collector.cpufreq \
+    --no-collector.edac \
+    --no-collector.entropy \
+    --no-collector.filefd \
+    --no-collector.hwmon \
+    --no-collector.infiniband \
+    --no-collector.ipvs \
+    --no-collector.mdadm \
+    --no-collector.netclass \
+    --no-collector.netstat \
+    --no-collector.nfs \
+    --no-collector.nfsd \
+    --no-collector.pressure \
+    --no-collector.rapl \
+    --no-collector.schedstat \
+    --no-collector.sockstat \
+    --no-collector.softnet \
+    --no-collector.stat \
+    --no-collector.textfile \
+    --no-collector.thermal_zone \
+    --no-collector.time \
+    --no-collector.timex \
+    --no-collector.udp_queues \
+    --no-collector.uname \
+    --collector.vmstat \
+    --no-collector.xfs \
+    --no-collector.zfs \
+    --no-collector.systemd \
+    --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker)($|/)
+Restart=on-failure
+RestartSec=5s
+
+[Install]
+WantedBy=minimal.target
diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/process-exporter.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/process-exporter.service
new file mode 100644
index 00000000..30b1257c
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/process-exporter.service
@@ -0,0 +1,21 @@
+[Unit]
+Description=Prometheus Process Exporter
+Documentation=https://github.com/ncabatoff/process-exporter
+After=network-online.target searcher-container.service
+Wants=network-online.target
+
+[Service]
+Type=simple
+User=prometheus
+Group=prometheus
+# Loopback-only listener on port 9256 (the 'process' scrape job's target).
+# Process groups are defined in /etc/prometheus/process-exporter.yml.
+ExecStart=/usr/bin/prometheus-process-exporter \
+    --web.listen-address=127.0.0.1:9256 \
+    --config.path=/etc/prometheus/process-exporter.yml \
+    --children
+Restart=on-failure
+RestartSec=5s
+
+[Install]
+WantedBy=minimal.target
diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service
new file mode 100644
index 00000000..17eafe39
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service
@@ -0,0 +1,29 @@
+[Unit]
+Description=Prometheus Monitoring System
+Documentation=https://prometheus.io/docs/introduction/overview/
+After=network-online.target flashbox-observability-setup.service
+Wants=network-online.target
+
+# flashbox-observability-setup writes this file only when Vault creds are
+# fetched + validated. If it's absent (Vault down, dev image, bad secret) the
+# unit is cleanly skipped — inactive, not failed, no Restart= crash-loop — and
+# self-heals on the next boot that renders a config.
+ConditionPathExists=/etc/prometheus/prometheus.yml
+
+[Service]
+Type=simple
+User=prometheus
+Group=prometheus
+ExecStart=/usr/bin/prometheus \
+    --config.file=/etc/prometheus/prometheus.yml \
+    --storage.tsdb.path=/var/lib/prometheus/ \
+    --storage.tsdb.retention.time=24h \
+    --web.console.templates=/usr/share/prometheus/consoles \
+    --web.console.libraries=/usr/share/prometheus/console_libraries \
+    --web.listen-address=127.0.0.1:9090
+ExecReload=/bin/kill -HUP $MAINPID
+Restart=on-failure
+RestartSec=5s
+
+[Install]
+WantedBy=minimal.target
diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup
new file mode 100755
index 00000000..48cca197
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup
@@ -0,0 +1,69 @@
+#!/bin/sh
+set -eu
+# Not every /bin/sh implements pipefail, and `set` is a special builtin, so an
+# unsupported option would abort the whole script. Probe in a subshell first.
+if (set -o pipefail) 2>/dev/null; then
+    set -o pipefail
+fi
+
+# Boot-time observability setup: authenticate to Vault, fetch + validate the
+# Prometheus remote_write secret, and render /etc/prometheus/prometheus.yml.
+#
+# Prometheus ships flashbox:* metrics to AWS Managed Prometheus via the
+# internal TCP-proxy at a fixed IP (10.88.0.100); the host firewall statically
+# allows that IP, and TLS is validated via tls_config.server_name in the
+# template.
+#
+# Fail-safe: on any failure (QEMU dev, Vault unreachable, missing/invalid
+# secret, render error) this leaves NO config and exits 0. Because
+# prometheus.service has ConditionPathExists=/etc/prometheus/prometheus.yml,
+# it is cleanly skipped (inactive, not failed) until a later boot renders a
+# config — boot never fails because of observability.
+
+PROMETHEUS_CONFIG=/etc/prometheus/prometheus.yml
+PROMETHEUS_TEMPLATE=/etc/prometheus/prometheus.yml.tmpl
+PROMETHEUS_CONFIG_TMP="${PROMETHEUS_CONFIG}.tmp"
+
+# Always start clean — never leave a stale (or half-written) config from a
+# previous boot. Done first so a failure to load the helper cannot skip it.
+rm -f "$PROMETHEUS_CONFIG" "$PROMETHEUS_CONFIG_TMP"
+
+. /usr/lib/flashbox/vault.sh
+
+# host= external label, from the GCE instance name.
+FLASHBOX_VM=$(curl -sf --header "Metadata-Flavor: Google" \
+    "http://metadata/computeMetadata/v1/instance/name") || FLASHBOX_VM=unknown
+printf '%s' "$FLASHBOX_VM" | grep -qE '^[a-z0-9-]+$' || FLASHBOX_VM=unknown
+export FLASHBOX_VM
+
+# Local QEMU dev image: no Vault, no config (Prometheus stays disabled).
+if dmidecode -s system-manufacturer 2>/dev/null | grep -q "QEMU" && \
+   [ -f /etc/systemd/system/serial-console.service ]; then
+    echo "QEMU dev environment, leaving Prometheus disabled"
+    exit 0
+fi
+
+# Fetch + validate the four metrics variables (exported by vault_fetch).
+if ! vault_fetch; then
+    echo "WARNING: Vault fetch/validation failed, leaving Prometheus disabled"
+    exit 0
+fi
+
+# Single render: only the five curated variables are substituted; every other
+# token in the template (e.g. [__name__]) is left untouched.
+#
+# The rendered file contains the AWS secret key, so it is created private
+# (umask 077), opened up only to the prometheus group the service runs as, and
+# published with a same-directory rename so prometheus.service never sees a
+# partial file. Any failure here removes the temp file and keeps the fail-safe.
+umask 077
+if envsubst '$FLASHBOX_VM $METRICS_FLASHBOTS_WORKSPACE $METRICS_FLASHBOTS_REGION $METRICS_FLASHBOTS_ACCESS_KEY $METRICS_FLASHBOTS_SECRET_KEY' \
+        < "$PROMETHEUS_TEMPLATE" > "$PROMETHEUS_CONFIG_TMP" \
+    && chgrp prometheus "$PROMETHEUS_CONFIG_TMP" \
+    && chmod 0640 "$PROMETHEUS_CONFIG_TMP" \
+    && mv -f "$PROMETHEUS_CONFIG_TMP" "$PROMETHEUS_CONFIG"; then
+    echo "Observability config written (workspace: ${METRICS_FLASHBOTS_WORKSPACE})"
+else
+    rm -f "$PROMETHEUS_CONFIG_TMP"
+    echo "WARNING: rendering Prometheus config failed, leaving Prometheus disabled"
+fi
diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh
new file mode 100644
index 00000000..9ba7882a
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh
@@ -0,0 +1,62 @@
+#!/bin/sh
+# Vault auth (GCP backend) + fetch/validate the Prometheus remote_write secret.
+# Sourced by flashbox-observability-setup.
+#
+# vault_fetch authenticates with the GCE instance-identity JWT, reads the
+# shared secret, and exports the four metrics variables — each checked against
+# its expected format, so a malformed or tampered value can never reach the
+# rendered config (and can't carry shell/YAML metacharacters). Any failure
+# returns non-zero and exports nothing usable; the caller then writes no config.
+#
+# Note: values are piped with `printf '%s'`, never `echo` — dash's echo
+# interprets backslash escapes (\n, \c, ...) and would corrupt JSON / values.
+
+vault_fetch() {
+    local addr mount role kv suffix jwt token data
+
+    # 1. Bootstrap config from GCE instance metadata.
+    addr=$(curl -sf --header "Metadata-Flavor: Google" \
+        "http://metadata/computeMetadata/v1/instance/attributes/vault_addr") || return 1
+    mount=$(curl -sf --header "Metadata-Flavor: Google" \
+        "http://metadata/computeMetadata/v1/instance/attributes/vault_auth_mount_gcp") || return 1
+    role=$(curl -sf --header "Metadata-Flavor: Google" \
+        "http://metadata/computeMetadata/v1/instance/attributes/vault_role") || return 1
+    kv=$(curl -sf --header "Metadata-Flavor: Google" \
+        "http://metadata/computeMetadata/v1/instance/attributes/vault_kv_path") || return 1
+    suffix=$(curl -sf --header "Metadata-Flavor: Google" \
+        "http://metadata/computeMetadata/v1/instance/attributes/vault_kv_common_suffix") || return 1
+
+    # 2. Authenticate: GCE identity JWT -> Vault token. Without --get, curl would
+    #    send the --data-urlencode pairs as a POST body, not as a GET query string.
+    jwt=$(curl -sf --get --header "Metadata-Flavor: Google" \
+        --data-urlencode "audience=http://vault/${role}" --data-urlencode "format=full" \
+        "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || return 1
+    #    jq builds the login body so role/jwt are always correctly JSON-escaped.
+    token=$(jq -nc --arg role "$role" --arg jwt "$jwt" '{role: $role, jwt: $jwt}' \
+        | curl -sf --data @- "${addr}/v1/${mount}/login" | jq -re .auth.client_token) || return 1
+
+    # 3. Read the shared secret blob.
+    data=$(curl -sf --header "X-Vault-Token: ${token}" \
+        "${addr}/v1/${kv}/node/${suffix}" | jq -ce .data.data) || return 1
+
+    # 4. Extract each variable and validate it against its expected format.
+    METRICS_FLASHBOTS_WORKSPACE=$(printf '%s' "$data" | jq -re .METRICS_FLASHBOTS_WORKSPACE) || return 1
+    printf '%s' "$METRICS_FLASHBOTS_WORKSPACE" | grep -qE '^ws-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$' \
+        || { echo "vault_fetch: WORKSPACE is not an AMP workspace id" >&2; return 1; }
+
+    METRICS_FLASHBOTS_REGION=$(printf '%s' "$data" | jq -re .METRICS_FLASHBOTS_REGION) || return 1
+    printf '%s' "$METRICS_FLASHBOTS_REGION" | grep -qE '^[a-z]{2}-[a-z]+-[0-9]+$' \
+        || { echo "vault_fetch: REGION is not an AWS region" >&2; return 1; }
+
+    METRICS_FLASHBOTS_ACCESS_KEY=$(printf '%s' "$data" | jq -re .METRICS_FLASHBOTS_ACCESS_KEY) || return 1
+    printf '%s' "$METRICS_FLASHBOTS_ACCESS_KEY" | grep -qE '^[A-Z0-9]{20}$' \
+        || { echo "vault_fetch: ACCESS_KEY is not an AWS access key id" >&2; return 1; }
+
+    METRICS_FLASHBOTS_SECRET_KEY=$(printf '%s' "$data" | jq -re .METRICS_FLASHBOTS_SECRET_KEY) || return 1
+    printf '%s' "$METRICS_FLASHBOTS_SECRET_KEY" | grep -qE '^[A-Za-z0-9/+]{40}$' \
+        || { echo "vault_fetch: SECRET_KEY is not an AWS secret key" >&2; return 1; }
+
+    # 5. All present and well-formed — publish to the environment for envsubst.
+    export METRICS_FLASHBOTS_WORKSPACE METRICS_FLASHBOTS_REGION \
+        METRICS_FLASHBOTS_ACCESS_KEY METRICS_FLASHBOTS_SECRET_KEY
+}
diff --git a/modules/flashbox/observability/mkosi.postinst b/modules/flashbox/observability/mkosi.postinst
new file mode 100755
index 00000000..ec872aa6
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.postinst
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -euxo pipefail
+
+# Ensure prometheus owns its data directory
+mkosi-chroot chown -R prometheus:prometheus /var/lib/prometheus
+
+# Enable observability services
+mkosi-chroot systemctl add-wants minimal.target \
+    flashbox-observability-setup.service \
+    prometheus.service \
+    node-exporter.service \
+    process-exporter.service