From 56f45b445b8204c5d2c8a9cd7bcc94673cc1f068 Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Wed, 18 Mar 2026 16:01:04 -0300 Subject: [PATCH 01/15] Split observability prototype into a separate module --- images/flashbox-l2.conf | 1 + .../mkosi.extra/usr/bin/init-firewall.sh | 4 + .../mkosi.extra/etc/bob/firewall-config | 5 + modules/flashbox/observability/mkosi.build | 12 ++ modules/flashbox/observability/mkosi.conf | 15 +++ .../etc/prometheus/process-exporter.yml | 5 + .../etc/prometheus/prometheus.yml.tmpl | 43 +++++++ .../etc/prometheus/recording_rules.yml | 39 ++++++ .../system/fetch-observability-config.service | 14 ++ .../etc/systemd/system/node-exporter.service | 56 ++++++++ .../systemd/system/process-exporter.service | 19 +++ .../etc/systemd/system/prometheus.service | 25 ++++ .../needs-observability.conf | 3 + .../usr/bin/fetch-observability-config.sh | 121 ++++++++++++++++++ modules/flashbox/observability/mkosi.postinst | 12 ++ 15 files changed, 374 insertions(+) create mode 100755 modules/flashbox/observability/mkosi.build create mode 100644 modules/flashbox/observability/mkosi.conf create mode 100644 modules/flashbox/observability/mkosi.extra/etc/prometheus/process-exporter.yml create mode 100644 modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl create mode 100644 modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml create mode 100644 modules/flashbox/observability/mkosi.extra/etc/systemd/system/fetch-observability-config.service create mode 100644 modules/flashbox/observability/mkosi.extra/etc/systemd/system/node-exporter.service create mode 100644 modules/flashbox/observability/mkosi.extra/etc/systemd/system/process-exporter.service create mode 100644 modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service create mode 100644 
modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf create mode 100755 modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh create mode 100755 modules/flashbox/observability/mkosi.postinst diff --git a/images/flashbox-l2.conf b/images/flashbox-l2.conf index 96076496..7f127812 100644 --- a/images/flashbox-l2.conf +++ b/images/flashbox-l2.conf @@ -2,6 +2,7 @@ Include=shared/mkosi.conf Include=modules/flashbox/common/mkosi.conf Include=modules/flashbox/flashbox-l2/mkosi.conf +Include=modules/flashbox/observability/mkosi.conf [Config] Profiles=gcp diff --git a/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh b/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh index 8701b55e..67acef20 100755 --- a/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh +++ b/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh @@ -151,6 +151,10 @@ drop_dst_ip() { # # `source` is not supported in dash ########################################################################### + +# Load observability config if the module is included (metrics endpoint IP) +[ -f /etc/flashbox/observability.env ] && . /etc/flashbox/observability.env + . 
/etc/bob/firewall-config ########################################################################### diff --git a/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config b/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config index bee194c2..44a06dcb 100644 --- a/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config +++ b/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config @@ -50,6 +50,11 @@ accept_dst_port $CHAIN_ALWAYS_IN tcp $CVM_REVERSE_PROXY_PORT "CVM reverse-proxy" accept_dst_port $CHAIN_ALWAYS_OUT udp $NTP_PORT "NTP" accept_dst_port $CHAIN_ALWAYS_OUT tcp $NTP_NTS_PORT "NTP-NTS" +# Observability metrics endpoint (loaded from /etc/flashbox/observability.env) +if [ -n "${METRICS_ENDPOINT:-}" ]; then + accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$METRICS_ENDPOINT" $HTTPS_PORT "Metrics endpoint (Flashbots)" +fi + ########################################################################### # (3) MAINTENANCE_IN: Inbound rules for Maintenance Mode ########################################################################### diff --git a/modules/flashbox/observability/mkosi.build b/modules/flashbox/observability/mkosi.build new file mode 100755 index 00000000..e8f52796 --- /dev/null +++ b/modules/flashbox/observability/mkosi.build @@ -0,0 +1,12 @@ +#!/bin/bash +set -euxo pipefail + +source scripts/make_git_package.sh + +# Build gomplate (template engine for Prometheus config) +make_git_package \ + "gomplate" \ + "v4.3.3" \ + "https://github.com/hairyhenderson/gomplate" \ + 'go build -trimpath -ldflags "-s -w -buildid=" -o ./build/gomplate ./cmd/gomplate' \ + "build/gomplate:/usr/bin/gomplate" diff --git a/modules/flashbox/observability/mkosi.conf b/modules/flashbox/observability/mkosi.conf new file mode 100644 index 00000000..f23f464c --- /dev/null +++ b/modules/flashbox/observability/mkosi.conf @@ -0,0 +1,15 @@ +[Build] +WithNetwork=true + +[Content] +ExtraTrees=modules/flashbox/observability/mkosi.extra 
+PostInstallationScripts=modules/flashbox/observability/mkosi.postinst +BuildScripts=modules/flashbox/observability/mkosi.build + +Packages=prometheus + prometheus-node-exporter + prometheus-process-exporter + +BuildPackages=build-essential + git + golang diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/process-exporter.yml b/modules/flashbox/observability/mkosi.extra/etc/prometheus/process-exporter.yml new file mode 100644 index 00000000..033f901d --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/process-exporter.yml @@ -0,0 +1,5 @@ +process_names: + # Monitor the searcher container (conmon + all children via --children flag) + - name: "searcher-container" + cmdline: + - 'conmon.*searcher-container' diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl new file mode 100644 index 00000000..2e2dc00d --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl @@ -0,0 +1,43 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +# Recording rules for aggregated metrics +rule_files: + - /etc/prometheus/recording_rules.yml + +# Scrape configurations +scrape_configs: + # Node exporter on localhost + - job_name: 'node' + static_configs: + - targets: ['localhost:9100'] + metric_relabel_configs: + # Keep only these node_exporter metric families; all others are dropped at scrape time + - source_labels: [__name__] + regex: 'node_(cpu|memory|disk|filesystem|network|vmstat)_.*' + action: keep + + # Process exporter for container monitoring + - job_name: 'process' + static_configs: + - targets: ['localhost:9256'] + +{{- $config := (datasource "config") }} +{{- if $config.remote_write_flashbots_url }} + +# Remote write configuration (dynamically configured) +remote_write: + # Flashbots endpoint + - url: {{ $config.remote_write_flashbots_url }} + write_relabel_configs: + # Only send flashbox: prefixed metrics
+ - source_labels: [__name__] + regex: 'flashbox:.*' + action: keep + {{- if $config.remote_write_flashbots_auth }} + basic_auth: + username: {{ $config.remote_write_flashbots_username }} + password: {{ $config.remote_write_flashbots_password }} + {{- end }} +{{- end }} diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml b/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml new file mode 100644 index 00000000..79a370cc --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml @@ -0,0 +1,39 @@ +groups: + # Base metrics — local: prefix means they stay inside the TEE + # (remote_write only forwards flashbox:*) + - name: local_container_metrics + interval: 30s + rules: + - record: local:container_cpu_percent + expr: sum(rate(namedprocess_namegroup_cpu_seconds_total{groupname=~".*searcher-container.*"}[5m])) * 100 + + # Forwarded metrics — flashbox: prefix, picked up by remote_write + - name: flashbox_health + interval: 30s + rules: + - record: flashbox:container_alive + expr: up{job="process"} * on(instance) group_left(cgroup) namedprocess_namegroup_num_procs{groupname=~".*searcher-container.*"} + + # Spike-guarded: current 15m avg must be under 80%, + # AND the 10m max ending 5m ago must have been under 70% + - record: flashbox:container_average_cpu_is_under_80_percent + expr: > + (avg_over_time(local:container_cpu_percent[15m]) < bool 80) + * (max_over_time(local:container_cpu_percent[10m] offset 5m) < bool 70) + + - record: flashbox:container_oom_kills_count + expr: node_vmstat_oom_kill + + - record: flashbox:disk_free_space_is_over_10_percent + expr: > + (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) > bool 0.1 + + - record: flashbox:disk_free_space_is_over_128_gb + expr: > + (node_filesystem_avail_bytes{mountpoint="/persistent"}) > bool (128 * 1024 * 1024 * 1024) + + - record: flashbox:network_is_up + expr: > + 
(sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m])) + + sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))) + > bool 0 diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/fetch-observability-config.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/fetch-observability-config.service new file mode 100644 index 00000000..f45ccb49 --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/fetch-observability-config.service @@ -0,0 +1,14 @@ +[Unit] +Description=Fetch observability configuration +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +ExecStart=/usr/bin/fetch-observability-config.sh +RemainAfterExit=yes +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=minimal.target diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/node-exporter.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/node-exporter.service new file mode 100644 index 00000000..1f6d2a3d --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/node-exporter.service @@ -0,0 +1,56 @@ +[Unit] +Description=Prometheus Node Exporter +Documentation=https://github.com/prometheus/node_exporter +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=prometheus +Group=prometheus +ExecStart=/usr/bin/prometheus-node-exporter \ + --web.listen-address=127.0.0.1:9100 \ + --collector.cpu \ + --collector.meminfo \ + --collector.diskstats \ + --collector.filesystem \ + --collector.netdev \ + --collector.loadavg \ + --no-collector.arp \ + --no-collector.bcache \ + --no-collector.bonding \ + --no-collector.conntrack \ + --no-collector.cpufreq \ + --no-collector.edac \ + --no-collector.entropy \ + --no-collector.filefd \ + --no-collector.hwmon \ + --no-collector.infiniband \ + --no-collector.ipvs \ + --no-collector.mdadm \ + --no-collector.netclass \ + --no-collector.netstat \ 
+ --no-collector.nfs \ + --no-collector.nfsd \ + --no-collector.pressure \ + --no-collector.rapl \ + --no-collector.schedstat \ + --no-collector.sockstat \ + --no-collector.softnet \ + --no-collector.stat \ + --no-collector.textfile \ + --no-collector.thermal_zone \ + --no-collector.time \ + --no-collector.timex \ + --no-collector.udp_queues \ + --no-collector.uname \ + --collector.vmstat \ + --no-collector.xfs \ + --no-collector.zfs \ + --no-collector.systemd \ + --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker)($|/) +Restart=on-failure +RestartSec=5s + +[Install] +WantedBy=minimal.target diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/process-exporter.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/process-exporter.service new file mode 100644 index 00000000..30b1257c --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/process-exporter.service @@ -0,0 +1,19 @@ +[Unit] +Description=Prometheus Process Exporter +Documentation=https://github.com/ncabatoff/process-exporter +After=network-online.target searcher-container.service +Wants=network-online.target + +[Service] +Type=simple +User=prometheus +Group=prometheus +ExecStart=/usr/bin/prometheus-process-exporter \ + --web.listen-address=127.0.0.1:9256 \ + --config.path=/etc/prometheus/process-exporter.yml \ + --children +Restart=on-failure +RestartSec=5s + +[Install] +WantedBy=minimal.target diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service new file mode 100644 index 00000000..fb6397ca --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service @@ -0,0 +1,25 @@ +[Unit] +Description=Prometheus Monitoring System +Documentation=https://prometheus.io/docs/introduction/overview/ +After=network-online.target fetch-observability-config.service 
+Wants=network-online.target +Requires=fetch-observability-config.service + +[Service] +Type=simple +User=prometheus +Group=prometheus +ExecStartPre=+/usr/bin/gomplate -f /etc/prometheus/prometheus.yml.tmpl -o /etc/prometheus/prometheus.yml -d config=/etc/flashbox/observability-config.json +ExecStart=/usr/bin/prometheus \ + --config.file=/etc/prometheus/prometheus.yml \ + --storage.tsdb.path=/var/lib/prometheus/ \ + --storage.tsdb.retention.time=24h \ + --web.console.templates=/usr/share/prometheus/consoles \ + --web.console.libraries=/usr/share/prometheus/console_libraries \ + --web.listen-address=127.0.0.1:9090 +ExecReload=/bin/kill -HUP $MAINPID +Restart=on-failure +RestartSec=5s + +[Install] +WantedBy=minimal.target diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf new file mode 100644 index 00000000..b11c3917 --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf @@ -0,0 +1,3 @@ +[Unit] +After=fetch-observability-config.service +Wants=fetch-observability-config.service diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh b/modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh new file mode 100755 index 00000000..f319a28f --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh @@ -0,0 +1,121 @@ +#!/bin/sh +set -eu -o pipefail + +# Fetches observability configuration (metrics endpoint credentials) and writes: +# /etc/flashbox/observability-config.json — consumed by gomplate for Prometheus config +# /etc/flashbox/observability.env — sourced by firewall for metrics endpoint IP +# +# On failure: logs a warning and writes empty defaults. Prometheus runs locally +# without remote_write. 
This is intentional — observability should never block boot. + +OBSERVABILITY_CONFIG_PATH=/etc/flashbox/observability-config.json +OBSERVABILITY_ENV_PATH=/etc/flashbox/observability.env + +write_config() { + local url="${1:-}" + local username="${2:-}" + local password="${3:-}" + + # Extract IP for firewall rules + local metrics_endpoint="" + if [ -n "$url" ]; then + metrics_endpoint=$(echo "$url" | grep -oE '[0-9]{1,3}(\.[0-9]{1,3}){3}' | head -1 || true) + fi + + mkdir -p /etc/flashbox + + # JSON config for Prometheus gomplate template + cat < "$OBSERVABILITY_CONFIG_PATH" +{ + "remote_write_flashbots_url": "${url}", + "remote_write_flashbots_username": "${username}", + "remote_write_flashbots_password": "${password}", + "remote_write_flashbots_auth": $([ -n "${username}" ] && echo '"true"' || echo '""') +} +EOF + + # Env file for firewall (sourced by init-firewall.sh) + cat < "$OBSERVABILITY_ENV_PATH" +METRICS_ENDPOINT='${metrics_endpoint}' +EOF + + echo "Observability config written (endpoint: ${metrics_endpoint:-none})" +} + +# Don't override if config already exists +if [ -f "$OBSERVABILITY_CONFIG_PATH" ]; then + echo "Observability config already exists, skipping" + exit 0 +fi + +# Local QEMU dev: no remote_write +if dmidecode -s system-manufacturer 2>/dev/null | grep -q "QEMU" && \ + [ -f /etc/systemd/system/serial-console.service ]; then + echo "QEMU dev environment, writing empty observability config" + write_config "" "" "" + exit 0 +fi + +# Production: fetch from Vault (non-fatal on failure) +echo "Fetching observability config from Vault..." + +fetch_metadata_value() { + curl -sf \ + --header "Metadata-Flavor: Google" \ + "http://metadata/computeMetadata/v1/instance/attributes/$1" +} + +if ! instance_name=$(fetch_metadata_value "name") || \ + ! vault_addr=$(fetch_metadata_value "vault_addr") || \ + ! vault_auth_mount=$(fetch_metadata_value "vault_auth_mount_gcp") || \ + ! vault_kv_path=$(fetch_metadata_value "vault_kv_path") || \ + ! 
vault_kv_common_suffix=$(fetch_metadata_value "vault_kv_common_suffix"); then + echo "WARNING: Could not fetch GCP metadata, writing empty observability config" + write_config "" "" "" + exit 0 +fi + +# Authenticate with Vault using GCP identity +gcp_token=$(curl -sf \ + --header "Metadata-Flavor: Google" \ + --data-urlencode "audience=http://vault/$instance_name" \ + --data-urlencode "format=full" \ + "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || true + +if [ -z "${gcp_token:-}" ]; then + echo "WARNING: Could not get GCP identity token, writing empty observability config" + write_config "" "" "" + exit 0 +fi + +vault_token=$(curl -sf \ + --data "$(printf '{"role":"%s","jwt":"%s"}' "$instance_name" "$gcp_token")" \ + "${vault_addr}/v1/${vault_auth_mount}/login" | \ + jq -r .auth.client_token) || true + +if [ -z "${vault_token:-}" ]; then + echo "WARNING: Could not authenticate with Vault, writing empty observability config" + write_config "" "" "" + exit 0 +fi + +# Fetch common data (observability keys live here) +common_data=$(curl -sf \ + --header "X-Vault-Token: ${vault_token}" \ + "${vault_addr}/v1/${vault_kv_path}/node/${vault_kv_common_suffix}" | + jq -c .data.data) || true + +if [ -z "${common_data:-}" ]; then + echo "WARNING: Could not fetch Vault data, writing empty observability config" + write_config "" "" "" + exit 0 +fi + +get_value() { + echo "$common_data" | jq -rc --arg key "$1" '.[$key] // ""' +} + +write_config \ + "$(get_value metrics_flashbots_url)" \ + "$(get_value metrics_flashbots_username)" \ + "$(get_value metrics_flashbots_password)" diff --git a/modules/flashbox/observability/mkosi.postinst b/modules/flashbox/observability/mkosi.postinst new file mode 100755 index 00000000..e0d88484 --- /dev/null +++ b/modules/flashbox/observability/mkosi.postinst @@ -0,0 +1,12 @@ +#!/bin/bash +set -euxo pipefail + +# Ensure prometheus owns its data directory +mkosi-chroot chown -R prometheus:prometheus 
/var/lib/prometheus + +# Enable observability services +mkosi-chroot systemctl add-wants minimal.target \ + fetch-observability-config.service \ + prometheus.service \ + node-exporter.service \ + process-exporter.service From 1c22ca5ae122cdc82040a6c547525410982582db Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Tue, 21 Apr 2026 15:18:21 -0300 Subject: [PATCH 02/15] Add obs to L1 also --- images/flashbox-l1.conf | 1 + .../flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/images/flashbox-l1.conf b/images/flashbox-l1.conf index 42f61ce8..a5d07dfd 100644 --- a/images/flashbox-l1.conf +++ b/images/flashbox-l1.conf @@ -2,6 +2,7 @@ Include=shared/mkosi.conf Include=modules/flashbox/common/mkosi.conf Include=modules/flashbox/flashbox-l1/mkosi.conf +Include=modules/flashbox/observability/mkosi.conf [Config] Profiles=azure,gcp diff --git a/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config b/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config index 4d88d139..992a47a1 100644 --- a/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config +++ b/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config @@ -62,6 +62,11 @@ accept_dst_port $CHAIN_ALWAYS_OUT udp $CL_P2P_PORT "CL P2P (UDP)" accept_dst_port $CHAIN_ALWAYS_OUT udp $NTP_PORT "NTP" accept_dst_port $CHAIN_ALWAYS_OUT tcp $NTP_NTS_PORT "NTP-NTS" +# Observability metrics endpoint (loaded from /etc/flashbox/observability.env) +if [ -n "${METRICS_ENDPOINT:-}" ]; then + accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$METRICS_ENDPOINT" $HTTPS_PORT "Metrics endpoint (Flashbots)" +fi + # Titan builder bundle endpoints (always on) # Security note: This is a side channel. 
# While the operator will not be able to see the content of the packets, From 46218b601591d800768135579e71b22ceab85e7a Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Wed, 29 Apr 2026 19:59:27 -0300 Subject: [PATCH 03/15] Rework observability fetching config --- modules/flashbox/observability/mkosi.build | 12 -- modules/flashbox/observability/mkosi.conf | 9 +- ...rometheus.yml.tmpl => prometheus-base.yml} | 19 --- .../prometheus-remote-write.yml.tmpl | 13 ++ ...e => flashbox-observability-setup.service} | 4 +- .../etc/systemd/system/prometheus.service | 4 +- .../needs-observability.conf | 4 +- .../usr/bin/fetch-observability-config.sh | 121 ------------------ .../usr/bin/flashbox-observability-setup | 109 ++++++++++++++++ .../mkosi.extra/usr/lib/flashbox/metadata.sh | 10 ++ .../mkosi.extra/usr/lib/flashbox/render.sh | 8 ++ .../mkosi.extra/usr/lib/flashbox/vault.sh | 37 ++++++ modules/flashbox/observability/mkosi.postinst | 2 +- 13 files changed, 184 insertions(+), 168 deletions(-) delete mode 100755 modules/flashbox/observability/mkosi.build rename modules/flashbox/observability/mkosi.extra/etc/prometheus/{prometheus.yml.tmpl => prometheus-base.yml} (51%) create mode 100644 modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl rename modules/flashbox/observability/mkosi.extra/etc/systemd/system/{fetch-observability-config.service => flashbox-observability-setup.service} (59%) delete mode 100755 modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh create mode 100755 modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup create mode 100644 modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/metadata.sh create mode 100644 modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh create mode 100644 modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh diff --git 
a/modules/flashbox/observability/mkosi.build b/modules/flashbox/observability/mkosi.build deleted file mode 100755 index e8f52796..00000000 --- a/modules/flashbox/observability/mkosi.build +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -euxo pipefail - -source scripts/make_git_package.sh - -# Build gomplate (template engine for Prometheus config) -make_git_package \ - "gomplate" \ - "v4.3.3" \ - "https://github.com/hairyhenderson/gomplate" \ - 'go build -trimpath -ldflags "-s -w -buildid=" -o ./build/gomplate ./cmd/gomplate' \ - "build/gomplate:/usr/bin/gomplate" diff --git a/modules/flashbox/observability/mkosi.conf b/modules/flashbox/observability/mkosi.conf index f23f464c..f45d7c01 100644 --- a/modules/flashbox/observability/mkosi.conf +++ b/modules/flashbox/observability/mkosi.conf @@ -1,15 +1,8 @@ -[Build] -WithNetwork=true - [Content] ExtraTrees=modules/flashbox/observability/mkosi.extra PostInstallationScripts=modules/flashbox/observability/mkosi.postinst -BuildScripts=modules/flashbox/observability/mkosi.build Packages=prometheus prometheus-node-exporter prometheus-process-exporter - -BuildPackages=build-essential - git - golang + gettext-base diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml similarity index 51% rename from modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl rename to modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml index 2e2dc00d..afea6058 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml @@ -22,22 +22,3 @@ scrape_configs: - job_name: 'process' static_configs: - targets: ['localhost:9256'] - -{{- $config := (datasource "config") }} -{{- if $config.remote_write_flashbots_url }} - -# Remote write configuration (dynamically configured) 
-remote_write: - # Flashbots endpoint - - url: {{ $config.remote_write_flashbots_url }} - write_relabel_configs: - # Only send flashbox: prefixed metrics - - source_labels: [__name__] - regex: 'flashbox:.*' - action: keep - {{- if $config.remote_write_flashbots_auth }} - basic_auth: - username: {{ $config.remote_write_flashbots_username }} - password: {{ $config.remote_write_flashbots_password }} - {{- end }} -{{- end }} diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl new file mode 100644 index 00000000..cb7892c8 --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl @@ -0,0 +1,13 @@ + +# Remote write configuration (dynamically configured) +remote_write: + # Flashbots endpoint + - url: ${METRICS_URL} + write_relabel_configs: + # Only send flashbox: prefixed metrics + - source_labels: [__name__] + regex: 'flashbox:.*' + action: keep + basic_auth: + username: ${METRICS_USERNAME} + password: ${METRICS_PASSWORD} diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/fetch-observability-config.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/flashbox-observability-setup.service similarity index 59% rename from modules/flashbox/observability/mkosi.extra/etc/systemd/system/fetch-observability-config.service rename to modules/flashbox/observability/mkosi.extra/etc/systemd/system/flashbox-observability-setup.service index f45ccb49..a8fcf4f7 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/fetch-observability-config.service +++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/flashbox-observability-setup.service @@ -1,11 +1,11 @@ [Unit] -Description=Fetch observability configuration +Description=Flashbox observability setup (fetch creds, render Prometheus config) After=network-online.target 
Wants=network-online.target [Service] Type=oneshot -ExecStart=/usr/bin/fetch-observability-config.sh +ExecStart=/usr/bin/flashbox-observability-setup RemainAfterExit=yes StandardOutput=journal StandardError=journal diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service index fb6397ca..f584fdc4 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service +++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service @@ -1,15 +1,13 @@ [Unit] Description=Prometheus Monitoring System Documentation=https://prometheus.io/docs/introduction/overview/ -After=network-online.target fetch-observability-config.service +After=network-online.target flashbox-observability-setup.service Wants=network-online.target -Requires=fetch-observability-config.service [Service] Type=simple User=prometheus Group=prometheus -ExecStartPre=+/usr/bin/gomplate -f /etc/prometheus/prometheus.yml.tmpl -o /etc/prometheus/prometheus.yml -d config=/etc/flashbox/observability-config.json ExecStart=/usr/bin/prometheus \ --config.file=/etc/prometheus/prometheus.yml \ --storage.tsdb.path=/var/lib/prometheus/ \ diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf index b11c3917..3940ab16 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf +++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf @@ -1,3 +1,3 @@ [Unit] -After=fetch-observability-config.service -Wants=fetch-observability-config.service +After=flashbox-observability-setup.service +Wants=flashbox-observability-setup.service diff --git 
a/modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh b/modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh deleted file mode 100755 index f319a28f..00000000 --- a/modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/sh -set -eu -o pipefail - -# Fetches observability configuration (metrics endpoint credentials) and writes: -# /etc/flashbox/observability-config.json — consumed by gomplate for Prometheus config -# /etc/flashbox/observability.env — sourced by firewall for metrics endpoint IP -# -# On failure: logs a warning and writes empty defaults. Prometheus runs locally -# without remote_write. This is intentional — observability should never block boot. - -OBSERVABILITY_CONFIG_PATH=/etc/flashbox/observability-config.json -OBSERVABILITY_ENV_PATH=/etc/flashbox/observability.env - -write_config() { - local url="${1:-}" - local username="${2:-}" - local password="${3:-}" - - # Extract IP for firewall rules - local metrics_endpoint="" - if [ -n "$url" ]; then - metrics_endpoint=$(echo "$url" | grep -oE '[0-9]{1,3}(\.[0-9]{1,3}){3}' | head -1 || true) - fi - - mkdir -p /etc/flashbox - - # JSON config for Prometheus gomplate template - cat < "$OBSERVABILITY_CONFIG_PATH" -{ - "remote_write_flashbots_url": "${url}", - "remote_write_flashbots_username": "${username}", - "remote_write_flashbots_password": "${password}", - "remote_write_flashbots_auth": $([ -n "${username}" ] && echo '"true"' || echo '""') -} -EOF - - # Env file for firewall (sourced by init-firewall.sh) - cat < "$OBSERVABILITY_ENV_PATH" -METRICS_ENDPOINT='${metrics_endpoint}' -EOF - - echo "Observability config written (endpoint: ${metrics_endpoint:-none})" -} - -# Don't override if config already exists -if [ -f "$OBSERVABILITY_CONFIG_PATH" ]; then - echo "Observability config already exists, skipping" - exit 0 -fi - -# Local QEMU dev: no remote_write -if dmidecode -s 
system-manufacturer 2>/dev/null | grep -q "QEMU" && \ - [ -f /etc/systemd/system/serial-console.service ]; then - echo "QEMU dev environment, writing empty observability config" - write_config "" "" "" - exit 0 -fi - -# Production: fetch from Vault (non-fatal on failure) -echo "Fetching observability config from Vault..." - -fetch_metadata_value() { - curl -sf \ - --header "Metadata-Flavor: Google" \ - "http://metadata/computeMetadata/v1/instance/attributes/$1" -} - -if ! instance_name=$(fetch_metadata_value "name") || \ - ! vault_addr=$(fetch_metadata_value "vault_addr") || \ - ! vault_auth_mount=$(fetch_metadata_value "vault_auth_mount_gcp") || \ - ! vault_kv_path=$(fetch_metadata_value "vault_kv_path") || \ - ! vault_kv_common_suffix=$(fetch_metadata_value "vault_kv_common_suffix"); then - echo "WARNING: Could not fetch GCP metadata, writing empty observability config" - write_config "" "" "" - exit 0 -fi - -# Authenticate with Vault using GCP identity -gcp_token=$(curl -sf \ - --header "Metadata-Flavor: Google" \ - --data-urlencode "audience=http://vault/$instance_name" \ - --data-urlencode "format=full" \ - "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || true - -if [ -z "${gcp_token:-}" ]; then - echo "WARNING: Could not get GCP identity token, writing empty observability config" - write_config "" "" "" - exit 0 -fi - -vault_token=$(curl -sf \ - --data "$(printf '{"role":"%s","jwt":"%s"}' "$instance_name" "$gcp_token")" \ - "${vault_addr}/v1/${vault_auth_mount}/login" | \ - jq -r .auth.client_token) || true - -if [ -z "${vault_token:-}" ]; then - echo "WARNING: Could not authenticate with Vault, writing empty observability config" - write_config "" "" "" - exit 0 -fi - -# Fetch common data (observability keys live here) -common_data=$(curl -sf \ - --header "X-Vault-Token: ${vault_token}" \ - "${vault_addr}/v1/${vault_kv_path}/node/${vault_kv_common_suffix}" | - jq -c .data.data) || true - -if [ -z "${common_data:-}" ]; then 
- echo "WARNING: Could not fetch Vault data, writing empty observability config" - write_config "" "" "" - exit 0 -fi - -get_value() { - echo "$common_data" | jq -rc --arg key "$1" '.[$key] // ""' -} - -write_config \ - "$(get_value metrics_flashbots_url)" \ - "$(get_value metrics_flashbots_username)" \ - "$(get_value metrics_flashbots_password)" diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup new file mode 100755 index 00000000..162e99ed --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup @@ -0,0 +1,109 @@ +#!/bin/sh +set -eu -o pipefail + +# Boot-time observability setup: +# - reads instance metadata +# - authenticates to Vault (GCP backend) and fetches remote_write creds +# - writes /etc/flashbox/observability.env (sourced by firewall) +# - renders /etc/prometheus/prometheus.yml from the base config plus an +# optional remote_write fragment +# +# On any failure (Vault unreachable, missing secret, missing fields) this +# script writes sane defaults and exits 0. Boot must never fail because of +# observability. + +OBSERVABILITY_DIR=/etc/flashbox +OBSERVABILITY_ENV_PATH="$OBSERVABILITY_DIR/observability.env" + +PROMETHEUS_CONFIG=/etc/prometheus/prometheus.yml +PROMETHEUS_BASE=/etc/prometheus/prometheus-base.yml +PROMETHEUS_REMOTE_WRITE=/etc/prometheus/prometheus-remote-write.yml.tmpl + +. /usr/lib/flashbox/metadata.sh +. /usr/lib/flashbox/vault.sh +. /usr/lib/flashbox/render.sh + +mkdir -p "$OBSERVABILITY_DIR" + +write_firewall_env() { + local endpoint="${1:-}" + cat > "$OBSERVABILITY_ENV_PATH" </dev/null | grep -q "QEMU" && \ + [ -f /etc/systemd/system/serial-console.service ]; then + echo "QEMU dev environment, skipping observability config fetch" + exit 0 +fi + +echo "Fetching observability config from Vault..." 
+ +instance_name=$(gce_metadata_get name) || { + echo "WARNING: could not fetch instance name from GCE metadata, skipping" + exit 0 +} +vault_addr=$(gce_metadata_get attributes/vault_addr) || { + echo "WARNING: could not fetch vault_addr from GCE metadata, skipping" + exit 0 +} +vault_auth_mount=$(gce_metadata_get attributes/vault_auth_mount_gcp) || { + echo "WARNING: could not fetch vault_auth_mount_gcp from GCE metadata, skipping" + exit 0 +} +vault_kv_path=$(gce_metadata_get attributes/vault_kv_path) || { + echo "WARNING: could not fetch vault_kv_path from GCE metadata, skipping" + exit 0 +} +vault_kv_common_suffix=$(gce_metadata_get attributes/vault_kv_common_suffix) || { + echo "WARNING: could not fetch vault_kv_common_suffix from GCE metadata, skipping" + exit 0 +} + +vault_token=$(vault_login_gcp "$instance_name" "$vault_addr" "$vault_auth_mount") || { + echo "WARNING: could not authenticate with Vault, skipping" + exit 0 +} + +secret_data=$(vault_kv_get "$vault_addr" "$vault_token" "${vault_kv_path}/node/${vault_kv_common_suffix}") || { + echo "WARNING: could not fetch Vault data, skipping" + exit 0 +} + +METRICS_URL=$(echo "$secret_data" | jq -rc '.metrics_flashbots_url // ""') +METRICS_USERNAME=$(echo "$secret_data" | jq -rc '.metrics_flashbots_username // ""') +METRICS_PASSWORD=$(echo "$secret_data" | jq -rc '.metrics_flashbots_password // ""') + +if [ -z "$METRICS_URL" ]; then + echo "No metrics URL configured, remote_write disabled" + exit 0 +fi + +if [ -z "$METRICS_USERNAME" ] || [ -z "$METRICS_PASSWORD" ]; then + echo "WARNING: metrics URL set but basic_auth credentials are missing, remote_write disabled" + exit 0 +fi + +# Extract IPv4 literal for firewall egress allowlist. 
+endpoint=$(echo "$METRICS_URL" | grep -oE '[0-9]{1,3}(\.[0-9]{1,3}){3}' | head -1 || true) +if [ -z "$endpoint" ]; then + echo "WARNING: metrics URL is not an IPv4 literal, firewall egress will not be opened — remote_write will be blocked" +fi + +write_firewall_env "$endpoint" + +# Append the remote_write fragment to the base prometheus config. +export METRICS_URL METRICS_USERNAME METRICS_PASSWORD +fragment=$(mktemp) +trap 'rm -f "$fragment"' EXIT + +render_envsubst "$PROMETHEUS_REMOTE_WRITE" "$fragment" +cat "$fragment" >> "$PROMETHEUS_CONFIG" + +echo "Observability config written (endpoint: ${endpoint:-none})" diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/metadata.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/metadata.sh new file mode 100644 index 00000000..7b36b1db --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/metadata.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# Helpers for reading GCE instance metadata. + +# Read a single attribute from the GCE metadata server. +# Args: $1 = relative path under /computeMetadata/v1/instance/ +gce_metadata_get() { + curl -sf \ + --header "Metadata-Flavor: Google" \ + "http://metadata/computeMetadata/v1/instance/$1" +} diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh new file mode 100644 index 00000000..ffeb4c8f --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh @@ -0,0 +1,8 @@ +#!/bin/sh +# Template rendering helpers. + +# Render a template by substituting ${VAR} references from the environment. 
+# Args: $1 = template path, $2 = output path +render_envsubst() { + envsubst < "$1" > "$2" +} diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh new file mode 100644 index 00000000..aece9e5b --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# Vault auth (GCP backend) and KV reads. + +# Authenticate to Vault using the GCE instance identity JWT. +# Args: $1 = instance name (used as audience and role), $2 = vault addr, +# $3 = vault auth mount path (e.g. "auth/gcp/l1-flashbox") +# Stdout: vault client token. Returns non-zero on failure. +vault_login_gcp() { + local instance_name="$1" vault_addr="$2" auth_mount="$3" + + local gcp_token + gcp_token=$(curl -sf \ + --header "Metadata-Flavor: Google" \ + --data-urlencode "audience=http://vault/${instance_name}" \ + --data-urlencode "format=full" \ + "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || return 1 + + [ -n "$gcp_token" ] || return 1 + + curl -sf \ + --data "$(printf '{"role":"%s","jwt":"%s"}' "$instance_name" "$gcp_token")" \ + "${vault_addr}/v1/${auth_mount}/login" \ + | jq -re .auth.client_token +} + +# Read a Vault KV v2 secret. +# Args: $1 = vault addr, $2 = vault token, $3 = full KV API path +# (e.g. "secret/data/foo/bar") +# Stdout: the secret's `.data.data` JSON. Returns non-zero on failure. 
+vault_kv_get() { + local vault_addr="$1" token="$2" path="$3" + + curl -sf \ + --header "X-Vault-Token: ${token}" \ + "${vault_addr}/v1/${path}" \ + | jq -ce .data.data +} diff --git a/modules/flashbox/observability/mkosi.postinst b/modules/flashbox/observability/mkosi.postinst index e0d88484..ec872aa6 100755 --- a/modules/flashbox/observability/mkosi.postinst +++ b/modules/flashbox/observability/mkosi.postinst @@ -6,7 +6,7 @@ mkosi-chroot chown -R prometheus:prometheus /var/lib/prometheus # Enable observability services mkosi-chroot systemctl add-wants minimal.target \ - fetch-observability-config.service \ + flashbox-observability-setup.service \ prometheus.service \ node-exporter.service \ process-exporter.service From d68cfde146ec3cf220448cccadcf2bb5298865a0 Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Wed, 29 Apr 2026 20:52:28 -0300 Subject: [PATCH 04/15] Refactor to make it cleaner --- .../prometheus-remote-write.yml.tmpl | 6 +- .../usr/bin/flashbox-observability-setup | 64 ++++--------------- .../mkosi.extra/usr/lib/flashbox/metadata.sh | 10 --- .../mkosi.extra/usr/lib/flashbox/render.sh | 21 ++++-- .../mkosi.extra/usr/lib/flashbox/vault.sh | 61 ++++++++++++------ 5 files changed, 72 insertions(+), 90 deletions(-) delete mode 100644 modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/metadata.sh diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl index cb7892c8..584cf83a 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl @@ -2,12 +2,12 @@ # Remote write configuration (dynamically configured) remote_write: # Flashbots endpoint - - url: ${METRICS_URL} + - url: ${METRICS_FLASHBOTS_URL} write_relabel_configs: # Only 
send flashbox: prefixed metrics - source_labels: [__name__] regex: 'flashbox:.*' action: keep basic_auth: - username: ${METRICS_USERNAME} - password: ${METRICS_PASSWORD} + username: ${METRICS_FLASHBOTS_USERNAME} + password: ${METRICS_FLASHBOTS_PASSWORD} diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup index 162e99ed..0f800316 100755 --- a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup +++ b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup @@ -1,12 +1,8 @@ #!/bin/sh set -eu -o pipefail -# Boot-time observability setup: -# - reads instance metadata -# - authenticates to Vault (GCP backend) and fetches remote_write creds -# - writes /etc/flashbox/observability.env (sourced by firewall) -# - renders /etc/prometheus/prometheus.yml from the base config plus an -# optional remote_write fragment +# Boot-time observability setup: fetch metrics creds from Vault, render +# /etc/prometheus/prometheus.yml, and write the firewall env file. # # On any failure (Vault unreachable, missing secret, missing fields) this # script writes sane defaults and exits 0. Boot must never fail because of @@ -19,16 +15,14 @@ PROMETHEUS_CONFIG=/etc/prometheus/prometheus.yml PROMETHEUS_BASE=/etc/prometheus/prometheus-base.yml PROMETHEUS_REMOTE_WRITE=/etc/prometheus/prometheus-remote-write.yml.tmpl -. /usr/lib/flashbox/metadata.sh . /usr/lib/flashbox/vault.sh . 
/usr/lib/flashbox/render.sh mkdir -p "$OBSERVABILITY_DIR" write_firewall_env() { - local endpoint="${1:-}" cat > "$OBSERVABILITY_ENV_PATH" <> "$PROMETHEUS_CONFIG" +render_template "$PROMETHEUS_CONFIG" \ + "METRICS_FLASHBOTS_URL METRICS_FLASHBOTS_USERNAME METRICS_FLASHBOTS_PASSWORD" \ + "$PROMETHEUS_BASE" "$PROMETHEUS_REMOTE_WRITE" echo "Observability config written (endpoint: ${endpoint:-none})" diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/metadata.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/metadata.sh deleted file mode 100644 index 7b36b1db..00000000 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/metadata.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -# Helpers for reading GCE instance metadata. - -# Read a single attribute from the GCE metadata server. -# Args: $1 = relative path under /computeMetadata/v1/instance/ -gce_metadata_get() { - curl -sf \ - --header "Metadata-Flavor: Google" \ - "http://metadata/computeMetadata/v1/instance/$1" -} diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh index ffeb4c8f..d2fe0da2 100644 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh @@ -1,8 +1,21 @@ #!/bin/sh # Template rendering helpers. -# Render a template by substituting ${VAR} references from the environment. -# Args: $1 = template path, $2 = output path -render_envsubst() { - envsubst < "$1" > "$2" +# Concatenate one or more templates and render the result with envsubst, +# substituting only the env vars in the explicit allowlist. +# +# Args: $1 = output path +# $2 = space-separated list of var names to substitute (e.g. +# "METRICS_URL METRICS_USERNAME") +# $3.. 
= template files to concatenate +render_template() { + local out="$1" vars="$2" + shift 2 + + local allowlist="" v + for v in $vars; do + allowlist="${allowlist}\$${v} " + done + + cat "$@" | envsubst "$allowlist" > "$out" } diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh index aece9e5b..8641f09d 100644 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh @@ -1,12 +1,28 @@ #!/bin/sh -# Vault auth (GCP backend) and KV reads. +# Vault auth (GCP backend) and secret fetch. +# +# Reads bootstrap config from GCE instance metadata, authenticates to Vault +# using the instance identity JWT, fetches the shared secret blob, and +# exports every key in the blob as an uppercase env var. -# Authenticate to Vault using the GCE instance identity JWT. -# Args: $1 = instance name (used as audience and role), $2 = vault addr, -# $3 = vault auth mount path (e.g. "auth/gcp/l1-flashbox") -# Stdout: vault client token. Returns non-zero on failure. -vault_login_gcp() { - local instance_name="$1" vault_addr="$2" auth_mount="$3" +_gce_metadata_get() { + curl -sf --header "Metadata-Flavor: Google" \ + "http://metadata/computeMetadata/v1/instance/$1" +} + +# Authenticate to Vault and fetch the shared secret blob. Each key in the +# secret is exported as `=` (e.g. `metrics_flashbots_url` +# becomes `METRICS_FLASHBOTS_URL`). +# +# Returns non-zero on any failure (metadata unreachable, auth failure, +# secret not found, malformed response). Exports nothing in that case. 
+vault_fetch() { + local instance_name vault_addr auth_mount kv_path kv_common_suffix + instance_name=$(_gce_metadata_get name) || return 1 + vault_addr=$(_gce_metadata_get attributes/vault_addr) || return 1 + auth_mount=$(_gce_metadata_get attributes/vault_auth_mount_gcp) || return 1 + kv_path=$(_gce_metadata_get attributes/vault_kv_path) || return 1 + kv_common_suffix=$(_gce_metadata_get attributes/vault_kv_common_suffix) || return 1 local gcp_token gcp_token=$(curl -sf \ @@ -14,24 +30,27 @@ vault_login_gcp() { --data-urlencode "audience=http://vault/${instance_name}" \ --data-urlencode "format=full" \ "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || return 1 - [ -n "$gcp_token" ] || return 1 - curl -sf \ + local vault_token + vault_token=$(curl -sf \ --data "$(printf '{"role":"%s","jwt":"%s"}' "$instance_name" "$gcp_token")" \ "${vault_addr}/v1/${auth_mount}/login" \ - | jq -re .auth.client_token -} + | jq -re .auth.client_token) || return 1 + + local secret_data + secret_data=$(curl -sf \ + --header "X-Vault-Token: ${vault_token}" \ + "${vault_addr}/v1/${kv_path}/node/${kv_common_suffix}" \ + | jq -ce .data.data) || return 1 -# Read a Vault KV v2 secret. -# Args: $1 = vault addr, $2 = vault token, $3 = full KV API path -# (e.g. "secret/data/foo/bar") -# Stdout: the secret's `.data.data` JSON. Returns non-zero on failure. 
-vault_kv_get() { - local vault_addr="$1" token="$2" path="$3" + local keys + keys=$(echo "$secret_data" | jq -r 'keys[]') || return 1 - curl -sf \ - --header "X-Vault-Token: ${token}" \ - "${vault_addr}/v1/${path}" \ - | jq -ce .data.data + local key upper_key value + for key in $keys; do + upper_key=$(echo "$key" | tr '[:lower:]' '[:upper:]') + value=$(echo "$secret_data" | jq -rc --arg k "$key" '.[$k] // ""') + export "${upper_key}=${value}" + done } From a3571d2aa809efd19df4928fb102df096a70ce34 Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Wed, 29 Apr 2026 21:00:35 -0300 Subject: [PATCH 05/15] Remove uppercasing logic --- .../mkosi.extra/usr/lib/flashbox/vault.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh index 8641f09d..1d18e78b 100644 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh @@ -11,8 +11,9 @@ _gce_metadata_get() { } # Authenticate to Vault and fetch the shared secret blob. Each key in the -# secret is exported as `=` (e.g. `metrics_flashbots_url` -# becomes `METRICS_FLASHBOTS_URL`). +# secret is exported as `=` verbatim — store keys in Vault with +# the exact casing you want as the env var name (UPPER_SNAKE_CASE by +# convention). # # Returns non-zero on any failure (metadata unreachable, auth failure, # secret not found, malformed response). Exports nothing in that case. 
@@ -47,10 +48,9 @@ vault_fetch() { local keys keys=$(echo "$secret_data" | jq -r 'keys[]') || return 1 - local key upper_key value + local key value for key in $keys; do - upper_key=$(echo "$key" | tr '[:lower:]' '[:upper:]') value=$(echo "$secret_data" | jq -rc --arg k "$key" '.[$k] // ""') - export "${upper_key}=${value}" + export "${key}=${value}" done } From 5942584e5b45d532affb8c27ed7aadee94a0ae10 Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Wed, 29 Apr 2026 23:36:40 -0300 Subject: [PATCH 06/15] Add logging to vault fetch script --- .../mkosi.extra/usr/lib/flashbox/vault.sh | 36 ++++++++++++------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh index 1d18e78b..276902a1 100644 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh @@ -3,7 +3,7 @@ # # Reads bootstrap config from GCE instance metadata, authenticates to Vault # using the instance identity JWT, fetches the shared secret blob, and -# exports every key in the blob as an uppercase env var. +# exports every key in the blob as an env var. _gce_metadata_get() { curl -sf --header "Metadata-Flavor: Google" \ @@ -15,38 +15,48 @@ _gce_metadata_get() { # the exact casing you want as the env var name (UPPER_SNAKE_CASE by # convention). # -# Returns non-zero on any failure (metadata unreachable, auth failure, -# secret not found, malformed response). Exports nothing in that case. +# On failure, logs a specific reason to stderr and returns non-zero without +# exporting anything. 
vault_fetch() { local instance_name vault_addr auth_mount kv_path kv_common_suffix - instance_name=$(_gce_metadata_get name) || return 1 - vault_addr=$(_gce_metadata_get attributes/vault_addr) || return 1 - auth_mount=$(_gce_metadata_get attributes/vault_auth_mount_gcp) || return 1 - kv_path=$(_gce_metadata_get attributes/vault_kv_path) || return 1 - kv_common_suffix=$(_gce_metadata_get attributes/vault_kv_common_suffix) || return 1 + instance_name=$(_gce_metadata_get name) || { + echo "WARNING: vault_fetch: could not read GCE metadata 'name'" >&2; return 1; } + vault_addr=$(_gce_metadata_get attributes/vault_addr) || { + echo "WARNING: vault_fetch: could not read GCE metadata 'vault_addr'" >&2; return 1; } + auth_mount=$(_gce_metadata_get attributes/vault_auth_mount_gcp) || { + echo "WARNING: vault_fetch: could not read GCE metadata 'vault_auth_mount_gcp'" >&2; return 1; } + kv_path=$(_gce_metadata_get attributes/vault_kv_path) || { + echo "WARNING: vault_fetch: could not read GCE metadata 'vault_kv_path'" >&2; return 1; } + kv_common_suffix=$(_gce_metadata_get attributes/vault_kv_common_suffix) || { + echo "WARNING: vault_fetch: could not read GCE metadata 'vault_kv_common_suffix'" >&2; return 1; } local gcp_token gcp_token=$(curl -sf \ --header "Metadata-Flavor: Google" \ --data-urlencode "audience=http://vault/${instance_name}" \ --data-urlencode "format=full" \ - "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || return 1 - [ -n "$gcp_token" ] || return 1 + "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || { + echo "WARNING: vault_fetch: could not obtain GCP identity JWT from metadata server" >&2; return 1; } + [ -n "$gcp_token" ] || { + echo "WARNING: vault_fetch: GCP identity JWT was empty" >&2; return 1; } local vault_token vault_token=$(curl -sf \ --data "$(printf '{"role":"%s","jwt":"%s"}' "$instance_name" "$gcp_token")" \ "${vault_addr}/v1/${auth_mount}/login" \ - | jq -re 
.auth.client_token) || return 1 + | jq -re .auth.client_token) || { + echo "WARNING: vault_fetch: Vault GCP login failed (role=${instance_name}, mount=${auth_mount})" >&2; return 1; } local secret_data secret_data=$(curl -sf \ --header "X-Vault-Token: ${vault_token}" \ "${vault_addr}/v1/${kv_path}/node/${kv_common_suffix}" \ - | jq -ce .data.data) || return 1 + | jq -ce .data.data) || { + echo "WARNING: vault_fetch: KV read failed at ${kv_path}/node/${kv_common_suffix}" >&2; return 1; } local keys - keys=$(echo "$secret_data" | jq -r 'keys[]') || return 1 + keys=$(echo "$secret_data" | jq -r 'keys[]') || { + echo "WARNING: vault_fetch: could not enumerate keys in secret payload" >&2; return 1; } local key value for key in $keys; do From 637cd0a14e33fbf40230ff1a360faf35d8d17608 Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Tue, 12 May 2026 01:38:34 -0300 Subject: [PATCH 07/15] Add host label to prometheus metrics --- ...ometheus-base.yml => prometheus-base.yml.tmpl} | 5 +++++ .../usr/bin/flashbox-observability-setup | 15 +++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) rename modules/flashbox/observability/mkosi.extra/etc/prometheus/{prometheus-base.yml => prometheus-base.yml.tmpl} (76%) diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml.tmpl similarity index 76% rename from modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml rename to modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml.tmpl index afea6058..13bc8228 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml.tmpl @@ -1,6 +1,11 @@ global: scrape_interval: 15s evaluation_interval: 15s + # Stamped onto every series sent via remote_write so the central + # 
Prometheus / AMP can distinguish samples coming from different + # flashbox VMs. + external_labels: + host: ${FLASHBOX_VM} # Recording rules for aggregated metrics rule_files: diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup index 0f800316..4aa542f6 100755 --- a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup +++ b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup @@ -12,7 +12,7 @@ OBSERVABILITY_DIR=/etc/flashbox OBSERVABILITY_ENV_PATH="$OBSERVABILITY_DIR/observability.env" PROMETHEUS_CONFIG=/etc/prometheus/prometheus.yml -PROMETHEUS_BASE=/etc/prometheus/prometheus-base.yml +PROMETHEUS_BASE=/etc/prometheus/prometheus-base.yml.tmpl PROMETHEUS_REMOTE_WRITE=/etc/prometheus/prometheus-remote-write.yml.tmpl . /usr/lib/flashbox/vault.sh @@ -20,14 +20,21 @@ PROMETHEUS_REMOTE_WRITE=/etc/prometheus/prometheus-remote-write.yml.tmpl mkdir -p "$OBSERVABILITY_DIR" +# Read the GCE instance name and expose it to prometheus-base.yml.tmpl's +# external_labels — every remote_write sample is stamped with `host=<instance-name>` +# so the central Prometheus can distinguish flashbox VMs.
+FLASHBOX_VM=$(curl -sf --header "Metadata-Flavor: Google" \ + "http://metadata/computeMetadata/v1/instance/name" || echo "unknown") +export FLASHBOX_VM + write_firewall_env() { cat > "$OBSERVABILITY_ENV_PATH" < Date: Tue, 12 May 2026 02:21:34 -0300 Subject: [PATCH 08/15] Accept multiple endpoints for metrics in firewall + use sigv4 for auth to aws prom --- .../mkosi.extra/etc/bob/firewall-config | 8 ++-- .../mkosi.extra/etc/bob/firewall-config | 8 ++-- .../prometheus-remote-write.yml.tmpl | 7 ++-- .../usr/bin/flashbox-observability-setup | 41 ++++++++++++++----- 4 files changed, 45 insertions(+), 19 deletions(-) diff --git a/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config b/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config index 992a47a1..58e29fa3 100644 --- a/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config +++ b/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config @@ -62,9 +62,11 @@ accept_dst_port $CHAIN_ALWAYS_OUT udp $CL_P2P_PORT "CL P2P (UDP)" accept_dst_port $CHAIN_ALWAYS_OUT udp $NTP_PORT "NTP" accept_dst_port $CHAIN_ALWAYS_OUT tcp $NTP_NTS_PORT "NTP-NTS" -# Observability metrics endpoint (loaded from /etc/flashbox/observability.env) -if [ -n "${METRICS_ENDPOINT:-}" ]; then - accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$METRICS_ENDPOINT" $HTTPS_PORT "Metrics endpoint (Flashbots)" +# Observability metrics endpoints (loaded from /etc/flashbox/observability.env) +if [ -n "${METRICS_ENDPOINTS:-}" ]; then + for ip in $METRICS_ENDPOINTS; do + accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$ip" $HTTPS_PORT "Metrics endpoint (Flashbots)" + done fi # Titan builder bundle endpoints (always on) diff --git a/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config b/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config index 44a06dcb..2925cbde 100644 --- a/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config +++ b/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config @@ 
-50,9 +50,11 @@ accept_dst_port $CHAIN_ALWAYS_IN tcp $CVM_REVERSE_PROXY_PORT "CVM reverse-proxy" accept_dst_port $CHAIN_ALWAYS_OUT udp $NTP_PORT "NTP" accept_dst_port $CHAIN_ALWAYS_OUT tcp $NTP_NTS_PORT "NTP-NTS" -# Observability metrics endpoint (loaded from /etc/flashbox/observability.env) -if [ -n "${METRICS_ENDPOINT:-}" ]; then - accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$METRICS_ENDPOINT" $HTTPS_PORT "Metrics endpoint (Flashbots)" +# Observability metrics endpoints (loaded from /etc/flashbox/observability.env) +if [ -n "${METRICS_ENDPOINTS:-}" ]; then + for ip in $METRICS_ENDPOINTS; do + accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$ip" $HTTPS_PORT "Metrics endpoint (Flashbots)" + done fi ########################################################################### diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl index 584cf83a..478c05dd 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl @@ -8,6 +8,7 @@ remote_write: - source_labels: [__name__] regex: 'flashbox:.*' action: keep - basic_auth: - username: ${METRICS_FLASHBOTS_USERNAME} - password: ${METRICS_FLASHBOTS_PASSWORD} + sigv4: + region: ${METRICS_FLASHBOTS_REGION} + access_key: ${METRICS_FLASHBOTS_ACCESS_KEY} + secret_key: ${METRICS_FLASHBOTS_SECRET_KEY} diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup index 4aa542f6..5da3b4d3 100755 --- a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup +++ b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup @@ -29,10 +29,30 @@ export FLASHBOX_VM write_firewall_env() { cat > 
"$OBSERVABILITY_ENV_PATH" </dev/null | awk '{print $1}' | sort -u | tr '\n' ' ' | sed 's/ *$//' + fi +} + # Default state: base config rendered with external_labels, firewall closed. render_template "$PROMETHEUS_CONFIG" "FLASHBOX_VM" "$PROMETHEUS_BASE" write_firewall_env "" @@ -56,21 +76,22 @@ if [ -z "${METRICS_FLASHBOTS_URL:-}" ]; then exit 0 fi -if [ -z "${METRICS_FLASHBOTS_USERNAME:-}" ] || [ -z "${METRICS_FLASHBOTS_PASSWORD:-}" ]; then - echo "WARNING: metrics URL set but basic_auth credentials are missing, remote_write disabled" +if [ -z "${METRICS_FLASHBOTS_REGION:-}" ] \ + || [ -z "${METRICS_FLASHBOTS_ACCESS_KEY:-}" ] \ + || [ -z "${METRICS_FLASHBOTS_SECRET_KEY:-}" ]; then + echo "WARNING: metrics URL set but SigV4 credentials are incomplete (need REGION, ACCESS_KEY, SECRET_KEY), remote_write disabled" exit 0 fi -# Extract IPv4 literal for firewall egress allowlist. -endpoint=$(echo "$METRICS_FLASHBOTS_URL" | grep -oE '[0-9]{1,3}(\.[0-9]{1,3}){3}' | head -1 || true) -if [ -z "$endpoint" ]; then - echo "WARNING: metrics URL is not an IPv4 literal, firewall egress will not be opened — remote_write will be blocked" +endpoints=$(resolve_to_ips "$METRICS_FLASHBOTS_URL") +if [ -z "$endpoints" ]; then + echo "WARNING: could not resolve metrics URL host, firewall egress will not be opened — remote_write will be blocked" fi -write_firewall_env "$endpoint" +write_firewall_env "$endpoints" render_template "$PROMETHEUS_CONFIG" \ - "FLASHBOX_VM METRICS_FLASHBOTS_URL METRICS_FLASHBOTS_USERNAME METRICS_FLASHBOTS_PASSWORD" \ + "FLASHBOX_VM METRICS_FLASHBOTS_URL METRICS_FLASHBOTS_REGION METRICS_FLASHBOTS_ACCESS_KEY METRICS_FLASHBOTS_SECRET_KEY" \ "$PROMETHEUS_BASE" "$PROMETHEUS_REMOTE_WRITE" -echo "Observability config written (endpoint: ${endpoint:-none})" +echo "Observability config written (endpoints: ${endpoints:-none})" From bf2897ce1a500c3595b6f0194cae83d365be14f5 Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Tue, 12 May 
2026 02:28:12 -0300 Subject: [PATCH 09/15] Refactor obs firewall rules loading --- .../common/mkosi.extra/usr/bin/init-firewall.sh | 7 ++++--- .../mkosi.extra/etc/bob/firewall-config | 7 ------- .../mkosi.extra/etc/bob/firewall-config | 7 ------- .../etc/bob/firewall-config-observability | 15 +++++++++++++++ 4 files changed, 19 insertions(+), 17 deletions(-) create mode 100644 modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability diff --git a/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh b/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh index 67acef20..71befdfc 100755 --- a/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh +++ b/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh @@ -152,11 +152,12 @@ drop_dst_ip() { # `source` is not supported in dash ########################################################################### -# Load observability config if the module is included (metrics endpoint IP) -[ -f /etc/flashbox/observability.env ] && . /etc/flashbox/observability.env - . /etc/bob/firewall-config +# Observability rule (sourced only if the observability module is included +# in the image — it owns its own egress rule + env-file dependency). +[ -f /etc/bob/firewall-config-observability ] && .
/etc/bob/firewall-config-observability + ########################################################################### # (6) Start in Maintenance Mode ########################################################################### diff --git a/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config b/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config index 58e29fa3..4d88d139 100644 --- a/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config +++ b/modules/flashbox/flashbox-l1/mkosi.extra/etc/bob/firewall-config @@ -62,13 +62,6 @@ accept_dst_port $CHAIN_ALWAYS_OUT udp $CL_P2P_PORT "CL P2P (UDP)" accept_dst_port $CHAIN_ALWAYS_OUT udp $NTP_PORT "NTP" accept_dst_port $CHAIN_ALWAYS_OUT tcp $NTP_NTS_PORT "NTP-NTS" -# Observability metrics endpoints (loaded from /etc/flashbox/observability.env) -if [ -n "${METRICS_ENDPOINTS:-}" ]; then - for ip in $METRICS_ENDPOINTS; do - accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$ip" $HTTPS_PORT "Metrics endpoint (Flashbots)" - done -fi - # Titan builder bundle endpoints (always on) # Security note: This is a side channel. 
# While the operator will not be able to see the content of the packets, diff --git a/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config b/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config index 2925cbde..bee194c2 100644 --- a/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config +++ b/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config @@ -50,13 +50,6 @@ accept_dst_port $CHAIN_ALWAYS_IN tcp $CVM_REVERSE_PROXY_PORT "CVM reverse-proxy" accept_dst_port $CHAIN_ALWAYS_OUT udp $NTP_PORT "NTP" accept_dst_port $CHAIN_ALWAYS_OUT tcp $NTP_NTS_PORT "NTP-NTS" -# Observability metrics endpoints (loaded from /etc/flashbox/observability.env) -if [ -n "${METRICS_ENDPOINTS:-}" ]; then - for ip in $METRICS_ENDPOINTS; do - accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$ip" $HTTPS_PORT "Metrics endpoint (Flashbots)" - done -fi - ########################################################################### # (3) MAINTENANCE_IN: Inbound rules for Maintenance Mode ########################################################################### diff --git a/modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability b/modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability new file mode 100644 index 00000000..24072517 --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability @@ -0,0 +1,15 @@ +# Observability drop-in for /etc/bob/firewall-config.d/ +# +# Loaded by init-firewall.sh after the per-image firewall-config. +# Owns: +# - sourcing the metrics endpoint env file written by +# flashbox-observability-setup at boot +# - the egress allowlist for the metrics endpoint IPs + +[ -f /etc/flashbox/observability.env ] && . 
/etc/flashbox/observability.env + +if [ -n "${METRICS_ENDPOINTS:-}" ]; then + for ip in $METRICS_ENDPOINTS; do + accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$ip" $HTTPS_PORT "Metrics endpoint (Flashbots)" + done +fi From bee12eb07aee9e934fa3048f070dd9daf4e37a9a Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Tue, 12 May 2026 14:26:32 -0300 Subject: [PATCH 10/15] Make container up rule show only 1 or 0 --- .../mkosi.extra/etc/prometheus/recording_rules.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml b/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml index 79a370cc..a4190ae0 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml @@ -11,8 +11,12 @@ groups: - name: flashbox_health interval: 30s rules: + # Binary: 1 if process-exporter is up AND at least one process is in the + # searcher-container cgroup; 0 otherwise. 
- record: flashbox:container_alive - expr: up{job="process"} * on(instance) group_left(cgroup) namedprocess_namegroup_num_procs{groupname=~".*searcher-container.*"} + expr: > + (up{job="process"} * on(instance) group_left(cgroup) namedprocess_namegroup_num_procs{groupname=~".*searcher-container.*"}) + > bool 0 # Spike-guarded: current 15m avg must be under 80%, # AND the 10m max ending 5m ago must have been under 70% From 0deb378e11b2f156c85bde1cba0a3df7be4e3335 Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Tue, 12 May 2026 18:06:48 -0300 Subject: [PATCH 11/15] Handle hosts to IP translation on boot --- .../usr/bin/flashbox-observability-setup | 36 ++++------- .../mkosi.extra/usr/lib/flashbox/hosts.sh | 61 +++++++++++++++++++ 2 files changed, 74 insertions(+), 23 deletions(-) create mode 100644 modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/hosts.sh diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup index 5da3b4d3..8dec4256 100755 --- a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup +++ b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup @@ -2,7 +2,9 @@ set -eu -o pipefail # Boot-time observability setup: fetch metrics creds from Vault, render -# /etc/prometheus/prometheus.yml, and write the firewall env file. +# /etc/prometheus/prometheus.yml, write the firewall env file, and pin the +# metrics endpoint's FQDN → IPs mapping in /etc/hosts so Prometheus can +# reconnect after the host firewall locks down (production mode blocks DNS). # # On any failure (Vault unreachable, missing secret, missing fields) this # script writes sane defaults and exits 0. 
Boot must never fail because of @@ -15,8 +17,11 @@ PROMETHEUS_CONFIG=/etc/prometheus/prometheus.yml PROMETHEUS_BASE=/etc/prometheus/prometheus-base.yml.tmpl PROMETHEUS_REMOTE_WRITE=/etc/prometheus/prometheus-remote-write.yml.tmpl +HOSTS_MARKER=flashbox-observability + . /usr/lib/flashbox/vault.sh . /usr/lib/flashbox/render.sh +. /usr/lib/flashbox/hosts.sh mkdir -p "$OBSERVABILITY_DIR" @@ -33,29 +38,12 @@ METRICS_ENDPOINTS='${1:-}' EOF } -# Resolve a URL (or bare host) to a space-separated list of IPv4 addresses -# for the firewall egress allowlist. We resolve here because the orchestrator -# runs *before* searcher-firewall locks the box down — DNS is still available -# at this point. Production mode blocks DNS, so once the firewall is up the -# only way to refresh stale IPs is a full reboot. -# -# Empty input or unresolvable host → empty output. -resolve_to_ips() { - local input="${1:-}" - [ -n "$input" ] || { echo ""; return; } - - local host - host=$(echo "$input" | sed -E 's|^[a-z]+://||; s|/.*||; s|:.*||') - if echo "$host" | grep -qE '^[0-9]{1,3}(\.[0-9]{1,3}){3}$'; then - echo "$host" - else - getent ahostsv4 "$host" 2>/dev/null | awk '{print $1}' | sort -u | tr '\n' ' ' | sed 's/ *$//' - fi -} - -# Default state: base config rendered with external_labels, firewall closed. +# Default state: base config rendered with external_labels, firewall closed, +# /etc/hosts sentinel block cleared. The success path below re-populates the +# block; until then any stale entries are removed. render_template "$PROMETHEUS_CONFIG" "FLASHBOX_VM" "$PROMETHEUS_BASE" write_firewall_env "" +hosts_clean_block "$HOSTS_MARKER" # Local QEMU dev: skip Vault entirely. 
if dmidecode -s system-manufacturer 2>/dev/null | grep -q "QEMU" && \ @@ -84,14 +72,16 @@ if [ -z "${METRICS_FLASHBOTS_REGION:-}" ] \ fi endpoints=$(resolve_to_ips "$METRICS_FLASHBOTS_URL") +host=$(url_to_host "$METRICS_FLASHBOTS_URL") if [ -z "$endpoints" ]; then echo "WARNING: could not resolve metrics URL host, firewall egress will not be opened — remote_write will be blocked" fi write_firewall_env "$endpoints" +hosts_write_block "$HOSTS_MARKER" "$host" "$endpoints" render_template "$PROMETHEUS_CONFIG" \ "FLASHBOX_VM METRICS_FLASHBOTS_URL METRICS_FLASHBOTS_REGION METRICS_FLASHBOTS_ACCESS_KEY METRICS_FLASHBOTS_SECRET_KEY" \ "$PROMETHEUS_BASE" "$PROMETHEUS_REMOTE_WRITE" -echo "Observability config written (endpoints: ${endpoints:-none})" +echo "Observability config written (host: ${host:-none}, endpoints: ${endpoints:-none})" diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/hosts.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/hosts.sh new file mode 100644 index 00000000..49fcbb8e --- /dev/null +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/hosts.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# URL / DNS / /etc/hosts helpers. +# +# Used by orchestrators that need to pin a FQDN → IPs mapping locally so +# that DNS resolution works without going to the network (e.g. when the +# searcher firewall blocks port 53 in production mode). + +HOSTS_FILE=/etc/hosts + +# Extract the bare hostname from a URL (or return the input verbatim if +# it's already a bare host). Strips scheme, path, port. +url_to_host() { + echo "${1:-}" | sed -E 's|^[a-z]+://||; s|/.*||; s|:.*||' +} + +# Resolve a URL (or bare host) to a space-separated list of IPv4 addresses. +# IPv4 literals are passed through unchanged; hostnames go through getent. +# Empty input or unresolvable host → empty output. +# +# Intended to be called *before* the host firewall locks down — at that +# point getent can still reach upstream DNS through systemd-resolved. 
+resolve_to_ips() { + local input="${1:-}" + [ -n "$input" ] || { echo ""; return; } + + local host + host=$(url_to_host "$input") + if echo "$host" | grep -qE '^[0-9]{1,3}(\.[0-9]{1,3}){3}$'; then + echo "$host" + else + getent ahostsv4 "$host" 2>/dev/null | awk '{print $1}' | sort -u | tr '\n' ' ' | sed 's/ *$//' + fi +} + +# Drop a sentinel-delimited block from /etc/hosts (no-op if not present). +# Each block is delimited by "# BEGIN " / "# END " lines +# so multiple consumers can manage their own sections independently. +# Args: $1 = marker name (e.g. "flashbox-observability") +hosts_clean_block() { + local marker="$1" + [ -f "$HOSTS_FILE" ] || return 0 + sed -i "/# BEGIN ${marker}/,/# END ${marker}/d" "$HOSTS_FILE" +} + +# Replace a sentinel-delimited block in /etc/hosts with fresh entries: +# one line per IP, all mapped to the same hostname. Empty inputs leave +# the block dropped (no entries written). +# Args: $1 = marker, $2 = hostname, $3 = space-separated IPs +hosts_write_block() { + local marker="$1" host="$2" ips="$3" + hosts_clean_block "$marker" + [ -n "$host" ] && [ -n "$ips" ] || return 0 + + { + echo "# BEGIN ${marker} (managed by flashbox)" + for ip in $ips; do + echo "$ip $host" + done + echo "# END ${marker}" + } >> "$HOSTS_FILE" +} From 736a61c09a3c89fb0bff1b6322514e81080d670b Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Wed, 13 May 2026 13:44:24 -0300 Subject: [PATCH 12/15] Specify vault role from metadata --- .../mkosi.extra/usr/lib/flashbox/vault.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh index 276902a1..d32f76e1 100644 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh @@ -18,7 +18,7 @@ _gce_metadata_get() { # On 
failure, logs a specific reason to stderr and returns non-zero without # exporting anything. vault_fetch() { - local instance_name vault_addr auth_mount kv_path kv_common_suffix + local instance_name vault_addr auth_mount vault_role kv_path kv_common_suffix instance_name=$(_gce_metadata_get name) || { echo "WARNING: vault_fetch: could not read GCE metadata 'name'" >&2; return 1; } vault_addr=$(_gce_metadata_get attributes/vault_addr) || { @@ -30,10 +30,16 @@ vault_fetch() { kv_common_suffix=$(_gce_metadata_get attributes/vault_kv_common_suffix) || { echo "WARNING: vault_fetch: could not read GCE metadata 'vault_kv_common_suffix'" >&2; return 1; } + # vault_role is optional metadata. If set, it's used as the Vault GCP + # auth role name (and as the JWT audience). If unset, fall back to the + # instance name — preserves the L2 meva-uni convention of role-name=VM-name + # for projects that go with per-VM roles. + vault_role=$(_gce_metadata_get attributes/vault_role 2>/dev/null) || vault_role="$instance_name" + local gcp_token gcp_token=$(curl -sf \ --header "Metadata-Flavor: Google" \ - --data-urlencode "audience=http://vault/${instance_name}" \ + --data-urlencode "audience=http://vault/${vault_role}" \ --data-urlencode "format=full" \ "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || { echo "WARNING: vault_fetch: could not obtain GCP identity JWT from metadata server" >&2; return 1; } @@ -42,10 +48,10 @@ vault_fetch() { local vault_token vault_token=$(curl -sf \ - --data "$(printf '{"role":"%s","jwt":"%s"}' "$instance_name" "$gcp_token")" \ + --data "$(printf '{"role":"%s","jwt":"%s"}' "$vault_role" "$gcp_token")" \ "${vault_addr}/v1/${auth_mount}/login" \ | jq -re .auth.client_token) || { - echo "WARNING: vault_fetch: Vault GCP login failed (role=${instance_name}, mount=${auth_mount})" >&2; return 1; } + echo "WARNING: vault_fetch: Vault GCP login failed (role=${vault_role}, mount=${auth_mount})" >&2; return 1; } local secret_data 
secret_data=$(curl -sf \ From e78b94f83655d0d505b9c42f3fbdf91421657e79 Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Wed, 27 May 2026 13:08:03 -0300 Subject: [PATCH 13/15] Remove DNS caching logic --- .../mkosi.extra/usr/bin/init-firewall.sh | 2 +- .../etc/bob/firewall-config-observability | 22 +++--- .../prometheus-remote-write.yml.tmpl | 12 +++- .../needs-observability.conf | 3 - .../usr/bin/flashbox-observability-setup | 68 +++++++------------ .../mkosi.extra/usr/lib/flashbox/hosts.sh | 61 ----------------- .../mkosi.extra/usr/lib/flashbox/vault.sh | 37 ++++++---- 7 files changed, 67 insertions(+), 138 deletions(-) delete mode 100644 modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf delete mode 100644 modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/hosts.sh diff --git a/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh b/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh index 71befdfc..ad966ea4 100755 --- a/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh +++ b/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh @@ -155,7 +155,7 @@ drop_dst_ip() { . /etc/bob/firewall-config # Observability rule (sourced only if the observability module is included -# in the image — it owns its own egress rule + env-file dependency). +# in the image — it owns its own static egress rule for the metrics LB IP). [ -f /etc/bob/firewall-config-observability ] && . 
/etc/bob/firewall-config-observability ########################################################################### diff --git a/modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability b/modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability index 24072517..568b9b72 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability +++ b/modules/flashbox/observability/mkosi.extra/etc/bob/firewall-config-observability @@ -1,15 +1,11 @@ # Observability drop-in for /etc/bob/firewall-config.d/ # -# Loaded by init-firewall.sh after the per-image firewall-config. -# Owns: -# - sourcing the metrics endpoint env file written by -# flashbox-observability-setup at boot -# - the egress allowlist for the metrics endpoint IPs - -[ -f /etc/flashbox/observability.env ] && . /etc/flashbox/observability.env - -if [ -n "${METRICS_ENDPOINTS:-}" ]; then - for ip in $METRICS_ENDPOINTS; do - accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$ip" $HTTPS_PORT "Metrics endpoint (Flashbots)" - done -fi +# Loaded by init-firewall.sh after the per-image firewall-config. Opens host +# egress to AWS Managed Prometheus, reached via the internal TCP-proxy load +# balancer at a fixed RFC1918 address (provisioned in terraform: +# gcp/flashbots-l1-flashbox/us-east4/amp-proxy.tf -> amp_proxy_ip). +# +# Static IP -> the rule is measured into the image; no boot-time DNS +# resolution and no env file. Placed in ALWAYS_OUT so it survives the +# maintenance -> production mode toggle (production drops general egress). 
+accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp 10.88.0.100 $HTTPS_PORT "Metrics endpoint (AMP via internal LB)" diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl index 478c05dd..a400bfe8 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl @@ -1,13 +1,19 @@ -# Remote write configuration (dynamically configured) +# Remote write configuration (rendered at boot from Vault-provided values). remote_write: - # Flashbots endpoint - - url: ${METRICS_FLASHBOTS_URL} + # AWS Managed Prometheus, reached via the internal TCP-proxy load balancer. + # Host is the fixed LB IP (10.88.0.100, see terraform amp-proxy.tf), so no + # DNS resolution happens on the VM. TLS SNI + cert validation are overridden + # to AMP's real regional hostname via tls_config.server_name + # (aps-workspaces..amazonaws.com is AMP's standard regional endpoint). 
+ - url: https://10.88.0.100/workspaces/${METRICS_FLASHBOTS_WORKSPACE}/api/v1/remote_write write_relabel_configs: # Only send flashbox: prefixed metrics - source_labels: [__name__] regex: 'flashbox:.*' action: keep + tls_config: + server_name: aps-workspaces.${METRICS_FLASHBOTS_REGION}.amazonaws.com sigv4: region: ${METRICS_FLASHBOTS_REGION} access_key: ${METRICS_FLASHBOTS_ACCESS_KEY} diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf deleted file mode 100644 index 3940ab16..00000000 --- a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf +++ /dev/null @@ -1,3 +0,0 @@ -[Unit] -After=flashbox-observability-setup.service -Wants=flashbox-observability-setup.service diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup index 8dec4256..94729934 100755 --- a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup +++ b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup @@ -1,29 +1,26 @@ #!/bin/sh set -eu -o pipefail -# Boot-time observability setup: fetch metrics creds from Vault, render -# /etc/prometheus/prometheus.yml, write the firewall env file, and pin the -# metrics endpoint's FQDN → IPs mapping in /etc/hosts so Prometheus can -# reconnect after the host firewall locks down (production mode blocks DNS). +# Boot-time observability setup: fetch metrics creds from Vault and render +# /etc/prometheus/prometheus.yml. # -# On any failure (Vault unreachable, missing secret, missing fields) this -# script writes sane defaults and exits 0. Boot must never fail because of -# observability. 
- -OBSERVABILITY_DIR=/etc/flashbox -OBSERVABILITY_ENV_PATH="$OBSERVABILITY_DIR/observability.env" +# Prometheus ships flashbox:* metrics to AWS Managed Prometheus via the +# internal TCP-proxy load balancer at a fixed IP (10.88.0.100). Because the +# remote_write URL is an IP literal there's no DNS to resolve on the VM and no +# /etc/hosts pinning — the host firewall statically allows egress to that IP +# (see /etc/bob/firewall-config-observability), and TLS cert validation is +# handled by tls_config.server_name in prometheus-remote-write.yml.tmpl. +# +# On any failure (Vault unreachable, missing/invalid secret) this script +# renders the base config without remote_write and exits 0. Boot must never +# fail because of observability. PROMETHEUS_CONFIG=/etc/prometheus/prometheus.yml PROMETHEUS_BASE=/etc/prometheus/prometheus-base.yml.tmpl PROMETHEUS_REMOTE_WRITE=/etc/prometheus/prometheus-remote-write.yml.tmpl -HOSTS_MARKER=flashbox-observability - . /usr/lib/flashbox/vault.sh . /usr/lib/flashbox/render.sh -. /usr/lib/flashbox/hosts.sh - -mkdir -p "$OBSERVABILITY_DIR" # Read the GCE instance name and expose it to prometheus-base.yml.tmpl's # external_labels — every remote_write sample is stamped with `host=` @@ -32,18 +29,9 @@ FLASHBOX_VM=$(curl -sf --header "Metadata-Flavor: Google" \ "http://metadata/computeMetadata/v1/instance/name" || echo "unknown") export FLASHBOX_VM -write_firewall_env() { - cat > "$OBSERVABILITY_ENV_PATH" </dev/null | grep -q "QEMU" && \ @@ -54,34 +42,24 @@ fi echo "Fetching observability config from Vault..." -if ! vault_fetch; then +# Fetch only the keys we need; vault_fetch enforces this allowlist and +# validates each value before exporting (see vault.sh). +if ! 
vault_fetch METRICS_FLASHBOTS_WORKSPACE METRICS_FLASHBOTS_REGION \ + METRICS_FLASHBOTS_ACCESS_KEY METRICS_FLASHBOTS_SECRET_KEY; then echo "WARNING: could not fetch observability config from Vault, using defaults" exit 0 fi -if [ -z "${METRICS_FLASHBOTS_URL:-}" ]; then - echo "No metrics URL configured, remote_write disabled" - exit 0 -fi - -if [ -z "${METRICS_FLASHBOTS_REGION:-}" ] \ +if [ -z "${METRICS_FLASHBOTS_WORKSPACE:-}" ] \ + || [ -z "${METRICS_FLASHBOTS_REGION:-}" ] \ || [ -z "${METRICS_FLASHBOTS_ACCESS_KEY:-}" ] \ || [ -z "${METRICS_FLASHBOTS_SECRET_KEY:-}" ]; then - echo "WARNING: metrics URL set but SigV4 credentials are incomplete (need REGION, ACCESS_KEY, SECRET_KEY), remote_write disabled" + echo "WARNING: incomplete observability config from Vault (need WORKSPACE, REGION, ACCESS_KEY, SECRET_KEY), remote_write disabled" exit 0 fi -endpoints=$(resolve_to_ips "$METRICS_FLASHBOTS_URL") -host=$(url_to_host "$METRICS_FLASHBOTS_URL") -if [ -z "$endpoints" ]; then - echo "WARNING: could not resolve metrics URL host, firewall egress will not be opened — remote_write will be blocked" -fi - -write_firewall_env "$endpoints" -hosts_write_block "$HOSTS_MARKER" "$host" "$endpoints" - render_template "$PROMETHEUS_CONFIG" \ - "FLASHBOX_VM METRICS_FLASHBOTS_URL METRICS_FLASHBOTS_REGION METRICS_FLASHBOTS_ACCESS_KEY METRICS_FLASHBOTS_SECRET_KEY" \ + "FLASHBOX_VM METRICS_FLASHBOTS_WORKSPACE METRICS_FLASHBOTS_REGION METRICS_FLASHBOTS_ACCESS_KEY METRICS_FLASHBOTS_SECRET_KEY" \ "$PROMETHEUS_BASE" "$PROMETHEUS_REMOTE_WRITE" -echo "Observability config written (host: ${host:-none}, endpoints: ${endpoints:-none})" +echo "Observability config written (workspace: ${METRICS_FLASHBOTS_WORKSPACE})" diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/hosts.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/hosts.sh deleted file mode 100644 index 49fcbb8e..00000000 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/hosts.sh +++ 
/dev/null @@ -1,61 +0,0 @@ -#!/bin/sh -# URL / DNS / /etc/hosts helpers. -# -# Used by orchestrators that need to pin a FQDN → IPs mapping locally so -# that DNS resolution works without going to the network (e.g. when the -# searcher firewall blocks port 53 in production mode). - -HOSTS_FILE=/etc/hosts - -# Extract the bare hostname from a URL (or return the input verbatim if -# it's already a bare host). Strips scheme, path, port. -url_to_host() { - echo "${1:-}" | sed -E 's|^[a-z]+://||; s|/.*||; s|:.*||' -} - -# Resolve a URL (or bare host) to a space-separated list of IPv4 addresses. -# IPv4 literals are passed through unchanged; hostnames go through getent. -# Empty input or unresolvable host → empty output. -# -# Intended to be called *before* the host firewall locks down — at that -# point getent can still reach upstream DNS through systemd-resolved. -resolve_to_ips() { - local input="${1:-}" - [ -n "$input" ] || { echo ""; return; } - - local host - host=$(url_to_host "$input") - if echo "$host" | grep -qE '^[0-9]{1,3}(\.[0-9]{1,3}){3}$'; then - echo "$host" - else - getent ahostsv4 "$host" 2>/dev/null | awk '{print $1}' | sort -u | tr '\n' ' ' | sed 's/ *$//' - fi -} - -# Drop a sentinel-delimited block from /etc/hosts (no-op if not present). -# Each block is delimited by "# BEGIN " / "# END " lines -# so multiple consumers can manage their own sections independently. -# Args: $1 = marker name (e.g. "flashbox-observability") -hosts_clean_block() { - local marker="$1" - [ -f "$HOSTS_FILE" ] || return 0 - sed -i "/# BEGIN ${marker}/,/# END ${marker}/d" "$HOSTS_FILE" -} - -# Replace a sentinel-delimited block in /etc/hosts with fresh entries: -# one line per IP, all mapped to the same hostname. Empty inputs leave -# the block dropped (no entries written). 
-# Args: $1 = marker, $2 = hostname, $3 = space-separated IPs -hosts_write_block() { - local marker="$1" host="$2" ips="$3" - hosts_clean_block "$marker" - [ -n "$host" ] && [ -n "$ips" ] || return 0 - - { - echo "# BEGIN ${marker} (managed by flashbox)" - for ip in $ips; do - echo "$ip $host" - done - echo "# END ${marker}" - } >> "$HOSTS_FILE" -} diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh index d32f76e1..b8560060 100644 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh @@ -3,20 +3,22 @@ # # Reads bootstrap config from GCE instance metadata, authenticates to Vault # using the instance identity JWT, fetches the shared secret blob, and -# exports every key in the blob as an env var. +# exports the explicitly-requested keys as env vars. _gce_metadata_get() { curl -sf --header "Metadata-Flavor: Google" \ "http://metadata/computeMetadata/v1/instance/$1" } -# Authenticate to Vault and fetch the shared secret blob. Each key in the -# secret is exported as `=` verbatim — store keys in Vault with -# the exact casing you want as the env var name (UPPER_SNAKE_CASE by -# convention). +# Authenticate to Vault and fetch the shared secret blob, then export ONLY +# the keys named in the arguments (an allowlist) — never the whole blob. +# Usage: vault_fetch KEY [KEY ...]. Keys are exported verbatim, so store them +# in Vault with the exact UPPER_SNAKE_CASE env var name you want. Each value +# is charset-validated before export to prevent env/YAML injection from a +# tampered secret; missing/empty/invalid values are skipped with a warning. # -# On failure, logs a specific reason to stderr and returns non-zero without -# exporting anything. +# On auth/fetch failure, logs a specific reason to stderr and returns +# non-zero without exporting anything. 
vault_fetch() { local instance_name vault_addr auth_mount vault_role kv_path kv_common_suffix instance_name=$(_gce_metadata_get name) || { @@ -60,13 +62,24 @@ vault_fetch() { | jq -ce .data.data) || { echo "WARNING: vault_fetch: KV read failed at ${kv_path}/node/${kv_common_suffix}" >&2; return 1; } - local keys - keys=$(echo "$secret_data" | jq -r 'keys[]') || { - echo "WARNING: vault_fetch: could not enumerate keys in secret payload" >&2; return 1; } - + # Export ONLY the requested keys (the allowlist passed as args). This + # bounds what a tampered/compromised secret can inject into our env — an + # unexpected key (PATH, LD_PRELOAD, ...) is never exported. Each value is + # charset-restricted to block YAML/env injection via crafted values + # (newlines, quotes, $, backticks, spaces). local key value - for key in $keys; do + for key in "$@"; do value=$(echo "$secret_data" | jq -rc --arg k "$key" '.[$k] // ""') + if [ -z "$value" ]; then + echo "WARNING: vault_fetch: key '${key}' missing or empty in secret" >&2 + continue + fi + case "$value" in + *[!A-Za-z0-9._/+=-]*) + echo "WARNING: vault_fetch: value for '${key}' has disallowed characters, skipping" >&2 + continue + ;; + esac export "${key}=${value}" done } From 537ee460aeaeae54657901d56945394d8b75c314 Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Wed, 27 May 2026 15:45:21 -0300 Subject: [PATCH 14/15] Simplify prometheus startup logic --- .../prometheus-remote-write.yml.tmpl | 20 --- ...heus-base.yml.tmpl => prometheus.yml.tmpl} | 19 ++- .../etc/systemd/system/prometheus.service | 6 + .../usr/bin/flashbox-observability-setup | 72 +++++----- .../mkosi.extra/usr/lib/flashbox/render.sh | 21 --- .../mkosi.extra/usr/lib/flashbox/vault.sh | 125 ++++++++---------- 6 files changed, 114 insertions(+), 149 deletions(-) delete mode 100644 modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl rename 
modules/flashbox/observability/mkosi.extra/etc/prometheus/{prometheus-base.yml.tmpl => prometheus.yml.tmpl} (50%) delete mode 100644 modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl deleted file mode 100644 index a400bfe8..00000000 --- a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-remote-write.yml.tmpl +++ /dev/null @@ -1,20 +0,0 @@ - -# Remote write configuration (rendered at boot from Vault-provided values). -remote_write: - # AWS Managed Prometheus, reached via the internal TCP-proxy load balancer. - # Host is the fixed LB IP (10.88.0.100, see terraform amp-proxy.tf), so no - # DNS resolution happens on the VM. TLS SNI + cert validation are overridden - # to AMP's real regional hostname via tls_config.server_name - # (aps-workspaces..amazonaws.com is AMP's standard regional endpoint). 
- - url: https://10.88.0.100/workspaces/${METRICS_FLASHBOTS_WORKSPACE}/api/v1/remote_write - write_relabel_configs: - # Only send flashbox: prefixed metrics - - source_labels: [__name__] - regex: 'flashbox:.*' - action: keep - tls_config: - server_name: aps-workspaces.${METRICS_FLASHBOTS_REGION}.amazonaws.com - sigv4: - region: ${METRICS_FLASHBOTS_REGION} - access_key: ${METRICS_FLASHBOTS_ACCESS_KEY} - secret_key: ${METRICS_FLASHBOTS_SECRET_KEY} diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml.tmpl b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl similarity index 50% rename from modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml.tmpl rename to modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl index 13bc8228..92634506 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus-base.yml.tmpl +++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl @@ -11,7 +11,6 @@ global: rule_files: - /etc/prometheus/recording_rules.yml -# Scrape configurations scrape_configs: # Node exporter on localhost - job_name: 'node' @@ -27,3 +26,21 @@ scrape_configs: - job_name: 'process' static_configs: - targets: ['localhost:9256'] + +# Remote write to AWS Managed Prometheus via the internal TCP-proxy LB. +# Host is the fixed LB IP (10.88.0.100, see terraform amp-proxy.tf), so no DNS +# resolution happens on the VM; TLS SNI + cert validation are overridden to +# AMP's real regional hostname via tls_config.server_name. 
+remote_write: + - url: https://10.88.0.100/workspaces/${METRICS_FLASHBOTS_WORKSPACE}/api/v1/remote_write + write_relabel_configs: + # Only send flashbox: prefixed metrics + - source_labels: [__name__] + regex: 'flashbox:.*' + action: keep + tls_config: + server_name: aps-workspaces.${METRICS_FLASHBOTS_REGION}.amazonaws.com + sigv4: + region: ${METRICS_FLASHBOTS_REGION} + access_key: ${METRICS_FLASHBOTS_ACCESS_KEY} + secret_key: ${METRICS_FLASHBOTS_SECRET_KEY} diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service index f584fdc4..17eafe39 100644 --- a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service +++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service @@ -4,6 +4,12 @@ Documentation=https://prometheus.io/docs/introduction/overview/ After=network-online.target flashbox-observability-setup.service Wants=network-online.target +# flashbox-observability-setup writes this file only when Vault creds are +# fetched + validated. If it's absent (Vault down, dev image, bad secret) the +# unit is cleanly skipped — inactive, not failed, no Restart= crash-loop — and +# self-heals on the next boot that renders a config. +ConditionPathExists=/etc/prometheus/prometheus.yml + [Service] Type=simple User=prometheus diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup index 94729934..f06ddacb 100755 --- a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup +++ b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup @@ -2,64 +2,58 @@ set -eu -o pipefail # Boot-time observability setup: fetch metrics creds from Vault and render -# /etc/prometheus/prometheus.yml. +# /etc/prometheus/prometheus.yml from the single template. 
# # Prometheus ships flashbox:* metrics to AWS Managed Prometheus via the -# internal TCP-proxy load balancer at a fixed IP (10.88.0.100). Because the -# remote_write URL is an IP literal there's no DNS to resolve on the VM and no -# /etc/hosts pinning — the host firewall statically allows egress to that IP -# (see /etc/bob/firewall-config-observability), and TLS cert validation is -# handled by tls_config.server_name in prometheus-remote-write.yml.tmpl. +# internal TCP-proxy LB at a fixed IP (10.88.0.100) — no DNS, no /etc/hosts +# pinning; the host firewall statically allows that IP, and TLS is validated +# via tls_config.server_name in the template. # -# On any failure (Vault unreachable, missing/invalid secret) this script -# renders the base config without remote_write and exits 0. Boot must never -# fail because of observability. +# Fail-safe: on any failure (QEMU dev, Vault unreachable, missing/invalid +# secret) this script writes NO config and exits 0. prometheus.service has +# ConditionPathExists=/etc/prometheus/prometheus.yml, so it is cleanly skipped +# (inactive, not failed) until a later boot renders the config. Boot never +# fails because of observability. PROMETHEUS_CONFIG=/etc/prometheus/prometheus.yml -PROMETHEUS_BASE=/etc/prometheus/prometheus-base.yml.tmpl -PROMETHEUS_REMOTE_WRITE=/etc/prometheus/prometheus-remote-write.yml.tmpl +PROMETHEUS_TEMPLATE=/etc/prometheus/prometheus.yml.tmpl . /usr/lib/flashbox/vault.sh -. /usr/lib/flashbox/render.sh -# Read the GCE instance name and expose it to prometheus-base.yml.tmpl's -# external_labels — every remote_write sample is stamped with `host=` -# so the central Prometheus can distinguish flashbox VMs. -FLASHBOX_VM=$(curl -sf --header "Metadata-Flavor: Google" \ - "http://metadata/computeMetadata/v1/instance/name" || echo "unknown") -export FLASHBOX_VM +# Start clean: never leave a stale config from a previous boot. +rm -f "$PROMETHEUS_CONFIG" -# Default state: base config only (no remote_write). 
The success path below -# re-renders with the remote_write block once creds are in hand. -render_template "$PROMETHEUS_CONFIG" "FLASHBOX_VM" "$PROMETHEUS_BASE" +# Read the GCE instance name for external_labels (host=), so the central +# Prometheus can distinguish flashbox VMs. +FLASHBOX_VM=$(_meta name || echo "unknown") +export FLASHBOX_VM -# Local QEMU dev: skip Vault entirely. +# Local QEMU dev: no Vault, no config (Prometheus stays disabled). if dmidecode -s system-manufacturer 2>/dev/null | grep -q "QEMU" && \ [ -f /etc/systemd/system/serial-console.service ]; then - echo "QEMU dev environment, skipping observability config fetch" + echo "QEMU dev environment, leaving Prometheus disabled" exit 0 fi echo "Fetching observability config from Vault..." -# Fetch only the keys we need; vault_fetch enforces this allowlist and -# validates each value before exporting (see vault.sh). -if ! vault_fetch METRICS_FLASHBOTS_WORKSPACE METRICS_FLASHBOTS_REGION \ - METRICS_FLASHBOTS_ACCESS_KEY METRICS_FLASHBOTS_SECRET_KEY; then - echo "WARNING: could not fetch observability config from Vault, using defaults" +token=$(vault_login) || { + echo "WARNING: Vault login failed, leaving Prometheus disabled" exit 0 -fi - -if [ -z "${METRICS_FLASHBOTS_WORKSPACE:-}" ] \ - || [ -z "${METRICS_FLASHBOTS_REGION:-}" ] \ - || [ -z "${METRICS_FLASHBOTS_ACCESS_KEY:-}" ] \ - || [ -z "${METRICS_FLASHBOTS_SECRET_KEY:-}" ]; then - echo "WARNING: incomplete observability config from Vault (need WORKSPACE, REGION, ACCESS_KEY, SECRET_KEY), remote_write disabled" +} + +vault_fetch "$token" \ + METRICS_FLASHBOTS_WORKSPACE \ + METRICS_FLASHBOTS_REGION \ + METRICS_FLASHBOTS_ACCESS_KEY \ + METRICS_FLASHBOTS_SECRET_KEY || { + echo "WARNING: could not fetch/validate metrics secret, leaving Prometheus disabled" exit 0 -fi +} -render_template "$PROMETHEUS_CONFIG" \ - "FLASHBOX_VM METRICS_FLASHBOTS_WORKSPACE METRICS_FLASHBOTS_REGION METRICS_FLASHBOTS_ACCESS_KEY METRICS_FLASHBOTS_SECRET_KEY" \ - 
"$PROMETHEUS_BASE" "$PROMETHEUS_REMOTE_WRITE" +# Single render: only the five curated variables are substituted; every other +# token in the template (e.g. [__name__]) is left untouched. +envsubst '$FLASHBOX_VM $METRICS_FLASHBOTS_WORKSPACE $METRICS_FLASHBOTS_REGION $METRICS_FLASHBOTS_ACCESS_KEY $METRICS_FLASHBOTS_SECRET_KEY' \ + < "$PROMETHEUS_TEMPLATE" > "$PROMETHEUS_CONFIG" echo "Observability config written (workspace: ${METRICS_FLASHBOTS_WORKSPACE})" diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh deleted file mode 100644 index d2fe0da2..00000000 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/render.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh -# Template rendering helpers. - -# Concatenate one or more templates and render the result with envsubst, -# substituting only the env vars in the explicit allowlist. -# -# Args: $1 = output path -# $2 = space-separated list of var names to substitute (e.g. -# "METRICS_URL METRICS_USERNAME") -# $3.. = template files to concatenate -render_template() { - local out="$1" vars="$2" - shift 2 - - local allowlist="" v - for v in $vars; do - allowlist="${allowlist}\$${v} " - done - - cat "$@" | envsubst "$allowlist" > "$out" -} diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh index b8560060..d7f484a5 100644 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh @@ -1,85 +1,74 @@ #!/bin/sh -# Vault auth (GCP backend) and secret fetch. +# Vault auth (GCP backend) + validated fetch of named keys. # -# Reads bootstrap config from GCE instance metadata, authenticates to Vault -# using the instance identity JWT, fetches the shared secret blob, and -# exports the explicitly-requested keys as env vars. 
+# vault_login -> echoes a Vault client token +# vault_fetch TOKEN KEY... -> exports each named key after validating its value +# +# Only the keys the caller names are ever read or exported (no blob iteration), +# and every value is charset-checked before export, so a tampered secret cannot +# inject shell or YAML. If any requested key is missing or fails validation, +# vault_fetch returns non-zero and the caller writes no config. -_gce_metadata_get() { +_meta() { curl -sf --header "Metadata-Flavor: Google" \ "http://metadata/computeMetadata/v1/instance/$1" } -# Authenticate to Vault and fetch the shared secret blob, then export ONLY -# the keys named in the arguments (an allowlist) — never the whole blob. -# Usage: vault_fetch KEY [KEY ...]. Keys are exported verbatim, so store them -# in Vault with the exact UPPER_SNAKE_CASE env var name you want. Each value -# is charset-validated before export to prevent env/YAML injection from a -# tampered secret; missing/empty/invalid values are skipped with a warning. -# -# On auth/fetch failure, logs a specific reason to stderr and returns -# non-zero without exporting anything. 
-vault_fetch() { - local instance_name vault_addr auth_mount vault_role kv_path kv_common_suffix - instance_name=$(_gce_metadata_get name) || { - echo "WARNING: vault_fetch: could not read GCE metadata 'name'" >&2; return 1; } - vault_addr=$(_gce_metadata_get attributes/vault_addr) || { - echo "WARNING: vault_fetch: could not read GCE metadata 'vault_addr'" >&2; return 1; } - auth_mount=$(_gce_metadata_get attributes/vault_auth_mount_gcp) || { - echo "WARNING: vault_fetch: could not read GCE metadata 'vault_auth_mount_gcp'" >&2; return 1; } - kv_path=$(_gce_metadata_get attributes/vault_kv_path) || { - echo "WARNING: vault_fetch: could not read GCE metadata 'vault_kv_path'" >&2; return 1; } - kv_common_suffix=$(_gce_metadata_get attributes/vault_kv_common_suffix) || { - echo "WARNING: vault_fetch: could not read GCE metadata 'vault_kv_common_suffix'" >&2; return 1; } +# Conservative allowlist: alphanumerics + the symbols our values legitimately +# use (AWS keys are base64 -> / + = ; region/workspace use - _ .). Everything +# else -- whitespace, quotes, $, backtick, ; | & < > \, newlines -- is rejected. +_safe() { + case "$1" in + *[!A-Za-z0-9._/+=-]*) return 1 ;; + *) return 0 ;; + esac +} - # vault_role is optional metadata. If set, it's used as the Vault GCP - # auth role name (and as the JWT audience). If unset, fall back to the - # instance name — preserves the L2 meva-uni convention of role-name=VM-name - # for projects that go with per-VM roles. - vault_role=$(_gce_metadata_get attributes/vault_role 2>/dev/null) || vault_role="$instance_name" +# Authenticate with the GCE instance-identity JWT and echo a Vault token. +# The Vault GCP auth role is read from metadata (a shared role for this +# project); there is no per-VM fallback. 
+vault_login() { + local addr mount role jwt + addr=$(_meta attributes/vault_addr) || return 1 + mount=$(_meta attributes/vault_auth_mount_gcp) || return 1 + role=$(_meta attributes/vault_role) || return 1 - local gcp_token - gcp_token=$(curl -sf \ - --header "Metadata-Flavor: Google" \ - --data-urlencode "audience=http://vault/${vault_role}" \ + jwt=$(curl -sf --header "Metadata-Flavor: Google" \ + --data-urlencode "audience=http://vault/${role}" \ --data-urlencode "format=full" \ - "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || { - echo "WARNING: vault_fetch: could not obtain GCP identity JWT from metadata server" >&2; return 1; } - [ -n "$gcp_token" ] || { - echo "WARNING: vault_fetch: GCP identity JWT was empty" >&2; return 1; } + "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || return 1 + + curl -sf \ + --data "$(printf '{"role":"%s","jwt":"%s"}' "$role" "$jwt")" \ + "${addr}/v1/${mount}/login" \ + | jq -re .auth.client_token +} + +# vault_fetch TOKEN KEY [KEY...] +# Reads the shared (common) secret blob once, then exports each requested key +# after validation. Returns 1 if any key is missing or fails validation. 
+vault_fetch() { + local token addr kv suffix data key value + token=$1 + shift - local vault_token - vault_token=$(curl -sf \ - --data "$(printf '{"role":"%s","jwt":"%s"}' "$vault_role" "$gcp_token")" \ - "${vault_addr}/v1/${auth_mount}/login" \ - | jq -re .auth.client_token) || { - echo "WARNING: vault_fetch: Vault GCP login failed (role=${vault_role}, mount=${auth_mount})" >&2; return 1; } + addr=$(_meta attributes/vault_addr) || return 1 + kv=$(_meta attributes/vault_kv_path) || return 1 + suffix=$(_meta attributes/vault_kv_common_suffix) || return 1 - local secret_data - secret_data=$(curl -sf \ - --header "X-Vault-Token: ${vault_token}" \ - "${vault_addr}/v1/${kv_path}/node/${kv_common_suffix}" \ - | jq -ce .data.data) || { - echo "WARNING: vault_fetch: KV read failed at ${kv_path}/node/${kv_common_suffix}" >&2; return 1; } + data=$(curl -sf --header "X-Vault-Token: ${token}" \ + "${addr}/v1/${kv}/node/${suffix}" \ + | jq -ce .data.data) || return 1 - # Export ONLY the requested keys (the allowlist passed as args). This - # bounds what a tampered/compromised secret can inject into our env — an - # unexpected key (PATH, LD_PRELOAD, ...) is never exported. Each value is - # charset-restricted to block YAML/env injection via crafted values - # (newlines, quotes, $, backticks, spaces). 
- local key value for key in "$@"; do - value=$(echo "$secret_data" | jq -rc --arg k "$key" '.[$k] // ""') - if [ -z "$value" ]; then - echo "WARNING: vault_fetch: key '${key}' missing or empty in secret" >&2 - continue - fi - case "$value" in - *[!A-Za-z0-9._/+=-]*) - echo "WARNING: vault_fetch: value for '${key}' has disallowed characters, skipping" >&2 - continue - ;; - esac + value=$(echo "$data" | jq -re --arg k "$key" '.[$k]') || { + echo "WARNING: vault_fetch: key '${key}' missing in secret" >&2 + return 1 + } + _safe "$value" || { + echo "WARNING: vault_fetch: value for '${key}' has unsafe characters" >&2 + return 1 + } export "${key}=${value}" done } From 7a12554d2a0114dcb341fe4b21b3e9d9fe7444d5 Mon Sep 17 00:00:00 2001 From: pablin-10 <118397961+pablin-10@users.noreply.github.com> Date: Wed, 27 May 2026 16:45:54 -0300 Subject: [PATCH 15/15] Simplify and linearize scripts and cleanup on comments --- .../usr/bin/flashbox-observability-setup | 46 +++----- .../mkosi.extra/usr/lib/flashbox/vault.sh | 108 ++++++++---------- 2 files changed, 66 insertions(+), 88 deletions(-) diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup index f06ddacb..48cca197 100755 --- a/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup +++ b/modules/flashbox/observability/mkosi.extra/usr/bin/flashbox-observability-setup @@ -1,18 +1,17 @@ #!/bin/sh set -eu -o pipefail -# Boot-time observability setup: fetch metrics creds from Vault and render -# /etc/prometheus/prometheus.yml from the single template. +# Boot-time observability setup: authenticate to Vault, fetch + validate the +# Prometheus remote_write secret, and render /etc/prometheus/prometheus.yml. 
# -# Prometheus ships flashbox:* metrics to AWS Managed Prometheus via the -# internal TCP-proxy LB at a fixed IP (10.88.0.100) — no DNS, no /etc/hosts -# pinning; the host firewall statically allows that IP, and TLS is validated -# via tls_config.server_name in the template. +# Prometheus ships flashbox:* metrics to Prometheus via the internal TCP-proxy +# at a fixed IP (10.88.0.100); the host firewall statically allows that IP, and +# TLS is validated via tls_config.server_name in the template. # # Fail-safe: on any failure (QEMU dev, Vault unreachable, missing/invalid -# secret) this script writes NO config and exits 0. prometheus.service has -# ConditionPathExists=/etc/prometheus/prometheus.yml, so it is cleanly skipped -# (inactive, not failed) until a later boot renders the config. Boot never +# secret) this writes NO config and exits 0. Because prometheus.service has +# ConditionPathExists=/etc/prometheus/prometheus.yml, it is cleanly skipped +# (inactive, not failed) until a later boot renders a config — boot never # fails because of observability. PROMETHEUS_CONFIG=/etc/prometheus/prometheus.yml @@ -20,36 +19,27 @@ PROMETHEUS_TEMPLATE=/etc/prometheus/prometheus.yml.tmpl . /usr/lib/flashbox/vault.sh -# Start clean: never leave a stale config from a previous boot. +# Always start clean — never leave a stale config from a previous boot. rm -f "$PROMETHEUS_CONFIG" -# Read the GCE instance name for external_labels (host=), so the central -# Prometheus can distinguish flashbox VMs. -FLASHBOX_VM=$(_meta name || echo "unknown") +# host= external label, from the GCE instance name. +FLASHBOX_VM=$(curl -sf --header "Metadata-Flavor: Google" \ + "http://metadata/computeMetadata/v1/instance/name") || FLASHBOX_VM=unknown +printf '%s' "$FLASHBOX_VM" | grep -qE '^[a-z0-9-]+$' || FLASHBOX_VM=unknown export FLASHBOX_VM -# Local QEMU dev: no Vault, no config (Prometheus stays disabled). +# Local QEMU dev image: no Vault, no config (Prometheus stays disabled). 
if dmidecode -s system-manufacturer 2>/dev/null | grep -q "QEMU" && \ [ -f /etc/systemd/system/serial-console.service ]; then echo "QEMU dev environment, leaving Prometheus disabled" exit 0 fi -echo "Fetching observability config from Vault..." - -token=$(vault_login) || { - echo "WARNING: Vault login failed, leaving Prometheus disabled" - exit 0 -} - -vault_fetch "$token" \ - METRICS_FLASHBOTS_WORKSPACE \ - METRICS_FLASHBOTS_REGION \ - METRICS_FLASHBOTS_ACCESS_KEY \ - METRICS_FLASHBOTS_SECRET_KEY || { - echo "WARNING: could not fetch/validate metrics secret, leaving Prometheus disabled" +# Fetch + validate the four metrics variables (exported by vault_fetch). +if ! vault_fetch; then + echo "WARNING: Vault fetch/validation failed, leaving Prometheus disabled" exit 0 -} +fi # Single render: only the five curated variables are substituted; every other # token in the template (e.g. [__name__]) is left untouched. diff --git a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh index d7f484a5..9ba7882a 100644 --- a/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh +++ b/modules/flashbox/observability/mkosi.extra/usr/lib/flashbox/vault.sh @@ -1,74 +1,62 @@ #!/bin/sh -# Vault auth (GCP backend) + validated fetch of named keys. +# Vault auth (GCP backend) + fetch/validate the Prometheus remote_write secret. +# Sourced by flashbox-observability-setup. # -# vault_login -> echoes a Vault client token -# vault_fetch TOKEN KEY... -> exports each named key after validating its value +# vault_fetch authenticates with the GCE instance-identity JWT, reads the +# shared secret, and exports the four metrics variables — each checked against +# its expected format, so a malformed or tampered value can never reach the +# rendered config (and can't carry shell/YAML metacharacters). Any failure +# returns non-zero and exports nothing usable; the caller then writes no config. 
# -# Only the keys the caller names are ever read or exported (no blob iteration), -# and every value is charset-checked before export, so a tampered secret cannot -# inject shell or YAML. If any requested key is missing or fails validation, -# vault_fetch returns non-zero and the caller writes no config. - -_meta() { - curl -sf --header "Metadata-Flavor: Google" \ - "http://metadata/computeMetadata/v1/instance/$1" -} - -# Conservative allowlist: alphanumerics + the symbols our values legitimately -# use (AWS keys are base64 -> / + = ; region/workspace use - _ .). Everything -# else -- whitespace, quotes, $, backtick, ; | & < > \, newlines -- is rejected. -_safe() { - case "$1" in - *[!A-Za-z0-9._/+=-]*) return 1 ;; - *) return 0 ;; - esac -} - -# Authenticate with the GCE instance-identity JWT and echo a Vault token. -# The Vault GCP auth role is read from metadata (a shared role for this -# project); there is no per-VM fallback. -vault_login() { - local addr mount role jwt - addr=$(_meta attributes/vault_addr) || return 1 - mount=$(_meta attributes/vault_auth_mount_gcp) || return 1 - role=$(_meta attributes/vault_role) || return 1 +# Note: values are piped with `printf '%s'`, never `echo` — dash's echo +# interprets backslash escapes (\n, \c, ...) and would corrupt JSON / values. +vault_fetch() { + local addr mount role kv suffix jwt token data + + # 1. Bootstrap config from GCE instance metadata. 
+ addr=$(curl -sf --header "Metadata-Flavor: Google" \ + "http://metadata/computeMetadata/v1/instance/attributes/vault_addr") || return 1 + mount=$(curl -sf --header "Metadata-Flavor: Google" \ + "http://metadata/computeMetadata/v1/instance/attributes/vault_auth_mount_gcp") || return 1 + role=$(curl -sf --header "Metadata-Flavor: Google" \ + "http://metadata/computeMetadata/v1/instance/attributes/vault_role") || return 1 + kv=$(curl -sf --header "Metadata-Flavor: Google" \ + "http://metadata/computeMetadata/v1/instance/attributes/vault_kv_path") || return 1 + suffix=$(curl -sf --header "Metadata-Flavor: Google" \ + "http://metadata/computeMetadata/v1/instance/attributes/vault_kv_common_suffix") || return 1 + + # 2. Authenticate: GCE identity JWT -> Vault token. jwt=$(curl -sf --header "Metadata-Flavor: Google" \ --data-urlencode "audience=http://vault/${role}" \ --data-urlencode "format=full" \ "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || return 1 - - curl -sf \ + token=$(curl -sf \ --data "$(printf '{"role":"%s","jwt":"%s"}' "$role" "$jwt")" \ - "${addr}/v1/${mount}/login" \ - | jq -re .auth.client_token -} + "${addr}/v1/${mount}/login" | jq -re .auth.client_token) || return 1 -# vault_fetch TOKEN KEY [KEY...] -# Reads the shared (common) secret blob once, then exports each requested key -# after validation. Returns 1 if any key is missing or fails validation. -vault_fetch() { - local token addr kv suffix data key value - token=$1 - shift + # 3. Read the shared secret blob. + data=$(curl -sf --header "X-Vault-Token: ${token}" \ + "${addr}/v1/${kv}/node/${suffix}" | jq -ce .data.data) || return 1 - addr=$(_meta attributes/vault_addr) || return 1 - kv=$(_meta attributes/vault_kv_path) || return 1 - suffix=$(_meta attributes/vault_kv_common_suffix) || return 1 + # 4. Extract each variable and validate it against its expected format. 
+ METRICS_FLASHBOTS_WORKSPACE=$(printf '%s' "$data" | jq -re .METRICS_FLASHBOTS_WORKSPACE) || return 1 + printf '%s' "$METRICS_FLASHBOTS_WORKSPACE" | grep -qE '^ws-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$' \ + || { echo "vault_fetch: WORKSPACE is not an AMP workspace id" >&2; return 1; } - data=$(curl -sf --header "X-Vault-Token: ${token}" \ - "${addr}/v1/${kv}/node/${suffix}" \ - | jq -ce .data.data) || return 1 + METRICS_FLASHBOTS_REGION=$(printf '%s' "$data" | jq -re .METRICS_FLASHBOTS_REGION) || return 1 + printf '%s' "$METRICS_FLASHBOTS_REGION" | grep -qE '^[a-z]{2}-[a-z]+-[0-9]+$' \ + || { echo "vault_fetch: REGION is not an AWS region" >&2; return 1; } + + METRICS_FLASHBOTS_ACCESS_KEY=$(printf '%s' "$data" | jq -re .METRICS_FLASHBOTS_ACCESS_KEY) || return 1 + printf '%s' "$METRICS_FLASHBOTS_ACCESS_KEY" | grep -qE '^[A-Z0-9]{20}$' \ + || { echo "vault_fetch: ACCESS_KEY is not an AWS access key id" >&2; return 1; } + + METRICS_FLASHBOTS_SECRET_KEY=$(printf '%s' "$data" | jq -re .METRICS_FLASHBOTS_SECRET_KEY) || return 1 + printf '%s' "$METRICS_FLASHBOTS_SECRET_KEY" | grep -qE '^[A-Za-z0-9/+]{40}$' \ + || { echo "vault_fetch: SECRET_KEY is not an AWS secret key" >&2; return 1; } - for key in "$@"; do - value=$(echo "$data" | jq -re --arg k "$key" '.[$k]') || { - echo "WARNING: vault_fetch: key '${key}' missing in secret" >&2 - return 1 - } - _safe "$value" || { - echo "WARNING: vault_fetch: value for '${key}' has unsafe characters" >&2 - return 1 - } - export "${key}=${value}" - done + # 5. All present and well-formed — publish to the environment for envsubst. + export METRICS_FLASHBOTS_WORKSPACE METRICS_FLASHBOTS_REGION \ + METRICS_FLASHBOTS_ACCESS_KEY METRICS_FLASHBOTS_SECRET_KEY }