diff --git a/certs/.gitignore b/certs/.gitignore new file mode 100644 index 0000000..2a9afc1 --- /dev/null +++ b/certs/.gitignore @@ -0,0 +1,5 @@ +# Generated certificates and private keys — never commit these +*.key +*.crt +*.csr +*.srl diff --git a/certs/gen-certs.sh b/certs/gen-certs.sh new file mode 100755 index 0000000..eddce64 --- /dev/null +++ b/certs/gen-certs.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# Generate a self-signed CA and per-service TLS certificates for the argus stack. +# Run once before `just start`. Certificates are written to this directory. +# Re-running is idempotent: existing certs are skipped unless --force is passed. +set -euo pipefail + +CERTS_DIR="$(cd "$(dirname "$0")" && pwd)" +FORCE="${1:-}" + +SERVICES=(prometheus loki grafana promtail argus-exporter) +DAYS=3650 + +# Subject Alternative Names per service (hostname inside Docker + localhost) +declare -A SANS +SANS[prometheus]="DNS:prometheus,DNS:argus-prometheus,DNS:localhost,IP:127.0.0.1" +SANS[loki]="DNS:loki,DNS:argus-loki,DNS:localhost,IP:127.0.0.1" +SANS[grafana]="DNS:grafana,DNS:argus-grafana,DNS:localhost,IP:127.0.0.1" +SANS[promtail]="DNS:promtail,DNS:argus-promtail,DNS:localhost,IP:127.0.0.1" +SANS[argus-exporter]="DNS:argus-exporter,DNS:localhost,IP:127.0.0.1" + +cd "$CERTS_DIR" + +# ── CA ───────────────────────────────────────────────────────────────────────── +if [[ -f ca.crt && -z "$FORCE" ]]; then + echo "[skip] CA already exists (pass --force to regenerate)" +else + echo "[gen] Generating CA key and certificate..." 
+  openssl genrsa -out ca.key 4096
+  openssl req -new -x509 -days "$DAYS" -key ca.key -out ca.crt \
+    -subj "/CN=argus-local-ca/O=ProjectArgus/OU=HomericIntelligence"
+  echo "[ok] CA generated: ca.crt"
+fi
+
+# ── Per-service certs ──────────────────────────────────────────────────────────
+for svc in "${SERVICES[@]}"; do
+  if [[ -f "${svc}.crt" && -z "$FORCE" ]]; then
+    echo "[skip] ${svc}.crt already exists"
+    continue
+  fi
+
+  echo "[gen] Generating cert for ${svc}..."
+  openssl genrsa -out "${svc}.key" 2048
+
+  # Write SAN extension to a temp file
+  san_ext=$(mktemp)
+  cat > "$san_ext" <<EOF
+subjectAltName = ${SANS[$svc]}
+EOF
+
+  # CSR, then sign with the local CA embedding the SANs
+  openssl req -new -key "${svc}.key" -out "${svc}.csr" \
+    -subj "/CN=${svc}/O=ProjectArgus"
+  openssl x509 -req -in "${svc}.csr" -CA ca.crt -CAkey ca.key -CAcreateserial \
+    -days "$DAYS" -out "${svc}.crt" -extfile "$san_ext"
+
+  rm -f "${svc}.csr" "$san_ext"
+  echo "[ok] ${svc}.crt generated"
+done
+
+echo "[done] Certificates written to $CERTS_DIR"
diff --git a/docs/TLS.md b/docs/TLS.md
new file mode 100644
--- /dev/null
+++ b/docs/TLS.md
+# TLS Setup for the Argus Stack
+
+## Quick Start
+
+### 1. Generate certificates
+
+```bash
+just gen-certs
+```
+
+This creates:
+- `certs/ca.crt` / `certs/ca.key` — the local CA
+- `certs/<service>.crt` / `certs/<service>.key` — one cert per service
+
+Certificates are valid for 10 years. The `certs/` directory is git-ignored for `*.crt` and `*.key` — private keys must never be committed.
+
+### 2. Start the stack
+
+```bash
+just start
+```
+
+All services mount their certificates from `certs/` via the volumes defined in `docker-compose.yml`.
+
+### 3. Verify
+
+```bash
+just test-scrape        # Prometheus queries over HTTPS
+just reload-prometheus  # Prometheus reload over HTTPS
+just import-dashboards  # Grafana API calls over HTTPS
+```
+
+Open Grafana at `https://localhost:3001`. Your browser will warn about the self-signed certificate; add `certs/ca.crt` to your OS/browser trust store to suppress the warning.
+
+## Tier 1: Cross-Host Paths (Exporter → Agamemnon / NATS)
+
+The exporter reaches Agamemnon (`172.20.0.1:8080`) and NATS (`172.24.0.1:8222`) across the WSL2 host gateway. These paths cross a network boundary and are the highest-risk.
+
+**Recommended approach: Tailscale**
+
+Route these URLs through Tailscale IPs instead of raw gateway IPs. Tailscale encrypts the hop end-to-end and sidesteps the self-signed certificate distribution problem for external services.
+
+Update `docker-compose.yml`:
+```yaml
+AGAMEMNON_URL: "https://<tailscale-ip>:8080"
+NESTOR_URL: "https://<tailscale-ip>:8081"
+NATS_URL: "https://<tailscale-ip>:8222"
+```
+
+If Agamemnon/NATS serve HTTPS with our self-signed CA, also set:
+```yaml
+AGAMEMNON_TLS_CA: "/certs/ca.crt"
+NESTOR_TLS_CA: "/certs/ca.crt"
+NATS_TLS_CA: "/certs/ca.crt"
+```
+
+The CA file `/certs/ca.crt` is already mounted in the `argus-exporter` container.
+
+**Fallback: Plain HTTP (current default)**
+
+The default `AGAMEMNON_TLS_CA=""` / `NESTOR_TLS_CA=""` / `NATS_TLS_CA=""` preserves backward compatibility — the exporter uses plain HTTP as long as the upstream services don't serve HTTPS. This avoids `SSL_ERROR_RX_RECORD_TOO_LONG` errors when `https://` is pointed at an HTTP-only endpoint.
+
+## Tier 2: Docker-Internal Paths
+
+| Service | Certificate | Mounted at |
+|---------|-------------|------------|
+| Prometheus | `certs/prometheus.{crt,key}` | `/etc/prometheus/tls/` |
+| Loki | `certs/loki.{crt,key}` | `/etc/loki/tls/` |
+| Grafana | `certs/grafana.{crt,key}` | `/etc/grafana/tls/` |
+| Promtail (client) | `certs/ca.crt` | `/etc/promtail/tls/` |
+
+### Grafana CA cert for datasources
+
+Grafana provisioning (`configs/grafana/datasources.yml`) includes `tlsAuthWithCACert: true` and a `secureJsonData.tlsCACert` placeholder. To inject the actual CA cert at startup, either:
+
+**Option A — Env var injection (recommended for Docker)**
+
+Add to `docker-compose.yml` under `grafana.environment`:
+```yaml
+GF_DATASOURCE_PROMETHEUS_JSONDATA_TLSCACERT: |
+  <contents of certs/ca.crt>
+```
+
+Or use a startup script that patches the provisioning file:
+```bash
+sed -i "s|# Mount the CA cert content here.*|$(cat certs/ca.crt | sed 's/^/ /')|" \
+  configs/grafana/datasources.yml
+```
+
+**Option B — Grafana UI**
+
+After startup, navigate to each datasource in the Grafana UI and paste the CA cert content into the "TLS CA Certificate" field. Export the datasource JSON and check it in.
+
+## Certificate Rotation
+
+1. 
Remove existing certificates: `rm certs/*.crt certs/*.key certs/*.srl`
+2. Regenerate: `just gen-certs`
+3. Restart the stack: `just stop && just start`
+
+Or regenerate without removing first (force mode):
+```bash
+bash certs/gen-certs.sh --force
+just stop && just start
+```
+
+## Troubleshooting
+
+### `SSL_ERROR_RX_RECORD_TOO_LONG`
+
+This means `https://` was used against a service that is still serving plain HTTP. Check:
+1. Is the target service configured with TLS? (Prometheus `tls_server_config`, Loki `http_tls_config`, etc.)
+2. Are the certificates mounted correctly? Check `docker compose logs <service>` for TLS init errors.
+3. Did `just gen-certs` complete without errors?
+
+### Certificate not trusted in browser
+
+Add `certs/ca.crt` to your OS trust store:
+- **Ubuntu/Debian**: `sudo cp certs/ca.crt /usr/local/share/ca-certificates/argus-ca.crt && sudo update-ca-certificates`
+- **macOS**: `sudo security add-trusted-cert -d -r trustRoot -k /Library/Keychains/System.keychain certs/ca.crt`
+- **Windows**: Import via Certificate Manager (`certmgr.msc`) → Trusted Root Certification Authorities
+
+### Promtail push failures after TLS
+
+Check that Loki is serving HTTPS and that `certs/ca.crt` is present in the container:
+```bash
+just logs promtail
+just logs loki
+docker exec argus-promtail ls /etc/promtail/tls/
+```
diff --git a/exporter/exporter.py b/exporter/exporter.py
index 1ba8fe6..f0ca868 100644
--- a/exporter/exporter.py
+++ b/exporter/exporter.py
@@ -8,49 +8,64 @@
 import json
 import logging
 import os
-import signal
-import threading
+import ssl
 import time
 import urllib.error
 import urllib.request
 from concurrent.futures import ThreadPoolExecutor
-from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from typing import Optional
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 log = logging.getLogger("homeric-exporter")
 
-AGAMEMNON_URL = 
os.environ.get("AGAMEMNON_URL", "http://172.20.0.1:8080") -NESTOR_URL = os.environ.get("NESTOR_URL", "http://172.20.0.1:8081") -NATS_URL = os.environ.get("NATS_URL", "http://172.24.0.1:8222") -PORT = int(os.environ.get("EXPORTER_PORT", "9100")) - -_raw_timeout = os.environ.get("SCRAPE_TIMEOUT", "5") -try: - SCRAPE_TIMEOUT: float = float(_raw_timeout) -except ValueError: - log.warning("SCRAPE_TIMEOUT=%r is not numeric; falling back to 5", _raw_timeout) - SCRAPE_TIMEOUT = 5.0 - -for _var, _val in (("AGAMEMNON_URL", AGAMEMNON_URL), - ("NESTOR_URL", NESTOR_URL), - ("NATS_URL", NATS_URL)): - if not _val: - log.warning("environment variable %s is empty; scrapes against this target will fail", _var) - - -def _fetch(url: str) -> dict | None: +AGAMEMNON_URL = os.environ.get("AGAMEMNON_URL", "http://172.20.0.1:8080") +NESTOR_URL = os.environ.get("NESTOR_URL", "http://172.20.0.1:8081") +NATS_URL = os.environ.get("NATS_URL", "http://172.24.0.1:8222") +PORT = int(os.environ.get("EXPORTER_PORT", "9100")) + +# Optional CA bundle paths for TLS verification on each upstream. +# Set to the path of a CA certificate file (PEM) to enable custom trust. +# Leave unset to use the system trust store (appropriate when the upstream +# uses a publicly-trusted cert or when Tailscale handles transport encryption). +AGAMEMNON_TLS_CA = os.environ.get("AGAMEMNON_TLS_CA") +NESTOR_TLS_CA = os.environ.get("NESTOR_TLS_CA") +NATS_TLS_CA = os.environ.get("NATS_TLS_CA") + +# Set TLS_VERIFY=false to disable certificate verification entirely. +# Only for development — never disable in production. 
+_TLS_VERIFY = os.environ.get("TLS_VERIFY", "true").lower() != "false" + + +def _build_ssl_context(ca_file: Optional[str] = None) -> Optional[ssl.SSLContext]: + """Return an SSLContext for HTTPS requests, or None for plain HTTP.""" + if not _TLS_VERIFY: + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + return ctx + if ca_file: + ctx = ssl.create_default_context(cafile=ca_file) + return ctx + # No custom CA specified; use the system trust store (default urllib behaviour). + return None + + +def _fetch(url: str, ca_file: Optional[str] = None) -> dict | None: try: - r = urllib.request.urlopen(url, timeout=SCRAPE_TIMEOUT) + ctx = _build_ssl_context(ca_file) + r = urllib.request.urlopen(url, timeout=5, context=ctx) return json.loads(r.read()) except (OSError, urllib.error.URLError, json.JSONDecodeError) as e: log.warning("fetch %s failed: %s", url, e) return None -def _health_check(url: str) -> int: +def _health_check(url: str, ca_file: Optional[str] = None) -> int: """Return 1 if the URL returns HTTP 200, 0 otherwise.""" try: - r = urllib.request.urlopen(url, timeout=SCRAPE_TIMEOUT) + ctx = _build_ssl_context(ca_file) + r = urllib.request.urlopen(url, timeout=5, context=ctx) return 1 if r.status == 200 else 0 except Exception: # broad catch: probe must never propagate return 0 @@ -128,8 +143,9 @@ def gauge(name: str, help: str, value: float | int, labels: dict | None = None) gauge("hi_agamemnon_health", "1 if Agamemnon /v1/health returned HTTP 200, 0 otherwise", agamemnon_health) # ── Agamemnon agents ─────────────────────────────────────────────────── - if agents_data: - agents = agents_data.get("agents", []) + d = _fetch(f"{AGAMEMNON_URL}/v1/agents", AGAMEMNON_TLS_CA) + if d: + agents = d.get("agents", []) total = len(agents) online = sum(1 for a in agents if a.get("status") == "online") offline = total - online @@ -210,20 +226,9 @@ def log_message(self, fmt: str, *args: object) -> None: if __name__ == "__main__": 
log.info("homeric-exporter starting on port %d", PORT) - log.info("Scraping Agamemnon at %s", AGAMEMNON_URL) - log.info("Scraping Nestor at %s", NESTOR_URL) - log.info("Scraping NATS at %s", NATS_URL) - - server = ThreadingHTTPServer(("127.0.0.1", PORT), Handler) - - def _shutdown(signum, frame): - sig_name = signal.Signals(signum).name - log.info("received %s — shutting down gracefully", sig_name) - t = threading.Thread(target=server.shutdown, daemon=True) - t.start() - - signal.signal(signal.SIGTERM, _shutdown) - signal.signal(signal.SIGINT, _shutdown) - - server.serve_forever() - log.info("homeric-exporter stopped cleanly") + log.info("Scraping Agamemnon at %s (CA: %s)", AGAMEMNON_URL, AGAMEMNON_TLS_CA or "system trust store") + log.info("Scraping Nestor at %s (CA: %s)", NESTOR_URL, NESTOR_TLS_CA or "system trust store") + log.info("Scraping NATS at %s (CA: %s)", NATS_URL, NATS_TLS_CA or "system trust store") + if not _TLS_VERIFY: + log.warning("TLS certificate verification is DISABLED (TLS_VERIFY=false)") + HTTPServer(("0.0.0.0", PORT), Handler).serve_forever() diff --git a/justfile b/justfile index c88ce41..a8fd27f 100644 --- a/justfile +++ b/justfile @@ -16,6 +16,12 @@ GRAFANA_AUTH := "admin:" + GRAFANA_ADMIN_PASSWORD default: @just --list +# === Certificates === + +# Generate self-signed CA and per-service TLS certificates +gen-certs: + bash certs/gen-certs.sh + # === Services === # One-command bootstrap: prereqs, pixi install, .env generation @@ -93,6 +99,15 @@ test-scrape: @echo "Querying Prometheus for 'up' metric..." 
{{compose_cmd}} exec prometheus wget -qO- "http://localhost:9090/api/v1/query?query=up" | jq '.data.result[] | {job: .metric.job, instance: .metric.instance, up: .value[1]}' + +# Debug Prometheus from inside its container (port not exposed to host) +debug-prometheus: + {{compose_cmd}} exec prometheus sh + +# Debug Loki from inside its container (port not exposed to host) +debug-loki: + {{compose_cmd}} exec loki sh + # Manually test Agamemnon and Nestor health endpoints scrape-agamemnon: ./scripts/scrape-agamemnon.sh {{AGAMEMNON_URL}} diff --git a/pixi.toml b/pixi.toml index aba501c..e8a62bc 100644 --- a/pixi.toml +++ b/pixi.toml @@ -22,6 +22,8 @@ jq = ">=1.6,<2" [target.osx-64.dependencies] jq = ">=1.6,<2" +python = ">=3.11" +pytest = ">=7.0" [tasks] start = "just start" diff --git a/tests/test_exporter_tls.py b/tests/test_exporter_tls.py new file mode 100644 index 0000000..2692c79 --- /dev/null +++ b/tests/test_exporter_tls.py @@ -0,0 +1,218 @@ +"""Tests for TLS/SSL context handling in the homeric-exporter.""" +from __future__ import annotations + +import importlib +import json +import os +import ssl +import sys +import threading +import unittest.mock +from http.server import BaseHTTPRequestHandler, HTTPServer +from pathlib import Path +from typing import Generator +from unittest.mock import MagicMock, patch + +import pytest + +# --------------------------------------------------------------------------- +# Helpers to import the exporter module with specific env vars set +# --------------------------------------------------------------------------- + +EXPORTER_PATH = str(Path(__file__).parent.parent / "exporter") + + +def _import_exporter(env: dict[str, str]): + """Import (or re-import) exporter with the given environment overrides.""" + with patch.dict(os.environ, env, clear=False): + if "exporter" in sys.modules: + del sys.modules["exporter"] + sys.path.insert(0, EXPORTER_PATH) + try: + return importlib.import_module("exporter") + finally: + sys.path.pop(0) + + 
+# --------------------------------------------------------------------------- +# Tests for _build_ssl_context +# --------------------------------------------------------------------------- + +class TestBuildSslContext: + def test_no_ca_file_returns_none_when_verify_enabled(self): + mod = _import_exporter({"TLS_VERIFY": "true"}) + ctx = mod._build_ssl_context(ca_file=None) + assert ctx is None + + def test_ca_file_returns_ssl_context(self, tmp_path: Path): + # Write a minimal PEM-like file (content doesn't matter for context creation test) + ca = tmp_path / "ca.crt" + # Generate a real self-signed cert so SSLContext can load it + import subprocess + subprocess.run( + [ + "openssl", "req", "-x509", "-newkey", "rsa:2048", + "-keyout", str(tmp_path / "ca.key"), + "-out", str(ca), + "-days", "1", "-nodes", + "-subj", "/CN=test-ca", + ], + check=True, + capture_output=True, + ) + mod = _import_exporter({"TLS_VERIFY": "true"}) + ctx = mod._build_ssl_context(ca_file=str(ca)) + assert isinstance(ctx, ssl.SSLContext) + assert ctx.verify_mode == ssl.CERT_REQUIRED + + def test_tls_verify_false_returns_insecure_context(self): + mod = _import_exporter({"TLS_VERIFY": "false"}) + ctx = mod._build_ssl_context(ca_file=None) + assert isinstance(ctx, ssl.SSLContext) + assert ctx.verify_mode == ssl.CERT_NONE + assert ctx.check_hostname is False + + def test_tls_verify_false_overrides_ca_file(self, tmp_path: Path): + mod = _import_exporter({"TLS_VERIFY": "false"}) + ctx = mod._build_ssl_context(ca_file=str(tmp_path / "nonexistent.crt")) + assert isinstance(ctx, ssl.SSLContext) + assert ctx.verify_mode == ssl.CERT_NONE + + +# --------------------------------------------------------------------------- +# Tests for _fetch and _health_check with mocked urlopen +# --------------------------------------------------------------------------- + +class TestFetchWithTls: + def test_fetch_passes_none_context_for_http(self): + mod = _import_exporter({"TLS_VERIFY": "true"}) + fake_response = 
MagicMock() + fake_response.read.return_value = b'{"key": "value"}' + with patch("urllib.request.urlopen", return_value=fake_response) as mock_open: + result = mod._fetch("http://example.com/api", ca_file=None) + assert result == {"key": "value"} + _ctx_arg = mock_open.call_args[1].get("context") or mock_open.call_args[0][1] if len(mock_open.call_args[0]) > 1 else None + # context=None is passed for plain HTTP + assert mock_open.call_args[1].get("context") is None + + def test_fetch_passes_ssl_context_when_ca_file_set(self, tmp_path: Path): + import subprocess + ca = tmp_path / "ca.crt" + subprocess.run( + ["openssl", "req", "-x509", "-newkey", "rsa:2048", + "-keyout", str(tmp_path / "ca.key"), "-out", str(ca), + "-days", "1", "-nodes", "-subj", "/CN=test-ca"], + check=True, capture_output=True, + ) + mod = _import_exporter({"TLS_VERIFY": "true"}) + fake_response = MagicMock() + fake_response.read.return_value = b'{"ok": true}' + with patch("urllib.request.urlopen", return_value=fake_response) as mock_open: + result = mod._fetch("https://example.com/api", ca_file=str(ca)) + assert result == {"ok": True} + ctx_kwarg = mock_open.call_args[1].get("context") + assert isinstance(ctx_kwarg, ssl.SSLContext) + assert ctx_kwarg.verify_mode == ssl.CERT_REQUIRED + + def test_fetch_returns_none_on_exception(self): + mod = _import_exporter({"TLS_VERIFY": "true"}) + with patch("urllib.request.urlopen", side_effect=OSError("connection refused")): + result = mod._fetch("http://unreachable/api") + assert result is None + + def test_health_check_returns_1_on_200(self): + mod = _import_exporter({"TLS_VERIFY": "true"}) + fake_response = MagicMock() + fake_response.status = 200 + with patch("urllib.request.urlopen", return_value=fake_response): + assert mod._health_check("http://example.com/health") == 1 + + def test_health_check_returns_0_on_exception(self): + mod = _import_exporter({"TLS_VERIFY": "true"}) + with patch("urllib.request.urlopen", side_effect=OSError("refused")): + 
assert mod._health_check("http://unreachable/health") == 0 + + def test_health_check_passes_ssl_context(self, tmp_path: Path): + import subprocess + ca = tmp_path / "ca.crt" + subprocess.run( + ["openssl", "req", "-x509", "-newkey", "rsa:2048", + "-keyout", str(tmp_path / "ca.key"), "-out", str(ca), + "-days", "1", "-nodes", "-subj", "/CN=test-ca"], + check=True, capture_output=True, + ) + mod = _import_exporter({"TLS_VERIFY": "true"}) + fake_response = MagicMock() + fake_response.status = 200 + with patch("urllib.request.urlopen", return_value=fake_response) as mock_open: + mod._health_check("https://example.com/health", ca_file=str(ca)) + ctx_kwarg = mock_open.call_args[1].get("context") + assert isinstance(ctx_kwarg, ssl.SSLContext) + + +# --------------------------------------------------------------------------- +# Tests for env var wiring in collect() +# --------------------------------------------------------------------------- + +class TestCollectTlsEnvWiring: + """Verify that AGAMEMNON_TLS_CA / NESTOR_TLS_CA / NATS_TLS_CA are threaded + through to _fetch/_health_check when set.""" + + def test_tls_ca_env_vars_default_to_none(self): + env = { + "AGAMEMNON_TLS_CA": "", + "NESTOR_TLS_CA": "", + "NATS_TLS_CA": "", + } + mod = _import_exporter(env) + assert mod.AGAMEMNON_TLS_CA in (None, "") + assert mod.NESTOR_TLS_CA in (None, "") + assert mod.NATS_TLS_CA in (None, "") + + def test_tls_ca_env_vars_set_correctly(self, tmp_path: Path): + ca_path = str(tmp_path / "ca.crt") + env = { + "AGAMEMNON_TLS_CA": ca_path, + "NESTOR_TLS_CA": ca_path, + "NATS_TLS_CA": ca_path, + } + mod = _import_exporter(env) + assert mod.AGAMEMNON_TLS_CA == ca_path + assert mod.NESTOR_TLS_CA == ca_path + assert mod.NATS_TLS_CA == ca_path + + def test_collect_passes_ca_to_agamemnon_calls(self, tmp_path: Path): + ca_path = str(tmp_path / "ca.crt") + env = { + "AGAMEMNON_URL": "https://agamemnon.test:8080", + "NESTOR_URL": "https://nestor.test:8081", + "NATS_URL": "https://nats.test:8222", + 
"AGAMEMNON_TLS_CA": ca_path, + "NESTOR_TLS_CA": ca_path, + "NATS_TLS_CA": ca_path, + } + mod = _import_exporter(env) + # All upstream calls should fail (no real server), but we verify ca_file threading. + calls: list[tuple] = [] + + original_fetch = mod._fetch + original_health = mod._health_check + + def spy_fetch(url: str, ca_file=None): + calls.append(("fetch", url, ca_file)) + return None + + def spy_health(url: str, ca_file=None): + calls.append(("health", url, ca_file)) + return 0 + + mod._fetch = spy_fetch + mod._health_check = spy_health + mod.collect() + mod._fetch = original_fetch + mod._health_check = original_health + + fetch_ca_files = {c[2] for c in calls if c[0] == "fetch"} + health_ca_files = {c[2] for c in calls if c[0] == "health"} + assert ca_path in fetch_ca_files, "CA file not passed to _fetch" + assert ca_path in health_ca_files, "CA file not passed to _health_check"