diff --git a/.github/workflows/_required.yml b/.github/workflows/_required.yml index a7b35b0..5fe39af 100644 --- a/.github/workflows/_required.yml +++ b/.github/workflows/_required.yml @@ -163,6 +163,9 @@ jobs: unit-tests: name: unit-tests runs-on: ubuntu-24.04 + timeout-minutes: 15 + permissions: + contents: read steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Validate all YAML config files @@ -180,6 +183,27 @@ jobs: fi # Validate all YAML files with yamllint (the canonical config-repo unit test) find . -path './.git' -prune -o \( -name "*.yml" -o -name "*.yaml" \) -print | xargs yamllint -c .yamllint.yaml + - name: Cache pixi environment + uses: actions/cache@v4 + with: + path: ~/.pixi + key: pixi-${{ runner.os }}-${{ hashFiles('pixi.lock') }} + restore-keys: pixi-${{ runner.os }}- + - name: Setup pixi + uses: prefix-dev/setup-pixi@v0.9.5 + with: + pixi-version: v0.67.2 + cache: false + - name: pixi install (locked) + run: | + if [ -f pixi.lock ]; then + pixi install --locked + else + echo "::warning::pixi.toml present but pixi.lock missing — running unlocked" + pixi install + fi + - name: Run pytest + run: pixi run pytest tests/ -v integration-tests: name: integration-tests diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8696b88..707d007 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -155,7 +155,7 @@ jobs: python-version: "3.11" - name: Install test dependencies - run: pip install pytest + run: pip install pytest pytest-cov pyyaml - name: Run tests run: python -m pytest tests/ -v diff --git a/docs/tls-setup.md b/docs/tls-setup.md index a31e58d..52e2812 100644 --- a/docs/tls-setup.md +++ b/docs/tls-setup.md @@ -1,6 +1,7 @@ # TLS Setup Runbook — ProjectArgus -This document describes how to enable and maintain TLS for all inter-service communication in the argus observability stack. +This document describes how to enable and maintain TLS for all inter-service +communication in the argus observability stack. ## Overview @@ -8,7 +9,7 @@ The stack uses a two-tier TLS strategy: | Tier | Path | Mechanism | |------|------|-----------| -| 1 (high priority) | exporter → Agamemnon/NATS/Nestor | Tailscale transport encryption (cross-host WSL2 boundary) | +| 1 (high priority) | exporter → Agamemnon/NATS/Nestor | Tailscale transport encryption | | 2 (best practice) | Docker-internal services | Self-signed CA + per-service certificates | ## Quick Start @@ -20,10 +21,12 @@ just gen-certs ``` This runs `certs/gen-certs.sh`, which creates: + - `certs/ca.crt` / `certs/ca.key` — local Certificate Authority - `certs/.crt` / `certs/.key` — one cert per service -Certificates are valid for 10 years. The `certs/` directory is git-ignored for `*.crt` and `*.key` — private keys must never be committed. +Certificates are valid for 10 years. The `certs/` directory is git-ignored for +`*.crt` and `*.key` — private keys must never be committed. ### 2. Start the stack @@ -31,7 +34,8 @@ Certificates are valid for 10 years. The `certs/` directory is git-ignored for ` just start ``` -All services mount their certificates from `certs/` via the volumes defined in `docker-compose.yml`. +All services mount their certificates from `certs/` via the volumes defined in +`docker-compose.yml`. ### 3. Verify @@ -41,17 +45,24 @@ just reload-prometheus # Prometheus reload over HTTPS just import-dashboards # Grafana API calls over HTTPS ``` -Open Grafana at `https://localhost:3001`. Your browser will warn about the self-signed certificate; add `certs/ca.crt` to your OS/browser trust store to suppress the warning. +Open Grafana at `https://localhost:3001`. Your browser will warn about the +self-signed certificate; add `certs/ca.crt` to your OS/browser trust store to +suppress the warning. ## Tier 1: Cross-Host Paths (Exporter → Agamemnon / NATS) -The exporter reaches Agamemnon (`172.20.0.1:8080`) and NATS (`172.24.0.1:8222`) across the WSL2 host gateway. These paths cross a network boundary and are the highest-risk. +The exporter reaches Agamemnon (`172.20.0.1:8080`) and NATS +(`172.24.0.1:8222`) across the WSL2 host gateway. These paths cross a network +boundary and are the highest-risk. **Recommended approach: Tailscale** -Route these URLs through Tailscale IPs instead of raw gateway IPs. Tailscale encrypts the hop end-to-end and sidesteps the self-signed certificate distribution problem for external services. +Route these URLs through Tailscale IPs instead of raw gateway IPs. Tailscale +encrypts the hop end-to-end and sidesteps the self-signed certificate +distribution problem for external services. Update `docker-compose.yml`: + ```yaml AGAMEMNON_URL: "https://:8080" NESTOR_URL: "https://:8081" @@ -59,17 +70,22 @@ NATS_URL: "https://:8222" ``` If Agamemnon/NATS serve HTTPS with our self-signed CA, also set: + ```yaml AGAMEMNON_TLS_CA: "/certs/ca.crt" NESTOR_TLS_CA: "/certs/ca.crt" NATS_TLS_CA: "/certs/ca.crt" ``` -The CA file `/certs/ca.crt` is already mounted in the `argus-exporter` container. +The CA file `/certs/ca.crt` is already mounted in the `argus-exporter` +container. **Fallback: Plain HTTP (current default)** -The default `AGAMEMNON_TLS_CA=""` / `NESTOR_TLS_CA=""` / `NATS_TLS_CA=""` preserves backward compatibility — the exporter uses plain HTTP as long as the upstream services don't serve HTTPS. This avoids `SSL_ERROR_RX_RECORD_TOO_LONG` errors when `https://` is pointed at an HTTP-only endpoint. +The default `AGAMEMNON_TLS_CA=""` / `NESTOR_TLS_CA=""` / `NATS_TLS_CA=""` +preserves backward compatibility — the exporter uses plain HTTP as long as the +upstream services don't serve HTTPS. This avoids `SSL_ERROR_RX_RECORD_TOO_LONG` +errors when `https://` is pointed at an HTTP-only endpoint. ## Tier 2: Docker-Internal Paths @@ -82,25 +98,31 @@ The default `AGAMEMNON_TLS_CA=""` / `NESTOR_TLS_CA=""` / `NATS_TLS_CA=""` preser ### Grafana CA cert for datasources -Grafana provisioning (`configs/grafana/datasources.yml`) includes `tlsAuthWithCACert: true` and a `secureJsonData.tlsCACert` placeholder. To inject the actual CA cert at startup, either: +Grafana provisioning (`configs/grafana/datasources.yml`) includes +`tlsAuthWithCACert: true` and a `secureJsonData.tlsCACert` placeholder. To +inject the actual CA cert at startup, either: **Option A — Env var injection (recommended for Docker)** Add to `docker-compose.yml` under `grafana.environment`: + ```yaml GF_DATASOURCE_PROMETHEUS_JSONDATA_TLSCACERT: | ``` Or use a startup script that patches the provisioning file: + ```bash -sed -i "s|# Mount the CA cert content here.*|$(cat certs/ca.crt | sed 's/^/ /')|" \ - configs/grafana/datasources.yml +sed -i "s|# Mount the CA cert content here.*|$(cat certs/ca.crt \ + | sed 's/^/ /')|" configs/grafana/datasources.yml ``` **Option B — Grafana UI** -After startup, navigate to each datasource in the Grafana UI and paste the CA cert content into the "TLS CA Certificate" field. Export the datasource JSON and check it in. +After startup, navigate to each datasource in the Grafana UI and paste the CA +cert content into the "TLS CA Certificate" field. Export the datasource JSON +and check it in. ## Certificate Rotation @@ -109,6 +131,7 @@ After startup, navigate to each datasource in the Grafana UI and paste the CA ce 3. Restart the stack: `just stop && just start` Or regenerate without removing first (force mode): + ```bash bash certs/gen-certs.sh --force just stop && just start @@ -118,21 +141,31 @@ just stop && just start ### `SSL_ERROR_RX_RECORD_TOO_LONG` -This means `https://` was used against a service that is still serving plain HTTP. Check: -1. Is the target service configured with TLS? (Prometheus `tls_server_config`, Loki `http_tls_config`, etc.) -2. Are the certificates mounted correctly? Check `docker compose logs ` for TLS init errors. +This means `https://` was used against a service that is still serving plain +HTTP. Check: + +1. Is the target service configured with TLS? (Prometheus `tls_server_config`, + Loki `http_tls_config`, etc.) +2. Are the certificates mounted correctly? Check + `docker compose logs ` for TLS init errors. 3. Did `just gen-certs` complete without errors? ### Certificate not trusted in browser Add `certs/ca.crt` to your OS trust store: -- **Ubuntu/Debian**: `sudo cp certs/ca.crt /usr/local/share/ca-certificates/argus-ca.crt && sudo update-ca-certificates` -- **macOS**: `sudo security add-trusted-cert -d -r trustRoot -k /Library/Keychains/System.keychain certs/ca.crt` -- **Windows**: Import via Certificate Manager (`certmgr.msc`) → Trusted Root Certification Authorities + +- **Ubuntu/Debian**: + `sudo cp certs/ca.crt /usr/local/share/ca-certificates/argus-ca.crt && sudo update-ca-certificates` +- **macOS**: + `sudo security add-trusted-cert -d -r trustRoot -k /Library/Keychains/System.keychain certs/ca.crt` +- **Windows**: Import via Certificate Manager (`certmgr.msc`) → + Trusted Root Certification Authorities ### Promtail push failures after TLS -Check that Loki is serving HTTPS and that `certs/ca.crt` is present in the container: +Check that Loki is serving HTTPS and that `certs/ca.crt` is present in the +container: + ```bash just logs promtail just logs loki diff --git a/exporter/exporter.py b/exporter/exporter.py index 7792a12..7f1a052 100644 --- a/exporter/exporter.py +++ b/exporter/exporter.py @@ -116,13 +116,13 @@ def gauge(name: str, help: str, value: float | int, labels: dict | None = None) # ── Parallelise all independent upstream fetches ────────────────────── with ThreadPoolExecutor(max_workers=7) as pool: - f_agamemnon_health = pool.submit(_health_check, f"{AGAMEMNON_URL}/v1/health") - f_agents = pool.submit(_fetch, f"{AGAMEMNON_URL}/v1/agents") - f_tasks = pool.submit(_fetch, f"{AGAMEMNON_URL}/v1/tasks") - f_nestor_health = pool.submit(_health_check, f"{NESTOR_URL}/v1/health") - f_nestor_stats = pool.submit(_fetch, f"{NESTOR_URL}/v1/research/stats") - f_nats_varz = pool.submit(_fetch, f"{NATS_URL}/varz") - f_nats_jsz = pool.submit(_fetch, f"{NATS_URL}/jsz") + f_agamemnon_health = pool.submit(_health_check, f"{AGAMEMNON_URL}/v1/health", AGAMEMNON_TLS_CA) + f_agents = pool.submit(_fetch, f"{AGAMEMNON_URL}/v1/agents", AGAMEMNON_TLS_CA) + f_tasks = pool.submit(_fetch, f"{AGAMEMNON_URL}/v1/tasks", AGAMEMNON_TLS_CA) + f_nestor_health = pool.submit(_health_check, f"{NESTOR_URL}/v1/health", NESTOR_TLS_CA) + f_nestor_stats = pool.submit(_fetch, f"{NESTOR_URL}/v1/research/stats", NESTOR_TLS_CA) + f_nats_varz = pool.submit(_fetch, f"{NATS_URL}/varz", NATS_TLS_CA) + f_nats_jsz = pool.submit(_fetch, f"{NATS_URL}/jsz", NATS_TLS_CA) # Resolve all futures before building metric lines agamemnon_health = f_agamemnon_health.result() agents_data = f_agents.result() @@ -224,11 +224,11 @@ def log_message(self, fmt: str, *args: object) -> None: log.debug(fmt, *args) -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover log.info("homeric-exporter starting on port %d", PORT) log.info("Scraping Agamemnon at %s (CA: %s)", AGAMEMNON_URL, AGAMEMNON_TLS_CA or "system trust store") log.info("Scraping Nestor at %s (CA: %s)", NESTOR_URL, NESTOR_TLS_CA or "system trust store") log.info("Scraping NATS at %s (CA: %s)", NATS_URL, NATS_TLS_CA or "system trust store") if not _TLS_VERIFY: log.warning("TLS certificate verification is DISABLED (TLS_VERIFY=false)") - HTTPServer(("0.0.0.0", PORT), Handler).serve_forever() + HTTPServer(("0.0.0.0", PORT), Handler).serve_forever() # nosec B104 diff --git a/justfile b/justfile index 0346605..bc88ab2 100644 --- a/justfile +++ b/justfile @@ -78,9 +78,11 @@ dev: @./scripts/dev-watch.sh # Run local test suite +test: + pixi run test # Run local test suite with coverage -test: +test-unit: pixi run test-unit # Tail logs for a specific service (e.g. just logs prometheus) diff --git a/pixi.toml b/pixi.toml index 603f416..02298f8 100644 --- a/pixi.toml +++ b/pixi.toml @@ -25,10 +25,11 @@ jq = ">=1.6,<2" jq = ">=1.6,<2" [tasks] -start = "just start" -stop = "just stop" -status = "just status" -test = "python -m pytest tests/ -v" +start = "just start" +stop = "just stop" +status = "just status" +test = "python -m pytest tests/ -v" +test-unit = "python -m pytest tests/ -v --cov=exporter --cov-report=term-missing --cov-report=xml" [feature.lint.dependencies] python = ">=3.11" diff --git a/pyproject.toml b/pyproject.toml index 5b9609e..899fb7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,7 @@ [tool.pytest.ini_options] -addopts = [ - "--cov=exporter", - "--cov-report=term-missing", - "--cov-report=html", - "--cov-report=xml", -] +testpaths = ["tests"] +pythonpath = ["."] +addopts = "-v --strict-markers --cov=exporter --cov-report=term-missing --cov-report=html --cov-report=xml" [tool.coverage.run] source = ["exporter"] diff --git a/tests/test_exporter.py b/tests/test_exporter.py index 5860927..2b3dd77 100644 --- a/tests/test_exporter.py +++ b/tests/test_exporter.py @@ -7,7 +7,6 @@ from __future__ import annotations import contextlib -import io import json import sys import threading @@ -126,12 +125,12 @@ def _patch_collect( agents_data = agents_data or {} tasks_data = tasks_data or {} - def _fake_health_check(url: str) -> int: + def _fake_health_check(url: str, ca_file=None) -> int: if "agamemnon" in url or "8080" in url: return agamemnon_health return nestor_health - def _fake_fetch(url: str) -> dict | None: + def _fake_fetch(url: str, ca_file=None) -> dict | None: if "/v1/agents" in url: return agents_data if "/v1/tasks" in url: diff --git a/tests/test_exporter_tls.py b/tests/test_exporter_tls.py index 2692c79..1030ffe 100644 --- a/tests/test_exporter_tls.py +++ b/tests/test_exporter_tls.py @@ -2,19 +2,12 @@ from __future__ import annotations import importlib -import json import os import ssl import sys -import threading -import unittest.mock -from http.server import BaseHTTPRequestHandler, HTTPServer from pathlib import Path -from typing import Generator from unittest.mock import MagicMock, patch -import pytest - # --------------------------------------------------------------------------- # Helpers to import the exporter module with specific env vars set # --------------------------------------------------------------------------- diff --git a/tests/test_htpasswd_security.py b/tests/test_htpasswd_security.py index 2ad824b..135eb57 100644 --- a/tests/test_htpasswd_security.py +++ b/tests/test_htpasswd_security.py @@ -3,7 +3,6 @@ secrets/htpasswd must be generated at runtime from environment variables. """ import os -import re import stat import subprocess import tempfile diff --git a/tests/test_jetstream_consumer.py b/tests/test_jetstream_consumer.py index f0efa43..d6631f5 100644 --- a/tests/test_jetstream_consumer.py +++ b/tests/test_jetstream_consumer.py @@ -8,11 +8,10 @@ import json import sys import threading -import time +from pathlib import Path import types import urllib.request -from io import BytesIO -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest @@ -61,9 +60,10 @@ def _stub_nats(): @pytest.fixture(autouse=True, scope="module") def consumer(): """Import the consumer module once after the nats stub is in place.""" + _consumer_path = Path(__file__).parent.parent / "jetstream-consumer" / "consumer.py" spec = importlib.util.spec_from_file_location( "consumer", - "/home/mvillmow/Projects/ProjectArgus/.worktrees/issue-4/jetstream-consumer/consumer.py", + _consumer_path, ) mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) diff --git a/tests/test_justfile.py b/tests/test_justfile.py index 3d815ba..e7c3bc7 100644 --- a/tests/test_justfile.py +++ b/tests/test_justfile.py @@ -1,7 +1,6 @@ """ Assert that the justfile contains no hardcoded Grafana credentials. """ -import re import unittest from pathlib import Path diff --git a/tests/unit/scripts/test_check_version_consistency.py b/tests/unit/scripts/test_check_version_consistency.py index fd1dacd..3cc12f3 100644 --- a/tests/unit/scripts/test_check_version_consistency.py +++ b/tests/unit/scripts/test_check_version_consistency.py @@ -3,7 +3,6 @@ from __future__ import annotations import importlib.util -import sys from pathlib import Path import pytest