From 22839989ef95f1f5e1b830594dace51215478bcd Mon Sep 17 00:00:00 2001 From: monte Date: Tue, 2 Jun 2026 17:37:09 +0200 Subject: [PATCH 1/3] feat(health): add /ready readiness probe, wire as compose healthcheck gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add GET /ready that checks DB connectivity (SELECT 1) and runtime config volume writable; returns 200 {"status":"ready",...} or 503 with per-check breakdown so operators can tell which dependency is down - Keep GET /health as a pure liveness probe (process-alive, no deps) - Suppress GET /ready access-log noise in _HealthcheckAccessFilter - Switch Docker Compose backend healthcheck from /health to /ready so the entire dependent stack (frontend, haproxy, coraza, log-shipper) only starts once the backend can actually serve real traffic - Fix dead GUARD_PROXY_RUNTIME_DIR env var in compose → RUNTIME_GENERATED_CONFIG_ROOT (value unchanged; previous name was never read by the Python settings layer) - 12 tests (7 integration + 2 unit new, 3 existing unit retained) Closes #128 Co-Authored-By: Claude Opus 4.8 --- README.architecture.md | 25 ++- README.md | 3 +- deploy/docker/docker-compose.yml | 4 +- src/backend/app/main.py | 53 +++++- .../tests/integration/test_health_router.py | 160 ++++++++++++++++++ src/backend/tests/unit/test_access_logging.py | 8 + 6 files changed, 246 insertions(+), 7 deletions(-) create mode 100644 src/backend/tests/integration/test_health_router.py diff --git a/README.architecture.md b/README.architecture.md index 765fa7c..ace6890 100644 --- a/README.architecture.md +++ b/README.architecture.md @@ -58,7 +58,7 @@ machine-readable degraded reason header. | **HAProxy** | Reference reverse proxy, host routing, SPOE filter, WAF enforcement, degraded-mode handling | `configs/haproxy/` | | **Coraza SPOA + OWASP CRS** | Request-phase WAF inspection, CRS anomaly scoring, JSON audit log | `configs/coraza/`, `deploy/docker/coraza.Dockerfile` | | **Log Shipper sidecar** | Tails Coraza's JSON audit file and ships each event to `POST /logs/ingest` with exponential backoff | `src/log-shipper/` | -| **FastAPI Backend** | Control-plane API for auth, vhosts, policies, rule overrides, logs, and health checks | `src/backend/` | +| **FastAPI Backend** | Control-plane API for auth, vhosts, policies, rule overrides, logs, and health/readiness probes | `src/backend/` | | **React Frontend** | Admin panel SPA built with React, TypeScript, Vite, Tailwind CSS, and pnpm | `src/frontend/` | | **PostgreSQL** | Docker Compose database for backend state | `deploy/docker/docker-compose.yml` | | **Docker Compose Stack** | Local full-stack orchestration, health checks, networks, logs, and persistent volumes | `deploy/docker/` | @@ -108,6 +108,29 @@ machine-readable degraded reason header. ## Deployment +### Health and Readiness Probes + +The backend exposes two probe endpoints, both unauthenticated: + +| Endpoint | Type | Returns | Checks | +|----------|------|---------|--------| +| `GET /health` | Liveness | 200 always | None — proves the process is alive | +| `GET /ready` | Readiness | 200 / 503 | DB connectivity (`SELECT 1`), runtime config volume writable | + +`/ready` returns a JSON body with per-check status so operators can tell which dependency is down: + +```json +// 200 — all clear +{"status": "ready", "checks": {"database": {"status": "ok"}, "runtime_config": {"status": "ok"}}} + +// 503 — one or more dependencies unavailable +{"status": "not ready", "checks": {"database": {"status": "ok"}, "runtime_config": {"status": "error", "detail": "/var/lib/guard-proxy/generated is not a writable directory"}}} +``` + +The Docker Compose `healthcheck` for the `backend` service targets `/ready`. Almost every other service in the stack (`frontend`, `haproxy`, `coraza`, `log-shipper`) depends on `backend` with `condition: service_healthy`, so `/ready` must pass before dependents start. This prevents dependents from launching while the database is still initialising or the config volume is unavailable. + +`/health` is retained as a lightweight liveness probe for orchestrators that separately track process aliveness (e.g. a future Kubernetes `livenessProbe`). + ### Development (Docker Compose) The implemented M1 stack lives in `deploy/docker/docker-compose.yml`. diff --git a/README.md b/README.md index 47bc311..e77fecf 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,8 @@ make seed # add the initial admin user |---|---| | Admin panel | http://localhost:3000 | | API (via HAProxy) | http://localhost:8080 | -| Health check | `curl http://localhost:8080/health` | +| Liveness probe | `curl http://localhost:8080/health` | +| Readiness probe | `curl http://localhost:8080/ready` | ### WAF smoke test diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml index b80ab40..cdbf3e2 100644 --- a/deploy/docker/docker-compose.yml +++ b/deploy/docker/docker-compose.yml @@ -27,7 +27,7 @@ services: - .env environment: DATABASE_URL: "postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB}" - GUARD_PROXY_RUNTIME_DIR: /var/lib/guard-proxy/generated + RUNTIME_GENERATED_CONFIG_ROOT: /var/lib/guard-proxy/generated depends_on: postgres: condition: service_healthy @@ -35,7 +35,7 @@ services: test: [ "CMD-SHELL", - "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=2)\"", + "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/ready', timeout=2)\"", ] interval: 10s timeout: 5s diff --git a/src/backend/app/main.py b/src/backend/app/main.py index 3ac3c4f..efb7432 100644 --- a/src/backend/app/main.py +++ b/src/backend/app/main.py @@ -1,11 +1,18 @@ import logging +import os from collections.abc import AsyncIterator from contextlib import asynccontextmanager +from pathlib import Path -from fastapi import FastAPI +from fastapi import Depends, FastAPI from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse +from sqlalchemy import text +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.orm import Session from app.config import settings, validate_runtime_settings +from app.database import get_db from app.routers import ( auth, config, @@ -24,7 +31,7 @@ def filter(self, record: logging.LogRecord) -> bool: isinstance(args, tuple) and len(args) >= 3 and args[1] == "GET" - and args[2] == "/health" + and args[2] in ("/health", "/ready") ): return False return True @@ -67,8 +74,48 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: @app.get("/health") def health_check() -> dict[str, str]: - """Health check endpoint.""" + """Liveness probe — always returns 200 if the process is running.""" return { "status": "healthy", "version": "0.1.0", } + + +@app.get("/ready") +def readiness_check(db: Session = Depends(get_db)) -> JSONResponse: + """Readiness probe — checks DB connectivity and runtime config volume. + + Returns 200 when all checks pass, 503 with a per-check breakdown when any + dependency is unavailable. Used as the Docker Compose healthcheck gate so + dependent services (frontend, HAProxy, Coraza) only start once the backend + can actually serve traffic. + """ + checks: dict[str, dict[str, str]] = {} + all_ok = True + + # Check 1: database connectivity + try: + db.execute(text("SELECT 1")) + checks["database"] = {"status": "ok"} + except SQLAlchemyError as exc: + checks["database"] = {"status": "error", "detail": str(exc)} + all_ok = False + + # Check 2: runtime config volume present and writable + config_root = Path(settings.runtime_generated_config_root) + if config_root.is_dir() and os.access(config_root, os.W_OK): + checks["runtime_config"] = {"status": "ok"} + else: + checks["runtime_config"] = { + "status": "error", + "detail": f"{config_root} is not a writable directory", + } + all_ok = False + + return JSONResponse( + content={ + "status": "ready" if all_ok else "not ready", + "checks": checks, + }, + status_code=200 if all_ok else 503, + ) diff --git a/src/backend/tests/integration/test_health_router.py b/src/backend/tests/integration/test_health_router.py new file mode 100644 index 0000000..9d539c7 --- /dev/null +++ b/src/backend/tests/integration/test_health_router.py @@ -0,0 +1,160 @@ +"""Integration tests for the /health liveness and /ready readiness probes.""" + +from collections.abc import Iterator +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +from fastapi.testclient import TestClient +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.orm import Session + +import app.main as main_module +from app.database import get_db +from app.main import app + + +# --------------------------------------------------------------------------- +# /health — liveness probe +# --------------------------------------------------------------------------- + + +def test_health_returns_200(client: TestClient) -> None: + response = client.get("/health") + assert response.status_code == 200 + data = response.json() + assert data["status"] == "healthy" + assert "version" in data + + +def test_health_does_not_require_auth(client: TestClient) -> None: + """Liveness probe must be accessible without authentication.""" + response = client.get("/health") + assert response.status_code == 200 + + +# --------------------------------------------------------------------------- +# /ready — readiness probe (happy path) +# --------------------------------------------------------------------------- + + +def test_ready_returns_200_when_all_checks_pass( + client: TestClient, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Both DB and config-dir checks pass → 200 with all checks ok.""" + monkeypatch.setattr( + main_module.settings, "runtime_generated_config_root", str(tmp_path) + ) + + response = client.get("/ready") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "ready" + assert data["checks"]["database"]["status"] == "ok" + assert data["checks"]["runtime_config"]["status"] == "ok" + + +def test_ready_does_not_require_auth( + client: TestClient, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Readiness probe must be accessible without authentication.""" + monkeypatch.setattr( + main_module.settings, "runtime_generated_config_root", str(tmp_path) + ) + response = client.get("/ready") + assert response.status_code == 200 + + +# --------------------------------------------------------------------------- +# /ready — readiness probe (DB failure) +# --------------------------------------------------------------------------- + + +def test_ready_returns_503_when_db_fails( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """DB connectivity failure → 503 with database check reporting error.""" + monkeypatch.setattr( + main_module.settings, "runtime_generated_config_root", str(tmp_path) + ) + + mock_session = MagicMock(spec=Session) + mock_session.execute.side_effect = SQLAlchemyError("connection refused") + + def broken_db() -> Iterator[Session]: + yield mock_session + + app.dependency_overrides[get_db] = broken_db + try: + with TestClient(app) as c: + response = c.get("/ready") + finally: + app.dependency_overrides.pop(get_db, None) + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "not ready" + assert data["checks"]["database"]["status"] == "error" + assert "detail" in data["checks"]["database"] + + +# --------------------------------------------------------------------------- +# /ready — readiness probe (config dir failure) +# --------------------------------------------------------------------------- + + +def test_ready_returns_503_when_config_dir_missing( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Missing config dir → 503 with runtime_config check reporting error.""" + monkeypatch.setattr( + main_module.settings, + "runtime_generated_config_root", + "/nonexistent/guard-proxy-test-path", + ) + + response = client.get("/ready") + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "not ready" + assert data["checks"]["runtime_config"]["status"] == "error" + assert "detail" in data["checks"]["runtime_config"] + # DB check should still have passed + assert data["checks"]["database"]["status"] == "ok" + + +def test_ready_body_includes_both_checks_on_full_failure( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When both checks fail the body contains entries for both.""" + monkeypatch.setattr( + main_module.settings, + "runtime_generated_config_root", + "/nonexistent/guard-proxy-test-path", + ) + + mock_session = MagicMock(spec=Session) + mock_session.execute.side_effect = SQLAlchemyError("unreachable") + + def broken_db() -> Iterator[Session]: + yield mock_session + + app.dependency_overrides[get_db] = broken_db + try: + with TestClient(app) as c: + response = c.get("/ready") + finally: + app.dependency_overrides.pop(get_db, None) + + assert response.status_code == 503 + data = response.json() + assert data["checks"]["database"]["status"] == "error" + assert data["checks"]["runtime_config"]["status"] == "error" diff --git a/src/backend/tests/unit/test_access_logging.py b/src/backend/tests/unit/test_access_logging.py index 8c82d97..fc0a1e4 100644 --- a/src/backend/tests/unit/test_access_logging.py +++ b/src/backend/tests/unit/test_access_logging.py @@ -25,3 +25,11 @@ def test_healthcheck_access_filter_keeps_non_get_health() -> None: def test_healthcheck_access_filter_keeps_non_health_requests() -> None: assert _HealthcheckAccessFilter().filter(_access_record("GET", "/api/v1/policies")) + + +def test_healthcheck_access_filter_drops_get_ready() -> None: + assert not _HealthcheckAccessFilter().filter(_access_record("GET", "/ready")) + + +def test_healthcheck_access_filter_keeps_non_get_ready() -> None: + assert _HealthcheckAccessFilter().filter(_access_record("POST", "/ready", 405)) From 7a88fd74fce5bfaab65765f859cd2b1657f0985b Mon Sep 17 00:00:00 2001 From: monte Date: Tue, 2 Jun 2026 17:42:19 +0200 Subject: [PATCH 2/3] fix(review): address Copilot feedback on /ready probe - Replace str(exc) in DB error detail with generic "database unavailable" message; log full exception at ERROR level instead to avoid leaking internal hostnames/credentials through the unauthenticated probe - Add os.X_OK to the runtime config dir access check (W_OK alone doesn't guarantee files can be created inside a directory) - Tighten test assertion to verify the generic detail string Co-Authored-By: Claude Opus 4.8 --- src/backend/app/main.py | 10 +++++++--- src/backend/tests/integration/test_health_router.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/backend/app/main.py b/src/backend/app/main.py index efb7432..a95503c 100644 --- a/src/backend/app/main.py +++ b/src/backend/app/main.py @@ -13,6 +13,8 @@ from app.config import settings, validate_runtime_settings from app.database import get_db + +logger = logging.getLogger(__name__) from app.routers import ( auth, config, @@ -98,12 +100,14 @@ def readiness_check(db: Session = Depends(get_db)) -> JSONResponse: db.execute(text("SELECT 1")) checks["database"] = {"status": "ok"} except SQLAlchemyError as exc: - checks["database"] = {"status": "error", "detail": str(exc)} + logger.error("Readiness DB check failed: %s", exc) + checks["database"] = {"status": "error", "detail": "database unavailable"} all_ok = False - # Check 2: runtime config volume present and writable + # Check 2: runtime config volume present and writable (W_OK + X_OK required + # to create files inside a directory) config_root = Path(settings.runtime_generated_config_root) - if config_root.is_dir() and os.access(config_root, os.W_OK): + if config_root.is_dir() and os.access(config_root, os.W_OK | os.X_OK): checks["runtime_config"] = {"status": "ok"} else: checks["runtime_config"] = { diff --git a/src/backend/tests/integration/test_health_router.py b/src/backend/tests/integration/test_health_router.py index 9d539c7..217842a 100644 --- a/src/backend/tests/integration/test_health_router.py +++ b/src/backend/tests/integration/test_health_router.py @@ -101,7 +101,7 @@ def broken_db() -> Iterator[Session]: data = response.json() assert data["status"] == "not ready" assert data["checks"]["database"]["status"] == "error" - assert "detail" in data["checks"]["database"] + assert data["checks"]["database"]["detail"] == "database unavailable" # --------------------------------------------------------------------------- From 9ba6bee7d614a011a8ca8fe32278760c1c72f4a3 Mon Sep 17 00:00:00 2001 From: monte Date: Tue, 2 Jun 2026 17:45:17 +0200 Subject: [PATCH 3/3] fix(lint): move logger after imports to fix ruff E402 Co-Authored-By: Claude Opus 4.8 --- src/backend/app/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/app/main.py b/src/backend/app/main.py index a95503c..1bbb080 100644 --- a/src/backend/app/main.py +++ b/src/backend/app/main.py @@ -13,8 +13,6 @@ from app.config import settings, validate_runtime_settings from app.database import get_db - -logger = logging.getLogger(__name__) from app.routers import ( auth, config, @@ -25,6 +23,8 @@ vhosts, ) +logger = logging.getLogger(__name__) + class _HealthcheckAccessFilter(logging.Filter): def filter(self, record: logging.LogRecord) -> bool: