diff --git a/README.architecture.md b/README.architecture.md index 765fa7c..ace6890 100644 --- a/README.architecture.md +++ b/README.architecture.md @@ -58,7 +58,7 @@ machine-readable degraded reason header. | **HAProxy** | Reference reverse proxy, host routing, SPOE filter, WAF enforcement, degraded-mode handling | `configs/haproxy/` | | **Coraza SPOA + OWASP CRS** | Request-phase WAF inspection, CRS anomaly scoring, JSON audit log | `configs/coraza/`, `deploy/docker/coraza.Dockerfile` | | **Log Shipper sidecar** | Tails Coraza's JSON audit file and ships each event to `POST /logs/ingest` with exponential backoff | `src/log-shipper/` | -| **FastAPI Backend** | Control-plane API for auth, vhosts, policies, rule overrides, logs, and health checks | `src/backend/` | +| **FastAPI Backend** | Control-plane API for auth, vhosts, policies, rule overrides, logs, and health/readiness probes | `src/backend/` | | **React Frontend** | Admin panel SPA built with React, TypeScript, Vite, Tailwind CSS, and pnpm | `src/frontend/` | | **PostgreSQL** | Docker Compose database for backend state | `deploy/docker/docker-compose.yml` | | **Docker Compose Stack** | Local full-stack orchestration, health checks, networks, logs, and persistent volumes | `deploy/docker/` | @@ -108,6 +108,29 @@ machine-readable degraded reason header. ## Deployment +### Health and Readiness Probes + +The backend exposes two probe endpoints, both unauthenticated: + +| Endpoint | Type | Returns | Checks | +|----------|------|---------|--------| +| `GET /health` | Liveness | 200 always | None — proves the process is alive | +| `GET /ready` | Readiness | 200 / 503 | DB connectivity (`SELECT 1`), runtime config volume writable | + +`/ready` returns a JSON body with per-check status so operators can tell which dependency is down: + +```json +// 200 — all clear +{"status": "ready", "checks": {"database": {"status": "ok"}, "runtime_config": {"status": "ok"}}} + +// 503 — one or more dependencies unavailable +{"status": "not ready", "checks": {"database": {"status": "ok"}, "runtime_config": {"status": "error", "detail": "/var/lib/guard-proxy/generated is not a writable directory"}}} +``` + +The Docker Compose `healthcheck` for the `backend` service targets `/ready`. Almost every other service in the stack (`frontend`, `haproxy`, `coraza`, `log-shipper`) depends on `backend` with `condition: service_healthy`, so `/ready` must pass before dependents start. This prevents dependents from launching while the database is still initialising or the config volume is unavailable. + +`/health` is retained as a lightweight liveness probe for orchestrators that separately track process aliveness (e.g. a future Kubernetes `livenessProbe`). + ### Development (Docker Compose) The implemented M1 stack lives in `deploy/docker/docker-compose.yml`. diff --git a/README.md b/README.md index 47bc311..e77fecf 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,8 @@ make seed # add the initial admin user |---|---| | Admin panel | http://localhost:3000 | | API (via HAProxy) | http://localhost:8080 | -| Health check | `curl http://localhost:8080/health` | +| Liveness probe | `curl http://localhost:8080/health` | +| Readiness probe | `curl http://localhost:8080/ready` | ### WAF smoke test diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml index b80ab40..cdbf3e2 100644 --- a/deploy/docker/docker-compose.yml +++ b/deploy/docker/docker-compose.yml @@ -27,7 +27,7 @@ services: - .env environment: DATABASE_URL: "postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB}" - GUARD_PROXY_RUNTIME_DIR: /var/lib/guard-proxy/generated + RUNTIME_GENERATED_CONFIG_ROOT: /var/lib/guard-proxy/generated depends_on: postgres: condition: service_healthy @@ -35,7 +35,7 @@ services: test: [ "CMD-SHELL", - "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=2)\"", + "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/ready', timeout=2)\"", ] interval: 10s timeout: 5s diff --git a/src/backend/app/main.py b/src/backend/app/main.py index 3ac3c4f..1bbb080 100644 --- a/src/backend/app/main.py +++ b/src/backend/app/main.py @@ -1,11 +1,18 @@ import logging +import os from collections.abc import AsyncIterator from contextlib import asynccontextmanager +from pathlib import Path -from fastapi import FastAPI +from fastapi import Depends, FastAPI from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse +from sqlalchemy import text +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.orm import Session from app.config import settings, validate_runtime_settings +from app.database import get_db from app.routers import ( auth, config, @@ -16,6 +23,8 @@ vhosts, ) +logger = logging.getLogger(__name__) + class _HealthcheckAccessFilter(logging.Filter): def filter(self, record: logging.LogRecord) -> bool: @@ -24,7 +33,7 @@ def filter(self, record: logging.LogRecord) -> bool: isinstance(args, tuple) and len(args) >= 3 and args[1] == "GET" - and args[2] == "/health" + and args[2] in ("/health", "/ready") ): return False return True @@ -67,8 +76,50 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: @app.get("/health") def health_check() -> dict[str, str]: - """Health check endpoint.""" + """Liveness probe — always returns 200 if the process is running.""" return { "status": "healthy", "version": "0.1.0", } + + +@app.get("/ready") +def readiness_check(db: Session = Depends(get_db)) -> JSONResponse: + """Readiness probe — checks DB connectivity and runtime config volume. + + Returns 200 when all checks pass, 503 with a per-check breakdown when any + dependency is unavailable. Used as the Docker Compose healthcheck gate so + dependent services (frontend, HAProxy, Coraza) only start once the backend + can actually serve traffic. + """ + checks: dict[str, dict[str, str]] = {} + all_ok = True + + # Check 1: database connectivity + try: + db.execute(text("SELECT 1")) + checks["database"] = {"status": "ok"} + except SQLAlchemyError as exc: + logger.error("Readiness DB check failed: %s", exc) + checks["database"] = {"status": "error", "detail": "database unavailable"} + all_ok = False + + # Check 2: runtime config volume present and writable (W_OK + X_OK required + # to create files inside a directory) + config_root = Path(settings.runtime_generated_config_root) + if config_root.is_dir() and os.access(config_root, os.W_OK | os.X_OK): + checks["runtime_config"] = {"status": "ok"} + else: + checks["runtime_config"] = { + "status": "error", + "detail": f"{config_root} is not a writable directory", + } + all_ok = False + + return JSONResponse( + content={ + "status": "ready" if all_ok else "not ready", + "checks": checks, + }, + status_code=200 if all_ok else 503, + ) diff --git a/src/backend/tests/integration/test_health_router.py b/src/backend/tests/integration/test_health_router.py new file mode 100644 index 0000000..217842a --- /dev/null +++ b/src/backend/tests/integration/test_health_router.py @@ -0,0 +1,160 @@ +"""Integration tests for the /health liveness and /ready readiness probes.""" + +from collections.abc import Iterator +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +from fastapi.testclient import TestClient +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.orm import Session + +import app.main as main_module +from app.database import get_db +from app.main import app + + +# --------------------------------------------------------------------------- +# /health — liveness probe +# --------------------------------------------------------------------------- + + +def test_health_returns_200(client: TestClient) -> None: + response = client.get("/health") + assert response.status_code == 200 + data = response.json() + assert data["status"] == "healthy" + assert "version" in data + + +def test_health_does_not_require_auth(client: TestClient) -> None: + """Liveness probe must be accessible without authentication.""" + response = client.get("/health") + assert response.status_code == 200 + + +# --------------------------------------------------------------------------- +# /ready — readiness probe (happy path) +# --------------------------------------------------------------------------- + + +def test_ready_returns_200_when_all_checks_pass( + client: TestClient, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Both DB and config-dir checks pass → 200 with all checks ok.""" + monkeypatch.setattr( + main_module.settings, "runtime_generated_config_root", str(tmp_path) + ) + + response = client.get("/ready") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "ready" + assert data["checks"]["database"]["status"] == "ok" + assert data["checks"]["runtime_config"]["status"] == "ok" + + +def test_ready_does_not_require_auth( + client: TestClient, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Readiness probe must be accessible without authentication.""" + monkeypatch.setattr( + main_module.settings, "runtime_generated_config_root", str(tmp_path) + ) + response = client.get("/ready") + assert response.status_code == 200 + + +# --------------------------------------------------------------------------- +# /ready — readiness probe (DB failure) +# --------------------------------------------------------------------------- + + +def test_ready_returns_503_when_db_fails( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """DB connectivity failure → 503 with database check reporting error.""" + monkeypatch.setattr( + main_module.settings, "runtime_generated_config_root", str(tmp_path) + ) + + mock_session = MagicMock(spec=Session) + mock_session.execute.side_effect = SQLAlchemyError("connection refused") + + def broken_db() -> Iterator[Session]: + yield mock_session + + app.dependency_overrides[get_db] = broken_db + try: + with TestClient(app) as c: + response = c.get("/ready") + finally: + app.dependency_overrides.pop(get_db, None) + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "not ready" + assert data["checks"]["database"]["status"] == "error" + assert data["checks"]["database"]["detail"] == "database unavailable" + + +# --------------------------------------------------------------------------- +# /ready — readiness probe (config dir failure) +# --------------------------------------------------------------------------- + + +def test_ready_returns_503_when_config_dir_missing( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Missing config dir → 503 with runtime_config check reporting error.""" + monkeypatch.setattr( + main_module.settings, + "runtime_generated_config_root", + "/nonexistent/guard-proxy-test-path", + ) + + response = client.get("/ready") + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "not ready" + assert data["checks"]["runtime_config"]["status"] == "error" + assert "detail" in data["checks"]["runtime_config"] + # DB check should still have passed + assert data["checks"]["database"]["status"] == "ok" + + +def test_ready_body_includes_both_checks_on_full_failure( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When both checks fail the body contains entries for both.""" + monkeypatch.setattr( + main_module.settings, + "runtime_generated_config_root", + "/nonexistent/guard-proxy-test-path", + ) + + mock_session = MagicMock(spec=Session) + mock_session.execute.side_effect = SQLAlchemyError("unreachable") + + def broken_db() -> Iterator[Session]: + yield mock_session + + app.dependency_overrides[get_db] = broken_db + try: + with TestClient(app) as c: + response = c.get("/ready") + finally: + app.dependency_overrides.pop(get_db, None) + + assert response.status_code == 503 + data = response.json() + assert data["checks"]["database"]["status"] == "error" + assert data["checks"]["runtime_config"]["status"] == "error" diff --git a/src/backend/tests/unit/test_access_logging.py b/src/backend/tests/unit/test_access_logging.py index 8c82d97..fc0a1e4 100644 --- a/src/backend/tests/unit/test_access_logging.py +++ b/src/backend/tests/unit/test_access_logging.py @@ -25,3 +25,11 @@ def test_healthcheck_access_filter_keeps_non_get_health() -> None: def test_healthcheck_access_filter_keeps_non_health_requests() -> None: assert _HealthcheckAccessFilter().filter(_access_record("GET", "/api/v1/policies")) + + +def test_healthcheck_access_filter_drops_get_ready() -> None: + assert not _HealthcheckAccessFilter().filter(_access_record("GET", "/ready")) + + +def test_healthcheck_access_filter_keeps_non_get_ready() -> None: + assert _HealthcheckAccessFilter().filter(_access_record("POST", "/ready", 405))