From 3cf959f18357fdb2e793578a4946aa9e2ea7a27e Mon Sep 17 00:00:00 2001
From: "railway-app[bot]" <68434857+railway-app[bot]@users.noreply.github.com>
Date: Thu, 30 Apr 2026 12:47:39 +0000
Subject: [PATCH] feat: graceful degradation for Postgres/Redis with startup logging

---
 backend/app/api/v1/endpoints/health.py | 46 ++++++++++++---
 backend/app/db/session.py              | 55 ++++++++++++++++-
 backend/app/main.py                    | 81 ++++++++++++++++++++++++--
 3 files changed, 166 insertions(+), 16 deletions(-)

diff --git a/backend/app/api/v1/endpoints/health.py b/backend/app/api/v1/endpoints/health.py
index 8da23ecd..20806e5f 100644
--- a/backend/app/api/v1/endpoints/health.py
+++ b/backend/app/api/v1/endpoints/health.py
@@ -1,12 +1,12 @@
 import httpx
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, Response
 from sqlalchemy.orm import Session
 from sqlalchemy import text
 from pydantic import BaseModel
 from typing import Dict, Any
 import redis as redis_lib
 
-from app.db.session import get_db
+from app.db.session import get_db, probe_db
 from app.core.config import settings
 from app.core.circuit_breaker import all_breakers
 
@@ -20,10 +20,18 @@ class HealthStatus(BaseModel):
 
 
 def _check_postgres(db: Session) -> dict:
+    """Check Postgres by running a lightweight query on the injected session.
+
+    Also updates the module-level ``_db_available`` flag in db.session so
+    that the root ``/health`` endpoint reflects the current state.
+    """
     try:
         db.execute(text("SELECT 1"))
+        # Sync the availability flag so /health picks up recovery.
+        probe_db()
         return {"status": "ok"}
     except Exception as e:
+        probe_db()
         return {"status": "error", "detail": str(e)}
 
 
@@ -36,10 +44,12 @@ def _check_redis() -> dict:
         r.ping()
         return {"status": "ok"}
     except Exception as e:
-        return {"status": "error", "detail": str(e)}
+        return {"status": "error", "detail": str(e), "fallback": "in-memory"}
 
 
 async def _check_discord() -> dict:
+    if not settings.DISCORD_BOT_TOKEN:
+        return {"status": "skipped", "detail": "DISCORD_BOT_TOKEN not configured"}
     try:
         async with httpx.AsyncClient(timeout=3) as client:
             resp = await client.get(
@@ -52,10 +62,14 @@ async def _check_discord() -> dict:
 
 
 @router.get("/health", response_model=HealthStatus, tags=["health"])
-async def health_check(db: Session = Depends(get_db)):
+async def health_check(response: Response, db: Session = Depends(get_db)):
     """GET /api/v1/health
-    Deep health check — Postgres, Redis, Discord.
-    Returns 200 healthy / 207 degraded.
+
+    Deep health check — Postgres, Redis, Discord, circuit breakers.
+
+    HTTP status codes:
+      200 — all core services healthy
+      503 — Postgres unavailable (app running in degraded mode)
     """
     checks = {
         "postgres": _check_postgres(db),
@@ -64,9 +78,23 @@ async def health_check(db: Session = Depends(get_db)):
         "circuit_breakers": all_breakers(),
     }
 
-    infra_checks = {k: v for k, v in checks.items() if k != "circuit_breakers"}
-    all_ok = all(v["status"] == "ok" for v in infra_checks.values())
-    overall = "healthy" if all_ok else "degraded"
+    # Postgres is a *core* dependency — its failure drives the HTTP status.
+    postgres_ok = checks["postgres"]["status"] == "ok"
+    # Redis and Discord are non-critical; their failure only affects `status`.
+    non_core_ok = all(
+        v["status"] in {"ok", "skipped"}
+        for k, v in checks.items()
+        if k not in {"postgres", "circuit_breakers"}
+    )
+
+    if not postgres_ok:
+        overall = "unhealthy"
+        response.status_code = 503
+    elif not non_core_ok:
+        overall = "degraded"
+        # Still 200 — app is functional, non-critical services are down.
+    else:
+        overall = "healthy"
 
     return HealthStatus(
         status=overall,
diff --git a/backend/app/db/session.py b/backend/app/db/session.py
index af648181..a23916f6 100644
--- a/backend/app/db/session.py
+++ b/backend/app/db/session.py
@@ -1,14 +1,21 @@
 import asyncio
+import logging
 from contextlib import asynccontextmanager
 from typing import AsyncGenerator
 
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, text
 from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
 from sqlalchemy.orm import sessionmaker
 from app.core.config import settings
 
+logger = logging.getLogger(__name__)
+
 # ---------------------------------------------------------------------------
 # Sync engine — kept for Celery workers, Alembic, and legacy sync code
+#
+# The engine object is always created (it only configures the pool, it does
+# not open a connection). The actual TCP handshake happens on the first
+# query, so a missing Postgres at import time does NOT raise here.
 # ---------------------------------------------------------------------------
 engine = create_engine(
     settings.HYPERCODE_DB_URL,
@@ -20,14 +27,58 @@
 )
 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 
+# ---------------------------------------------------------------------------
+# Lazy DB availability probe
+#
+# _db_available is a tri-state:
+#   None  — not yet probed (first request will trigger a probe)
+#   True  — last probe succeeded
+#   False — last probe failed (re-probed on the next availability check so
+#           recovery is detected automatically)
+# ---------------------------------------------------------------------------
+_db_available: bool | None = None
+
+
+def probe_db() -> bool:
+    """Attempt a cheap SELECT 1 against Postgres.
+
+    Updates and returns the module-level ``_db_available`` flag. Safe to
+    call from any thread; never raises.
+    """
+    global _db_available
+    try:
+        with engine.connect() as conn:
+            conn.execute(text("SELECT 1"))
+        _db_available = True
+    except Exception as exc:
+        logger.warning("Database probe failed: %s", exc)
+        _db_available = False
+    return bool(_db_available)
+
+
+def is_db_available() -> bool:
+    """Return the DB availability flag, re-probing unless the last probe succeeded."""
+    if not _db_available:
+        return probe_db()
+    return True
+
 
 def get_db():
-    """Sync DB dependency — use in sync routes / Celery tasks."""
+    """Sync DB dependency — use in sync routes / Celery tasks.
+
+    On every call the module-level ``_db_available`` flag is updated so that
+    a recovered Postgres is detected automatically without a restart.
+    """
+    global _db_available
     db = SessionLocal()
     try:
         yield db
+        # If we got here without an exception the connection is healthy.
+        _db_available = True
     except Exception:
         db.rollback()
+        # Re-probe so the health endpoint reflects the current state.
+        probe_db()
         raise
     finally:
         db.close()
diff --git a/backend/app/main.py b/backend/app/main.py
index 0525df2b..b55ffe91 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -9,9 +9,11 @@ from contextlib import asynccontextmanager, suppress
 
 
 _boot_error: str | None = None
+# Set to True when Postgres was unreachable at startup (degraded mode).
+_db_degraded: bool = False
 
 import redis.asyncio as aioredis
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 try:
@@ -104,9 +106,19 @@ async def _core_heartbeat_loop() -> None:
 async def _lifespan(app: FastAPI):
     global _metrics_redis
     global _boot_error
+    global _db_degraded
 
     heartbeat_task: asyncio.Task | None = None
 
+    # ── Structured startup log helpers ────────────────────────────────────
+    _startup_lines: list[str] = []
+
+    def _svc_log(name: str, state: str, note: str = "") -> None:
+        suffix = f" ({note})" if note else ""
+        line = f"[STARTUP] {name}: {state}{suffix}"
+        _startup_lines.append(line)
+        logger.info(line)
+
     try:
         settings.validate_security()
     except Exception as exc:
@@ -114,6 +126,25 @@ async def _lifespan(app: FastAPI):
         logger.error("Startup security validation failed: %s", _boot_error)
 
     else:
+        # ── Postgres probe ─────────────────────────────────────────────────
+        async def _probe_postgres_startup() -> None:
+            """Non-blocking Postgres probe that sets _db_degraded."""
+            global _db_degraded
+            try:
+                from app.db.session import probe_db as _probe_db
+                available = await asyncio.to_thread(_probe_db)
+                if available:
+                    _svc_log("Postgres", "CONNECTED")
+                    _db_degraded = False
+                else:
+                    _svc_log("Postgres", "UNAVAILABLE", "will retry on next request")
+                    _db_degraded = True
+            except Exception as exc:
+                _svc_log("Postgres", "UNAVAILABLE", f"will retry on next request — {exc}")
+                _db_degraded = True
+
+        await _probe_postgres_startup()
+
         async def _init_db_background() -> None:
             # The Dockerfile entrypoint blocks on `alembic upgrade head` before uvicorn
             # starts, so by the time this lifespan runs the schema is at head. We still
@@ -172,6 +203,7 @@ def _seed_sync() -> None:
 
         asyncio.create_task(_init_db_background())
 
+        # ── Redis / Metrics probe ──────────────────────────────────────────
         try:
             redis_client = aioredis.from_url(
                 settings.HYPERCODE_REDIS_URL,
@@ -180,10 +212,19 @@ def _seed_sync() -> None:
             )
             await redis_client.ping()
             _metrics_redis = redis_client
-            logger.info("Metrics Redis client connected")
+            _svc_log("Redis", "CONNECTED")
             heartbeat_task = asyncio.create_task(_core_heartbeat_loop())
         except Exception:
-            logger.warning("Metrics Redis unavailable — metrics middleware will no-op")
+            _svc_log("Redis", "UNAVAILABLE", "using in-memory fallback")
+
+        # ── Metrics (Prometheus) ───────────────────────────────────────────
+        try:
+            if _Instrumentator is not None:
+                _svc_log("Metrics", "CONNECTED")
+            else:
+                _svc_log("Metrics", "UNAVAILABLE", "prometheus_fastapi_instrumentator not installed")
+        except Exception:
+            _svc_log("Metrics", "UNAVAILABLE", "probe error")
 
         try:
             if setup_rate_limiting is not None:
@@ -198,6 +239,12 @@ def _seed_sync() -> None:
         except Exception:
             logger.exception("Telemetry init failed (non-fatal)")
 
+        # ── Final startup summary ──────────────────────────────────────────
+        if _db_degraded:
+            logger.info("[STARTUP] Application ready (degraded mode — Postgres unavailable)")
+        else:
+            logger.info("[STARTUP] Application ready (all services healthy)")
+
     yield
 
     logger.info("Shutdown initiated...")
@@ -354,10 +401,21 @@ async def _unhandled_exception_handler(request: Request, exc: Exception):
 app.include_router(uplink_router)  # 🔌 Phase 10J — WS /ws/uplink
 
 @app.get("/health")
-@cache_response("health", ttl=10)
 @limiter.limit("120/minute") if limiter is not None else (lambda f: f)
-async def health_check(request: Request):
+async def health_check(request: Request, response: Response):
+    """Root liveness / readiness check.
+
+    HTTP status codes:
+      200 — application healthy (all core services reachable)
+      503 — application running but Postgres is unavailable (degraded mode)
+
+    The deep health endpoint at ``/api/v1/health`` provides per-service
+    detail including Redis, Discord, and circuit-breaker states.
+    """
+    from app.db.session import is_db_available
+
     if _boot_error is not None:
+        response.status_code = 503
         return {
             "status": "degraded",
             "service": settings.SERVICE_NAME,
@@ -365,6 +423,19 @@ async def health_check(request: Request):
             "environment": settings.ENVIRONMENT,
             "boot_error": _boot_error,
         }
+
+    # Re-probes (off the event loop) while Postgres is down so recovery is seen.
+    db_ok = await asyncio.to_thread(is_db_available)
+    if not db_ok:
+        response.status_code = 503
+        return {
+            "status": "degraded",
+            "service": settings.SERVICE_NAME,
+            "version": settings.VERSION,
+            "environment": settings.ENVIRONMENT,
+            "postgres": "unavailable",
+        }
+
     return {
         "status": "ok",
         "service": settings.SERVICE_NAME,