From 3cf959f18357fdb2e793578a4946aa9e2ea7a27e Mon Sep 17 00:00:00 2001
From: "railway-app[bot]" <68434857+railway-app[bot]@users.noreply.github.com>
Date: Thu, 30 Apr 2026 12:47:39 +0000
Subject: [PATCH] feat: graceful degradation for Postgres/Redis with startup
 logging

---
 backend/app/api/v1/endpoints/health.py | 46 ++++++++++++---
 backend/app/db/session.py              | 55 ++++++++++++++++-
 backend/app/main.py                    | 81 ++++++++++++++++++++++++--
 3 files changed, 166 insertions(+), 16 deletions(-)

diff --git a/backend/app/api/v1/endpoints/health.py b/backend/app/api/v1/endpoints/health.py
index 8da23ecd..20806e5f 100644
--- a/backend/app/api/v1/endpoints/health.py
+++ b/backend/app/api/v1/endpoints/health.py
@@ -1,12 +1,12 @@
 import httpx
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, Response
 from sqlalchemy.orm import Session
 from sqlalchemy import text
 from pydantic import BaseModel
 from typing import Dict, Any
 import redis as redis_lib
 
-from app.db.session import get_db
+from app.db.session import get_db, probe_db
 from app.core.config import settings
 from app.core.circuit_breaker import all_breakers
 
@@ -20,10 +20,18 @@ class HealthStatus(BaseModel):
 
 
 def _check_postgres(db: Session) -> dict:
+    """Run ``SELECT 1`` on the injected session and report the outcome.
+
+    In both the success and failure paths ``probe_db()`` is called afterwards;
+    it issues its own ``SELECT 1`` and refreshes the flag kept in db.session.
+    """
     try:
         db.execute(text("SELECT 1"))
+        # Refresh the shared availability flag (probe_db uses its own connection).
+        probe_db()
         return {"status": "ok"}
     except Exception as e:
+        probe_db()  # refresh the shared flag after a failed query as well
         return {"status": "error", "detail": str(e)}
 
 
@@ -36,10 +44,12 @@ def _check_redis() -> dict:
         r.ping()
         return {"status": "ok"}
     except Exception as e:
-        return {"status": "error", "detail": str(e)}
+        return {"status": "error", "detail": str(e), "fallback": "in-memory"}
 
 
 async def _check_discord() -> dict:
+    if not settings.DISCORD_BOT_TOKEN:
+        return {"status": "skipped", "detail": "DISCORD_BOT_TOKEN not configured"}
     try:
         async with httpx.AsyncClient(timeout=3) as client:
             resp = await client.get(
@@ -52,10 +62,14 @@ async def _check_discord() -> dict:
 
 
 @router.get("/health", response_model=HealthStatus, tags=["health"])
-async def health_check(db: Session = Depends(get_db)):
+async def health_check(response: Response, db: Session = Depends(get_db)):
     """GET /api/v1/health
-    Deep health check — Postgres, Redis, Discord.
-    Returns 200 healthy / 207 degraded.
+
+    Deep health check — Postgres, Redis, Discord, circuit breakers.
+
+    HTTP status codes:
+      200 — all core services healthy
+      503 — Postgres unavailable (app running in degraded mode)
     """
     checks = {
         "postgres": _check_postgres(db),
@@ -64,9 +78,23 @@ async def health_check(db: Session = Depends(get_db)):
         "circuit_breakers": all_breakers(),
     }
 
-    infra_checks = {k: v for k, v in checks.items() if k != "circuit_breakers"}
-    all_ok = all(v["status"] == "ok" for v in infra_checks.values())
-    overall = "healthy" if all_ok else "degraded"
+    # Postgres is a *core* dependency — its failure drives the HTTP status.
+    postgres_ok = checks["postgres"]["status"] == "ok"
+    # Redis and Discord are non-critical; their failure only affects `status`.
+    non_core_ok = all(
+        v["status"] in {"ok", "skipped"}
+        for k, v in checks.items()
+        if k not in {"postgres", "circuit_breakers"}
+    )
+
+    if not postgres_ok:
+        overall = "unhealthy"
+        response.status_code = 503
+    elif not non_core_ok:
+        overall = "degraded"
+        # Still 200 — app is functional, non-critical services are down.
+    else:
+        overall = "healthy"
 
     return HealthStatus(
         status=overall,
diff --git a/backend/app/db/session.py b/backend/app/db/session.py
index af648181..a23916f6 100644
--- a/backend/app/db/session.py
+++ b/backend/app/db/session.py
@@ -1,14 +1,21 @@
 import asyncio
+import logging
 from contextlib import asynccontextmanager
 from typing import AsyncGenerator
 
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, text
 from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
 from sqlalchemy.orm import sessionmaker
 from app.core.config import settings
 
+logger = logging.getLogger(__name__)
+
 # ---------------------------------------------------------------------------
 # Sync engine — kept for Celery workers, Alembic, and legacy sync code
+#
+# The engine object is always created (it only configures the pool, it does
+# not open a connection).  The actual TCP handshake happens on the first
+# query, so a missing Postgres at import time does NOT raise here.
 # ---------------------------------------------------------------------------
 engine = create_engine(
     settings.HYPERCODE_DB_URL,
@@ -20,14 +27,58 @@
 )
 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 
+# ---------------------------------------------------------------------------
+# Lazy DB availability probe
+#
+# _db_available is a tri-state:
+#   None  — not yet probed (first request will trigger a probe)
+#   True  — last probe succeeded
+#   False — last probe failed (re-probed on every request so recovery is
+#            detected automatically)
+# ---------------------------------------------------------------------------
+_db_available: bool | None = None
+
+
+def probe_db() -> bool:
+    """Run ``SELECT 1`` on an engine connection and record the outcome.
+
+    Stores the result in ``_db_available`` and returns it; never raises.
+    """
+    global _db_available
+    try:
+        with engine.connect() as connection:
+            connection.execute(text("SELECT 1"))
+    except Exception as err:
+        logger.warning("Database probe failed: %s", err)
+        _db_available = False
+    else:
+        _db_available = True
+    return bool(_db_available)
+
+
+def is_db_available() -> bool:
+    """Return DB availability, re-probing unless the last probe succeeded."""
+    if not _db_available:  # None (never probed) or False (down): probe again
+        return probe_db()
+    return True
+
 
 def get_db():
-    """Sync DB dependency — use in sync routes / Celery tasks."""
+    """Sync DB dependency — use in sync routes / Celery tasks.
+
+    Yields a ``SessionLocal`` session and always closes it.  Sets
+    ``_db_available`` to True on a clean exit; on error, rolls back and re-probes.
+    """
+    global _db_available
     db = SessionLocal()
     try:
         yield db
+        # NOTE(review): reached on any clean exit, even if no query ran — confirm.
+        _db_available = True
     except Exception:
         db.rollback()
+        # Any exception from the caller lands here, so refresh the flag via a probe.
+        probe_db()
         raise
     finally:
         db.close()
diff --git a/backend/app/main.py b/backend/app/main.py
index 0525df2b..b55ffe91 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -9,9 +9,11 @@
 from contextlib import asynccontextmanager, suppress
 
 _boot_error: str | None = None
+# Set to True when Postgres was unreachable at startup (degraded mode).
+_db_degraded: bool = False
 
 import redis.asyncio as aioredis
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 try:
@@ -104,9 +106,19 @@ async def _core_heartbeat_loop() -> None:
 async def _lifespan(app: FastAPI):
     global _metrics_redis
     global _boot_error
+    global _db_degraded
 
     heartbeat_task: asyncio.Task | None = None
 
+    # ── Structured startup log helpers ────────────────────────────────────
+    _startup_lines: list[str] = []
+
+    def _svc_log(name: str, state: str, note: str = "") -> None:
+        """Record one ``[STARTUP] name: state (note)`` line and log it at INFO."""
+        line = f"[STARTUP] {name}: {state}" + (f" ({note})" if note else "")
+        _startup_lines.append(line)
+        logger.info(line)
+
     try:
         settings.validate_security()
     except Exception as exc:
@@ -114,6 +126,25 @@ async def _lifespan(app: FastAPI):
         logger.error("Startup security validation failed: %s", _boot_error)
     else:
 
+        # ── Postgres probe ─────────────────────────────────────────────────
+        async def _probe_postgres_startup() -> None:
+            """Probe Postgres from a worker thread and record the result.
+
+            Sets the module-level ``_db_degraded`` flag and logs via ``_svc_log``.
+            """
+            global _db_degraded
+            try:
+                from app.db.session import probe_db as _probe_db
+                reachable = await asyncio.to_thread(_probe_db)
+                note = "" if reachable else "will retry on next request"
+                _svc_log("Postgres", "CONNECTED" if reachable else "UNAVAILABLE", note)
+                _db_degraded = not reachable
+            except Exception as exc:
+                _svc_log("Postgres", "UNAVAILABLE", f"will retry on next request — {exc}")
+                _db_degraded = True
+
+        await _probe_postgres_startup()
+
         async def _init_db_background() -> None:
             # The Dockerfile entrypoint blocks on `alembic upgrade head` before uvicorn
             # starts, so by the time this lifespan runs the schema is at head. We still
@@ -172,6 +203,7 @@ def _seed_sync() -> None:
 
         asyncio.create_task(_init_db_background())
 
+        # ── Redis / Metrics probe ──────────────────────────────────────────
         try:
             redis_client = aioredis.from_url(
                 settings.HYPERCODE_REDIS_URL,
@@ -180,10 +212,19 @@ def _seed_sync() -> None:
             )
             await redis_client.ping()
             _metrics_redis = redis_client
-            logger.info("Metrics Redis client connected")
+            _svc_log("Redis", "CONNECTED")
             heartbeat_task = asyncio.create_task(_core_heartbeat_loop())
         except Exception:
-            logger.warning("Metrics Redis unavailable — metrics middleware will no-op")
+            _svc_log("Redis", "UNAVAILABLE", "using in-memory fallback")
+
+        # ── Metrics (Prometheus) ───────────────────────────────────────────
+        try:
+            if _Instrumentator is not None:
+                _svc_log("Metrics", "CONNECTED")
+            else:
+                _svc_log("Metrics", "UNAVAILABLE", "prometheus_fastapi_instrumentator not installed")
+        except Exception:
+            _svc_log("Metrics", "UNAVAILABLE", "probe error")
 
         try:
             if setup_rate_limiting is not None:
@@ -198,6 +239,12 @@ def _seed_sync() -> None:
         except Exception:
             logger.exception("Telemetry init failed (non-fatal)")
 
+        # ── Final startup summary ──────────────────────────────────────────
+        if _db_degraded:
+            logger.info("[STARTUP] Application ready (degraded mode — Postgres unavailable)")
+        else:
+            logger.info("[STARTUP] Application ready (all services healthy)")
+
     yield
 
     logger.info("Shutdown initiated...")
@@ -354,10 +401,21 @@ async def _unhandled_exception_handler(request: Request, exc: Exception):
     app.include_router(uplink_router)  # 🔌 Phase 10J — WS /ws/uplink
 
 @app.get("/health")
-@cache_response("health", ttl=10)
 @limiter.limit("120/minute") if limiter is not None else (lambda f: f)
-async def health_check(request: Request):
+async def health_check(request: Request, response: Response):
+    """Root liveness / readiness check.
+
+    HTTP status codes:
+      200 — application healthy (all core services reachable)
+      503 — application running but Postgres is unavailable (degraded mode)
+
+    The deep health endpoint at ``/api/v1/health`` provides per-service
+    detail including Redis, Discord, and circuit-breaker states.
+    """
+    from app.db.session import is_db_available
+
     if _boot_error is not None:
+        response.status_code = 503
         return {
             "status": "degraded",
             "service": settings.SERVICE_NAME,
@@ -365,6 +423,19 @@ async def health_check(request: Request):
             "environment": settings.ENVIRONMENT,
             "boot_error": _boot_error,
         }
+
+    # Re-probe on every call so recovery is reflected without a restart.
+    db_ok = is_db_available()
+    if not db_ok:
+        response.status_code = 503
+        return {
+            "status": "degraded",
+            "service": settings.SERVICE_NAME,
+            "version": settings.VERSION,
+            "environment": settings.ENVIRONMENT,
+            "postgres": "unavailable",
+        }
+
     return {
         "status": "ok",
         "service": settings.SERVICE_NAME,