Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions backend/app/api/v1/endpoints/health.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import httpx
from fastapi import APIRouter, Depends
from fastapi import APIRouter, Depends, Response
from sqlalchemy.orm import Session
from sqlalchemy import text
from pydantic import BaseModel
from typing import Dict, Any
import redis as redis_lib

from app.db.session import get_db
from app.db.session import get_db, probe_db
from app.core.config import settings
from app.core.circuit_breaker import all_breakers

Expand All @@ -20,10 +20,18 @@ class HealthStatus(BaseModel):


def _check_postgres(db: Session) -> dict:
    """Report Postgres health using the request's injected session.

    Runs ``SELECT 1`` on ``db`` and then, whatever the outcome, calls
    ``probe_db()`` so the availability flag kept in ``app.db.session`` is
    refreshed for the root ``/health`` endpoint.

    Args:
        db: Session supplied by the ``get_db`` dependency.

    Returns:
        ``{"status": "ok"}`` when the query succeeds, otherwise
        ``{"status": "error", "detail": <exception text>}``.
    """
    try:
        db.execute(text("SELECT 1"))
    except Exception as err:
        outcome = {"status": "error", "detail": str(err)}
    else:
        outcome = {"status": "ok"}

    # Refresh the shared flag on both paths so /health sees failure and recovery.
    probe_db()
    return outcome


Expand All @@ -36,10 +44,12 @@ def _check_redis() -> dict:
r.ping()
return {"status": "ok"}
except Exception as e:
return {"status": "error", "detail": str(e)}
return {"status": "error", "detail": str(e), "fallback": "in-memory"}


async def _check_discord() -> dict:
if not settings.DISCORD_BOT_TOKEN:
return {"status": "skipped", "detail": "DISCORD_BOT_TOKEN not configured"}
try:
async with httpx.AsyncClient(timeout=3) as client:
resp = await client.get(
Expand All @@ -52,10 +62,14 @@ async def _check_discord() -> dict:


@router.get("/health", response_model=HealthStatus, tags=["health"])
async def health_check(db: Session = Depends(get_db)):
async def health_check(response: Response, db: Session = Depends(get_db)):
"""GET /api/v1/health
Deep health check — Postgres, Redis, Discord.
Returns 200 healthy / 207 degraded.

Deep health check — Postgres, Redis, Discord, circuit breakers.

HTTP status codes:
200 — all core services healthy
503 — Postgres unavailable (app running in degraded mode)
"""
checks = {
"postgres": _check_postgres(db),
Expand All @@ -64,9 +78,23 @@ async def health_check(db: Session = Depends(get_db)):
"circuit_breakers": all_breakers(),
}

infra_checks = {k: v for k, v in checks.items() if k != "circuit_breakers"}
all_ok = all(v["status"] == "ok" for v in infra_checks.values())
overall = "healthy" if all_ok else "degraded"
# Postgres is a *core* dependency — its failure drives the HTTP status.
postgres_ok = checks["postgres"]["status"] == "ok"
# Redis and Discord are non-critical; their failure only affects `status`.
non_core_ok = all(
v["status"] in {"ok", "skipped"}
for k, v in checks.items()
if k not in {"postgres", "circuit_breakers"}
)

if not postgres_ok:
overall = "unhealthy"
response.status_code = 503
elif not non_core_ok:
overall = "degraded"
# Still 200 — app is functional, non-critical services are down.
else:
overall = "healthy"

return HealthStatus(
status=overall,
Expand Down
55 changes: 53 additions & 2 deletions backend/app/db/session.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
import asyncio
import logging
from contextlib import asynccontextmanager
from typing import AsyncGenerator

from sqlalchemy import create_engine
from sqlalchemy import create_engine, text
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from sqlalchemy.orm import sessionmaker
from app.core.config import settings

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Sync engine — kept for Celery workers, Alembic, and legacy sync code
#
# The engine object is always created (it only configures the pool, it does
# not open a connection). The actual TCP handshake happens on the first
# query, so a missing Postgres at import time does NOT raise here.
# ---------------------------------------------------------------------------
engine = create_engine(
settings.HYPERCODE_DB_URL,
Expand All @@ -20,14 +27,58 @@
)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# ---------------------------------------------------------------------------
# Lazy DB availability probe
#
# _db_available is a tri-state:
# None — not yet probed (first request will trigger a probe)
# True — last probe succeeded
# False — last probe failed (re-probed on every request so recovery is
# detected automatically)
# ---------------------------------------------------------------------------
_db_available: bool | None = None


def probe_db() -> bool:
    """Check Postgres reachability with a throwaway ``SELECT 1``.

    Opens a connection from ``engine``, runs the query, and records the
    outcome in the module-level ``_db_available`` flag.  Any exception is
    logged as a warning and reported as ``False`` instead of propagating.

    Returns:
        ``True`` if the query succeeded, ``False`` otherwise.
    """
    global _db_available

    reachable = True
    try:
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
    except Exception as err:
        logger.warning("Database probe failed: %s", err)
        reachable = False

    _db_available = reachable
    return reachable


def is_db_available() -> bool:
    """Return whether Postgres is currently believed to be reachable.

    A cached ``True`` is returned as-is.  Any other state — ``None`` (never
    probed) or ``False`` (last probe failed) — triggers a fresh
    ``probe_db()`` call, so a recovered database is noticed by this caller
    itself instead of waiting for some other code path to refresh the flag.
    Without this, a health check that gates traffic would keep reporting an
    outage forever, because no request would ever arrive to reset the flag.

    Returns:
        ``True`` if the database is considered reachable, else ``False``.
    """
    if _db_available is True:
        return True
    # Unknown or previously failed: re-probe rather than trust a stale value.
    return probe_db()


def get_db():
    """Sync DB dependency — use in sync routes / Celery tasks.

    Yields a ``SessionLocal`` session and always closes it afterwards.

    The module-level ``_db_available`` flag is refreshed through
    ``probe_db()`` instead of being assumed:

    * Clean exit — the flag is re-probed only when it is not already
      ``True``.  Finishing without an exception does not prove Postgres is
      up (the route may never have used the session, or may have handled
      the failure itself), so the flag is never forced to ``True``.
    * Exception — the session is rolled back, the flag is re-probed, and
      the exception is re-raised to the caller.

    Yields:
        The open session for the duration of the request / task.
    """
    db = SessionLocal()
    try:
        yield db
    except Exception:
        try:
            db.rollback()
        finally:
            # Probe even if the rollback itself fails, so the health
            # endpoints still reflect the current state.
            probe_db()
        raise
    else:
        # Only a real probe may clear an unknown / failed state; this is
        # what lets a recovered Postgres be detected without a restart.
        if _db_available is not True:
            probe_db()
    finally:
        db.close()
Expand Down
81 changes: 76 additions & 5 deletions backend/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
from contextlib import asynccontextmanager, suppress

_boot_error: str | None = None
# Set to True when Postgres was unreachable at startup (degraded mode).
_db_degraded: bool = False

import redis.asyncio as aioredis
from fastapi import FastAPI, Request
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
try:
Expand Down Expand Up @@ -104,16 +106,45 @@ async def _core_heartbeat_loop() -> None:
async def _lifespan(app: FastAPI):
global _metrics_redis
global _boot_error
global _db_degraded

heartbeat_task: asyncio.Task | None = None

# ── Structured startup log helpers ────────────────────────────────────
_startup_lines: list[str] = []

def _svc_log(name: str, state: str, note: str = "") -> None:
    """Record one ``[STARTUP]`` status line for a service.

    The line is appended to ``_startup_lines`` and emitted at INFO level.

    Args:
        name: Service label, e.g. ``"Postgres"``.
        state: Status word, e.g. ``"CONNECTED"``.
        note: Optional detail, shown in parentheses when non-empty.
    """
    line = f"[STARTUP] {name}: {state}"
    if note:
        line += f" ({note})"
    _startup_lines.append(line)
    logger.info(line)

try:
settings.validate_security()
except Exception as exc:
_boot_error = str(exc)
logger.error("Startup security validation failed: %s", _boot_error)
else:

# ── Postgres probe ─────────────────────────────────────────────────
async def _probe_postgres_startup() -> None:
    """Probe Postgres once at startup and record the result in ``_db_degraded``.

    The synchronous ``probe_db`` runs in a worker thread via
    ``asyncio.to_thread`` so the event loop is not blocked.  Any exception,
    including a failed import, is logged as UNAVAILABLE and treated as
    degraded.
    """
    global _db_degraded
    try:
        from app.db.session import probe_db as _probe_db

        reachable = await asyncio.to_thread(_probe_db)
        if reachable:
            state, note = "CONNECTED", ""
        else:
            state, note = "UNAVAILABLE", "will retry on next request"
        _svc_log("Postgres", state, note)
        _db_degraded = not reachable
    except Exception as err:
        _svc_log("Postgres", "UNAVAILABLE", f"will retry on next request — {err}")
        _db_degraded = True

await _probe_postgres_startup()

async def _init_db_background() -> None:
# The Dockerfile entrypoint blocks on `alembic upgrade head` before uvicorn
# starts, so by the time this lifespan runs the schema is at head. We still
Expand Down Expand Up @@ -172,6 +203,7 @@ def _seed_sync() -> None:

asyncio.create_task(_init_db_background())

# ── Redis / Metrics probe ──────────────────────────────────────────
try:
redis_client = aioredis.from_url(
settings.HYPERCODE_REDIS_URL,
Expand All @@ -180,10 +212,19 @@ def _seed_sync() -> None:
)
await redis_client.ping()
_metrics_redis = redis_client
logger.info("Metrics Redis client connected")
_svc_log("Redis", "CONNECTED")
heartbeat_task = asyncio.create_task(_core_heartbeat_loop())
except Exception:
logger.warning("Metrics Redis unavailable — metrics middleware will no-op")
_svc_log("Redis", "UNAVAILABLE", "using in-memory fallback")

# ── Metrics (Prometheus) ───────────────────────────────────────────
try:
if _Instrumentator is not None:
_svc_log("Metrics", "CONNECTED")
else:
_svc_log("Metrics", "UNAVAILABLE", "prometheus_fastapi_instrumentator not installed")
except Exception:
_svc_log("Metrics", "UNAVAILABLE", "probe error")

try:
if setup_rate_limiting is not None:
Expand All @@ -198,6 +239,12 @@ def _seed_sync() -> None:
except Exception:
logger.exception("Telemetry init failed (non-fatal)")

# ── Final startup summary ──────────────────────────────────────────
if _db_degraded:
logger.info("[STARTUP] Application ready (degraded mode — Postgres unavailable)")
else:
logger.info("[STARTUP] Application ready (all services healthy)")

yield

logger.info("Shutdown initiated...")
Expand Down Expand Up @@ -354,17 +401,41 @@ async def _unhandled_exception_handler(request: Request, exc: Exception):
app.include_router(uplink_router) # 🔌 Phase 10J — WS /ws/uplink

@app.get("/health")
@cache_response("health", ttl=10)
@limiter.limit("120/minute") if limiter is not None else (lambda f: f)
async def health_check(request: Request):
async def health_check(request: Request, response: Response):
"""Root liveness / readiness check.

HTTP status codes:
200 — application healthy (all core services reachable)
503 — application running but Postgres is unavailable (degraded mode)

The deep health endpoint at ``/api/v1/health`` provides per-service
detail including Redis, Discord, and circuit-breaker states.
"""
from app.db.session import is_db_available

if _boot_error is not None:
response.status_code = 503
return {
"status": "degraded",
"service": settings.SERVICE_NAME,
"version": settings.VERSION,
"environment": settings.ENVIRONMENT,
"boot_error": _boot_error,
}

# Re-probe on every call so recovery is reflected without a restart.
db_ok = is_db_available()
if not db_ok:
response.status_code = 503
return {
"status": "degraded",
"service": settings.SERVICE_NAME,
"version": settings.VERSION,
"environment": settings.ENVIRONMENT,
"postgres": "unavailable",
}

return {
"status": "ok",
"service": settings.SERVICE_NAME,
Expand Down
Loading