-
Notifications
You must be signed in to change notification settings - Fork 0
feat(api): SP-5 · /metrics surface + structured JSON logging (AIN-238 + AIN-249) #101
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| """SP-5 PR-C AIN-238 · per-request metrics middleware. | ||
|
|
||
| Times every served request + records the (method, path-template, | ||
| status) counter triple. Wires into FastAPI as an ASGI middleware so | ||
| exception handlers also get observed (the route's response.status | ||
| is read AFTER the exception handler fires, which is the right | ||
| attribution). | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from collections.abc import Awaitable, Callable | ||
|
|
||
| from fastapi import Request, Response | ||
| from starlette.middleware.base import BaseHTTPMiddleware | ||
|
|
||
| from ainfera_api.services.metrics import ( | ||
| normalize_route_path, | ||
| request_finished, | ||
| request_started, | ||
| ) | ||
|
|
||
|
|
||
class RequestMetricsMiddleware(BaseHTTPMiddleware):
    """Record one timing + counter sample for every request that passes through.

    The path label handed to the metrics layer is built by
    `normalize_route_path` from two inputs: the raw URL path and the
    matched route's template (e.g. `/v1/agents/{agent_id}`), which is
    `None` when no route is present in the request scope. Whatever
    cardinality limiting applies lives in that helper, not here.
    """

    async def dispatch(
        self,
        request: Request,
        call_next: Callable[[Request], Awaitable[Response]],
    ) -> Response:
        """Run the downstream app and report the outcome to the metrics layer.

        Args:
            request: The incoming request.
            call_next: Callable that runs the rest of the stack and
                returns its response.

        Returns:
            The downstream response, unmodified.

        Raises:
            Whatever `call_next` raises; the sample is still recorded
            (with status 500) before the exception propagates.
        """
        timer = request_started()
        # Reported as-is when `call_next` raises instead of returning;
        # overwritten with the real status on the normal path.
        observed_status = 500
        try:
            downstream = await call_next(request)
            observed_status = downstream.status_code
            return downstream
        finally:
            # The "route" scope entry is only present once routing has
            # matched; without it the template is None and the helper
            # works from the raw path alone.
            matched_route = request.scope.get("route", None)
            request_finished(
                timer,
                method=request.method,
                path=normalize_route_path(
                    request.url.path,
                    getattr(matched_route, "path", None),
                ),
                status_code=observed_status,
            )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,112 @@ | ||
| """SP-5 PR-C AIN-238 + AIN-249 · the `/metrics` endpoint. | ||
|
|
||
| Internal-scoped Prometheus exposition endpoint. NOT public — gated by | ||
| the existing `X-Ainfera-Internal-Key` header (same key the signup | ||
| endpoints check). Tenants cannot reach it. | ||
|
|
||
| ## What this endpoint exposes | ||
|
|
||
| The named series live in `services/metrics.py`: | ||
|
|
||
| - `ainfera_http_requests_total{method,path,status}` | ||
| - `ainfera_http_request_duration_seconds{method,path}` | ||
| - `ainfera_provider_calls_total{provider,outcome}` | ||
| - `ainfera_router_alias_hit_total{alias}` | ||
| - `ainfera_audit_chain_height` | ||
| - `ainfera_audit_chain_freshness_seconds` | ||
| - `ainfera_dispatch_without_capture_total` (SP-4 PR-A guard scrape) | ||
| - `ainfera_cost_killswitch_engaged` | ||
| - `ainfera_cost_killswitch_spent_usd` | ||
| - `ainfera_cost_killswitch_threshold_usd` | ||
| - `ainfera_app_info{version}` | ||
|
|
||
| Every label is process-global or per-route / per-provider — NO | ||
| tenant_id, agent_id, or owner_handle ever appears. Tenant cardinality | ||
| would land on a different (stricter-auth) endpoint, not here. | ||
|
|
||
| ## Auth posture | ||
|
|
||
| `/metrics` accepts `X-Ainfera-Internal-Key` matching the same secret | ||
| the signup proxy uses. Without it → 401. Wrong key → 401. This keeps | ||
| the endpoint usable from Railway / Vercel internal scrape jobs that | ||
| already carry the key in env without exposing it to public traffic. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from datetime import UTC, datetime | ||
| from typing import Annotated | ||
|
|
||
| from fastapi import APIRouter, Header, HTTPException, Response, status | ||
| from sqlalchemy import text | ||
|
|
||
| from ainfera_api.config import get_settings | ||
| from ainfera_api.deps import DBSession | ||
| from ainfera_api.services.metrics import ( | ||
| registry, | ||
| set_app_info, | ||
| set_audit_chain_height, | ||
| ) | ||
|
|
||
| router = APIRouter(tags=["metrics"]) | ||
|
|
||
|
|
||
def _internal_key_matches(presented: str | None, settings: object) -> bool:
    """Return True when *presented* equals an accepted internal key.

    Accepted keys are `settings.internal_signup_key` and, when the
    attribute exists and is set, `settings.internal_signup_key_previous`
    so a scraper still carrying the old key keeps working during a key
    rotation.

    The check fails closed: a missing/empty header or an unset/empty
    configured key never matches. Comparison is constant-time to avoid
    leaking key prefixes through response timing.

    NOTE(review): the PR review mentions a `Settings.verify_internal_key`
    helper used by other internal endpoints; its signature is not visible
    here. If it exists, prefer delegating to it so all endpoints share
    one rotation policy — confirm and swap.
    """
    import hmac  # function-scope import keeps this block self-contained

    if not presented:
        return False
    presented_bytes = presented.encode("utf-8")
    candidates = (
        getattr(settings, "internal_signup_key", None),
        getattr(settings, "internal_signup_key_previous", None),
    )
    matched = False
    for candidate in candidates:
        if isinstance(candidate, str) and candidate:
            # Bytes operands: compare_digest rejects non-ASCII `str`.
            # No early exit, so every configured key is always compared.
            matched |= hmac.compare_digest(presented_bytes, candidate.encode("utf-8"))
    return matched


@router.get(
    "/metrics",
    response_class=Response,
    summary="Prometheus-format metrics (internal-key gated).",
    # Hide from the public openapi surface — the scraper knows the URL;
    # adding it to `/docs` would advertise the endpoint to public clients.
    include_in_schema=False,
)
async def metrics_endpoint(
    db: DBSession,
    x_ainfera_internal_key: Annotated[str | None, Header()] = None,
) -> Response:
    """Render the registry in Prometheus exposition format.

    Authentication: the `X-Ainfera-Internal-Key` header must match a
    configured internal key (see `_internal_key_matches`). A missing and
    a wrong header produce the same 401, so the response gives no hint
    about which it was.

    Cold-path enrichment: before rendering, the handler refreshes the
    audit-chain gauges from the database and sets the app-info series,
    so a single GET reflects their current values.

    Args:
        db: Database session used for the audit-chain refresh.
        x_ainfera_internal_key: Value of the `X-Ainfera-Internal-Key`
            request header, or None when absent.

    Returns:
        A `text/plain; version=0.0.4` response carrying the rendered
        registry, marked `cache-control: no-cache`.

    Raises:
        HTTPException: 401 when the header is missing or does not match.
    """
    # NOTE(review): `db` is a parameter dependency, so the session is
    # resolved before this auth check runs, even for requests that end
    # in 401 (flagged low severity in review). Gating it would mean
    # moving the key check into a route-level dependency.
    if not _internal_key_matches(x_ainfera_internal_key, get_settings()):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="invalid X-Ainfera-Internal-Key",
        )

    # Cold-path enrichment of gauges that change continuously.
    await _refresh_audit_chain_metrics(db)
    # NOTE(review): version is hard-coded; presumably it should track
    # the package/release version — confirm the intended source.
    set_app_info(version="1.0.0")

    body = registry().render()
    # Prometheus content-type per https://prometheus.io/docs/instrumenting/exposition_formats/
    return Response(
        content=body,
        media_type="text/plain; version=0.0.4; charset=utf-8",
        headers={"cache-control": "no-cache"},
    )
|
|
||
|
|
||
async def _refresh_audit_chain_metrics(db: DBSession) -> None:
    """Update the audit-chain gauges from the `audit_events` table.

    Issues two read-only aggregate queries — `max(seq)` for the chain
    height and `max(created_at)` for the newest event's timestamp — and
    passes the results to `set_audit_chain_height`. Nothing is written
    to the table.

    An empty table (both aggregates NULL) is reported as height 0 and
    freshness 0.0 seconds.

    Args:
        db: Session the two SELECTs are executed on.
    """
    height_result = await db.execute(text("SELECT max(seq) AS h FROM audit_events"))
    max_seq = height_result.one().h

    newest_result = await db.execute(text("SELECT max(created_at) AS t FROM audit_events"))
    latest_event_at = newest_result.one().t

    chain_height = int(max_seq or 0)
    if latest_event_at is None:
        age_seconds = 0.0
    else:
        # Subtracting from an aware "now" assumes the column value is
        # timezone-aware as well (TIMESTAMP WITH TIME ZONE) — TODO confirm.
        age_seconds = (datetime.now(UTC) - latest_event_at).total_seconds()

    set_audit_chain_height(height=chain_height, freshness_s=age_seconds)


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Metrics skips key rotation
Medium Severity
`/metrics` compares the header to `internal_signup_key` with `!=` instead of `Settings.verify_internal_key`. During AIN-289 rotation, scrapers using `internal_signup_key_previous` get 401 here while other internal endpoints still accept the old key. Reviewed by Cursor Bugbot for commit 7e81c9b. Configure here.