Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions ainfera_api/main.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""FastAPI application entry — L5 Orchestration surface."""

import logging
import sys

from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

from ainfera_api.middleware.agent_signature import AgentSignatureMiddleware
from ainfera_api.middleware.request_metrics import RequestMetricsMiddleware
from ainfera_api.routers import (
admin_policy,
agent_memory,
Expand Down Expand Up @@ -36,6 +38,19 @@
wallets,
workflows,
)
from ainfera_api.routers import (
metrics as metrics_router,
)
from ainfera_api.services.structured_log import StructuredJSONFormatter

# SP-5 PR-C AIN-238 · structured JSON logging with secret scrubbing.
# Routes every record that reaches the root logger through
# StructuredJSONFormatter on stdout. This is the LAST line of defense
# before bytes hit stdout — catches the accidental f-string interp
# case where a caller forgot to mask a key.
#
# NOTE(review): these statements execute AFTER the `ainfera_api.routers`
# imports above, so anything logged while those modules are imported is
# not formatted by this handler. If import-time log lines must be
# scrubbed too, this block has to move above those imports — confirm.
#
# `force=True` makes basicConfig remove and close any handlers already
# attached to the root logger, so this handler is the only one installed.
_root_handler = logging.StreamHandler(sys.stdout)
_root_handler.setFormatter(StructuredJSONFormatter())
logging.basicConfig(level=logging.INFO, handlers=[_root_handler], force=True)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -79,6 +94,11 @@
# to hard-reject by PR-J6e via AGENT_SIGNATURE_ENFORCE=1 Doppler var.
app.add_middleware(AgentSignatureMiddleware)

# SP-5 PR-C AIN-238 · per-request metrics + latency histogram. ASGI
# middleware so exception handlers also get observed (status code is
# read AFTER the exception handler converts to a Response).
app.add_middleware(RequestMetricsMiddleware)

app.include_router(tenants.router)
app.include_router(signup.router)
app.include_router(install.router)
Expand Down Expand Up @@ -112,6 +132,7 @@
app.include_router(
dashboard.router
) # SP-2 AIN-263/264/265 · /v1/usage/daily + /v1/caps/rollup + /v1/agents/{id}/metrics
app.include_router(metrics_router.router) # SP-5 PR-C AIN-238 · /metrics


@app.exception_handler(Exception)
Expand Down
62 changes: 62 additions & 0 deletions ainfera_api/middleware/request_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""SP-5 PR-C AIN-238 · per-request metrics middleware.

Times every served request + records the (method, path-template,
status) counter triple. Wires into FastAPI as an ASGI middleware so
exception handlers also get observed (the route's response.status
is read AFTER the exception handler fires, which is the right
attribution).
"""

from __future__ import annotations

from collections.abc import Awaitable, Callable

from fastapi import Request, Response
from starlette.middleware.base import BaseHTTPMiddleware

from ainfera_api.services.metrics import (
normalize_route_path,
request_finished,
request_started,
)


class RequestMetricsMiddleware(BaseHTTPMiddleware):
    """Record one timing + one counter observation per served request.

    The path label is derived from the matched route's template
    (e.g. `/v1/agents/{agent_id}`) when the request scope carries a
    route; otherwise the raw URL path is handed to
    `services/metrics.normalize_route_path`, which is responsible for
    keeping label cardinality bounded.
    """

    async def dispatch(
        self,
        request: Request,
        call_next: Callable[[Request], Awaitable[Response]],
    ) -> Response:
        """Forward the request downstream and report its outcome.

        Args:
            request: The incoming request.
            call_next: Callable that runs the rest of the stack and
                yields the response.

        Returns:
            The downstream response, unmodified.

        Any exception raised by `call_next` propagates unchanged; the
        observation is still recorded, with status 500.
        """
        timer = request_started()
        # Pessimistic default: only overwritten once a response actually
        # comes back, so a raising `call_next` is reported as a 500.
        observed_status = 500
        try:
            downstream = await call_next(request)
            observed_status = downstream.status_code
            return downstream
        finally:
            # Runs on both the success and the exception path. The scope
            # only has a "route" entry when routing matched something;
            # without one the template is None and the normalizer gets
            # just the raw path to work with.
            matched_route = request.scope.get("route")
            route_template = getattr(matched_route, "path", None)
            path_label = normalize_route_path(request.url.path, route_template)
            request_finished(
                timer,
                method=request.method,
                path=path_label,
                status_code=observed_status,
            )
112 changes: 112 additions & 0 deletions ainfera_api/routers/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""SP-5 PR-C AIN-238 + AIN-249 · the `/metrics` endpoint.

Internal-scoped Prometheus exposition endpoint. NOT public — gated by
the existing `X-Ainfera-Internal-Key` header (same key the signup
endpoints check). Tenants cannot reach it.

## What this endpoint exposes

The named series live in `services/metrics.py`:

- `ainfera_http_requests_total{method,path,status}`
- `ainfera_http_request_duration_seconds{method,path}`
- `ainfera_provider_calls_total{provider,outcome}`
- `ainfera_router_alias_hit_total{alias}`
- `ainfera_audit_chain_height`
- `ainfera_audit_chain_freshness_seconds`
- `ainfera_dispatch_without_capture_total` (SP-4 PR-A guard scrape)
- `ainfera_cost_killswitch_engaged`
- `ainfera_cost_killswitch_spent_usd`
- `ainfera_cost_killswitch_threshold_usd`
- `ainfera_app_info{version}`

Every label is process-global or per-route / per-provider — NO
tenant_id, agent_id, or owner_handle ever appears. Tenant cardinality
would land on a different (stricter-auth) endpoint, not here.

## Auth posture

`/metrics` accepts `X-Ainfera-Internal-Key` matching the same secret
the signup proxy uses. Without it → 401. Wrong key → 401. This keeps
the endpoint usable from Railway / Vercel internal scrape jobs that
already carry the key in env without exposing it to public traffic.
"""

from __future__ import annotations

import hmac
from datetime import UTC, datetime
from typing import Annotated

from fastapi import APIRouter, Header, HTTPException, Response, status
from sqlalchemy import text

from ainfera_api.config import get_settings
from ainfera_api.deps import DBSession
from ainfera_api.services.metrics import (
    registry,
    set_app_info,
    set_audit_chain_height,
)

router = APIRouter(tags=["metrics"])


def _internal_key_matches(presented: str | None, *accepted: object) -> bool:
    """Return True when `presented` equals one of the `accepted` keys.

    Fails closed: a missing/empty header never matches, and accepted
    entries that are not non-empty strings (e.g. an unset key that is
    `None`) are ignored rather than compared. Without this, a plain
    `header != configured` check lets a request through when both sides
    are `None`.

    Comparison uses `hmac.compare_digest` on UTF-8 bytes so the time
    taken does not depend on how many leading characters match, and so
    non-ASCII header values cannot raise.
    """
    if not presented:
        return False
    presented_bytes = presented.encode("utf-8")
    matched = False
    for candidate in accepted:
        if isinstance(candidate, str) and candidate:
            # Deliberately no early exit: every configured key is checked.
            matched |= hmac.compare_digest(presented_bytes, candidate.encode("utf-8"))
    return matched


@router.get(
    "/metrics",
    response_class=Response,
    summary="Prometheus-format metrics (internal-key gated).",
    # Hide from the public openapi surface — the scraper knows the URL;
    # adding it to `/docs` would advertise the endpoint to public clients.
    include_in_schema=False,
)
async def metrics_endpoint(
    db: DBSession,
    x_ainfera_internal_key: Annotated[str | None, Header()] = None,
) -> Response:
    """Render the registry in Prometheus exposition format.

    Authentication: the `X-Ainfera-Internal-Key` header must match the
    configured internal signup key (or, when the settings object
    exposes one, the previous key kept around during rotation).
    Anything else — header missing, wrong, or no key configured at
    all — gets a 401.

    Cold-path enrichment: before rendering, the handler refreshes the
    audit-chain gauges and the app-info series so a single GET reflects
    the live picture.

    Args:
        db: Database session used for the audit-chain refresh.
        x_ainfera_internal_key: Value of the `X-Ainfera-Internal-Key`
            request header, or None when absent.

    Raises:
        HTTPException: 401 when the presented key is not accepted.

    NOTE(review): `db` is a handler parameter, so it is resolved before
    this body runs — i.e. before the key check. Moving the check into a
    route-level dependency would reject unauthenticated probes without
    touching the database; confirm which shared dependency to reuse.
    """
    settings = get_settings()
    if not _internal_key_matches(
        x_ainfera_internal_key,
        settings.internal_signup_key,
        # Rotation support: attribute name taken from the rotation
        # review note on this PR; when the settings object has no such
        # attribute this is simply None and is ignored.
        getattr(settings, "internal_signup_key_previous", None),
    ):
        # Same 401 surface as the signup endpoints — gives no hint
        # about whether the header was missing or wrong.
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="invalid X-Ainfera-Internal-Key",
        )
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Metrics skips key rotation

Medium Severity

/metrics compares the header to internal_signup_key with != instead of Settings.verify_internal_key. During AIN-289 rotation, scrapers using internal_signup_key_previous get 401 here while other internal endpoints still accept the old key.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 7e81c9b. Configure here.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DB opened before metrics auth

Low Severity

DBSession is injected before the internal-key check in the handler body, so unauthenticated /metrics probes still acquire a database session. Sibling internal routes use Depends(require_internal_key) so auth fails before any DB work.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 7e81c9b. Configure here.


# Cold-path enrichment of gauges that change continuously: the
# audit-chain gauges are re-read from the database on every scrape.
await _refresh_audit_chain_metrics(db)
# NOTE(review): the version label is a hard-coded literal — confirm
# whether it should be sourced from package/settings metadata instead.
set_app_info(version="1.0.0")

# Serialize the whole registry after the gauges above were refreshed.
body = registry().render()
# Prometheus content-type per https://prometheus.io/docs/instrumenting/exposition_formats/
return Response(
content=body,
media_type="text/plain; version=0.0.4; charset=utf-8",
# Ask intermediaries not to serve a stale scrape.
headers={"cache-control": "no-cache"},
)


async def _refresh_audit_chain_metrics(db: DBSession) -> None:
    """Refresh the audit-chain height and freshness gauges.

    Issues one read-only SELECT against `audit_events` returning both
    `max(seq)` (height) and `max(created_at)` (used for freshness), so
    the two values come from the same statement and the scrape costs a
    single round-trip. We don't write a probe row — that would mutate
    the immutable chain (SP-5 §2 survivor list).

    An empty table reports height 0 and freshness 0.0.

    Args:
        db: Database session to run the query on.
    """
    row = (
        await db.execute(
            text("SELECT max(seq) AS h, max(created_at) AS t FROM audit_events")
        )
    ).one()
    height = int(row.h or 0)
    newest_ts = row.t
    if newest_ts is None:
        freshness_s = 0.0
    else:
        # The column is expected to be TIMESTAMP WITH TIME ZONE (already
        # aware). If a driver hands back a naive datetime, subtracting it
        # from an aware "now" raises TypeError and would turn the scrape
        # into a 500 — treat it as UTC instead.
        # NOTE(review): assumes naive values are stored in UTC — confirm.
        if newest_ts.tzinfo is None:
            newest_ts = newest_ts.replace(tzinfo=UTC)
        freshness_s = (datetime.now(UTC) - newest_ts).total_seconds()
    set_audit_chain_height(height=height, freshness_s=freshness_s)
Loading
Loading