From 27a81ec9767d0eb197444cc82d9a2f1207294f3c Mon Sep 17 00:00:00 2001 From: MorganOnCode <87934408+MorganOnCode@users.noreply.github.com> Date: Fri, 15 May 2026 10:36:55 +0000 Subject: [PATCH] feat(observability): Prometheus /metrics endpoint Closes audit #12 -- the observability gap noted in the 2026-05-15 audit (Sentry covers errors but no latency percentiles, throughput, or queue depth, blocking SLA validation and capacity planning). Adds a small focused plugin at src/routes/metrics.ts using prom-client directly (no wrapper plugin): - GET /metrics returns the standard Prometheus text format (v0.0.4) - Registers Node.js default metrics: heap, GC, event loop lag, CPU - Adds two custom metrics labeled by method/route/status_code: - http_requests_total (counter) - http_request_duration_seconds (histogram with realistic facilitator latency buckets: 5ms to 10s) - Default label `service="cardano402"` on every series - Route label uses the templated pattern (e.g. "/files/:cid") not the raw URL -- bounded cardinality, won't explode the time-series count - Skips /metrics itself (recursive accounting) and /health (k8s-style liveness-probe noise that would skew latency percentiles) robots.txt updated to Disallow /metrics so it isn't crawl-indexed. 9 new tests cover: content type + Prometheus format, default Node.js metrics presence, custom counter/histogram presence, request tracking across multiple calls, route-pattern cardinality boundedness, method + status_code labels, and the /metrics + /health exclusions. Full suite: 34 files / 452 tests passing (was 33 / 443). No production behaviour change: the plugin only adds a new GET route and an onResponse hook that does in-memory increments. Memory cost is trivial (prom-client is ~50KB). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- package.json | 1 + pnpm-lock.yaml | 24 ++++++++ src/routes/agent-discovery.ts | 1 + src/routes/metrics.ts | 61 ++++++++++++++++++++ src/server.ts | 5 ++ tests/unit/routes/metrics.test.ts | 95 +++++++++++++++++++++++++++++++ 6 files changed, 187 insertions(+) create mode 100644 src/routes/metrics.ts create mode 100644 tests/unit/routes/metrics.test.ts diff --git a/package.json b/package.json index cf3eb61..77dcf89 100644 --- a/package.json +++ b/package.json @@ -93,6 +93,7 @@ "fastify-type-provider-zod": "^6.1.0", "ioredis": "^5.10.1", "pino": "^10.3.1", + "prom-client": "^15.1.3", "zod": "^4.4.3" }, "devDependencies": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 528e3ee..f707789 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -62,6 +62,9 @@ importers: pino: specifier: ^10.3.1 version: 10.3.1 + prom-client: + specifier: ^15.1.3 + version: 15.1.3 zod: specifier: ^4.4.3 version: 4.4.3 @@ -1685,6 +1688,9 @@ packages: bignumber.js@9.3.1: resolution: {integrity: sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==} + bintrees@1.0.2: + resolution: {integrity: sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==} + bip39@3.1.0: resolution: {integrity: sha512-c9kiwdk45Do5GL0vJMe7tS95VjCii65mYAH7DfWl3uW8AVzXKQVUm64i3hzVybBDMp9r7j9iNxR85+ul8MdN/A==} @@ -3243,6 +3249,10 @@ packages: progress-events@1.0.1: resolution: {integrity: sha512-MOzLIwhpt64KIVN64h1MwdKWiyKFNc/S6BoYKPIVUHFg0/eIEyBulhWCgn678v/4c0ri3FdGuzXymNCv02MUIw==} + prom-client@15.1.3: + resolution: {integrity: sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==} + engines: {node: ^16 || ^18 || >=20} + prop-types@15.8.1: resolution: {integrity: sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==} @@ -3587,6 +3597,9 @@ packages: resolution: {integrity: 
sha512-g9ljZiwki/LfxmQADO3dEY1CbpmXT5Hm2fJ+QaGKwSXUylMybePR7/67YW7jOrrvjEgL1Fmz5kzyAjWVWLlucg==} engines: {node: '>=6'} + tdigest@0.1.2: + resolution: {integrity: sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==} + thenify-all@1.6.0: resolution: {integrity: sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==} engines: {node: '>=0.8'} @@ -5652,6 +5665,8 @@ snapshots: bignumber.js@9.3.1: {} + bintrees@1.0.2: {} + bip39@3.1.0: dependencies: '@noble/hashes': 1.8.0 @@ -7385,6 +7400,11 @@ snapshots: progress-events@1.0.1: {} + prom-client@15.1.3: + dependencies: + '@opentelemetry/api': 1.9.1 + tdigest: 0.1.2 + prop-types@15.8.1: dependencies: loose-envify: 1.4.0 @@ -7794,6 +7814,10 @@ snapshots: tapable@2.3.0: {} + tdigest@0.1.2: + dependencies: + bintrees: 1.0.2 + thenify-all@1.6.0: dependencies: thenify: 3.3.1 diff --git a/src/routes/agent-discovery.ts b/src/routes/agent-discovery.ts index e3e044d..68525d7 100644 --- a/src/routes/agent-discovery.ts +++ b/src/routes/agent-discovery.ts @@ -22,6 +22,7 @@ Disallow: /verify Disallow: /settle Disallow: /upload Disallow: /files/ +Disallow: /metrics Sitemap: /sitemap.xml `; diff --git a/src/routes/metrics.ts b/src/routes/metrics.ts new file mode 100644 index 0000000..389125f --- /dev/null +++ b/src/routes/metrics.ts @@ -0,0 +1,61 @@ +// Prometheus metrics endpoint. +// +// Exposes default Node.js process/runtime metrics (heap, GC, event loop) +// plus per-route HTTP request count + duration histogram, scrape-able by +// any Prometheus-compatible system. Mounted at GET /metrics. +// +// /metrics itself and /health are excluded from request tracking -- the +// former to avoid recursive accounting, the latter to keep liveness-probe +// noise out of latency percentiles. 
+ +import type { FastifyPluginCallback } from 'fastify'; +import fp from 'fastify-plugin'; +import { Counter, Histogram, Registry, collectDefaultMetrics } from 'prom-client'; + +const SKIP_ROUTES = new Set(['/metrics', '/health']); + +const metricsPlugin: FastifyPluginCallback = (fastify, _options, done) => { + const registry = new Registry(); + registry.setDefaultLabels({ service: 'cardano402' }); + collectDefaultMetrics({ register: registry }); + + const httpDuration = new Histogram({ + name: 'http_request_duration_seconds', + help: 'HTTP request duration in seconds, labeled by method, route, and status code', + labelNames: ['method', 'route', 'status_code'], + // Buckets cover the realistic facilitator latency band (5 ms to 10 s). + buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10], + registers: [registry], + }); + + const httpTotal = new Counter({ + name: 'http_requests_total', + help: 'Total HTTP requests, labeled by method, route, and status code', + labelNames: ['method', 'route', 'status_code'], + registers: [registry], + }); + + fastify.addHook('onResponse', async (request, reply) => { + // Prefer the route pattern (e.g. "/files/:cid") over the raw URL so cardinality stays bounded. + // Unmatched paths share one constant label: raw URLs (404 scans, query strings) are unbounded. + const route = request.routeOptions?.url ?? 
'__unmatched__'; + if (SKIP_ROUTES.has(route)) return; + const method = request.method; + const statusCode = String(reply.statusCode); + const elapsedMs = reply.elapsedTime; + httpDuration.labels(method, route, statusCode).observe(elapsedMs / 1000); + httpTotal.labels(method, route, statusCode).inc(); + }); + + fastify.get('/metrics', async (_req, reply) => { + const body = await registry.metrics(); + return reply.type(registry.contentType).status(200).send(body); + }); + + done(); +}; + +export const metricsRoutesPlugin = fp(metricsPlugin, { + name: 'metrics-routes', + fastify: '5.x', +}); diff --git a/src/server.ts b/src/server.ts index 7f643f5..6100c1e 100644 --- a/src/server.ts +++ b/src/server.ts @@ -25,6 +25,7 @@ import { agentDiscoveryRoutesPlugin } from './routes/agent-discovery.js'; import { demoRoutesPlugin } from './routes/demo.js'; import { downloadRoutesPlugin } from './routes/download.js'; import { healthRoutesPlugin } from './routes/health.js'; +import { metricsRoutesPlugin } from './routes/metrics.js'; import { settleRoutesPlugin } from './routes/settle.js'; import { statusRoutesPlugin } from './routes/status.js'; import { supportedRoutesPlugin } from './routes/supported.js'; @@ -218,6 +219,10 @@ export async function createServer(options: CreateServerOptions): Promise { + let server: FastifyInstance; + + beforeEach(async () => { + server = fastify({ logger: false }); + await server.register(metricsRoutesPlugin); + // Sample routes for traffic that should be tracked + server.get('/sample', async () => ({ ok: true })); + server.get('/files/:cid', async (req) => ({ cid: (req.params as { cid: string }).cid })); + server.get('/health', async () => ({ status: 'ok' })); + await server.ready(); + }); + + afterEach(async () => { + if (server) await server.close(); + }); + + describe('GET /metrics', () => { + it('returns 200 with Prometheus text/plain content type', async () => { + const res = await server.inject({ method: 'GET', url: '/metrics' }); + 
expect(res.statusCode).toBe(200); + expect(res.headers['content-type']).toContain('text/plain'); + expect(res.headers['content-type']).toContain('version=0.0.4'); + }); + + it('exposes default Node.js process metrics', async () => { + const res = await server.inject({ method: 'GET', url: '/metrics' }); + expect(res.body).toMatch(/# HELP process_cpu_user_seconds_total/); + expect(res.body).toMatch(/# HELP nodejs_heap_size_total_bytes/); + expect(res.body).toMatch(/# HELP nodejs_eventloop_lag_seconds/); + }); + + it('exposes the http_requests_total counter and http_request_duration_seconds histogram', async () => { + const res = await server.inject({ method: 'GET', url: '/metrics' }); + expect(res.body).toMatch(/# HELP http_requests_total/); + expect(res.body).toMatch(/# HELP http_request_duration_seconds/); + }); + + it('attaches a service="cardano402" default label', async () => { + const res = await server.inject({ method: 'GET', url: '/metrics' }); + expect(res.body).toMatch(/service="cardano402"/); + }); + }); + + describe('HTTP request tracking', () => { + it('tracks the request count for tracked routes', async () => { + await server.inject({ method: 'GET', url: '/sample' }); + await server.inject({ method: 'GET', url: '/sample' }); + const res = await server.inject({ method: 'GET', url: '/metrics' }); + expect(res.body).toMatch(/http_requests_total\{[^}]*route="\/sample"[^}]*\}\s+2/); + expect(res.body).toMatch( + /http_request_duration_seconds_count\{[^}]*route="\/sample"[^}]*\}\s+2/ + ); + }); + + it('uses the route pattern not the raw URL (bounded cardinality)', async () => { + await server.inject({ method: 'GET', url: '/files/abc123' }); + await server.inject({ method: 'GET', url: '/files/xyz789' }); + const res = await server.inject({ method: 'GET', url: '/metrics' }); + // Both calls collapse onto a single time series for the templated route + expect(res.body).toMatch(/http_requests_total\{[^}]*route="\/files\/:cid"[^}]*\}\s+2/); + // The raw cids are 
NOT present as labels (would explode cardinality) + expect(res.body).not.toMatch(/route="\/files\/abc123"/); + expect(res.body).not.toMatch(/route="\/files\/xyz789"/); + }); + + it('labels by method and status_code', async () => { + await server.inject({ method: 'GET', url: '/sample' }); + const res = await server.inject({ method: 'GET', url: '/metrics' }); + expect(res.body).toMatch(/method="GET"/); + expect(res.body).toMatch(/status_code="200"/); + }); + }); + + describe('Excluded routes', () => { + it('does NOT track requests to /metrics (avoid recursive accounting)', async () => { + await server.inject({ method: 'GET', url: '/metrics' }); + const res = await server.inject({ method: 'GET', url: '/metrics' }); + expect(res.body).not.toMatch(/http_requests_total\{[^}]*route="\/metrics"[^}]*\}/); + }); + + it('does NOT track requests to /health (liveness-probe noise)', async () => { + await server.inject({ method: 'GET', url: '/health' }); + await server.inject({ method: 'GET', url: '/health' }); + const res = await server.inject({ method: 'GET', url: '/metrics' }); + expect(res.body).not.toMatch(/http_requests_total\{[^}]*route="\/health"[^}]*\}/); + }); + }); +});