tailscale · lkosewsk · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/README.md b/README.md
@@ -386,6 +386,40 @@ Tunable parameters (these mirror Headroom's `CompressConfig`):
 
 > **Access**: the `/config` endpoint is gated only by your [tailnet policy file](https://tailscale.com/docs/reference/syntax/policy-file): anyone who can reach the device can read and change its configuration. Lock the device down accordingly (see [Security](#security-and-data-handling)).
 
+## Metrics (`/metrics`)
+
+tsheadroom exposes a Prometheus endpoint at `GET /metrics` on the same device. It uses the standard text exposition format, so any existing scraper works:
+
+```sh
+curl -s http://tsheadroom.<your-tailnet>.ts.net/metrics
+```
+
+Two families are emitted. The `headroom_*` metrics reuse the names Headroom's own proxy emits, so if you already scrape a Headroom proxy you can point the same dashboards at this endpoint:
+
+| metric | type | meaning |
+|---|---|---|
+| `headroom_requests_total` | counter | Compression hook calls processed. |
+| `headroom_requests_failed_total` | counter | Calls where compression errored and we failed open. |
+| `headroom_tokens_input_total` | counter | Input tokens seen (`tokens_before`). |
+| `headroom_tokens_saved_total` | counter | Tokens saved by compression (`tokens_saved`). |
+| `headroom_overhead_ms_{sum,count,min,max}` | counter/gauge | tsheadroom processing time per request, in ms (excludes the upstream LLM — that's what Headroom calls "overhead"). |
+| `headroom_requests_by_provider{provider}` | counter | Requests per provider (from Aperture metadata). |
+| `headroom_requests_by_model{model}` | counter | Requests per model (from Aperture metadata). |
+| `headroom_inbound_requests_total` / `_completed_total` / `_active` | counter/gauge | Inbound HTTP requests to the hook handler; `_active` is the live in-flight count. |
+
+The `tsheadroom_*` metrics have no Headroom analog and describe this service specifically:
+
+| metric | type | meaning |
+|---|---|---|
+| `tsheadroom_pool_slots_total` / `tsheadroom_pool_slots_busy` | gauge | Worker-pool size and how many slots are compressing right now. When `busy` sits at `total`, the pool is saturated and new requests queue (see [ML model loading and timeouts](#ml-model-loading-and-timeouts)). |
+| `tsheadroom_requests_by_outcome{outcome}` | counter | Requests by outcome: `modify`, `noop`, `error`, `passthrough`, `read_error`. |
+| `tsheadroom_cold_starts_total` | counter | Worker first-real-call events that paid the one-time ML model load. |
+| `tsheadroom_tokens_after_total` | counter | Output tokens after compression (`tokens_after`), so you can compute the realized ratio. |
+
+> **Why a subset?** tsheadroom is a `pre_request` hook, not a proxy: it runs `headroom.compress()` and never sees the upstream response. So Headroom proxy metrics that depend on the response or on prompt caching — completion tokens, TTFB, end-to-end latency, cache reads/writes and busts, per-transform timing, waste signals — have no source here and are deliberately omitted rather than reported as misleading zeros.
+
+> **Access**: like `/config`, `/metrics` is gated only by your tailnet policy file. Anyone who can reach the device can read it; it exposes aggregate counts and model/provider names, not request contents.
+
 ### ML model loading and timeouts
 
 The ML text compressor loads a ~600 MB model on first use (one-time, then cached on disk and resident in each worker). This load takes several seconds; a large, late-session conversation can itself take a few seconds to compress. There is one worker timeout, and the client-facing wait is owned by Aperture:

diff --git a/handler.go b/handler.go
@@ -18,11 +18,23 @@ import (
 const maxBody = 50 << 20 // 50 MiB
 
 // hookCallData is the subset of aperture's HookCallData we consume. The hook is
-// configured with send: ["request_body"], so that is the only field we read.
+// configured with send: ["request_body"], so request_body is gated on that; the
+// metadata object is always sent regardless of send, which is where aperture
+// puts the resolved provider and model.
 type hookCallData struct {
+	Metadata    hookMetadata    `json:"metadata"`
 	RequestBody json.RawMessage `json:"request_body"`
 }
 
+// hookMetadata is the subset of aperture's always-present HookMetadata we use.
+// provider and model are populated unconditionally by aperture (not gated by
+// the hook's send config), so they're our authoritative source for both the
+// per-provider/per-model metrics and the model passed to compress().
+type hookMetadata struct {
+	Provider string `json:"provider"`
+	Model    string `json:"model"`
+}
+
 // guardrailResponse is aperture's GuardrailResponse. We only ever emit "allow"
 // or "modify" — never "block" — and always with HTTP 200.
 //
@@ -67,36 +79,50 @@ type Handler struct {
 	comp     compressor
 	settings *settingsStore // current compress knobs, read per request
 	log      *slog.Logger   // operational logs + warnings (stderr)
+	metrics  *Metrics       // lifetime counters for /metrics (nil-safe; nil in tests)
 
 	verbose bool      // when set, emit a per-request summary to out
 	out     io.Writer // destination for verbose summaries (stdout)
 }
 
-// summary holds the numbers reported in the -v per-request line.
+// summary holds the numbers reported in the -v per-request line and folded into
+// /metrics.
 type summary struct {
 	inMessages int // number of messages received
 	inBytes    int // serialized size of received messages
 	outBytes   int // serialized size of returned messages
 
+	provider string // aperture-resolved provider (metrics label)
+	model    string // aperture-resolved model (metrics label)
+
+	tokensBefore int // res.TokensBefore (0 when no worker result)
+	tokensSaved  int // res.TokensSaved (0 when no worker result)
+	tokensAfter  int // res.TokensAfter (0 when no worker result)
+
 	workerMs   float64 // worker-reported compress() time (0 when no worker result)
 	cold       bool    // worker's first real request (paid the cold model load)
 	modelLimit int     // context-window limit the worker compressed against (0 when no result)
 	reason     string  // why this action was chosen: modify / allow(noop|error|passthrough|read-error)
 }
 
 func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	h.metrics.inboundStart()
+	defer h.metrics.inboundDone()
+
 	if r.Method != http.MethodPost {
 		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
 		return
 	}
 
 	start := time.Now()
 	resp, s := h.process(r)
+	durMs := float64(time.Since(start).Microseconds()) / 1000
 
 	if h.verbose {
-		fmt.Fprintf(h.out, "request in_msgs=%d in_bytes=%d out_bytes=%d dur_ms=%d worker_ms=%.0f cold=%t model_limit=%d -> %s\n",
-			s.inMessages, s.inBytes, s.outBytes, time.Since(start).Milliseconds(), s.workerMs, s.cold, s.modelLimit, s.reason)
+		fmt.Fprintf(h.out, "request in_msgs=%d in_bytes=%d out_bytes=%d dur_ms=%.0f worker_ms=%.0f cold=%t model_limit=%d -> %s\n",
+			s.inMessages, s.inBytes, s.outBytes, durMs, s.workerMs, s.cold, s.modelLimit, s.reason)
 	}
+	h.metrics.record(s, durMs)
 	writeJSON(w, http.StatusOK, resp)
 }
 
@@ -121,25 +147,30 @@ func (h *Handler) process(r *http.Request) (guardrailResponse, summary) {
 	// return the whole thing (aperture rejects non-object modified bodies).
 	var reqBody map[string]any
 	if err := json.Unmarshal(data.RequestBody, &reqBody); err != nil {
-		return allow, summary{reason: "allow(passthrough)"}
+		return allow, summary{reason: "allow(passthrough)", provider: data.Metadata.Provider, model: data.Metadata.Model}
 	}
 
 	// v1 handles only the `messages` shape (Anthropic/OpenAI). Anything else
 	// (e.g. Gemini's `contents`, embeddings) passes through untouched.
 	rawMessages, ok := reqBody["messages"]
 	if !ok {
-		return allow, summary{reason: "allow(passthrough)"}
+		return allow, summary{reason: "allow(passthrough)", provider: data.Metadata.Provider, model: data.Metadata.Model}
 	}
 	messages, ok := rawMessages.([]any)
 	if !ok {
-		return allow, summary{reason: "allow(passthrough)"}
+		return allow, summary{reason: "allow(passthrough)", provider: data.Metadata.Provider, model: data.Metadata.Model}
+	}
+	// Prefer aperture's resolved metadata.model (always present, authoritative);
+	// fall back to the request body's own model field. Empty -> worker default.
+	model := data.Metadata.Model
+	if model == "" {
+		model, _ = reqBody["model"].(string)
 	}
-	model, _ := reqBody["model"].(string) // absent/non-string -> worker default
 
 	// Byte sizes are only used for the -v summary; skip the marshal otherwise
 	// (messages can be multi-MB). The message count is cheap, so keep it.
 	// For an allow result, output size equals input size (body unchanged).
-	s := summary{inMessages: len(messages)}
+	s := summary{inMessages: len(messages), provider: data.Metadata.Provider, model: model}
 	if h.verbose {
 		s.inBytes = jsonLen(messages)
 		s.outBytes = s.inBytes
@@ -161,11 +192,15 @@ func (h *Handler) process(r *http.Request) (guardrailResponse, summary) {
 		s.reason = "allow(error)"
 		return allow, s
 	}
-	// Worker timing/cold/limit are available for both noop and modify; record
-	// them so the -v line shows them regardless of the outcome.
+	// Worker timing/cold/limit and token counts are available for both noop and
+	// modify; record them so the -v line and /metrics reflect them regardless of
+	// the outcome.
 	s.workerMs = res.ElapsedMs
 	s.cold = res.ColdFirstCall
 	s.modelLimit = res.ModelLimit
+	s.tokensBefore = res.TokensBefore
+	s.tokensSaved = res.TokensSaved
+	s.tokensAfter = res.TokensAfter
 	if res.TokensSaved <= 0 {
 		// No-op: nothing meaningful to change, so don't rewrite the body.
 		s.reason = "allow(noop)"

diff --git a/main.go b/main.go
@@ -66,17 +66,23 @@ func main() {
 	}, *maxCompress, log)
 	defer pool.Shutdown()
 
+	metrics := newMetrics()
+	metrics.poolStats = pool.stats
+
 	handler := &Handler{
 		comp:     pool,
 		settings: settings,
 		log:      log,
+		metrics:  metrics,
 		verbose:  *verbose,
 		out:      os.Stdout,
 	}
 
-	// /config is the runtime tuning API; everything else is the aperture hook.
+	// /config is the runtime tuning API; /metrics is the Prometheus scrape
+	// endpoint; everything else is the aperture hook.
 	mux := http.NewServeMux()
 	mux.Handle("/config", &configHandler{store: settings, log: log})
+	mux.Handle("/metrics", metrics)
 	mux.Handle("/", handler)
 	httpSrv := &http.Server{Handler: mux}