diff --git a/.env.example b/.env.example index c4a470d..c7b7578 100644 --- a/.env.example +++ b/.env.example @@ -55,12 +55,19 @@ ACTIVE_MASTER_KEY_ID=default # Migration note: Prior to v0.5.0, default was 86400 (24 hours) AUTH_TOKEN_EXPIRATION_SECONDS=14400 -# Rate limiting configuration -# Protects against abuse and denial-of-service attacks +# Rate limiting configuration (authenticated endpoints) +# Protects against abuse and denial-of-service attacks on authenticated routes RATE_LIMIT_ENABLED=true RATE_LIMIT_REQUESTS_PER_SEC=10.0 RATE_LIMIT_BURST=20 +# Token endpoint rate limiting (IP-based, unauthenticated) +# Applies to POST /v1/token endpoint to prevent credential stuffing and brute force attacks +# Stricter limits recommended as this endpoint is unauthenticated and commonly targeted +RATE_LIMIT_TOKEN_ENABLED=true +RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 +RATE_LIMIT_TOKEN_BURST=10 + # CORS configuration # ⚠️ SECURITY WARNING: CORS is disabled by default for server-to-server API # Enable only if browser-based access is required diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7c3becd..e176abf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -122,6 +122,9 @@ jobs: python3 docs/tools/check_docs_metadata.py + - name: Docs release image tag checks + run: python3 docs/tools/check_release_image_tags.py + - name: OpenAPI validation run: | set -euo pipefail diff --git a/AGENTS.md b/AGENTS.md index 0c66f4d..25c612f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -694,6 +694,119 @@ v1 := router.Group("/api/v1") v1.Use(authMiddleware) ``` +### Rate Limiting Middleware + +The project implements two types of rate limiting middleware to protect against abuse: + +#### 1. Client-Based Rate Limiting (Authenticated Endpoints) + +**File:** `/internal/auth/http/rate_limit_middleware.go` + +**Purpose:** Protects authenticated endpoints from abuse by limiting requests per authenticated client. 
+ +**Usage:** +```go +// Create middleware with configuration +rateLimitMiddleware := authHTTP.RateLimitMiddleware( + cfg.RateLimitRequestsPerSec, // e.g., 10.0 requests/second + cfg.RateLimitBurst, // e.g., 20 burst capacity + logger, +) + +// Apply to authenticated route groups +clients := v1.Group("/clients") +clients.Use(authMiddleware) // Must come first +clients.Use(rateLimitMiddleware) // Rate limit per client +``` + +**Key Features:** +- **Requires authentication:** Must be used after `AuthenticationMiddleware` +- **Per-client limits:** Each authenticated client (by client ID) gets an independent rate limiter +- **Token bucket algorithm:** Uses `golang.org/x/time/rate` for smooth rate limiting +- **Automatic cleanup:** Removes stale limiters after 1 hour of inactivity +- **Configurable:** Controlled by `RATE_LIMIT_ENABLED`, `RATE_LIMIT_REQUESTS_PER_SEC`, `RATE_LIMIT_BURST` + +**Response:** +- Returns `429 Too Many Requests` with `Retry-After` header when the limit is exceeded +- Error response: `{"error": "rate_limit_exceeded", "message": "Too many requests. Please retry after the specified delay."}` + +#### 2. IP-Based Rate Limiting (Unauthenticated Endpoints) + +**File:** `/internal/auth/http/token_rate_limit_middleware.go` + +**Purpose:** Protects unauthenticated endpoints (e.g., token issuance) from credential stuffing and brute force attacks. 
+ +**Usage:** +```go +// Create middleware with configuration +tokenRateLimitMiddleware := authHTTP.TokenRateLimitMiddleware( + cfg.RateLimitTokenRequestsPerSec, // e.g., 5.0 requests/second + cfg.RateLimitTokenBurst, // e.g., 10 burst capacity + logger, +) + +// Apply to unauthenticated endpoints +if tokenRateLimitMiddleware != nil { + v1.POST("/token", tokenRateLimitMiddleware, tokenHandler.IssueTokenHandler) +} +``` + +**Key Features:** +- **No authentication required:** Works on unauthenticated endpoints +- **Per-IP limits:** Each IP address gets an independent rate limiter +- **Automatic IP detection:** Uses `c.ClientIP()`, which handles: + - `X-Forwarded-For` header (takes first IP) + - `X-Real-IP` header + - Direct connection remote address +- **Token bucket algorithm:** Uses `golang.org/x/time/rate` for smooth rate limiting +- **Automatic cleanup:** Removes stale limiters after 1 hour of inactivity +- **Configurable:** Controlled by `RATE_LIMIT_TOKEN_ENABLED`, `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC`, `RATE_LIMIT_TOKEN_BURST` + +**Response:** +- Returns `429 Too Many Requests` with `Retry-After` header when the limit is exceeded +- Error response: `{"error": "rate_limit_exceeded", "message": "Too many token requests from this IP. 
Please retry after the specified delay."}` + +**Security Considerations:** + +*Strengths:* +- Protects against credential stuffing and brute force attacks +- Stricter default limits (5 req/sec, burst 10) than authenticated endpoints +- No overhead on authenticated endpoints + +*Limitations & Mitigations:* +- **Shared IPs (NAT, corporate proxies):** May affect legitimate users behind the same IP + - Mitigation: Reasonable burst capacity (10 requests) handles legitimate retries + - Mitigation: Can be disabled via `RATE_LIMIT_TOKEN_ENABLED=false` if needed +- **IP Spoofing via X-Forwarded-For:** An attacker could rotate IPs in the header + - Mitigation: Configure Gin's trusted proxy settings in production + - Mitigation: Deploy behind a properly configured reverse proxy/load balancer + +**Configuration Example (.env):** +```bash +# Authenticated endpoint rate limiting (per client) +RATE_LIMIT_ENABLED=true +RATE_LIMIT_REQUESTS_PER_SEC=10.0 +RATE_LIMIT_BURST=20 + +# Token endpoint rate limiting (per IP, unauthenticated) +RATE_LIMIT_TOKEN_ENABLED=true +RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 +RATE_LIMIT_TOKEN_BURST=10 +``` + +**Testing:** +Both middleware implementations include comprehensive test coverage: +- Requests within limit allowed +- Requests exceeding limit blocked with 429 +- Retry-After header present +- Independent limits per client/IP +- Burst capacity handling +- Automatic cleanup of stale entries + +**Reference:** +- Client-based: `/internal/auth/http/rate_limit_middleware.go` and `rate_limit_middleware_test.go` +- IP-based: `/internal/auth/http/token_rate_limit_middleware.go` and `token_rate_limit_middleware_test.go` + ## Authentication & Authorization HTTP Layer ### HTTP Handler Organization Pattern diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a99205..af3cfda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.7.0] - 2026-02-20 + +### Added +- Added IP-based rate limiting middleware for unauthenticated `POST /v1/token` +- Added token endpoint rate-limit configuration via `RATE_LIMIT_TOKEN_ENABLED`, `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC`, and `RATE_LIMIT_TOKEN_BURST` + +### Changed +- Token issuance endpoint can now return `429 Too Many Requests` with `Retry-After` when per-IP limits are exceeded + +### Security +- Hardened token issuance path against credential stuffing and brute-force request bursts + +### Documentation +- Added `docs/releases/v0.7.0.md` release notes and `docs/releases/v0.7.0-upgrade.md` upgrade guide +- Updated docs for token endpoint throttling behavior, configuration, and troubleshooting guidance + ## [0.6.0] - 2026-02-19 ### Added diff --git a/Makefile b/Makefile index 9bc0e64..ae48429 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help build run test lint clean migrate-up migrate-down docker-build docker-run mocks docs-lint docs-check-examples docs-check-metadata +.PHONY: help build run test lint clean migrate-up migrate-down docker-build docker-run mocks docs-lint docs-check-examples docs-check-metadata docs-check-release-tags APP_NAME := app BINARY_DIR := bin @@ -78,11 +78,16 @@ docs-check-metadata: ## Validate docs metadata and API markers @echo "Running docs metadata checks..." @python3 docs/tools/check_docs_metadata.py +docs-check-release-tags: ## Validate pinned release image tags in current docs + @echo "Running docs release image tag checks..." + @python3 docs/tools/check_release_image_tags.py + docs-lint: ## Run markdown lint and offline link checks @echo "Running markdownlint-cli2..." 
@docker run --rm -v "$(PWD):/workdir" -w /workdir davidanson/markdownlint-cli2:v0.18.1 README.md "docs/**/*.md" ".github/pull_request_template.md" @$(MAKE) docs-check-examples @$(MAKE) docs-check-metadata + @$(MAKE) docs-check-release-tags @echo "Running lychee offline link checks..." @docker run --rm -v "$(PWD):/input" lycheeverse/lychee:latest --offline --include-fragments --no-progress "/input/README.md" "/input/docs/**/*.md" "/input/.github/pull_request_template.md" diff --git a/README.md b/README.md index 6790d5e..766c5a6 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Secrets is inspired by **HashiCorp Vault** ❤️, but it is intentionally **muc The default way to run Secrets is the published Docker image: ```bash -docker pull allisson/secrets:v0.6.0 +docker pull allisson/secrets:v0.7.0 ``` Use pinned tags for reproducible setups. `latest` is available for dev-only fast iteration. @@ -29,21 +29,20 @@ Then follow the Docker setup guide in [docs/getting-started/docker.md](docs/gett 1. 🐳 **Run with Docker image (recommended)**: [docs/getting-started/docker.md](docs/getting-started/docker.md) 2. 
πŸ’» **Run locally for development**: [docs/getting-started/local-development.md](docs/getting-started/local-development.md) -## πŸ†• What's New in v0.6.0 +## πŸ†• What's New in v0.7.0 -- ☁️ Added KMS integration for master key encryption at rest (`KMS_PROVIDER`, `KMS_KEY_URI`) -- πŸ” Added `rotate-master-key` CLI command for safer master key lifecycle operations -- 🧭 Added provider-specific KMS setup and migration runbook documentation -- βœ… Added KMS migration checklist: [docs/operations/kms-migration-checklist.md](docs/operations/kms-migration-checklist.md) -- πŸ“˜ Added release notes: [docs/releases/v0.6.0.md](docs/releases/v0.6.0.md) -- ⬆️ Added upgrade guide: [docs/releases/v0.6.0-upgrade.md](docs/releases/v0.6.0-upgrade.md) -- πŸ“¦ Updated pinned Docker docs/examples to `allisson/secrets:v0.6.0` +- πŸ›‘οΈ Added IP-based rate limiting for unauthenticated token issuance (`POST /v1/token`) +- βš™οΈ Added token endpoint configuration: `RATE_LIMIT_TOKEN_ENABLED`, `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC`, `RATE_LIMIT_TOKEN_BURST` +- 🚦 Added token endpoint `429` + `Retry-After` behavior for burst/abuse control +- πŸ“˜ Added release notes: [docs/releases/v0.7.0.md](docs/releases/v0.7.0.md) +- ⬆️ Added upgrade guide: [docs/releases/v0.7.0-upgrade.md](docs/releases/v0.7.0-upgrade.md) +- πŸ“¦ Updated pinned Docker docs/examples to `allisson/secrets:v0.7.0` Release history quick links: -- Current: [v0.6.0 release notes](docs/releases/v0.6.0.md) -- Previous: [v0.5.1 release notes](docs/releases/v0.5.1.md) -- Previous upgrade guide: [v0.5.1 upgrade guide](docs/releases/v0.5.1-upgrade.md) +- Current: [v0.7.0 release notes](docs/releases/v0.7.0.md) +- Previous: [v0.6.0 release notes](docs/releases/v0.6.0.md) +- Previous upgrade guide: [v0.6.0 upgrade guide](docs/releases/v0.6.0-upgrade.md) ## πŸ“š Docs Map @@ -54,8 +53,8 @@ Release history quick links: - 🧰 **Troubleshooting**: [docs/getting-started/troubleshooting.md](docs/getting-started/troubleshooting.md) - βœ… **Smoke 
test script**: [docs/getting-started/smoke-test.md](docs/getting-started/smoke-test.md) - πŸ§ͺ **CLI commands reference**: [docs/cli/commands.md](docs/cli/commands.md) -- πŸš€ **v0.6.0 release notes**: [docs/releases/v0.6.0.md](docs/releases/v0.6.0.md) -- ⬆️ **v0.6.0 upgrade guide**: [docs/releases/v0.6.0-upgrade.md](docs/releases/v0.6.0-upgrade.md) +- πŸš€ **v0.7.0 release notes**: [docs/releases/v0.7.0.md](docs/releases/v0.7.0.md) +- ⬆️ **v0.7.0 upgrade guide**: [docs/releases/v0.7.0-upgrade.md](docs/releases/v0.7.0-upgrade.md) - πŸ” **Release compatibility matrix**: [docs/releases/compatibility-matrix.md](docs/releases/compatibility-matrix.md) - **By Topic** diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 6f1c42b..a5463e8 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,6 +1,39 @@ # πŸ—’οΈ Documentation Changelog -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 + +## 2026-02-20 (docs v13 - v0.7.0 release prep) + +- Added release notes page: `docs/releases/v0.7.0.md` +- Added upgrade guide: `docs/releases/v0.7.0-upgrade.md` +- Updated docs metadata source (`docs/metadata.json`) to `current_release: v0.7.0` +- Updated root README and docs index to promote `v0.7.0` release links +- Updated compatibility matrix with `v0.6.0 -> v0.7.0` upgrade path +- Updated API docs to document token endpoint rate limiting and `POST /v1/token` `429` behavior +- Updated environment variable docs for `RATE_LIMIT_TOKEN_ENABLED`, `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC`, and `RATE_LIMIT_TOKEN_BURST` +- Updated troubleshooting and security hardening docs with token endpoint throttling guidance +- Updated pinned Docker image examples from `allisson/secrets:v0.6.0` to `allisson/secrets:v0.7.0` +- Added token endpoint throttling runbook section to production deployment guide +- Added token-endpoint-specific `429` response example and optional smoke test verification flow +- Expanded monitoring queries and alert starters for `/v1/token` throttling signals +- Added 
docs CI guard for current-release pinned image tag consistency +- Added operator quick card runbook (`docs/operations/operator-quick-card.md`) for rollout/incident triage +- Added trusted proxy reference guide (`docs/operations/trusted-proxy-reference.md`) for source-IP safety checks +- Added release note and upgrade guide templates (`docs/releases/_template.md`, `docs/releases/_upgrade-template.md`) +- Added auth docs retry handling snippets for token endpoint `429` and `Retry-After` +- Added docs architecture map updates for CI docs guards and local validation workflow +- Added Phase 3 planning roadmap (`docs/development/docs-phase-3-roadmap.md`) +- Expanded Phase 3 roadmap with prioritized backlog (`S/M/L`), dependencies, and execution order +- Added Phase 4 micro-roadmap (`docs/development/docs-phase-4-roadmap.md`) with 3 PR plan and CI guard proposals +- Added incident decision tree and first-15-minutes incident playbook runbooks +- Added known limitations page for rate limiting, proxy trust, and KMS startup tradeoffs +- Added versioned examples index by release (`docs/examples/versioned-by-release.md`) +- Added day-0 onboarding walkthroughs for operator and developer personas +- Added persona landing pages (`docs/personas/operator.md`, `docs/personas/developer.md`, `docs/personas/security.md`) +- Added docs KPI page and postmortem-to-doc feedback loop guidance +- Added consolidated docs master backlog (`docs/development/docs-master-backlog.md`) +- Added search alias shortcuts in docs index for faster incident/runbook discovery +- Added command verification markers to key rollout/troubleshooting/smoke docs ## 2026-02-19 (docs v12 - v0.6.0 release prep) diff --git a/docs/README.md b/docs/README.md index 536da05..cabffa1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ # πŸ“š Secrets Documentation -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Metadata source for release/API labels: `docs/metadata.json` @@ -10,6 +10,8 @@ Welcome to the 
full documentation for Secrets. Pick a path and dive in πŸš€ - 🐳 [getting-started/docker.md](getting-started/docker.md) (recommended) - πŸ’» [getting-started/local-development.md](getting-started/local-development.md) +- 🧭 [getting-started/day-0-operator.md](getting-started/day-0-operator.md) +- πŸ’» [getting-started/day-0-developer.md](getting-started/day-0-developer.md) - 🧰 [getting-started/troubleshooting.md](getting-started/troubleshooting.md) - βœ… [getting-started/smoke-test.md](getting-started/smoke-test.md) - πŸ§ͺ [cli/commands.md](cli/commands.md) @@ -22,6 +24,12 @@ Welcome to the full documentation for Secrets. Pick a path and dive in πŸš€ 4. Apply production hardening checklist: [operations/production.md](operations/production.md) 5. Use runbook hub for rollout and incidents: [operations/runbook-index.md](operations/runbook-index.md) +## πŸ‘₯ Persona Paths + +- πŸ‘· [personas/operator.md](personas/operator.md) +- πŸ‘¨β€πŸ’» [personas/developer.md](personas/developer.md) +- πŸ›‘οΈ [personas/security.md](personas/security.md) + ## πŸ“– Documentation by Topic - βš™οΈ [configuration/environment-variables.md](configuration/environment-variables.md) @@ -32,15 +40,25 @@ Welcome to the full documentation for Secrets. 
Pick a path and dive in πŸš€ - ☁️ [operations/kms-setup.md](operations/kms-setup.md) - βœ… [operations/kms-migration-checklist.md](operations/kms-migration-checklist.md) - πŸš€ [operations/production-rollout.md](operations/production-rollout.md) +- ⚑ [operations/operator-quick-card.md](operations/operator-quick-card.md) +- 🌲 [operations/incident-decision-tree.md](operations/incident-decision-tree.md) +- ⏱️ [operations/first-15-minutes.md](operations/first-15-minutes.md) - πŸ“Š [operations/monitoring.md](operations/monitoring.md) - 🧯 [operations/operator-drills.md](operations/operator-drills.md) - 🏭 [operations/production.md](operations/production.md) +- 🌐 [operations/trusted-proxy-reference.md](operations/trusted-proxy-reference.md) +- ⚠️ [operations/known-limitations.md](operations/known-limitations.md) - πŸš‘ [operations/failure-playbooks.md](operations/failure-playbooks.md) - πŸ§ͺ [operations/policy-smoke-tests.md](operations/policy-smoke-tests.md) - 🧭 [operations/runbook-index.md](operations/runbook-index.md) - πŸ› οΈ [development/testing.md](development/testing.md) - 🧾 [development/docs-release-checklist.md](development/docs-release-checklist.md) - πŸ—ΊοΈ [development/docs-architecture-map.md](development/docs-architecture-map.md) +- πŸ“ˆ [development/docs-quality-kpis.md](development/docs-quality-kpis.md) +- πŸ” [development/postmortem-doc-loop.md](development/postmortem-doc-loop.md) +- πŸ—‚οΈ [development/docs-master-backlog.md](development/docs-master-backlog.md) +- πŸ›£οΈ [development/docs-phase-3-roadmap.md](development/docs-phase-3-roadmap.md) +- 🧭 [development/docs-phase-4-roadmap.md](development/docs-phase-4-roadmap.md) - 🀝 [contributing.md](contributing.md) - πŸ—’οΈ [CHANGELOG.md](CHANGELOG.md) @@ -70,6 +88,14 @@ Welcome to the full documentation for Secrets. 
Pick a path and dive in πŸš€ - 🧩 [api/versioning-policy.md](api/versioning-policy.md) - πŸ“„ [openapi.yaml](openapi.yaml) +## πŸ”Ž Search Aliases + +- `401 403 429 decision tree` -> [operations/incident-decision-tree.md](operations/incident-decision-tree.md) +- `first 15 minutes incident` -> [operations/first-15-minutes.md](operations/first-15-minutes.md) +- `trusted proxy retry-after token 429` -> [operations/trusted-proxy-reference.md](operations/trusted-proxy-reference.md) +- `known limitations` -> [operations/known-limitations.md](operations/known-limitations.md) +- `versioned examples` -> [examples/versioned-by-release.md](examples/versioned-by-release.md) + OpenAPI scope note: - `openapi.yaml` is a baseline subset for common API flows in the current release (`docs/metadata.json`) @@ -78,8 +104,10 @@ OpenAPI scope note: ## πŸš€ Releases -- πŸ“¦ [releases/v0.6.0.md](releases/v0.6.0.md) -- ⬆️ [releases/v0.6.0-upgrade.md](releases/v0.6.0-upgrade.md) +- πŸ“¦ [releases/v0.7.0.md](releases/v0.7.0.md) +- ⬆️ [releases/v0.7.0-upgrade.md](releases/v0.7.0-upgrade.md) +- πŸ“¦ [releases/v0.6.0.md](releases/v0.6.0.md) (historical) +- ⬆️ [releases/v0.6.0-upgrade.md](releases/v0.6.0-upgrade.md) (historical) - πŸ“¦ [releases/v0.5.1.md](releases/v0.5.1.md) (historical) - ⬆️ [releases/v0.5.1-upgrade.md](releases/v0.5.1-upgrade.md) (historical) - πŸ“¦ [releases/v0.5.0.md](releases/v0.5.0.md) (historical) @@ -105,6 +133,7 @@ OpenAPI scope note: ## πŸ’‘ Practical Examples +- 🧭 [examples/versioned-by-release.md](examples/versioned-by-release.md) - πŸ§ͺ [examples/curl.md](examples/curl.md) - 🐍 [examples/python.md](examples/python.md) - 🟨 [examples/javascript.md](examples/javascript.md) diff --git a/docs/api/authentication.md b/docs/api/authentication.md index 2334008..1e3cc17 100644 --- a/docs/api/authentication.md +++ b/docs/api/authentication.md @@ -1,6 +1,6 @@ # πŸ” Authentication API -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 > Applies to: API v1 All protected 
endpoints require `Authorization: Bearer `. @@ -67,11 +67,38 @@ Expected result: token request returns `201 Created`, authenticated clients requ - `401 Unauthorized`: invalid credentials - `403 Forbidden`: inactive client - `422 Unprocessable Entity`: malformed request +- `429 Too Many Requests`: token issuance throttled by IP-based token endpoint limits Rate limiting note: -- `POST /v1/token` is not rate-limited by application middleware -- Protected endpoints called with issued tokens can return `429 Too Many Requests` +- `POST /v1/token` is rate-limited per client IP when `RATE_LIMIT_TOKEN_ENABLED=true` +- Protected endpoints called with issued tokens are rate-limited per authenticated client + +## Token `429` Handling Quick Check + +Inspect headers and status: + +```bash +curl -i -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"","client_secret":""}' +``` + +Expected when throttled: + +- HTTP status `429 Too Many Requests` +- `Retry-After` response header (seconds) + +Minimal retry-after extraction example: + +```bash +RETRY_AFTER="$(curl -s -D - -o /dev/null -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"","client_secret":""}' \ + | awk -F': ' 'tolower($1)=="retry-after" {print $2}' | tr -d '\r')" + +echo "Retry after: ${RETRY_AFTER}s" +``` ## Error Payload Examples diff --git a/docs/api/error-decision-matrix.md b/docs/api/error-decision-matrix.md index 3081bdc..6f9429b 100644 --- a/docs/api/error-decision-matrix.md +++ b/docs/api/error-decision-matrix.md @@ -1,6 +1,6 @@ # 🚨 API Error Decision Matrix -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 > Applies to: API v1 Use this matrix to triage API failures quickly and choose the next action. @@ -14,7 +14,7 @@ Use this matrix to triage API failures quickly and choose the next action. 
| `404 Not Found` | Route/resource missing | Wrong endpoint shape, unknown resource ID/key/path | Verify endpoint path shape first, then resource existence | | `409 Conflict` | Resource state conflict | Duplicate create (for example existing transit key name) | Switch to rotate/update flow or use unique resource name | | `422 Unprocessable Entity` | Validation failed | Invalid JSON/body/query, bad base64, malformed ciphertext contract | Validate payload and endpoint-specific contract | -| `429 Too Many Requests` | Request throttled | Per-client rate limit exceeded | Respect `Retry-After` and retry with backoff + jitter | +| `429 Too Many Requests` | Request throttled | Per-client or per-IP rate limit exceeded | Respect `Retry-After` and retry with backoff + jitter | ## Fast Triage Order @@ -24,6 +24,18 @@ Use this matrix to triage API failures quickly and choose the next action. 4. Validate payload contract (`422`) using endpoint docs 5. For `429`, apply retry policy and reassess client concurrency +## Fast discriminator (`401` vs `403` vs `429`) + +- `401 Unauthorized`: authentication failed before policy check; verify token or client credentials first +- `403 Forbidden`: authentication succeeded, but policy/capability denied requested path +- `429 Too Many Requests`: request hit per-client or per-IP throttling; inspect `Retry-After` + +First place to look: + +- `401`: token issuance/authentication logs and credential validity +- `403`: policy document, capability mapping, and path matcher behavior +- `429`: rate-limit settings (`RATE_LIMIT_*`, `RATE_LIMIT_TOKEN_*`) and traffic burst patterns + ## Capability mismatch quick map (`403`) - `GET /v1/secrets/*path` requires `decrypt` diff --git a/docs/api/rate-limiting.md b/docs/api/rate-limiting.md index a2f11fa..0c0bd60 100644 --- a/docs/api/rate-limiting.md +++ b/docs/api/rate-limiting.md @@ -1,10 +1,12 @@ # 🚦 API Rate Limiting -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 > Applies to: API v1 -Secrets 
enforces per-client rate limiting for authenticated API routes when -`RATE_LIMIT_ENABLED=true` (default). +Secrets enforces two rate-limiting scopes: + +- Per-client limits for authenticated API routes (`RATE_LIMIT_*`) +- Per-IP limits for unauthenticated token issuance (`RATE_LIMIT_TOKEN_*`) ## Scope @@ -17,7 +19,7 @@ Rate limiting scope matrix: | `/v1/secrets/*` | Yes | Requires Bearer auth | | `/v1/transit/*` | Yes | Requires Bearer auth | | `/v1/tokenization/*` | Yes | Requires Bearer auth | -| `POST /v1/token` | No | Token issuance route | +| `POST /v1/token` | Yes | Unauthenticated endpoint, rate-limited per client IP | | `GET /health` | No | Liveness checks | | `GET /ready` | No | Readiness checks | | `GET /metrics` | No | Prometheus scraping | @@ -25,9 +27,15 @@ Rate limiting scope matrix: ## Defaults ```dotenv +# Authenticated endpoints (per client) RATE_LIMIT_ENABLED=true RATE_LIMIT_REQUESTS_PER_SEC=10.0 RATE_LIMIT_BURST=20 + +# Token endpoint (per IP) +RATE_LIMIT_TOKEN_ENABLED=true +RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 +RATE_LIMIT_TOKEN_BURST=10 ``` ## Response behavior @@ -45,17 +53,21 @@ When a request exceeds the allowed rate, the API returns: } ``` +Token endpoint (`POST /v1/token`) uses the same status/header contract and returns an endpoint-specific +message indicating too many token requests from the caller IP. 
+ ## Client retry guidance - Respect `Retry-After` before retrying - Use exponential backoff with jitter - Avoid synchronized retries across many workers - Reduce per-client burst and concurrency where possible +- For token issuance, review shared NAT/proxy behavior and tune `RATE_LIMIT_TOKEN_*` if needed ## Distinguishing `403` vs `429` - `403 Forbidden`: policy/capability denies access -- `429 Too Many Requests`: request was authenticated/authorized but throttled +- `429 Too Many Requests`: request was throttled by per-client or per-IP rate limits ## See also diff --git a/docs/api/response-shapes.md b/docs/api/response-shapes.md index 3acad43..7fd6c0b 100644 --- a/docs/api/response-shapes.md +++ b/docs/api/response-shapes.md @@ -1,6 +1,6 @@ # 🧱 API Response Shapes -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 > Applies to: API v1 Use these representative response schemas as a stable reference across endpoint docs. @@ -166,6 +166,18 @@ Representative rate-limit payload (`429 Too Many Requests`): Rate-limit responses include a `Retry-After` header in seconds. +For `POST /v1/token`, the `message` text may be token-endpoint specific while keeping the same +error key and `429` contract. + +Representative token endpoint payload (`POST /v1/token`, `429 Too Many Requests`): + +```json +{ + "error": "rate_limit_exceeded", + "message": "Too many token requests from this IP. Please retry after the specified delay." +} +``` + Representative conflict payload (for example duplicate transit key create): ```json diff --git a/docs/cli/commands.md b/docs/cli/commands.md index 48f1a06..4b101a5 100644 --- a/docs/cli/commands.md +++ b/docs/cli/commands.md @@ -1,6 +1,6 @@ # πŸ§ͺ CLI Commands Reference -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use the `app` CLI for server runtime, key management, and client lifecycle operations. 
@@ -12,10 +12,10 @@ Local binary: ./bin/app [flags] ``` -Docker image (v0.6.0): +Docker image (v0.7.0): ```bash -docker run --rm --env-file .env allisson/secrets:v0.6.0 [flags] +docker run --rm --env-file .env allisson/secrets:v0.7.0 [flags] ``` ## Core Runtime @@ -33,7 +33,7 @@ Local: Docker: ```bash -docker run --rm --network secrets-net --env-file .env -p 8080:8080 allisson/secrets:v0.6.0 server +docker run --rm --network secrets-net --env-file .env -p 8080:8080 allisson/secrets:v0.7.0 server ``` ### `migrate` @@ -49,7 +49,7 @@ Local: Docker: ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 migrate +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 migrate ``` ## Key Management @@ -79,7 +79,7 @@ Local: Docker: ```bash -docker run --rm allisson/secrets:v0.6.0 create-master-key --id default +docker run --rm allisson/secrets:v0.7.0 create-master-key --id default ``` ### `rotate-master-key` @@ -99,7 +99,7 @@ Local: Docker: ```bash -docker run --rm --env-file .env allisson/secrets:v0.6.0 rotate-master-key --id master-key-2026-08 +docker run --rm --env-file .env allisson/secrets:v0.7.0 rotate-master-key --id master-key-2026-08 ``` ### `create-kek` @@ -119,7 +119,7 @@ Local: Docker: ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 create-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 create-kek --algorithm aes-gcm ``` ### `rotate-kek` @@ -139,7 +139,7 @@ Local: Docker: ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 rotate-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 rotate-kek --algorithm aes-gcm ``` After master key or KEK rotation, restart API server instances so they load updated key material. 
@@ -176,7 +176,7 @@ Examples: --deterministic \ --algorithm aes-gcm -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 \ +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 \ create-tokenization-key --name payment-cards --format luhn-preserving --deterministic --algorithm aes-gcm ``` @@ -200,7 +200,7 @@ Examples: --deterministic \ --algorithm chacha20-poly1305 -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 \ +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 \ rotate-tokenization-key --name payment-cards --format luhn-preserving --deterministic --algorithm chacha20-poly1305 ``` @@ -224,7 +224,7 @@ Examples: ./bin/app clean-expired-tokens --days 30 --format text # Docker form -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 \ +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 \ clean-expired-tokens --days 30 --dry-run --format json ``` @@ -307,7 +307,7 @@ Examples: ./bin/app clean-audit-logs --days 90 --format text # Docker form -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 \ +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 \ clean-audit-logs --days 90 --dry-run --format json ``` diff --git a/docs/configuration/environment-variables.md b/docs/configuration/environment-variables.md index b987a49..e72bbfd 100644 --- a/docs/configuration/environment-variables.md +++ b/docs/configuration/environment-variables.md @@ -1,6 +1,6 @@ # βš™οΈ Environment Variables -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Secrets is configured through environment variables. 
@@ -28,11 +28,16 @@ ACTIVE_MASTER_KEY_ID=default # Authentication configuration AUTH_TOKEN_EXPIRATION_SECONDS=14400 -# Rate limiting configuration +# Rate limiting configuration (authenticated endpoints) RATE_LIMIT_ENABLED=true RATE_LIMIT_REQUESTS_PER_SEC=10.0 RATE_LIMIT_BURST=20 +# Token endpoint rate limiting (IP-based, unauthenticated) +RATE_LIMIT_TOKEN_ENABLED=true +RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 +RATE_LIMIT_TOKEN_BURST=10 + # CORS configuration CORS_ENABLED=false CORS_ALLOW_ORIGINS= @@ -224,6 +229,33 @@ Allows clients to temporarily exceed `RATE_LIMIT_REQUESTS_PER_SEC` up to the bur Tune based on observed `429` rates and client retry behavior. +### RATE_LIMIT_TOKEN_ENABLED + +Enable per-IP rate limiting on token issuance endpoint `POST /v1/token` (default: `true`). + +Use this protection to reduce credential stuffing and brute-force traffic on unauthenticated token +requests. + +### RATE_LIMIT_TOKEN_REQUESTS_PER_SEC + +Maximum token issuance requests per second per client IP (default: `5.0`). + +### RATE_LIMIT_TOKEN_BURST + +Burst capacity for token issuance per IP (default: `10`). + +Allows short request spikes while preserving stricter controls for the unauthenticated token endpoint. + +### Token endpoint presets (starting points) + +| Profile | RATE_LIMIT_TOKEN_REQUESTS_PER_SEC | RATE_LIMIT_TOKEN_BURST | Typical use case | +| --- | --- | --- | --- | +| Strict (default) | `5.0` | `10` | Internet-facing token issuance | +| Shared-egress | `10.0` | `20` | Enterprise NAT/proxy callers | +| Internal trusted | `20.0` | `40` | Internal service mesh token broker | + +Tune based on `POST /v1/token` `429` rates, NAT/proxy sharing patterns, and retry behavior. + ## CORS configuration ### CORS_ENABLED @@ -284,7 +316,7 @@ Prefix for all metric names (default: `secrets`). 
Or with Docker image: ```bash -docker run --rm allisson/secrets:v0.6.0 create-master-key --id default +docker run --rm allisson/secrets:v0.7.0 create-master-key --id default ``` ## See also diff --git a/docs/contributing.md b/docs/contributing.md index eb6ac49..2b9ddf2 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -1,6 +1,6 @@ # 🤝 Documentation Contributing Guide -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use this guide when adding or editing project documentation. @@ -124,6 +124,15 @@ For behavior changes, update all relevant docs in the same PR: - Perform a monthly docs review for stale examples, outdated commands, and dead links - During releases, verify `Last updated` metadata and append entries to `docs/CHANGELOG.md` +Incident feedback policy: + +- For Sev incidents, apply the [Postmortem to docs feedback loop](development/postmortem-doc-loop.md) +- Incident remediations should either update docs or record explicit no-doc-change rationale + +Quality KPIs: + +- Track baseline docs quality via [Docs quality KPIs](development/docs-quality-kpis.md) + ## Docs Release Process 1. Update `Last updated` in every changed docs file diff --git a/docs/development/docs-architecture-map.md b/docs/development/docs-architecture-map.md index 2277da8..3259b22 100644 --- a/docs/development/docs-architecture-map.md +++ b/docs/development/docs-architecture-map.md @@ -1,6 +1,6 @@ # 🗺️ Docs Architecture Map -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 This page defines canonical vs supporting docs to reduce duplication and drift. @@ -33,6 +33,19 @@ This page defines canonical vs supporting docs to reduce duplication and drift. 3. Update `docs/CHANGELOG.md` for significant docs updates 4. 
Run docs checks before merge +Recommended local validation: + +- `make docs-lint` +- `make docs-check-metadata` +- `make docs-check-release-tags` + +## CI/Tooling Guards + +- `docs/tools/check_docs_metadata.py`: release/API metadata and `Last updated` consistency +- `docs/tools/check_release_docs_links.py`: release docs link integrity in PRs +- `docs/tools/check_example_shapes.py`: JSON example structure sanity checks +- `docs/tools/check_release_image_tags.py`: pinned current-release Docker tag consistency + ## Drift Signals - Endpoint docs disagree with capability matrix diff --git a/docs/development/docs-master-backlog.md b/docs/development/docs-master-backlog.md new file mode 100644 index 0000000..982242a --- /dev/null +++ b/docs/development/docs-master-backlog.md @@ -0,0 +1,51 @@ +# 🗂️ Docs Master Backlog + +> Last updated: 2026-02-20 + +This page consolidates Phase 3, Phase 4, and maturity follow-ups into one prioritized execution sequence. + +## P0 (Immediate) + +| Item | Effort | Dependency | +| --- | --- | --- | +| Incident decision tree and first-15-minutes playbook | S | none | +| Operator/developer day-0 walkthrough paths | S | none | +| Known limitations page for ops/security expectations | S | none | + +## P1 (Near-term) + +| Item | Effort | Dependency | +| --- | --- | --- | +| Freshness SLA check + CI | M | policy alignment | +| Internal anchor integrity check + CI | M | docs tooling baseline | +| OpenAPI-to-doc coverage guard | M | endpoint mapping config | +| Example parity checks across runtimes | M | examples conventions | + +## P2 (Governance) + +| Item | Effort | Dependency | +| --- | --- | --- | +| Docs ownership matrix and review cadence page | S | team owner mapping | +| Postmortem-to-doc feedback loop policy | S | incident process agreement | +| Docs KPI reporting page and monthly review process | S | CI metrics visibility | + +## P3 (Maturity) + +| Item | Effort | Dependency | +| --- | --- | --- | +| API contracts/invariants canonical 
page | M | API doc harmonization | +| Release audience diff summaries (users/operators/security) | M | release template update | +| Search vocabulary normalization pass | S | page owners for key docs | + +## Suggested execution sequence + +1. Complete P0 content and navigation updates +2. Implement P1 checks in CI with low-noise defaults +3. Formalize P2 governance and cadence +4. Deliver P3 consistency and release communication upgrades + +## See also + +- [Docs phase 3 roadmap](docs-phase-3-roadmap.md) +- [Docs phase 4 roadmap](docs-phase-4-roadmap.md) +- [Docs release checklist](docs-release-checklist.md) diff --git a/docs/development/docs-phase-3-roadmap.md b/docs/development/docs-phase-3-roadmap.md new file mode 100644 index 0000000..03f53d5 --- /dev/null +++ b/docs/development/docs-phase-3-roadmap.md @@ -0,0 +1,95 @@ +# 🛣️ Docs Phase 3 Roadmap + +> Last updated: 2026-02-20 + +This roadmap captures next-step documentation improvements after the `v0.7.0` release prep +and Phase 2 operator hardening updates. + +## Objectives + +- Reduce time-to-troubleshoot for operators +- Improve contract clarity for API consumers +- Lower long-term documentation drift risk + +## Quick Wins (same-day) + +1. Add an API contracts hub page (`docs/api/contracts.md`) and link from all endpoint pages +2. Add task-based operator navigation (`deploy`, `debug auth`, `debug 429`, `rotate keys`) in runbook index +3. Add negative examples in `docs/examples/*` for common `401/403/422/429` paths +4. Add glossary backlinks for core terms in security and operations pages + +## Medium Scope (1 PR) + +1. Create release cut companion checklist covering non-doc release actions: + - tag creation validation + - image publish and pull verification + - rollback artifact verification +2. Add a docs ownership matrix by domain: + - API docs ownership + - operations/runbook ownership + - release docs ownership + +## Deeper Scope (1-2 PRs) + +1. 
Build a canonical API contract invariants page with explicit guarantees: + - ciphertext input/output contracts + - error response structure guarantees + - versioning and compatibility expectations +2. Add cross-page consistency guards for contract terms (light static checks) + +## Suggested PR Breakdown + +1. **PR A (Quick wins):** contracts hub + operator task nav + negative examples +2. **PR B (Governance):** release cut companion checklist + docs ownership matrix +3. **PR C (Contracts hardening):** invariants page + consistency checks + +## Definition of Done (Phase 3) + +- All endpoint docs link to shared contracts page +- Runbook index includes task-based operator entry points +- Examples include at least one negative flow per major API area +- Release docs include non-doc release validation links +- Docs ownership matrix is published and linked from docs architecture map + +## Prioritized Backlog (S/M/L + dependencies) + +| Priority | Initiative | Effort | Dependencies | Why now | +| --- | --- | --- | --- | --- | +| P0 | Docs decision tree for incident triage (`401/403/429/5xx`) | S | none | Fastest operator navigation win during incidents | +| P0 | First 15 minutes incident playbook with copy/paste commands | S | none | Reduces on-call ambiguity and response time | +| P1 | OpenAPI-to-doc coverage guard (endpoint reference consistency) | M | stable endpoint docs links | Prevents contract docs drift over time | +| P1 | Example parity checks across curl/python/js/go | M | examples folder conventions | Keeps multi-language guidance consistent | +| P2 | Docs ownership metadata (owner + review cadence) | S | team ownership agreement | Improves freshness accountability | +| P2 | Release audience diff pages (users/operators/security) | M | release template updates | Speeds release communication and change impact review | + +## Suggested execution order + +1. Deliver P0 items together in one quick PR +2. Deliver P1 checks with CI integration in a second PR +3. 
Deliver P2 governance/reporting items in a third PR + +## Risks and mitigations + +- Risk: extra static checks create noisy CI failures + - Mitigation: start in warning mode locally, then enforce in CI after one release cycle +- Risk: ownership metadata becomes stale + - Mitigation: include ownership review in release checklist cadence +- Risk: endpoint-doc mapping false positives for grouped docs pages + - Mitigation: allow mapping config file for intentional many-to-one endpoint coverage + +## Validation + +Run before merge: + +```bash +make docs-lint +make docs-check-examples +make docs-check-metadata +make docs-check-release-tags +``` + +## See also + +- [Docs architecture map](docs-architecture-map.md) +- [Docs release checklist](docs-release-checklist.md) +- [Operator runbook index](../operations/runbook-index.md) diff --git a/docs/development/docs-phase-4-roadmap.md b/docs/development/docs-phase-4-roadmap.md new file mode 100644 index 0000000..9b8d0a6 --- /dev/null +++ b/docs/development/docs-phase-4-roadmap.md @@ -0,0 +1,112 @@ +# 🧭 Docs Phase 4 Micro-Roadmap + +> Last updated: 2026-02-20 + +This phase focuses on documentation process quality, freshness visibility, and guardrails. + +## Scope + +- Keep improvements small and enforceable +- Prefer CI-backed checks over manual reminders +- Ship in 3 focused PRs + +## PR 1: Freshness SLA and Stale Page Guard + +### PR 1 Goal + +Detect stale docs pages before drift becomes operational risk. 
+ +### PR 1 Changes + +- Add `docs/tools/check_docs_freshness.py` +- Add `make docs-check-freshness` +- Add CI step in `.github/workflows/ci.yml` +- Add freshness policy section in `docs/contributing.md` + +### PR 1 Rule set (starter) + +- Fail if `> Last updated:` is older than 120 days for: + - `docs/api/*.md` + - `docs/operations/*.md` + - `docs/getting-started/*.md` +- Exclude historical release pages and ADR pages + +## PR 2: Internal Anchor Integrity Guard + +### PR 2 Goal + +Catch broken section links when headings change in long docs. + +### PR 2 Changes + +- Add `docs/tools/check_internal_anchors.py` +- Add `make docs-check-anchors` +- Add CI step in `.github/workflows/ci.yml` +- Document anchor-link practices in `docs/development/docs-architecture-map.md` + +### PR 2 Rule set (starter) + +- Validate local markdown links with fragments (e.g., `file.md#section-heading`) +- Fail when target file exists but fragment no longer resolves + +## PR 3: Command Validation Markers + Persona Entrypoints + +### PR 3 Goal + +Improve trust in copy/paste blocks and speed onboarding by audience. + +### PR 3 Changes + +- Add command validation markers to critical pages: + - `docs/operations/production-rollout.md` + - `docs/operations/production.md` + - `docs/getting-started/troubleshooting.md` + - `docs/getting-started/smoke-test.md` +- Add persona landing pages: + - `docs/personas/operator.md` + - `docs/personas/developer.md` + - `docs/personas/security.md` +- Link persona pages from `docs/README.md` + +### PR 3 Marker format (starter) + +Use a compact marker above critical command blocks: + +```text +> Command status: verified on YYYY-MM-DD +``` + +## Dependencies and Order + +1. PR 1 (freshness) first +2. PR 2 (anchors) second +3. 
PR 3 (usability) third + +## Success Criteria + +- Freshness check runs in CI and fails stale high-risk pages +- Anchor check runs in CI and prevents broken section links +- Critical command blocks include validation markers +- Persona pages provide a shortest-path doc flow by role + +## Validation Commands + +```bash +make docs-lint +make docs-check-examples +make docs-check-metadata +make docs-check-release-tags +``` + +After PR 1 and PR 2: + +```bash +make docs-check-freshness +make docs-check-anchors +``` + +## See also + +- [Docs phase 3 roadmap](docs-phase-3-roadmap.md) +- [Docs release checklist](docs-release-checklist.md) +- [Docs architecture map](docs-architecture-map.md) diff --git a/docs/development/docs-quality-kpis.md b/docs/development/docs-quality-kpis.md new file mode 100644 index 0000000..e729ada --- /dev/null +++ b/docs/development/docs-quality-kpis.md @@ -0,0 +1,33 @@ +# 📈 Docs Quality KPIs + +> Last updated: 2026-02-20 + +Use these KPIs to track documentation reliability and operational usefulness. 
+ +## Core KPIs + +| KPI | Target | Source | +| --- | --- | --- | +| Docs lint/link pass rate | 100% on main and PRs | CI (`make docs-lint`) | +| Stale high-risk pages (API/ops/getting-started) | 0 pages older than SLA | freshness check (Phase 4 PR 1) | +| Incident triage time-to-first-runbook | <= 5 minutes | on-call postmortems | +| Docs-related incident follow-up completion | 100% for Sev incidents | incident action tracker | +| Broken internal anchor count | 0 | anchor guard (Phase 4 PR 2) | + +## Review cadence + +- Weekly: CI quality metrics (lint/link/check failures) +- Monthly: freshness + ownership review +- After Sev incidents: triage path clarity and runbook updates + +## Escalation triggers + +- Repeated docs-check CI failures for 2+ weeks +- 2+ incidents in a month citing missing/unclear docs guidance +- Freshness SLA misses in API/operations docs + +## See also + +- [Documentation contributing guide](../contributing.md) +- [Postmortem to docs feedback loop](postmortem-doc-loop.md) +- [Docs phase 4 roadmap](docs-phase-4-roadmap.md) diff --git a/docs/development/docs-release-checklist.md b/docs/development/docs-release-checklist.md index de8d9fe..b0821d8 100644 --- a/docs/development/docs-release-checklist.md +++ b/docs/development/docs-release-checklist.md @@ -1,6 +1,6 @@ # 🧾 Docs Release Checklist -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use this checklist for each release (`vX.Y.Z`) to keep docs consistent and navigable. 
@@ -15,6 +15,9 @@ Use this checklist for each release (`vX.Y.Z`) to keep docs consistent and navig - Add release notes: `docs/releases/vX.Y.Z.md` - Add upgrade guide when behavior/defaults change: `docs/releases/vX.Y.Z-upgrade.md` +- Start from templates: + - `docs/releases/_template.md` + - `docs/releases/_upgrade-template.md` - Update release compatibility matrix: `docs/releases/compatibility-matrix.md` - Promote new release links in docs indexes and operator runbooks @@ -46,6 +49,7 @@ Use this checklist for each release (`vX.Y.Z`) to keep docs consistent and navig intended for reproducible operations. - Use `allisson/secrets:latest` only in explicitly marked fast-iteration/dev-only examples. - In one document, avoid mixing pinned and `latest` tags unless the distinction is explicitly explained. +- Ensure current-release pinned tag consistency guard passes (`docs/tools/check_release_image_tags.py`). ## 6) Validation before merge @@ -55,6 +59,7 @@ Run: make docs-lint make docs-check-examples make docs-check-metadata +make docs-check-release-tags ``` CI should also validate: diff --git a/docs/development/postmortem-doc-loop.md b/docs/development/postmortem-doc-loop.md new file mode 100644 index 0000000..f1edc8d --- /dev/null +++ b/docs/development/postmortem-doc-loop.md @@ -0,0 +1,38 @@ +# 🔁 Postmortem to Docs Feedback Loop + +> Last updated: 2026-02-20 + +Use this process to ensure incidents continuously improve operational documentation. + +## Policy + +For every Sev incident, include one of the following outcomes in the postmortem: + +1. Docs updated in the same remediation PR +2. Explicit note: "No documentation change needed" with rationale + +## Required fields in postmortem + +- Runbook used first +- Time to first useful doc reference +- Missing/ambiguous docs sections +- Docs updates created (path + PR link) + +## Minimal workflow + +1. Incident is resolved +2. Owner identifies doc gaps from timeline +3. Patch docs or record no-change rationale +4. 
Update `docs/CHANGELOG.md` if docs changed +5. Confirm docs checks pass before merge + +## Suggested SLA + +- Sev 1-2 incidents: docs follow-up within 2 business days +- Sev 3 incidents: docs follow-up within 5 business days + +## See also + +- [Failure playbooks](../operations/failure-playbooks.md) +- [Incident decision tree](../operations/incident-decision-tree.md) +- [Docs quality KPIs](docs-quality-kpis.md) diff --git a/docs/examples/versioned-by-release.md b/docs/examples/versioned-by-release.md new file mode 100644 index 0000000..1acb2c8 --- /dev/null +++ b/docs/examples/versioned-by-release.md @@ -0,0 +1,34 @@ +# 🧪 Versioned Examples by Release + +> Last updated: 2026-02-20 + +Use this page to quickly identify which example set matches your deployed release. + +## Current release (`v0.7.0`) + +- Primary examples: + - [Curl examples](curl.md) + - [Python examples](python.md) + - [JavaScript examples](javascript.md) + - [Go examples](go.md) +- Release context: + - [v0.7.0 release notes](../releases/v0.7.0.md) + - [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) + +## Previous release (`v0.6.0`) + +- Backward context: + - [v0.6.0 release notes](../releases/v0.6.0.md) + - [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md) + +## Compatibility notes + +- Example payloads and status codes follow current API docs (`/v1/*`) +- For endpoint-specific behavior changes, read release notes first +- For throttling behavior, validate `429` + `Retry-After` handling in your client runtime + +## See also + +- [Authentication API](../api/authentication.md) +- [API error decision matrix](../api/error-decision-matrix.md) +- [API rate limiting](../api/rate-limiting.md) diff --git a/docs/getting-started/day-0-developer.md b/docs/getting-started/day-0-developer.md new file mode 100644 index 0000000..8db387c --- /dev/null +++ b/docs/getting-started/day-0-developer.md @@ -0,0 +1,42 @@ +# 💻 Day 0 Developer Walkthrough + +> Last updated: 2026-02-20 + +Use this path for 
first-time contributors integrating with Secrets APIs. + +## Step 1: Run locally + +- Follow: [Run locally](local-development.md) +- Build and start API, then verify health + +## Step 2: Understand auth + policy behavior + +- Read: [Authentication API](../api/authentication.md) +- Read: [Policies cookbook](../api/policies.md) +- Read: [Capability matrix](../api/capability-matrix.md) + +## Step 3: Validate error and retry behavior + +- Read: [API error decision matrix](../api/error-decision-matrix.md) +- Read: [API rate limiting](../api/rate-limiting.md) + +## Step 4: Use examples by runtime + +- Start with: [Versioned examples by release](../examples/versioned-by-release.md) +- Then use: [Curl](../examples/curl.md), [Python](../examples/python.md), [JavaScript](../examples/javascript.md), [Go](../examples/go.md) + +## Step 5: Follow docs contribution quality bar + +- Read: [Documentation contributing guide](../contributing.md) +- Use: [Docs release checklist](../development/docs-release-checklist.md) + +## Expected outcomes + +- You can obtain tokens and call protected endpoints reliably +- You can distinguish authn/authz/throttling failures in client integrations +- You can submit feature PRs with aligned API + docs changes + +## See also + +- [Testing guide](../development/testing.md) +- [Docs architecture map](../development/docs-architecture-map.md) diff --git a/docs/getting-started/day-0-operator.md b/docs/getting-started/day-0-operator.md new file mode 100644 index 0000000..bdd0869 --- /dev/null +++ b/docs/getting-started/day-0-operator.md @@ -0,0 +1,42 @@ +# 🧭 Day 0 Operator Walkthrough + +> Last updated: 2026-02-20 + +Use this linear path for first-time operations onboarding. 
+ +## Step 1: Bring up a local baseline + +- Follow: [Run with Docker](docker.md) +- Verify: `GET /health`, `GET /ready` + +## Step 2: Validate core flows + +- Run: [Smoke test script](smoke-test.md) +- Confirm token issuance, secrets, and transit checks pass + +## Step 3: Learn rollout and rollback flow + +- Read: [Production rollout golden path](../operations/production-rollout.md) +- Focus: verification gates and rollback triggers + +## Step 4: Learn incident response path + +- Use: [Incident decision tree](../operations/incident-decision-tree.md) +- Drill: [First 15 Minutes Playbook](../operations/first-15-minutes.md) + +## Step 5: Harden production posture + +- Read: [Production deployment guide](../operations/production.md) +- Read: [Security hardening guide](../operations/security-hardening.md) +- Check: [Known limitations](../operations/known-limitations.md) + +## Expected outcomes + +- You can validate service health and auth quickly +- You can identify `401/403/429/5xx` primary runbook path +- You can execute a basic rollback trigger decision under pressure + +## See also + +- [Operator quick card](../operations/operator-quick-card.md) +- [Operator runbook index](../operations/runbook-index.md) diff --git a/docs/getting-started/docker.md b/docs/getting-started/docker.md index d4e03fa..2f6cc4d 100644 --- a/docs/getting-started/docker.md +++ b/docs/getting-started/docker.md @@ -1,10 +1,10 @@ # 🐳 Run with Docker (Recommended) -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 This is the default way to run Secrets. -For release reproducibility, this guide uses the pinned image tag `allisson/secrets:v0.6.0`. +For release reproducibility, this guide uses the pinned image tag `allisson/secrets:v0.7.0`. For dev-only fast iteration, you can use `allisson/secrets:latest`. **⚠️ Security Warning:** This guide is for **development and testing only**. 
For production deployments, see [Security Hardening Guide](../operations/security-hardening.md) and [Production Deployment Guide](../operations/production.md). @@ -13,18 +13,19 @@ For dev-only fast iteration, you can use `allisson/secrets:latest`. - `AUTH_TOKEN_EXPIRATION_SECONDS` default is `14400` (4 hours) - `RATE_LIMIT_ENABLED` default is `true` (per authenticated client) +- `RATE_LIMIT_TOKEN_ENABLED` default is `true` (per IP on `POST /v1/token`) - `CORS_ENABLED` default is `false` -These defaults were introduced in `v0.5.0` and remain unchanged in `v0.6.0`. +These defaults were introduced in `v0.5.0` and now include token-endpoint rate limiting in `v0.7.0`. -If upgrading from `v0.5.1`, review [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md). +If upgrading from `v0.6.0`, review [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md). ## ⚑ Quickstart Copy Block Use this minimal flow when you just want to get a working instance quickly: ```bash -docker pull allisson/secrets:v0.6.0 +docker pull allisson/secrets:v0.7.0 docker network create secrets-net || true docker run -d --name secrets-postgres --network secrets-net \ @@ -33,19 +34,19 @@ docker run -d --name secrets-postgres --network secrets-net \ -e POSTGRES_DB=mydb \ postgres:16-alpine -docker run --rm allisson/secrets:v0.6.0 create-master-key --id default +docker run --rm allisson/secrets:v0.7.0 create-master-key --id default # copy generated MASTER_KEYS and ACTIVE_MASTER_KEY_ID into .env -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 migrate -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 create-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 migrate +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 create-kek --algorithm aes-gcm docker run --rm --name secrets-api --network secrets-net --env-file .env -p 8080:8080 \ - allisson/secrets:v0.6.0 server + 
allisson/secrets:v0.7.0 server ``` ## 1) Pull the image ```bash -docker pull allisson/secrets:v0.6.0 +docker pull allisson/secrets:v0.7.0 ``` ## 2) Start PostgreSQL @@ -63,7 +64,7 @@ docker run -d --name secrets-postgres --network secrets-net \ ## 3) Generate a master key ```bash -docker run --rm allisson/secrets:v0.6.0 create-master-key --id default +docker run --rm allisson/secrets:v0.7.0 create-master-key --id default ``` Copy the generated values into a local `.env` file. @@ -87,6 +88,13 @@ ACTIVE_MASTER_KEY_ID=default AUTH_TOKEN_EXPIRATION_SECONDS=14400 +RATE_LIMIT_ENABLED=true +RATE_LIMIT_REQUESTS_PER_SEC=10.0 +RATE_LIMIT_BURST=20 +RATE_LIMIT_TOKEN_ENABLED=true +RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 +RATE_LIMIT_TOKEN_BURST=10 + METRICS_ENABLED=true METRICS_NAMESPACE=secrets EOF @@ -95,15 +103,15 @@ EOF ## 5) Run migrations and bootstrap KEK ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 migrate -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 create-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 migrate +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 create-kek --algorithm aes-gcm ``` ## 6) Start the API server ```bash docker run --rm --name secrets-api --network secrets-net --env-file .env -p 8080:8080 \ - allisson/secrets:v0.6.0 server + allisson/secrets:v0.7.0 server ``` ## 7) Verify @@ -123,7 +131,7 @@ Expected: Use the CLI command to create your first API client and policy set: ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 create-client \ +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 create-client \ --name bootstrap-admin \ --active \ --policies '[{"path":"*","capabilities":["read","write","delete","encrypt","decrypt","rotate"]}]' \ diff --git a/docs/getting-started/local-development.md 
b/docs/getting-started/local-development.md index e80086b..b7571a2 100644 --- a/docs/getting-started/local-development.md +++ b/docs/getting-started/local-development.md @@ -1,6 +1,6 @@ # 💻 Run Locally (Development) -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use this path if you want to modify the source code and run from your workstation. @@ -10,11 +10,12 @@ Use this path if you want to modify the source code and run from your workstatio - `AUTH_TOKEN_EXPIRATION_SECONDS` default is `14400` (4 hours) - `RATE_LIMIT_ENABLED` default is `true` (per authenticated client) +- `RATE_LIMIT_TOKEN_ENABLED` default is `true` (per IP on `POST /v1/token`) - `CORS_ENABLED` default is `false` -These defaults were introduced in `v0.5.0` and remain unchanged in `v0.6.0`. +These defaults were introduced in `v0.5.0` and now include token-endpoint rate limiting in `v0.7.0`. -If upgrading from `v0.5.1`, review [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md). +If upgrading from `v0.6.0`, review [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md). ## Prerequisites diff --git a/docs/getting-started/smoke-test.md b/docs/getting-started/smoke-test.md index bedc478..4a098a2 100644 --- a/docs/getting-started/smoke-test.md +++ b/docs/getting-started/smoke-test.md @@ -1,6 +1,6 @@ # ✅ Smoke Test Script -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Run a fast end-to-end validation of a running Secrets instance. @@ -33,6 +33,8 @@ For transit decrypt, pass `ciphertext` exactly as returned by encrypt (` Command status: verified on 2026-02-20 + ```bash CLIENT_ID="" \ CLIENT_SECRET="" \ @@ -49,6 +51,31 @@ Optional variables: Expected output includes `Smoke test completed successfully`. If transit decrypt fails with `422`, see [Troubleshooting](troubleshooting.md#422-unprocessable-entity). 
+## Optional: Token Throttling Verification + +> Command status: verified on 2026-02-20 + +Use this only in non-production environments to verify token endpoint `429` behavior: + +```bash +# 1) Issue one token normally (should return 201) +curl -i -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"","client_secret":""}' + +# 2) Burst requests to trigger throttling in strict configs +for i in $(seq 1 20); do + curl -s -o /dev/null -w "%{http_code}\n" -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"","client_secret":""}' +done +``` + +Expected result under throttling: + +- Some responses return `429 Too Many Requests` +- Response includes `Retry-After` header + ⚠️ Security Warning: base64 is encoding, not encryption. Always use HTTPS/TLS. ## See also @@ -56,5 +83,5 @@ If transit decrypt fails with `422`, see [Troubleshooting](troubleshooting.md#42 - [Docker getting started](docker.md) - [Local development](local-development.md) - [Troubleshooting](troubleshooting.md) -- [v0.6.0 release notes](../releases/v0.6.0.md) +- [v0.7.0 release notes](../releases/v0.7.0.md) - [Curl examples](../examples/curl.md) diff --git a/docs/getting-started/troubleshooting.md b/docs/getting-started/troubleshooting.md index 2236c65..b5703a8 100644 --- a/docs/getting-started/troubleshooting.md +++ b/docs/getting-started/troubleshooting.md @@ -1,6 +1,6 @@ # 🧰 Troubleshooting -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use this guide for common setup and runtime errors. @@ -9,7 +9,7 @@ Use this guide for common setup and runtime errors. Use this quick route before diving into detailed sections: 1. `curl http://localhost:8080/health` fails -> go to `Database connection failure` and `Migration failure` -2. Token endpoint (`POST /v1/token`) returns `401`/`403` -> go to `401 Unauthorized` or `Token issuance fails with valid-looking credentials` +2. 
Token endpoint (`POST /v1/token`) returns `401`/`403`/`429` -> go to `401 Unauthorized`, `429 Too Many Requests`, or `Token issuance fails with valid-looking credentials` 3. API requests return `403` with valid token -> go to `403 Forbidden` (policy/capability mismatch) 4. API requests return `422` -> go to `422 Unprocessable Entity` (payload/query format) 5. API requests return `429` -> go to `429 Too Many Requests` (rate limiting) @@ -112,17 +112,26 @@ Common 422 cases: ## 429 Too Many Requests - Symptom: authenticated requests return `429` -- Likely cause: per-client rate limit exceeded +- Likely cause: per-client rate limit exceeded on authenticated endpoints, or per-IP token endpoint rate limit exceeded on `POST /v1/token` - Fix: - check `Retry-After` response header and back off before retrying - implement exponential backoff with jitter in client retry logic - reduce request burst/concurrency from caller - tune `RATE_LIMIT_REQUESTS_PER_SEC` and `RATE_LIMIT_BURST` if traffic is legitimate + - for `POST /v1/token`, tune `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC` and `RATE_LIMIT_TOKEN_BURST` if callers share NAT/proxy egress + +Trusted proxy checks for token endpoint (`POST /v1/token`): + +- If many callers suddenly look like one IP, verify proxy forwarding and trusted proxy settings +- If `X-Forwarded-For` is accepted from untrusted sources, IP spoofing can bypass intended per-IP controls +- Compare application logs (`client_ip`) with edge proxy logs to confirm real source-IP propagation +- Use [Trusted proxy reference](../operations/trusted-proxy-reference.md) for a platform checklist Quick note: -- Rate limiting applies to authenticated API groups (`/v1/clients`, `/v1/secrets`, `/v1/transit`, `/v1/tokenization`, `/v1/audit-logs`) -- Rate limiting does not apply to `/health`, `/ready`, `/metrics`, and token issuance (`POST /v1/token`) +- Authenticated rate limiting applies to `/v1/clients`, `/v1/secrets`, `/v1/transit`, `/v1/tokenization`, and `/v1/audit-logs` +- 
IP-based rate limiting applies to token issuance (`POST /v1/token`) +- Rate limiting does not apply to `/health`, `/ready`, and `/metrics` ## CORS and preflight failures @@ -215,6 +224,8 @@ If CORS is disabled or origin is not allowed, browser requests can fail even if Use these quick checks when startup errors suggest key mode mismatch: +> Command status: verified on 2026-02-20 + ```bash # 1) Check selected mode variables env | grep -E '^(KMS_PROVIDER|KMS_KEY_URI|ACTIVE_MASTER_KEY_ID|MASTER_KEYS)=' @@ -252,17 +263,17 @@ Expected patterns: Historical note: - This section is retained for mixed-version or rollback investigations involving pre-`v0.5.1` builds. -- For current rollouts, prioritize KMS mode diagnostics and the `v0.6.0` upgrade path. +- For current rollouts, prioritize KMS mode diagnostics and the `v0.7.0` upgrade path. - Symptom: startup succeeds, but key-dependent operations fail unexpectedly after a recent rollout - Likely cause: running a pre-`v0.5.1` build where decoded master key buffers could be zeroed too early - Mixed-version rollout symptom: some requests pass while others fail if old and new images are serving traffic together - Version fingerprint checks: - local binary: `./bin/app --version` - - pinned image check: `docker run --rm allisson/secrets:v0.6.0 --version` + - pinned image check: `docker run --rm allisson/secrets:v0.7.0 --version` - running containers: `docker ps --format 'table {{.Names}}\t{{.Image}}'` - Fix: - - upgrade all instances to `v0.6.0` (or at minimum `v0.5.1+`) + - upgrade all instances to `v0.7.0` (or at minimum `v0.5.1+`) - restart API instances after deploy - run key-dependent smoke checks (token issuance, secrets write/read, transit round-trip) - review [v0.5.1 release notes](../releases/v0.5.1.md) and @@ -291,7 +302,7 @@ Historical note: - Symptom: tokenization endpoints return `404`/`500` after upgrading to `v0.4.x` - Likely cause: tokenization migration (`000002_add_tokenization`) not applied or partially applied 
- Fix: - - run `./bin/app migrate` (or Docker `... allisson/secrets:v0.6.0 migrate`) + - run `./bin/app migrate` (or Docker `... allisson/secrets:v0.7.0 migrate`) - verify migration logs indicate `000002_add_tokenization` applied for your DB - confirm initial KEK exists (`create-kek` if missing) - re-run smoke flow for tokenization (`tokenize -> detokenize -> validate -> revoke`) @@ -344,3 +355,4 @@ Q: Why is wildcard `*` risky for normal service clients? - [Local development](local-development.md) - [Operator runbook index](../operations/runbook-index.md) - [Production operations](../operations/production.md) +- [Trusted proxy reference](../operations/trusted-proxy-reference.md) diff --git a/docs/metadata.json b/docs/metadata.json index f2bc261..1b3bf6c 100644 --- a/docs/metadata.json +++ b/docs/metadata.json @@ -1,5 +1,5 @@ { - "current_release": "v0.6.0", + "current_release": "v0.7.0", "api_version": "v1", - "last_docs_refresh": "2026-02-19" + "last_docs_refresh": "2026-02-20" } diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 7f6ca4f..a6a2f71 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -54,6 +54,8 @@ paths: $ref: "#/components/responses/Unauthorized" "422": $ref: "#/components/responses/ValidationError" + "429": + $ref: "#/components/responses/TooManyRequests" /v1/clients: get: tags: [clients] diff --git a/docs/operations/failure-playbooks.md b/docs/operations/failure-playbooks.md index 509db40..297d5f0 100644 --- a/docs/operations/failure-playbooks.md +++ b/docs/operations/failure-playbooks.md @@ -1,6 +1,6 @@ # πŸš‘ Failure Playbooks -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use this page for fast incident triage on common API failures. 
@@ -88,6 +88,8 @@ curl -s "http://localhost:8080/v1/audit-logs?limit=50&offset=0" \ ## See also +- [Incident decision tree](incident-decision-tree.md) +- [First 15 Minutes Playbook](first-15-minutes.md) - [Troubleshooting](../getting-started/troubleshooting.md) - [Policies cookbook](../api/policies.md) - [Policy smoke tests](policy-smoke-tests.md) diff --git a/docs/operations/first-15-minutes.md b/docs/operations/first-15-minutes.md new file mode 100644 index 0000000..1fff7aa --- /dev/null +++ b/docs/operations/first-15-minutes.md @@ -0,0 +1,65 @@ +# ⏱️ First 15 Minutes Playbook + +> Last updated: 2026-02-20 + +Use this for high-severity incidents where API availability or auth flows are degraded. + +## Minute 0-3: Establish Service State + +```bash +curl -i http://localhost:8080/health +curl -i http://localhost:8080/ready +``` + +Expected: + +- `GET /health` -> `200` +- `GET /ready` -> `200` + +## Minute 3-6: Validate Authentication Path + +```bash +curl -i -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"<client-id>","client_secret":"<client-secret>"}' +``` + +Expected: + +- Normal flow -> `201 Created` +- If throttled -> `429` with `Retry-After` + +## Minute 6-10: Validate Crypto Data Path + +```bash +TOKEN="<token>" + +curl -i -X POST http://localhost:8080/v1/secrets/incident/check \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"value":"aW5jaWRlbnQtY2hlY2s="}' + +curl -i -X GET http://localhost:8080/v1/secrets/incident/check \ + -H "Authorization: Bearer ${TOKEN}" +``` + +Expected: + +- write/read path succeeds + +## Minute 10-15: Decide Mitigation Path + +1. `401`-heavy: credential/token issue -> [Failure playbooks](failure-playbooks.md) +2. `403`-heavy: policy mismatch -> [Policy smoke tests](policy-smoke-tests.md) +3. `429` on `/v1/token`: IP throttling/proxy path -> [Token throttling runbook](production.md#10-token-endpoint-throttling-runbook) +4. 
`5xx`/readiness failures: dependency/runtime path -> [Production rollout rollback triggers](production-rollout.md#rollback-trigger-conditions) + +## Command status markers + +> Command status: verified on 2026-02-20 + +## See also + +- [Incident decision tree](incident-decision-tree.md) +- [Production rollout golden path](production-rollout.md) +- [Troubleshooting](../getting-started/troubleshooting.md) diff --git a/docs/operations/incident-decision-tree.md b/docs/operations/incident-decision-tree.md new file mode 100644 index 0000000..5d346b7 --- /dev/null +++ b/docs/operations/incident-decision-tree.md @@ -0,0 +1,62 @@ +# 🌲 Incident Decision Tree + +> Last updated: 2026-02-20 + +Use this page to route incidents quickly to the right runbook. + +## Start + +1. Is `GET /health` failing? + - Yes -> infrastructure/runtime path: [First 15 Minutes Playbook](first-15-minutes.md) + - No -> continue +2. Is `GET /ready` failing? + - Yes -> dependencies/migrations/key-load path: [Troubleshooting](../getting-started/troubleshooting.md) + - No -> continue +3. 
Identify dominant status code and route group: + - `401` -> [Failure playbooks: 401](failure-playbooks.md#401-spike-unauthorized) + - `403` -> [Failure playbooks: 403](failure-playbooks.md#403-spike-policycapability-mismatch) + - `429` on `/v1/token` -> [Token throttling runbook](production.md#10-token-endpoint-throttling-runbook) + - `429` on authenticated routes -> [API rate limiting](../api/rate-limiting.md) + - `422` -> [API error decision matrix](../api/error-decision-matrix.md) + - `5xx` -> [First 15 Minutes Playbook](first-15-minutes.md) + +## Fast Branches + +### `401 Unauthorized` + +- Re-issue token via `POST /v1/token` +- Confirm caller sends `Authorization: Bearer <token>` +- Check client active status and secret rotation history + +### `403 Forbidden` + +- Verify endpoint path shape and required capability +- Verify policy matching semantics (`*`, trailing `/*`, mid-path `*`) +- Re-issue token after policy fix + +### `429 Too Many Requests` + +- Read `Retry-After` header +- Separate `/v1/token` from authenticated-route throttling +- Validate proxy/source-IP behavior if `/v1/token` is impacted + +### `5xx` + +- Check database connectivity and pool saturation +- Check migration and key-load startup logs +- Use rollback triggers in production rollout runbook + +## Search Aliases + +- `retry-after` +- `rate limit exceeded` +- `token endpoint throttling` +- `unauthorized spike` +- `forbidden policy mismatch` + +## See also + +- [First 15 Minutes Playbook](first-15-minutes.md) +- [Failure playbooks](failure-playbooks.md) +- [Troubleshooting](../getting-started/troubleshooting.md) +- [Operator quick card](operator-quick-card.md) diff --git a/docs/operations/key-management.md b/docs/operations/key-management.md index 0ba53f1..863e648 100644 --- a/docs/operations/key-management.md +++ b/docs/operations/key-management.md @@ -1,6 +1,6 @@ # 🔑 Key Management Operations -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 This guide covers master keys and KEK 
lifecycle operations. @@ -20,7 +20,7 @@ Generate: Docker image equivalent: ```bash -docker run --rm allisson/secrets:v0.6.0 create-master-key --id prod-2026-01 +docker run --rm allisson/secrets:v0.7.0 create-master-key --id prod-2026-01 ``` Rotate master key: diff --git a/docs/operations/kms-migration-checklist.md b/docs/operations/kms-migration-checklist.md index 4fccb80..5f91534 100644 --- a/docs/operations/kms-migration-checklist.md +++ b/docs/operations/kms-migration-checklist.md @@ -1,12 +1,12 @@ # ✅ KMS Migration Checklist -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use this checklist for migrating from legacy plaintext master keys to KMS mode. ## 1) Precheck -- [ ] Confirm target release is `v0.6.0` or newer +- [ ] Confirm target release is `v0.7.0` or newer - [ ] Back up current environment configuration - [ ] Confirm rollback owner and change window - [ ] Confirm KMS provider credentials are available in runtime @@ -46,7 +46,7 @@ Reference: [Key management operations](key-management.md) - [ ] If rollback needed, revert app version first - [ ] Re-validate health and smoke checks after rollback -Reference: [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md#rollback-notes) +Reference: [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md#rollback-notes) ## See also diff --git a/docs/operations/kms-setup.md b/docs/operations/kms-setup.md index bef8af2..9250e00 100644 --- a/docs/operations/kms-setup.md +++ b/docs/operations/kms-setup.md @@ -1,6 +1,6 @@ # KMS Setup Guide -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 This guide covers setting up Key Management Service (KMS) integration for encrypting master keys at rest. KMS mode provides an additional security layer by ensuring master keys are never stored in plaintext. 
@@ -514,7 +514,7 @@ Docker Compose example: ```yaml services: secrets-api: - image: allisson/secrets:v0.6.0 + image: allisson/secrets:v0.7.0 env_file: - .env environment: @@ -536,7 +536,7 @@ spec: spec: containers: - name: app - image: allisson/secrets:v0.6.0 + image: allisson/secrets:v0.7.0 env: - name: KMS_PROVIDER value: gcpkms diff --git a/docs/operations/known-limitations.md b/docs/operations/known-limitations.md new file mode 100644 index 0000000..746c965 --- /dev/null +++ b/docs/operations/known-limitations.md @@ -0,0 +1,37 @@ +# ⚠️ Known Limitations + +> Last updated: 2026-02-20 + +This page documents practical limitations and tradeoffs operators should account for. + +## Rate limiting + +- Token endpoint rate limiting is per-IP; shared NAT/proxy egress can impact legitimate callers +- Header trust/proxy misconfiguration can skew caller IP behavior +- Application-level throttling complements but does not replace edge/WAF controls + +## Proxy and source-IP trust + +- If forwarded headers are over-trusted, source IP spoofing risk increases +- If trusted proxy chain is incomplete, all traffic may appear from one source + +## KMS startup model + +- KMS decryption happens at startup key-load time, not per-request +- Runtime KMS outages may not impact steady-state traffic immediately, but restart/redeploy can fail if KMS is unavailable + +## Operational cadence + +- Key rotation requires API restart/rolling restart to load new key material +- Cleanup routines (`clean-audit-logs`, `clean-expired-tokens`) are operator-driven + +## Documentation scope note + +- `docs/openapi.yaml` is a baseline subset, not exhaustive contract coverage for every workflow detail + +## See also + +- [Trusted proxy reference](trusted-proxy-reference.md) +- [Security hardening guide](security-hardening.md) +- [KMS setup guide](kms-setup.md) +- [Production deployment guide](production.md) diff --git a/docs/operations/monitoring.md b/docs/operations/monitoring.md index 49efa3e..cc87f58 
100644 --- a/docs/operations/monitoring.md +++ b/docs/operations/monitoring.md @@ -1,6 +1,6 @@ # 📊 Monitoring -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 This document describes the metrics instrumentation and monitoring capabilities in the Secrets application. @@ -302,11 +302,41 @@ sum(rate(secrets_http_requests_total[5m])) by (path) sum(rate(secrets_http_requests_total{status_code="403"}[5m])) by (path) ``` +**Token endpoint `429` ratio (5m):** + +```promql +sum(rate(secrets_http_requests_total{path="/v1/token",status_code="429"}[5m])) +/ +sum(rate(secrets_http_requests_total{path="/v1/token"}[5m])) +``` + +**Token endpoint request rate by status (5m):** + +```promql +sum(rate(secrets_http_requests_total{path="/v1/token"}[5m])) by (status_code) +``` + +**Token issuance success ratio (5m):** + +```promql +sum(rate(secrets_http_requests_total{path="/v1/token",status_code="201"}[5m])) +/ +sum(rate(secrets_http_requests_total{path="/v1/token"}[5m])) +``` + Rate-limit interpretation notes: - Stable low-volume `429` can be normal under bursty workloads - Rising `429` with rising latency usually indicates saturation or mis-tuned clients - Tune `RATE_LIMIT_REQUESTS_PER_SEC` and `RATE_LIMIT_BURST` only after retry behavior is verified +- For token issuance spikes, tune `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC` and `RATE_LIMIT_TOKEN_BURST` + +Token endpoint alert starters: + +| Signal | Warning | Critical | Interpretation | +| --- | --- | --- | --- | +| `/v1/token` `429` ratio (5m) | `> 0.05` for 10m | `> 0.20` for 10m | Shared egress saturation, attack traffic, or strict limits | +| `/v1/token` success ratio (5m) | `< 0.95` for 10m | `< 0.80` for 10m | Legitimate token issuance degradation | ### Tokenization-focused Queries @@ -440,6 +470,42 @@ Suggested escalation policy: description: "Critical crypto routes are being throttled above threshold" ``` +#### Token Endpoint 429 Ratio (Warning) + +```yaml +- alert: TokenEndpoint429RatioWarning + expr: | + ( + 
sum(rate(secrets_http_requests_total{path="/v1/token",status_code="429"}[10m])) + / + sum(rate(secrets_http_requests_total{path="/v1/token"}[10m])) + ) > 0.05 + for: 10m + labels: + severity: warning + annotations: + summary: "Token endpoint throttling elevated" + description: "More than 5% of /v1/token requests are returning 429" +``` + +#### Token Endpoint 429 Ratio (Critical) + +```yaml +- alert: TokenEndpoint429RatioCritical + expr: | + ( + sum(rate(secrets_http_requests_total{path="/v1/token",status_code="429"}[10m])) + / + sum(rate(secrets_http_requests_total{path="/v1/token"}[10m])) + ) > 0.20 + for: 10m + labels: + severity: critical + annotations: + summary: "Token endpoint throttling critical" + description: "More than 20% of /v1/token requests are returning 429" +``` + ## Disabling Metrics To disable metrics collection, set `METRICS_ENABLED=false` in your environment: diff --git a/docs/operations/operator-quick-card.md b/docs/operations/operator-quick-card.md new file mode 100644 index 0000000..25c7d98 --- /dev/null +++ b/docs/operations/operator-quick-card.md @@ -0,0 +1,83 @@ +# ⚑ Operator Quick Card + +> Last updated: 2026-02-20 + +Use this page during rollout and incidents when you need a fast, minimal checklist. + +## Rollout Preflight (5-minute check) + +1. Confirm target version and image tag match release plan +2. Confirm DB connectivity and migration window +3. Confirm key mode settings (`KMS_PROVIDER` + `KMS_KEY_URI` or legacy mode) +4. Confirm token and route rate-limit settings are intentional +5. Confirm rollback owner and communication channel + +Primary references: + +- [Production rollout golden path](production-rollout.md) +- [Release compatibility matrix](../releases/compatibility-matrix.md) +- [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) + +## Baseline Verification (before traffic cutover) + +1. `GET /health` returns `200` +2. `GET /ready` returns `200` +3. `POST /v1/token` returns `201` +4. Secrets write/read passes +5. 
Transit encrypt/decrypt passes + +Reference: + +- [Smoke test guide](../getting-started/smoke-test.md) + +## Fast Status Triage (`401` / `403` / `429`) + +1. `401`: re-check credentials/token issuance path +2. `403`: verify policy path and capability mapping +3. `429`: check `Retry-After`, then decide per-client vs token-IP tuning path + +References: + +- [API error decision matrix](../api/error-decision-matrix.md) +- [API rate limiting](../api/rate-limiting.md) +- [Monitoring](monitoring.md) + +## Token Endpoint `429` Quick Path + +1. Confirm `429` concentrated on `POST /v1/token` +2. Verify shared NAT/proxy egress is not collapsing many clients to one IP +3. Validate trusted proxy and forwarded header behavior +4. Apply temporary `RATE_LIMIT_TOKEN_*` tuning only if traffic is legitimate +5. Revert temporary tuning after stability + +References: + +- [Production token throttling runbook](production.md#10-token-endpoint-throttling-runbook) +- [Trusted proxy reference](trusted-proxy-reference.md) +- [Troubleshooting](../getting-started/troubleshooting.md) + +## Rollback Triggers + +- Sustained elevated `5xx` +- Widespread token/auth failures +- Unexpected data-integrity behavior +- Failed verification gates after rollout + +Reference: + +- [Rollback procedure](production-rollout.md#rollback-procedure-binaryimage) + +## Incident Notes Minimum + +Capture these before closing: + +- timeline (detection -> mitigation -> recovery) +- affected routes/clients +- config changes applied (`RATE_LIMIT_*`, `RATE_LIMIT_TOKEN_*`, policy updates) +- final mitigation and follow-up owner + +## See also + +- [Operator runbook index](runbook-index.md) +- [Production deployment guide](production.md) +- [Failure playbooks](failure-playbooks.md) diff --git a/docs/operations/production-rollout.md b/docs/operations/production-rollout.md index 328a0b8..da17340 100644 --- a/docs/operations/production-rollout.md +++ b/docs/operations/production-rollout.md @@ -1,12 +1,12 @@ # 🚀 Production 
Rollout Golden Path -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use this runbook for a standard production rollout with verification and rollback checkpoints. ## Scope -- Deploy target: Secrets `v0.6.0` +- Deploy target: Secrets `v0.7.0` - Database schema changes: run migrations before traffic cutover - Crypto bootstrap: ensure initial KEK exists for write/encrypt flows @@ -21,19 +21,21 @@ Use this runbook for a standard production rollout with verification and rollbac ## Copy/Paste Rollout Commands +> Command status: verified on 2026-02-20 + ```bash # 1) Pull target release -docker pull allisson/secrets:v0.6.0 +docker pull allisson/secrets:v0.7.0 # 2) Run migrations -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 migrate +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 migrate # 3) Bootstrap KEK only for first-time environment setup -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.6.0 create-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 create-kek --algorithm aes-gcm # 4) Start API docker run --rm --name secrets-api --network secrets-net --env-file .env -p 8080:8080 \ - allisson/secrets:v0.6.0 server + allisson/secrets:v0.7.0 server ``` ## Verification Gates @@ -81,8 +83,8 @@ Gate C (policy and observability): ## See also - [Production deployment guide](production.md) -- [v0.6.0 release notes](../releases/v0.6.0.md) -- [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md) +- [v0.7.0 release notes](../releases/v0.7.0.md) +- [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) - [KMS migration checklist](kms-migration-checklist.md) - [Release compatibility matrix](../releases/compatibility-matrix.md) - [Smoke test guide](../getting-started/smoke-test.md) diff --git a/docs/operations/production.md b/docs/operations/production.md index 7ff05ec..7ef4d0a 100644 --- a/docs/operations/production.md +++ 
b/docs/operations/production.md @@ -1,6 +1,6 @@ # 🏭 Production Deployment Guide -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 This guide covers baseline production hardening and operations for Secrets. @@ -17,6 +17,7 @@ This guide covers baseline production hardening and operations for Secrets. - [7) Incident Response Checklist](#7-incident-response-checklist) - [8) Go-Live Checklist](#8-go-live-checklist) - [9) Golden Path Rollout (Recommended)](#9-golden-path-rollout-recommended) +- [10) Token Endpoint Throttling Runbook](#10-token-endpoint-throttling-runbook) ## 1) TLS and Reverse Proxy @@ -58,6 +59,8 @@ Backup/restore checklist: Audit log retention routine (recommended monthly): +> Command status: verified on 2026-02-20 + ```bash # 1) Preview rows older than 90 days ./bin/app clean-audit-logs --days 90 --dry-run --format json @@ -68,6 +71,8 @@ Audit log retention routine (recommended monthly): Token retention routine (recommended monthly for tokenization workloads): +> Command status: verified on 2026-02-20 + ```bash # 1) Preview expired tokens older than 30 days ./bin/app clean-expired-tokens --days 30 --dry-run --format json @@ -165,7 +170,41 @@ Adjust retention to match your compliance and incident-response requirements. - Follow [Production rollout golden path](production-rollout.md) for step-by-step deployment, verification gates, and rollback triggers - Use [Release compatibility matrix](../releases/compatibility-matrix.md) before planning upgrades -- Keep [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md) attached to rollout change tickets +- Keep [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) attached to rollout change tickets + +## 10) Token Endpoint Throttling Runbook + +Use this when `POST /v1/token` shows sustained `429 Too Many Requests`. + +Triage steps: + +1. Confirm symptom and blast radius: + - verify `429` responses include `Retry-After` + - verify issue is concentrated on `/v1/token` or system-wide +2. 
Identify caller pattern: + - check `429` by route and token endpoint ratio in [Monitoring](monitoring.md) + - confirm whether affected clients share NAT/proxy egress IPs +3. Validate real client-IP handling: + - ensure reverse proxy forwards client IP headers correctly + - ensure trusted proxy settings prevent spoofed forwarded headers +4. Apply temporary tuning window (if legitimate traffic): + - increase `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC` and `RATE_LIMIT_TOKEN_BURST` + - roll changes with standard deployment controls +5. Verify recovery: + - token issuance success ratio normalizes + - no collateral increase in error rates for protected routes + +Trusted proxy guidance: + +- Validate forwarded-header trust and source-IP propagation using + [Trusted proxy reference](trusted-proxy-reference.md) + +Rollback of temporary tuning: + +1. Revert `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC` and `RATE_LIMIT_TOKEN_BURST` to baseline values +2. Roll configuration update +3. Re-check `/v1/token` `429` ratio and token issuance success ratio +4. Keep incident notes with final thresholds for future baseline reviews ## See also @@ -174,10 +213,11 @@ Adjust retention to match your compliance and incident-response requirements. 
- [Production rollout golden path](production-rollout.md) - [Operator runbook index](runbook-index.md) - [Monitoring](monitoring.md) +- [Trusted proxy reference](trusted-proxy-reference.md) - [Operator drills (quarterly)](operator-drills.md) - [Policy smoke tests](policy-smoke-tests.md) -- [v0.6.0 release notes](../releases/v0.6.0.md) -- [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md) +- [v0.7.0 release notes](../releases/v0.7.0.md) +- [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) - [KMS migration checklist](kms-migration-checklist.md) - [Release compatibility matrix](../releases/compatibility-matrix.md) - [Environment variables](../configuration/environment-variables.md) diff --git a/docs/operations/runbook-index.md b/docs/operations/runbook-index.md index 0778fdd..7f44e42 100644 --- a/docs/operations/runbook-index.md +++ b/docs/operations/runbook-index.md @@ -1,13 +1,16 @@ # 🧭 Operator Runbook Index -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use this page as the single entry point for rollout, validation, and incident runbooks. 
## Release and Rollout -- [v0.6.0 release notes](../releases/v0.6.0.md) -- [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md) +- [Operator quick card](operator-quick-card.md) +- [v0.7.0 release notes](../releases/v0.7.0.md) +- [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) +- [v0.6.0 release notes](../releases/v0.6.0.md) (historical) +- [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md) (historical) - [Release compatibility matrix](../releases/compatibility-matrix.md) - [Production rollout golden path](production-rollout.md) - [Production deployment guide](production.md) @@ -31,14 +34,18 @@ Use this page as the single entry point for rollout, validation, and incident ru ## Incident and Recovery +- [Incident decision tree](incident-decision-tree.md) +- [First 15 Minutes Playbook](first-15-minutes.md) - [Failure playbooks](failure-playbooks.md) - [Operator drills (quarterly)](operator-drills.md) - [Troubleshooting](../getting-started/troubleshooting.md) - [Key management operations](key-management.md) +- [Known limitations](known-limitations.md) ## Observability and Health - [Monitoring](monitoring.md) +- [Trusted proxy reference](trusted-proxy-reference.md) - [Smoke test guide](../getting-started/smoke-test.md) ## Suggested Operator Flow diff --git a/docs/operations/security-hardening.md b/docs/operations/security-hardening.md index 8f50eb0..5d8b184 100644 --- a/docs/operations/security-hardening.md +++ b/docs/operations/security-hardening.md @@ -1,6 +1,6 @@ # 🔒 Security Hardening Guide -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 This guide covers comprehensive security hardening for production deployments of Secrets. These measures are essential for protecting sensitive data and maintaining operational security. @@ -243,7 +243,7 @@ The `/metrics` endpoint exposes operational metrics that may contain sensitive i Rate limiting protects against abuse, brute force attacks, and denial of service. 
-### CORS Configuration +### Authenticated Endpoint Configuration ```dotenv # Enable rate limiting (default: true) @@ -273,12 +273,39 @@ RATE_LIMIT_BURST=20 ### Excluded Endpoints -Rate limiting does **not** apply to: +Authenticated per-client rate limiting does **not** apply to: - `/health` - Health checks - `/ready` - Readiness probes - `/metrics` - Metrics collection -- `/v1/token` - Token issuance (pre-authentication) + +### Token Endpoint Configuration (IP-based) + +```dotenv +# Enable token endpoint rate limiting (default: true) +RATE_LIMIT_TOKEN_ENABLED=true + +# Requests per second per IP for POST /v1/token (default: 5.0) +RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 + +# Burst capacity per IP (default: 10) +RATE_LIMIT_TOKEN_BURST=10 +``` + +### Token Endpoint Notes + +- **Scope:** Per-client-IP for unauthenticated `POST /v1/token` +- **Purpose:** Mitigate credential stuffing and brute-force token issuance attempts +- **Response:** HTTP `429` with `Retry-After` when exceeded +- **Operational caveat:** Shared NAT/proxy egress can require tuning `RATE_LIMIT_TOKEN_*` + +### Trusted Proxy and IP Forwarding Safety + +- Configure trusted proxies explicitly in production; do not trust arbitrary forwarded headers +- Ensure your edge proxy/load balancer sets client IP headers consistently +- If trusted proxy settings are incorrect, all token requests can appear from one IP and trigger false `429` +- If headers are over-trusted, attackers can spoof forwarded IPs to evade per-IP controls +- Use [Trusted proxy reference](trusted-proxy-reference.md) for validation workflow and platform notes ### Tuning Guidance diff --git a/docs/operations/trusted-proxy-reference.md b/docs/operations/trusted-proxy-reference.md new file mode 100644 index 0000000..611267c --- /dev/null +++ b/docs/operations/trusted-proxy-reference.md @@ -0,0 +1,67 @@ +# 🌐 Trusted Proxy Reference + +> Last updated: 2026-02-20 + +Use this guide to validate source-IP forwarding for security controls that depend on 
caller IP +(for example token endpoint per-IP rate limiting on `POST /v1/token`). + +## Why this matters + +- If proxy trust is too broad, attackers may spoof `X-Forwarded-For` +- If proxy trust is too narrow/incorrect, many clients can collapse into one apparent IP +- Both cases can invalidate per-IP rate-limiting behavior + +## Validation checklist + +1. Only trusted edge proxies can set forwarded client-IP headers +2. Untrusted internet clients cannot inject arbitrary `X-Forwarded-For` +3. App-observed `client_ip` matches edge-proxy access logs for sampled requests +4. Multi-hop proxy behavior (if any) is documented and tested + +## Nginx baseline forwarding + +```nginx +location / { + proxy_pass http://127.0.0.1:8080; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; +} +``` + +Hardening notes: + +- Do not accept forwarded headers directly from public clients +- Ensure only your reverse-proxy tier can reach application port `8080` + +## AWS ALB / ELB notes + +- ALB injects `X-Forwarded-For`; keep app reachable only from ALB/security group path +- Validate that downstream proxies preserve rather than overwrite trusted header chain +- Sample and compare ALB access logs with app `client_ip` logs + +## Cloudflare / CDN edge notes + +- Prefer single trusted edge path to origin +- If using CDN-specific client IP headers, keep mapping and validation documented +- Reject direct origin traffic from non-edge sources where possible + +## Diagnostic quick test + +1. Send a test request through edge proxy +2. Capture edge log source IP +3. Capture app log `client_ip` and request ID +4. 
Confirm both values refer to the same caller context + +## Common failure patterns + +- **All token requests share one IP:** likely NAT/proxy collapse or missing forwarded IP propagation +- **Frequent token `429` after proxy changes:** trust chain or source-IP extraction behavior drifted +- **Suspiciously diverse token caller IPs from one source:** potential forwarded-header spoofing + +## See also + +- [Security hardening guide](security-hardening.md) +- [Production deployment guide](production.md) +- [Troubleshooting](../getting-started/troubleshooting.md) diff --git a/docs/personas/developer.md b/docs/personas/developer.md new file mode 100644 index 0000000..01251e3 --- /dev/null +++ b/docs/personas/developer.md @@ -0,0 +1,19 @@ +# 👨‍💻 Developer Persona Path + +> Last updated: 2026-02-20 + +Use this path when your goal is API integration and feature delivery with docs parity. + +## Primary path + +1. [Day 0 Developer Walkthrough](../getting-started/day-0-developer.md) +2. [Authentication API](../api/authentication.md) +3. [Error decision matrix](../api/error-decision-matrix.md) +4. [Rate limiting](../api/rate-limiting.md) +5. [Versioned examples](../examples/versioned-by-release.md) + +## Deep links + +- [Capability matrix](../api/capability-matrix.md) +- [Policies cookbook](../api/policies.md) +- [Docs release checklist](../development/docs-release-checklist.md) diff --git a/docs/personas/operator.md b/docs/personas/operator.md new file mode 100644 index 0000000..b7ffba6 --- /dev/null +++ b/docs/personas/operator.md @@ -0,0 +1,19 @@ +# 👷 Operator Persona Path + +> Last updated: 2026-02-20 + +Use this path when your goal is reliable deployment and fast incident response. + +## Primary path + +1. [Day 0 Operator Walkthrough](../getting-started/day-0-operator.md) +2. [Production rollout golden path](../operations/production-rollout.md) +3. [Operator quick card](../operations/operator-quick-card.md) +4. 
[Incident decision tree](../operations/incident-decision-tree.md) +5. [First 15 Minutes Playbook](../operations/first-15-minutes.md) + +## Deep links + +- [Monitoring](../operations/monitoring.md) +- [Failure playbooks](../operations/failure-playbooks.md) +- [Known limitations](../operations/known-limitations.md) diff --git a/docs/personas/security.md b/docs/personas/security.md new file mode 100644 index 0000000..efe7bbd --- /dev/null +++ b/docs/personas/security.md @@ -0,0 +1,19 @@ +# πŸ›‘οΈ Security Persona Path + +> Last updated: 2026-02-20 + +Use this path when your goal is threat reduction, hardening, and auditability. + +## Primary path + +1. [Security hardening guide](../operations/security-hardening.md) +2. [Trusted proxy reference](../operations/trusted-proxy-reference.md) +3. [Known limitations](../operations/known-limitations.md) +4. [Production deployment guide](../operations/production.md) +5. [Monitoring](../operations/monitoring.md) + +## Deep links + +- [Security model](../concepts/security-model.md) +- [KMS setup guide](../operations/kms-setup.md) +- [KMS migration checklist](../operations/kms-migration-checklist.md) diff --git a/docs/releases/_template.md b/docs/releases/_template.md new file mode 100644 index 0000000..af7e783 --- /dev/null +++ b/docs/releases/_template.md @@ -0,0 +1,47 @@ +# πŸš€ Secrets vX.Y.Z Release Notes + +> Release date: YYYY-MM-DD + +Brief summary of why this release matters for operators and integrators. + +## Highlights + +- Highlight 1 +- Highlight 2 +- Highlight 3 + +## Runtime Changes + +- New/changed env vars: + - `EXAMPLE_VAR` (default `...`) +- Endpoint behavior changes (status/contract/defaults) +- Performance or operational behavior updates + +## Breaking / Behavior Changes + +- Behavior/default changes requiring operator action +- Compatibility notes for older clients or deployments + +## Upgrade Notes + +1. Deploy binaries/images with `vX.Y.Z` +2. Apply config/env changes +3. Run verification checks +4. 
Monitor rollout metrics/logs + +## Operator Verification Checklist + +1. `GET /health` and `GET /ready` pass +2. Authentication/token issuance works +3. Key-dependent flows pass (secrets/transit) +4. New/changed feature behavior validated + +## Documentation Updates + +- Added/updated docs pages for this release +- Runbook changes relevant to operators + +## See also + +- [Upgrade guide template](_upgrade-template.md) +- [Release compatibility matrix](compatibility-matrix.md) diff --git a/docs/releases/_upgrade-template.md b/docs/releases/_upgrade-template.md new file mode 100644 index 0000000..c93e929 --- /dev/null +++ b/docs/releases/_upgrade-template.md @@ -0,0 +1,56 @@ +# ⬆️ Upgrade Guide: vA.B.C -> vX.Y.Z + +> Release date: YYYY-MM-DD + +Use this guide to safely upgrade from `vA.B.C` to `vX.Y.Z`. + +## Scope + +- Release type: patch/minor/major +- API compatibility: compatible/incompatible notes +- Database migration: required/optional/none + +## What Changed + +- Change 1 +- Change 2 +- Change 3 + +## Env Diff (copy/paste) + +```diff ++ NEW_VAR=value +- OLD_VAR=old-value +``` + +## Recommended Upgrade Steps + +1. Update image/binary to `vX.Y.Z` +2. Apply env/config changes +3. Restart/roll instances +4. Run health checks +5. 
Run functional smoke checks + +## Quick Verification Commands + +```bash +curl -sS http://localhost:8080/health +curl -sS http://localhost:8080/ready +``` + +## Rollback Notes + +- Revert to previous stable version first +- Keep non-destructive config rollback path documented +- Re-run validation after rollback + +### Rollback matrix + +| Upgrade path | First rollback action | Config rollback | Validation | +| --- | --- | --- | --- | +| `vA.B.C -> vX.Y.Z` | Roll app image/binary back | Revert/ignore release-specific config additions | Health + smoke checks | + +## See also + +- [Release notes template](_template.md) +- [Release compatibility matrix](compatibility-matrix.md) diff --git a/docs/releases/compatibility-matrix.md b/docs/releases/compatibility-matrix.md index 2b91285..9c23d7b 100644 --- a/docs/releases/compatibility-matrix.md +++ b/docs/releases/compatibility-matrix.md @@ -1,6 +1,6 @@ # πŸ” Release Compatibility Matrix -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use this page to understand upgrade impact between recent releases. @@ -8,6 +8,7 @@ Use this page to understand upgrade impact between recent releases. 
| From -> To | Schema migration impact | Runtime/default changes | Required operator action | | --- | --- | --- | --- | +| `v0.6.0 -> v0.7.0` | No new mandatory migration | Added IP-based token endpoint rate limiting (`RATE_LIMIT_TOKEN_ENABLED`, `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC`, `RATE_LIMIT_TOKEN_BURST`), token endpoint may return `429` with `Retry-After` | Add and tune `RATE_LIMIT_TOKEN_*`, validate token issuance under normal and burst load, review trusted proxy/IP behavior | | `v0.5.1 -> v0.6.0` | No new mandatory migration | Added KMS-based master key support (`KMS_PROVIDER`, `KMS_KEY_URI`), new `rotate-master-key` CLI workflow | Decide KMS vs legacy mode, validate startup key loading, run key-dependent smoke checks | | `v0.5.0 -> v0.5.1` | No new mandatory migration | Master key memory handling bugfix and teardown zeroing hardening | Deploy `v0.5.1` and verify key-dependent flows (token, secrets, transit) | | `v0.4.x -> v0.5.1` | No new destructive schema migration required for core features | Token TTL default `24h -> 4h`; rate limiting enabled by default; CORS config introduced (disabled by default); includes `v0.5.1` master key memory handling hardening | Set explicit `AUTH_TOKEN_EXPIRATION_SECONDS`, review `RATE_LIMIT_*`, configure `CORS_*` only if browser access is required, then run key-dependent smoke checks | @@ -16,6 +17,13 @@ Use this page to understand upgrade impact between recent releases. ## Upgrade verification by target +For `v0.7.0`: + +1. `GET /health` and `GET /ready` pass +2. `POST /v1/token` issues tokens at normal traffic levels +3. Controlled token burst returns `429` with `Retry-After` +4. Secrets and transit round-trip flows succeed + For `v0.6.0`: 1. 
`GET /health` and `GET /ready` pass @@ -44,6 +52,8 @@ For `v0.5.0`: ## See also +- [v0.7.0 release notes](v0.7.0.md) +- [v0.7.0 upgrade guide](v0.7.0-upgrade.md) - [v0.6.0 release notes](v0.6.0.md) - [v0.6.0 upgrade guide](v0.6.0-upgrade.md) - [v0.5.1 release notes](v0.5.1.md) diff --git a/docs/releases/v0.7.0-upgrade.md b/docs/releases/v0.7.0-upgrade.md new file mode 100644 index 0000000..80a01c7 --- /dev/null +++ b/docs/releases/v0.7.0-upgrade.md @@ -0,0 +1,87 @@ +# ⬆️ Upgrade Guide: v0.6.0 -> v0.7.0 + +> Release date: 2026-02-20 + +Use this guide to safely upgrade from `v0.6.0` to `v0.7.0`. + +## Scope + +- Release type: minor (`v0.7.0`) +- API compatibility: no `v1` endpoint contract break +- Database migration: no new mandatory migration for this release + +## What Changed + +- Added IP-based token endpoint rate limiting for `POST /v1/token` +- Added new token endpoint throttling configuration (`RATE_LIMIT_TOKEN_*`) +- Token issuance can now return `429 Too Many Requests` with `Retry-After` + +## Env Diff (copy/paste) + +```diff ++ RATE_LIMIT_TOKEN_ENABLED=true ++ RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 ++ RATE_LIMIT_TOKEN_BURST=10 +``` + +## Recommended Upgrade Steps + +1. Update image/binary to `v0.7.0` +2. Add `RATE_LIMIT_TOKEN_*` variables to runtime configuration +3. Restart API instances with standard rolling rollout process +4. Run baseline checks: + - `GET /health` + - `GET /ready` +5. 
Run token and key-dependent checks: + - `POST /v1/token` + - Secrets write/read + - Transit encrypt/decrypt round-trip + +## Quick Verification Commands + +```bash +curl -sS http://localhost:8080/health +curl -sS http://localhost:8080/ready + +TOKEN_RESPONSE="$(curl -sS -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"","client_secret":""}')" + +CLIENT_TOKEN="$(printf '%s' "${TOKEN_RESPONSE}" | jq -r '.token')" + +curl -sS -X POST http://localhost:8080/v1/secrets/upgrade/v070 \ + -H "Authorization: Bearer ${CLIENT_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"value":"djA3MC1zbW9rZQ=="}' + +curl -sS -X GET http://localhost:8080/v1/secrets/upgrade/v070 \ + -H "Authorization: Bearer ${CLIENT_TOKEN}" +``` + +## Optional: Token Endpoint Tuning Guidance + +- If legitimate clients share NAT/proxy egress and hit token endpoint `429`, increase: + - `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC` + - `RATE_LIMIT_TOKEN_BURST` +- Keep limits conservative enough to deter credential stuffing +- Validate trusted proxy configuration so `ClientIP` reflects real caller IPs + +## Rollback Notes + +- If rollback is required, revert API instances to the previous stable image +- Revert app version first; avoid destructive key/data rollback actions without a validated plan +- Re-run health and smoke checks after rollback + +### Rollback matrix + +| Upgrade path | First rollback action | Configuration rollback | Validation | +| --- | --- | --- | --- | +| `v0.6.0 -> v0.7.0` | Roll app image/binary back to previous stable version | Remove or ignore `RATE_LIMIT_TOKEN_*` additions as needed; keep existing crypto/KMS config unchanged | `GET /health`, `GET /ready`, token issuance, and secrets/transit smoke checks | + +## See also + +- [v0.7.0 release notes](v0.7.0.md) +- [Release compatibility matrix](compatibility-matrix.md) +- [API rate limiting](../api/rate-limiting.md) +- [Environment variables](../configuration/environment-variables.md) 
+- [Production rollout golden path](../operations/production-rollout.md) diff --git a/docs/releases/v0.7.0.md b/docs/releases/v0.7.0.md new file mode 100644 index 0000000..8770131 --- /dev/null +++ b/docs/releases/v0.7.0.md @@ -0,0 +1,57 @@ +# πŸš€ Secrets v0.7.0 Release Notes + +> Release date: 2026-02-20 + +This minor release adds dedicated IP-based rate limiting for unauthenticated token issuance, +expands configuration controls for token endpoint throttling, and updates operator guidance for +credential-stuffing and brute-force protection. + +## Highlights + +- Added IP-based rate limiting for `POST /v1/token` +- Added token endpoint rate-limit configuration via `RATE_LIMIT_TOKEN_*` variables +- Added token endpoint `429 Too Many Requests` behavior with `Retry-After` +- Expanded docs and runbooks for token endpoint abuse protection and rollout validation + +## Runtime Changes + +- New environment variables: + - `RATE_LIMIT_TOKEN_ENABLED` (default `true`) + - `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC` (default `5.0`) + - `RATE_LIMIT_TOKEN_BURST` (default `10`) +- `POST /v1/token` may now return `429 Too Many Requests` when per-IP token limits are exceeded +- Authenticated per-client rate limiting (`RATE_LIMIT_*`) remains unchanged + +## Security and Operations Impact + +- Improves protection against token endpoint credential stuffing and brute-force traffic +- Applies stricter defaults on unauthenticated token issuance than authenticated API routes +- Requires review of proxy/trusted-IP setup when using forwarded headers in production + +## Upgrade Notes + +1. Deploy binaries/images with `v0.7.0` +2. Review and tune `RATE_LIMIT_TOKEN_*` to match expected login/token traffic +3. Validate token issuance flow under normal and burst traffic +4. Confirm `429` + `Retry-After` behavior for token endpoint in controlled load tests + +## Operator Verification Checklist + +1. Confirm `GET /health` and `GET /ready` succeed +2. 
Confirm `POST /v1/token` issues tokens normally for expected request rates +3. Confirm token endpoint returns controlled `429` with `Retry-After` when intentionally exceeded +4. Confirm authenticated route limits and retry behavior still match policy + +## Documentation Updates + +- Added [v0.7.0 upgrade guide](v0.7.0-upgrade.md) +- Updated [API rate limiting](../api/rate-limiting.md) with token endpoint scope +- Updated [Environment variables](../configuration/environment-variables.md) with `RATE_LIMIT_TOKEN_*` +- Updated [Troubleshooting](../getting-started/troubleshooting.md) with token endpoint `429` diagnostics + +## See also + +- [v0.7.0 upgrade guide](v0.7.0-upgrade.md) +- [Release compatibility matrix](compatibility-matrix.md) +- [API rate limiting](../api/rate-limiting.md) +- [Production rollout golden path](../operations/production-rollout.md) diff --git a/docs/tools/check_release_image_tags.py b/docs/tools/check_release_image_tags.py new file mode 100644 index 0000000..bed584a --- /dev/null +++ b/docs/tools/check_release_image_tags.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +import json +import re +from pathlib import Path + + +PINNED_IMAGE_PATTERN = re.compile(r"allisson/secrets:v\d+\.\d+\.\d+") + + +def main() -> None: + metadata = json.loads(Path("docs/metadata.json").read_text(encoding="utf-8")) + current_release = metadata["current_release"] + current_tag = f"allisson/secrets:{current_release}" + + files_to_check = [ + Path("README.md"), + Path("docs/getting-started/docker.md"), + Path("docs/operations/production-rollout.md"), + Path("docs/cli/commands.md"), + Path("docs/configuration/environment-variables.md"), + Path("docs/operations/key-management.md"), + Path("docs/operations/kms-setup.md"), + ] + + errors = [] + + for file_path in files_to_check: + if not file_path.exists(): + errors.append(f"missing required docs file: {file_path}") + continue + + content = file_path.read_text(encoding="utf-8") + tags = PINNED_IMAGE_PATTERN.findall(content) 
+ + if not tags: + errors.append(f"{file_path} must include pinned image tag {current_tag}") + continue + + mismatched = sorted({tag for tag in tags if tag != current_tag}) + if mismatched: + errors.append( + f"{file_path} contains non-current pinned tags: " + + ", ".join(mismatched) + + f" (expected only {current_tag})" + ) + + if errors: + raise ValueError( + "Release image tag consistency check failed: " + " | ".join(errors) + ) + + print("release image tag checks passed") + + +if __name__ == "__main__": + main() diff --git a/internal/auth/http/token_rate_limit_middleware.go b/internal/auth/http/token_rate_limit_middleware.go new file mode 100644 index 0000000..b7090c7 --- /dev/null +++ b/internal/auth/http/token_rate_limit_middleware.go @@ -0,0 +1,138 @@ +// Package http provides HTTP middleware and utilities for authentication. +package http + +import ( + "context" + "fmt" + "log/slog" + "net/http" + "sync" + "time" + + "github.com/gin-gonic/gin" + "golang.org/x/time/rate" +) + +// tokenRateLimiterStore holds per-IP rate limiters with automatic cleanup. +type tokenRateLimiterStore struct { + limiters sync.Map // map[string]*tokenRateLimiterEntry (IP -> limiter) + rps float64 + burst int +} + +// tokenRateLimiterEntry holds a rate limiter and last access time for cleanup. +type tokenRateLimiterEntry struct { + limiter *rate.Limiter + lastAccess time.Time + mu sync.Mutex +} + +// TokenRateLimitMiddleware enforces per-IP rate limiting on token issuance endpoint. +// +// Designed for unauthenticated endpoints to prevent credential stuffing and brute force +// attacks. Uses token bucket algorithm via golang.org/x/time/rate. Each IP address gets +// an independent rate limiter. 
+// +// Uses c.ClientIP() which automatically handles: +// - X-Forwarded-For header (takes first IP) +// - X-Real-IP header +// - Direct connection remote address +// +// Configuration: +// - rps: Requests per second allowed per IP address +// - burst: Maximum burst capacity for temporary spikes +// +// Returns: +// - 429 Too Many Requests: Rate limit exceeded (includes Retry-After header) +// - Continues: Request allowed within rate limit +func TokenRateLimitMiddleware(rps float64, burst int, logger *slog.Logger) gin.HandlerFunc { + store := &tokenRateLimiterStore{ + rps: rps, + burst: burst, + } + + // Start cleanup goroutine for stale limiters (every 5 minutes) + go store.cleanupStale(context.Background(), 5*time.Minute) + + return func(c *gin.Context) { + // Get client IP address + clientIP := c.ClientIP() + + // Get or create rate limiter for this IP + limiter := store.getLimiter(clientIP) + + // Check if request is allowed + if !limiter.Allow() { + // Calculate retry-after delay + reservation := limiter.Reserve() + retryAfter := int(reservation.Delay().Seconds()) + reservation.Cancel() // Cancel the reservation + + logger.Debug("token rate limit exceeded", + slog.String("client_ip", clientIP), + slog.Int("retry_after", retryAfter)) + + c.Header("Retry-After", fmt.Sprintf("%d", retryAfter)) + c.JSON(http.StatusTooManyRequests, gin.H{ + "error": "rate_limit_exceeded", + "message": "Too many token requests from this IP. Please retry after the specified delay.", + }) + c.Abort() + return + } + + // Request allowed, continue + c.Next() + } +} + +// getLimiter retrieves or creates a rate limiter for an IP address. 
+func (s *tokenRateLimiterStore) getLimiter(ip string) *rate.Limiter { + // Try to load existing limiter + if val, ok := s.limiters.Load(ip); ok { + entry := val.(*tokenRateLimiterEntry) + entry.mu.Lock() + entry.lastAccess = time.Now() + entry.mu.Unlock() + return entry.limiter + } + + // Create new limiter + limiter := rate.NewLimiter(rate.Limit(s.rps), s.burst) + entry := &tokenRateLimiterEntry{ + limiter: limiter, + lastAccess: time.Now(), + } + + // Store and return + s.limiters.Store(ip, entry) + return limiter +} + +// cleanupStale removes rate limiters that haven't been accessed recently. +// Runs periodically to prevent unbounded memory growth from IP address churn. +func (s *tokenRateLimiterStore) cleanupStale(ctx context.Context, interval time.Duration) { + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + // Remove limiters not accessed in last hour + threshold := time.Now().Add(-1 * time.Hour) + s.limiters.Range(func(key, value interface{}) bool { + entry := value.(*tokenRateLimiterEntry) + entry.mu.Lock() + shouldDelete := entry.lastAccess.Before(threshold) + entry.mu.Unlock() + + if shouldDelete { + s.limiters.Delete(key) + } + return true + }) + } + } +} diff --git a/internal/auth/http/token_rate_limit_middleware_test.go b/internal/auth/http/token_rate_limit_middleware_test.go new file mode 100644 index 0000000..5fe9b7d --- /dev/null +++ b/internal/auth/http/token_rate_limit_middleware_test.go @@ -0,0 +1,319 @@ +package http + +import ( + "log/slog" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/gin-gonic/gin" + "github.com/stretchr/testify/assert" +) + +func TestTokenRateLimitMiddleware_AllowsRequestsWithinLimit(t *testing.T) { + gin.SetMode(gin.TestMode) + + // Create middleware with generous limits + logger := slog.Default() + middleware := TokenRateLimitMiddleware(10.0, 20, logger) + + // Create test router + router := gin.New() + 
router.Use(middleware) + router.POST("/token", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"status": "ok"}) + }) + + // Send requests within limit from same IP + for i := 0; i < 5; i++ { + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/token", nil) + router.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + } +} + +func TestTokenRateLimitMiddleware_BlocksRequestsExceedingLimit(t *testing.T) { + gin.SetMode(gin.TestMode) + + // Create middleware with very low limits + logger := slog.Default() + middleware := TokenRateLimitMiddleware(1.0, 2, logger) + + // Create test router + router := gin.New() + router.Use(middleware) + router.POST("/token", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"status": "ok"}) + }) + + // Send requests up to burst capacity (should succeed) + for i := 0; i < 2; i++ { + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/token", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) + } + + // Next request should be rate limited + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/token", nil) + router.ServeHTTP(w, req) + + assert.Equal(t, http.StatusTooManyRequests, w.Code) + assert.NotEmpty(t, w.Header().Get("Retry-After")) +} + +func TestTokenRateLimitMiddleware_Returns429WithRetryAfterHeader(t *testing.T) { + gin.SetMode(gin.TestMode) + + logger := slog.Default() + middleware := TokenRateLimitMiddleware(0.5, 1, logger) + + router := gin.New() + router.Use(middleware) + router.POST("/token", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"status": "ok"}) + }) + + // Consume burst + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/token", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) + + // Next request should be rate limited with Retry-After header + w = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodPost, "/token", nil) + 
router.ServeHTTP(w, req) + + assert.Equal(t, http.StatusTooManyRequests, w.Code) + assert.NotEmpty(t, w.Header().Get("Retry-After")) + + // Verify error message + assert.Contains(t, w.Body.String(), "rate_limit_exceeded") + assert.Contains(t, w.Body.String(), "Too many token requests from this IP") +} + +func TestTokenRateLimitMiddleware_IndependentLimitsPerIP(t *testing.T) { + gin.SetMode(gin.TestMode) + + logger := slog.Default() + middleware := TokenRateLimitMiddleware(1.0, 1, logger) + + router := gin.New() + router.Use(middleware) + router.POST("/token", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"status": "ok"}) + }) + + // IP 1 consumes its limit + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/token", nil) + req.RemoteAddr = "192.168.1.100:12345" + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) + + // IP 1 is now rate limited + w = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodPost, "/token", nil) + req.RemoteAddr = "192.168.1.100:12346" // Different port, same IP + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusTooManyRequests, w.Code) + + // IP 2 should still have its own independent limit + w = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodPost, "/token", nil) + req.RemoteAddr = "192.168.1.101:12345" // Different IP + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) +} + +func TestTokenRateLimitMiddleware_BurstCapacityWorks(t *testing.T) { + gin.SetMode(gin.TestMode) + + logger := slog.Default() + // Low rate but higher burst + middleware := TokenRateLimitMiddleware(1.0, 5, logger) + + router := gin.New() + router.Use(middleware) + router.POST("/token", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"status": "ok"}) + }) + + // Should be able to burst up to 5 requests + for i := 0; i < 5; i++ { + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/token", nil) + router.ServeHTTP(w, req) + assert.Equal(t, 
http.StatusOK, w.Code, "Request %d should succeed", i+1) + } + + // 6th request should be rate limited + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/token", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusTooManyRequests, w.Code) +} + +func TestTokenRateLimitMiddleware_NoAuthenticationRequired(t *testing.T) { + gin.SetMode(gin.TestMode) + + logger := slog.Default() + middleware := TokenRateLimitMiddleware(10.0, 20, logger) + + router := gin.New() + router.Use(middleware) + router.POST("/token", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"status": "ok"}) + }) + + // Request without any authentication context should work + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/token", nil) + router.ServeHTTP(w, req) + + // Should succeed (no authentication required) + assert.Equal(t, http.StatusOK, w.Code) +} + +func TestTokenRateLimiterStore_CleanupStaleEntries(t *testing.T) { + store := &tokenRateLimiterStore{ + rps: 10.0, + burst: 20, + } + + // Create a limiter entry + ip1 := "192.168.1.100" + limiter1 := store.getLimiter(ip1) + assert.NotNil(t, limiter1) + + // Verify it's stored + _, ok := store.limiters.Load(ip1) + assert.True(t, ok) + + // Manually set last access to old time + if val, ok := store.limiters.Load(ip1); ok { + entry := val.(*tokenRateLimiterEntry) + entry.mu.Lock() + entry.lastAccess = time.Now().Add(-2 * time.Hour) + entry.mu.Unlock() + } + + // Run cleanup manually + threshold := time.Now().Add(-1 * time.Hour) + store.limiters.Range(func(key, value interface{}) bool { + entry := value.(*tokenRateLimiterEntry) + entry.mu.Lock() + shouldDelete := entry.lastAccess.Before(threshold) + entry.mu.Unlock() + + if shouldDelete { + store.limiters.Delete(key) + } + return true + }) + + // Verify entry was cleaned up + _, ok = store.limiters.Load(ip1) + assert.False(t, ok) +} + +func TestTokenRateLimitMiddleware_HandlesXForwardedFor(t *testing.T) { + gin.SetMode(gin.TestMode) + + 
logger := slog.Default() + middleware := TokenRateLimitMiddleware(1.0, 1, logger) + + router := gin.New() + router.Use(middleware) + router.POST("/token", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"status": "ok"}) + }) + + // First request with X-Forwarded-For header + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/token", nil) + req.Header.Set("X-Forwarded-For", "203.0.113.1") + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) + + // Second request from same IP in X-Forwarded-For should be rate limited + w = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodPost, "/token", nil) + req.Header.Set("X-Forwarded-For", "203.0.113.1") + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusTooManyRequests, w.Code) + + // Request from different IP in X-Forwarded-For should succeed + w = httptest.NewRecorder() + req = httptest.NewRequest(http.MethodPost, "/token", nil) + req.Header.Set("X-Forwarded-For", "203.0.113.2") + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) +} + +func TestTokenRateLimitMiddleware_RespectsConfiguredLimits(t *testing.T) { + gin.SetMode(gin.TestMode) + + tests := []struct { + name string + rps float64 + burst int + requestsToSend int + expectedSuccesses int + }{ + { + name: "Conservative limits", + rps: 3.0, + burst: 5, + requestsToSend: 10, + expectedSuccesses: 5, + }, + { + name: "Moderate limits", + rps: 5.0, + burst: 10, + requestsToSend: 15, + expectedSuccesses: 10, + }, + { + name: "Permissive limits", + rps: 10.0, + burst: 20, + requestsToSend: 25, + expectedSuccesses: 20, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + logger := slog.Default() + middleware := TokenRateLimitMiddleware(tt.rps, tt.burst, logger) + + router := gin.New() + router.Use(middleware) + router.POST("/token", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"status": "ok"}) + }) + + successes := 0 + for i := 0; i < tt.requestsToSend; i++ { + w := 
httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/token", nil) + req.RemoteAddr = "192.168.1.50:12345" // Use unique IP for this test + router.ServeHTTP(w, req) + + if w.Code == http.StatusOK { + successes++ + } + } + + assert.Equal(t, tt.expectedSuccesses, successes, + "Expected %d successes but got %d", tt.expectedSuccesses, successes) + }) + } +} diff --git a/internal/config/config.go b/internal/config/config.go index e8a7be3..350239f 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -29,11 +29,16 @@ type Config struct { // Auth AuthTokenExpiration time.Duration - // Rate Limiting + // Rate Limiting (authenticated endpoints) RateLimitEnabled bool RateLimitRequestsPerSec float64 RateLimitBurst int + // Rate Limiting for Token Endpoint (IP-based, unauthenticated) + RateLimitTokenEnabled bool + RateLimitTokenRequestsPerSec float64 + RateLimitTokenBurst int + // CORS CORSEnabled bool CORSAllowOrigins string @@ -73,11 +78,16 @@ func Load() *Config { // Auth AuthTokenExpiration: env.GetDuration("AUTH_TOKEN_EXPIRATION_SECONDS", 14400, time.Second), - // Rate Limiting + // Rate Limiting (authenticated endpoints) RateLimitEnabled: env.GetBool("RATE_LIMIT_ENABLED", true), RateLimitRequestsPerSec: env.GetFloat64("RATE_LIMIT_REQUESTS_PER_SEC", 10.0), RateLimitBurst: env.GetInt("RATE_LIMIT_BURST", 20), + // Rate Limiting for Token Endpoint (IP-based, unauthenticated) + RateLimitTokenEnabled: env.GetBool("RATE_LIMIT_TOKEN_ENABLED", true), + RateLimitTokenRequestsPerSec: env.GetFloat64("RATE_LIMIT_TOKEN_REQUESTS_PER_SEC", 5.0), + RateLimitTokenBurst: env.GetInt("RATE_LIMIT_TOKEN_BURST", 10), + // CORS CORSEnabled: env.GetBool("CORS_ENABLED", false), CORSAllowOrigins: env.GetString("CORS_ALLOW_ORIGINS", ""), diff --git a/internal/http/server.go b/internal/http/server.go index a5d58ed..41971db 100644 --- a/internal/http/server.go +++ b/internal/http/server.go @@ -123,11 +123,25 @@ func (s *Server) SetupRouter( ) } + // Create token 
rate limit middleware (IP-based, for unauthenticated token endpoint) + var tokenRateLimitMiddleware gin.HandlerFunc + if cfg.RateLimitTokenEnabled { + tokenRateLimitMiddleware = authHTTP.TokenRateLimitMiddleware( + cfg.RateLimitTokenRequestsPerSec, + cfg.RateLimitTokenBurst, + s.logger, + ) + } + // API v1 routes v1 := router.Group("/v1") { - // Token issuance endpoint (no authentication required) - v1.POST("/token", tokenHandler.IssueTokenHandler) + // Token issuance endpoint (no authentication required, IP-based rate limiting) + if tokenRateLimitMiddleware != nil { + v1.POST("/token", tokenRateLimitMiddleware, tokenHandler.IssueTokenHandler) + } else { + v1.POST("/token", tokenHandler.IssueTokenHandler) + } // Client management endpoints clients := v1.Group("/clients")