diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index c903402..cd92c9a 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -14,6 +14,6 @@ - [ ] Documentation updated for behavior/contract changes - [ ] `Last updated` metadata refreshed in changed docs pages -- [ ] `docs/CHANGELOG.md` updated for significant docs changes +- [ ] `CHANGELOG.md` updated for significant docs changes - [ ] `make docs-lint` passes locally - [ ] Examples/commands were validated or reviewed for accuracy diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e176abf..81c28b5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: with: fetch-depth: 0 - - name: Docs changelog guard (docs-only PRs) + - name: Changelog guard (docs-only PRs) if: github.event_name == 'pull_request' run: | set -euo pipefail @@ -33,10 +33,12 @@ jobs: while IFS= read -r file; do [ -z "$file" ] && continue - if [ "$file" = "docs/CHANGELOG.md" ]; then + # Check if root CHANGELOG.md was updated + if [ "$file" = "CHANGELOG.md" ]; then changelog_updated=true fi + # Determine if this PR is docs-only case "$file" in docs/*|README.md) ;; @@ -49,7 +51,7 @@ jobs: EOF if [ "$docs_only" = true ] && [ "$changelog_updated" = false ]; then - echo "docs-only PRs must update docs/CHANGELOG.md" + echo "Documentation-only PRs must update CHANGELOG.md" exit 1 fi @@ -57,6 +59,7 @@ jobs: if: github.event_name == 'pull_request' run: | set -euo pipefail + # Note: the case patterns below are string matches, not pathname globs; * also matches /, so no globstar or Bash 4+ feature is required CHANGED_FILES="$(git diff --name-only "${{ github.event.pull_request.base.sha }}" "${{ github.sha }}")" if [ -z "$CHANGED_FILES" ]; then @@ -69,14 +72,28 @@ jobs: while IFS= read -r file; do [ -z "$file" ] && continue + # API/runtime implementation changes case "$file" in internal/*/http/*.go|cmd/app/commands/*.go|migrations/*/*.sql) api_changed=true ;; esac + # Documentation files that should 
be updated when API changes + # Note: in a case pattern * also matches /, so docs/api/*.md already covers subdirectories; the **/ variants are redundant but harmless case "$file" in - docs/api/*|docs/openapi.yaml|docs/examples/*|docs/operations/*|docs/getting-started/*|docs/cli/commands.md|docs/releases/*|docs/CHANGELOG.md|docs/README.md|docs/metadata.json|README.md) + docs/api/*.md|docs/api/**/*.md|\ + docs/openapi.yaml|\ + docs/examples/*.md|\ + docs/operations/*.md|docs/operations/**/*.md|\ + docs/getting-started/*.md|\ + docs/cli-commands.md|\ + docs/releases/*.md|\ + docs/configuration.md|\ + docs/README.md|\ + docs/metadata.json|\ + CHANGELOG.md|\ + README.md) docs_changed=true ;; esac @@ -86,7 +103,7 @@ jobs: if [ "$api_changed" = true ] && [ "$docs_changed" = false ]; then echo "API/runtime changes detected but no related docs updates found" - echo "Update docs/api, openapi/examples/runbooks/release notes as needed" + echo "Update docs/api/*/, openapi.yaml, examples/, operations/*/, or release notes/changelog as needed" exit 1 fi diff --git a/.markdownlint.json b/.markdownlint.json index b077f0e..e6b0f44 100644 --- a/.markdownlint.json +++ b/.markdownlint.json @@ -1,4 +1,7 @@ { "default": true, - "MD013": false + "MD013": false, + "MD024": { + "siblings_only": true + } } diff --git a/CHANGELOG.md b/CHANGELOG.md index af3cfda..ccb051f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Documentation - Added `docs/releases/v0.6.0.md` release notes and `docs/releases/v0.6.0-upgrade.md` upgrade guide -- Added KMS operations guide: `docs/operations/kms-setup.md` +- Added KMS operations guide: `docs/operations/kms/setup.md` - Updated CLI and environment variable docs for KMS configuration and master key rotation workflows ## [0.5.1] - 2026-02-19 @@ -60,7 +60,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Per-client rate limiting for authenticated endpoints (default: 10 
req/sec, burst 20) - Configurable CORS support (disabled by default) -- Comprehensive security hardening documentation (`docs/operations/security-hardening.md`) +- Comprehensive security hardening documentation (`docs/operations/security/hardening.md`) - Rate limiting configuration via `RATE_LIMIT_ENABLED`, `RATE_LIMIT_REQUESTS_PER_SEC`, `RATE_LIMIT_BURST` - CORS configuration via `CORS_ENABLED`, `CORS_ALLOW_ORIGINS` @@ -78,7 +78,7 @@ If you rely on the previous default token expiration of 24 hours, explicitly set Ensure your client applications handle token refresh before expiration. The shorter default expiration improves security but may require updating client-side token refresh logic if you were relying on the previous 24-hour default. **Database SSL/TLS:** -If you are using `sslmode=disable` (PostgreSQL) or `tls=false` (MySQL) in production, this is insecure. Update your `DB_CONNECTION_STRING` to use `sslmode=require` or `sslmode=verify-full` (PostgreSQL) or `tls=true` or `tls=custom` (MySQL). See `docs/operations/security-hardening.md` for guidance. +If you are using `sslmode=disable` (PostgreSQL) or `tls=false` (MySQL) in production, this is insecure. Update your `DB_CONNECTION_STRING` to use `sslmode=require` or `sslmode=verify-full` (PostgreSQL) or `tls=true` or `tls=custom` (MySQL). See `docs/operations/security/hardening.md` for guidance. 
### Security - Added database SSL/TLS configuration warnings in documentation @@ -87,7 +87,7 @@ If you are using `sslmode=disable` (PostgreSQL) or `tls=false` (MySQL) in produc - Added metrics endpoint protection recommendations ### Documentation -- Added `docs/operations/security-hardening.md` with comprehensive security guidance +- Added `docs/operations/security/hardening.md` with comprehensive security guidance - Updated `docs/configuration/environment-variables.md` with new variables and security warnings - Updated `.env.example` with security warnings for development-only configurations - Updated `docs/getting-started/docker.md` and `docs/getting-started/local-development.md` with security warnings @@ -133,7 +133,7 @@ If you are using `sslmode=disable` (PostgreSQL) or `tls=false` (MySQL) in produc - Cryptographic operation metrics (secret operations, transit operations, audit log operations) ### Documentation -- Added `docs/operations/monitoring.md` with Prometheus and Grafana quickstart +- Added `docs/operations/observability/monitoring.md` with Prometheus and Grafana quickstart - Added metrics naming contract and endpoint documentation - Added production hardening guidance for securing `/metrics` endpoint diff --git a/README.md b/README.md index 766c5a6..bde5edf 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Secrets is inspired by **HashiCorp Vault** ❀️, but it is intentionally **muc The default way to run Secrets is the published Docker image: ```bash -docker pull allisson/secrets:v0.7.0 +docker pull allisson/secrets ``` Use pinned tags for reproducible setups. `latest` is available for dev-only fast iteration. @@ -29,20 +29,18 @@ Then follow the Docker setup guide in [docs/getting-started/docker.md](docs/gett 1. 🐳 **Run with Docker image (recommended)**: [docs/getting-started/docker.md](docs/getting-started/docker.md) 2. 
πŸ’» **Run locally for development**: [docs/getting-started/local-development.md](docs/getting-started/local-development.md) -## πŸ†• What's New in v0.7.0 +## πŸ†• What's New in v0.8.0 -- πŸ›‘οΈ Added IP-based rate limiting for unauthenticated token issuance (`POST /v1/token`) -- βš™οΈ Added token endpoint configuration: `RATE_LIMIT_TOKEN_ENABLED`, `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC`, `RATE_LIMIT_TOKEN_BURST` -- 🚦 Added token endpoint `429` + `Retry-After` behavior for burst/abuse control -- πŸ“˜ Added release notes: [docs/releases/v0.7.0.md](docs/releases/v0.7.0.md) -- ⬆️ Added upgrade guide: [docs/releases/v0.7.0-upgrade.md](docs/releases/v0.7.0-upgrade.md) -- πŸ“¦ Updated pinned Docker docs/examples to `allisson/secrets:v0.7.0` +- πŸ“š Major documentation consolidation: 77 β†’ 47 files (39% reduction) +- πŸ›οΈ Established 8 new Architecture Decision Records (ADR 0003-0010) +- πŸ“‚ Restructured API docs with themed organization (auth/, data/, observability/) +- πŸ“– Consolidated operations documentation with centralized runbook hub +- πŸ”— Comprehensive cross-reference updates throughout documentation +- πŸ“˜ See [v0.8.0 release notes](docs/releases/RELEASES.md#080---2026-02-20) -Release history quick links: +Release history: -- Current: [v0.7.0 release notes](docs/releases/v0.7.0.md) -- Previous: [v0.6.0 release notes](docs/releases/v0.6.0.md) -- Previous upgrade guide: [v0.6.0 upgrade guide](docs/releases/v0.6.0-upgrade.md) +- All releases: [Release notes](docs/releases/RELEASES.md) ## πŸ“š Docs Map @@ -52,46 +50,41 @@ Release history quick links: - πŸ’» **Getting started (local)**: [docs/getting-started/local-development.md](docs/getting-started/local-development.md) - 🧰 **Troubleshooting**: [docs/getting-started/troubleshooting.md](docs/getting-started/troubleshooting.md) - βœ… **Smoke test script**: [docs/getting-started/smoke-test.md](docs/getting-started/smoke-test.md) -- πŸ§ͺ **CLI commands reference**: [docs/cli/commands.md](docs/cli/commands.md) -- 
πŸš€ **v0.7.0 release notes**: [docs/releases/v0.7.0.md](docs/releases/v0.7.0.md) -- ⬆️ **v0.7.0 upgrade guide**: [docs/releases/v0.7.0-upgrade.md](docs/releases/v0.7.0-upgrade.md) +- πŸ§ͺ **CLI commands reference**: [docs/cli-commands.md](docs/cli-commands.md) +- πŸ“¦ **All release notes**: [docs/releases/RELEASES.md](docs/releases/RELEASES.md) - πŸ” **Release compatibility matrix**: [docs/releases/compatibility-matrix.md](docs/releases/compatibility-matrix.md) - **By Topic** - - βš™οΈ **Environment variables**: [docs/configuration/environment-variables.md](docs/configuration/environment-variables.md) + - βš™οΈ **Environment variables**: [docs/configuration.md](docs/configuration.md) - πŸ—οΈ **Architecture concepts**: [docs/concepts/architecture.md](docs/concepts/architecture.md) - πŸ”’ **Security model**: [docs/concepts/security-model.md](docs/concepts/security-model.md) - - πŸ“˜ **Glossary**: [docs/concepts/glossary.md](docs/concepts/glossary.md) - - πŸ”‘ **Key management operations**: [docs/operations/key-management.md](docs/operations/key-management.md) - - ☁️ **KMS setup guide**: [docs/operations/kms-setup.md](docs/operations/kms-setup.md) - - βœ… **KMS migration checklist**: [docs/operations/kms-migration-checklist.md](docs/operations/kms-migration-checklist.md) - - πŸ” **Security hardening**: [docs/operations/security-hardening.md](docs/operations/security-hardening.md) - - πŸ“Š **Monitoring and metrics**: [docs/operations/monitoring.md](docs/operations/monitoring.md) - - 🧯 **Operator drills**: [docs/operations/operator-drills.md](docs/operations/operator-drills.md) - - πŸš€ **Production rollout golden path**: [docs/operations/production-rollout.md](docs/operations/production-rollout.md) - - πŸš‘ **Failure playbooks**: [docs/operations/failure-playbooks.md](docs/operations/failure-playbooks.md) - - 🏭 **Production deployment**: [docs/operations/production.md](docs/operations/production.md) - - πŸ› οΈ **Development and testing**: 
[docs/development/testing.md](docs/development/testing.md) - - πŸ—ΊοΈ **Docs architecture map**: [docs/development/docs-architecture-map.md](docs/development/docs-architecture-map.md) + - πŸ“˜ **Glossary**: [docs/concepts/architecture.md#glossary](docs/concepts/architecture.md#glossary) + - πŸ”‘ **Key management operations**: [docs/operations/kms/key-management.md](docs/operations/kms/key-management.md) + - ☁️ **KMS setup guide**: [docs/operations/kms/setup.md](docs/operations/kms/setup.md) + - βœ… **KMS migration checklist**: [docs/operations/kms/setup.md#migration-checklist](docs/operations/kms/setup.md#migration-checklist) + - πŸ” **Security hardening**: [docs/operations/security/hardening.md](docs/operations/security/hardening.md) + - πŸ“Š **Monitoring and metrics**: [docs/operations/observability/monitoring.md](docs/operations/observability/monitoring.md) + - 🧯 **Operator drills**: [docs/operations/runbooks/README.md#operator-drills-quarterly](docs/operations/runbooks/README.md#operator-drills-quarterly) + - πŸš€ **Production rollout golden path**: [docs/operations/deployment/production-rollout.md](docs/operations/deployment/production-rollout.md) + - 🚨 **Incident response guide**: [docs/operations/observability/incident-response.md](docs/operations/observability/incident-response.md) + - 🏭 **Production deployment**: [docs/operations/deployment/production.md](docs/operations/deployment/production.md) + - πŸ› οΈ **Development and testing**: [docs/contributing.md#development-and-testing](docs/contributing.md#development-and-testing) + - πŸ—ΊοΈ **Docs architecture map**: [docs/contributing.md#docs-architecture-map](docs/contributing.md#docs-architecture-map) - 🀝 **Docs contributing**: [docs/contributing.md](docs/contributing.md) - - πŸ—’οΈ **Docs changelog**: [docs/CHANGELOG.md](docs/CHANGELOG.md) Release note location: -- Project release notes are in [CHANGELOG.md](CHANGELOG.md) -- Documentation process/history notes are in 
[docs/CHANGELOG.md](docs/CHANGELOG.md) +- Project release notes (including documentation changes) are in [CHANGELOG.md](CHANGELOG.md) - **API Reference** -- πŸ” **Auth API**: [docs/api/authentication.md](docs/api/authentication.md) -- πŸ‘€ **Clients API**: [docs/api/clients.md](docs/api/clients.md) -- πŸ“˜ **Policy cookbook**: [docs/api/policies.md](docs/api/policies.md) -- πŸ—‚οΈ **Capability matrix**: [docs/api/capability-matrix.md](docs/api/capability-matrix.md) -- 🚨 **Error decision matrix**: [docs/api/error-decision-matrix.md](docs/api/error-decision-matrix.md) -- πŸ“¦ **Secrets API**: [docs/api/secrets.md](docs/api/secrets.md) -- πŸš„ **Transit API**: [docs/api/transit.md](docs/api/transit.md) -- 🎫 **Tokenization API**: [docs/api/tokenization.md](docs/api/tokenization.md) -- πŸ“œ **Audit logs API**: [docs/api/audit-logs.md](docs/api/audit-logs.md) -- 🧩 **API versioning policy**: [docs/api/versioning-policy.md](docs/api/versioning-policy.md) + - πŸ” **Auth API**: [docs/api/auth/authentication.md](docs/api/auth/authentication.md) + - πŸ‘€ **Clients API**: [docs/api/auth/clients.md](docs/api/auth/clients.md) + - πŸ“˜ **Policy cookbook**: [docs/api/auth/policies.md](docs/api/auth/policies.md) + - πŸ“¦ **Secrets API**: [docs/api/data/secrets.md](docs/api/data/secrets.md) + - πŸš„ **Transit API**: [docs/api/data/transit.md](docs/api/data/transit.md) + - 🎫 **Tokenization API**: [docs/api/data/tokenization.md](docs/api/data/tokenization.md) + - πŸ“œ **Audit logs API**: [docs/api/observability/audit-logs.md](docs/api/observability/audit-logs.md) + - 🧩 **API fundamentals**: [docs/api/fundamentals.md](docs/api/fundamentals.md) - Error triage, capabilities, rate limits, versioning - **Examples** - πŸ§ͺ **Curl examples**: [docs/examples/curl.md](docs/examples/curl.md) @@ -105,7 +98,7 @@ All detailed guides include practical use cases and copy/paste-ready examples. 
- πŸ” Envelope encryption (`Master Key -> KEK -> DEK -> Secret Data`) - πŸ”‘ **KMS Integration** for master key encryption at rest (supports Google Cloud KMS, AWS KMS, Azure Key Vault, HashiCorp Vault, and local secrets for testing) -- πŸš„ Transit encryption (`/v1/transit/keys/*`) for encrypt/decrypt as a service (decrypt input uses `:`; see [Transit API docs](docs/api/transit.md), [create vs rotate](docs/api/transit.md#create-vs-rotate), and [error matrix](docs/api/transit.md#endpoint-error-matrix)) +- πŸš„ Transit encryption (`/v1/transit/keys/*`) for encrypt/decrypt as a service (decrypt input uses `:`; see [Transit API docs](docs/api/data/transit.md), [create vs rotate](docs/api/data/transit.md#create-vs-rotate), and [error matrix](docs/api/data/transit.md#endpoint-error-matrix)) - 🎫 Tokenization API (`/v1/tokenization/*`) for token generation, detokenization, validation, and revocation - πŸ‘€ Token-based authentication and policy-based authorization - πŸ“¦ Versioned secrets by path (`/v1/secrets/*path`) @@ -119,7 +112,7 @@ All detailed guides include practical use cases and copy/paste-ready examples. 
- Token issuance: `POST /v1/token` - Clients: `GET/POST /v1/clients`, `GET/PUT/DELETE /v1/clients/:id` - Secrets: `POST/GET/DELETE /v1/secrets/*path` -- Transit: `POST /v1/transit/keys`, `POST /v1/transit/keys/:name/rotate`, `POST /v1/transit/keys/:name/encrypt`, `POST /v1/transit/keys/:name/decrypt`, `DELETE /v1/transit/keys/:id` ([create vs rotate](docs/api/transit.md#create-vs-rotate), [error matrix](docs/api/transit.md#endpoint-error-matrix)) +- Transit: `POST /v1/transit/keys`, `POST /v1/transit/keys/:name/rotate`, `POST /v1/transit/keys/:name/encrypt`, `POST /v1/transit/keys/:name/decrypt`, `DELETE /v1/transit/keys/:id` ([create vs rotate](docs/api/data/transit.md#create-vs-rotate), [error matrix](docs/api/data/transit.md#endpoint-error-matrix)) - Tokenization: `POST /v1/tokenization/keys`, `POST /v1/tokenization/keys/:name/rotate`, `DELETE /v1/tokenization/keys/:id`, `POST /v1/tokenization/keys/:name/tokenize`, `POST /v1/tokenization/detokenize`, `POST /v1/tokenization/validate`, `POST /v1/tokenization/revoke` - Audit logs: `GET /v1/audit-logs` - Metrics: `GET /metrics` (available when `METRICS_ENABLED=true`) @@ -132,5 +125,5 @@ MIT. See `LICENSE`. 
- [Documentation index](docs/README.md) - [Docker getting started](docs/getting-started/docker.md) -- [API authentication](docs/api/authentication.md) -- [Production operations](docs/operations/production.md) +- [API authentication](docs/api/auth/authentication.md) +- [Production operations](docs/operations/deployment/production.md) diff --git a/cmd/app/main.go b/cmd/app/main.go index dbfc680..19f3fe7 100644 --- a/cmd/app/main.go +++ b/cmd/app/main.go @@ -15,7 +15,7 @@ func main() { cmd := &cli.Command{ Name: "app", Usage: "Go project template application", - Version: "1.0.0", + Version: "0.8.0", Commands: []*cli.Command{ { Name: "server", diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md deleted file mode 100644 index a5463e8..0000000 --- a/docs/CHANGELOG.md +++ /dev/null @@ -1,247 +0,0 @@ -# πŸ—’οΈ Documentation Changelog - -> Last updated: 2026-02-20 - -## 2026-02-20 (docs v13 - v0.7.0 release prep) - -- Added release notes page: `docs/releases/v0.7.0.md` -- Added upgrade guide: `docs/releases/v0.7.0-upgrade.md` -- Updated docs metadata source (`docs/metadata.json`) to `current_release: v0.7.0` -- Updated root README and docs index to promote `v0.7.0` release links -- Updated compatibility matrix with `v0.6.0 -> v0.7.0` upgrade path -- Updated API docs to document token endpoint rate limiting and `POST /v1/token` `429` behavior -- Updated environment variable docs for `RATE_LIMIT_TOKEN_ENABLED`, `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC`, and `RATE_LIMIT_TOKEN_BURST` -- Updated troubleshooting and security hardening docs with token endpoint throttling guidance -- Updated pinned Docker image examples from `allisson/secrets:v0.6.0` to `allisson/secrets:v0.7.0` -- Added token endpoint throttling runbook section to production deployment guide -- Added token-endpoint-specific `429` response example and optional smoke test verification flow -- Expanded monitoring queries and alert starters for `/v1/token` throttling signals -- Added docs CI guard for current-release pinned 
image tag consistency -- Added operator quick card runbook (`docs/operations/operator-quick-card.md`) for rollout/incident triage -- Added trusted proxy reference guide (`docs/operations/trusted-proxy-reference.md`) for source-IP safety checks -- Added release note and upgrade guide templates (`docs/releases/_template.md`, `docs/releases/_upgrade-template.md`) -- Added auth docs retry handling snippets for token endpoint `429` and `Retry-After` -- Added docs architecture map updates for CI docs guards and local validation workflow -- Added Phase 3 planning roadmap (`docs/development/docs-phase-3-roadmap.md`) -- Expanded Phase 3 roadmap with prioritized backlog (`S/M/L`), dependencies, and execution order -- Added Phase 4 micro-roadmap (`docs/development/docs-phase-4-roadmap.md`) with 3 PR plan and CI guard proposals -- Added incident decision tree and first-15-minutes incident playbook runbooks -- Added known limitations page for rate limiting, proxy trust, and KMS startup tradeoffs -- Added versioned examples index by release (`docs/examples/versioned-by-release.md`) -- Added day-0 onboarding walkthroughs for operator and developer personas -- Added persona landing pages (`docs/personas/operator.md`, `docs/personas/developer.md`, `docs/personas/security.md`) -- Added docs KPI page and postmortem-to-doc feedback loop guidance -- Added consolidated docs master backlog (`docs/development/docs-master-backlog.md`) -- Added search alias shortcuts in docs index for faster incident/runbook discovery -- Added command verification markers to key rollout/troubleshooting/smoke docs - -## 2026-02-19 (docs v12 - v0.6.0 release prep) - -- Added release notes page: `docs/releases/v0.6.0.md` -- Added upgrade guide: `docs/releases/v0.6.0-upgrade.md` -- Updated docs metadata source (`docs/metadata.json`) to `current_release: v0.6.0` -- Updated root README and docs index to promote `v0.6.0` release links -- Updated operator runbook and production rollout references to `v0.6.0` -- 
Updated compatibility matrix with `v0.5.1 -> v0.6.0` upgrade path -- Updated pinned Docker image examples from `allisson/secrets:v0.5.1` to `allisson/secrets:v0.6.0` -- Updated CLI command docs for KMS mode flags and new `rotate-master-key` command -- Updated environment variable docs for `KMS_PROVIDER` and `KMS_KEY_URI` configuration -- Updated key management and troubleshooting guides with KMS rotation and failure-mode guidance - -## 2026-02-19 (docs v11 - v0.5.1 patch release prep) - -- Added release notes page: `docs/releases/v0.5.1.md` -- Added upgrade guide: `docs/releases/v0.5.1-upgrade.md` -- Updated docs metadata source (`docs/metadata.json`) to `current_release: v0.5.1` -- Updated root README and docs index to promote `v0.5.1` release links -- Updated operator runbook index and production runbooks to reference `v0.5.1` -- Updated compatibility matrix with `v0.5.0 -> v0.5.1` patch upgrade path -- Added direct `v0.4.x -> v0.5.1` compatibility path for skip-upgrade operators -- Updated pinned Docker image examples from `allisson/secrets:v0.5.0` to `allisson/secrets:v0.5.1` -- Updated API docs release labels to `v0.5.1` where current-release references are shown -- Reduced patch-version churn in OpenAPI coverage notes by using current-release wording -- Added v0.5.1-specific master-key regression triage note to troubleshooting -- Added copy/paste quick verification commands to `docs/releases/v0.5.1-upgrade.md` -- Added patch-release safety note to `docs/releases/v0.5.1.md` -- Added release history quick links in root `README.md` -- Added runtime version fingerprint checks for mixed deployment triage - -## 2026-02-19 (docs v10 - v0.5.0 security hardening release prep) - -- Added comprehensive security hardening guide: `docs/operations/security-hardening.md` -- Updated docs metadata source (`docs/metadata.json`) to `current_release: v0.5.0` -- Added release notes page: `docs/releases/v0.5.0.md` and promoted it as current in docs indexes -- Updated environment 
variables documentation with rate limiting and CORS configuration -- Added security warnings for database SSL/TLS requirements (production vs development) -- Added migration note for token expiration default change (24h β†’ 4h) -- Updated `.env.example` with new configuration options and security warnings -- Added security warnings to Docker and local development getting-started guides -- Updated production deployment guide with security hardening reference -- Updated security model with comprehensive production recommendations -- Added security hardening link to root README and docs indexes -- Updated current-release references from v0.4.1 to v0.5.0 while preserving historical links -- Added upgrade guide: `docs/releases/v0.5.0-upgrade.md` -- Added API rate limiting reference: `docs/api/rate-limiting.md` -- Updated API endpoint docs with `429` behavior and rate-limiting cross-links -- Expanded troubleshooting with `429` and CORS/preflight diagnostics -- Added retry/backoff examples for `429` handling in curl, Python, JavaScript, and Go example docs -- Added rate-limiting production presets in environment variables documentation -- Added docs release checklist: `docs/development/docs-release-checklist.md` -- Added OpenAPI validation step in CI workflow -- Added production rollout golden path runbook: `docs/operations/production-rollout.md` -- Added API error decision matrix: `docs/api/error-decision-matrix.md` -- Added release compatibility matrix: `docs/releases/compatibility-matrix.md` -- Added persona-oriented policy templates and references in `docs/api/policies.md` -- Expanded monitoring guide with rate-limit Prometheus queries and alert examples -- Added CORS smoke checks (copy/paste) to troubleshooting guide -- Added quarterly operator drills runbook: `docs/operations/operator-drills.md` -- Added dashboard artifact templates under `docs/operations/dashboards/` -- Added docs architecture map: `docs/development/docs-architecture-map.md` -- Added release docs 
CI guard: `docs/tools/check_release_docs_links.py` + workflow integration -- Expanded policy smoke tests with pre-deploy automation wrapper pattern - -## 2026-02-19 (docs v9 - v0.4.1 bugfix release prep) - -- Added release notes page: `docs/releases/v0.4.1.md` and promoted it as current in docs indexes -- Updated docs metadata source (`docs/metadata.json`) to `current_release: v0.4.1` -- Updated pinned Docker examples from `allisson/secrets:v0.4.0` to `allisson/secrets:v0.4.1` -- Documented policy path-matching behavior with mid-path wildcard support in `docs/api/policies.md` -- Updated troubleshooting and failure playbooks to include exact, trailing wildcard, and mid-path wildcard matching -- Corrected Clients API policy examples to use `decrypt` for `/v1/secrets/*` reads -- Added transit rotate smoke-test step for `/v1/transit/keys/*/rotate` wildcard validation -- Added malformed rotate path-shape smoke check and explicit unsupported wildcard pattern notes -- Added policy matcher quick-reference table to `docs/api/capability-matrix.md` -- Linked `v0.4.1` release notes from production and smoke-test operator guides -- Added route-shape vs policy-shape guidance and cross-links between policies and smoke tests -- Added copy-safe split-role policy snippets for transit rotate-only and secrets read/write separation -- Added operator quick checklist to `docs/releases/v0.4.1.md` and policy matcher FAQ in troubleshooting -- Added pre-deploy policy review checklist to `docs/api/policies.md` -- Added `v0.4.1` documentation migration map with direct section links for operators -- Added strict CI mode snippet for policy smoke checks and 403-vs-404 false-positive guidance -- Added canonical wildcard matcher semantics links in auth, clients, secrets, and transit API docs -- Converted Clients API related references to clickable links for navigation consistency -- Added policy triage cross-links in Audit Logs API and refreshed stale page update stamps -- Added docs metadata guard 
to require `> Last updated: YYYY-MM-DD` marker on all docs pages -- Added optional strict metadata freshness check via `DOCS_CHANGED_FILES` for changed docs pages -- Added Docs QA checklist and style baseline guidance to `docs/contributing.md` -- Added unified operator runbook hub: `docs/operations/runbook-index.md` and linked it from docs indexes - -## 2026-02-18 (docs v8 - docs QA and operations polish) - -- Added docs metadata source file `docs/metadata.json` and metadata consistency checker -- Added `make docs-check-metadata` and integrated it into `make docs-lint` -- Added CI docs metadata check and API/docs consistency guard for PRs -- Added policy verification runbook: `docs/operations/policy-smoke-tests.md` -- Added retention defaults table to production guide and linked policy smoke tests -- Added tokenization lifecycle sequence diagram in architecture docs -- Added copy-safe examples policy and release PR docs QA guard guidance in contributing docs - -## 2026-02-18 (docs v7 - final v0.4.0 hardening) - -- Added canonical capability reference page: `docs/api/capability-matrix.md` -- Linked capability matrix from API endpoint docs, policy cookbook, and docs indexes -- Expanded OpenAPI description and monitoring docs with route-template notes (`{name}` vs `:name`/`*path`) -- Added tokenization deterministic-mode caveats in curl, Python, JavaScript, and Go examples -- Expanded tokenization API guidance with metadata data-classification rules -- Added rollback guidance for additive tokenization schema migration in `docs/releases/v0.4.0.md` -- Added migration-focused troubleshooting for tokenization rollout and expanded smoke test coverage - -## 2026-02-18 (docs v6 - v0.4.0 release prep) - -- Added release notes page: `docs/releases/v0.4.0.md` and promoted it as current in docs indexes -- Updated pinned Docker examples from `allisson/secrets:v0.3.0` to `allisson/secrets:v0.4.0` -- Updated root `README.md` with `What's New in v0.4.0`, tokenization API overview, 
and release links -- Added tokenization endpoints and corrected request/response contracts in `docs/api/tokenization.md` -- Added tokenization CLI command docs in `docs/cli/commands.md` -- Added tokenization monitoring operations and retention workflow updates in production docs -- Added explicit OpenAPI-coverage gap notes for tokenization rollout docs -- Added tokenization snippets to Python, JavaScript, and Go examples for cross-language parity -- Added tokenization incident runbooks and policy mapping clarifications -- Added `v0.4.0` upgrade checklist (migrate, verify, tokenization smoke checks, retention cleanup) -- Expanded OpenAPI baseline with tokenization endpoint and schema coverage -- Added canonical capability matrix reference and cross-linked API docs to reduce policy drift -- Expanded smoke test script/docs with tokenization round-trip + revoke validation -- Added tokenization migration verification troubleshooting section - -## 2026-02-16 (docs v5 - documentation quality improvements) - -- Added `What's New in v0.3.0` section to root `README.md` -- Added Prometheus + Grafana quickstart and a metrics naming contract to `docs/operations/monitoring.md` -- Added production hardening guidance for securing `/metrics` exposure -- Added feature PR docs consistency checklist to `docs/contributing.md` -- Added metrics troubleshooting matrix to `docs/getting-started/troubleshooting.md` -- Added local and Docker command parity examples in `docs/cli/commands.md` -- Added telemetry breaking vs non-breaking examples in `docs/api/versioning-policy.md` - -## 2026-02-16 (docs v4 - v0.3.0 release prep) - -- Added release notes page: `docs/releases/v0.3.0.md` and set it as the current release in docs indexes -- Updated pinned Docker examples from `allisson/secrets:v0.2.0` to `allisson/secrets:v0.3.0` -- Added monitoring links to root README and expanded API overview with `GET /metrics` -- Aligned monitoring operations with implementation (`secret_create`, 
`secret_get_version`, `audit_log_delete`, `transit_key_rotate`) -- Clarified metrics disable behavior (`METRICS_ENABLED=false` removes metrics middleware and `/metrics` route) - -## 2026-02-14 (docs v3 - v0.2.0 release prep) - -- Added `clean-audit-logs` command documentation with dry-run and JSON/text output examples -- Added audit-log retention cleanup runbook to production operations guide -- Clarified audit log retention is a CLI cleanup workflow, while API remains list/query (`GET /v1/audit-logs`) -- Updated pinned Docker image tags and release references from `v0.1.0` to `v0.2.0` -- Added release notes page: `docs/releases/v0.2.0.md` and kept `v0.1.0` as historical - -## 2026-02-14 (docs v2 - v0.1.0 release prep) - -- Added first-client bootstrap flow to Docker and local development guides using `create-client` -- Added CLI reference page with runtime, key management, and client management commands -- Linked CLI docs and release notes from root README and docs index -- Switched Docker release guide examples to pinned image tag `allisson/secrets:v0.1.0` -- Added explicit OpenAPI coverage note: `docs/openapi.yaml` is baseline subset for common flows -- Clarified API v1 compatibility expectations relative to pre-1.0 app releases -- Added release notes page: `docs/releases/v0.1.0.md` - -## 2026-02-14 (docs v1) - -- Split large root README into focused docs under `docs/` -- Made Docker image the default run path (`allisson/secrets:latest`) -- Added API references, examples (curl/python/javascript/go), and operations guides -- Added restart requirement after master key/KEK rotation -- Added troubleshooting guide, policy cookbook, and production deployment guide -- Added API quick flows and API v1 applicability notes -- Added docs CI checks (markdown lint + offline link checks) -- Added Make target `make docs-lint` for local docs checks -- Added quickstart copy block and shell compatibility notes for smoke tests -- Added API error payload examples and shared 
response-shapes reference page -- Added policy mistakes table and rolling restart runbook guidance -- Added supported platforms section and docs ownership/review cadence guidance -- Added clickable anchor links in long-document table of contents sections -- Added first-time operator path in docs index for onboarding flow -- Added docs release process checklist in `docs/contributing.md` -- Added CI docs-only PR guard requiring `docs/CHANGELOG.md` updates -- Added pull request template with documentation quality gate checklist -- Added baseline OpenAPI spec (`docs/openapi.yaml`) and linked it from API docs -- Added cross-linking across all docs pages via `See also` sections for faster navigation -- Converted docs path references in `README.md` and `docs/README.md` into clickable Markdown links -- Clarified transit decrypt contract: callers must pass versioned ciphertext (`:`) exactly as returned by encrypt -- Documented transit decrypt validation behavior: malformed ciphertext now returns `422 Unprocessable Entity` -- Added transit decrypt input contract examples (valid/invalid) and representative `422` payloads -- Added OpenAPI decrypt request examples and explicit `401`/`403`/`404` responses -- Added 422 troubleshooting matrix and transit round-trip verification/decode notes in examples -- Clarified transit key create behavior: duplicate key names now documented as `409 Conflict` and rotate is required for new versions -- Added transit create-vs-rotate guidance, idempotency notes, endpoint error matrix, and representative `409` conflict payload examples -- Added transit automation runbook note for handling create `409` by rotating keys -- Added API status code quick-reference tables to clients, secrets, transit, and audit docs -- Added glossary page and cross-links from API/reference documentation -- Added example-page common mistakes sections (curl, python, javascript, go) -- Added docs contribution policy for breaking vs non-breaking documentation updates -- 
Added docs freshness SLA table in docs index -- Added failure playbooks for 401/403/409 incident triage -- Added API compatibility/versioning policy page for breaking/non-breaking expectations -- Added ADRs for envelope encryption model and transit versioned ciphertext contract -- Added executable example shape checks (`make docs-check-examples`) and CI integration -- Added environment bootstrap sections to all examples pages - -## See also - -- [Documentation index](README.md) -- [Contributing guide](contributing.md) -- [Testing guide](development/testing.md) diff --git a/docs/README.md b/docs/README.md index cabffa1..211cf04 100644 --- a/docs/README.md +++ b/docs/README.md @@ -10,57 +10,63 @@ Welcome to the full documentation for Secrets. Pick a path and dive in πŸš€ - 🐳 [getting-started/docker.md](getting-started/docker.md) (recommended) - πŸ’» [getting-started/local-development.md](getting-started/local-development.md) -- 🧭 [getting-started/day-0-operator.md](getting-started/day-0-operator.md) -- πŸ’» [getting-started/day-0-developer.md](getting-started/day-0-developer.md) +- 🧭 [getting-started/day-0-walkthrough.md](getting-started/day-0-walkthrough.md) - 🧰 [getting-started/troubleshooting.md](getting-started/troubleshooting.md) - βœ… [getting-started/smoke-test.md](getting-started/smoke-test.md) -- πŸ§ͺ [cli/commands.md](cli/commands.md) +- πŸ§ͺ [cli-commands.md](cli-commands.md) ## πŸ›£οΈ First-Time Operator Path 1. Start with Docker guide: [getting-started/docker.md](getting-started/docker.md) 2. Validate end-to-end setup: [getting-started/smoke-test.md](getting-started/smoke-test.md) -3. Follow rollout runbook: [operations/production-rollout.md](operations/production-rollout.md) -4. Apply production hardening checklist: [operations/production.md](operations/production.md) -5. Use runbook hub for rollout and incidents: [operations/runbook-index.md](operations/runbook-index.md) +3. 
Follow rollout runbook: [operations/deployment/production-rollout.md](operations/deployment/production-rollout.md) +4. Apply production hardening checklist: [operations/deployment/production.md](operations/deployment/production.md) +5. Use runbook hub for rollout and incidents: [operations/runbooks/README.md](operations/runbooks/README.md) ## πŸ‘₯ Persona Paths -- πŸ‘· [personas/operator.md](personas/operator.md) -- πŸ‘¨β€πŸ’» [personas/developer.md](personas/developer.md) -- πŸ›‘οΈ [personas/security.md](personas/security.md) +- πŸ‘· [Operator](personas/README.md#operator-path) +- πŸ‘¨β€πŸ’» [Developer](personas/README.md#developer-path) +- πŸ›‘οΈ [Security Engineer](personas/README.md#security-engineer-path) ## πŸ“– Documentation by Topic -- βš™οΈ [configuration/environment-variables.md](configuration/environment-variables.md) +**Configuration & Concepts:** + +- βš™οΈ [configuration.md](configuration.md) - πŸ—οΈ [concepts/architecture.md](concepts/architecture.md) - πŸ”’ [concepts/security-model.md](concepts/security-model.md) -- πŸ“˜ [concepts/glossary.md](concepts/glossary.md) -- πŸ”‘ [operations/key-management.md](operations/key-management.md) -- ☁️ [operations/kms-setup.md](operations/kms-setup.md) -- βœ… [operations/kms-migration-checklist.md](operations/kms-migration-checklist.md) -- πŸš€ [operations/production-rollout.md](operations/production-rollout.md) -- ⚑ [operations/operator-quick-card.md](operations/operator-quick-card.md) -- 🌲 [operations/incident-decision-tree.md](operations/incident-decision-tree.md) -- ⏱️ [operations/first-15-minutes.md](operations/first-15-minutes.md) -- πŸ“Š [operations/monitoring.md](operations/monitoring.md) -- 🧯 [operations/operator-drills.md](operations/operator-drills.md) -- 🏭 [operations/production.md](operations/production.md) -- 🌐 [operations/trusted-proxy-reference.md](operations/trusted-proxy-reference.md) -- ⚠️ [operations/known-limitations.md](operations/known-limitations.md) -- πŸš‘ 
[operations/failure-playbooks.md](operations/failure-playbooks.md) -- πŸ§ͺ [operations/policy-smoke-tests.md](operations/policy-smoke-tests.md) -- 🧭 [operations/runbook-index.md](operations/runbook-index.md) -- πŸ› οΈ [development/testing.md](development/testing.md) -- 🧾 [development/docs-release-checklist.md](development/docs-release-checklist.md) -- πŸ—ΊοΈ [development/docs-architecture-map.md](development/docs-architecture-map.md) -- πŸ“ˆ [development/docs-quality-kpis.md](development/docs-quality-kpis.md) -- πŸ” [development/postmortem-doc-loop.md](development/postmortem-doc-loop.md) -- πŸ—‚οΈ [development/docs-master-backlog.md](development/docs-master-backlog.md) -- πŸ›£οΈ [development/docs-phase-3-roadmap.md](development/docs-phase-3-roadmap.md) -- 🧭 [development/docs-phase-4-roadmap.md](development/docs-phase-4-roadmap.md) -- 🀝 [contributing.md](contributing.md) -- πŸ—’οΈ [CHANGELOG.md](CHANGELOG.md) +- πŸ“˜ [concepts/architecture.md#glossary](concepts/architecture.md#glossary) + +**Operations: KMS & Key Management:** + +- ☁️ [operations/kms/setup.md](operations/kms/setup.md) - KMS setup and migration +- πŸ”‘ [operations/kms/key-management.md](operations/kms/key-management.md) + +**Operations: Security:** + +- πŸ›‘οΈ [operations/security/hardening.md](operations/security/hardening.md) - Includes trusted proxy configuration + +**Operations: Observability:** + +- πŸ“Š [operations/observability/monitoring.md](operations/observability/monitoring.md) +- πŸš‘ [operations/observability/incident-response.md](operations/observability/incident-response.md) + +**Operations: Deployment:** + +- πŸš€ [operations/deployment/production-rollout.md](operations/deployment/production-rollout.md) +- 🏭 [operations/deployment/production.md](operations/deployment/production.md) - Includes known limitations + +**Operations: Runbooks:** + +- 🧭 [operations/runbooks/README.md](operations/runbooks/README.md) - Runbook hub +- ⚑ 
[operations/runbooks/README.md#operator-quick-card](operations/runbooks/README.md#operator-quick-card) +- 🧯 [operations/runbooks/README.md#operator-drills-quarterly](operations/runbooks/README.md#operator-drills-quarterly) +- πŸ§ͺ [operations/runbooks/policy-smoke-tests.md](operations/runbooks/policy-smoke-tests.md) + +**Development:** + +- 🀝 [contributing.md](contributing.md) - Includes testing, docs architecture map, release checklist, and documentation management ## 🧭 Docs Freshness SLA @@ -74,55 +80,50 @@ Welcome to the full documentation for Secrets. Pick a path and dive in πŸš€ ## 🌐 API Reference -- πŸ” [api/authentication.md](api/authentication.md) -- πŸ‘€ [api/clients.md](api/clients.md) -- πŸ“˜ [api/policies.md](api/policies.md) -- πŸ—‚οΈ [api/capability-matrix.md](api/capability-matrix.md) -- 🚨 [api/error-decision-matrix.md](api/error-decision-matrix.md) -- πŸ“¦ [api/secrets.md](api/secrets.md) -- πŸš„ [api/transit.md](api/transit.md) -- 🎫 [api/tokenization.md](api/tokenization.md) -- πŸ“œ [api/audit-logs.md](api/audit-logs.md) -- 🚦 [api/rate-limiting.md](api/rate-limiting.md) -- 🧱 [api/response-shapes.md](api/response-shapes.md) -- 🧩 [api/versioning-policy.md](api/versioning-policy.md) +- πŸ” [api/auth/authentication.md](api/auth/authentication.md) +- πŸ‘€ [api/auth/clients.md](api/auth/clients.md) +- πŸ“˜ [api/auth/policies.md](api/auth/policies.md) +- πŸ“¦ [api/data/secrets.md](api/data/secrets.md) +- πŸš„ [api/data/transit.md](api/data/transit.md) +- 🎫 [api/data/tokenization.md](api/data/tokenization.md) +- πŸ“œ [api/observability/audit-logs.md](api/observability/audit-logs.md) +- 🧱 [api/observability/response-shapes.md](api/observability/response-shapes.md) +- 🧩 [api/fundamentals.md](api/fundamentals.md) - Error triage, capabilities, rate limits, versioning - πŸ“„ [openapi.yaml](openapi.yaml) ## πŸ”Ž Search Aliases -- `401 403 429 decision tree` -> [operations/incident-decision-tree.md](operations/incident-decision-tree.md) -- `first 15 minutes 
incident` -> [operations/first-15-minutes.md](operations/first-15-minutes.md) -- `trusted proxy retry-after token 429` -> [operations/trusted-proxy-reference.md](operations/trusted-proxy-reference.md) -- `known limitations` -> [operations/known-limitations.md](operations/known-limitations.md) -- `versioned examples` -> [examples/versioned-by-release.md](examples/versioned-by-release.md) +- `401 403 429 decision tree incident` -> [operations/observability/incident-response.md](operations/observability/incident-response.md) +- `first 15 minutes incident playbook` -> [operations/observability/incident-response.md](operations/observability/incident-response.md) +- `trusted proxy retry-after token 429` -> [operations/security/hardening.md#trusted-proxy-configuration](operations/security/hardening.md#trusted-proxy-configuration) +- `known limitations` -> [operations/deployment/production.md#known-limitations-and-tradeoffs](operations/deployment/production.md#known-limitations-and-tradeoffs) +- `examples` -> [examples/README.md](examples/README.md) OpenAPI scope note: -- `openapi.yaml` is a baseline subset for common API flows in the current release (`docs/metadata.json`) +- `openapi.yaml` is a baseline subset for common API flows in the current release (v0.8.0, see `docs/metadata.json`) - Full endpoint behavior is documented in the endpoint pages under `docs/api/` - Tokenization endpoints are included in `openapi.yaml` for the current release ## πŸš€ Releases -- πŸ“¦ [releases/v0.7.0.md](releases/v0.7.0.md) -- ⬆️ [releases/v0.7.0-upgrade.md](releases/v0.7.0-upgrade.md) -- πŸ“¦ [releases/v0.6.0.md](releases/v0.6.0.md) (historical) -- ⬆️ [releases/v0.6.0-upgrade.md](releases/v0.6.0-upgrade.md) (historical) -- πŸ“¦ [releases/v0.5.1.md](releases/v0.5.1.md) (historical) -- ⬆️ [releases/v0.5.1-upgrade.md](releases/v0.5.1-upgrade.md) (historical) -- πŸ“¦ [releases/v0.5.0.md](releases/v0.5.0.md) (historical) -- ⬆️ [releases/v0.5.0-upgrade.md](releases/v0.5.0-upgrade.md) 
(historical) +- πŸ“¦ [releases/RELEASES.md](releases/RELEASES.md) - All release notes and upgrade guides - πŸ” [releases/compatibility-matrix.md](releases/compatibility-matrix.md) -- πŸ“¦ [releases/v0.4.1.md](releases/v0.4.1.md) (historical) -- πŸ“¦ [releases/v0.4.0.md](releases/v0.4.0.md) (historical) -- πŸ“¦ [releases/v0.3.0.md](releases/v0.3.0.md) (historical) -- πŸ“¦ [releases/v0.2.0.md](releases/v0.2.0.md) (historical) -- πŸ“¦ [releases/v0.1.0.md](releases/v0.1.0.md) (historical) -## 🧠 ADRs +## 🧠 Architecture Decision Records + +This section documents key architectural decisions with their context, rationale, and trade-offs: -- 🧾 [adr/0001-envelope-encryption-model.md](adr/0001-envelope-encryption-model.md) -- 🧾 [adr/0002-transit-versioned-ciphertext-contract.md](adr/0002-transit-versioned-ciphertext-contract.md) +- 🧾 [ADR 0001: Envelope Encryption Model](adr/0001-envelope-encryption-model.md) - Master Key → KEK → DEK → Secret Data hierarchy +- 🧾 [ADR 0002: Transit Versioned Ciphertext Contract](adr/0002-transit-versioned-ciphertext-contract.md) - `<version>:<base64-ciphertext>` format +- 🧾 [ADR 0003: Capability-Based Authorization Model](adr/0003-capability-based-authorization-model.md) - Fine-grained access control with path matching +- 🧾 [ADR 0004: Dual Database Support](adr/0004-dual-database-support.md) - PostgreSQL and MySQL compatibility +- 🧾 [ADR 0005: Context-Based Transaction Management](adr/0005-context-based-transaction-management.md) - Go context for transaction propagation +- 🧾 [ADR 0006: Dual-Scope Rate Limiting Strategy](adr/0006-dual-scope-rate-limiting-strategy.md) - Per-client and per-IP rate limiting +- 🧾 [ADR 0007: Path-Based API Versioning](adr/0007-path-based-api-versioning.md) - `/v1/*` API versioning strategy +- 🧾 [ADR 0008: Gin Web Framework with Custom Middleware](adr/0008-gin-web-framework-with-custom-middleware.md) - HTTP framework and middleware strategy +- 🧾 [ADR 0009: UUIDv7 for Identifiers](adr/0009-uuidv7-for-identifiers.md) - Time-ordered UUID 
strategy for database IDs +- 🧾 [ADR 0010: Argon2id for Client Secret Hashing](adr/0010-argon2id-for-client-secret-hashing.md) - Memory-hard password hashing algorithm ## πŸ–₯️ Supported Platforms @@ -133,7 +134,7 @@ OpenAPI scope note: ## πŸ’‘ Practical Examples -- 🧭 [examples/versioned-by-release.md](examples/versioned-by-release.md) +- 🧭 [examples/README.md](examples/README.md) - Code examples overview and version compatibility - πŸ§ͺ [examples/curl.md](examples/curl.md) - 🐍 [examples/python.md](examples/python.md) - 🟨 [examples/javascript.md](examples/javascript.md) @@ -147,5 +148,5 @@ Secrets is inspired by HashiCorp Vault, but it is much simpler and intentionally - [Docker getting started](getting-started/docker.md) - [Architecture](concepts/architecture.md) -- [Authentication API](api/authentication.md) -- [Production operations](operations/production.md) +- [Authentication API](api/auth/authentication.md) +- [Production operations](operations/deployment/production.md) diff --git a/docs/adr/0001-envelope-encryption-model.md b/docs/adr/0001-envelope-encryption-model.md index f88fd54..d000f47 100644 --- a/docs/adr/0001-envelope-encryption-model.md +++ b/docs/adr/0001-envelope-encryption-model.md @@ -27,4 +27,5 @@ Use envelope encryption hierarchy: - [Architecture](../concepts/architecture.md) - [Security model](../concepts/security-model.md) -- [Key management operations](../operations/key-management.md) +- [Key management operations](../operations/kms/key-management.md) +- [ADR 0004: Dual Database Support](0004-dual-database-support.md) - Database storage for encrypted key material diff --git a/docs/adr/0002-transit-versioned-ciphertext-contract.md b/docs/adr/0002-transit-versioned-ciphertext-contract.md index 67f057c..c11ce53 100644 --- a/docs/adr/0002-transit-versioned-ciphertext-contract.md +++ b/docs/adr/0002-transit-versioned-ciphertext-contract.md @@ -25,6 +25,7 @@ Adopt transit ciphertext contract: ## See also -- [Transit API](../api/transit.md) -- 
[Response shapes](../api/response-shapes.md) +- [Transit API](../api/data/transit.md) +- [Response shapes](../api/observability/response-shapes.md) - [Troubleshooting](../getting-started/troubleshooting.md) +- [ADR 0007: Path-Based API Versioning](0007-path-based-api-versioning.md) - API versioning context for ciphertext format stability diff --git a/docs/adr/0003-capability-based-authorization-model.md b/docs/adr/0003-capability-based-authorization-model.md new file mode 100644 index 0000000..6bee353 --- /dev/null +++ b/docs/adr/0003-capability-based-authorization-model.md @@ -0,0 +1,112 @@ +# ADR 0003: Capability-Based Authorization Model + +> Status: accepted +> Date: 2026-02-11 + +## Context + +The system requires fine-grained access control to protect cryptographic operations and sensitive data. Traditional authorization models present several challenges: + +- **Security requirement**: Separate encrypt/decrypt permissions to support split-role workflows (e.g., a backup service can encrypt but never decrypt data) +- **Operational flexibility**: Different clients need different levels of access to specific resource paths +- **Complexity constraint**: Avoid over-engineering authorization for a pre-1.0 system while maintaining security +- **Path-specific control**: Authorization decisions must consider both the operation type and the specific resource path + +## Decision + +Adopt a capability-based authorization model with the following characteristics: + +**Six capabilities:** + +- `read` - List or inspect metadata/state without decrypting payload values +- `write` - Create or update non-cryptographic resources and key definitions +- `delete` - Delete resources or revoke token lifecycle entries +- `encrypt` - Create encrypted outputs (secrets writes, transit encrypt, tokenization tokenize) +- `decrypt` - Resolve encrypted/tokenized values back to plaintext +- `rotate` - Create new key versions + +**Policy evaluation:** + +- Per-client policy documents attached to 
authentication clients +- Authorization check via `Client.IsAllowed(path, capability)` method +- Policies evaluated on every authenticated request after successful authentication + +**Path matching semantics:** + +- Exact match: No wildcard means full exact match (`/v1/audit-logs` matches only `/v1/audit-logs`) +- Full wildcard: `*` matches any request path +- Trailing wildcard: `prefix/*` matches any path starting with `prefix/` (greedy for deeper paths) +- Mid-path wildcard: `/v1/keys/*/rotate` matches paths with `*` as exactly one segment + +**Examples:** + +- `/v1/secrets/*` matches `/v1/secrets/app`, `/v1/secrets/app/db`, and `/v1/secrets/app/db/password` +- `/v1/transit/keys/*/rotate` matches `/v1/transit/keys/payment/rotate` +- `/v1/*/keys/*/rotate` matches `/v1/transit/keys/payment/rotate` + +## Alternatives Considered + +### 1. Role-Based Access Control (RBAC) + +Predefined roles like "admin", "operator", "viewer" with fixed permission sets. + +**Rejected because:** + +- Too coarse-grained for cryptographic operations (can't separate encrypt/decrypt within a role) +- No support for path-specific permissions (admin has all access, viewer has read-only everywhere) +- Difficult to model split-role security requirements (backup service needs encrypt-only) +- Role proliferation problem (would need many roles: "read-only-secrets", "encrypt-only-transit", etc.) + +### 2. Attribute-Based Access Control (ABAC) + +Complex attribute evaluation with conditions like `user.department == "finance" AND resource.environment == "production"`. + +**Rejected because:** + +- Over-engineered for pre-1.0 system requirements +- Higher implementation complexity and maintenance burden +- Steeper learning curve for operators authoring policies +- Performance overhead from complex condition evaluation +- Current requirements satisfied by simpler capability + path model + +### 3. 
Access Control Lists (ACLs) Per Resource + +Each secret/key/resource has its own ACL defining who can access it. + +**Rejected because:** + +- Management complexity (policies scattered across many resources) +- No centralized view of a client's permissions +- Difficult to audit "what can this client access?" +- Harder to implement (requires ACL storage and evaluation per resource) + +## Consequences + +**Benefits:** + +- **Simpler than ABAC**: Easier to understand and implement +- **More flexible than RBAC**: Supports path-specific and operation-specific permissions +- **Security flexibility**: Enables split-role workflows (encrypt-only, decrypt-only services) +- **Clear audit trail**: Policy evaluation logged per request for forensic analysis +- **Performance acceptable**: Per-request evaluation with simple pattern matching (no database lookups) + +**Trade-offs:** + +- **Policy authoring complexity**: Operators must understand path matching semantics (wildcards, exact match) +- **No hierarchical roles**: Can't define "base operator" role and inherit from it +- **Per-request evaluation**: Authorization check on every request (acceptable overhead for current scale) +- **Security training required**: Teams must understand wildcard implications (e.g., `*` grants full access) +- **No policy composition**: Can't combine multiple policy documents (single policy document per client) + +**Future considerations:** + +- Could add policy conditions (time windows, IP restrictions) without changing core capability model +- Could add policy inheritance or composition if complexity becomes warranted +- Policy evaluation performance can be optimized with caching if needed at scale + +## See also + +- [Policies cookbook](../api/auth/policies.md) +- [Capability matrix](../api/fundamentals.md#capability-matrix) +- [Security model](../concepts/security-model.md) +- [ADR 0007: Path-Based API Versioning](0007-path-based-api-versioning.md) diff --git 
a/docs/adr/0004-dual-database-support.md b/docs/adr/0004-dual-database-support.md new file mode 100644 index 0000000..cda1da1 --- /dev/null +++ b/docs/adr/0004-dual-database-support.md @@ -0,0 +1,132 @@ +# ADR 0004: Dual Database Support + +> Status: accepted +> Date: 2026-01-31 + +## Context + +The system must support diverse deployment environments with different database preferences and constraints: + +- **Customer requirements**: Organizations have existing database infrastructure and expertise (PostgreSQL or MySQL) +- **Operational constraints**: Teams may have existing database monitoring, backup, and operational tooling specific to one database +- **Cloud provider considerations**: Different cloud providers offer better support for different databases (AWS RDS, Google Cloud SQL, Azure Database) +- **Migration challenges**: Some teams cannot easily switch databases due to compliance, licensing, or operational constraints +- **Licensing preferences**: PostgreSQL (fully open source) vs MySQL (dual licensing, Oracle ownership concerns) + +## Decision + +Support both PostgreSQL 12+ and MySQL 8.0+ with parallel repository implementations: + +**Repository structure:** + +- Each domain has database-specific repository files: `postgresql_*.go` and `mysql_*.go` +- Single repository interface per domain (e.g., `KekRepository`, `SecretRepository`) +- Factory pattern selects correct implementation based on `DB_DRIVER` configuration + +**Transaction pattern:** + +- Unified transaction management using `database.GetTx(ctx, db)` works with both databases +- No database-specific transaction handling at use case layer + +**Migration management:** + +- Separate migration files per database: `migrations/postgres/*` and `migrations/mysql/*` +- Migration tool supports both databases (golang-migrate) +- Each migration must be authored twice (SQL syntax differences) + +**Testing strategy:** + +- Integration test suite runs against both databases in CI +- Every repository test 
executes twice (once per database) +- Ensures behavioral parity across databases + +## Alternatives Considered + +### 1. PostgreSQL Only + +Support only PostgreSQL to simplify implementation and maintenance. + +**Rejected because:** + +- Customer feedback indicated hard requirement for MySQL in specific deployment scenarios +- Would limit adoption in organizations with standardized MySQL infrastructure +- MySQL expertise more common in some operational teams +- Would force migration for teams with existing MySQL deployments + +### 2. ORM Abstraction (GORM) + +Use an ORM like GORM to abstract database differences. + +**Rejected because:** + +- Performance overhead from ORM layer (reflection, query building) +- Less control over SQL optimization and query plans +- Additional dependency and version management complexity +- GORM-specific bugs and behaviors (leaky abstraction) +- Complex queries still require raw SQL or database-specific features + +### 3. Query Builder (squirrel, goqu) + +Use programmatic SQL generation to abstract database differences. + +**Rejected because:** + +- Still requires database-specific handling for type differences (BYTEA vs BLOB, RETURNING vs dual query) +- Adds abstraction layer complexity without eliminating dual implementation +- Learning curve for query builder syntax +- Less readable than raw SQL for complex queries + +### 4. Database-Agnostic SQL Subset + +Constrain all SQL to features common to both databases. 
+ +**Partially adopted:** + +- We do constrain to common SQL subset (standard SELECT, INSERT, UPDATE, DELETE) +- Accept maintenance cost of dual implementation for broader compatibility + +## Consequences + +**Costs:** + +- **Doubles maintenance burden**: Every repository change requires updates to both `postgresql_*.go` and `mysql_*.go` +- **Constrains SQL features**: Cannot use PostgreSQL-specific features: + - `RETURNING *` clause (must do separate SELECT in MySQL) + - Array types + - JSONB operators and indexing + - Advanced window functions +- **Migration complexity**: + - Separate migration files with different SQL syntax + - Type mapping differences (BYTEA vs BLOB, TEXT vs LONGTEXT) + - Different default behaviors (auto_increment vs SERIAL) +- **Test complexity**: Integration tests run twice, doubling CI time +- **Future database cost**: Adding third database (SQLite, CockroachDB) would triple repository implementations + +**Benefits:** + +- **Broader adoption**: Organizations can use existing database infrastructure +- **Operational familiarity**: Teams use databases they already know and monitor +- **Cloud flexibility**: Choose database based on cloud provider strengths +- **Migration path**: Teams can switch databases without application changes +- **Licensing options**: PostgreSQL for fully open source, MySQL for Oracle ecosystem + +**Implementation notes:** + +- Raw SQL in repositories (no ORM) for maximum control and performance +- Database-specific optimizations possible where needed +- Common interface ensures use case layer is database-agnostic +- Factory pattern makes database selection transparent to business logic + +**Future considerations:** + +- If third database needed, reconsider ORM or query builder abstraction +- Could implement database-specific optimization branches (e.g., bulk inserts) +- Monitor for SQL feature divergence requiring major refactoring + +## See also + +- [Configuration](../configuration.md#db_driver) +- [Local 
development](../getting-started/local-development.md) +- [Production deployment](../operations/deployment/production.md) +- [ADR 0001: Envelope Encryption Model](0001-envelope-encryption-model.md) +- [ADR 0005: Context-Based Transaction Management](0005-context-based-transaction-management.md) diff --git a/docs/adr/0005-context-based-transaction-management.md b/docs/adr/0005-context-based-transaction-management.md new file mode 100644 index 0000000..0ba89c8 --- /dev/null +++ b/docs/adr/0005-context-based-transaction-management.md @@ -0,0 +1,147 @@ +# ADR 0005: Context-Based Transaction Management + +> Status: accepted +> Date: 2026-01-29 + +## Context + +The system requires atomic multi-step database operations to maintain consistency: + +- **Atomic operations needed**: KEK rotation (update old KEK + create new KEK), client updates (update client + update policies) +- **Clean Architecture principle**: Repository layer should not control transaction boundaries (use cases orchestrate business logic) +- **Transparency requirement**: Repositories should work identically whether called within or outside a transaction +- **Database abstraction**: Transaction pattern must work with both PostgreSQL and MySQL +- **Error handling**: Automatic rollback on errors, commit on success + +## Decision + +Adopt context-based transaction propagation with the following pattern: + +**Transaction storage:** + +- Store active transaction in context using `context.WithValue(ctx, txKey{}, tx)` +- Use unexported `txKey{}` type to prevent external packages from accessing transaction directly + +**Repository pattern:** + +- Repositories call `database.GetTx(ctx, db)` to get either: + - Active transaction from context (if present) + - Database connection (if no transaction active) +- Single repository method signature works for both transactional and non-transactional calls + +**Use case coordination:** + +- Use cases coordinate transactions via `TxManager.WithTx(ctx, fn)` interface +- 
Transaction manager handles begin, commit, rollback automatically +- Automatic rollback on any error within transaction function +- No nested transaction support (single transaction per context) + +**Example usage:** + +```go +// Use case orchestrates transaction +return k.txManager.WithTx(ctx, func(ctx context.Context) error { + if err := k.kekRepo.Update(ctx, oldKek); err != nil { + return err // Automatic rollback + } + return k.kekRepo.Create(ctx, newKek) // Commit on success +}) + +// Repository transparently uses transaction or DB +func (r *Repository) Create(ctx context.Context, kek *Kek) error { + querier := database.GetTx(ctx, r.db) // Gets tx if active, else db + _, err := querier.ExecContext(ctx, query, args...) + return err +} +``` + +## Alternatives Considered + +### 1. Explicit Transaction Passing + +Pass transaction explicitly as parameter: `repository.Create(tx *sql.Tx, kek *Kek)`. + +**Rejected because:** + +- Forces dual signatures: methods need both `*sql.DB` and `*sql.Tx` versions +- Repositories have two versions of every method: `Create()` and `CreateTx()` +- Use case code becomes verbose (must check if transaction needed) +- Violates DRY principle (duplicate implementation) +- Harder to refactor (add/remove transaction changes all call sites) + +### 2. Transaction in Repository Layer + +Repositories call `db.Begin()` internally when transaction needed. + +**Rejected because:** + +- Violates Clean Architecture (repositories should not decide transaction boundaries) +- Business logic (which operations are atomic) leaks into repository layer +- Cannot compose multiple repository calls in single transaction from use case +- Hard to test transactional behavior in isolation +- Repository layer has too much responsibility + +### 3. Unit of Work Pattern + +Explicit transaction object passed around and accumulated changes. 
+ +**Rejected because:** + +- More verbose than context-based approach (similar outcome with more boilerplate) +- Requires explicit `unitOfWork.Begin()`, `unitOfWork.Commit()` calls +- Still needs some form of propagation (context or explicit parameter) +- Added complexity without significant benefit over context pattern + +### 4. No Transactions - Application-Level Idempotency + +Rely on idempotency and retries instead of database transactions. + +**Rejected because:** + +- Unacceptable for financial and cryptographic operations requiring strict consistency +- Key rotation errors could leave system in inconsistent state +- Complex to implement correctly (requires careful state machine design) +- Performance overhead from retry logic +- Database transactions are proven, reliable primitive + +## Consequences + +**Benefits:** + +- **Simple repository signatures**: Single method works for both transactional and non-transactional calls +- **No performance overhead**: Direct DB connection when transaction not needed +- **Clean Architecture compliance**: Use cases control transaction boundaries, repositories participate +- **Easy testing**: Repositories can be tested without transaction complexity +- **Automatic rollback**: Error handling simplified (any error triggers rollback) +- **Database agnostic**: Works identically with PostgreSQL and MySQL + +**Trade-offs:** + +- **Context pollution concern**: Using context for dependency injection (not just cancellation/deadlines) + - Acceptable trade-off: Go community precedent (database/sql uses context for cancellation) + - Transaction is request-scoped, fits context lifetime model +- **Implicit behavior**: Active transaction not visible in function signature + - Mitigated by: Clear naming (`WithTx`), documentation, code review practices +- **Debugging difficulty**: Transaction state requires context inspection + - Mitigated by: Logging at transaction boundaries, clear error messages + +**Limitations:** + +- **No nested 
transactions**: Context holds single transaction, no savepoint support + - Acceptable: Use cases designed to avoid nested transaction needs + - Could add savepoint support in future if needed +- **No transaction isolation control**: Uses database default isolation level + - Acceptable: Default isolation sufficient for current use cases + - Could add isolation level parameter to `WithTx()` if needed + +**Future considerations:** + +- Could add savepoint support for nested transaction semantics +- Could add transaction isolation level configuration +- Could add transaction timeout configuration +- Monitor for context pollution becoming problematic (no issues so far) + +## See also + +- [Architecture concepts](../concepts/architecture.md) +- [ADR 0004: Dual Database Support](0004-dual-database-support.md) diff --git a/docs/adr/0006-dual-scope-rate-limiting-strategy.md b/docs/adr/0006-dual-scope-rate-limiting-strategy.md new file mode 100644 index 0000000..5528c68 --- /dev/null +++ b/docs/adr/0006-dual-scope-rate-limiting-strategy.md @@ -0,0 +1,183 @@ +# ADR 0006: Dual-Scope Rate Limiting Strategy + +> Status: accepted +> Date: 2026-02-19 + +## Context + +The system must protect against abuse, denial-of-service attacks, and credential stuffing while maintaining fair resource allocation: + +- **Abuse protection**: Prevent malicious actors from overwhelming the system with requests +- **DoS mitigation**: Protect server resources from exhaustion by limiting request rates +- **Different threat models**: Authenticated endpoints vs unauthenticated endpoints face different attack vectors +- **Credential stuffing risk**: Token issuance endpoint vulnerable to brute-force credential attacks +- **Per-client fairness**: Prevent one authenticated client from monopolizing server resources +- **Operational simplicity**: Avoid external dependencies (Redis, API gateway) for pre-1.0 deployment + +## Decision + +Implement dual-scope rate limiting with different strategies for 
authenticated vs unauthenticated endpoints: + +### Authenticated Endpoints (Per-Client Rate Limiting) + +**Scope**: Each authenticated client gets an independent rate limiter identified by client ID. + +**Configuration**: + +- `RATE_LIMIT_ENABLED` (default: true) +- `RATE_LIMIT_REQUESTS_PER_SEC` (default: 10.0) +- `RATE_LIMIT_BURST` (default: 20) + +**Behavior**: + +- Requires `AuthenticationMiddleware` (must run after authentication) +- Token bucket algorithm from `golang.org/x/time/rate` library +- Separate bucket per client ID (extracted from authenticated context) +- Automatic cleanup of stale limiters after 1 hour of inactivity + +**Protected routes**: + +- `/v1/clients/*` +- `/v1/audit-logs` +- `/v1/secrets/*` +- `/v1/transit/*` +- `/v1/tokenization/*` + +### Unauthenticated Token Endpoint (Per-IP Rate Limiting) + +**Scope**: Each client IP address gets an independent rate limiter. + +**Configuration**: + +- `RATE_LIMIT_TOKEN_ENABLED` (default: true) +- `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC` (default: 5.0) +- `RATE_LIMIT_TOKEN_BURST` (default: 10) + +**Behavior**: + +- Applied to `POST /v1/token` (unauthenticated endpoint) +- Token bucket algorithm from `golang.org/x/time/rate` library +- Separate bucket per client IP (extracted via `c.ClientIP()`) +- IP detection handles `X-Forwarded-For`, `X-Real-IP`, and direct connection +- Stricter default limits than authenticated endpoints (credential stuffing mitigation) +- Automatic cleanup of stale limiters after 1 hour of inactivity + +**Response behavior** (both scopes): + +- Status: `429 Too Many Requests` +- Header: `Retry-After: <seconds>` +- JSON body: `{"error": "rate_limit_exceeded", "message": "..."}` + +## Alternatives Considered + +### 1. Global Rate Limit + +Single shared rate limit across all clients/IPs.
+ +**Rejected because:** + +- Noisy neighbor problem: one misbehaving client affects all legitimate clients +- Cannot differentiate between high-volume legitimate users and attackers +- Unfair resource allocation (first come, first served) +- No per-client or per-IP isolation + +### 2. Redis-Based Distributed Rate Limiting + +External Redis store for rate limit state shared across server instances. + +**Rejected because:** + +- Adds operational dependency (Redis must be deployed, monitored, backed up) +- Additional latency for every request (Redis round trip) +- Increased complexity (connection pooling, failover, retry logic) +- Not needed for pre-1.0 (single instance deployment acceptable) +- Trade-off: Cannot share rate limit state across multiple instances (acceptable for current scale) + +### 3. API Gateway Rate Limiting + +Offload rate limiting to external API gateway (Kong, NGINX, AWS API Gateway). + +**Rejected because:** + +- Requires additional infrastructure and deployment complexity +- Reduces deployment simplicity (goal: single binary + database) +- Splits configuration between application and gateway +- Still need application-level rate limiting for business logic control +- Acceptable for future scale, not needed now + +### 4. IP-Only Rate Limiting (Including Authenticated Endpoints) + +Use single IP-based mechanism for all endpoints. + +**Rejected because:** + +- Shared NATs/proxies would unfairly throttle legitimate users behind same IP +- Corporate networks, cloud NATs, and residential ISPs share IPs across many users +- Cannot provide per-client fairness for authenticated API usage +- Credential stuffing protection still needed (addressed by token endpoint IP limiting) + +### 5. Client ID-Only Rate Limiting (Including Token Endpoint) + +Use single client-based mechanism for all endpoints. 
+ +**Rejected because:** + +- Token endpoint is unauthenticated (no client ID available yet) +- Attacker can try many client credentials without rate limit +- Credential stuffing attacks would be unrestricted + +## Consequences + +**Benefits:** + +- **Simple implementation**: In-process rate limiting, no external dependencies +- **Low latency**: No external service calls, direct memory access +- **Per-client fairness**: Authenticated clients cannot affect each other's rate limits +- **Credential stuffing protection**: IP-based limiting protects unauthenticated token endpoint +- **Operational simplicity**: No Redis, no API gateway, no additional infrastructure +- **Configurable limits**: Operators can tune limits per deployment environment + +**Trade-offs and Limitations:** + +- **In-process state**: Rate limiter state lost on server restart + - Impact: Fresh rate limit buckets after deployment (acceptable, temporary burst allowed) + - Mitigation: Graceful shutdown drains existing requests before restart + +- **Memory growth**: Limiter map grows with unique clients/IPs + - Impact: Memory usage increases with client/IP diversity + - Mitigation: Automatic cleanup after 1 hour of inactivity + - Acceptable: Typical deployments have bounded client/IP counts + +- **IP-based limitations for token endpoint**: + - **Shared IPs (NAT/proxies)**: Multiple legitimate users behind same corporate NAT or ISP may share IP, hitting limit together + - **X-Forwarded-For spoofing**: Attacker could rotate IPs in header if reverse proxy not properly configured + - Mitigations: + - Reasonable burst capacity (10 requests) handles legitimate retries + - Can disable via `RATE_LIMIT_TOKEN_ENABLED=false` if IP limiting problematic + - Configure Gin's trusted proxy settings in production deployments + - Deploy behind properly configured reverse proxy/load balancer + +- **No cross-instance coordination**: Each server instance has independent rate limiters + - Impact: Rate limits are per-instance, 
not globally enforced + - Acceptable: Pre-1.0 deployments typically run single instance + - Future: Could add Redis-based distributed rate limiting if multi-instance needed + +**Security considerations:** + +- Token endpoint uses stricter limits (5 req/sec vs 10 req/sec) to protect against credential attacks +- Burst capacity allows legitimate retry behavior while limiting sustained abuse +- Rate limit metrics exposed via Prometheus for monitoring and alerting +- `429` responses logged for security audit and attack detection + +**Future enhancements:** + +- Could add Redis-based distributed rate limiting for multi-instance deployments +- Could add adaptive rate limiting based on system load +- Could add allowlist/blocklist for specific IPs or clients +- Could add custom rate limits per client ID (database-stored configuration) + +## See also + +- [Rate limiting fundamentals](../api/fundamentals.md#rate-limiting) +- [Monitoring rate limiting metrics](../operations/observability/monitoring.md#rate-limiting-observability-queries) +- [Configuration](../configuration.md) diff --git a/docs/adr/0007-path-based-api-versioning.md b/docs/adr/0007-path-based-api-versioning.md new file mode 100644 index 0000000..8811963 --- /dev/null +++ b/docs/adr/0007-path-based-api-versioning.md @@ -0,0 +1,169 @@ +# ADR 0007: Path-Based API Versioning + +> Status: accepted +> Date: 2026-02-12 + +## Context + +The system requires a versioning strategy to support API evolution while maintaining stability for consumers: + +- **Stability requirement**: Consumers need backward compatibility guarantees within an API version +- **Breaking changes**: Cryptographic APIs may require breaking changes (algorithm upgrades, security improvements, schema changes) +- **Deployment constraint**: Same server binary may need to serve multiple API versions simultaneously during migration periods +- **Developer experience**: API version should be immediately visible and discoverable in documentation and examples 
+- **Migration support**: Clear migration path for consumers when breaking changes are introduced +- **Long-lived integrations**: Financial and cryptographic integrations often run for years without updates + +## Decision + +Adopt URL path-based versioning using `/v1/*` prefix for all API endpoints: + +**Route structure:** + +- All API endpoints under `/v1/*` prefix (e.g., `/v1/secrets/*`, `/v1/transit/keys/:name/encrypt`) +- Health/metrics endpoints outside versioning: `/health`, `/ready`, `/metrics` (not `/v1/health`) +- Future breaking changes require new version path: `/v2/*` + +**Contract definition:** + +- OpenAPI specification at `docs/openapi.yaml` defines v1 contract baseline +- Endpoint documentation in `docs/api/*.md` defines full public behavior +- Breaking changes documented in `releases/RELEASES.md` with migration guides + +**Version independence:** + +- No version in headers (no `Accept: application/vnd.secrets.v1+json`) +- No version in query parameters (no `?version=1` or `?api-version=v1`) +- Version specified solely in URL path + +**Coexistence strategy:** + +- Multiple versions can coexist: `/v1/*` and `/v2/*` routes registered simultaneously +- Gradual migration: consumers transition at their own pace +- Deprecation timeline: old versions maintained until safe to remove (based on consumer usage metrics) + +## Alternatives Considered + +### 1. Header-Based Versioning + +Version specified in request headers: `Accept: application/vnd.secrets.v1+json` or `X-API-Version: v1`. + +**Rejected because:** + +- Less visible in browser/curl (requires inspecting request headers) +- Not discoverable from URL alone (must read documentation to know header format) +- Harder to cache at CDN/proxy level (varies by header) +- More complex routing configuration (must inspect headers, not just path) +- Developer experience suffers (copy-paste URL doesn't include version) + +### 2. 
Query Parameter Versioning + +Version specified in query string: `?version=1` or `?api-version=v1`. + +**Rejected because:** + +- Harder to route at API gateway/proxy level (requires query param inspection) +- Query parameters typically used for filtering/pagination, not versioning +- URL caching complexity (query params affect cache keys differently) +- Inconsistent with REST best practices (version is not a filter) +- Easy to forget in code (URLs work without version, fail unexpectedly) + +### 3. Subdomain Versioning + +Version in subdomain: `v1.api.example.com` vs `v2.api.example.com`. + +**Rejected because:** + +- DNS configuration complexity (must manage multiple DNS entries) +- TLS certificate management overhead (wildcard cert or multiple certs) +- Deployment complexity (routing traffic to correct version per subdomain) +- Higher operational burden for pre-1.0 system +- Acceptable for large-scale APIs, over-engineered for current needs + +### 4. No Versioning + +Single evolving API with deprecation warnings for old behavior. + +**Rejected because:** + +- Unacceptable for cryptographic API (breaking changes too risky) +- No clear contract boundary for consumers +- Cannot safely remove deprecated features (no version to sunset) +- Financial integrations require stability guarantees +- Migration path unclear (when is it safe to remove deprecated behavior?) 
+ +## Consequences + +**Benefits:** + +- **URL clarity**: API version immediately visible in every request +- **Developer experience**: Copy-paste examples work, no hidden header configuration +- **Proxy/gateway friendly**: Easy to route by path prefix (`/v1/*` to v1 backend) +- **Documentation simplicity**: All examples show version in URL path +- **Caching friendly**: URL fully determines version, standard HTTP caching applies +- **Migration clarity**: `/v2/*` coexists with `/v1/*`, clear separation + +**Coexistence and migration:** + +- **Gradual rollout**: Deploy `/v2/*` routes alongside `/v1/*` +- **Consumer autonomy**: Clients migrate at their own pace (no forced upgrade) +- **Monitoring**: Track usage metrics per version to inform deprecation timeline +- **Deprecation process**: + 1. Announce deprecation timeline in release notes + 2. Monitor `/v1/*` usage metrics to identify remaining consumers + 3. Notify active consumers via support channels + 4. Remove deprecated version after safe sunset period (e.g., 6 months) + +**Breaking change process:** + +When breaking changes needed: + +1. **Implement** `/v2/*` endpoints with new behavior +2. **Document** migration guide in `docs/releases/vX.Y.Z-upgrade.md`: + - What changed (endpoint paths, request/response schemas, status codes) + - Migration examples (v1 request → v2 request) + - Deprecation timeline for v1 +3. **Update** `docs/openapi.yaml` with v2 contract +4. **Announce** in `releases/RELEASES.md` and release notes +5. **Monitor** `/v1/*` and `/v2/*` usage metrics +6.
**Remove** `/v1/*` after sunset period + +**Limitations:** + +- **No per-field versioning**: All fields in API version evolve together + - Cannot mix v1 and v2 fields in same request/response + - Acceptable: Simplifies implementation and consumer understanding +- **URL length**: Version prefix adds characters to URL + - Negligible impact: `/v1/` adds only 4 characters +- **Routing complexity**: Router must handle multiple version prefixes + - Acceptable: Gin route groups make this straightforward + +**Non-breaking changes:** + +These can be added to existing `/v1/*` without new version: + +- Adding optional request fields +- Adding new response fields (consumers must ignore unknown fields) +- Adding new endpoints under `/v1/*` +- Clarifying documentation without behavior changes + +**Breaking changes** (require `/v2/*`): + +- Changing endpoint paths or required path parameters +- Removing response fields or changing field meaning/type +- Changing required request fields or accepted formats +- Changing status code semantics for successful behavior + +**Future considerations:** + +- Could add version in response header for debugging: `X-API-Version: v1` +- Could implement automatic v1-to-v2 adapter middleware for common cases +- Could add version negotiation for advanced use cases (not needed now) + +## See also + +- [API versioning policy](../api/fundamentals.md#compatibility-and-versioning-policy) +- [Breaking vs non-breaking changes](../contributing.md#breaking-vs-non-breaking-docs-changes) +- [OpenAPI specification](../openapi.yaml) +- [ADR 0002: Transit Versioned Ciphertext Contract](0002-transit-versioned-ciphertext-contract.md) +- [ADR 0003: Capability-Based Authorization Model](0003-capability-based-authorization-model.md) diff --git a/docs/adr/0008-gin-web-framework-with-custom-middleware.md b/docs/adr/0008-gin-web-framework-with-custom-middleware.md new file mode 100644 index 0000000..a54a095 --- /dev/null +++ 
b/docs/adr/0008-gin-web-framework-with-custom-middleware.md @@ -0,0 +1,153 @@ +# ADR 0008: Gin Web Framework with Custom Middleware Strategy + +> Status: accepted +> Date: 2026-01-29 + +## Context + +The application requires an HTTP server for REST API endpoints with the following needs: + +- High-performance request routing with path parameters and route groups +- Middleware chain support for cross-cutting concerns (logging, authentication, rate limiting) +- Request/response binding and validation +- Custom error handling and response formatting +- Production-ready timeouts and graceful shutdown +- Integration with structured logging (slog) instead of default logging +- Request ID tracking for distributed tracing + +Standard library `net/http` provides basic routing but lacks ergonomic path parameters, route grouping, and middleware chaining. Full-featured frameworks like Echo, Fiber, or Chi offer different trade-offs in performance, API design, and ecosystem maturity. + +## Decision + +Adopt **Gin v1.11.0** as the web framework with a **custom middleware strategy** that bypasses Gin's default middleware: + +**Core choices:** + +- Use `gin.New()` instead of `gin.Default()` so that the default middleware (Logger, Recovery) is not installed implicitly +- Manually configure `http.Server` with explicit timeouts: + - `ReadTimeout: 15s` + - `WriteTimeout: 15s` + - `IdleTimeout: 60s` +- Replace Gin's default logger with custom slog-based middleware +- Explicitly register Gin's Recovery middleware and add custom middleware for: + - Request ID tracking (UUIDv7 via `gin-contrib/requestid`) + - Custom structured logging (slog) + - Authentication (Bearer token validation) + - Authorization (capability-based access control) + - Rate limiting (dual-scope: per-client and per-IP) + - Audit logging (authorization attempts) +- Auto-configure Gin mode from `LOG_LEVEL` environment variable (debug → `gin.DebugMode`, else `gin.ReleaseMode`) + +**Middleware execution order:** + +1. `gin.Recovery()` - Panic recovery +2.
`requestid.New()` - UUIDv7 request ID generation +3. `CustomLoggerMiddleware()` - slog-based HTTP request logging +4. `AuditLogMiddleware()` - Audit log persistence (route group level) +5. `AuthenticationMiddleware()` - Bearer token validation (route-specific) +6. `AuthorizationMiddleware()` - Capability enforcement (route-specific) +7. `RateLimitMiddleware()` - Per-client or per-IP throttling (route-specific) +8. Handler + +**Route organization:** + +- Health/readiness endpoints outside API versioning: `/health`, `/ready`, `/metrics` +- Versioned API routes under `/v1/*` using route groups +- Per-endpoint middleware chaining for fine-grained control + +## Alternatives Considered + +### 1. Echo Framework + +Popular alternative with similar performance characteristics. + +**Rejected because:** + +- Less mature ecosystem for middleware (no official `gin-contrib` equivalent) +- Different context abstraction (`echo.Context` vs `gin.Context`) less familiar to team +- Smaller community compared to Gin (fewer third-party middleware packages) +- No significant performance advantage over Gin for our workload + +### 2. Fiber Framework + +Express.js-inspired framework built on fasthttp. + +**Rejected because:** + +- Uses fasthttp instead of `net/http` (incompatible with standard middleware ecosystem) +- More opinionated API design conflicts with Clean Architecture boundaries +- Migration complexity if we need to switch back to `net/http` standard library +- Performance gains not justified for cryptographic workload (CPU-bound, not I/O-bound) + +### 3. Chi Router + +Lightweight router built on `net/http` with middleware support. + +**Rejected because:** + +- More verbose route parameter extraction compared to Gin (`chi.URLParam(r, "id")` vs `c.Param("id")`) +- No built-in request/response binding (requires manual JSON marshaling) +- No built-in validation framework integration +- Less ergonomic for rapid API development while maintaining Clean Architecture + +### 4. 
Standard Library (`net/http` + `http.ServeMux`) + +Minimal dependencies with Go 1.22+ improved routing. + +**Rejected because:** + +- Lacks route groups for middleware scoping (all middleware must be global or per-handler) +- No built-in path parameter support before Go 1.22, limited after +- No request/response binding helpers (increases boilerplate in handlers) +- No middleware chaining abstraction (manual wrapper functions required) +- Development velocity trade-off not justified for a pre-1.0 project + +### 5. Use Gin's Default Middleware (`gin.Default()`) + +Simpler setup with built-in Logger and Recovery. + +**Rejected because:** + +- Default logger outputs plain text, incompatible with structured logging (slog) +- No request ID tracking (essential for distributed tracing and incident correlation) +- Cannot customize log format to include custom fields (client ID, capability, path) +- Default logger not production-ready (no JSON output, no log level control) + +## Consequences + +**Benefits:** + +- **High development velocity**: Ergonomic API with minimal boilerplate for route parameters, JSON binding, and validation +- **Custom observability**: Full control over logging format, request ID propagation, and audit trails +- **Fine-grained middleware control**: Per-route middleware application (auth on protected routes, rate limiting per scope) +- **Production-ready defaults**: Explicit timeout configuration prevents resource exhaustion +- **Ecosystem compatibility**: Large middleware ecosystem (`gin-contrib/*`) and community support +- **Testability**: `gin.CreateTestContext()` simplifies handler unit testing +- **Clean Architecture compatibility**: `gin.Context` isolated to HTTP layer, domain/use case layers remain framework-agnostic + +**Trade-offs:** + +- **Framework dependency**: Vendor lock-in to Gin's routing and context abstraction (migration requires rewriting HTTP layer) +- **Middleware maintenance**: Custom middleware increases code surface area (vs 
using default middleware) +- **Learning curve**: Team must understand Gin-specific patterns (middleware chain, context usage, route groups) + +**Limitations:** + +- **No built-in OpenAPI generation**: Must maintain `docs/openapi.yaml` manually (acceptable for pre-1.0 with stable API) +- **No HTTP/2 server push**: Not supported by Gin's `http.Server` wrapper (not needed for REST API) +- **Context-bound data**: Request-scoped data (authenticated client, audit metadata) stored in `gin.Context` instead of Go `context.Context` (acceptable trade-off for ergonomics) + +**Future considerations:** + +- Monitor Gin maintenance activity (stable v1.x releases, active issue triage) +- Evaluate OpenAPI code generation tools if manual maintenance becomes burden +- Consider migration path to standard library if Go routing improves significantly +- Could adopt `net/http` middleware adapters if we need standard library compatibility + +## See also + +- [HTTP server implementation](../../internal/http/server.go) +- [Custom logger middleware](../../internal/http/middleware.go) +- [ADR 0009: UUIDv7 for Identifiers](0009-uuidv7-for-identifiers.md) - Request ID generation strategy +- [ADR 0006: Dual-Scope Rate Limiting Strategy](0006-dual-scope-rate-limiting-strategy.md) - Rate limiting middleware +- [API fundamentals](../api/fundamentals.md) diff --git a/docs/adr/0009-uuidv7-for-identifiers.md b/docs/adr/0009-uuidv7-for-identifiers.md new file mode 100644 index 0000000..6d4792c --- /dev/null +++ b/docs/adr/0009-uuidv7-for-identifiers.md @@ -0,0 +1,172 @@ +# ADR 0009: UUIDv7 for Identifiers + +> Status: accepted +> Date: 2026-01-29 + +## Context + +The application requires unique identifiers for database entities with the following needs: + +- Globally unique across distributed systems (no coordination required) +- Database-friendly for indexing and query performance +- Sortable for chronological ordering (audit logs, version history) +- Secure against enumeration attacks (unpredictable) 
+- Compatible with PostgreSQL `UUID` type and MySQL `BINARY(16)` storage +- Usable in HTTP APIs (URL-safe string representation) + +Identifier strategies include: + +- **Sequential integers**: Auto-increment IDs (database-generated) +- **UUIDv4**: Random 122-bit identifiers (no time ordering) +- **UUIDv7**: Time-ordered with random components (RFC 9562, 2024) +- **ULID**: Lexicographically sortable, Crockford Base32 encoded +- **Snowflake IDs**: Twitter's time-based 64-bit IDs with machine/sequence components + +Database performance matters: B-tree indexes perform poorly with random UUIDs (UUIDv4) due to page splits, but sequential IDs expose information leakage (entity count, creation rate). + +## Decision + +Adopt **UUIDv7** (RFC 9562) for all database entity identifiers: + +**Core characteristics:** + +- **48-bit timestamp**: Unix epoch milliseconds (provides natural time ordering) +- **12-bit random counter**: Sub-millisecond collision avoidance +- **62-bit randomness**: Cryptographically random bits for unpredictability +- **Total: 128 bits**, including the fixed 4-bit version and 2-bit variant fields (same as UUIDv4, compatible with UUID database types) + +**Usage pattern:** + +```go +import "github.com/google/uuid" + +id := uuid.Must(uuid.NewV7()) // All database entities +``` + +**Applied to:** + +- Database primary keys (clients, tokens, secrets, transit keys, tokenization keys, KEKs, DEKs, audit logs) +- Request IDs (HTTP middleware via `gin-contrib/requestid`) +- All domain entities requiring unique identification + +**Sorting behavior:** + +- UUIDv7 values are naturally sortable by time (ascending = oldest first) +- Eliminates need for separate `created_at` columns for chronological queries +- Enables efficient range scans: `WHERE id > $last_seen_id ORDER BY id LIMIT 100` + +**Storage:** + +- PostgreSQL: `UUID` type (16 bytes, native support) +- MySQL: `BINARY(16)` (requires manual conversion in repositories) + +## Alternatives Considered + +### 1.
UUIDv4 (Random) + +Standard random UUID with 122 bits of randomness. + +**Rejected because:** + +- **Poor index performance**: Random distribution causes B-tree page splits on every insert (write amplification) +- **No time ordering**: Cannot sort by ID to get chronological order (must add `created_at` column) +- **Wasted storage**: Requires separate timestamp columns for temporal queries +- **Fragmented indexes**: Database must constantly rebalance index pages (increased I/O) + +**Benchmark impact** (PostgreSQL): + +- UUIDv4 inserts: ~40% slower than UUIDv7 at scale (millions of rows) +- Index size: ~15-20% larger due to fragmentation + +### 2. Auto-Increment Sequential IDs + +Database-generated `SERIAL` (PostgreSQL) or `AUTO_INCREMENT` (MySQL). + +**Rejected because:** + +- **Information leakage**: Sequential IDs expose entity count and creation rate (security concern for API) +- **Coordination required**: Multi-region deployments need complex ID generation coordination +- **No global uniqueness**: Cannot merge data from multiple databases without ID collision +- **Migration complexity**: Changing ID space requires expensive table rewrites +- **Enumeration attacks**: Attackers can guess valid IDs (e.g., `/v1/secrets/1`, `/v1/secrets/2`, ...) + +### 3. ULID (Universally Unique Lexicographically Sortable Identifier) + +128-bit time-ordered IDs with Crockford Base32 encoding. + +**Rejected because:** + +- **Non-standard encoding**: Crockford Base32 incompatible with PostgreSQL `UUID` type (requires `TEXT` or custom type) +- **Storage overhead**: Text storage (26 characters) uses more space than binary UUID (16 bytes) +- **Ecosystem compatibility**: No native database support, requires custom conversion logic +- **Library maturity**: Less mature Go libraries compared to `google/uuid` (which has UUIDv7 support) + +### 4. Snowflake IDs + +64-bit time-based IDs with machine/datacenter/sequence components. 
+ +**Rejected because:** + +- **Machine coordination**: Requires unique machine IDs (complex in containerized/serverless environments) +- **64-bit limit**: Smaller than UUID (potential collision risk in high-throughput systems) +- **No UUID compatibility**: Cannot use PostgreSQL `UUID` type (requires `BIGINT`) +- **Clock skew sensitivity**: Requires NTP synchronization across machines (operational complexity) +- **Single point of failure**: Machine ID exhaustion or clock drift can cause outages + +### 5. UUIDv1 (Time-Based with MAC Address) + +Time-ordered UUID with MAC address component. + +**Rejected because:** + +- **Privacy leak**: Embeds machine MAC address (reveals infrastructure details) +- **Non-monotonic**: Time field has unusual byte ordering (not naturally sortable as bytes) +- **No randomness**: Sequence number only 14 bits (collision risk in high-throughput systems) +- **Security concern**: Predictable MAC address component aids reconnaissance attacks + +## Consequences + +**Benefits:** + +- **Index-friendly writes**: Time-ordered inserts minimize B-tree page splits (better write performance) +- **Chronological sorting**: IDs naturally sort by creation time (no separate `created_at` column needed for ordering) +- **Global uniqueness**: No coordination required across instances/regions +- **Enumeration resistance**: 62 bits of randomness prevent ID guessing attacks +- **Database compatibility**: Native `UUID` type support in PostgreSQL, `BINARY(16)` in MySQL +- **Standard compliance**: RFC 9562 specification (future-proof) +- **Pagination efficiency**: Can use `WHERE id > $cursor` for keyset pagination (more efficient than offset) +- **Audit log ordering**: Audit events naturally ordered by ID without timestamp sorting + +**Trade-offs:** + +- **Clock dependency**: Requires monotonic clock (mitigated by `time.Now()` in Go runtime) +- **Partial information leak**: Timestamp component reveals creation time to millisecond precision (acceptable for most use 
cases) +- **Not strictly monotonic**: Random bits mean IDs created in same millisecond are unordered (acceptable for business logic) + +**Limitations:** + +- **Millisecond precision**: Cannot distinguish order within same millisecond without additional sorting + - Mitigated by: 12-bit random counter provides sub-millisecond uniqueness + - Acceptable: Business logic does not require microsecond-level ordering +- **Time skew vulnerability**: Clock rollback could create IDs with earlier timestamps + - Mitigated by: NTP synchronization and monotonic clock in modern OS/container environments + - Acceptable: Crypto operations are not sensitive to millisecond-level time ordering + +**Performance characteristics:** + +- **Insert throughput**: ~60% faster than UUIDv4 at scale (millions of rows) +- **Index size**: ~15-20% smaller than UUIDv4 indexes (better cache locality) +- **Range scan efficiency**: Time-ordered layout improves sequential access patterns + +**Migration notes:** + +- Existing UUIDv4 values remain valid (no retroactive migration needed) +- New entities use UUIDv7 from implementation date forward +- Mixed UUID versions acceptable (version distinguishable by the 4-bit version field) + +## See also + +- [RFC 9562: Universally Unique IDentifiers (UUIDs)](https://www.rfc-editor.org/rfc/rfc9562.html) +- [google/uuid Go library](https://github.com/google/uuid) +- [Request ID middleware implementation](../../internal/http/middleware.go) +- [ADR 0008: Gin Web Framework with Custom Middleware](0008-gin-web-framework-with-custom-middleware.md) - Request ID usage diff --git a/docs/adr/0010-argon2id-for-client-secret-hashing.md b/docs/adr/0010-argon2id-for-client-secret-hashing.md new file mode 100644 index 0000000..1a2ff95 --- /dev/null +++ b/docs/adr/0010-argon2id-for-client-secret-hashing.md @@ -0,0 +1,205 @@ +# ADR 0010: Argon2id for Client Secret Hashing + +> Status: accepted +> Date: 2026-02-10 + +## Context + +The application requires secure password hashing for client
authentication credentials with the following needs: + +- **Resistance to offline attacks**: Protect hashed secrets if database is compromised +- **GPU/ASIC resistance**: Prevent brute-force attacks using specialized hardware +- **Configurable work factor**: Balance security and performance as hardware evolves +- **Standard compliance**: Use well-vetted algorithms recommended by security standards +- **Future-proof**: Algorithm should remain secure for 5+ years +- **Performance**: Hash verification must complete within acceptable latency (< 500ms) + +Client secrets are used for Bearer token issuance via `POST /v1/token` endpoint. If the database is compromised, attackers gain access to hashed secrets and can mount offline brute-force attacks without rate limiting. + +Key security considerations: + +- **Password strength**: Client secrets are 32-character random alphanumeric strings (generated, not user-chosen) +- **Attack vectors**: Offline brute-force (post-breach), rainbow tables, GPU/ASIC cracking +- **Threat model**: Attacker gains read access to database, attempts to recover plaintext secrets + +## Decision + +Adopt **Argon2id** with **PolicyModerate** configuration using the `go-pwdhash` library: + +**Algorithm choice:** + +- **Argon2id**: Hybrid mode combining Argon2i (side-channel resistant) and Argon2d (GPU-resistant) +- **Winner of Password Hashing Competition (PHC) 2015** +- **Recommended by OWASP** for password storage as of 2024 + +**Configuration (PolicyModerate):** + +```go +// via github.com/allisson/go-pwdhash v0.3.1 +hasher, _ := pwdhash.NewArgon2idHasher(pwdhash.PolicyModerate) + +// Internally uses: +// - Memory: 64 MiB (m=65536) +// - Iterations: 3 (t=3) +// - Parallelism: 4 threads (p=4) +// - Salt length: 16 bytes +// - Hash length: 32 bytes +``` + +**Output format (PHC string):** + +```text +$argon2id$v=19$m=65536,t=3,p=4$<salt>$<hash> +``` + +**Usage pattern:** + +```go +// Hash client secret on creation +hash, err := secretHasher.Hash(ctx, 
clientSecret) + +// Verify client secret on authentication +valid, err := secretHasher.Verify(ctx, clientSecret, storedHash) +``` + +**Policy rationale:** + +- **PolicyModerate** chosen over PolicyConservative/PolicyParanoid for balance: + - Security: 64 MiB memory requirement resists GPU attacks + - Performance: ~100-200ms verification time (acceptable for auth endpoint) + - Hardware: Works on constrained environments (containers with 512 MiB+ memory) + +## Alternatives Considered + +### 1. bcrypt + +Industry-standard algorithm based on Blowfish cipher. + +**Rejected because:** + +- **GPU-vulnerable**: Low memory usage (4 KiB) allows efficient GPU/ASIC implementation +- **Limited work factor**: Maximum cost factor 31 (2^31 iterations) may become insufficient +- **Fixed 24-byte output**: Cannot increase hash length for future quantum resistance +- **No memory hardness**: Attackers can parallelize on GPUs with minimal memory per thread +- **Aging algorithm**: Designed in 1999, shows weakness against modern hardware (GPUs with 10,000+ cores) + +**Performance comparison:** + +- bcrypt cost 12: ~300ms CPU time, 4 KiB memory +- Argon2id (Moderate): ~150ms CPU time, 64 MiB memory → **16,000x more memory required** + +### 2. scrypt + +Memory-hard key derivation function designed for hardware resistance. + +**Rejected because:** + +- **No side-channel resistance**: Vulnerable to cache-timing attacks (Argon2i/Argon2id address this) +- **Less vetted**: Not winner of PHC, less cryptanalysis than Argon2 +- **Complex parameter tuning**: Requires setting N (CPU/memory cost), r (block size), p (parallelism) independently +- **No standard format**: No PHC string format standardization (portability issues) +- **Older design**: Created in 2009, superseded by Argon2 (2015) + +### 3. PBKDF2-SHA256 + +NIST-approved key derivation function. 
+ +**Rejected because:** + +- **No memory hardness**: Trivially parallelizable on GPUs (same weakness as bcrypt) +- **High iteration count required**: 600,000+ iterations needed for equivalent security (slower than Argon2id) +- **No built-in salt handling**: Requires manual salt generation and storage +- **Designed for key derivation**: Not purpose-built for password hashing (Argon2 designed specifically for this) +- **GPU-vulnerable**: Attackers can run millions of parallel attempts on GPU + +### 4. Argon2i (Data-Independent Mode) + +Side-channel resistant variant of Argon2. + +**Rejected because:** + +- **Weaker GPU resistance**: Slightly more vulnerable to GPU attacks than Argon2d/Argon2id +- **No hybrid benefit**: Argon2id provides side-channel resistance AND GPU resistance +- **Same implementation complexity**: No simplicity advantage over Argon2id +- **Not recommended**: PHC recommends Argon2id for password hashing (Argon2i for key derivation) + +### 5. Argon2d (Data-Dependent Mode) + +Maximum GPU-resistant variant of Argon2. 
+ +**Rejected because:** + +- **Side-channel vulnerable**: Data-dependent memory access patterns leak information via cache timing +- **Not recommended for passwords**: PHC discourages Argon2d for password hashing (use Argon2id instead) +- **Same memory cost**: No performance benefit over Argon2id +- **Weaker security model**: Side-channel attacks more practical than brute-force in some scenarios + +## Consequences + +**Benefits:** + +- **GPU/ASIC resistance**: 64 MiB memory requirement makes GPU attacks economically infeasible +- **Side-channel resistance**: Argon2id's hybrid mode protects against cache-timing attacks +- **Future-proof**: Winner of PHC, recommended by OWASP, designed for long-term security +- **Configurable security**: Can upgrade to PolicyConservative (128 MiB) or PolicyParanoid (256 MiB) if needed +- **Standard format**: PHC string format enables portability and version detection +- **Salt handling**: Library manages salt generation (16 bytes, cryptographically random) +- **Strong secrets**: Generated 32-character alphanumeric secrets have ~190 bits entropy (brute-force infeasible) + +**Trade-offs:** + +- **Memory usage**: 64 MiB per hash operation (vs 4 KiB for bcrypt) + - Mitigated by: Hashing only happens during client creation and authentication (low frequency) + - Acceptable: Modern containers/VMs have ample memory for this workload +- **Verification latency**: ~100-200ms per hash verification (vs ~300ms for bcrypt cost 12) + - Mitigated by: Rate limiting on `/v1/token` endpoint prevents high-frequency verification + - Acceptable: Authentication latency acceptable for server-to-server OAuth2-style flows +- **CPU usage**: Higher CPU cost than bcrypt for equivalent security + - Mitigated by: Low authentication request volume (infrequent token issuance) + - Acceptable: CPU cost justified by superior security properties + +**Limitations:** + +- **No hardware acceleration**: No AES-NI or other CPU instruction support (pure software 
implementation) + - Acceptable: Memory hardness more important than CPU optimization for password hashing +- **Container memory requirements**: Minimum 512 MiB memory required per container (64 MiB × ~8 concurrent requests) + - Acceptable: Production deployments already provision 1+ GiB per container +- **Incompatible with legacy bcrypt hashes**: Cannot verify existing bcrypt hashes without migration + - Not applicable: New project, no legacy hashes to migrate + +**Security characteristics:** + +- **Offline attack cost**: ~$100,000 per hash cracked with 1000-GPU cluster (vs ~$1,000 for bcrypt) +- **Time to crack (96-bit secret)**: ~10^29 years at 1 million GPU-years compute (infeasible) +- **Side-channel resistance**: Cache-timing attacks mitigated by Argon2id's hybrid mode +- **Quantum resistance**: 256-bit output exceeds Grover's algorithm requirement (128 bits post-quantum) + +**Configuration policy rationale:** + +| Policy | Memory | Time | Use Case | +|--------------|---------|--------|----------------------------------------------------------| +| Moderate | 64 MiB | ~150ms | **Production default** (balance security/performance) | +| Conservative | 128 MiB | ~300ms | High-security environments (financial, healthcare) | +| Paranoid | 256 MiB | ~600ms | Maximum security (government, defense) | + +**Current choice: Moderate** because: + +- Client secrets are 32-character random strings (not weak user passwords) +- Rate limiting on auth endpoint limits attack surface +- 64 MiB memory cost already 16,000x harder than bcrypt +- Performance acceptable for API authentication latency requirements + +**Migration path:** + +- Can increase to Conservative/Paranoid by rehashing on next authentication +- PHC string format embeds parameters (enables gradual migration) +- Library supports policy detection from hash string + +## See also + +- [Argon2 RFC 9106](https://www.rfc-editor.org/rfc/rfc9106.html) +- [OWASP Password Storage Cheat 
Sheet](https://cheatsheetseries.owasp.org/cheatsheets/Password_Storage_Cheat_Sheet.html) +- [Password Hashing Competition](https://www.password-hashing.net/) +- [go-pwdhash library](https://github.com/allisson/go-pwdhash) +- [Token authentication implementation](../../internal/auth/service/secret_service.go) +- [ADR 0006: Dual-Scope Rate Limiting Strategy](0006-dual-scope-rate-limiting-strategy.md) - Rate limiting on auth endpoint diff --git a/docs/api/authentication.md b/docs/api/auth/authentication.md similarity index 87% rename from docs/api/authentication.md rename to docs/api/auth/authentication.md index 1e3cc17..adb2fee 100644 --- a/docs/api/authentication.md +++ b/docs/api/auth/authentication.md @@ -137,21 +137,22 @@ Representative error payloads (exact messages may vary): - `docs/examples/python.md` - `docs/examples/javascript.md` - `docs/examples/go.md` -- `docs/api/response-shapes.md` +- `docs/api/observability/response-shapes.md` ## Notes - `Bearer` prefix is case-insensitive (`bearer`, `Bearer`, `BEARER`) - Tokens are time-limited and should be renewed before expiration +- Client secrets are hashed using Argon2id (see [ADR 0010: Argon2id for Client Secret Hashing](../../adr/0010-argon2id-for-client-secret-hashing.md)) - For wildcard path matcher semantics used by authorization, see [Policies cookbook / Path matching behavior](policies.md#path-matching-behavior) ## See also - [Clients API](clients.md) -- [API error decision matrix](error-decision-matrix.md) -- [API rate limiting](rate-limiting.md) +- [API error decision matrix](../fundamentals.md#error-decision-matrix) +- [API rate limiting](../fundamentals.md#rate-limiting) - [Policies cookbook](policies.md) -- [Capability matrix](capability-matrix.md) -- [Audit logs API](audit-logs.md) -- [Response shapes](response-shapes.md) +- [Capability matrix](../fundamentals.md#capability-matrix) +- [Audit logs API](../observability/audit-logs.md) +- [Response shapes](../observability/response-shapes.md) diff 
--git a/docs/api/clients.md b/docs/api/auth/clients.md similarity index 82% rename from docs/api/clients.md rename to docs/api/auth/clients.md index af6b7dc..f0f5cbc 100644 --- a/docs/api/clients.md +++ b/docs/api/auth/clients.md @@ -1,6 +1,6 @@ # πŸ‘€ Clients API -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 > Applies to: API v1 Client APIs manage machine identities and policy documents. @@ -132,11 +132,11 @@ Expected result: create returns `201 Created` with one-time `secret`; list retur - πŸ“˜ [Policy cookbook](policies.md) - 🧭 [Wildcard matcher semantics](policies.md#path-matching-behavior) -- πŸ§ͺ [Curl examples](../examples/curl.md) -- 🐍 [Python examples](../examples/python.md) -- 🟨 [JavaScript examples](../examples/javascript.md) -- 🐹 [Go examples](../examples/go.md) -- 🧱 [Response shapes](response-shapes.md) +- πŸ§ͺ [Curl examples](../../examples/curl.md) +- 🐍 [Python examples](../../examples/python.md) +- 🟨 [JavaScript examples](../../examples/javascript.md) +- 🐹 [Go examples](../../examples/go.md) +- 🧱 [Response shapes](../observability/response-shapes.md) ## Use Cases @@ -147,11 +147,11 @@ Expected result: create returns `201 Created` with one-time `secret`; list retur ## See also - [Authentication API](authentication.md) -- [API error decision matrix](error-decision-matrix.md) -- [API rate limiting](rate-limiting.md) +- [API error decision matrix](../fundamentals.md#error-decision-matrix) +- [API rate limiting](../fundamentals.md#rate-limiting) - [Policies cookbook](policies.md) -- [Capability matrix](capability-matrix.md) -- [Audit logs API](audit-logs.md) -- [Response shapes](response-shapes.md) -- [API compatibility policy](versioning-policy.md) -- [Glossary](../concepts/glossary.md) +- [Capability matrix](../fundamentals.md#capability-matrix) +- [Audit logs API](../observability/audit-logs.md) +- [Response shapes](../observability/response-shapes.md) +- [API compatibility policy](../fundamentals.md#compatibility-and-versioning-policy) +- 
[Glossary](../../concepts/architecture.md#glossary) diff --git a/docs/api/policies.md b/docs/api/auth/policies.md similarity index 92% rename from docs/api/policies.md rename to docs/api/auth/policies.md index 8f98220..32281c5 100644 --- a/docs/api/policies.md +++ b/docs/api/auth/policies.md @@ -1,6 +1,6 @@ # πŸ“˜ Authorization Policy Cookbook -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 > Applies to: API v1 Ready-to-use policy templates for common service roles. @@ -54,6 +54,8 @@ Policies are evaluated with case-sensitive matching rules: - Trailing wildcard: `prefix/*` matches paths starting with `prefix/` (greedy for deeper paths) - Mid-path wildcard: `*` inside a path matches exactly one segment +See [ADR 0003: Capability-Based Authorization Model](../../adr/0003-capability-based-authorization-model.md) for the architectural rationale behind this design. + Examples: - `/v1/secrets/*` matches `/v1/secrets/app`, `/v1/secrets/app/db`, and `/v1/secrets/app/db/password` @@ -76,17 +78,17 @@ Unsupported patterns (not shell globs): - Example: `POST /v1/transit/keys/payment/rotate` can still return `403` if caller lacks `rotate` on `/v1/transit/keys/*/rotate`. -Use [Policy smoke tests](../operations/policy-smoke-tests.md) to validate both route shape and policy behavior. +Use [Policy smoke tests](../../operations/runbooks/policy-smoke-tests.md) to validate both route shape and policy behavior. ## Policy review checklist before deploy 1. Confirm endpoint capability intent for each path (`read`, `write`, `delete`, `encrypt`, `decrypt`, `rotate`). 2. Confirm wildcard type is intentional (exact, full `*`, trailing `/*`, or mid-path segment `*`). 3. Reject unsupported patterns (`prod-*`, `*prod`, `prod*`, `**`) before policy rollout. -4. Run route-shape and allow/deny smoke checks from [Policy smoke tests](../operations/policy-smoke-tests.md). +4. Run route-shape and allow/deny smoke checks from [Policy smoke tests](../../operations/runbooks/policy-smoke-tests.md). 
5. Review denied audit events after rollout and verify mismatches are expected. -Endpoint capability intent (quick map, condensed from [Capability matrix](capability-matrix.md)): +Endpoint capability intent (quick map, condensed from [Capability matrix](../fundamentals.md#capability-matrix)): | Endpoint family | Typical capability | | --- | --- | @@ -151,7 +153,7 @@ Risk note: should not include `decrypt` unless CI must read values. Use for services that should encrypt sensitive values but never decrypt. -See [Transit API](transit.md) for encrypt/decrypt request and response contracts. +See [Transit API](../data/transit.md) for encrypt/decrypt request and response contracts. ```json [ @@ -168,7 +170,7 @@ Risk note: encrypt-only separation limits plaintext exposure. Use for tightly scoped decryption workers. -See [Decrypt input contract](transit.md#decrypt-input-contract) for required +See [Decrypt input contract](../data/transit.md#decrypt-input-contract) for required `ciphertext` format. ```json @@ -336,7 +338,7 @@ fi echo "policy static checks: PASS" ``` -For runtime allow/deny assertions, run [Policy smoke tests](../operations/policy-smoke-tests.md). +For runtime allow/deny assertions, run [Policy smoke tests](../../operations/runbooks/policy-smoke-tests.md). 
## Policy mismatch example (wrong vs fixed) @@ -388,8 +390,8 @@ Also verify path matching, for example `/v1/secrets/app/prod/*` if you want tigh ## See also - [Authentication API](authentication.md) -- [API error decision matrix](error-decision-matrix.md) +- [API error decision matrix](../fundamentals.md#error-decision-matrix) - [Clients API](clients.md) -- [Capability matrix](capability-matrix.md) -- [Secrets API](secrets.md) -- [Transit API](transit.md) +- [Capability matrix](../fundamentals.md#capability-matrix) +- [Secrets API](../data/secrets.md) +- [Transit API](../data/transit.md) diff --git a/docs/api/capability-matrix.md b/docs/api/capability-matrix.md deleted file mode 100644 index c6f50b9..0000000 --- a/docs/api/capability-matrix.md +++ /dev/null @@ -1,69 +0,0 @@ -# πŸ—‚οΈ Capability Matrix - -> Last updated: 2026-02-19 -> Applies to: API v1 - -This page is the canonical capability-to-endpoint reference used by API docs and policy templates. - -## Capability Definitions - -- `read`: list or inspect metadata/state without decrypting payload values -- `write`: create or update non-cryptographic resources and key definitions -- `delete`: delete resources or revoke token lifecycle entries -- `encrypt`: create encrypted outputs (secrets writes, transit encrypt, tokenization tokenize) -- `decrypt`: resolve encrypted/tokenized values back to plaintext -- `rotate`: create new key versions - -## Endpoint Matrix - -| Endpoint | Required capability | -| --- | --- | -| `POST /v1/clients` | `write` | -| `GET /v1/clients` | `read` | -| `GET /v1/clients/:id` | `read` | -| `PUT /v1/clients/:id` | `write` | -| `DELETE /v1/clients/:id` | `delete` | -| `GET /v1/audit-logs` | `read` | -| `POST /v1/secrets/*path` | `encrypt` | -| `GET /v1/secrets/*path` | `decrypt` | -| `DELETE /v1/secrets/*path` | `delete` | -| `POST /v1/transit/keys` | `write` | -| `POST /v1/transit/keys/:name/rotate` | `rotate` | -| `DELETE /v1/transit/keys/:id` | `delete` | -| `POST 
/v1/transit/keys/:name/encrypt` | `encrypt` | -| `POST /v1/transit/keys/:name/decrypt` | `decrypt` | -| `POST /v1/tokenization/keys` | `write` | -| `POST /v1/tokenization/keys/:name/rotate` | `rotate` | -| `DELETE /v1/tokenization/keys/:id` | `delete` | -| `POST /v1/tokenization/keys/:name/tokenize` | `encrypt` | -| `POST /v1/tokenization/detokenize` | `decrypt` | -| `POST /v1/tokenization/validate` | `read` | -| `POST /v1/tokenization/revoke` | `delete` | - -## Policy Authoring Notes - -Policy matcher quick reference: - -| Pattern type | Example | Matching behavior | -| --- | --- | --- | -| Exact | `/v1/audit-logs` | Only that exact path | -| Full wildcard | `*` | Any request path | -| Trailing wildcard | `/v1/secrets/*` | Prefix + nested paths | -| Mid-path wildcard | `/v1/transit/keys/*/rotate` | `*` matches one segment | - -For complete matcher semantics and unsupported forms, see [Policies cookbook](policies.md#path-matching-behavior). - -- Use path scope as narrowly as possible (service + environment prefixes). -- Avoid wildcard `*` except temporary break-glass workflows. -- Keep encrypt and decrypt separated across clients when operationally possible. -- For tokenization lifecycle endpoints, token value is passed in JSON body; policy path is endpoint path. - -## See also - -- [Policies cookbook](policies.md) -- [API error decision matrix](error-decision-matrix.md) -- [Authentication API](authentication.md) -- [Clients API](clients.md) -- [Secrets API](secrets.md) -- [Transit API](transit.md) -- [Tokenization API](tokenization.md) diff --git a/docs/api/secrets.md b/docs/api/data/secrets.md similarity index 84% rename from docs/api/secrets.md rename to docs/api/data/secrets.md index 987011c..720cad3 100644 --- a/docs/api/secrets.md +++ b/docs/api/data/secrets.md @@ -1,6 +1,6 @@ # πŸ“¦ Secrets API -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 > Applies to: API v1 Secrets are versioned by path and encrypted with envelope encryption. 
@@ -148,7 +148,7 @@ Expected result: write returns `201 Created`; read returns `200 OK` with base64 Wildcard matcher semantics reference: -- [Policies cookbook / Path matching behavior](policies.md#path-matching-behavior) +- [Policies cookbook / Path matching behavior](../auth/policies.md#path-matching-behavior) ## Related Examples @@ -156,16 +156,16 @@ Wildcard matcher semantics reference: - `docs/examples/python.md` - `docs/examples/javascript.md` - `docs/examples/go.md` -- `docs/api/response-shapes.md` +- `docs/api/observability/response-shapes.md` ## See also -- [Authentication API](authentication.md) -- [API error decision matrix](error-decision-matrix.md) -- [API rate limiting](rate-limiting.md) -- [Policies cookbook](policies.md) -- [Capability matrix](capability-matrix.md) -- [Response shapes](response-shapes.md) -- [API compatibility policy](versioning-policy.md) -- [Curl examples](../examples/curl.md) -- [Glossary](../concepts/glossary.md) +- [Authentication API](../auth/authentication.md) +- [API error decision matrix](../fundamentals.md#error-decision-matrix) +- [API rate limiting](../fundamentals.md#rate-limiting) +- [Policies cookbook](../auth/policies.md) +- [Capability matrix](../fundamentals.md#capability-matrix) +- [Response shapes](../observability/response-shapes.md) +- [API compatibility policy](../fundamentals.md#compatibility-and-versioning-policy) +- [Curl examples](../../examples/curl.md) +- [Glossary](../../concepts/architecture.md#glossary) diff --git a/docs/api/tokenization.md b/docs/api/data/tokenization.md similarity index 94% rename from docs/api/tokenization.md rename to docs/api/data/tokenization.md index 3ff0e7f..4f3cedf 100644 --- a/docs/api/tokenization.md +++ b/docs/api/data/tokenization.md @@ -1,6 +1,6 @@ # 🎫 Tokenization API -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 > Applies to: API v1 The Tokenization API provides format-preserving token generation for sensitive values, @@ -273,10 +273,10 @@ If data must remain 
confidential at rest, keep it in encrypted plaintext payload ## See also -- [Authentication](authentication.md) -- [API error decision matrix](error-decision-matrix.md) -- [API rate limiting](rate-limiting.md) -- [Policies](policies.md) -- [Capability matrix](capability-matrix.md) -- [CLI Commands](../cli/commands.md) -- [Production operations](../operations/production.md) +- [Authentication](../auth/authentication.md) +- [API error decision matrix](../fundamentals.md#error-decision-matrix) +- [API rate limiting](../fundamentals.md#rate-limiting) +- [Policies](../auth/policies.md) +- [Capability matrix](../fundamentals.md#capability-matrix) +- [CLI Commands](../../cli-commands.md) +- [Production operations](../../operations/deployment/production.md) diff --git a/docs/api/transit.md b/docs/api/data/transit.md similarity index 92% rename from docs/api/transit.md rename to docs/api/data/transit.md index 15af2e3..eb41f82 100644 --- a/docs/api/transit.md +++ b/docs/api/data/transit.md @@ -1,6 +1,6 @@ # πŸš„ Transit API -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 > Applies to: API v1 Transit API encrypts/decrypts data without storing your application payload. 
@@ -44,7 +44,7 @@ Capability mapping: Wildcard matcher semantics reference: -- [Policies cookbook / Path matching behavior](policies.md#path-matching-behavior) +- [Policies cookbook / Path matching behavior](../auth/policies.md#path-matching-behavior) ## Status Code Quick Reference @@ -272,16 +272,16 @@ Expected result: key creation returns `201 Created`; encrypt returns `200 OK` wi - `docs/examples/python.md` - `docs/examples/javascript.md` - `docs/examples/go.md` -- `docs/api/response-shapes.md` +- `docs/api/observability/response-shapes.md` ## See also -- [Authentication API](authentication.md) -- [API error decision matrix](error-decision-matrix.md) -- [API rate limiting](rate-limiting.md) -- [Policies cookbook](policies.md) -- [Capability matrix](capability-matrix.md) -- [Response shapes](response-shapes.md) -- [API compatibility policy](versioning-policy.md) -- [Curl examples](../examples/curl.md) -- [Glossary](../concepts/glossary.md) +- [Authentication API](../auth/authentication.md) +- [API error decision matrix](../fundamentals.md#error-decision-matrix) +- [API rate limiting](../fundamentals.md#rate-limiting) +- [Policies cookbook](../auth/policies.md) +- [Capability matrix](../fundamentals.md#capability-matrix) +- [Response shapes](../observability/response-shapes.md) +- [API compatibility policy](../fundamentals.md#compatibility-and-versioning-policy) +- [Curl examples](../../examples/curl.md) +- [Glossary](../../concepts/architecture.md#glossary) diff --git a/docs/api/error-decision-matrix.md b/docs/api/error-decision-matrix.md deleted file mode 100644 index 6f9429b..0000000 --- a/docs/api/error-decision-matrix.md +++ /dev/null @@ -1,52 +0,0 @@ -# 🚨 API Error Decision Matrix - -> Last updated: 2026-02-20 -> Applies to: API v1 - -Use this matrix to triage API failures quickly and choose the next action. 
- -## Decision Matrix - -| Status | Meaning | Common causes | First action | -| --- | --- | --- | --- | -| `401 Unauthorized` | Authentication failed | Missing/invalid Bearer token, invalid client credentials, expired token | Re-issue token and verify `Authorization: Bearer ` | -| `403 Forbidden` | Authenticated but not allowed | Policy/capability mismatch for request path | Check policy path + required capability mapping | -| `404 Not Found` | Route/resource missing | Wrong endpoint shape, unknown resource ID/key/path | Verify endpoint path shape first, then resource existence | -| `409 Conflict` | Resource state conflict | Duplicate create (for example existing transit key name) | Switch to rotate/update flow or use unique resource name | -| `422 Unprocessable Entity` | Validation failed | Invalid JSON/body/query, bad base64, malformed ciphertext contract | Validate payload and endpoint-specific contract | -| `429 Too Many Requests` | Request throttled | Per-client or per-IP rate limit exceeded | Respect `Retry-After` and retry with backoff + jitter | - -## Fast Triage Order - -1. Check status code class (`401/403/404/409/422/429`) -2. Validate route shape (to avoid misreading `404` as policy issue) -3. Validate token/authn (`401`) before policy/authz (`403`) -4. Validate payload contract (`422`) using endpoint docs -5. 
For `429`, apply retry policy and reassess client concurrency - -## Fast discriminator (`401` vs `403` vs `429`) - -- `401 Unauthorized`: authentication failed before policy check; verify token or client credentials first -- `403 Forbidden`: authentication succeeded, but policy/capability denied requested path -- `429 Too Many Requests`: request hit per-client or per-IP throttling; inspect `Retry-After` - -First place to look: - -- `401`: token issuance/authentication logs and credential validity -- `403`: policy document, capability mapping, and path matcher behavior -- `429`: rate-limit settings (`RATE_LIMIT_*`, `RATE_LIMIT_TOKEN_*`) and traffic burst patterns - -## Capability mismatch quick map (`403`) - -- `GET /v1/secrets/*path` requires `decrypt` -- `POST /v1/secrets/*path` requires `encrypt` -- `POST /v1/transit/keys/:name/rotate` requires `rotate` -- `POST /v1/tokenization/detokenize` requires `decrypt` -- `GET /v1/audit-logs` requires `read` - -## See also - -- [Capability matrix](capability-matrix.md) -- [Policies cookbook](policies.md) -- [API rate limiting](rate-limiting.md) -- [Troubleshooting](../getting-started/troubleshooting.md) diff --git a/docs/api/fundamentals.md b/docs/api/fundamentals.md new file mode 100644 index 0000000..48005d4 --- /dev/null +++ b/docs/api/fundamentals.md @@ -0,0 +1,279 @@ +# 🧩 API Fundamentals + +> Last updated: 2026-02-20 +> Applies to: API v1 + +This page consolidates foundational API concepts for quick reference: error triage, capability mapping, rate limiting, and versioning policy. + +## Table of Contents + +- [Error Decision Matrix](#error-decision-matrix) +- [Capability Matrix](#capability-matrix) +- [Rate Limiting](#rate-limiting) +- [Compatibility and Versioning Policy](#compatibility-and-versioning-policy) + +--- + +## Error Decision Matrix + +Use this matrix to triage API failures quickly and choose the next action. 
+ +### Decision Matrix + +| Status | Meaning | Common causes | First action | +| --- | --- | --- | --- | +| `401 Unauthorized` | Authentication failed | Missing/invalid Bearer token, invalid client credentials, expired token | Re-issue token and verify `Authorization: Bearer <token>` | +| `403 Forbidden` | Authenticated but not allowed | Policy/capability mismatch for request path | Check policy path + required capability mapping | +| `404 Not Found` | Route/resource missing | Wrong endpoint shape, unknown resource ID/key/path | Verify endpoint path shape first, then resource existence | +| `409 Conflict` | Resource state conflict | Duplicate create (for example existing transit key name) | Switch to rotate/update flow or use unique resource name | +| `422 Unprocessable Entity` | Validation failed | Invalid JSON/body/query, bad base64, malformed ciphertext contract | Validate payload and endpoint-specific contract | +| `429 Too Many Requests` | Request throttled | Per-client or per-IP rate limit exceeded | Respect `Retry-After` and retry with backoff + jitter | + +### Fast Triage Order + +1. Check status code class (`401/403/404/409/422/429`) +2. Validate route shape (to avoid misreading `404` as policy issue) +3. Validate token/authn (`401`) before policy/authz (`403`) +4. Validate payload contract (`422`) using endpoint docs +5. 
For `429`, apply retry policy and reassess client concurrency + +### Fast discriminator (`401` vs `403` vs `429`) + +- `401 Unauthorized`: authentication failed before policy check; verify token or client credentials first +- `403 Forbidden`: authentication succeeded, but policy/capability denied requested path +- `429 Too Many Requests`: request hit per-client or per-IP throttling; inspect `Retry-After` + +First place to look: + +- `401`: token issuance/authentication logs and credential validity +- `403`: policy document, capability mapping, and path matcher behavior +- `429`: rate-limit settings (`RATE_LIMIT_*`, `RATE_LIMIT_TOKEN_*`) and traffic burst patterns + +### Capability mismatch quick map (`403`) + +- `GET /v1/secrets/*path` requires `decrypt` +- `POST /v1/secrets/*path` requires `encrypt` +- `POST /v1/transit/keys/:name/rotate` requires `rotate` +- `POST /v1/tokenization/detokenize` requires `decrypt` +- `GET /v1/audit-logs` requires `read` + +--- + +## Capability Matrix + +This section is the canonical capability-to-endpoint reference used by API docs and policy templates. 
+ +### Capability Definitions + +- `read`: list or inspect metadata/state without decrypting payload values +- `write`: create or update non-cryptographic resources and key definitions +- `delete`: delete resources or revoke token lifecycle entries +- `encrypt`: create encrypted outputs (secrets writes, transit encrypt, tokenization tokenize) +- `decrypt`: resolve encrypted/tokenized values back to plaintext +- `rotate`: create new key versions + +### Endpoint Matrix + +| Endpoint | Required capability | +| --- | --- | +| `POST /v1/clients` | `write` | +| `GET /v1/clients` | `read` | +| `GET /v1/clients/:id` | `read` | +| `PUT /v1/clients/:id` | `write` | +| `DELETE /v1/clients/:id` | `delete` | +| `GET /v1/audit-logs` | `read` | +| `POST /v1/secrets/*path` | `encrypt` | +| `GET /v1/secrets/*path` | `decrypt` | +| `DELETE /v1/secrets/*path` | `delete` | +| `POST /v1/transit/keys` | `write` | +| `POST /v1/transit/keys/:name/rotate` | `rotate` | +| `DELETE /v1/transit/keys/:id` | `delete` | +| `POST /v1/transit/keys/:name/encrypt` | `encrypt` | +| `POST /v1/transit/keys/:name/decrypt` | `decrypt` | +| `POST /v1/tokenization/keys` | `write` | +| `POST /v1/tokenization/keys/:name/rotate` | `rotate` | +| `DELETE /v1/tokenization/keys/:id` | `delete` | +| `POST /v1/tokenization/keys/:name/tokenize` | `encrypt` | +| `POST /v1/tokenization/detokenize` | `decrypt` | +| `POST /v1/tokenization/validate` | `read` | +| `POST /v1/tokenization/revoke` | `delete` | + +### Policy Authoring Notes + +Policy matcher quick reference: + +| Pattern type | Example | Matching behavior | +| --- | --- | --- | +| Exact | `/v1/audit-logs` | Only that exact path | +| Full wildcard | `*` | Any request path | +| Trailing wildcard | `/v1/secrets/*` | Prefix + nested paths | +| Mid-path wildcard | `/v1/transit/keys/*/rotate` | `*` matches one segment | + +For complete matcher semantics and unsupported forms, see [Policies cookbook](auth/policies.md#path-matching-behavior). 
+ +See [ADR 0003: Capability-Based Authorization Model](../adr/0003-capability-based-authorization-model.md) for the architectural rationale behind this design. + +- Use path scope as narrowly as possible (service + environment prefixes). +- Avoid wildcard `*` except temporary break-glass workflows. +- Keep encrypt and decrypt separated across clients when operationally possible. +- For tokenization lifecycle endpoints, token value is passed in JSON body; policy path is endpoint path. + +--- + +## Rate Limiting + +Secrets enforces two rate-limiting scopes: + +- Per-client limits for authenticated API routes (`RATE_LIMIT_*`) +- Per-IP limits for unauthenticated token issuance (`RATE_LIMIT_TOKEN_*`) + +See [ADR 0006: Dual-Scope Rate Limiting Strategy](../adr/0006-dual-scope-rate-limiting-strategy.md) for the architectural rationale behind this design. + +### Scope + +Rate limiting scope matrix: + +| Route group/endpoint | Rate limited | Notes | +| --- | --- | --- | +| `/v1/clients/*` | Yes | Requires Bearer auth | +| `/v1/audit-logs` | Yes | Requires Bearer auth | +| `/v1/secrets/*` | Yes | Requires Bearer auth | +| `/v1/transit/*` | Yes | Requires Bearer auth | +| `/v1/tokenization/*` | Yes | Requires Bearer auth | +| `POST /v1/token` | Yes | Unauthenticated endpoint, rate-limited per client IP | +| `GET /health` | No | Liveness checks | +| `GET /ready` | No | Readiness checks | +| `GET /metrics` | No | Prometheus scraping | + +### Defaults + +```dotenv +# Authenticated endpoints (per client) +RATE_LIMIT_ENABLED=true +RATE_LIMIT_REQUESTS_PER_SEC=10.0 +RATE_LIMIT_BURST=20 + +# Token endpoint (per IP) +RATE_LIMIT_TOKEN_ENABLED=true +RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 +RATE_LIMIT_TOKEN_BURST=10 +``` + +### Response behavior + +When a request exceeds the allowed rate, the API returns: + +- Status: `429 Too Many Requests` +- Header: `Retry-After: ` +- Body: + +```json +{ + "error": "rate_limit_exceeded", + "message": "Too many requests. 
Please retry after the specified delay." +} +``` + +Token endpoint (`POST /v1/token`) uses the same status/header contract and returns an endpoint-specific +message indicating too many token requests from the caller IP. + +### Client retry guidance + +- Respect `Retry-After` before retrying +- Use exponential backoff with jitter +- Avoid synchronized retries across many workers +- Reduce per-client burst and concurrency where possible +- For token issuance, review shared NAT/proxy behavior and tune `RATE_LIMIT_TOKEN_*` if needed + +### Distinguishing `403` vs `429` + +- `403 Forbidden`: policy/capability denies access +- `429 Too Many Requests`: request was throttled by per-client or per-IP rate limits + +--- + +## Compatibility and Versioning Policy + +This section defines compatibility expectations for HTTP API changes. + +See [ADR 0007: Path-Based API Versioning](../adr/0007-path-based-api-versioning.md) for the architectural rationale behind this design. + +### Compatibility Contract + +- Current public baseline is API v1 (`/v1/*`) +- Existing endpoint paths and JSON field names are treated as stable unless explicitly deprecated +- OpenAPI source of truth: `docs/openapi.yaml` + +### OpenAPI Coverage + +- `docs/openapi.yaml` is a baseline subset focused on high-traffic/common integration flows +- `docs/openapi.yaml` includes tokenization endpoint coverage in the current release +- `docs/openapi.yaml` includes `429 Too Many Requests` response modeling for protected routes +- Endpoint pages in `docs/api/*.md` define full public behavior for covered operations +- Endpoints may exist in runtime before they are expanded in OpenAPI detail + +### App Version vs API Version + +- Application release is pre-1.0 software and may evolve quickly +- API v1 path contract (`/v1/*`) remains the compatibility baseline for consumers +- Breaking API behavior changes require explicit documentation and migration notes + +### Breaking Changes + +Treat these as breaking: + +- changing 
endpoint paths or required path parameters +- removing response fields or changing field meaning/type +- changing required request fields or accepted formats +- changing status code semantics for successful behavior + +Required process for breaking changes: + +1. Update `docs/openapi.yaml` +2. Update affected API docs and examples +3. Add migration notes in `docs/getting-started/troubleshooting.md` or relevant runbook +4. Add explicit entry in `releases/RELEASES.md` + +### Non-Breaking Changes + +Usually non-breaking: + +- adding optional request/response fields +- adding new endpoints under `/v1/*` +- clarifying documentation text and examples +- adding additional error examples without changing behavior + +### Telemetry Change Examples + +Breaking telemetry examples: + +- renaming a published metric name (for example `secrets_http_requests_total`) +- renaming/removing metric labels used by dashboards or alerts + +Non-breaking telemetry examples: + +- adding a new metric family +- adding new label values for existing labels +- adding new dashboard examples without changing metric contracts + +### Deprecation Guidance + +- Mark deprecated behavior clearly in endpoint docs +- Provide replacement behavior and example migration path +- Keep deprecated behavior available long enough for operational rollout + +--- + +## See also + +- [Authentication API](auth/authentication.md) +- [Clients API](auth/clients.md) +- [Policies cookbook](auth/policies.md) +- [Secrets API](data/secrets.md) +- [Transit API](data/transit.md) +- [Tokenization API](data/tokenization.md) +- [Audit Logs API](observability/audit-logs.md) +- [Response shapes](observability/response-shapes.md) +- [Environment variables](../configuration.md) +- [Troubleshooting](../getting-started/troubleshooting.md) +- [Contributing guide](../contributing.md) diff --git a/docs/api/audit-logs.md b/docs/api/observability/audit-logs.md similarity index 82% rename from docs/api/audit-logs.md rename to 
docs/api/observability/audit-logs.md index 43d4f55..3af7942 100644 --- a/docs/api/audit-logs.md +++ b/docs/api/observability/audit-logs.md @@ -1,6 +1,6 @@ # 📜 Audit Logs API -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 > Applies to: API v1 Audit logs capture capability checks and access attempts for monitoring and compliance. @@ -16,7 +16,7 @@ Authorization: `read` capability for `/v1/audit-logs`. Capability reference: -- Canonical mapping source: [Capability matrix](capability-matrix.md) +- Canonical mapping source: [Capability matrix](../fundamentals.md#capability-matrix) ## Endpoint @@ -168,18 +168,18 @@ curl -s "http://localhost:8080/v1/audit-logs?limit=100" \ - `docs/examples/python.md` - `docs/examples/javascript.md` - `docs/examples/go.md` -- `docs/api/response-shapes.md` +- `docs/api/observability/response-shapes.md` ## See also -- [Authentication API](authentication.md) -- [API error decision matrix](error-decision-matrix.md) -- [API rate limiting](rate-limiting.md) -- [Clients API](clients.md) -- [Policies cookbook](policies.md) -- [Route shape vs policy shape](policies.md#route-shape-vs-policy-shape) -- [Policy review checklist before deploy](policies.md#policy-review-checklist-before-deploy) -- [Capability matrix](capability-matrix.md) +- [Authentication API](../auth/authentication.md) +- [API error decision matrix](../fundamentals.md#error-decision-matrix) +- [API rate limiting](../fundamentals.md#rate-limiting) +- [Clients API](../auth/clients.md) +- [Policies cookbook](../auth/policies.md) +- [Route shape vs policy shape](../auth/policies.md#route-shape-vs-policy-shape) +- [Policy review checklist before deploy](../auth/policies.md#policy-review-checklist-before-deploy) +- [Capability matrix](../fundamentals.md#capability-matrix) - [Response shapes](response-shapes.md) -- [API compatibility 
policy](../fundamentals.md#compatibility-and-versioning-policy) +- [Glossary](../../concepts/architecture.md#glossary) diff --git a/docs/api/response-shapes.md b/docs/api/observability/response-shapes.md similarity index 84% rename from docs/api/response-shapes.md rename to docs/api/observability/response-shapes.md index 7fd6c0b..4e9e62d 100644 --- a/docs/api/response-shapes.md +++ b/docs/api/observability/response-shapes.md @@ -111,7 +111,7 @@ Token validate: ``` Input contract note: transit decrypt expects `ciphertext` in format -`:`. See [Transit API](transit.md#decrypt-input-contract). +`:`. See [Transit API](../data/transit.md#decrypt-input-contract). Audit log list: @@ -189,12 +189,12 @@ Representative conflict payload (for example duplicate transit key create): ## See also -- [Authentication API](authentication.md) -- [API rate limiting](rate-limiting.md) -- [API error decision matrix](error-decision-matrix.md) -- [Clients API](clients.md) -- [Secrets API](secrets.md) -- [Transit API](transit.md) -- [Tokenization API](tokenization.md) -- [API compatibility policy](versioning-policy.md) -- [Glossary](../concepts/glossary.md) +- [Authentication API](../auth/authentication.md) +- [API rate limiting](../fundamentals.md#rate-limiting) +- [API error decision matrix](../fundamentals.md#error-decision-matrix) +- [Clients API](../auth/clients.md) +- [Secrets API](../data/secrets.md) +- [Transit API](../data/transit.md) +- [Tokenization API](../data/tokenization.md) +- [API compatibility policy](../fundamentals.md#compatibility-and-versioning-policy) +- [Glossary](../../concepts/architecture.md#glossary) diff --git a/docs/api/rate-limiting.md b/docs/api/rate-limiting.md deleted file mode 100644 index 0c0bd60..0000000 --- a/docs/api/rate-limiting.md +++ /dev/null @@ -1,77 +0,0 @@ -# 🚦 API Rate Limiting - -> Last updated: 2026-02-20 -> Applies to: API v1 - -Secrets enforces two rate-limiting scopes: - -- Per-client limits for authenticated API routes (`RATE_LIMIT_*`) -- 
Per-IP limits for unauthenticated token issuance (`RATE_LIMIT_TOKEN_*`) - -## Scope - -Rate limiting scope matrix: - -| Route group/endpoint | Rate limited | Notes | -| --- | --- | --- | -| `/v1/clients/*` | Yes | Requires Bearer auth | -| `/v1/audit-logs` | Yes | Requires Bearer auth | -| `/v1/secrets/*` | Yes | Requires Bearer auth | -| `/v1/transit/*` | Yes | Requires Bearer auth | -| `/v1/tokenization/*` | Yes | Requires Bearer auth | -| `POST /v1/token` | Yes | Unauthenticated endpoint, rate-limited per client IP | -| `GET /health` | No | Liveness checks | -| `GET /ready` | No | Readiness checks | -| `GET /metrics` | No | Prometheus scraping | - -## Defaults - -```dotenv -# Authenticated endpoints (per client) -RATE_LIMIT_ENABLED=true -RATE_LIMIT_REQUESTS_PER_SEC=10.0 -RATE_LIMIT_BURST=20 - -# Token endpoint (per IP) -RATE_LIMIT_TOKEN_ENABLED=true -RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 -RATE_LIMIT_TOKEN_BURST=10 -``` - -## Response behavior - -When a request exceeds the allowed rate, the API returns: - -- Status: `429 Too Many Requests` -- Header: `Retry-After: ` -- Body: - -```json -{ - "error": "rate_limit_exceeded", - "message": "Too many requests. Please retry after the specified delay." -} -``` - -Token endpoint (`POST /v1/token`) uses the same status/header contract and returns an endpoint-specific -message indicating too many token requests from the caller IP. 
- -## Client retry guidance - -- Respect `Retry-After` before retrying -- Use exponential backoff with jitter -- Avoid synchronized retries across many workers -- Reduce per-client burst and concurrency where possible -- For token issuance, review shared NAT/proxy behavior and tune `RATE_LIMIT_TOKEN_*` if needed - -## Distinguishing `403` vs `429` - -- `403 Forbidden`: policy/capability denies access -- `429 Too Many Requests`: request was throttled by per-client or per-IP rate limits - -## See also - -- [Environment variables](../configuration/environment-variables.md) -- [API error decision matrix](error-decision-matrix.md) -- [Response shapes](response-shapes.md) -- [Troubleshooting](../getting-started/troubleshooting.md) diff --git a/docs/api/versioning-policy.md b/docs/api/versioning-policy.md deleted file mode 100644 index 356443d..0000000 --- a/docs/api/versioning-policy.md +++ /dev/null @@ -1,78 +0,0 @@ -# 🧩 API Compatibility and Versioning Policy - -> Last updated: 2026-02-19 -> Applies to: API v1 - -This page defines compatibility expectations for HTTP API changes. 
- -## Compatibility Contract - -- Current public baseline is API v1 (`/v1/*`) -- Existing endpoint paths and JSON field names are treated as stable unless explicitly deprecated -- OpenAPI source of truth: `docs/openapi.yaml` - -## OpenAPI Coverage - -- `docs/openapi.yaml` is a baseline subset focused on high-traffic/common integration flows -- `docs/openapi.yaml` includes tokenization endpoint coverage in the current release -- `docs/openapi.yaml` includes `429 Too Many Requests` response modeling for protected routes -- Endpoint pages in `docs/api/*.md` define full public behavior for covered operations -- Endpoints may exist in runtime before they are expanded in OpenAPI detail - -## App Version vs API Version - -- Application release is pre-1.0 software and may evolve quickly -- API v1 path contract (`/v1/*`) remains the compatibility baseline for consumers -- Breaking API behavior changes require explicit documentation and migration notes - -## Breaking Changes - -Treat these as breaking: - -- changing endpoint paths or required path parameters -- removing response fields or changing field meaning/type -- changing required request fields or accepted formats -- changing status code semantics for successful behavior - -Required process for breaking changes: - -1. Update `docs/openapi.yaml` -2. Update affected API docs and examples -3. Add migration notes in `docs/getting-started/troubleshooting.md` or relevant runbook -4. 
Add explicit entry in `docs/CHANGELOG.md` - -## Non-Breaking Changes - -Usually non-breaking: - -- adding optional request/response fields -- adding new endpoints under `/v1/*` -- clarifying documentation text and examples -- adding additional error examples without changing behavior - -## Telemetry Change Examples - -Breaking telemetry examples: - -- renaming a published metric name (for example `secrets_http_requests_total`) -- renaming/removing metric labels used by dashboards or alerts - -Non-breaking telemetry examples: - -- adding a new metric family -- adding new label values for existing labels -- adding new dashboard examples without changing metric contracts - -## Deprecation Guidance - -- Mark deprecated behavior clearly in endpoint docs -- Provide replacement behavior and example migration path -- Keep deprecated behavior available long enough for operational rollout - -## See also - -- [Authentication API](authentication.md) -- [API error decision matrix](error-decision-matrix.md) -- [Response shapes](response-shapes.md) -- [Contributing guide](../contributing.md) -- [Documentation changelog](../CHANGELOG.md) diff --git a/docs/cli/commands.md b/docs/cli-commands.md similarity index 90% rename from docs/cli/commands.md rename to docs/cli-commands.md index 4b101a5..c80c6d1 100644 --- a/docs/cli/commands.md +++ b/docs/cli-commands.md @@ -12,10 +12,10 @@ Local binary: ./bin/app [flags] ``` -Docker image (v0.7.0): +Docker image: ```bash -docker run --rm --env-file .env allisson/secrets:v0.7.0 [flags] +docker run --rm --env-file .env allisson/secrets [flags] ``` ## Core Runtime @@ -33,7 +33,7 @@ Local: Docker: ```bash -docker run --rm --network secrets-net --env-file .env -p 8080:8080 allisson/secrets:v0.7.0 server +docker run --rm --network secrets-net --env-file .env -p 8080:8080 allisson/secrets server ``` ### `migrate` @@ -49,7 +49,7 @@ Local: Docker: ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 migrate +docker 
run --rm --network secrets-net --env-file .env allisson/secrets migrate ``` ## Key Management @@ -79,7 +79,7 @@ Local: Docker: ```bash -docker run --rm allisson/secrets:v0.7.0 create-master-key --id default +docker run --rm allisson/secrets create-master-key --id default ``` ### `rotate-master-key` @@ -99,7 +99,7 @@ Local: Docker: ```bash -docker run --rm --env-file .env allisson/secrets:v0.7.0 rotate-master-key --id master-key-2026-08 +docker run --rm --env-file .env allisson/secrets rotate-master-key --id master-key-2026-08 ``` ### `create-kek` @@ -119,7 +119,7 @@ Local: Docker: ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 create-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets create-kek --algorithm aes-gcm ``` ### `rotate-kek` @@ -139,7 +139,7 @@ Local: Docker: ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 rotate-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets rotate-kek --algorithm aes-gcm ``` After master key or KEK rotation, restart API server instances so they load updated key material. 
@@ -176,7 +176,7 @@ Examples: --deterministic \ --algorithm aes-gcm -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 \ +docker run --rm --network secrets-net --env-file .env allisson/secrets \ create-tokenization-key --name payment-cards --format luhn-preserving --deterministic --algorithm aes-gcm ``` @@ -200,7 +200,7 @@ Examples: --deterministic \ --algorithm chacha20-poly1305 -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 \ +docker run --rm --network secrets-net --env-file .env allisson/secrets \ rotate-tokenization-key --name payment-cards --format luhn-preserving --deterministic --algorithm chacha20-poly1305 ``` @@ -224,7 +224,7 @@ Examples: ./bin/app clean-expired-tokens --days 30 --format text # Docker form -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 \ +docker run --rm --network secrets-net --env-file .env allisson/secrets \ clean-expired-tokens --days 30 --dry-run --format json ``` @@ -307,7 +307,7 @@ Examples: ./bin/app clean-audit-logs --days 90 --format text # Docker form -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 \ +docker run --rm --network secrets-net --env-file .env allisson/secrets \ clean-audit-logs --days 90 --dry-run --format json ``` @@ -335,7 +335,7 @@ Requirements: ## See also -- [Docker getting started](../getting-started/docker.md) -- [Local development](../getting-started/local-development.md) -- [Authentication API](../api/authentication.md) -- [Policies cookbook](../api/policies.md) +- [Docker getting started](getting-started/docker.md) +- [Local development](getting-started/local-development.md) +- [Authentication API](api/auth/authentication.md) +- [Policies cookbook](api/auth/policies.md) diff --git a/docs/concepts/architecture.md b/docs/concepts/architecture.md index f73f0fa..39175d6 100644 --- a/docs/concepts/architecture.md +++ b/docs/concepts/architecture.md @@ -1,6 +1,6 @@ # πŸ—οΈ Architecture -> 
Last updated: 2026-02-19 +> Last updated: 2026-02-20 Secrets follows Clean Architecture with domain-driven boundaries so cryptographic rules stay isolated from transport and storage concerns. @@ -81,11 +81,11 @@ flowchart TD ## 🧱 Layer responsibilities -- `domain/`: business entities and invariants (`Client`, `Token`, `Secret`, `TransitKey`, `TokenizationKey`, `Kek`, `Dek`) -- `usecase/`: orchestration, transactional boundaries, and policy decisions -- `repository/`: PostgreSQL/MySQL persistence and query logic -- `service/`: reusable technical services (crypto, token hashing, helpers) -- `http/`: Gin handlers, DTO validation, middleware, and error mapping +- `domain/`: business entities and invariants (`Client`, `Token`, `Secret`, `TransitKey`, `TokenizationKey`, `Kek`, `Dek`) - uses [UUIDv7 for all IDs](../adr/0009-uuidv7-for-identifiers.md) +- `usecase/`: orchestration, transactional boundaries, and policy decisions (see [ADR 0005: Context-Based Transaction Management](../adr/0005-context-based-transaction-management.md)) +- `repository/`: PostgreSQL/MySQL persistence and query logic (see [ADR 0004: Dual Database Support](../adr/0004-dual-database-support.md)) +- `service/`: reusable technical services (crypto, token hashing with [Argon2id](../adr/0010-argon2id-for-client-secret-hashing.md), helpers) +- `http/`: Gin handlers, DTO validation, middleware, and error mapping (see [ADR 0008: Gin Web Framework](../adr/0008-gin-web-framework-with-custom-middleware.md)) ## ✅ Why this design works @@ -94,12 +94,27 @@ flowchart TD - 🧪 Keep use cases testable with mockable interfaces - 🌐 Expose consistent HTTP contracts while preserving domain purity +## Glossary + +Quick definitions for terms used across API and operations docs. 
+ +### Terms + +- `Master Key`: Root key material used to protect KEKs; loaded from environment/KMS +- `KEK` (Key Encryption Key): Encrypts/decrypts DEKs; rotated over time +- `DEK` (Data Encryption Key): Encrypts payload data (secret values or transit key material) +- `Transit Key`: Named, versioned key used by transit encrypt/decrypt endpoints +- `Versioned ciphertext`: Transit ciphertext format `:` +- `Capability`: Authorization permission (`read`, `write`, `delete`, `encrypt`, `decrypt`, `rotate`) +- `Soft delete`: Record marked deleted without immediate physical removal +- `Request ID`: Per-request UUID used for traceability and audit correlation + ## See also - [Security model](security-model.md) -- [Key management operations](../operations/key-management.md) -- [Environment variables](../configuration/environment-variables.md) -- [Secrets API](../api/secrets.md) -- [Tokenization API](../api/tokenization.md) +- [Key management operations](../operations/kms/key-management.md) +- [Environment variables](../configuration.md) +- [Secrets API](../api/data/secrets.md) +- [Tokenization API](../api/data/tokenization.md) - [ADR 0001: Envelope Encryption Model](../adr/0001-envelope-encryption-model.md) - [ADR 0002: Transit Versioned Ciphertext Contract](../adr/0002-transit-versioned-ciphertext-contract.md) diff --git a/docs/concepts/glossary.md b/docs/concepts/glossary.md deleted file mode 100644 index ee770b2..0000000 --- a/docs/concepts/glossary.md +++ /dev/null @@ -1,23 +0,0 @@ -# πŸ“˜ Glossary - -> Last updated: 2026-02-14 - -Quick definitions for terms used across API and operations docs. 
- -## Terms - -- `Master Key`: Root key material used to protect KEKs; loaded from environment/KMS -- `KEK` (Key Encryption Key): Encrypts/decrypts DEKs; rotated over time -- `DEK` (Data Encryption Key): Encrypts payload data (secret values or transit key material) -- `Transit Key`: Named, versioned key used by transit encrypt/decrypt endpoints -- `Versioned ciphertext`: Transit ciphertext format `:` -- `Capability`: Authorization permission (`read`, `write`, `delete`, `encrypt`, `decrypt`, `rotate`) -- `Soft delete`: Record marked deleted without immediate physical removal -- `Request ID`: Per-request UUID used for traceability and audit correlation - -## See also - -- [Architecture](architecture.md) -- [Security model](security-model.md) -- [Transit API](../api/transit.md) -- [Secrets API](../api/secrets.md) diff --git a/docs/concepts/security-model.md b/docs/concepts/security-model.md index f65c4a2..813c4af 100644 --- a/docs/concepts/security-model.md +++ b/docs/concepts/security-model.md @@ -1,6 +1,6 @@ # 🔒 Security Model -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Secrets is designed for practical defense-in-depth around secret storage and cryptographic operations. @@ -49,7 +49,7 @@ Secrets is designed for practical defense-in-depth around secret storage and cry - Forward audit logs to SIEM/log aggregation for long-term retention - Disable CORS unless browser-based access is explicitly required -For comprehensive production security guidance, see [Security Hardening Guide](../operations/security-hardening.md). +For comprehensive production security guidance, see [Security Hardening Guide](../operations/security/hardening.md). ## ⚠️ Known limitations @@ -66,11 +66,11 @@ For comprehensive production security guidance, see [Security Hardening Guide](. 
## See also -- [Security hardening guide](../operations/security-hardening.md) -- [Production deployment](../operations/production.md) +- [Security hardening guide](../operations/security/hardening.md) +- [Production deployment](../operations/deployment/production.md) - [Architecture](architecture.md) -- [Authentication API](../api/authentication.md) -- [Policies cookbook](../api/policies.md) -- [Capability matrix](../api/capability-matrix.md) -- [Tokenization API](../api/tokenization.md) -- [Key management operations](../operations/key-management.md) +- [Authentication API](../api/auth/authentication.md) +- [Policies cookbook](../api/auth/policies.md) +- [Capability matrix](../api/fundamentals.md#capability-matrix) +- [Tokenization API](../api/data/tokenization.md) +- [Key management operations](../operations/kms/key-management.md) diff --git a/docs/configuration/environment-variables.md b/docs/configuration.md similarity index 91% rename from docs/configuration/environment-variables.md rename to docs/configuration.md index e72bbfd..13eca08 100644 --- a/docs/configuration/environment-variables.md +++ b/docs/configuration.md @@ -53,6 +53,8 @@ METRICS_NAMESPACE=secrets Database driver to use. Supported values: `postgres`, `mysql`. +See [ADR 0004: Dual Database Support](adr/0004-dual-database-support.md) for the architectural rationale behind dual database support. + ### DB_CONNECTION_STRING Database connection string. @@ -79,7 +81,7 @@ DB_CONNECTION_STRING=user:password@tcp(db.example.com:3306)/secrets?tls=true DB_CONNECTION_STRING=user:password@tcp(db.example.com:3306)/secrets?tls=custom ``` -See [Security Hardening Guide](../operations/security-hardening.md#2-database-security) for complete guidance. +See [Security Hardening Guide](operations/security/hardening.md#2-database-security) for complete guidance. 
### DB_MAX_OPEN_CONNECTIONS @@ -165,7 +167,7 @@ Examples: - Legacy mode: leave both unset/empty - Invalid configuration: setting only one of the two variables fails startup -For provider setup and migration workflow, see [KMS setup guide](../operations/kms-setup.md). +For provider setup and migration workflow, see [KMS setup guide](operations/kms/setup.md). ### KMS preflight checklist @@ -195,6 +197,8 @@ Token expiration time in seconds (default: `14400` - 4 hours). ## Rate limiting configuration +See [ADR 0006: Dual-Scope Rate Limiting Strategy](adr/0006-dual-scope-rate-limiting-strategy.md) for the architectural rationale behind dual-scope rate limiting. + ### RATE_LIMIT_ENABLED Enable per-client rate limiting (default: `true`). @@ -316,14 +320,14 @@ Prefix for all metric names (default: `secrets`). Or with Docker image: ```bash -docker run --rm allisson/secrets:v0.7.0 create-master-key --id default +docker run --rm allisson/secrets create-master-key --id default ``` ## See also -- [Security hardening guide](../operations/security-hardening.md) -- [Production operations](../operations/production.md) -- [Monitoring](../operations/monitoring.md) -- [Docker getting started](../getting-started/docker.md) -- [Local development](../getting-started/local-development.md) -- [Testing guide](../development/testing.md) +- [Security hardening guide](operations/security/hardening.md) +- [Production operations](operations/deployment/production.md) +- [Monitoring](operations/observability/monitoring.md) +- [Docker getting started](getting-started/docker.md) +- [Local development](getting-started/local-development.md) +- [Contributing guide](contributing.md#development-and-testing) diff --git a/docs/contributing.md b/docs/contributing.md index 2b9ddf2..d462c7c 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -4,6 +4,28 @@ Use this guide when adding or editing project documentation. 
+## Table of Contents + +- [Scope and Structure](#scope-and-structure) +- [Writing Style](#writing-style) +- [Technical Accuracy](#technical-accuracy) +- [Breaking vs Non-Breaking Docs Changes](#breaking-vs-non-breaking-docs-changes) +- [Security Messaging](#security-messaging) +- [Examples](#examples) +- [Metadata Source of Truth](#metadata-source-of-truth) +- [Local Docs Checks](#local-docs-checks) +- [PR Checklist](#pr-checklist) +- [Docs QA Checklist](#docs-qa-checklist) +- [Feature PR Docs Consistency Checklist](#feature-pr-docs-consistency-checklist) +- [Ownership and Review Cadence](#ownership-and-review-cadence) +- [Docs Release Process](#docs-release-process) +- [Release PR Docs QA Guard](#release-pr-docs-qa-guard) +- [Development and Testing](#development-and-testing) +- [Docs Architecture Map](#docs-architecture-map) +- [Docs Release Checklist](#docs-release-checklist) +- [Documentation Management](#documentation-management) +- [See Also](#see-also) + ## Scope and Structure - Keep root `README.md` concise and navigational @@ -35,7 +57,7 @@ Documentation style baseline: ## Breaking vs Non-Breaking Docs Changes - Treat endpoint path changes, request/response contract changes, and status code behavior changes as breaking docs updates -- Breaking docs updates must include: updated API page, updated examples, and `docs/CHANGELOG.md` entry +- Breaking docs updates must include: updated API page, updated examples, and `releases/RELEASES.md` entry - Treat wording clarifications, formatting, and cross-links as non-breaking docs updates - Non-breaking docs updates should still run `make docs-lint` and keep links accurate @@ -83,7 +105,7 @@ This target runs markdown linting and offline markdown link validation. 
Optional strict freshness check for changed files: ```bash -DOCS_CHANGED_FILES="docs/api/clients.md docs/api/policies.md" make docs-check-metadata +DOCS_CHANGED_FILES="docs/api/auth/clients.md docs/api/auth/policies.md" make docs-check-metadata ``` When `DOCS_CHANGED_FILES` is set, changed docs pages must refresh `Last updated` to @@ -95,7 +117,7 @@ When `DOCS_CHANGED_FILES` is set, changed docs pages must refresh `Last updated` 2. API examples reflect current behavior 3. Security warnings are present where needed 4. Terminology is consistent across files -5. `docs/CHANGELOG.md` updated for significant documentation changes +5. `releases/RELEASES.md` updated for significant documentation changes ## Docs QA Checklist @@ -112,9 +134,9 @@ For behavior changes, update all relevant docs in the same PR: 1. API endpoint page (`docs/api/.md`) plus capability mapping references 2. OpenAPI contract updates (`docs/openapi.yaml`) for new/changed request and response shapes 3. Examples parity (`docs/examples/*.md`) for at least curl and one SDK/runtime path -4. Monitoring/query updates (`docs/operations/monitoring.md`) when new operations/metrics are introduced +4. Monitoring/query updates (`docs/operations/observability/monitoring.md`) when new operations/metrics are introduced 5. Runbook updates (`docs/operations/*.md` or `docs/getting-started/troubleshooting.md`) for incident/upgrade impact -6. Release notes and changelog (`docs/releases/vX.Y.Z.md`, `docs/CHANGELOG.md`) +6. Release notes and changelog (consolidated in `releases/RELEASES.md`) 7. 
Entry-point navigation updates (`README.md`, `docs/README.md`) when docs scope expands ## Ownership and Review Cadence @@ -122,23 +144,23 @@ For behavior changes, update all relevant docs in the same PR: - Docs owners: project maintainers and reviewers for touched domain (`api`, `operations`, `security`) - Every functional change PR should include corresponding docs updates when behavior changes - Perform a monthly docs review for stale examples, outdated commands, and dead links -- During releases, verify `Last updated` metadata and append entries to `docs/CHANGELOG.md` +- During releases, verify `Last updated` metadata and append entries to `releases/RELEASES.md` Incident feedback policy: -- For Sev incidents, apply the [Postmortem to docs feedback loop](development/postmortem-doc-loop.md) +- For Sev incidents, apply the [Postmortem to docs feedback loop](#postmortem-feedback-loop) - Incident remediations should either update docs or record explicit no-doc-change rationale Quality KPIs: -- Track baseline docs quality via [Docs quality KPIs](development/docs-quality-kpis.md) +- Track baseline docs quality via [Docs quality KPIs](#quality-kpis) ## Docs Release Process 1. Update `Last updated` in every changed docs file 2. Update `docs/metadata.json` when release/API labels change 3. Add or update relevant examples if behavior/commands changed -4. Append a concise entry in `docs/CHANGELOG.md` for significant docs changes +4. Append a concise entry in `releases/RELEASES.md` for significant docs changes 5. 
Run `make docs-lint` before opening or merging PRs ## Release PR Docs QA Guard @@ -149,11 +171,280 @@ CI includes an API/docs guard for pull requests: PRs must include corresponding docs changes in at least one relevant docs area - This guard helps ensure API/runtime changes ship with docs, examples, and/or runbook updates +## Development and Testing + +### Useful Commands + +```bash +make build +make run-server +make run-migrate +make lint +make test +make test-with-db +make mocks +make docs-check-examples +``` + +### Run Specific Tests + +```bash +go test -v -race -run TestKekUseCase_Create ./internal/crypto/usecase +go test -v -race -run "TestKekUseCase_Create/Success" ./internal/crypto/usecase +``` + +### Test Databases + +```bash +make test-db-up +make test +make test-db-down +``` + +### Local Development Loop + +1. Update code +2. Run `make lint` +3. Run targeted tests +4. Run full `make test` + +## Docs Architecture Map + +This section defines canonical vs supporting docs to reduce duplication and drift. 
+ +### Canonical Sources + +| Topic | Canonical document | +| --- | --- | +| Release and API label metadata | `docs/metadata.json` | +| API contract subset | `docs/openapi.yaml` | +| Capability-to-endpoint mapping | `docs/api/fundamentals.md#capability-matrix` | +| Authorization path matcher semantics | `docs/api/auth/policies.md` (see [ADR 0003](adr/0003-capability-based-authorization-model.md)) | +| Rate limiting strategy | `docs/api/fundamentals.md#rate-limiting` (see [ADR 0006](adr/0006-dual-scope-rate-limiting-strategy.md)) | +| API versioning approach | `docs/api/fundamentals.md#compatibility-and-versioning-policy` (see [ADR 0007](adr/0007-path-based-api-versioning.md)) | +| Database support | `docs/configuration.md#database-configuration` (see [ADR 0004](adr/0004-dual-database-support.md)) | +| Transaction management | `docs/concepts/architecture.md` (see [ADR 0005](adr/0005-context-based-transaction-management.md)) | +| Runtime env configuration | `docs/configuration.md` | +| Production security posture | `docs/operations/security/hardening.md` | +| Release narrative | `docs/releases/vX.Y.Z.md` | +| Architectural decisions | `docs/adr/*.md` | + +### Supporting Documents + +| Area | Supporting docs | +| --- | --- | +| Onboarding | `docs/getting-started/*.md` | +| Endpoint behavior details | `docs/api/*.md` | +| Operations runbooks | `docs/operations/*.md` | +| Integration snippets | `docs/examples/*.md` | +| Docs process and governance | `docs/contributing.md` | + +### Sync Rules + +1. Update canonical source first +2. Propagate essential deltas to supporting docs +3. Update `CHANGELOG.md` for significant docs updates +4. 
Run docs checks before merge + +Recommended local validation: + +- `make docs-lint` +- `make docs-check-metadata` +- `make docs-check-release-tags` + +### CI/Tooling Guards + +- `docs/tools/check_docs_metadata.py`: release/API metadata and `Last updated` consistency +- `docs/tools/check_release_docs_links.py`: release docs link integrity in PRs +- `docs/tools/check_example_shapes.py`: JSON example structure sanity checks +- `docs/tools/check_release_image_tags.py`: pinned current-release Docker tag consistency + +### Drift Signals + +- Endpoint docs disagree with capability matrix +- Release references disagree with `docs/metadata.json` +- Examples use old response/error semantics +- Troubleshooting behavior diverges from runbooks + +## Docs Release Checklist + +Use this checklist for each release (`vX.Y.Z`) to keep docs consistent and navigable. + +### 1) Metadata and Release Labels + +- Update `docs/metadata.json`: + - `current_release` + - `last_docs_refresh` +- Ensure `README.md` and `docs/README.md` reflect the same current release + +### 2) Release Pages + +- Add release notes: `docs/releases/vX.Y.Z.md` +- Add upgrade guide when behavior/defaults change: `docs/releases/vX.Y.Z-upgrade.md` +- Start from templates: + - `docs/releases/_template.md` + - `docs/releases/_upgrade-template.md` +- Update release compatibility matrix: `docs/releases/compatibility-matrix.md` +- Promote new release links in docs indexes and operator runbooks + +### 3) API Contract and Examples + +- Update endpoint docs under `docs/api/*.md` for behavior/status changes +- Update `docs/openapi.yaml` for request/response changes +- Include `429` + `Retry-After` contract where protected routes can throttle +- Update at least curl plus one SDK/runtime example (`python`, `javascript`, or `go`) + +### 4) Operations and Runbooks + +- Update `docs/getting-started/*` for default/config changes +- Update `docs/getting-started/troubleshooting.md` for new failure modes +- Update `docs/operations/*` 
guidance for production impact + +### 5) Changelogs and Navigation + +- Update project changelog (`releases/RELEASES.md`) for release behavior and docs changes +- Verify links from: + - `README.md` + - `docs/README.md` + - `docs/operations/runbooks/README.md` + +#### Docker Tag Consistency Rule + +- Use unpinned image tag (`allisson/secrets`) in all documentation for simplicity and to avoid repeated version updates. +- Historical note: Prior to v0.8.0, pinned tags (`allisson/secrets:vX.Y.Z`) were used. This was changed to reduce maintenance overhead. +- Ensure Docker image reference consistency guard passes (`docs/tools/check_release_image_tags.py`). +- The validation script allows either current pinned tags or unpinned references, but flags outdated version pins. + +### 6) Validation Before Merge + +Run: + +```bash +make docs-lint +make docs-check-examples +make docs-check-metadata +make docs-check-release-tags +``` + +CI should also validate: + +- markdown lint and link checks +- docs metadata consistency +- OpenAPI validity +- release docs link guard for new `docs/releases/vX.Y.Z.md` additions + +## Documentation Management + +This section consolidates documentation quality, incident feedback loops, and backlog management into one operational guide. + +### Quality KPIs + +Use these KPIs to track documentation reliability and operational usefulness. 
+ +#### Core KPIs + +| KPI | Target | Source | +| --- | --- | --- | +| Docs lint/link pass rate | 100% on main and PRs | CI (`make docs-lint`) | +| Stale high-risk pages (API/ops/getting-started) | 0 pages older than SLA | freshness check (Phase 4 PR 1) | +| Incident triage time-to-first-runbook | <= 5 minutes | on-call postmortems | +| Docs-related incident follow-up completion | 100% for Sev incidents | incident action tracker | +| Broken internal anchor count | 0 | anchor guard (Phase 4 PR 2) | + +#### Review Cadence + +- Weekly: CI quality metrics (lint/link/check failures) +- Monthly: freshness + ownership review +- After Sev incidents: triage path clarity and runbook updates + +#### Escalation Triggers + +- Repeated docs-check CI failures for 2+ weeks +- 2+ incidents in a month citing missing/unclear docs guidance +- Freshness SLA misses in API/operations docs + +### Postmortem Feedback Loop + +Use this process to ensure incidents continuously improve operational documentation. + +#### Policy + +For every Sev incident, include one of the following outcomes in the postmortem: + +1. Docs updated in the same remediation PR +2. Explicit note: "No documentation change needed" with rationale + +#### Required Fields in Postmortem + +- Runbook used first +- Time to first useful doc reference +- Missing/ambiguous docs sections +- Docs updates created (path + PR link) + +#### Minimal Workflow + +1. Incident is resolved +2. Owner identifies doc gaps from timeline +3. Patch docs or record no-change rationale +4. Update `releases/RELEASES.md` if docs changed +5. Confirm docs checks pass before merge + +#### Suggested SLA + +- Sev 1-2 incidents: docs follow-up within 2 business days +- Sev 3 incidents: docs follow-up within 5 business days + +### Master Backlog + +This section consolidates all documentation improvement initiatives into one prioritized execution sequence. 
+ +#### P0 (Immediate) + +| Item | Effort | Dependency | +| --- | --- | --- | +| Incident decision tree and first-15-minutes playbook | S | none | +| Operator/developer day-0 walkthrough paths | S | none | +| Known limitations page for ops/security expectations | S | none | + +#### P1 (Near-term) + +| Item | Effort | Dependency | +| --- | --- | --- | +| Freshness SLA check + CI | M | policy alignment | +| Internal anchor integrity check + CI | M | docs tooling baseline | +| OpenAPI-to-doc coverage guard | M | endpoint mapping config | +| Example parity checks across runtimes | M | examples conventions | + +#### P2 (Governance) + +| Item | Effort | Dependency | +| --- | --- | --- | +| Docs ownership matrix and review cadence page | S | team owner mapping | +| Postmortem-to-doc feedback loop policy | S | incident process agreement | +| Docs KPI reporting page and monthly review process | S | CI metrics visibility | + +#### P3 (Maturity) + +| Item | Effort | Dependency | +| --- | --- | --- | +| API contracts/invariants canonical page | M | API doc harmonization | +| Release audience diff summaries (users/operators/security) | M | release template update | +| Search vocabulary normalization pass | S | page owners for key docs | + +#### Suggested Execution Sequence + +1. Complete P0 content and navigation updates +2. Implement P1 checks in CI with low-noise defaults +3. Formalize P2 governance and cadence +4. 
Deliver P3 consistency and release communication upgrades + ## See also - [Documentation index](README.md) -- [Testing guide](development/testing.md) -- [Docs release checklist](development/docs-release-checklist.md) -- [Docs architecture map](development/docs-architecture-map.md) -- [Changelog](CHANGELOG.md) +- [Changelog](releases/RELEASES.md) - [Local development](getting-started/local-development.md) +- [Smoke test](getting-started/smoke-test.md) +- [Troubleshooting](getting-started/troubleshooting.md) +- [Incident response guide](operations/observability/incident-response.md) +- [API compatibility policy](api/fundamentals.md#compatibility-and-versioning-policy) +- [Production rollout golden path](operations/deployment/production-rollout.md) diff --git a/docs/development/docs-architecture-map.md b/docs/development/docs-architecture-map.md deleted file mode 100644 index 3259b22..0000000 --- a/docs/development/docs-architecture-map.md +++ /dev/null @@ -1,60 +0,0 @@ -# 🗺️ Docs Architecture Map - -> Last updated: 2026-02-20 - -This page defines canonical vs supporting docs to reduce duplication and drift. 
- -## Canonical Sources - -| Topic | Canonical document | -| --- | --- | -| Release and API label metadata | `docs/metadata.json` | -| API contract subset | `docs/openapi.yaml` | -| Capability-to-endpoint mapping | `docs/api/capability-matrix.md` | -| Authorization path matcher semantics | `docs/api/policies.md` | -| Runtime env configuration | `docs/configuration/environment-variables.md` | -| Production security posture | `docs/operations/security-hardening.md` | -| Release narrative | `docs/releases/vX.Y.Z.md` | - -## Supporting Documents - -| Area | Supporting docs | -| --- | --- | -| Onboarding | `docs/getting-started/*.md` | -| Endpoint behavior details | `docs/api/*.md` | -| Operations runbooks | `docs/operations/*.md` | -| Integration snippets | `docs/examples/*.md` | -| Docs process and governance | `docs/contributing.md`, `docs/development/*.md` | - -## Sync Rules - -1. Update canonical source first -2. Propagate essential deltas to supporting docs -3. Update `docs/CHANGELOG.md` for significant docs updates -4. 
Run docs checks before merge - -Recommended local validation: - -- `make docs-lint` -- `make docs-check-metadata` -- `make docs-check-release-tags` - -## CI/Tooling Guards - -- `docs/tools/check_docs_metadata.py`: release/API metadata and `Last updated` consistency -- `docs/tools/check_release_docs_links.py`: release docs link integrity in PRs -- `docs/tools/check_example_shapes.py`: JSON example structure sanity checks -- `docs/tools/check_release_image_tags.py`: pinned current-release Docker tag consistency - -## Drift Signals - -- Endpoint docs disagree with capability matrix -- Release references disagree with `docs/metadata.json` -- Examples use old response/error semantics -- Troubleshooting behavior diverges from runbooks - -## See also - -- [Documentation contributing guide](../contributing.md) -- [Docs release checklist](docs-release-checklist.md) -- [Documentation index](../README.md) diff --git a/docs/development/docs-master-backlog.md b/docs/development/docs-master-backlog.md deleted file mode 100644 index 982242a..0000000 --- a/docs/development/docs-master-backlog.md +++ /dev/null @@ -1,51 +0,0 @@ -# πŸ—‚οΈ Docs Master Backlog - -> Last updated: 2026-02-20 - -This page consolidates Phase 3, Phase 4, and maturity follow-ups into one prioritized execution sequence. 
- -## P0 (Immediate) - -| Item | Effort | Dependency | -| --- | --- | --- | -| Incident decision tree and first-15-minutes playbook | S | none | -| Operator/developer day-0 walkthrough paths | S | none | -| Known limitations page for ops/security expectations | S | none | - -## P1 (Near-term) - -| Item | Effort | Dependency | -| --- | --- | --- | -| Freshness SLA check + CI | M | policy alignment | -| Internal anchor integrity check + CI | M | docs tooling baseline | -| OpenAPI-to-doc coverage guard | M | endpoint mapping config | -| Example parity checks across runtimes | M | examples conventions | - -## P2 (Governance) - -| Item | Effort | Dependency | -| --- | --- | --- | -| Docs ownership matrix and review cadence page | S | team owner mapping | -| Postmortem-to-doc feedback loop policy | S | incident process agreement | -| Docs KPI reporting page and monthly review process | S | CI metrics visibility | - -## P3 (Maturity) - -| Item | Effort | Dependency | -| --- | --- | --- | -| API contracts/invariants canonical page | M | API doc harmonization | -| Release audience diff summaries (users/operators/security) | M | release template update | -| Search vocabulary normalization pass | S | page owners for key docs | - -## Suggested execution sequence - -1. Complete P0 content and navigation updates -2. Implement P1 checks in CI with low-noise defaults -3. Formalize P2 governance and cadence -4. 
Deliver P3 consistency and release communication upgrades - -## See also - -- [Docs phase 3 roadmap](docs-phase-3-roadmap.md) -- [Docs phase 4 roadmap](docs-phase-4-roadmap.md) -- [Docs release checklist](docs-release-checklist.md) diff --git a/docs/development/docs-phase-3-roadmap.md b/docs/development/docs-phase-3-roadmap.md deleted file mode 100644 index 03f53d5..0000000 --- a/docs/development/docs-phase-3-roadmap.md +++ /dev/null @@ -1,95 +0,0 @@ -# πŸ›£οΈ Docs Phase 3 Roadmap - -> Last updated: 2026-02-20 - -This roadmap captures next-step documentation improvements after the `v0.7.0` release prep -and Phase 2 operator hardening updates. - -## Objectives - -- Reduce time-to-troubleshoot for operators -- Improve contract clarity for API consumers -- Lower long-term documentation drift risk - -## Quick Wins (same-day) - -1. Add an API contracts hub page (`docs/api/contracts.md`) and link from all endpoint pages -2. Add task-based operator navigation (`deploy`, `debug auth`, `debug 429`, `rotate keys`) in runbook index -3. Add negative examples in `docs/examples/*` for common `401/403/422/429` paths -4. Add glossary backlinks for core terms in security and operations pages - -## Medium Scope (1 PR) - -1. Create release cut companion checklist covering non-doc release actions: - - tag creation validation - - image publish and pull verification - - rollback artifact verification -2. Add a docs ownership matrix by domain: - - API docs ownership - - operations/runbook ownership - - release docs ownership - -## Deeper Scope (1-2 PRs) - -1. Build a canonical API contract invariants page with explicit guarantees: - - ciphertext input/output contracts - - error response structure guarantees - - versioning and compatibility expectations -2. Add cross-page consistency guards for contract terms (light static checks) - -## Suggested PR Breakdown - -1. **PR A (Quick wins):** contracts hub + operator task nav + negative examples -2. 
**PR B (Governance):** release cut companion checklist + docs ownership matrix -3. **PR C (Contracts hardening):** invariants page + consistency checks - -## Definition of Done (Phase 3) - -- All endpoint docs link to shared contracts page -- Runbook index includes task-based operator entry points -- Examples include at least one negative flow per major API area -- Release docs include non-doc release validation links -- Docs ownership matrix is published and linked from docs architecture map - -## Prioritized Backlog (S/M/L + dependencies) - -| Priority | Initiative | Effort | Dependencies | Why now | -| --- | --- | --- | --- | --- | -| P0 | Docs decision tree for incident triage (`401/403/429/5xx`) | S | none | Fastest operator navigation win during incidents | -| P0 | First 15 minutes incident playbook with copy/paste commands | S | none | Reduces on-call ambiguity and response time | -| P1 | OpenAPI-to-doc coverage guard (endpoint reference consistency) | M | stable endpoint docs links | Prevents contract docs drift over time | -| P1 | Example parity checks across curl/python/js/go | M | examples folder conventions | Keeps multi-language guidance consistent | -| P2 | Docs ownership metadata (owner + review cadence) | S | team ownership agreement | Improves freshness accountability | -| P2 | Release audience diff pages (users/operators/security) | M | release template updates | Speeds release communication and change impact review | - -## Suggested execution order - -1. Deliver P0 items together in one quick PR -2. Deliver P1 checks with CI integration in a second PR -3. 
Deliver P2 governance/reporting items in a third PR - -## Risks and mitigations - -- Risk: extra static checks create noisy CI failures - - Mitigation: start in warning mode locally, then enforce in CI after one release cycle -- Risk: ownership metadata becomes stale - - Mitigation: include ownership review in release checklist cadence -- Risk: endpoint-doc mapping false positives for grouped docs pages - - Mitigation: allow mapping config file for intentional many-to-one endpoint coverage - -## Validation - -Run before merge: - -```bash -make docs-lint -make docs-check-examples -make docs-check-metadata -make docs-check-release-tags -``` - -## See also - -- [Docs architecture map](docs-architecture-map.md) -- [Docs release checklist](docs-release-checklist.md) -- [Operator runbook index](../operations/runbook-index.md) diff --git a/docs/development/docs-phase-4-roadmap.md b/docs/development/docs-phase-4-roadmap.md deleted file mode 100644 index 9b8d0a6..0000000 --- a/docs/development/docs-phase-4-roadmap.md +++ /dev/null @@ -1,112 +0,0 @@ -# 🧭 Docs Phase 4 Micro-Roadmap - -> Last updated: 2026-02-20 - -This phase focuses on documentation process quality, freshness visibility, and guardrails. - -## Scope - -- Keep improvements small and enforceable -- Prefer CI-backed checks over manual reminders -- Ship in 3 focused PRs - -## PR 1: Freshness SLA and Stale Page Guard - -### PR 1 Goal - -Detect stale docs pages before drift becomes operational risk. 
- -### PR 1 Changes - -- Add `docs/tools/check_docs_freshness.py` -- Add `make docs-check-freshness` -- Add CI step in `.github/workflows/ci.yml` -- Add freshness policy section in `docs/contributing.md` - -### PR 1 Rule set (starter) - -- Fail if `> Last updated:` is older than 120 days for: - - `docs/api/*.md` - - `docs/operations/*.md` - - `docs/getting-started/*.md` -- Exclude historical release pages and ADR pages - -## PR 2: Internal Anchor Integrity Guard - -### PR 2 Goal - -Catch broken section links when headings change in long docs. - -### PR 2 Changes - -- Add `docs/tools/check_internal_anchors.py` -- Add `make docs-check-anchors` -- Add CI step in `.github/workflows/ci.yml` -- Document anchor-link practices in `docs/development/docs-architecture-map.md` - -### PR 2 Rule set (starter) - -- Validate local markdown links with fragments (e.g., `file.md#section-heading`) -- Fail when target file exists but fragment no longer resolves - -## PR 3: Command Validation Markers + Persona Entrypoints - -### PR 3 Goal - -Improve trust in copy/paste blocks and speed onboarding by audience. - -### PR 3 Changes - -- Add command validation markers to critical pages: - - `docs/operations/production-rollout.md` - - `docs/operations/production.md` - - `docs/getting-started/troubleshooting.md` - - `docs/getting-started/smoke-test.md` -- Add persona landing pages: - - `docs/personas/operator.md` - - `docs/personas/developer.md` - - `docs/personas/security.md` -- Link persona pages from `docs/README.md` - -### PR 3 Marker format (starter) - -Use a compact marker above critical command blocks: - -```text -> Command status: verified on YYYY-MM-DD -``` - -## Dependencies and Order - -1. PR 1 (freshness) first -2. PR 2 (anchors) second -3. 
PR 3 (usability) third - -## Success Criteria - -- Freshness check runs in CI and fails stale high-risk pages -- Anchor check runs in CI and prevents broken section links -- Critical command blocks include validation markers -- Persona pages provide a shortest-path doc flow by role - -## Validation Commands - -```bash -make docs-lint -make docs-check-examples -make docs-check-metadata -make docs-check-release-tags -``` - -After PR 1 and PR 2: - -```bash -make docs-check-freshness -make docs-check-anchors -``` - -## See also - -- [Docs phase 3 roadmap](docs-phase-3-roadmap.md) -- [Docs release checklist](docs-release-checklist.md) -- [Docs architecture map](docs-architecture-map.md) diff --git a/docs/development/docs-quality-kpis.md b/docs/development/docs-quality-kpis.md deleted file mode 100644 index e729ada..0000000 --- a/docs/development/docs-quality-kpis.md +++ /dev/null @@ -1,33 +0,0 @@ -# 📈 Docs Quality KPIs - -> Last updated: 2026-02-20 - -Use these KPIs to track documentation reliability and operational usefulness. 
- -## Core KPIs - -| KPI | Target | Source | -| --- | --- | --- | -| Docs lint/link pass rate | 100% on main and PRs | CI (`make docs-lint`) | -| Stale high-risk pages (API/ops/getting-started) | 0 pages older than SLA | freshness check (Phase 4 PR 1) | -| Incident triage time-to-first-runbook | <= 5 minutes | on-call postmortems | -| Docs-related incident follow-up completion | 100% for Sev incidents | incident action tracker | -| Broken internal anchor count | 0 | anchor guard (Phase 4 PR 2) | - -## Review cadence - -- Weekly: CI quality metrics (lint/link/check failures) -- Monthly: freshness + ownership review -- After Sev incidents: triage path clarity and runbook updates - -## Escalation triggers - -- Repeated docs-check CI failures for 2+ weeks -- 2+ incidents in a month citing missing/unclear docs guidance -- Freshness SLA misses in API/operations docs - -## See also - -- [Documentation contributing guide](../contributing.md) -- [Postmortem to docs feedback loop](postmortem-doc-loop.md) -- [Docs phase 4 roadmap](docs-phase-4-roadmap.md) diff --git a/docs/development/docs-release-checklist.md b/docs/development/docs-release-checklist.md deleted file mode 100644 index b0821d8..0000000 --- a/docs/development/docs-release-checklist.md +++ /dev/null @@ -1,77 +0,0 @@ -# 🧾 Docs Release Checklist - -> Last updated: 2026-02-20 - -Use this checklist for each release (`vX.Y.Z`) to keep docs consistent and navigable. 
- -## 1) Metadata and release labels - -- Update `docs/metadata.json`: - - `current_release` - - `last_docs_refresh` -- Ensure `README.md` and `docs/README.md` reflect the same current release - -## 2) Release pages - -- Add release notes: `docs/releases/vX.Y.Z.md` -- Add upgrade guide when behavior/defaults change: `docs/releases/vX.Y.Z-upgrade.md` -- Start from templates: - - `docs/releases/_template.md` - - `docs/releases/_upgrade-template.md` -- Update release compatibility matrix: `docs/releases/compatibility-matrix.md` -- Promote new release links in docs indexes and operator runbooks - -## 3) API contract and examples - -- Update endpoint docs under `docs/api/*.md` for behavior/status changes -- Update `docs/openapi.yaml` for request/response changes -- Include `429` + `Retry-After` contract where protected routes can throttle -- Update at least curl plus one SDK/runtime example (`python`, `javascript`, or `go`) - -## 4) Operations and runbooks - -- Update `docs/getting-started/*` for default/config changes -- Update `docs/getting-started/troubleshooting.md` for new failure modes -- Update `docs/operations/*` guidance for production impact - -## 5) Changelogs and navigation - -- Update project changelog (`CHANGELOG.md`) for release-level behavior -- Update docs changelog (`docs/CHANGELOG.md`) for docs scope/process updates -- Verify links from: - - `README.md` - - `docs/README.md` - - `docs/operations/runbook-index.md` - -### Docker tag consistency rule - -- Use pinned image tags (`allisson/secrets:vX.Y.Z`) in release guides, rollout runbooks, and copy/paste commands - intended for reproducible operations. -- Use `allisson/secrets:latest` only in explicitly marked fast-iteration/dev-only examples. -- In one document, avoid mixing pinned and `latest` tags unless the distinction is explicitly explained. -- Ensure current-release pinned tag consistency guard passes (`docs/tools/check_release_image_tags.py`). 
- -## 6) Validation before merge - -Run: - -```bash -make docs-lint -make docs-check-examples -make docs-check-metadata -make docs-check-release-tags -``` - -CI should also validate: - -- markdown lint and link checks -- docs metadata consistency -- OpenAPI validity -- release docs link guard for new `docs/releases/vX.Y.Z.md` additions - -## See also - -- [Documentation contributing guide](../contributing.md) -- [Documentation changelog](../CHANGELOG.md) -- [API compatibility policy](../api/versioning-policy.md) -- [Production rollout golden path](../operations/production-rollout.md) diff --git a/docs/development/postmortem-doc-loop.md b/docs/development/postmortem-doc-loop.md deleted file mode 100644 index f1edc8d..0000000 --- a/docs/development/postmortem-doc-loop.md +++ /dev/null @@ -1,38 +0,0 @@ -# πŸ” Postmortem to Docs Feedback Loop - -> Last updated: 2026-02-20 - -Use this process to ensure incidents continuously improve operational documentation. - -## Policy - -For every Sev incident, include one of the following outcomes in the postmortem: - -1. Docs updated in the same remediation PR -2. Explicit note: "No documentation change needed" with rationale - -## Required fields in postmortem - -- Runbook used first -- Time to first useful doc reference -- Missing/ambiguous docs sections -- Docs updates created (path + PR link) - -## Minimal workflow - -1. Incident is resolved -2. Owner identifies doc gaps from timeline -3. Patch docs or record no-change rationale -4. Update `docs/CHANGELOG.md` if docs changed -5. 
Confirm docs checks pass before merge - -## Suggested SLA - -- Sev 1-2 incidents: docs follow-up within 2 business days -- Sev 3 incidents: docs follow-up within 5 business days - -## See also - -- [Failure playbooks](../operations/failure-playbooks.md) -- [Incident decision tree](../operations/incident-decision-tree.md) -- [Docs quality KPIs](docs-quality-kpis.md) diff --git a/docs/development/testing.md b/docs/development/testing.md deleted file mode 100644 index e22f2c7..0000000 --- a/docs/development/testing.md +++ /dev/null @@ -1,45 +0,0 @@ -# 🛠️ Development and Testing - -> Last updated: 2026-02-14 - -## Useful commands - -```bash -make build -make run-server -make run-migrate -make lint -make test -make test-with-db -make mocks -make docs-check-examples -``` - -## Run specific tests - -```bash -go test -v -race -run TestKekUseCase_Create ./internal/crypto/usecase -go test -v -race -run "TestKekUseCase_Create/Success" ./internal/crypto/usecase -``` - -## Test databases - -```bash -make test-db-up -make test -make test-db-down -``` - -## Local development loop - -1. Update code -2. Run `make lint` -3. Run targeted tests -4. Run full `make test` - -## See also - -- [Local development](../getting-started/local-development.md) -- [Smoke test](../getting-started/smoke-test.md) -- [Troubleshooting](../getting-started/troubleshooting.md) -- [Contributing guide](../contributing.md) diff --git a/docs/examples/README.md b/docs/examples/README.md new file mode 100644 index 0000000..ec716e3 --- /dev/null +++ b/docs/examples/README.md @@ -0,0 +1,59 @@ +# 🧪 Code Examples + +> Last updated: 2026-02-20 + +Complete code examples for integrating with Secrets APIs across multiple languages and releases. 
+ +## πŸ“‘ Quick Navigation + +**By Language**: + +- [Curl](curl.md) - Command-line examples +- [Python](python.md) - Python client examples +- [JavaScript](javascript.md) - Node.js client examples +- [Go](go.md) - Go client examples + +**By Version**: See [Version Compatibility](#version-compatibility) below + +--- + +## Version Compatibility + +Use this section to quickly identify which example set matches your deployed release. + +### Current release (`v0.8.0`) + +- Primary examples: + - [Curl examples](curl.md) + - [Python examples](python.md) + - [JavaScript examples](javascript.md) + - [Go examples](go.md) +- Release context: + - [v0.8.0 release notes](../releases/RELEASES.md#080---2026-02-20) + +### Previous release (`v0.7.0`) + +- Backward context: + - [v0.7.0 release notes](../releases/RELEASES.md#070---2026-02-20) + +### Compatibility notes + +- Example payloads and status codes follow current API docs (`/v1/*`) +- For endpoint-specific behavior changes, read release notes first +- For throttling behavior, validate `429` + `Retry-After` handling in your client runtime + +--- + +## Getting Started + +1. Choose your language from the list above +2. Check version compatibility if using an older release +3. Review authentication patterns (all examples use Bearer tokens) +4. Adapt examples to your use case + +## See also + +- [Authentication API](../api/auth/authentication.md) +- [API error decision matrix](../api/fundamentals.md#error-decision-matrix) +- [API rate limiting](../api/fundamentals.md#rate-limiting) +- [Release notes](../releases/RELEASES.md) diff --git a/docs/examples/curl.md b/docs/examples/curl.md index ca9a6c3..aebeffc 100644 --- a/docs/examples/curl.md +++ b/docs/examples/curl.md @@ -1,13 +1,13 @@ # πŸ§ͺ Curl Examples -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 ⚠️ Security Warning: base64 is encoding, not encryption. Always use HTTPS/TLS. End-to-end shell workflow. Need first credentials? 
Create an API client with `app create-client` first. -See [CLI commands reference](../cli/commands.md). +See [CLI commands reference](../cli-commands.md). ## Bootstrap @@ -187,8 +187,8 @@ Deterministic caveat: ## See also -- [Authentication API](../api/authentication.md) -- [Secrets API](../api/secrets.md) -- [Transit API](../api/transit.md) -- [Clients API](../api/clients.md) -- [API rate limiting](../api/rate-limiting.md) +- [Authentication API](../api/auth/authentication.md) +- [Secrets API](../api/data/secrets.md) +- [Transit API](../api/data/transit.md) +- [Clients API](../api/auth/clients.md) +- [API rate limiting](../api/fundamentals.md#rate-limiting) diff --git a/docs/examples/go.md b/docs/examples/go.md index 6762880..9e5e18d 100644 --- a/docs/examples/go.md +++ b/docs/examples/go.md @@ -1,6 +1,6 @@ # 🐹 Go Examples -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 ⚠️ Security Warning: base64 is encoding, not encryption. Always use HTTPS/TLS. @@ -274,9 +274,9 @@ Rate-limit note: ## See also -- [Authentication API](../api/authentication.md) -- [Secrets API](../api/secrets.md) -- [Transit API](../api/transit.md) -- [Tokenization API](../api/tokenization.md) -- [Response shapes](../api/response-shapes.md) -- [API rate limiting](../api/rate-limiting.md) +- [Authentication API](../api/auth/authentication.md) +- [Secrets API](../api/data/secrets.md) +- [Transit API](../api/data/transit.md) +- [Tokenization API](../api/data/tokenization.md) +- [Response shapes](../api/observability/response-shapes.md) +- [API rate limiting](../api/fundamentals.md#rate-limiting) diff --git a/docs/examples/javascript.md b/docs/examples/javascript.md index 7b1f611..5865f5f 100644 --- a/docs/examples/javascript.md +++ b/docs/examples/javascript.md @@ -1,6 +1,6 @@ # 🟨 JavaScript Examples -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 ⚠️ Security Warning: base64 is encoding, not encryption. Always use HTTPS/TLS. 
@@ -185,9 +185,9 @@ Rate-limit note: ## See also -- [Authentication API](../api/authentication.md) -- [Secrets API](../api/secrets.md) -- [Transit API](../api/transit.md) -- [Tokenization API](../api/tokenization.md) -- [Response shapes](../api/response-shapes.md) -- [API rate limiting](../api/rate-limiting.md) +- [Authentication API](../api/auth/authentication.md) +- [Secrets API](../api/data/secrets.md) +- [Transit API](../api/data/transit.md) +- [Tokenization API](../api/data/tokenization.md) +- [Response shapes](../api/observability/response-shapes.md) +- [API rate limiting](../api/fundamentals.md#rate-limiting) diff --git a/docs/examples/python.md b/docs/examples/python.md index 1a7583d..d3bb813 100644 --- a/docs/examples/python.md +++ b/docs/examples/python.md @@ -1,6 +1,6 @@ # 🐍 Python Examples -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 ⚠️ Security Warning: base64 is encoding, not encryption. Always use HTTPS/TLS. @@ -167,9 +167,9 @@ Rate-limit note: ## See also -- [Authentication API](../api/authentication.md) -- [Secrets API](../api/secrets.md) -- [Transit API](../api/transit.md) -- [Tokenization API](../api/tokenization.md) -- [Response shapes](../api/response-shapes.md) -- [API rate limiting](../api/rate-limiting.md) +- [Authentication API](../api/auth/authentication.md) +- [Secrets API](../api/data/secrets.md) +- [Transit API](../api/data/transit.md) +- [Tokenization API](../api/data/tokenization.md) +- [Response shapes](../api/observability/response-shapes.md) +- [API rate limiting](../api/fundamentals.md#rate-limiting) diff --git a/docs/examples/versioned-by-release.md b/docs/examples/versioned-by-release.md deleted file mode 100644 index 1acb2c8..0000000 --- a/docs/examples/versioned-by-release.md +++ /dev/null @@ -1,34 +0,0 @@ -# πŸ§ͺ Versioned Examples by Release - -> Last updated: 2026-02-20 - -Use this page to quickly identify which example set matches your deployed release. 
- -## Current release (`v0.7.0`) - -- Primary examples: - - [Curl examples](curl.md) - - [Python examples](python.md) - - [JavaScript examples](javascript.md) - - [Go examples](go.md) -- Release context: - - [v0.7.0 release notes](../releases/v0.7.0.md) - - [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) - -## Previous release (`v0.6.0`) - -- Backward context: - - [v0.6.0 release notes](../releases/v0.6.0.md) - - [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md) - -## Compatibility notes - -- Example payloads and status codes follow current API docs (`/v1/*`) -- For endpoint-specific behavior changes, read release notes first -- For throttling behavior, validate `429` + `Retry-After` handling in your client runtime - -## See also - -- [Authentication API](../api/authentication.md) -- [API error decision matrix](../api/error-decision-matrix.md) -- [API rate limiting](../api/rate-limiting.md) diff --git a/docs/getting-started/day-0-developer.md b/docs/getting-started/day-0-developer.md deleted file mode 100644 index 8db387c..0000000 --- a/docs/getting-started/day-0-developer.md +++ /dev/null @@ -1,42 +0,0 @@ -# πŸ’» Day 0 Developer Walkthrough - -> Last updated: 2026-02-20 - -Use this path for first-time contributors integrating with Secrets APIs. 
- -## Step 1: Run locally - -- Follow: [Run locally](local-development.md) -- Build and start API, then verify health - -## Step 2: Understand auth + policy behavior - -- Read: [Authentication API](../api/authentication.md) -- Read: [Policies cookbook](../api/policies.md) -- Read: [Capability matrix](../api/capability-matrix.md) - -## Step 3: Validate error and retry behavior - -- Read: [API error decision matrix](../api/error-decision-matrix.md) -- Read: [API rate limiting](../api/rate-limiting.md) - -## Step 4: Use examples by runtime - -- Start with: [Versioned examples by release](../examples/versioned-by-release.md) -- Then use: [Curl](../examples/curl.md), [Python](../examples/python.md), [JavaScript](../examples/javascript.md), [Go](../examples/go.md) - -## Step 5: Follow docs contribution quality bar - -- Read: [Documentation contributing guide](../contributing.md) -- Use: [Docs release checklist](../development/docs-release-checklist.md) - -## Expected outcomes - -- You can obtain tokens and call protected endpoints reliably -- You can distinguish authn/authz/throttling failures in client integrations -- You can submit feature PRs with aligned API + docs changes - -## See also - -- [Testing guide](../development/testing.md) -- [Docs architecture map](../development/docs-architecture-map.md) diff --git a/docs/getting-started/day-0-operator.md b/docs/getting-started/day-0-operator.md deleted file mode 100644 index bdd0869..0000000 --- a/docs/getting-started/day-0-operator.md +++ /dev/null @@ -1,42 +0,0 @@ -# 🧭 Day 0 Operator Walkthrough - -> Last updated: 2026-02-20 - -Use this linear path for first-time operations onboarding. 
- -## Step 1: Bring up a local baseline - -- Follow: [Run with Docker](docker.md) -- Verify: `GET /health`, `GET /ready` - -## Step 2: Validate core flows - -- Run: [Smoke test script](smoke-test.md) -- Confirm token issuance, secrets, and transit checks pass - -## Step 3: Learn rollout and rollback flow - -- Read: [Production rollout golden path](../operations/production-rollout.md) -- Focus: verification gates and rollback triggers - -## Step 4: Learn incident response path - -- Use: [Incident decision tree](../operations/incident-decision-tree.md) -- Drill: [First 15 Minutes Playbook](../operations/first-15-minutes.md) - -## Step 5: Harden production posture - -- Read: [Production deployment guide](../operations/production.md) -- Read: [Security hardening guide](../operations/security-hardening.md) -- Check: [Known limitations](../operations/known-limitations.md) - -## Expected outcomes - -- You can validate service health and auth quickly -- You can identify `401/403/429/5xx` primary runbook path -- You can execute a basic rollback trigger decision under pressure - -## See also - -- [Operator quick card](../operations/operator-quick-card.md) -- [Operator runbook index](../operations/runbook-index.md) diff --git a/docs/getting-started/day-0-walkthrough.md b/docs/getting-started/day-0-walkthrough.md new file mode 100644 index 0000000..d54da38 --- /dev/null +++ b/docs/getting-started/day-0-walkthrough.md @@ -0,0 +1,96 @@ +# 🧭 Day 0 Walkthrough + +> Last updated: 2026-02-20 + +Choose your onboarding path based on your role: + +## 📑 Quick Navigation + +- [👷 Operator Path](#operator-path) - For deployment and operations +- [👨‍💻 Developer Path](#developer-path) - For API integration + +--- + +## 👷 Operator Path {#operator-path} + +Use this linear path for first-time operations onboarding. 
+ +### Step 1: Bring up a local baseline + +- Follow: [Run with Docker](docker.md) +- Verify: `GET /health`, `GET /ready` + +### Step 2: Validate core flows + +- Run: [Smoke test script](smoke-test.md) +- Confirm token issuance, secrets, and transit checks pass + +### Step 3: Learn rollout and rollback flow + +- Read: [Production rollout golden path](../operations/deployment/production-rollout.md) +- Focus: verification gates and rollback triggers + +### Step 4: Learn incident response path + +- Use: [Incident response guide](../operations/observability/incident-response.md) +- Drill: [Operator drills](../operations/runbooks/README.md#operator-drills-quarterly) + +### Step 5: Harden production posture + +- Read: [Production deployment guide](../operations/deployment/production.md) +- Read: [Security hardening guide](../operations/security/hardening.md) +- Check: [Known limitations](../operations/deployment/production.md#known-limitations-and-tradeoffs) + +### Expected outcomes + +- You can validate service health and auth quickly +- You can identify the primary runbook path for `401/403/429/5xx` responses +- You can execute a basic rollback trigger decision under pressure + +### See also + +- [Operator quick card](../operations/runbooks/README.md#operator-quick-card) +- [Operator runbook index](../operations/runbooks/README.md) + +--- + +## 👨‍💻 Developer Path {#developer-path} + +Use this path for first-time contributors integrating with Secrets APIs. 
+ +### Step 1: Run locally + +- Follow: [Run locally](local-development.md) +- Build and start API, then verify health + +### Step 2: Understand auth + policy behavior + +- Read: [Authentication API](../api/auth/authentication.md) +- Read: [Policies cookbook](../api/auth/policies.md) +- Read: [Capability matrix](../api/fundamentals.md#capability-matrix) + +### Step 3: Validate error and retry behavior + +- Read: [API error decision matrix](../api/fundamentals.md#error-decision-matrix) +- Read: [API rate limiting](../api/fundamentals.md#rate-limiting) + +### Step 4: Use examples by runtime + +- Start with: [Code examples](../examples/README.md) +- Then use: [Curl](../examples/curl.md), [Python](../examples/python.md), [JavaScript](../examples/javascript.md), [Go](../examples/go.md) + +### Step 5: Follow docs contribution quality bar + +- Read: [Documentation contributing guide](../contributing.md) +- Use: [Docs release checklist](../contributing.md#docs-release-checklist) + +### Expected outcomes + +- You can obtain tokens and call protected endpoints reliably +- You can distinguish authn/authz/throttling failures in client integrations +- You can submit feature PRs with aligned API + docs changes + +### See also + +- [Development and testing](../contributing.md#development-and-testing) +- [Docs architecture map](../contributing.md#docs-architecture-map) diff --git a/docs/getting-started/docker.md b/docs/getting-started/docker.md index 2f6cc4d..efb3956 100644 --- a/docs/getting-started/docker.md +++ b/docs/getting-started/docker.md @@ -4,10 +4,9 @@ This is the default way to run Secrets. -For release reproducibility, this guide uses the pinned image tag `allisson/secrets:v0.7.0`. -For dev-only fast iteration, you can use `allisson/secrets:latest`. +This guide uses the latest Docker image (`allisson/secrets`). -**⚠️ Security Warning:** This guide is for **development and testing only**. 
For production deployments, see [Security Hardening Guide](../operations/security-hardening.md) and [Production Deployment Guide](../operations/production.md). +**⚠️ Security Warning:** This guide is for **development and testing only**. For production deployments, see [Security Hardening Guide](../operations/security/hardening.md) and [Production Deployment Guide](../operations/deployment/production.md). ## Current Security Defaults @@ -16,16 +15,16 @@ For dev-only fast iteration, you can use `allisson/secrets:latest`. - `RATE_LIMIT_TOKEN_ENABLED` default is `true` (per IP on `POST /v1/token`) - `CORS_ENABLED` default is `false` -These defaults were introduced in `v0.5.0` and now include token-endpoint rate limiting in `v0.7.0`. +These defaults were introduced in `v0.5.0`, with token-endpoint rate limiting added in `v0.7.0` (current: `v0.8.0`). -If upgrading from `v0.6.0`, review [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md). +If upgrading from `v0.6.0`, review [v0.7.0 upgrade guide](../releases/RELEASES.md#070---2026-02-20). 
## ⚑ Quickstart Copy Block Use this minimal flow when you just want to get a working instance quickly: ```bash -docker pull allisson/secrets:v0.7.0 +docker pull allisson/secrets docker network create secrets-net || true docker run -d --name secrets-postgres --network secrets-net \ @@ -34,19 +33,19 @@ docker run -d --name secrets-postgres --network secrets-net \ -e POSTGRES_DB=mydb \ postgres:16-alpine -docker run --rm allisson/secrets:v0.7.0 create-master-key --id default +docker run --rm allisson/secrets create-master-key --id default # copy generated MASTER_KEYS and ACTIVE_MASTER_KEY_ID into .env -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 migrate -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 create-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets migrate +docker run --rm --network secrets-net --env-file .env allisson/secrets create-kek --algorithm aes-gcm docker run --rm --name secrets-api --network secrets-net --env-file .env -p 8080:8080 \ - allisson/secrets:v0.7.0 server + allisson/secrets server ``` ## 1) Pull the image ```bash -docker pull allisson/secrets:v0.7.0 +docker pull allisson/secrets ``` ## 2) Start PostgreSQL @@ -64,7 +63,7 @@ docker run -d --name secrets-postgres --network secrets-net \ ## 3) Generate a master key ```bash -docker run --rm allisson/secrets:v0.7.0 create-master-key --id default +docker run --rm allisson/secrets create-master-key --id default ``` Copy the generated values into a local `.env` file. 
@@ -103,15 +102,15 @@ EOF ## 5) Run migrations and bootstrap KEK ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 migrate -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 create-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets migrate +docker run --rm --network secrets-net --env-file .env allisson/secrets create-kek --algorithm aes-gcm ``` ## 6) Start the API server ```bash docker run --rm --name secrets-api --network secrets-net --env-file .env -p 8080:8080 \ - allisson/secrets:v0.7.0 server + allisson/secrets server ``` ## 7) Verify @@ -131,7 +130,7 @@ Expected: Use the CLI command to create your first API client and policy set: ```bash -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 create-client \ +docker run --rm --network secrets-net --env-file .env allisson/secrets create-client \ --name bootstrap-admin \ --active \ --policies '[{"path":"*","capabilities":["read","write","delete","encrypt","decrypt","rotate"]}]' \ @@ -168,5 +167,5 @@ For a full end-to-end check, run `docs/getting-started/smoke-test.sh` (usage in - [Local development](local-development.md) - [Smoke test](smoke-test.md) - [Troubleshooting](troubleshooting.md) -- [Environment variables](../configuration/environment-variables.md) -- [CLI commands reference](../cli/commands.md) +- [Environment variables](../configuration.md) +- [CLI commands reference](../cli-commands.md) diff --git a/docs/getting-started/local-development.md b/docs/getting-started/local-development.md index b7571a2..3d3fd78 100644 --- a/docs/getting-started/local-development.md +++ b/docs/getting-started/local-development.md @@ -4,7 +4,7 @@ Use this path if you want to modify the source code and run from your workstation. -**⚠️ Security Warning:** This guide is for **development and testing only**. 
For production deployments, see [Security Hardening Guide](../operations/security-hardening.md) and [Production Deployment Guide](../operations/production.md). +**⚠️ Security Warning:** This guide is for **development and testing only**. For production deployments, see [Security Hardening Guide](../operations/security/hardening.md) and [Production Deployment Guide](../operations/deployment/production.md). ## Current Security Defaults @@ -13,9 +13,9 @@ Use this path if you want to modify the source code and run from your workstatio - `RATE_LIMIT_TOKEN_ENABLED` default is `true` (per IP on `POST /v1/token`) - `CORS_ENABLED` default is `false` -These defaults were introduced in `v0.5.0` and now include token-endpoint rate limiting in `v0.7.0`. +These defaults were introduced in `v0.5.0`, with token-endpoint rate limiting added in `v0.7.0` (current: `v0.8.0`). -If upgrading from `v0.6.0`, review [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md). +If upgrading from `v0.6.0`, review [v0.7.0 upgrade guide](../releases/RELEASES.md#070---2026-02-20). 
## Prerequisites @@ -110,5 +110,5 @@ curl http://localhost:8080/health - [Docker getting started](docker.md) - [Smoke test](smoke-test.md) - [Troubleshooting](troubleshooting.md) -- [Testing guide](../development/testing.md) -- [CLI commands reference](../cli/commands.md) +- [Development and testing](../contributing.md#development-and-testing) +- [CLI commands reference](../cli-commands.md) diff --git a/docs/getting-started/smoke-test.md b/docs/getting-started/smoke-test.md index 4a098a2..eec2653 100644 --- a/docs/getting-started/smoke-test.md +++ b/docs/getting-started/smoke-test.md @@ -83,5 +83,5 @@ Expected result under throttling: - [Docker getting started](docker.md) - [Local development](local-development.md) - [Troubleshooting](troubleshooting.md) -- [v0.7.0 release notes](../releases/v0.7.0.md) +- [Release notes](../releases/RELEASES.md) - [Curl examples](../examples/curl.md) diff --git a/docs/getting-started/troubleshooting.md b/docs/getting-started/troubleshooting.md index b5703a8..f998bc6 100644 --- a/docs/getting-started/troubleshooting.md +++ b/docs/getting-started/troubleshooting.md @@ -125,7 +125,7 @@ Trusted proxy checks for token endpoint (`POST /v1/token`): - If many callers suddenly look like one IP, verify proxy forwarding and trusted proxy settings - If `X-Forwarded-For` is accepted from untrusted sources, IP spoofing can bypass intended per-IP controls - Compare application logs (`client_ip`) with edge proxy logs to confirm real source-IP propagation -- Use [Trusted proxy reference](../operations/trusted-proxy-reference.md) for a platform checklist +- Use [Trusted proxy reference](../operations/security/hardening.md#trusted-proxy-configuration) for a platform checklist Quick note: @@ -256,28 +256,27 @@ Expected patterns: - verify `KMS_KEY_URI` points to the key used to encrypt `MASTER_KEYS` - confirm KMS IAM/policy includes decrypt permissions - rotate/regenerate master key entries if ciphertext was truncated or malformed - - use provider 
setup checks in [KMS setup guide](../operations/kms-setup.md) + - use provider setup checks in [KMS setup guide](../operations/kms/setup.md) ## Master key load regression triage (historical v0.5.1 fix) Historical note: - This section is retained for mixed-version or rollback investigations involving pre-`v0.5.1` builds. -- For current rollouts, prioritize KMS mode diagnostics and the `v0.7.0` upgrade path. +- For current rollouts, prioritize KMS mode diagnostics and recent upgrade paths. - Symptom: startup succeeds, but key-dependent operations fail unexpectedly after a recent rollout - Likely cause: running a pre-`v0.5.1` build where decoded master key buffers could be zeroed too early - Mixed-version rollout symptom: some requests pass while others fail if old and new images are serving traffic together - Version fingerprint checks: - local binary: `./bin/app --version` - - pinned image check: `docker run --rm allisson/secrets:v0.7.0 --version` + - image version check: `docker run --rm allisson/secrets --version` - running containers: `docker ps --format 'table {{.Names}}\t{{.Image}}'` - Fix: - - upgrade all instances to `v0.7.0` (or at minimum `v0.5.1+`) + - upgrade all instances to the latest version (`v0.8.0`, or at minimum `v0.5.1+`) - restart API instances after deploy - run key-dependent smoke checks (token issuance, secrets write/read, transit round-trip) - - review [v0.5.1 release notes](../releases/v0.5.1.md) and - [v0.5.1 upgrade guide](../releases/v0.5.1-upgrade.md) + - review [v0.5.1 release notes](../releases/RELEASES.md#051---2026-02-19) ## Missing KEK @@ -302,7 +301,7 @@ Historical note: - Symptom: tokenization endpoints return `404`/`500` after upgrading to `v0.4.x` - Likely cause: tokenization migration (`000002_add_tokenization`) not applied or partially applied - Fix: - - run `./bin/app migrate` (or Docker `... 
allisson/secrets migrate`) - verify migration logs indicate `000002_add_tokenization` applied for your DB - confirm initial KEK exists (`create-kek` if missing) - re-run smoke flow for tokenization (`tokenize -> detokenize -> validate -> revoke`) @@ -353,6 +352,6 @@ Q: Why is wildcard `*` risky for normal service clients? - [Smoke test](smoke-test.md) - [Docker getting started](docker.md) - [Local development](local-development.md) -- [Operator runbook index](../operations/runbook-index.md) -- [Production operations](../operations/production.md) -- [Trusted proxy reference](../operations/trusted-proxy-reference.md) +- [Operator runbook index](../operations/runbooks/README.md) +- [Production operations](../operations/deployment/production.md) +- [Trusted proxy reference](../operations/security/hardening.md#trusted-proxy-configuration) diff --git a/docs/metadata.json b/docs/metadata.json index 1b3bf6c..5022256 100644 --- a/docs/metadata.json +++ b/docs/metadata.json @@ -1,5 +1,5 @@ { - "current_release": "v0.7.0", + "current_release": "v0.8.0", "api_version": "v1", "last_docs_refresh": "2026-02-20" } diff --git a/docs/operations/dashboards/README.md b/docs/operations/dashboards/README.md deleted file mode 100644 index 0eb8734..0000000 --- a/docs/operations/dashboards/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# πŸ“ˆ Dashboard Artifacts - -> Last updated: 2026-02-19 - -This directory contains starter Grafana dashboard JSON artifacts for local bootstrap. - -## Artifacts - -- `secrets-overview.json`: baseline request/error/latency view -- `secrets-rate-limiting.json`: `429` behavior and throttle pressure view - -## Import - -1. Open Grafana -2. Go to Dashboards -> Import -3. Upload one of the JSON files from this directory -4. 
Select your Prometheus datasource - -## Notes - -- Treat these dashboards as starter templates -- Adjust panel thresholds and time windows for your traffic profile diff --git a/docs/operations/production-rollout.md b/docs/operations/deployment/production-rollout.md similarity index 82% rename from docs/operations/production-rollout.md rename to docs/operations/deployment/production-rollout.md index da17340..1c6156e 100644 --- a/docs/operations/production-rollout.md +++ b/docs/operations/deployment/production-rollout.md @@ -6,7 +6,7 @@ Use this runbook for a standard production rollout with verification and rollbac ## Scope -- Deploy target: Secrets `v0.7.0` +- Deploy target: Secrets (latest) - Database schema changes: run migrations before traffic cutover - Crypto bootstrap: ensure initial KEK exists for write/encrypt flows @@ -25,17 +25,17 @@ Use this runbook for a standard production rollout with verification and rollbac ```bash # 1) Pull target release -docker pull allisson/secrets:v0.7.0 +docker pull allisson/secrets # 2) Run migrations -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 migrate +docker run --rm --network secrets-net --env-file .env allisson/secrets migrate # 3) Bootstrap KEK only for first-time environment setup -docker run --rm --network secrets-net --env-file .env allisson/secrets:v0.7.0 create-kek --algorithm aes-gcm +docker run --rm --network secrets-net --env-file .env allisson/secrets create-kek --algorithm aes-gcm # 4) Start API docker run --rm --name secrets-api --network secrets-net --env-file .env -p 8080:8080 \ - allisson/secrets:v0.7.0 server + allisson/secrets server ``` ## Verification Gates @@ -82,9 +82,8 @@ Gate C (policy and observability): ## See also -- [Production deployment guide](production.md) -- [v0.7.0 release notes](../releases/v0.7.0.md) -- [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) -- [KMS migration checklist](kms-migration-checklist.md) -- [Release compatibility 
matrix](../releases/compatibility-matrix.md) -- [Smoke test guide](../getting-started/smoke-test.md) +- [Production deployment guide](../deployment/production.md) +- [Release notes](../../releases/RELEASES.md) +- [KMS migration checklist](../kms/setup.md#migration-checklist) +- [Release compatibility matrix](../../releases/compatibility-matrix.md) +- [Smoke test guide](../../getting-started/smoke-test.md) diff --git a/docs/operations/production.md b/docs/operations/deployment/production.md similarity index 75% rename from docs/operations/production.md rename to docs/operations/deployment/production.md index 7ef4d0a..ece72e4 100644 --- a/docs/operations/production.md +++ b/docs/operations/deployment/production.md @@ -4,7 +4,7 @@ This guide covers baseline production hardening and operations for Secrets. -**For comprehensive security hardening, see [Security Hardening Guide](security-hardening.md).** +**For comprehensive security hardening, see [Security Hardening Guide](../security/hardening.md).** ## πŸ“‘ Table of Contents @@ -167,10 +167,10 @@ Adjust retention to match your compliance and incident-response requirements. ## 9) Golden Path Rollout (Recommended) -- Follow [Production rollout golden path](production-rollout.md) for step-by-step deployment, +- Follow [Production rollout golden path](../deployment/production-rollout.md) for step-by-step deployment, verification gates, and rollback triggers -- Use [Release compatibility matrix](../releases/compatibility-matrix.md) before planning upgrades -- Keep [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) attached to rollout change tickets +- Use [Release compatibility matrix](../../releases/compatibility-matrix.md) before planning upgrades +- Review [Release notes](../../releases/RELEASES.md) before upgrades ## 10) Token Endpoint Throttling Runbook @@ -182,7 +182,7 @@ Triage steps: - verify `429` responses include `Retry-After` - verify issue is concentrated on `/v1/token` or system-wide 2. 
Identify caller pattern: - - check `429` by route and token endpoint ratio in [Monitoring](monitoring.md) + - check `429` by route and token endpoint ratio in [Monitoring](../observability/monitoring.md) - confirm whether affected clients share NAT/proxy egress IPs 3. Validate real client-IP handling: - ensure reverse proxy forwards client IP headers correctly @@ -197,7 +197,7 @@ Triage steps: Trusted proxy guidance: - Validate forwarded-header trust and source-IP propagation using - [Trusted proxy reference](trusted-proxy-reference.md) + [Trusted proxy reference](../security/hardening.md#trusted-proxy-configuration) Rollback of temporary tuning: @@ -206,20 +206,48 @@ Rollback of temporary tuning: 3. Re-check `/v1/token` `429` ratio and token issuance success ratio 4. Keep incident notes with final thresholds for future baseline reviews +## Known Limitations and Tradeoffs + +This section documents practical limitations and tradeoffs operators should account for. + +### Rate limiting + +- Token endpoint rate limiting is per-IP; shared NAT/proxy egress can impact legitimate callers +- Header trust/proxy misconfiguration can skew caller IP behavior +- Application-level throttling complements but does not replace edge/WAF controls + +### Proxy and source-IP trust + +- If forwarded headers are over-trusted, source IP spoofing risk increases +- If trusted proxy chain is incomplete, all traffic may appear from one source + +### KMS startup model + +- KMS decryption happens at startup key-load time, not per-request +- Runtime KMS outages may not impact steady-state traffic immediately, but restart/redeploy can fail if KMS is unavailable + +### Operational cadence + +- Key rotation requires API restart/rolling restart to load new key material +- Cleanup routines (`clean-audit-logs`, `clean-expired-tokens`) are operator-driven + +### Documentation scope note + +- `docs/openapi.yaml` is a baseline subset, not exhaustive contract coverage for every workflow detail + ## See 
also -- [Security hardening guide](security-hardening.md) -- [Key management operations](key-management.md) -- [Production rollout golden path](production-rollout.md) -- [Operator runbook index](runbook-index.md) -- [Monitoring](monitoring.md) -- [Trusted proxy reference](trusted-proxy-reference.md) -- [Operator drills (quarterly)](operator-drills.md) -- [Policy smoke tests](policy-smoke-tests.md) -- [v0.7.0 release notes](../releases/v0.7.0.md) -- [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) -- [KMS migration checklist](kms-migration-checklist.md) -- [Release compatibility matrix](../releases/compatibility-matrix.md) -- [Environment variables](../configuration/environment-variables.md) -- [Security model](../concepts/security-model.md) -- [Troubleshooting](../getting-started/troubleshooting.md) +- [Security hardening guide](../security/hardening.md) +- [Key management operations](../kms/key-management.md) +- [Production rollout golden path](../deployment/production-rollout.md) +- [Operator runbook index](../runbooks/README.md) +- [Monitoring](../observability/monitoring.md) +- [Trusted proxy reference](../security/hardening.md#trusted-proxy-configuration) +- [Operator drills (quarterly)](../runbooks/README.md#operator-drills-quarterly) +- [Policy smoke tests](../runbooks/policy-smoke-tests.md) +- [Release notes](../../releases/RELEASES.md) +- [KMS migration checklist](../kms/setup.md#migration-checklist) +- [Release compatibility matrix](../../releases/compatibility-matrix.md) +- [Environment variables](../../configuration.md) +- [Security model](../../concepts/security-model.md) +- [Troubleshooting](../../getting-started/troubleshooting.md) diff --git a/docs/operations/failure-playbooks.md b/docs/operations/failure-playbooks.md deleted file mode 100644 index 297d5f0..0000000 --- a/docs/operations/failure-playbooks.md +++ /dev/null @@ -1,98 +0,0 @@ -# πŸš‘ Failure Playbooks - -> Last updated: 2026-02-20 - -Use this page for fast incident triage on common 
API failures. - -## 401 Spike (Unauthorized) - -Symptoms: - -- sudden increase in `401` across multiple endpoints - -Triage steps: - -1. Verify token issuance with `POST /v1/token` -2. Confirm callers send `Authorization: Bearer ` -3. Check token expiry and client active state -4. Inspect audit logs for broad denied patterns - -## 403 Spike (Policy/Capability Mismatch) - -Symptoms: - -- valid tokens but access denied with `403` - -Triage steps: - -1. Identify failing endpoint path and required capability -2. Confirm client policy path matching (`*`, exact, trailing `/*`, and mid-path `*` segment rules) -3. Validate capability mapping for endpoint (`read`, `write`, `delete`, `encrypt`, `decrypt`, `rotate`) -4. Re-issue token after policy update - -## 409 on Transit Key Create - -Symptoms: - -- `POST /v1/transit/keys` returns `409 Conflict` - -Triage steps: - -1. Treat conflict as "key already initialized" -2. Call `POST /v1/transit/keys/:name/rotate` to create a new active version -3. Confirm encrypt/decrypt still work after rotation -4. Update automation to avoid repeated create for existing names - -## 404/422 on Tokenization Detokenize - -Symptoms: - -- `POST /v1/tokenization/detokenize` returns `404 Not Found` or `422 Unprocessable Entity` - -Triage steps: - -1. Confirm token was produced by `POST /v1/tokenization/keys/:name/tokenize` -2. Confirm request shape uses JSON body `{"token":"..."}` (not URL path token) -3. Check if token is expired (`ttl`) or revoked -4. Validate caller has `decrypt` capability on `/v1/tokenization/detokenize` -5. If expired tokens accumulate, run cleanup routine (`clean-expired-tokens`) - -## 409 on Tokenization Key Create - -Symptoms: - -- `POST /v1/tokenization/keys` returns `409 Conflict` - -Triage steps: - -1. Treat conflict as "key already initialized" -2. Call `POST /v1/tokenization/keys/:name/rotate` for a new active version -3. Confirm tokenize/detokenize paths remain healthy after rotation -4. 
Update automation to avoid repeated create for existing names - -## Quick Commands - -```bash -# Health -curl -s http://localhost:8080/health - -# Token check -curl -i -X POST http://localhost:8080/v1/token \ - -H "Content-Type: application/json" \ - -d '{"client_id":"","client_secret":""}' - -# Audit logs snapshot -curl -s "http://localhost:8080/v1/audit-logs?limit=50&offset=0" \ - -H "Authorization: Bearer " -``` - -## See also - -- [Incident decision tree](incident-decision-tree.md) -- [First 15 Minutes Playbook](first-15-minutes.md) -- [Troubleshooting](../getting-started/troubleshooting.md) -- [Policies cookbook](../api/policies.md) -- [Policy smoke tests](policy-smoke-tests.md) -- [Transit API](../api/transit.md) -- [Tokenization API](../api/tokenization.md) -- [Production operations](production.md) diff --git a/docs/operations/first-15-minutes.md b/docs/operations/first-15-minutes.md deleted file mode 100644 index 1fff7aa..0000000 --- a/docs/operations/first-15-minutes.md +++ /dev/null @@ -1,65 +0,0 @@ -# ⏱️ First 15 Minutes Playbook - -> Last updated: 2026-02-20 - -Use this for high-severity incidents where API availability or auth flows are degraded. 
- -## Minute 0-3: Establish Service State - -```bash -curl -i http://localhost:8080/health -curl -i http://localhost:8080/ready -``` - -Expected: - -- `GET /health` -> `200` -- `GET /ready` -> `200` - -## Minute 3-6: Validate Authentication Path - -```bash -curl -i -X POST http://localhost:8080/v1/token \ - -H "Content-Type: application/json" \ - -d '{"client_id":"","client_secret":""}' -``` - -Expected: - -- Normal flow -> `201 Created` -- If throttled -> `429` with `Retry-After` - -## Minute 6-10: Validate Crypto Data Path - -```bash -TOKEN="" - -curl -i -X POST http://localhost:8080/v1/secrets/incident/check \ - -H "Authorization: Bearer ${TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{"value":"aW5jaWRlbnQtY2hlY2s="}' - -curl -i -X GET http://localhost:8080/v1/secrets/incident/check \ - -H "Authorization: Bearer ${TOKEN}" -``` - -Expected: - -- write/read path succeeds - -## Minute 10-15: Decide Mitigation Path - -1. `401`-heavy: credential/token issue -> [Failure playbooks](failure-playbooks.md) -2. `403`-heavy: policy mismatch -> [Policy smoke tests](policy-smoke-tests.md) -3. `429` on `/v1/token`: IP throttling/proxy path -> [Token throttling runbook](production.md#10-token-endpoint-throttling-runbook) -4. `5xx`/readiness failures: dependency/runtime path -> [Production rollout rollback triggers](production-rollout.md#rollback-trigger-conditions) - -## Command status markers - -> Command status: verified on 2026-02-20 - -## See also - -- [Incident decision tree](incident-decision-tree.md) -- [Production rollout golden path](production-rollout.md) -- [Troubleshooting](../getting-started/troubleshooting.md) diff --git a/docs/operations/incident-decision-tree.md b/docs/operations/incident-decision-tree.md deleted file mode 100644 index 5d346b7..0000000 --- a/docs/operations/incident-decision-tree.md +++ /dev/null @@ -1,62 +0,0 @@ -# 🌲 Incident Decision Tree - -> Last updated: 2026-02-20 - -Use this page to route incidents quickly to the right runbook. 
- -## Start - -1. Is `GET /health` failing? - - Yes -> infrastructure/runtime path: [First 15 Minutes Playbook](first-15-minutes.md) - - No -> continue -2. Is `GET /ready` failing? - - Yes -> dependencies/migrations/key-load path: [Troubleshooting](../getting-started/troubleshooting.md) - - No -> continue -3. Identify dominant status code and route group: - - `401` -> [Failure playbooks: 401](failure-playbooks.md#401-spike-unauthorized) - - `403` -> [Failure playbooks: 403](failure-playbooks.md#403-spike-policycapability-mismatch) - - `429` on `/v1/token` -> [Token throttling runbook](production.md#10-token-endpoint-throttling-runbook) - - `429` on authenticated routes -> [API rate limiting](../api/rate-limiting.md) - - `422` -> [API error decision matrix](../api/error-decision-matrix.md) - - `5xx` -> [First 15 Minutes Playbook](first-15-minutes.md) - -## Fast Branches - -### `401 Unauthorized` - -- Re-issue token via `POST /v1/token` -- Confirm caller sends `Authorization: Bearer ` -- Check client active status and secret rotation history - -### `403 Forbidden` - -- Verify endpoint path shape and required capability -- Verify policy matching semantics (`*`, trailing `/*`, mid-path `*`) -- Re-issue token after policy fix - -### `429 Too Many Requests` - -- Read `Retry-After` header -- Separate `/v1/token` from authenticated-route throttling -- Validate proxy/source-IP behavior if `/v1/token` is impacted - -### `5xx` - -- Check database connectivity and pool saturation -- Check migration and key-load startup logs -- Use rollback triggers in production rollout runbook - -## Search Aliases - -- `retry-after` -- `rate limit exceeded` -- `token endpoint throttling` -- `unauthorized spike` -- `forbidden policy mismatch` - -## See also - -- [First 15 Minutes Playbook](first-15-minutes.md) -- [Failure playbooks](failure-playbooks.md) -- [Troubleshooting](../getting-started/troubleshooting.md) -- [Operator quick card](operator-quick-card.md) diff --git 
a/docs/operations/kms-migration-checklist.md b/docs/operations/kms-migration-checklist.md deleted file mode 100644 index 5f91534..0000000 --- a/docs/operations/kms-migration-checklist.md +++ /dev/null @@ -1,55 +0,0 @@ -# ✅ KMS Migration Checklist - -> Last updated: 2026-02-20 - -Use this checklist for migrating from legacy plaintext master keys to KMS mode. - -## 1) Precheck - -- [ ] Confirm target release is `v0.7.0` or newer -- [ ] Back up current environment configuration -- [ ] Confirm rollback owner and change window -- [ ] Confirm KMS provider credentials are available in runtime -- [ ] Confirm KMS encrypt/decrypt permissions are granted - -## 2) Build KMS key chain - -- [ ] Generate new KMS-encrypted key with `create-master-key --kms-provider ... --kms-key-uri ...` -- [ ] Re-encode existing legacy plaintext keys into KMS ciphertext -- [ ] Build `MASTER_KEYS` with only KMS ciphertext entries (no plaintext mix) -- [ ] Set `KMS_PROVIDER`, `KMS_KEY_URI`, and `ACTIVE_MASTER_KEY_ID` - -Reference: [KMS setup guide](kms-setup.md#migration-from-legacy-mode) - -## 3) Rollout - -- [ ] Restart API instances (rolling) -- [ ] Verify startup logs show KMS mode and key decrypt lines -- [ ] Run baseline checks: `GET /health`, `GET /ready` -- [ ] Run key-dependent smoke checks: token issuance, secrets, transit - -Reference: [Production rollout golden path](production-rollout.md) - -## 4) Rotation and cleanup - -- [ ] Rotate KEK after switching to KMS key chain -- [ ] Verify reads/decrypt for existing data still succeed -- [ ] Remove old key entries from `MASTER_KEYS` only after verification -- [ ] Restart API instances again after key-chain cleanup - -Reference: [Key management operations](key-management.md) - -## 5) Rollback readiness - -- [ ] Keep previous image tag available -- [ ] Keep pre-change env snapshot available -- [ ] If rollback needed, revert app version first -- [ ] Re-validate health and smoke checks after rollback - -Reference: [v0.7.0 upgrade
guide](../releases/v0.7.0-upgrade.md#rollback-notes) - -## See also - -- [KMS setup guide](kms-setup.md) -- [Key management operations](key-management.md) -- [Troubleshooting](../getting-started/troubleshooting.md) diff --git a/docs/operations/key-management.md b/docs/operations/kms/key-management.md similarity index 89% rename from docs/operations/key-management.md rename to docs/operations/kms/key-management.md index 863e648..53f25f8 100644 --- a/docs/operations/key-management.md +++ b/docs/operations/kms/key-management.md @@ -20,7 +20,7 @@ Generate: Docker image equivalent: ```bash -docker run --rm allisson/secrets:v0.7.0 create-master-key --id prod-2026-01 +docker run --rm allisson/secrets create-master-key --id prod-2026-01 ``` Rotate master key: @@ -134,12 +134,12 @@ create key -> 409 Conflict: rotate key ## Related -- 🏭 Production deployment: `docs/operations/production.md` +- 🏭 Production deployment: `docs/operations/deployment/production.md` ## See also -- [Production operations](production.md) -- [Security model](../concepts/security-model.md) -- [Transit API](../api/transit.md) -- [Environment variables](../configuration/environment-variables.md) -- [KMS setup guide](kms-setup.md) +- [Production operations](../deployment/production.md) +- [Security model](../../concepts/security-model.md) +- [Transit API](../../api/data/transit.md) +- [Environment variables](../../configuration.md) +- [KMS setup guide](setup.md) diff --git a/docs/operations/kms-setup.md b/docs/operations/kms/setup.md similarity index 92% rename from docs/operations/kms-setup.md rename to docs/operations/kms/setup.md index 9250e00..c21944c 100644 --- a/docs/operations/kms-setup.md +++ b/docs/operations/kms/setup.md @@ -514,7 +514,7 @@ Docker Compose example: ```yaml services: secrets-api: - image: allisson/secrets:v0.7.0 + image: allisson/secrets env_file: - .env environment: @@ -536,7 +536,7 @@ spec: spec: containers: - name: app - image: allisson/secrets:v0.7.0 + image: allisson/secrets 
env: - name: KMS_PROVIDER value: gcpkms @@ -699,6 +699,52 @@ MASTER_KEYS=master-key-kms-2026: ACTIVE_MASTER_KEY_ID=master-key-kms-2026 ``` +### Migration Checklist + +Use this checklist for migrating from legacy plaintext master keys to KMS mode. + +#### 1) Precheck + +- [ ] Confirm target release is v0.8.0 or newer +- [ ] Back up current environment configuration +- [ ] Confirm rollback owner and change window +- [ ] Confirm KMS provider credentials are available in runtime +- [ ] Confirm KMS encrypt/decrypt permissions are granted + +#### 2) Build KMS key chain + +- [ ] Generate new KMS-encrypted key with `create-master-key --kms-provider ... --kms-key-uri ...` +- [ ] Re-encode existing legacy plaintext keys into KMS ciphertext +- [ ] Build `MASTER_KEYS` with only KMS ciphertext entries (no plaintext mix) +- [ ] Set `KMS_PROVIDER`, `KMS_KEY_URI`, and `ACTIVE_MASTER_KEY_ID` + +#### 3) Rollout + +- [ ] Restart API instances (rolling) +- [ ] Verify startup logs show KMS mode and key decrypt lines +- [ ] Run baseline checks: `GET /health`, `GET /ready` +- [ ] Run key-dependent smoke checks: token issuance, secrets, transit + +Reference: [Production rollout golden path](../deployment/production-rollout.md) + +#### 4) Rotation and cleanup + +- [ ] Rotate KEK after switching to KMS key chain +- [ ] Verify reads/decrypt for existing data still succeed +- [ ] Remove old key entries from `MASTER_KEYS` only after verification +- [ ] Restart API instances again after key-chain cleanup + +Reference: [Key management operations](../kms/key-management.md) + +#### 5) Rollback readiness + +- [ ] Keep previous image tag available +- [ ] Keep pre-change env snapshot available +- [ ] If rollback needed, revert app version first +- [ ] Re-validate health and smoke checks after rollback + +Reference: [Release notes](../../releases/RELEASES.md#070---2026-02-20) + ## Key Rotation Rotate master keys regularly (recommended: every 90-180 days). 
@@ -809,6 +855,6 @@ DEBUG master key decrypted key_id=master-key-2026-02-19 ciphertext_length=64 ## See Also -- [Key Management Guide](key-management.md) - KEK and DEK rotation procedures -- [Security Hardening](security-hardening.md) - Production security best practices -- [Production Deployment](production.md) - Production deployment checklist +- [Key Management Guide](../kms/key-management.md) - KEK and DEK rotation procedures +- [Security Hardening](../security/hardening.md) - Production security best practices +- [Production Deployment](../deployment/production.md) - Production deployment checklist diff --git a/docs/operations/known-limitations.md b/docs/operations/known-limitations.md deleted file mode 100644 index 746c965..0000000 --- a/docs/operations/known-limitations.md +++ /dev/null @@ -1,37 +0,0 @@ -# ⚠️ Known Limitations - -> Last updated: 2026-02-20 - -This page documents practical limitations and tradeoffs operators should account for. - -## Rate limiting - -- Token endpoint rate limiting is per-IP; shared NAT/proxy egress can impact legitimate callers -- Header trust/proxy misconfiguration can skew caller IP behavior -- Application-level throttling complements but does not replace edge/WAF controls - -## Proxy and source-IP trust - -- If forwarded headers are over-trusted, source IP spoofing risk increases -- If trusted proxy chain is incomplete, all traffic may appear from one source - -## KMS startup model - -- KMS decryption happens at startup key-load time, not per-request -- Runtime KMS outages may not impact steady-state traffic immediately, but restart/redeploy can fail if KMS is unavailable - -## Operational cadence - -- Key rotation requires API restart/rolling restart to load new key material -- Cleanup routines (`clean-audit-logs`, `clean-expired-tokens`) are operator-driven - -## Documentation scope note - -- `docs/openapi.yaml` is a baseline subset, not exhaustive contract coverage for every workflow detail - -## See also - -- [Trusted 
proxy reference](trusted-proxy-reference.md) -- [Security hardening guide](security-hardening.md) -- [KMS setup guide](kms-setup.md) -- [Production deployment guide](production.md) diff --git a/docs/operations/observability/incident-response.md b/docs/operations/observability/incident-response.md new file mode 100644 index 0000000..d4a7cc2 --- /dev/null +++ b/docs/operations/observability/incident-response.md @@ -0,0 +1,236 @@ +# 🚨 Incident Response Guide + +> Last updated: 2026-02-20 + +This guide provides fast incident triage workflows, decision paths, and failure playbooks for common API issues. + +## 📑 Quick Navigation + +- [Quick Start: First 15 Minutes](#quick-start-first-15-minutes) - High-severity incident triage +- [Incident Decision Tree](#incident-decision-tree) - Fast status code branching +- [Failure Playbooks](#failure-playbooks) - Detailed remediation for specific errors + +**Fast Branches**: [401](#401-unauthorized) | [403](#403-forbidden) | [429](#429-too-many-requests) | [5xx](#5xx) + +--- + +## Quick Start: First 15 Minutes + +Use this for high-severity incidents where API availability or auth flows are degraded.
+ +### Minute 0-3: Establish Service State + +```bash +curl -i http://localhost:8080/health +curl -i http://localhost:8080/ready +``` + +Expected: + +- `GET /health` -> `200` +- `GET /ready` -> `200` + +### Minute 3-6: Validate Authentication Path + +```bash +curl -i -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"","client_secret":""}' +``` + +Expected: + +- Normal flow -> `201 Created` +- If throttled -> `429` with `Retry-After` + +### Minute 6-10: Validate Crypto Data Path + +```bash +TOKEN="" + +curl -i -X POST http://localhost:8080/v1/secrets/incident/check \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"value":"aW5jaWRlbnQtY2hlY2s="}' + +curl -i -X GET http://localhost:8080/v1/secrets/incident/check \ + -H "Authorization: Bearer ${TOKEN}" +``` + +Expected: + +- write/read path succeeds + +### Minute 10-15: Decide Mitigation Path + +1. `401`-heavy: credential/token issue → [401 Spike Playbook](#401-spike-unauthorized) +2. `403`-heavy: policy mismatch → [403 Spike Playbook](#403-spike-policycapability-mismatch) and [Policy smoke tests](../runbooks/policy-smoke-tests.md) +3. `429` on `/v1/token`: IP throttling/proxy path → [Token throttling runbook](../deployment/production.md#10-token-endpoint-throttling-runbook) +4. `5xx`/readiness failures: dependency/runtime path → [Production rollout rollback triggers](../deployment/production-rollout.md#rollback-trigger-conditions) + +--- + +## Incident Decision Tree + +Use this to route incidents quickly to the right runbook. + +### Decision Flow + +1. Is `GET /health` failing? + - Yes → infrastructure/runtime path: Follow [First 15 Minutes](#quick-start-first-15-minutes) above + - No → continue +2. Is `GET /ready` failing? + - Yes → dependencies/migrations/key-load path: [Troubleshooting](../../getting-started/troubleshooting.md) + - No → continue +3.
Identify dominant status code and route group: + - `401` → [401 Spike Playbook](#401-spike-unauthorized) + - `403` → [403 Spike Playbook](#403-spike-policycapability-mismatch) + - `429` on `/v1/token` → [Token throttling runbook](../deployment/production.md#10-token-endpoint-throttling-runbook) + - `429` on authenticated routes → [API rate limiting](../../api/fundamentals.md#rate-limiting) + - `422` → [API error decision matrix](../../api/fundamentals.md#error-decision-matrix) + - `5xx` → [First 15 Minutes](#quick-start-first-15-minutes) + +### Fast Branches + +#### `401 Unauthorized` + +- Re-issue token via `POST /v1/token` +- Confirm caller sends `Authorization: Bearer ` +- Check client active status and secret rotation history + +#### `403 Forbidden` + +- Verify endpoint path shape and required capability +- Verify policy matching semantics (`*`, trailing `/*`, mid-path `*`) +- Re-issue token after policy fix + +#### `429 Too Many Requests` + +- Read `Retry-After` header +- Separate `/v1/token` from authenticated-route throttling +- Validate proxy/source-IP behavior if `/v1/token` is impacted + +#### `5xx` + +- Check database connectivity and pool saturation +- Check migration and key-load startup logs +- Use rollback triggers in production rollout runbook + +### Search Aliases + +- `retry-after` +- `rate limit exceeded` +- `token endpoint throttling` +- `unauthorized spike` +- `forbidden policy mismatch` + +--- + +## Failure Playbooks + +Use these for fast incident triage on common API failures. + +### 401 Spike (Unauthorized) + +**Symptoms:** + +- Sudden increase in `401` across multiple endpoints + +**Triage steps:** + +1. Verify token issuance with `POST /v1/token` +2. Confirm callers send `Authorization: Bearer ` +3. Check token expiry and client active state +4. Inspect audit logs for broad denied patterns + +### 403 Spike (Policy/Capability Mismatch) + +**Symptoms:** + +- Valid tokens but access denied with `403` + +**Triage steps:** + +1.
Identify failing endpoint path and required capability +2. Confirm client policy path matching (`*`, exact, trailing `/*`, and mid-path `*` segment rules) +3. Validate capability mapping for endpoint (`read`, `write`, `delete`, `encrypt`, `decrypt`, `rotate`) +4. Re-issue token after policy update + +### 409 on Transit Key Create + +**Symptoms:** + +- `POST /v1/transit/keys` returns `409 Conflict` + +**Triage steps:** + +1. Treat conflict as "key already initialized" +2. Call `POST /v1/transit/keys/:name/rotate` to create a new active version +3. Confirm encrypt/decrypt still work after rotation +4. Update automation to avoid repeated create for existing names + +### 404/422 on Tokenization Detokenize + +**Symptoms:** + +- `POST /v1/tokenization/detokenize` returns `404 Not Found` or `422 Unprocessable Entity` + +**Triage steps:** + +1. Confirm token was produced by `POST /v1/tokenization/keys/:name/tokenize` +2. Confirm request shape uses JSON body `{"token":"..."}` (not URL path token) +3. Check if token is expired (`ttl`) or revoked +4. Validate caller has `decrypt` capability on `/v1/tokenization/detokenize` +5. If expired tokens accumulate, run cleanup routine (`clean-expired-tokens`) + +### 409 on Tokenization Key Create + +**Symptoms:** + +- `POST /v1/tokenization/keys` returns `409 Conflict` + +**Triage steps:** + +1. Treat conflict as "key already initialized" +2. Call `POST /v1/tokenization/keys/:name/rotate` for a new active version +3. Confirm tokenize/detokenize paths remain healthy after rotation +4. 
Update automation to avoid repeated create for existing names + +--- + +## Quick Commands + +```bash +# Health +curl -s http://localhost:8080/health + +# Token check +curl -i -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"","client_secret":""}' + +# Audit logs snapshot +curl -s "http://localhost:8080/v1/audit-logs?limit=50&offset=0" \ + -H "Authorization: Bearer " +``` + +--- + +## Command Status Markers + +> Command status: verified on 2026-02-20 + +--- + +## See also + +- [Production rollout golden path](../deployment/production-rollout.md) +- [Troubleshooting](../../getting-started/troubleshooting.md) +- [Operator quick card](../runbooks/README.md#operator-quick-card) +- [Policies cookbook](../../api/auth/policies.md) +- [Policy smoke tests](../runbooks/policy-smoke-tests.md) +- [Transit API](../../api/data/transit.md) +- [Tokenization API](../../api/data/tokenization.md) +- [API rate limiting](../../api/fundamentals.md#rate-limiting) +- [API error decision matrix](../../api/fundamentals.md#error-decision-matrix) +- [Production operations](../deployment/production.md) diff --git a/docs/operations/monitoring.md b/docs/operations/observability/monitoring.md similarity index 91% rename from docs/operations/monitoring.md rename to docs/operations/observability/monitoring.md index cc87f58..1687601 100644 --- a/docs/operations/monitoring.md +++ b/docs/operations/observability/monitoring.md @@ -4,6 +4,22 @@ This document describes the metrics instrumentation and monitoring capabilities in the Secrets application. 
+## Table of Contents + +- [Overview](#overview) +- [Configuration](#configuration) +- [Quickstart (Prometheus + Grafana)](#quickstart-prometheus--grafana) +- [Metrics Endpoint](#metrics-endpoint) +- [Available Metrics](#available-metrics) +- [Business Domains and Operations](#business-domains-and-operations) +- [Prometheus Configuration](#prometheus-configuration) +- [Grafana Dashboard](#grafana-dashboard) +- [Alerting](#alerting) +- [Disabling Metrics](#disabling-metrics) +- [Performance Considerations](#performance-considerations) +- [Troubleshooting](#troubleshooting) +- [See Also](#see-also) + ## Overview The application uses OpenTelemetry for metrics instrumentation with a Prometheus-compatible export endpoint. Metrics can be enabled/disabled via configuration and cover two main areas: @@ -373,11 +389,26 @@ rate(secrets_operations_total{domain="tokenization",operation="cleanup_expired", ## Grafana Dashboard -Starter dashboard artifacts: +### Dashboard Artifacts + +Starter Grafana dashboard JSON artifacts are available for local bootstrap: + +**Artifacts:** + +- `secrets-overview.json` - Baseline request/error/latency view +- `secrets-rate-limiting.json` - `429` behavior and throttle pressure view + +**Import Instructions:** + +1. Open Grafana +2. Go to Dashboards → Import +3. Upload one of the JSON files from `docs/operations/dashboards/` +4.
Select your Prometheus datasource + +**Notes:** -- [Dashboard artifacts index](dashboards/README.md) -- [Secrets overview dashboard JSON](dashboards/secrets-overview.json) -- [Secrets rate-limiting dashboard JSON](dashboards/secrets-rate-limiting.json) +- Treat these dashboards as starter templates +- Adjust panel thresholds and time windows for your traffic profile ### Recommended Panels @@ -551,9 +582,9 @@ When disabled: ## See Also -- [Production Deployment](production.md) -- [Operator drills](operator-drills.md) -- [Failure Playbooks](failure-playbooks.md) -- [API rate limiting](../api/rate-limiting.md) +- [Production Deployment](../deployment/production.md) +- [Operator drills](../runbooks/README.md#operator-drills-quarterly) +- [Incident response guide](../observability/incident-response.md) +- [API rate limiting](../../api/fundamentals.md#rate-limiting) - [OpenTelemetry Documentation](https://opentelemetry.io/docs/) - [Prometheus Documentation](https://prometheus.io/docs/) diff --git a/docs/operations/operator-drills.md b/docs/operations/operator-drills.md deleted file mode 100644 index 69d9c4b..0000000 --- a/docs/operations/operator-drills.md +++ /dev/null @@ -1,45 +0,0 @@ -# 🧯 Operator Drills (Quarterly) - -> Last updated: 2026-02-19 - -Use this page for quarterly game-day exercises that validate operational readiness. 
- -## Drill Catalog - -| Drill | Scenario | Primary runbooks | Evidence to collect | -| --- | --- | --- | --- | -| Credential compromise | Client secret leaked | `production.md`, `key-management.md`, `failure-playbooks.md` | revocation timeline, new client IDs, audit evidence | -| Key rotation under load | KEK/master-key rotation while traffic is active | `key-management.md`, `production-rollout.md` | rotation timestamps, restart logs, smoke checks | -| Traffic surge / throttling | Burst traffic causes `429` pressure | `monitoring.md`, `../api/rate-limiting.md` | `429` ratio, retry behavior, threshold decision | -| Database outage | DB unreachable / failover | `failure-playbooks.md`, `production.md` | outage timeline, failover duration, restore checks | - -## Quarterly Execution Template - -1. Pick one drill owner and one incident commander -2. Define blast radius and rollback boundary -3. Execute drill in staging (or prod shadow) with fixed timebox -4. Capture metrics, logs, and runbook deviations -5. 
Produce remediation actions with owners and due dates - -## Pass Criteria - -- Critical runbooks are executable without undocumented tribal knowledge -- On-call can identify root cause and containment path within target SLA -- Recovery path is validated by health checks and smoke tests -- Postmortem includes at least one docs/process improvement item - -## Evidence Checklist - -- Timeline with UTC timestamps -- Request IDs for key failure and recovery events -- Alert timeline (fired, acknowledged, resolved) -- Commands executed and operator decisions -- Follow-up tickets and target completion dates - -## See also - -- [Production rollout golden path](production-rollout.md) -- [Production deployment guide](production.md) -- [Failure playbooks](failure-playbooks.md) -- [Monitoring](monitoring.md) -- [Troubleshooting](../getting-started/troubleshooting.md) diff --git a/docs/operations/operator-quick-card.md b/docs/operations/operator-quick-card.md deleted file mode 100644 index 25c7d98..0000000 --- a/docs/operations/operator-quick-card.md +++ /dev/null @@ -1,83 +0,0 @@ -# ⚑ Operator Quick Card - -> Last updated: 2026-02-20 - -Use this page during rollout and incidents when you need a fast, minimal checklist. - -## Rollout Preflight (5-minute check) - -1. Confirm target version and image tag match release plan -2. Confirm DB connectivity and migration window -3. Confirm key mode settings (`KMS_PROVIDER` + `KMS_KEY_URI` or legacy mode) -4. Confirm token and route rate-limit settings are intentional -5. Confirm rollback owner and communication channel - -Primary references: - -- [Production rollout golden path](production-rollout.md) -- [Release compatibility matrix](../releases/compatibility-matrix.md) -- [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) - -## Baseline Verification (before traffic cutover) - -1. `GET /health` returns `200` -2. `GET /ready` returns `200` -3. `POST /v1/token` returns `201` -4. Secrets write/read passes -5. 
Transit encrypt/decrypt passes - -Reference: - -- [Smoke test guide](../getting-started/smoke-test.md) - -## Fast Status Triage (`401` / `403` / `429`) - -1. `401`: re-check credentials/token issuance path -2. `403`: verify policy path and capability mapping -3. `429`: check `Retry-After`, then decide per-client vs token-IP tuning path - -References: - -- [API error decision matrix](../api/error-decision-matrix.md) -- [API rate limiting](../api/rate-limiting.md) -- [Monitoring](monitoring.md) - -## Token Endpoint `429` Quick Path - -1. Confirm `429` concentrated on `POST /v1/token` -2. Verify shared NAT/proxy egress is not collapsing many clients to one IP -3. Validate trusted proxy and forwarded header behavior -4. Apply temporary `RATE_LIMIT_TOKEN_*` tuning only if traffic is legitimate -5. Revert temporary tuning after stability - -References: - -- [Production token throttling runbook](production.md#10-token-endpoint-throttling-runbook) -- [Trusted proxy reference](trusted-proxy-reference.md) -- [Troubleshooting](../getting-started/troubleshooting.md) - -## Rollback Triggers - -- Sustained elevated `5xx` -- Widespread token/auth failures -- Unexpected data-integrity behavior -- Failed verification gates after rollout - -Reference: - -- [Rollback procedure](production-rollout.md#rollback-procedure-binaryimage) - -## Incident Notes Minimum - -Capture these before closing: - -- timeline (detection -> mitigation -> recovery) -- affected routes/clients -- config changes applied (`RATE_LIMIT_*`, `RATE_LIMIT_TOKEN_*`, policy updates) -- final mitigation and follow-up owner - -## See also - -- [Operator runbook index](runbook-index.md) -- [Production deployment guide](production.md) -- [Failure playbooks](failure-playbooks.md) diff --git a/docs/operations/runbook-index.md b/docs/operations/runbook-index.md deleted file mode 100644 index 7f44e42..0000000 --- a/docs/operations/runbook-index.md +++ /dev/null @@ -1,57 +0,0 @@ -# 🧭 Operator Runbook Index - -> Last updated: 
2026-02-20 - -Use this page as the single entry point for rollout, validation, and incident runbooks. - -## Release and Rollout - -- [Operator quick card](operator-quick-card.md) -- [v0.7.0 release notes](../releases/v0.7.0.md) -- [v0.7.0 upgrade guide](../releases/v0.7.0-upgrade.md) -- [v0.6.0 release notes](../releases/v0.6.0.md) (historical) -- [v0.6.0 upgrade guide](../releases/v0.6.0-upgrade.md) (historical) -- [Release compatibility matrix](../releases/compatibility-matrix.md) -- [Production rollout golden path](production-rollout.md) -- [Production deployment guide](production.md) -- [KMS setup guide](kms-setup.md) -- [KMS migration checklist](kms-migration-checklist.md) - -## Authorization Policy Validation - -- [Policies cookbook](../api/policies.md) -- [Path matching behavior](../api/policies.md#path-matching-behavior) -- [Route shape vs policy shape](../api/policies.md#route-shape-vs-policy-shape) -- [Policy review checklist before deploy](../api/policies.md#policy-review-checklist-before-deploy) -- [Policy smoke tests](policy-smoke-tests.md) - -## API and Access Verification - -- [Capability matrix](../api/capability-matrix.md) -- [API error decision matrix](../api/error-decision-matrix.md) -- [Authentication API](../api/authentication.md) -- [Audit logs API](../api/audit-logs.md) - -## Incident and Recovery - -- [Incident decision tree](incident-decision-tree.md) -- [First 15 Minutes Playbook](first-15-minutes.md) -- [Failure playbooks](failure-playbooks.md) -- [Operator drills (quarterly)](operator-drills.md) -- [Troubleshooting](../getting-started/troubleshooting.md) -- [Key management operations](key-management.md) -- [Known limitations](known-limitations.md) - -## Observability and Health - -- [Monitoring](monitoring.md) -- [Trusted proxy reference](trusted-proxy-reference.md) -- [Smoke test guide](../getting-started/smoke-test.md) - -## Suggested Operator Flow - -1. Read release notes for behavior changes and upgrade notes -2. 
Apply policy review checklist and rollout changes -3. Run smoke tests and policy smoke tests before traffic cutover -4. Verify denied/allowed patterns in audit logs after rollout -5. Use failure playbooks and troubleshooting for incidents diff --git a/docs/operations/runbooks/README.md b/docs/operations/runbooks/README.md new file mode 100644 index 0000000..388806d --- /dev/null +++ b/docs/operations/runbooks/README.md @@ -0,0 +1,158 @@ +# 🧭 Operator Runbook Index + +> Last updated: 2026-02-20 + +Use this page as the single entry point for rollout, validation, and incident runbooks. + +## Release and Rollout + +- [Release notes](../../releases/RELEASES.md) +- [Release compatibility matrix](../../releases/compatibility-matrix.md) +- [Production rollout golden path](../deployment/production-rollout.md) +- [Production deployment guide](../deployment/production.md) +- [KMS setup guide](../kms/setup.md) + +## Authorization Policy Validation + +- [Policies cookbook](../../api/auth/policies.md) +- [Path matching behavior](../../api/auth/policies.md#path-matching-behavior) +- [Route shape vs policy shape](../../api/auth/policies.md#route-shape-vs-policy-shape) +- [Policy review checklist before deploy](../../api/auth/policies.md#policy-review-checklist-before-deploy) +- [Policy smoke tests](../runbooks/policy-smoke-tests.md) + +## API and Access Verification + +- [Capability matrix](../../api/fundamentals.md#capability-matrix) +- [API error decision matrix](../../api/fundamentals.md#error-decision-matrix) +- [Authentication API](../../api/auth/authentication.md) +- [Audit logs API](../../api/observability/audit-logs.md) + +## Incident and Recovery + +- [Incident response guide](../observability/incident-response.md) +- [Troubleshooting](../../getting-started/troubleshooting.md) +- [Key management operations](../kms/key-management.md) + +## Observability and Health + +- [Monitoring](../observability/monitoring.md) +- [Smoke test guide](../../getting-started/smoke-test.md) + 
+## Suggested Operator Flow + +1. Read release notes for behavior changes and upgrade notes +2. Apply policy review checklist and rollout changes +3. Run smoke tests and policy smoke tests before traffic cutover +4. Verify denied/allowed patterns in audit logs after rollout +5. Use failure playbooks and troubleshooting for incidents + +## Operator Quick Card + +Use this section during rollout and incidents when you need a fast, minimal checklist. + +### Rollout Preflight (5-minute check) + +1. Confirm target version and image tag match release plan +2. Confirm DB connectivity and migration window +3. Confirm key mode settings (`KMS_PROVIDER` + `KMS_KEY_URI` or legacy mode) +4. Confirm token and route rate-limit settings are intentional +5. Confirm rollback owner and communication channel + +Primary references: + +- [Production rollout golden path](../deployment/production-rollout.md) +- [Release compatibility matrix](../../releases/compatibility-matrix.md) +- [Release notes](../../releases/RELEASES.md) + +### Baseline Verification (before traffic cutover) + +1. `GET /health` returns `200` +2. `GET /ready` returns `200` +3. `POST /v1/token` returns `201` +4. Secrets write/read passes +5. Transit encrypt/decrypt passes + +Reference: + +- [Smoke test guide](../../getting-started/smoke-test.md) + +### Fast Status Triage (`401` / `403` / `429`) + +1. `401`: re-check credentials/token issuance path +2. `403`: verify policy path and capability mapping +3. `429`: check `Retry-After`, then decide per-client vs token-IP tuning path + +References: + +- [API error decision matrix](../../api/fundamentals.md#error-decision-matrix) +- [API rate limiting](../../api/fundamentals.md#rate-limiting) +- [Monitoring](../observability/monitoring.md) + +### Token Endpoint `429` Quick Path + +1. Confirm `429` concentrated on `POST /v1/token` +2. Verify shared NAT/proxy egress is not collapsing many clients to one IP +3. Validate trusted proxy and forwarded header behavior +4. 
Apply temporary `RATE_LIMIT_TOKEN_*` tuning only if traffic is legitimate +5. Revert temporary tuning after stability + +References: + +- [Production token throttling runbook](../deployment/production.md#10-token-endpoint-throttling-runbook) +- [Trusted proxy reference](../security/hardening.md#trusted-proxy-configuration) + +### Rollback Triggers + +- Sustained elevated `5xx` +- Widespread token/auth failures +- Unexpected data-integrity behavior +- Failed verification gates after rollout + +Reference: + +- [Rollback procedure](../deployment/production-rollout.md#rollback-procedure-binaryimage) + +### Incident Notes Minimum + +Capture these before closing: + +- timeline (detection -> mitigation -> recovery) +- affected routes/clients +- config changes applied (`RATE_LIMIT_*`, `RATE_LIMIT_TOKEN_*`, policy updates) +- final mitigation and follow-up owner + +## Operator Drills (Quarterly) + +Use this section for quarterly game-day exercises that validate operational readiness. + +### Drill Catalog + +| Drill | Scenario | Primary runbooks | Evidence to collect | +| --- | --- | --- | --- | +| Credential compromise | Client secret leaked | `production.md`, `key-management.md`, `incident-response.md` | revocation timeline, new client IDs, audit evidence | +| Key rotation under load | KEK/master-key rotation while traffic is active | `key-management.md`, `production-rollout.md` | rotation timestamps, restart logs, smoke checks | +| Traffic surge / throttling | Burst traffic causes `429` pressure | `monitoring.md`, `api/fundamentals.md#rate-limiting` | `429` ratio, retry behavior, threshold decision | +| Database outage | DB unreachable / failover | `incident-response.md`, `production.md` | outage timeline, failover duration, restore checks | + +### Quarterly Execution Template + +1. Pick one drill owner and one incident commander +2. Define blast radius and rollback boundary +3. Execute drill in staging (or prod shadow) with fixed timebox +4. 
Capture metrics, logs, and runbook deviations +5. Produce remediation actions with owners and due dates + +### Pass Criteria + +- Critical runbooks are executable without undocumented tribal knowledge +- On-call can identify root cause and containment path within target SLA +- Recovery path is validated by health checks and smoke tests +- Postmortem includes at least one docs/process improvement item + +### Evidence Checklist + +- Timeline with UTC timestamps +- Request IDs for key failure and recovery events +- Alert timeline (fired, acknowledged, resolved) +- Commands executed and operator decisions +- Follow-up tickets and target completion dates diff --git a/docs/operations/policy-smoke-tests.md b/docs/operations/runbooks/policy-smoke-tests.md similarity index 92% rename from docs/operations/policy-smoke-tests.md rename to docs/operations/runbooks/policy-smoke-tests.md index fb07c99..4639c24 100644 --- a/docs/operations/policy-smoke-tests.md +++ b/docs/operations/runbooks/policy-smoke-tests.md @@ -1,6 +1,6 @@ # 🧪 Policy Smoke Tests -> Last updated: 2026-02-19 +> Last updated: 2026-02-20 Use this page to quickly validate authorization behavior after policy changes. @@ -105,7 +105,7 @@ test "$BAD_SHAPE_STATUS" = "404" Tip: this validates caller path shape expectations; use the allow/deny rotate checks above to validate capability enforcement. -See [Route shape vs policy shape](../api/policies.md#route-shape-vs-policy-shape) for triage guidance. +See [Route shape vs policy shape](../../api/auth/policies.md#route-shape-vs-policy-shape) for triage guidance. Secrets malformed path-shape check (missing wildcard subpath should not match): @@ -119,7 +119,7 @@ test "$BAD_SECRET_SHAPE_STATUS" = "404" ``` Tip: use this check to ensure policy path logic is not confused with route-template shape. -See [Route shape vs policy shape](../api/policies.md#route-shape-vs-policy-shape) for details. 
+See [Route shape vs policy shape](../../api/auth/policies.md#route-shape-vs-policy-shape) for details. Tokenization detokenize check (`decrypt` required): @@ -214,7 +214,7 @@ echo "policy smoke suite: PASS" ## See also -- [Capability matrix](../api/capability-matrix.md) -- [Policies cookbook](../api/policies.md) -- [Failure playbooks](failure-playbooks.md) -- [Troubleshooting](../getting-started/troubleshooting.md) +- [Capability matrix](../../api/fundamentals.md#capability-matrix) +- [Policies cookbook](../../api/auth/policies.md) +- [Incident response guide](../observability/incident-response.md) +- [Troubleshooting](../../getting-started/troubleshooting.md) diff --git a/docs/operations/security-hardening.md b/docs/operations/security/hardening.md similarity index 89% rename from docs/operations/security-hardening.md rename to docs/operations/security/hardening.md index 5d8b184..43f42ef 100644 --- a/docs/operations/security-hardening.md +++ b/docs/operations/security/hardening.md @@ -305,7 +305,7 @@ RATE_LIMIT_TOKEN_BURST=10 - Ensure your edge proxy/load balancer sets client IP headers consistently - If trusted proxy settings are incorrect, all token requests can appear from one IP and trigger false `429` - If headers are over-trusted, attackers can spoof forwarded IPs to evade per-IP controls -- Use [Trusted proxy reference](trusted-proxy-reference.md) for validation workflow and platform notes +- Use [Trusted proxy configuration](#trusted-proxy-configuration) for validation workflow and platform notes ### Tuning Guidance @@ -723,11 +723,71 @@ Use this checklist for production deployment validation. - [ ] On-call contacts documented - [ ] Security review scheduled (quarterly) +## Trusted Proxy Configuration + +Use this section to validate source-IP forwarding for security controls that depend on caller IP +(for example token endpoint per-IP rate limiting on `POST /v1/token`). 
+ +### Why this matters + +- If proxy trust is too broad, attackers may spoof `X-Forwarded-For` +- If proxy trust is too narrow/incorrect, many clients can collapse into one apparent IP +- Both cases can invalidate per-IP rate-limiting behavior + +### Validation checklist + +1. Only trusted edge proxies can set forwarded client-IP headers +2. Untrusted internet clients cannot inject arbitrary `X-Forwarded-For` +3. App-observed `client_ip` matches edge-proxy access logs for sampled requests +4. Multi-hop proxy behavior (if any) is documented and tested + +### Nginx baseline forwarding + +```nginx +location / { + proxy_pass http://127.0.0.1:8080; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; +} +``` + +Hardening notes: + +- Do not accept forwarded headers directly from public clients +- Ensure only your reverse-proxy tier can reach application port `8080` + +### AWS ALB / ELB notes + +- ALB injects `X-Forwarded-For`; keep app reachable only from ALB/security group path +- Validate that downstream proxies preserve rather than overwrite trusted header chain +- Sample and compare ALB access logs with app `client_ip` logs + +### Cloudflare / CDN edge notes + +- Prefer single trusted edge path to origin +- If using CDN-specific client IP headers, keep mapping and validation documented +- Reject direct origin traffic from non-edge sources where possible + +### Diagnostic quick test + +1. Send a test request through edge proxy +2. Capture edge log source IP +3. Capture app log `client_ip` and request ID +4. 
Confirm both values refer to the same caller context + +### Common failure patterns + +- **All token requests share one IP:** likely NAT/proxy collapse or missing forwarded IP propagation +- **Frequent token `429` after proxy changes:** trust chain or source-IP extraction behavior drifted +- **Suspiciously diverse token caller IPs from one source:** potential forwarded-header spoofing + ## See also -- [Production deployment guide](production.md) -- [Environment variables](../configuration/environment-variables.md) -- [Security model](../concepts/security-model.md) -- [Monitoring](monitoring.md) -- [Policy management](../api/policies.md) -- [Troubleshooting](../getting-started/troubleshooting.md) +- [Production deployment guide](../deployment/production.md) +- [Environment variables](../../configuration.md) +- [Security model](../../concepts/security-model.md) +- [Monitoring](../observability/monitoring.md) +- [Policy management](../../api/auth/policies.md) +- [Troubleshooting](../../getting-started/troubleshooting.md) diff --git a/docs/operations/trusted-proxy-reference.md b/docs/operations/trusted-proxy-reference.md deleted file mode 100644 index 611267c..0000000 --- a/docs/operations/trusted-proxy-reference.md +++ /dev/null @@ -1,67 +0,0 @@ -# 🌐 Trusted Proxy Reference - -> Last updated: 2026-02-20 - -Use this guide to validate source-IP forwarding for security controls that depend on caller IP -(for example token endpoint per-IP rate limiting on `POST /v1/token`). - -## Why this matters - -- If proxy trust is too broad, attackers may spoof `X-Forwarded-For` -- If proxy trust is too narrow/incorrect, many clients can collapse into one apparent IP -- Both cases can invalidate per-IP rate-limiting behavior - -## Validation checklist - -1. Only trusted edge proxies can set forwarded client-IP headers -2. Untrusted internet clients cannot inject arbitrary `X-Forwarded-For` -3. App-observed `client_ip` matches edge-proxy access logs for sampled requests -4. 
Multi-hop proxy behavior (if any) is documented and tested - -## Nginx baseline forwarding - -```nginx -location / { - proxy_pass http://127.0.0.1:8080; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; -} -``` - -Hardening notes: - -- Do not accept forwarded headers directly from public clients -- Ensure only your reverse-proxy tier can reach application port `8080` - -## AWS ALB / ELB notes - -- ALB injects `X-Forwarded-For`; keep app reachable only from ALB/security group path -- Validate that downstream proxies preserve rather than overwrite trusted header chain -- Sample and compare ALB access logs with app `client_ip` logs - -## Cloudflare / CDN edge notes - -- Prefer single trusted edge path to origin -- If using CDN-specific client IP headers, keep mapping and validation documented -- Reject direct origin traffic from non-edge sources where possible - -## Diagnostic quick test - -1. Send a test request through edge proxy -2. Capture edge log source IP -3. Capture app log `client_ip` and request ID -4. 
Confirm both values refer to the same caller context - -## Common failure patterns - -- **All token requests share one IP:** likely NAT/proxy collapse or missing forwarded IP propagation -- **Frequent token `429` after proxy changes:** trust chain or source-IP extraction behavior drifted -- **Suspiciously diverse token caller IPs from one source:** potential forwarded-header spoofing - -## See also - -- [Security hardening guide](security-hardening.md) -- [Production deployment guide](production.md) -- [Troubleshooting](../getting-started/troubleshooting.md) diff --git a/docs/personas/README.md b/docs/personas/README.md new file mode 100644 index 0000000..cfb5e9a --- /dev/null +++ b/docs/personas/README.md @@ -0,0 +1,69 @@ +# 🎭 Documentation Paths by Persona + +> Last updated: 2026-02-20 + +Choose your learning path based on your role and goals. Each persona path provides a curated journey through the documentation. + +## 📑 Quick Navigation + +- [👷 Operator Path](#operator-path) - Deployment and operations +- [👨‍💻 Developer Path](#developer-path) - API integration and development +- [🛡️ Security Engineer Path](#security-engineer-path) - Hardening and auditability + +--- + +## Operator Path + +Use this path when your goal is reliable deployment and fast incident response. + +### Primary path + +1. [Day 0 Operator Walkthrough](../getting-started/day-0-walkthrough.md#operator-path) +2. [Production rollout golden path](../operations/deployment/production-rollout.md) +3. [Operator quick card](../operations/runbooks/README.md#operator-quick-card) +4. [Incident response guide](../operations/observability/incident-response.md) + +### Deep links + +- [Monitoring](../operations/observability/monitoring.md) +- [Known limitations](../operations/deployment/production.md#known-limitations-and-tradeoffs) + +--- + +## Developer Path + +Use this path when your goal is API integration and feature delivery with docs parity. + +### Primary path + +1. 
[Day 0 Developer Walkthrough](../getting-started/day-0-walkthrough.md#developer-path) +2. [Authentication API](../api/auth/authentication.md) +3. [Error decision matrix](../api/fundamentals.md#error-decision-matrix) +4. [Rate limiting](../api/fundamentals.md#rate-limiting) +5. [Code examples](../examples/README.md) + +### Deep links + +- [Capability matrix](../api/fundamentals.md#capability-matrix) +- [Policies cookbook](../api/auth/policies.md) +- [Docs release checklist](../contributing.md#docs-release-checklist) + +--- + +## Security Engineer Path + +Use this path when your goal is threat reduction, hardening, and auditability. + +### Primary path + +1. [Security hardening guide](../operations/security/hardening.md) +2. [Trusted proxy reference](../operations/security/hardening.md#trusted-proxy-configuration) +3. [Known limitations](../operations/deployment/production.md#known-limitations-and-tradeoffs) +4. [Production deployment guide](../operations/deployment/production.md) +5. [Monitoring](../operations/observability/monitoring.md) + +### Deep links + +- [Security model](../concepts/security-model.md) +- [KMS setup guide](../operations/kms/setup.md) +- [KMS migration checklist](../operations/kms/setup.md#migration-checklist) diff --git a/docs/personas/developer.md b/docs/personas/developer.md deleted file mode 100644 index 01251e3..0000000 --- a/docs/personas/developer.md +++ /dev/null @@ -1,19 +0,0 @@ -# 👨‍💻 Developer Persona Path - -> Last updated: 2026-02-20 - -Use this path when your goal is API integration and feature delivery with docs parity. - -## Primary path - -1. [Day 0 Developer Walkthrough](../getting-started/day-0-developer.md) -2. [Authentication API](../api/authentication.md) -3. [Error decision matrix](../api/error-decision-matrix.md) -4. [Rate limiting](../api/rate-limiting.md) -5. 
[Versioned examples](../examples/versioned-by-release.md) - -## Deep links - -- [Capability matrix](../api/capability-matrix.md) -- [Policies cookbook](../api/policies.md) -- [Docs release checklist](../development/docs-release-checklist.md) diff --git a/docs/personas/operator.md b/docs/personas/operator.md deleted file mode 100644 index b7ffba6..0000000 --- a/docs/personas/operator.md +++ /dev/null @@ -1,19 +0,0 @@ -# 👷 Operator Persona Path - -> Last updated: 2026-02-20 - -Use this path when your goal is reliable deployment and fast incident response. - -## Primary path - -1. [Day 0 Operator Walkthrough](../getting-started/day-0-operator.md) -2. [Production rollout golden path](../operations/production-rollout.md) -3. [Operator quick card](../operations/operator-quick-card.md) -4. [Incident decision tree](../operations/incident-decision-tree.md) -5. [First 15 Minutes Playbook](../operations/first-15-minutes.md) - -## Deep links - -- [Monitoring](../operations/monitoring.md) -- [Failure playbooks](../operations/failure-playbooks.md) -- [Known limitations](../operations/known-limitations.md) diff --git a/docs/personas/security.md b/docs/personas/security.md deleted file mode 100644 index efe7bbd..0000000 --- a/docs/personas/security.md +++ /dev/null @@ -1,19 +0,0 @@ -# 🛡️ Security Persona Path - -> Last updated: 2026-02-20 - -Use this path when your goal is threat reduction, hardening, and auditability. - -## Primary path - -1. [Security hardening guide](../operations/security-hardening.md) -2. [Trusted proxy reference](../operations/trusted-proxy-reference.md) -3. [Known limitations](../operations/known-limitations.md) -4. [Production deployment guide](../operations/production.md) -5. 
[Monitoring](../operations/monitoring.md) - - ## Deep links - - [Security model](../concepts/security-model.md) -- [KMS setup guide](../operations/kms-setup.md) -- [KMS migration checklist](../operations/kms-migration-checklist.md) diff --git a/docs/releases/RELEASES.md b/docs/releases/RELEASES.md new file mode 100644 index 0000000..e342383 --- /dev/null +++ b/docs/releases/RELEASES.md @@ -0,0 +1,733 @@ +# 🚀 Release Notes + +> Last updated: 2026-02-20 + +This document contains release notes and upgrade guides for all versions of Secrets. + +For the compatibility matrix across versions, see [compatibility-matrix.md](compatibility-matrix.md). + +## 📑 Quick Navigation + +**Latest Release**: [v0.8.0](#080---2026-02-20) + +**All Releases**: + +- [v0.8.0 (2026-02-20)](#080---2026-02-20) - Documentation consolidation and ADR establishment +- [v0.7.0 (2026-02-20)](#070---2026-02-20) - IP-based rate limiting for token endpoint +- [v0.6.0 (2026-02-19)](#060---2026-02-19) - KMS provider support +- [v0.5.1 (2026-02-19)](#051---2026-02-19) - Master key loading fix and secure key zeroing +- [v0.5.0 (2026-02-19)](#050---2026-02-19) - Per-client rate limiting, CORS, and shorter default token expiration +- [v0.4.1 (2026-02-19)](#041---2026-02-19) - Policy wildcard path matching fix +- [v0.4.0 (2026-02-18)](#040---2026-02-18) - Tokenization API +- [v0.3.0 (2026-02-16)](#030---2026-02-16) - Client management +- [v0.2.0 (2026-02-14)](#020---2026-02-14) - Transit encryption +- [v0.1.0 (2026-02-14)](#010---2026-02-14) - Initial release + +--- + +## [0.8.0] - 2026-02-20 + +### Highlights + +- Documentation consolidation: reduced from 77 to 47 markdown files (39% reduction) +- Established 8 new Architecture Decision Records (ADR 0003-0010) covering key architectural decisions +- Restructured API documentation with themed subdirectories (auth/, data/, observability/) +- Consolidated operations documentation with centralized runbook hub +- Merged all development documentation into contributing.md +- Comprehensive cross-reference updates throughout documentation (182+ 
updates) + +### Runtime Changes + +None - this is a documentation-only release. + +### Upgrade from v0.7.0 + +#### What Changed + +- Documentation structure improvements (no code or runtime changes) +- All v0.7.0 functionality remains identical +- No environment variables, schema, or API changes + +#### Upgrade Steps + +No upgrade required. v0.8.0 is documentation-only and fully backward compatible with v0.7.0. + +If referencing documentation, update any bookmarks or links to reflect new documentation structure: + +- API fundamentals consolidated into `docs/api/fundamentals.md` +- API endpoints organized by theme: `auth/`, `data/`, `observability/` +- Operations runbooks centralized in `docs/operations/runbooks/README.md` +- Development guide now at `docs/contributing.md` + +#### Documentation Updates + +- 8 new ADRs documenting architectural decisions (capability-based auth, dual database support, transaction management, rate limiting, API versioning, Gin framework, UUIDv7, Argon2id) +- API documentation restructured with auth/, data/, observability/ subdirectories +- Operations documentation consolidated with runbook hub and themed organization +- All development documentation merged into single contributing.md guide +- Comprehensive cross-reference updates (182+ link updates) +- All validation passing (627 OK links, 0 errors) + +#### See Also + +- [Compatibility matrix](compatibility-matrix.md) +- [Architecture Decision Records](../adr/) +- [Documentation index](../README.md) + +--- + +## [0.7.0] - 2026-02-20 + +### Highlights + +- Added IP-based rate limiting for `POST /v1/token` +- Added token endpoint rate-limit configuration via `RATE_LIMIT_TOKEN_*` variables +- Added token endpoint `429 Too Many Requests` behavior with `Retry-After` +- Expanded docs and runbooks for token endpoint abuse protection and rollout validation + +### Runtime Changes + +- New environment variables: + - `RATE_LIMIT_TOKEN_ENABLED` (default `true`) + - 
`RATE_LIMIT_TOKEN_REQUESTS_PER_SEC` (default `5.0`) + - `RATE_LIMIT_TOKEN_BURST` (default `10`) +- `POST /v1/token` may now return `429 Too Many Requests` when per-IP token limits are exceeded +- Authenticated per-client rate limiting (`RATE_LIMIT_*`) remains unchanged + +### Security and Operations Impact + +- Improves protection against token endpoint credential stuffing and brute-force traffic +- Applies stricter defaults on unauthenticated token issuance than authenticated API routes +- Requires review of proxy/trusted-IP setup when using forwarded headers in production + +### Upgrade from v0.6.0 + +#### What Changed + +- Added IP-based token endpoint rate limiting for `POST /v1/token` +- Added new token endpoint throttling configuration (`RATE_LIMIT_TOKEN_*`) +- Token issuance can now return `429 Too Many Requests` with `Retry-After` + +#### Env Diff + +```diff ++ RATE_LIMIT_TOKEN_ENABLED=true ++ RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 ++ RATE_LIMIT_TOKEN_BURST=10 +``` + +#### Recommended Upgrade Steps + +1. Update image/binary to `v0.7.0` +2. Add `RATE_LIMIT_TOKEN_*` variables to runtime configuration +3. Restart API instances with standard rolling rollout process +4. Run baseline checks: `GET /health`, `GET /ready` +5. Run token and key-dependent checks + +#### Quick Verification Commands + +```bash +curl -sS http://localhost:8080/health +curl -sS http://localhost:8080/ready + +TOKEN_RESPONSE="$(curl -sS -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"","client_secret":""}')" + +CLIENT_TOKEN="$(printf '%s' "${TOKEN_RESPONSE}" | jq -r '.token')" + +curl -sS -X POST http://localhost:8080/v1/secrets/upgrade/v070 \ + -H "Authorization: Bearer ${CLIENT_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"value":"djA3MC1zbW9rZQ=="}' +``` + +#### Operator Verification Checklist + +1. Confirm `GET /health` and `GET /ready` succeed +2. 
Confirm `POST /v1/token` issues tokens normally for expected request rates +3. Confirm token endpoint returns controlled `429` with `Retry-After` when intentionally exceeded +4. Confirm authenticated route limits and retry behavior still match policy + +#### Documentation Updates + +- Added [API rate limiting](../api/fundamentals.md#rate-limiting) with token endpoint scope +- Updated [Environment variables](../configuration.md) with `RATE_LIMIT_TOKEN_*` +- Updated [Troubleshooting](../getting-started/troubleshooting.md) with token endpoint `429` diagnostics + +--- + +## [0.6.0] - 2026-02-19 + +### Highlights + +- Added KMS support for master key loading and decryption at startup +- Added CLI KMS flags to `create-master-key` (`--kms-provider`, `--kms-key-uri`) +- Added new `rotate-master-key` CLI command for staged master key rotation +- Added provider setup and migration runbook: [KMS setup guide](../operations/kms/setup.md) + +### Runtime Changes + +- New environment variables: + - `KMS_PROVIDER` + - `KMS_KEY_URI` +- Master key loading now supports two modes: + - KMS mode: both variables set + - Legacy mode: both variables unset +- Startup fails fast if only one KMS variable is set + +### Security and Operations Impact + +- KMS mode encrypts master keys at rest and centralizes key access control in your KMS provider +- Existing legacy environments remain supported without immediate migration +- Master key rotation now has an explicit CLI workflow for appending a new active key before cleanup + +### Upgrade from v0.5.1 + +#### What Changed + +- Added KMS-backed master key loading mode (`KMS_PROVIDER`, `KMS_KEY_URI`) +- Added KMS flags to `create-master-key` +- Added `rotate-master-key` CLI command for staged master key rotation +- Added fail-fast validation for partial KMS configuration + +#### Recommended Upgrade Steps + +1. Update image/binary to `v0.6.0` +2. 
Decide runtime key mode: + - Keep legacy mode (no KMS vars set), or + - Enable KMS mode (`KMS_PROVIDER` and `KMS_KEY_URI` both set) +3. Restart API instances with standard rolling rollout process +4. Run baseline checks: `GET /health`, `GET /ready` +5. Run key-dependent smoke checks + +#### Decision Path + +- **Stay on legacy mode now:** + - Keep `KMS_PROVIDER` and `KMS_KEY_URI` unset + - Upgrade binaries/images and validate normal crypto flows +- **Adopt KMS mode now:** + - Set both `KMS_PROVIDER` and `KMS_KEY_URI` + - Ensure all `MASTER_KEYS` entries are KMS ciphertext + - Follow migration workflow in [KMS setup guide](../operations/kms/setup.md) + - Track rollout gates in [KMS migration checklist](../operations/kms/setup.md#migration-checklist) + +#### Quick Verification Commands + +```bash +curl -sS http://localhost:8080/health +curl -sS http://localhost:8080/ready + +TOKEN_RESPONSE="$(curl -sS -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"","client_secret":""}')" + +CLIENT_TOKEN="$(printf '%s' "${TOKEN_RESPONSE}" | jq -r '.token')" + +curl -sS -X POST http://localhost:8080/v1/secrets/upgrade/v060 \ + -H "Authorization: Bearer ${CLIENT_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"value":"djA2MC1zbW9rZQ=="}' +``` + +#### Operator Verification Checklist + +1. Confirm `GET /health` and `GET /ready` succeed +2. Confirm startup logs reflect intended key mode and active master key +3. Confirm token issuance and secrets/transit round-trip flows +4. 
Confirm no KMS auth/decrypt errors in startup logs + +#### Documentation Updates + +- Added [KMS setup guide](../operations/kms/setup.md) +- Updated [CLI commands](../cli-commands.md) with KMS flags and `rotate-master-key` +- Updated [Environment variables](../configuration.md) with KMS mode configuration + +--- + +## [0.5.1] - 2026-02-19 + +### Highlights + +- Fixed master key loading from environment variables to avoid zeroing the in-use key slice +- Hardened keychain shutdown by zeroing all master keys before clearing chain state +- Added regression tests for key usability after load and secure zeroing on close + +### Fixes + +- `LoadMasterKeyChainFromEnv` now stores a copy of decoded key bytes before zeroing temporary buffers +- `MasterKeyChain.Close` now zeros every loaded master key before clearing the key map + +### Security Impact + +- Reduces risk of leaked key material remaining in temporary decode buffers +- Ensures explicit in-memory zeroing of master keys during keychain teardown + +### Runtime and Compatibility + +- API baseline remains `v1` (`/v1/*`) +- No endpoint, payload, or status code contract changes +- No schema migrations required specifically for this patch release + +### Upgrade from v0.5.0 + +#### What Changed + +- Fixed master key loading from `MASTER_KEYS` to preserve active key material after decode +- Added secure zeroing of all keychain-held master keys during `Close` +- Added regression test coverage for these memory lifecycle paths + +#### Recommended Upgrade Steps + +1. Update image/binary to `v0.5.1` +2. Restart API instances with standard rolling rollout process +3. Run baseline checks: `GET /health`, `GET /ready` +4. 
Run key-dependent smoke checks: token issuance, secrets write/read, transit encrypt/decrypt + +#### Quick Verification Commands + +```bash +curl -sS http://localhost:8080/health + +TOKEN_RESPONSE="$(curl -sS -X POST http://localhost:8080/v1/token \ + -H "Content-Type: application/json" \ + -d '{"client_id":"","client_secret":""}')" + +CLIENT_TOKEN="$(printf '%s' "${TOKEN_RESPONSE}" | jq -r '.token')" + +curl -sS -X POST http://localhost:8080/v1/secrets/upgrade/smoke \ + -H "Authorization: Bearer ${CLIENT_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"value":"c21va2UtdjA1MQ=="}' + +curl -sS -X GET http://localhost:8080/v1/secrets/upgrade/smoke \ + -H "Authorization: Bearer ${CLIENT_TOKEN}" +``` + +#### Operator Verification Checklist + +1. Confirm service health and readiness after rollout +2. Confirm startup succeeds with configured `MASTER_KEYS` and `ACTIVE_MASTER_KEY_ID` +3. Confirm secrets and transit workflows succeed under normal traffic +4. Confirm no unexpected key configuration or decryption errors in logs + +#### Patch Release Safety + +- Most environments require no configuration changes for this release +- Rolling upgrade is recommended; keep standard health and smoke checks in place +- Rollback to the previous stable image is safe when incident criteria are met + +#### Documentation Updates + +- Updated [release compatibility matrix](compatibility-matrix.md) with `v0.5.0 -> v0.5.1` +- Updated current-release references across docs and pinned image examples to `v0.5.1` + +--- + +## [0.5.0] - 2026-02-19 + +### Highlights + +- Added per-client rate limiting for authenticated API routes +- Added configurable CORS middleware with secure defaults +- Reduced default token expiration from 24 hours to 4 hours +- Added comprehensive production security hardening guide + +### Runtime Changes + +- New rate limiting settings: + - `RATE_LIMIT_ENABLED` (default `true`) + - `RATE_LIMIT_REQUESTS_PER_SEC` (default `10.0`) + - `RATE_LIMIT_BURST` (default `20`) 
+- New CORS settings: + - `CORS_ENABLED` (default `false`) + - `CORS_ALLOW_ORIGINS` (default empty) +- Authenticated endpoints now return `429 Too Many Requests` when limits are exceeded and include `Retry-After` response header + +### Breaking / Behavior Changes + +- **Default token expiration changed**: + - Previous default: `AUTH_TOKEN_EXPIRATION_SECONDS=86400` (24h) + - New default: `AUTH_TOKEN_EXPIRATION_SECONDS=14400` (4h) + +If your clients expected 24-hour tokens, explicitly set `AUTH_TOKEN_EXPIRATION_SECONDS=86400` and verify refresh behavior. + +### Upgrade from v0.4.x + +#### What changed + +- Default token expiration is now shorter (`24h` -> `4h`) +- Per-client rate limiting is enabled by default +- CORS is configurable and remains disabled by default +- Security hardening guidance expanded for production deployments + +#### Env diff + +```diff +- AUTH_TOKEN_EXPIRATION_SECONDS=86400 ++ AUTH_TOKEN_EXPIRATION_SECONDS=14400 + ++ RATE_LIMIT_ENABLED=true ++ RATE_LIMIT_REQUESTS_PER_SEC=10.0 ++ RATE_LIMIT_BURST=20 + ++ CORS_ENABLED=false ++ CORS_ALLOW_ORIGINS= +``` + +If your clients rely on 24-hour tokens, keep explicit configuration: + +```dotenv +AUTH_TOKEN_EXPIRATION_SECONDS=86400 +``` + +#### Upgrade steps + +1. Update image/binary to `v0.5.0` +2. Review and set explicit `AUTH_TOKEN_EXPIRATION_SECONDS` +3. Add `RATE_LIMIT_*` variables with values matching your traffic profile +4. Keep `CORS_ENABLED=false` unless browser-based access is required +5. Restart API servers with updated environment + +#### Post-upgrade verification + +1. Health checks pass: `GET /health`, `GET /ready` +2. Token issuance works and expiration matches expected TTL +3. Authenticated endpoint rate limit returns `429` with `Retry-After` when exceeded +4. Normal traffic does not hit `429` unexpectedly +5. CORS behavior is correct for your deployment mode + +#### Operator Verification Checklist + +1. Confirm health endpoints: `GET /health`, `GET /ready` +2. 
Validate token issuance and expiration expectations after upgrade +3. Confirm authenticated API traffic is not unintentionally rate limited +4. Validate `429` behavior and `Retry-After` header with controlled load test +5. Confirm CORS behavior matches policy (disabled by default, explicit origins only when enabled) + +#### Security Guidance + +- Use TLS termination at reverse proxy/load balancer +- Use database TLS in production (`sslmode=require` or stronger / `tls=true` or stronger) +- Store master keys in a dedicated secrets manager +- Review least-privilege client policies and rotate credentials regularly + +#### Documentation Updates + +- Added [Security hardening guide](../operations/security/hardening.md) +- Updated [Environment variables](../configuration.md) with rate limiting, CORS, and token expiration migration notes +- Updated [Production deployment guide](../operations/deployment/production.md) with security hardening links + +--- + +## [0.4.1] - 2026-02-19 + +### Highlights + +- Fixed authorization path matching for policies using mid-path wildcards +- Clarified wildcard matching semantics for exact, trailing wildcard, and segment wildcard paths +- Expanded automated coverage for policy templates, wildcard edge cases, and common policy mistakes + +### Bug Fixes + +- Policy matcher now supports mid-path wildcard patterns such as `/v1/transit/keys/*/rotate` +- Mid-path `*` wildcard now matches exactly one path segment +- Trailing wildcard `/*` behavior remains greedy for nested subpaths + +### Runtime and Compatibility + +- API baseline remains v1 (`/v1/*`) +- No breaking API path or payload contract changes +- Local development targets: Linux and macOS +- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` +- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` + +### Upgrade Notes + +- Recommended for all users relying on wildcard policy path matching +- No schema migrations required specifically for this bugfix release +- Existing 
tokenization, secrets, transit, auth, and audit flows remain API-compatible + +### Policy Migration Note + +If existing policies assumed prefix-only behavior, review wildcard paths used for rotate and similar endpoint-specific actions. + +Before (too broad for intent): + +```json +[ + { + "path": "/v1/transit/keys/*", + "capabilities": ["rotate"] + } +] +``` + +After (scoped to rotate endpoint pattern): + +```json +[ + { + "path": "/v1/transit/keys/*/rotate", + "capabilities": ["rotate"] + } +] +``` + +### Verification Checklist + +1. Deploy binaries/images with `v0.4.1` +2. Verify baseline health (`GET /health`, `GET /ready`) +3. Re-run policy smoke checks for expected allow/deny behavior +4. Confirm wildcard policies used in production match intended path semantics + +### Operator Quick Checklist (v0.4.1) + +1. Search client policies for rotate patterns and replace broad forms with `/v1/transit/keys/*/rotate` when needed. +2. Run route-shape smoke checks (`/v1/transit/keys/payment/extra/rotate` and `/v1/secrets`) and expect `404`. +3. Run allow/deny policy smoke checks and expect capability-denied calls to return `403`. +4. Review recent denied audit events and confirm mismatches are expected after policy rollout. 
+ +### Documentation Migration Map (v0.4.1) + +- Policy matching semantics: [Policies cookbook / Path matching behavior](../api/auth/policies.md#path-matching-behavior) +- Route-vs-policy triage: [Policies cookbook / Route shape vs policy shape](../api/auth/policies.md#route-shape-vs-policy-shape) +- Pre-deploy policy checks: [Policies cookbook / Policy review checklist before deploy](../api/auth/policies.md#policy-review-checklist-before-deploy) +- Capability verification: [Capability matrix](../api/fundamentals.md#capability-matrix) +- Operational validation steps: [Policy smoke tests](../operations/runbooks/policy-smoke-tests.md) +- Incident triage and matcher FAQ: [Troubleshooting](../getting-started/troubleshooting.md) + +--- + +## [0.4.0] - 2026-02-18 + +### Highlights + +- Added tokenization API under `/v1/tokenization/*` +- Added tokenization key lifecycle: create, rotate, delete +- Added token lifecycle: tokenize, detokenize, validate, revoke +- Added deterministic mode support for repeatable token generation +- Added token format support: `uuid`, `numeric`, `luhn-preserving`, `alphanumeric` +- Added expired-token maintenance command: `clean-expired-tokens` + +### API Additions + +New endpoints: + +- `POST /v1/tokenization/keys` +- `POST /v1/tokenization/keys/{name}/rotate` +- `DELETE /v1/tokenization/keys/{id}` +- `POST /v1/tokenization/keys/{name}/tokenize` +- `POST /v1/tokenization/detokenize` +- `POST /v1/tokenization/validate` +- `POST /v1/tokenization/revoke` + +### CLI Additions + +- `create-tokenization-key --name --format [--deterministic] [--algorithm ]` +- `rotate-tokenization-key --name --format [--deterministic] [--algorithm ]` +- `clean-expired-tokens --days [--dry-run] [--format text|json]` + +### Data Model and Migrations + +Added migration `000002_add_tokenization` for PostgreSQL and MySQL: + +- `tokenization_keys` table for versioned key metadata +- `tokenization_tokens` table for token-to-ciphertext mapping and lifecycle fields + +### 
Observability + +Added tokenization business operations metrics in the `tokenization` domain, including key and token lifecycle operations. + +### Runtime and Compatibility + +- API baseline remains v1 (`/v1/*`) +- Local development targets: Linux and macOS +- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` +- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` + +### Upgrade Notes + +- Non-breaking addition: tokenization capability under API v1 +- Existing auth, secrets, transit, and audit behavior remain compatible +- Run database migrations before using tokenization endpoints or CLI commands + +### Upgrade Checklist + +1. Deploy binaries/images with `v0.4.0` +2. Run DB migrations (`app migrate`) before serving traffic +3. Verify baseline health (`GET /health`, `GET /ready`) +4. Create a tokenization key (`create-tokenization-key` or `POST /v1/tokenization/keys`) +5. Run round-trip check: tokenize -> detokenize -> validate -> revoke +6. Schedule retention cleanup for expired tokens (`clean-expired-tokens`) + +### Rollback Notes + +- `000002_add_tokenization` is additive schema migration and is expected to remain applied during app rollback. +- Rolling back binaries/images to pre-`v0.4.0` can leave tokenization tables unused but present. +- Avoid destructive schema rollback in production unless you have a validated backup/restore plan. +- If rollback is required, keep existing data and disable tokenization traffic paths operationally until re-upgrade. 
+ +### Documentation Updates + +- Added [Tokenization API](../api/data/tokenization.md) reference +- Updated [CLI commands reference](../cli-commands.md) with tokenization commands +- Updated [Production operations](../operations/deployment/production.md) with tokenization workflows + +--- + +## [0.3.0] - 2026-02-16 + +### Highlights + +- Added OpenTelemetry metrics provider with Prometheus exporter +- Added optional `/metrics` endpoint for Prometheus scraping +- Added HTTP metrics middleware for request counts and latency histograms +- Added business operation metrics across auth, secrets, and transit use cases +- Added metrics configuration via `METRICS_ENABLED` and `METRICS_NAMESPACE` + +### Metrics and Monitoring + +New metric families: + +- `{namespace}_http_requests_total` +- `{namespace}_http_request_duration_seconds` +- `{namespace}_operations_total` +- `{namespace}_operation_duration_seconds` + +Runtime behavior: + +- When `METRICS_ENABLED=true` (default), the server exposes `GET /metrics` +- When `METRICS_ENABLED=false`, metrics middleware and `/metrics` are not registered +- `METRICS_NAMESPACE` (default `secrets`) prefixes metric names + +### Runtime and Compatibility + +- API baseline remains v1 (`/v1/*`) +- Metrics endpoint is outside API versioning (`/metrics`) +- Local development targets: Linux and macOS +- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` +- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` + +### Upgrade Notes + +- Non-breaking addition: observability and metrics instrumentation +- Existing API paths and behavior remain compatible under API v1 documentation +- Update your environment configuration if you want custom metric namespace values + +Example: + +```bash +export METRICS_ENABLED=true +export METRICS_NAMESPACE=secrets +curl http://localhost:8080/metrics +``` + +### Documentation Updates + +- Added [Monitoring operations guide](../operations/observability/monitoring.md) +- Updated [Environment 
variables](../configuration.md) +- Updated [Production operations](../operations/deployment/production.md) + +--- + +## [0.2.0] - 2026-02-14 + +### Highlights + +- New CLI command: `clean-audit-logs` +- Supports retention by age in days (`--days`) +- Supports safe preview mode (`--dry-run`) before deletion +- Supports machine-friendly output (`--format json`) and human-readable output (`--format text`) + +### Included CLI Addition + +- `clean-audit-logs --days [--dry-run] [--format text|json]` + +Operational behavior: + +- Dry-run mode counts matching rows without deleting +- Execution mode deletes rows older than the computed UTC cutoff date +- Works with both PostgreSQL and MySQL repositories + +### Runtime and Compatibility + +- API baseline remains v1 (`/v1/*`) +- Local development targets: Linux and macOS +- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` +- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` + +### Operational Notes + +- Use `--dry-run` first for production safety +- Ensure database is reachable and migrated before cleanup runs +- Keep retention execution on a defined cadence (for example monthly) + +Example: + +```bash +./bin/app clean-audit-logs --days 90 --dry-run --format json +``` + +### Upgrade Notes + +- Non-breaking addition: new CLI command for operations +- Existing API paths and behavior remain compatible under API v1 documentation + +### Documentation Updates + +- Updated [CLI commands reference](../cli-commands.md) +- Updated [Audit Logs API](../api/observability/audit-logs.md) +- Updated [Production operations](../operations/deployment/production.md) + +--- + +## [0.1.0] - 2026-02-14 + +### Highlights + +- Envelope encryption model with `Master Key -> KEK -> DEK -> Secret Data` +- Transit encryption API for encrypt/decrypt without storing application payload +- Token authentication and policy-based authorization +- Versioned secret storage by path and soft-delete behavior +- Audit logging with request correlation 
via `request_id` +- PostgreSQL and MySQL runtime support + +### Included API Surface + +- Auth: `POST /v1/token` +- Clients: `GET/POST /v1/clients`, `GET/PUT/DELETE /v1/clients/:id` +- Secrets: `POST/GET/DELETE /v1/secrets/*path` +- Transit: create/rotate/encrypt/decrypt/delete under `/v1/transit/keys*` +- Audit logs: `GET /v1/audit-logs` +- Health/readiness: `GET /health`, `GET /ready` + +### Runtime and Compatibility + +- Local development targets: Linux and macOS +- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` +- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` + +### Operational Notes + +- Restart API servers after master key or KEK rotation so processes load new key material +- Base64 request fields are encoding only, not encryption; always use HTTPS/TLS +- For transit decrypt, pass ciphertext exactly as returned by encrypt (`:`) + +### Known Limitations (v0.1.0) + +- `docs/openapi.yaml` is a baseline subset focused on common flows, not full endpoint parity +- API v1 compatibility policy applies to documented endpoint behavior in API reference docs + +### Upgrade Notes + +- Initial release: no prior upgrade path required + +--- + +## See also + +- [Release compatibility matrix](compatibility-matrix.md) +- [Documentation index](../README.md) +- [API compatibility policy](../api/fundamentals.md#compatibility-and-versioning-policy) +- [Production operations](../operations/deployment/production.md) diff --git a/docs/releases/_template.md b/docs/releases/_template.md deleted file mode 100644 index af7e783..0000000 --- a/docs/releases/_template.md +++ /dev/null @@ -1,47 +0,0 @@ -# 🚀 Secrets vX.Y.Z Release Notes - -> Release date: YYYY-MM-DD - -Brief summary of why this release matters for operators and integrators.
- -## Highlights - -- Highlight 1 -- Highlight 2 -- Highlight 3 - -## Runtime Changes - -- New/changed env vars: - - `EXAMPLE_VAR` (default `...`) -- Endpoint behavior changes (status/contract/defaults) -- Performance or operational behavior updates - -## Breaking / Behavior Changes - -- Behavior/default changes requiring operator action -- Compatibility notes for older clients or deployments - -## Upgrade Notes - -1. Deploy binaries/images with `vX.Y.Z` -2. Apply config/env changes -3. Run verification checks -4. Monitor rollout metrics/logs - -## Operator Verification Checklist - -1. `GET /health` and `GET /ready` pass -2. Authentication/token issuance works -3. Key-dependent flows pass (secrets/transit) -4. New/changed feature behavior validated - -## Documentation Updates - -- Added/updated docs pages for this release -- Runbook changes relevant to operators - -## See also - -- [Upgrade guide template](_upgrade-template.md) -- [Release compatibility matrix](compatibility-matrix.md) diff --git a/docs/releases/_upgrade-template.md b/docs/releases/_upgrade-template.md deleted file mode 100644 index c93e929..0000000 --- a/docs/releases/_upgrade-template.md +++ /dev/null @@ -1,56 +0,0 @@ -# ⬆️ Upgrade Guide: vA.B.C -> vX.Y.Z - -> Release date: YYYY-MM-DD - -Use this guide to safely upgrade from `vA.B.C` to `vX.Y.Z`. - -## Scope - -- Release type: patch/minor/major -- API compatibility: compatible/incompatible notes -- Database migration: required/optional/none - -## What Changed - -- Change 1 -- Change 2 -- Change 3 - -## Env Diff (copy/paste) - -```diff -+ NEW_VAR=value -- OLD_VAR=old-value -``` - -## Recommended Upgrade Steps - -1. Update image/binary to `vX.Y.Z` -2. Apply env/config changes -3. Restart/roll instances -4. Run health checks -5. 
Run functional smoke checks - -## Quick Verification Commands - -```bash -curl -sS http://localhost:8080/health -curl -sS http://localhost:8080/ready -``` - -## Rollback Notes - -- Revert to previous stable version first -- Keep non-destructive config rollback path documented -- Re-run validation after rollback - -### Rollback matrix - -| Upgrade path | First rollback action | Config rollback | Validation | -| --- | --- | --- | --- | -| `vA.B.C -> vX.Y.Z` | Roll app image/binary back | Revert/ignore release-specific config additions | Health + smoke checks | - -## See also - -- [Release notes template](_template.md) -- [Release compatibility matrix](compatibility-matrix.md) diff --git a/docs/releases/compatibility-matrix.md b/docs/releases/compatibility-matrix.md index 9c23d7b..a93d881 100644 --- a/docs/releases/compatibility-matrix.md +++ b/docs/releases/compatibility-matrix.md @@ -4,10 +4,17 @@ Use this page to understand upgrade impact between recent releases. +## Coverage Policy + +This matrix covers **recent releases only** (typically last 5-6 versions) to focus on relevant upgrade paths. Historical releases remain documented in [RELEASES.md](RELEASES.md) but are excluded here to avoid clutter. + +If you need upgrade guidance for older versions, consult the full release history in [RELEASES.md](RELEASES.md) or reach out via GitHub issues. 
+ ## Matrix | From -> To | Schema migration impact | Runtime/default changes | Required operator action | | --- | --- | --- | --- | +| `v0.7.0 -> v0.8.0` | No changes | Documentation improvements only | None (backward compatible, no runtime changes) | | `v0.6.0 -> v0.7.0` | No new mandatory migration | Added IP-based token endpoint rate limiting (`RATE_LIMIT_TOKEN_ENABLED`, `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC`, `RATE_LIMIT_TOKEN_BURST`), token endpoint may return `429` with `Retry-After` | Add and tune `RATE_LIMIT_TOKEN_*`, validate token issuance under normal and burst load, review trusted proxy/IP behavior | | `v0.5.1 -> v0.6.0` | No new mandatory migration | Added KMS-based master key support (`KMS_PROVIDER`, `KMS_KEY_URI`), new `rotate-master-key` CLI workflow | Decide KMS vs legacy mode, validate startup key loading, run key-dependent smoke checks | | `v0.5.0 -> v0.5.1` | No new mandatory migration | Master key memory handling bugfix and teardown zeroing hardening | Deploy `v0.5.1` and verify key-dependent flows (token, secrets, transit) | @@ -17,6 +24,10 @@ Use this page to understand upgrade impact between recent releases. ## Upgrade verification by target +For `v0.8.0`: + +No upgrade verification needed - documentation-only release with no runtime changes. + For `v0.7.0`: 1. 
`GET /health` and `GET /ready` pass @@ -52,12 +63,5 @@ For `v0.5.0`: ## See also -- [v0.7.0 release notes](v0.7.0.md) -- [v0.7.0 upgrade guide](v0.7.0-upgrade.md) -- [v0.6.0 release notes](v0.6.0.md) -- [v0.6.0 upgrade guide](v0.6.0-upgrade.md) -- [v0.5.1 release notes](v0.5.1.md) -- [v0.5.1 upgrade guide](v0.5.1-upgrade.md) -- [v0.5.0 release notes](v0.5.0.md) -- [v0.5.0 upgrade guide](v0.5.0-upgrade.md) -- [Production rollout golden path](../operations/production-rollout.md) +- [All release notes](RELEASES.md) +- [Production rollout golden path](../operations/deployment/production-rollout.md) diff --git a/docs/releases/v0.1.0.md b/docs/releases/v0.1.0.md deleted file mode 100644 index 498b4c2..0000000 --- a/docs/releases/v0.1.0.md +++ /dev/null @@ -1,50 +0,0 @@ -# 🚀 Secrets v0.1.0 Release Notes - -> Release date: 2026-02-14 - -This is the first public release of Secrets. - -## Highlights - -- Envelope encryption model with `Master Key -> KEK -> DEK -> Secret Data` -- Transit encryption API for encrypt/decrypt without storing application payload -- Token authentication and policy-based authorization -- Versioned secret storage by path and soft-delete behavior -- Audit logging with request correlation via `request_id` -- PostgreSQL and MySQL runtime support - -## Included API Surface - -- Auth: `POST /v1/token` -- Clients: `GET/POST /v1/clients`, `GET/PUT/DELETE /v1/clients/:id` -- Secrets: `POST/GET/DELETE /v1/secrets/*path` -- Transit: create/rotate/encrypt/decrypt/delete under `/v1/transit/keys*` -- Audit logs: `GET /v1/audit-logs` -- Health/readiness: `GET /health`, `GET /ready` - -## Runtime and Compatibility - -- Local development targets: Linux and macOS -- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` -- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` - -## Operational Notes - -- Restart API servers after master key or KEK rotation so processes load new key material -- Base64 request fields are encoding only, not encryption; always
use HTTPS/TLS -- For transit decrypt, pass ciphertext exactly as returned by encrypt (`:`) - -## Known Limitations (v0.1.0) - -- `docs/openapi.yaml` is a baseline subset focused on common flows, not full endpoint parity -- API v1 compatibility policy applies to documented endpoint behavior in API reference docs - -## Upgrade Notes - -- Initial release: no prior upgrade path required - -## See also - -- [Documentation index](../README.md) -- [API compatibility policy](../api/versioning-policy.md) -- [Production operations](../operations/production.md) diff --git a/docs/releases/v0.2.0.md b/docs/releases/v0.2.0.md deleted file mode 100644 index 7d681c8..0000000 --- a/docs/releases/v0.2.0.md +++ /dev/null @@ -1,53 +0,0 @@ -# 🚀 Secrets v0.2.0 Release Notes - -> Release date: 2026-02-14 - -This release adds operational support for audit log retention cleanup. - -## Highlights - -- New CLI command: `clean-audit-logs` -- Supports retention by age in days (`--days`) -- Supports safe preview mode (`--dry-run`) before deletion -- Supports machine-friendly output (`--format json`) and human-readable output (`--format text`) - -## Included CLI Addition - -- `clean-audit-logs --days [--dry-run] [--format text|json]` - -Operational behavior: - -- Dry-run mode counts matching rows without deleting -- Execution mode deletes rows older than the computed UTC cutoff date -- Works with both PostgreSQL and MySQL repositories - -## Runtime and Compatibility - -- API baseline remains v1 (`/v1/*`) -- Local development targets: Linux and macOS -- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` -- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` - -## Operational Notes - -- Use `--dry-run` first for production safety -- Ensure database is reachable and migrated before cleanup runs -- Keep retention execution on a defined cadence (for example monthly) - -Example: - -```bash -./bin/app clean-audit-logs --days 90 --dry-run --format json -``` - -## Upgrade Notes - --
Non-breaking addition: new CLI command for operations -- Existing API paths and behavior remain compatible under API v1 documentation - -## See also - -- [CLI commands reference](../cli/commands.md) -- [Audit Logs API](../api/audit-logs.md) -- [Production operations](../operations/production.md) -- [API compatibility policy](../api/versioning-policy.md) diff --git a/docs/releases/v0.3.0.md b/docs/releases/v0.3.0.md deleted file mode 100644 index bb76483..0000000 --- a/docs/releases/v0.3.0.md +++ /dev/null @@ -1,57 +0,0 @@ -# 🚀 Secrets v0.3.0 Release Notes - -> Release date: 2026-02-16 - -This release adds metrics instrumentation and Prometheus-compatible monitoring support. - -## Highlights - -- Added OpenTelemetry metrics provider with Prometheus exporter -- Added optional `/metrics` endpoint for Prometheus scraping -- Added HTTP metrics middleware for request counts and latency histograms -- Added business operation metrics across auth, secrets, and transit use cases -- Added metrics configuration via `METRICS_ENABLED` and `METRICS_NAMESPACE` - -## Metrics and Monitoring - -New metric families: - -- `{namespace}_http_requests_total` -- `{namespace}_http_request_duration_seconds` -- `{namespace}_operations_total` -- `{namespace}_operation_duration_seconds` - -Runtime behavior: - -- When `METRICS_ENABLED=true` (default), the server exposes `GET /metrics` -- When `METRICS_ENABLED=false`, metrics middleware and `/metrics` are not registered -- `METRICS_NAMESPACE` (default `secrets`) prefixes metric names - -## Runtime and Compatibility - -- API baseline remains v1 (`/v1/*`) -- Metrics endpoint is outside API versioning (`/metrics`) -- Local development targets: Linux and macOS -- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` -- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` - -## Upgrade Notes - -- Non-breaking addition: observability and metrics instrumentation -- Existing API paths and behavior remain compatible under API v1 documentation
-- Update your environment configuration if you want custom metric namespace values - -Example: - -```bash -export METRICS_ENABLED=true -export METRICS_NAMESPACE=secrets -curl http://localhost:8080/metrics -``` - -## See also - -- [Monitoring operations guide](../operations/monitoring.md) -- [Environment variables](../configuration/environment-variables.md) -- [Production operations](../operations/production.md) -- [API compatibility policy](../api/versioning-policy.md) diff --git a/docs/releases/v0.4.0.md b/docs/releases/v0.4.0.md deleted file mode 100644 index c21ae67..0000000 --- a/docs/releases/v0.4.0.md +++ /dev/null @@ -1,81 +0,0 @@ -# 🚀 Secrets v0.4.0 Release Notes - -> Release date: 2026-02-18 - -This release adds tokenization capabilities for format-preserving data protection, -including API endpoints, CLI operations, persistence, and observability. - -## Highlights - -- Added tokenization API under `/v1/tokenization/*` -- Added tokenization key lifecycle: create, rotate, delete -- Added token lifecycle: tokenize, detokenize, validate, revoke -- Added deterministic mode support for repeatable token generation -- Added token format support: `uuid`, `numeric`, `luhn-preserving`, `alphanumeric` -- Added expired-token maintenance command: `clean-expired-tokens` - -## API Additions - -New endpoints: - -- `POST /v1/tokenization/keys` -- `POST /v1/tokenization/keys/{name}/rotate` -- `DELETE /v1/tokenization/keys/{id}` -- `POST /v1/tokenization/keys/{name}/tokenize` -- `POST /v1/tokenization/detokenize` -- `POST /v1/tokenization/validate` -- `POST /v1/tokenization/revoke` - -## CLI Additions - -- `create-tokenization-key --name --format [--deterministic] [--algorithm ]` -- `rotate-tokenization-key --name --format [--deterministic] [--algorithm ]` -- `clean-expired-tokens --days [--dry-run] [--format text|json]` - -## Data Model and Migrations - -Added migration `000002_add_tokenization` for PostgreSQL and MySQL: - -- `tokenization_keys` table for versioned key
metadata -- `tokenization_tokens` table for token-to-ciphertext mapping and lifecycle fields - -## Observability - -Added tokenization business operations metrics in the `tokenization` domain, -including key and token lifecycle operations. - -## Runtime and Compatibility - -- API baseline remains v1 (`/v1/*`) -- Local development targets: Linux and macOS -- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` -- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` - -## Upgrade Notes - -- Non-breaking addition: tokenization capability under API v1 -- Existing auth, secrets, transit, and audit behavior remain compatible -- Run database migrations before using tokenization endpoints or CLI commands - -## Upgrade Checklist - -1. Deploy binaries/images with `v0.4.0` -2. Run DB migrations (`app migrate`) before serving traffic -3. Verify baseline health (`GET /health`, `GET /ready`) -4. Create a tokenization key (`create-tokenization-key` or `POST /v1/tokenization/keys`) -5. Run round-trip check: tokenize -> detokenize -> validate -> revoke -6. Schedule retention cleanup for expired tokens (`clean-expired-tokens`) - -## Rollback Notes - -- `000002_add_tokenization` is additive schema migration and is expected to remain applied during app rollback. -- Rolling back binaries/images to pre-`v0.4.0` can leave tokenization tables unused but present. -- Avoid destructive schema rollback in production unless you have a validated backup/restore plan. -- If rollback is required, keep existing data and disable tokenization traffic paths operationally until re-upgrade. 
- -## See also - -- [Tokenization API](../api/tokenization.md) -- [CLI commands reference](../cli/commands.md) -- [Production operations](../operations/production.md) -- [API compatibility policy](../api/versioning-policy.md) diff --git a/docs/releases/v0.4.1.md b/docs/releases/v0.4.1.md deleted file mode 100644 index 9277a5d..0000000 --- a/docs/releases/v0.4.1.md +++ /dev/null @@ -1,89 +0,0 @@ -# 🚀 Secrets v0.4.1 Release Notes - -> Release date: 2026-02-19 - -This bugfix release improves authorization policy path matching behavior and updates -documentation for v0.4.1 release consumption. - -## Highlights - -- Fixed authorization path matching for policies using mid-path wildcards -- Clarified wildcard matching semantics for exact, trailing wildcard, and segment wildcard paths -- Expanded automated coverage for policy templates, wildcard edge cases, and common policy mistakes - -## Bug Fixes - -- Policy matcher now supports mid-path wildcard patterns such as `/v1/transit/keys/*/rotate` -- Mid-path `*` wildcard now matches exactly one path segment -- Trailing wildcard `/*` behavior remains greedy for nested subpaths - -## Runtime and Compatibility - -- API baseline remains v1 (`/v1/*`) -- No breaking API path or payload contract changes -- Local development targets: Linux and macOS -- CI baseline: Go `1.25.5`, PostgreSQL `16-alpine`, MySQL `8.0` -- Compatibility targets: PostgreSQL `12+`, MySQL `8.0+` - -## Upgrade Notes - -- Recommended for all users relying on wildcard policy path matching -- No schema migrations required specifically for this bugfix release -- Existing tokenization, secrets, transit, auth, and audit flows remain API-compatible - -## Policy Migration Note - -If existing policies assumed prefix-only behavior, review wildcard paths used for rotate and -similar endpoint-specific actions.
- -Before (too broad for intent): - -```json -[ - { - "path": "/v1/transit/keys/*", - "capabilities": ["rotate"] - } -] -``` - -After (scoped to rotate endpoint pattern): - -```json -[ - { - "path": "/v1/transit/keys/*/rotate", - "capabilities": ["rotate"] - } -] -``` - -## Verification Checklist - -1. Deploy binaries/images with `v0.4.1` -2. Verify baseline health (`GET /health`, `GET /ready`) -3. Re-run policy smoke checks for expected allow/deny behavior -4. Confirm wildcard policies used in production match intended path semantics - -## Operator Quick Checklist (v0.4.1) - -1. Search client policies for rotate patterns and replace broad forms with `/v1/transit/keys/*/rotate` when needed. -2. Run route-shape smoke checks (`/v1/transit/keys/payment/extra/rotate` and `/v1/secrets`) and expect `404`. -3. Run allow/deny policy smoke checks and expect capability-denied calls to return `403`. -4. Review recent denied audit events and confirm mismatches are expected after policy rollout. - -## Documentation Migration Map (v0.4.1) - -- Policy matching semantics: [Policies cookbook / Path matching behavior](../api/policies.md#path-matching-behavior) -- Route-vs-policy triage: [Policies cookbook / Route shape vs policy shape](../api/policies.md#route-shape-vs-policy-shape) -- Pre-deploy policy checks: [Policies cookbook / Policy review checklist before deploy](../api/policies.md#policy-review-checklist-before-deploy) -- Capability verification: [Capability matrix](../api/capability-matrix.md) -- Operational validation steps: [Policy smoke tests](../operations/policy-smoke-tests.md) -- Incident triage and matcher FAQ: [Troubleshooting](../getting-started/troubleshooting.md) - -## See also - -- [Policies cookbook](../api/policies.md) -- [Policy smoke tests](../operations/policy-smoke-tests.md) -- [Troubleshooting](../getting-started/troubleshooting.md) -- [API compatibility policy](../api/versioning-policy.md) diff --git a/docs/releases/v0.5.0-upgrade.md 
b/docs/releases/v0.5.0-upgrade.md deleted file mode 100644 index b8ab422..0000000 --- a/docs/releases/v0.5.0-upgrade.md +++ /dev/null @@ -1,68 +0,0 @@ -# ⬆️ Upgrade Guide: v0.4.x -> v0.5.0 - -> Last updated: 2026-02-19 - -Use this guide to safely upgrade from `v0.4.x` to `v0.5.0`. - -## What changed - -- Default token expiration is now shorter (`24h` -> `4h`) -- Per-client rate limiting is enabled by default -- CORS is configurable and remains disabled by default -- Security hardening guidance expanded for production deployments - -## Env diff (copy/paste) - -```diff -- AUTH_TOKEN_EXPIRATION_SECONDS=86400 -+ AUTH_TOKEN_EXPIRATION_SECONDS=14400 - -+ RATE_LIMIT_ENABLED=true -+ RATE_LIMIT_REQUESTS_PER_SEC=10.0 -+ RATE_LIMIT_BURST=20 - -+ CORS_ENABLED=false -+ CORS_ALLOW_ORIGINS= -``` - -If your clients rely on 24-hour tokens, keep explicit configuration: - -```dotenv -AUTH_TOKEN_EXPIRATION_SECONDS=86400 -``` - -## Upgrade steps - -1. Update image/binary to `v0.5.0` -2. Review and set explicit `AUTH_TOKEN_EXPIRATION_SECONDS` -3. Add `RATE_LIMIT_*` variables with values matching your traffic profile -4. Keep `CORS_ENABLED=false` unless browser-based access is required -5. Restart API servers with updated environment - -## Post-upgrade verification - -1. Health checks pass: `GET /health`, `GET /ready` -2. Token issuance works and expiration matches expected TTL -3. Authenticated endpoint rate limit returns `429` with `Retry-After` when exceeded -4. Normal traffic does not hit `429` unexpectedly -5. 
CORS behavior is correct for your deployment mode - -## Quick checks - -```bash -# token issue still works -curl -i -X POST http://localhost:8080/v1/token \ - -H "Content-Type: application/json" \ - -d '{"client_id":"","client_secret":""}' - -# protected route may return 200/403/429 depending on policy and load -curl -i http://localhost:8080/v1/clients \ - -H "Authorization: Bearer " -``` - -## See also - -- [v0.5.0 release notes](v0.5.0.md) -- [Release compatibility matrix](compatibility-matrix.md) -- [Environment variables](../configuration/environment-variables.md) -- [Security hardening guide](../operations/security-hardening.md) diff --git a/docs/releases/v0.5.0.md b/docs/releases/v0.5.0.md deleted file mode 100644 index 0403419..0000000 --- a/docs/releases/v0.5.0.md +++ /dev/null @@ -1,75 +0,0 @@ -# 🚀 Secrets v0.5.0 Release Notes - -> Release date: 2026-02-19 - -This release strengthens default runtime security by adding authenticated request rate limiting, -introducing configurable CORS controls, tightening default token lifetime, and expanding -production security documentation.
- -## Highlights - -- Added per-client rate limiting for authenticated API routes -- Added configurable CORS middleware with secure defaults -- Reduced default token expiration from 24 hours to 4 hours -- Added comprehensive production security hardening guide - -## Runtime Changes - -- New rate limiting settings: - - `RATE_LIMIT_ENABLED` (default `true`) - - `RATE_LIMIT_REQUESTS_PER_SEC` (default `10.0`) - - `RATE_LIMIT_BURST` (default `20`) -- New CORS settings: - - `CORS_ENABLED` (default `false`) - - `CORS_ALLOW_ORIGINS` (default empty) -- Authenticated endpoints now return `429 Too Many Requests` when limits are exceeded and include - `Retry-After` response header - -## Breaking / Behavior Changes - -- **Default token expiration changed**: - - Previous default: `AUTH_TOKEN_EXPIRATION_SECONDS=86400` (24h) - - New default: `AUTH_TOKEN_EXPIRATION_SECONDS=14400` (4h) - -If your clients expected 24-hour tokens, explicitly set `AUTH_TOKEN_EXPIRATION_SECONDS=86400` -and verify refresh behavior. - -## Upgrade Notes - -1. Deploy binaries/images with `v0.5.0` -2. Review and set explicit token expiration for your environment -3. Verify rate limit thresholds for client traffic profile -4. Keep CORS disabled unless browser-based access is required -5. Restrict `/metrics` exposure to internal networks only - -## Operator Verification Checklist - -1. Confirm health endpoints: `GET /health`, `GET /ready` -2. Validate token issuance and expiration expectations after upgrade -3. Confirm authenticated API traffic is not unintentionally rate limited -4. Validate `429` behavior and `Retry-After` header with controlled load test -5. 
Confirm CORS behavior matches policy (disabled by default, explicit origins only when enabled) - -## Security Guidance - -- Use TLS termination at reverse proxy/load balancer -- Use database TLS in production (`sslmode=require` or stronger / `tls=true` or stronger) -- Store master keys in a dedicated secrets manager -- Review least-privilege client policies and rotate credentials regularly - -## Documentation Updates - -- Added [Security hardening guide](../operations/security-hardening.md) -- Updated [Environment variables](../configuration/environment-variables.md) with rate limiting, - CORS, and token expiration migration notes -- Updated [Production deployment guide](../operations/production.md) with security hardening links -- Added root-level changelog entry for `0.5.0` - -## See also - -- [v0.5.0 upgrade guide](v0.5.0-upgrade.md) -- [Release compatibility matrix](compatibility-matrix.md) -- [Security hardening guide](../operations/security-hardening.md) -- [Environment variables](../configuration/environment-variables.md) -- [Production deployment guide](../operations/production.md) -- [API compatibility policy](../api/versioning-policy.md) diff --git a/docs/releases/v0.5.1-upgrade.md b/docs/releases/v0.5.1-upgrade.md deleted file mode 100644 index b843715..0000000 --- a/docs/releases/v0.5.1-upgrade.md +++ /dev/null @@ -1,82 +0,0 @@ -# ⬆️ Upgrade Guide: v0.5.0 -> v0.5.1 - -> Release date: 2026-02-19 - -Use this guide to safely upgrade from `v0.5.0` to `v0.5.1`. - -## Scope - -- Release type: patch (`v0.5.1`) -- API compatibility: no `v1` contract changes -- Database migration: no new mandatory migration for this patch - -## What Changed - -- Fixed master key loading from `MASTER_KEYS` to preserve active key material after decode -- Added secure zeroing of all keychain-held master keys during `Close` -- Added regression test coverage for these memory lifecycle paths - -## Recommended Upgrade Steps - -1. Update image/binary to `v0.5.1` -2. 
Restart API instances with standard rolling rollout process -3. Run baseline checks: - - `GET /health` - - `GET /ready` -4. Run key-dependent smoke checks: - - `POST /v1/token` - - Secrets write/read - - Transit encrypt/decrypt round-trip - -## Quick Verification Commands - -Use these after rollout to validate key-dependent paths quickly: - -```bash -curl -sS http://localhost:8080/health - -TOKEN_RESPONSE="$(curl -sS -X POST http://localhost:8080/v1/token \ - -H "Content-Type: application/json" \ - -d '{"client_id":"","client_secret":""}')" - -CLIENT_TOKEN="$(printf '%s' "${TOKEN_RESPONSE}" | jq -r '.token')" - -curl -sS -X POST http://localhost:8080/v1/secrets/upgrade/smoke \ - -H "Authorization: Bearer ${CLIENT_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{"value":"c21va2UtdjA1MQ=="}' - -curl -sS -X GET http://localhost:8080/v1/secrets/upgrade/smoke \ - -H "Authorization: Bearer ${CLIENT_TOKEN}" - -# Transit round-trip using an existing transit key name -# If the key does not exist yet in this environment, create it first: -# curl -sS -X POST http://localhost:8080/v1/transit/keys \ -# -H "Authorization: Bearer ${CLIENT_TOKEN}" \ -# -H "Content-Type: application/json" \ -# -d '{"name":"","algorithm":"aes-gcm"}' -curl -sS -X POST http://localhost:8080/v1/transit/keys//encrypt \ - -H "Authorization: Bearer ${CLIENT_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{"plaintext":"dHJhbnNpdC12MDUxLXNtb2tl"}' - -# Use ciphertext returned by the previous encrypt call -curl -sS -X POST http://localhost:8080/v1/transit/keys//decrypt \ - -H "Authorization: Bearer ${CLIENT_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{"ciphertext":":"}' - -# Expect decrypted plaintext to equal: dHJhbnNpdC12MDUxLXNtb2tl -``` - -## Rollback Notes - -- If rollback is required, revert API instances to the last known stable image -- Keep additive schema migrations applied unless a validated rollback plan exists -- Re-run health and smoke checks after rollback - -## See 
also - -- [v0.5.1 release notes](v0.5.1.md) -- [Release compatibility matrix](compatibility-matrix.md) -- [Production rollout golden path](../operations/production-rollout.md) diff --git a/docs/releases/v0.5.1.md b/docs/releases/v0.5.1.md deleted file mode 100644 index 082b334..0000000 --- a/docs/releases/v0.5.1.md +++ /dev/null @@ -1,60 +0,0 @@ -# 🚀 Secrets v0.5.1 Release Notes - -> Release date: 2026-02-19 - -This patch release fixes master key memory handling to keep loaded key material usable while -preserving secure zeroing behavior for temporary and teardown paths. - -## Highlights - -- Fixed master key loading from environment variables to avoid zeroing the in-use key slice -- Hardened keychain shutdown by zeroing all master keys before clearing chain state -- Added regression tests for key usability after load and secure zeroing on close - -## Fixes - -- `LoadMasterKeyChainFromEnv` now stores a copy of decoded key bytes before zeroing temporary buffers -- `MasterKeyChain.Close` now zeros every loaded master key before clearing the key map - -## Security Impact - -- Reduces risk of leaked key material remaining in temporary decode buffers -- Ensures explicit in-memory zeroing of master keys during keychain teardown - -## Runtime and Compatibility - -- API baseline remains `v1` (`/v1/*`) -- No endpoint, payload, or status code contract changes -- No schema migrations required specifically for this patch release - -## Patch Release Safety - -- Most environments require no configuration changes for this release -- Rolling upgrade is recommended; keep standard health and smoke checks in place -- Rollback to the previous stable image is safe when incident criteria are met - -## Upgrade Notes - -1. Deploy binaries/images with `v0.5.1` -2. Run standard health checks (`GET /health`, `GET /ready`) -3. Validate key-dependent flows (token issuance, secrets write/read, transit encrypt/decrypt) - -## Operator Verification Checklist - -1. 
Confirm service health and readiness after rollout -2. Confirm startup succeeds with configured `MASTER_KEYS` and `ACTIVE_MASTER_KEY_ID` -3. Confirm secrets and transit workflows succeed under normal traffic -4. Confirm no unexpected key configuration or decryption errors in logs - -## Documentation Updates - -- Added [v0.5.1 upgrade guide](v0.5.1-upgrade.md) -- Updated [release compatibility matrix](compatibility-matrix.md) with `v0.5.0 -> v0.5.1` -- Updated current-release references across docs and pinned image examples to `v0.5.1` - -## See also - -- [v0.5.1 upgrade guide](v0.5.1-upgrade.md) -- [Release compatibility matrix](compatibility-matrix.md) -- [Key management operations](../operations/key-management.md) -- [Security model](../concepts/security-model.md) diff --git a/docs/releases/v0.6.0-upgrade.md b/docs/releases/v0.6.0-upgrade.md deleted file mode 100644 index 23c7d35..0000000 --- a/docs/releases/v0.6.0-upgrade.md +++ /dev/null @@ -1,96 +0,0 @@ -# ⬆️ Upgrade Guide: v0.5.1 -> v0.6.0 - -> Release date: 2026-02-19 - -Use this guide to safely upgrade from `v0.5.1` to `v0.6.0`. - -## Scope - -- Release type: minor (`v0.6.0`) -- API compatibility: no `v1` endpoint contract break -- Database migration: no new mandatory migration for this release - -## What Changed - -- Added KMS-backed master key loading mode (`KMS_PROVIDER`, `KMS_KEY_URI`) -- Added KMS flags to `create-master-key` -- Added `rotate-master-key` CLI command for staged master key rotation -- Added fail-fast validation for partial KMS configuration - -## Recommended Upgrade Steps - -1. Update image/binary to `v0.6.0` -2. Decide runtime key mode: - - Keep legacy mode (no KMS vars set), or - - Enable KMS mode (`KMS_PROVIDER` and `KMS_KEY_URI` both set) -3. Restart API instances with standard rolling rollout process -4. Run baseline checks: - - `GET /health` - - `GET /ready` -5. 
Run key-dependent smoke checks: - - `POST /v1/token` - - Secrets write/read - - Transit encrypt/decrypt round-trip - -## Decision Path - -- Stay on legacy mode now: - - Keep `KMS_PROVIDER` and `KMS_KEY_URI` unset - - Upgrade binaries/images and validate normal crypto flows -- Adopt KMS mode now: - - Set both `KMS_PROVIDER` and `KMS_KEY_URI` - - Ensure all `MASTER_KEYS` entries are KMS ciphertext - - Follow migration workflow in [KMS setup guide](../operations/kms-setup.md) - - Track rollout gates in [KMS migration checklist](../operations/kms-migration-checklist.md) - -## Quick Verification Commands - -```bash -curl -sS http://localhost:8080/health -curl -sS http://localhost:8080/ready - -TOKEN_RESPONSE="$(curl -sS -X POST http://localhost:8080/v1/token \ - -H "Content-Type: application/json" \ - -d '{"client_id":"","client_secret":""}')" - -CLIENT_TOKEN="$(printf '%s' "${TOKEN_RESPONSE}" | jq -r '.token')" - -curl -sS -X POST http://localhost:8080/v1/secrets/upgrade/v060 \ - -H "Authorization: Bearer ${CLIENT_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{"value":"djA2MC1zbW9rZQ=="}' - -curl -sS -X GET http://localhost:8080/v1/secrets/upgrade/v060 \ - -H "Authorization: Bearer ${CLIENT_TOKEN}" -``` - -## Optional: Adopt KMS Mode During Upgrade - -If you are migrating from legacy plaintext master keys to KMS mode, use: - -1. [KMS setup guide](../operations/kms-setup.md) provider prerequisites -2. `create-master-key --kms-provider ... --kms-key-uri ...` -3. 
Staged dual-key migration and KEK rotation workflow from the KMS guide - -## Rollback Notes - -- If rollback is required, revert API instances to the previous stable image -- Revert only app version first; avoid destructive key/data rollback actions without a validated plan -- Re-run health and smoke checks after rollback - -### Rollback matrix - -| Upgrade path | First rollback action | Configuration rollback | Validation | -| --- | --- | --- | --- | -| Legacy mode (`KMS_*` unset) | Roll app image/binary back to previous stable version | Keep existing `MASTER_KEYS` and `ACTIVE_MASTER_KEY_ID` | `GET /health`, `GET /ready`, token + secrets/transit smoke checks | -| KMS mode (`KMS_*` set) | Roll app image/binary back to previous stable version | Keep KMS variables and KMS ciphertext `MASTER_KEYS` unchanged first; do not mix plaintext and KMS entries | Verify startup decrypt logs, then run token + secrets/transit smoke checks | - -Use [KMS migration checklist](../operations/kms-migration-checklist.md) to document rollback readiness before cutover. - -## See also - -- [v0.6.0 release notes](v0.6.0.md) -- [Release compatibility matrix](compatibility-matrix.md) -- [KMS setup guide](../operations/kms-setup.md) -- [KMS migration checklist](../operations/kms-migration-checklist.md) -- [Production rollout golden path](../operations/production-rollout.md) diff --git a/docs/releases/v0.6.0.md b/docs/releases/v0.6.0.md deleted file mode 100644 index 0a790ba..0000000 --- a/docs/releases/v0.6.0.md +++ /dev/null @@ -1,58 +0,0 @@ -# 🚀 Secrets v0.6.0 Release Notes - -> Release date: 2026-02-19 - -This minor release introduces KMS-backed master key support for encrypting key material at rest, -adds a dedicated master key rotation command, and expands operational documentation for provider setup -and migration workflows. 
- -## Highlights - -- Added KMS support for master key loading and decryption at startup -- Added CLI KMS flags to `create-master-key` (`--kms-provider`, `--kms-key-uri`) -- Added new `rotate-master-key` CLI command for staged master key rotation -- Added provider setup and migration runbook: [KMS setup guide](../operations/kms-setup.md) - -## Runtime Changes - -- New environment variables: - - `KMS_PROVIDER` - - `KMS_KEY_URI` -- Master key loading now supports two modes: - - KMS mode: both variables set - - Legacy mode: both variables unset -- Startup fails fast if only one KMS variable is set - -## Security and Operations Impact - -- KMS mode encrypts master keys at rest and centralizes key access control in your KMS provider -- Existing legacy environments remain supported without immediate migration -- Master key rotation now has an explicit CLI workflow for appending a new active key before cleanup - -## Upgrade Notes - -1. Deploy binaries/images with `v0.6.0` -2. Keep legacy mode or configure KMS mode explicitly (`KMS_PROVIDER` + `KMS_KEY_URI`) -3. Run standard health checks and key-dependent smoke checks -4. If adopting KMS mode, follow the staged migration in [KMS setup guide](../operations/kms-setup.md) - -## Operator Verification Checklist - -1. Confirm `GET /health` and `GET /ready` succeed -2. Confirm startup logs reflect intended key mode and active master key -3. Confirm token issuance and secrets/transit round-trip flows -4. 
Confirm no KMS auth/decrypt errors in startup logs - -## Documentation Updates - -- Added [v0.6.0 upgrade guide](v0.6.0-upgrade.md) -- Added [KMS setup guide](../operations/kms-setup.md) -- Updated [CLI commands](../cli/commands.md) with KMS flags and `rotate-master-key` -- Updated [Environment variables](../configuration/environment-variables.md) with KMS mode configuration - -## See also - -- [v0.6.0 upgrade guide](v0.6.0-upgrade.md) -- [Release compatibility matrix](compatibility-matrix.md) -- [KMS setup guide](../operations/kms-setup.md) -- [Key management operations](../operations/key-management.md) diff --git a/docs/releases/v0.7.0-upgrade.md b/docs/releases/v0.7.0-upgrade.md deleted file mode 100644 index 80a01c7..0000000 --- a/docs/releases/v0.7.0-upgrade.md +++ /dev/null @@ -1,87 +0,0 @@ -# ⬆️ Upgrade Guide: v0.6.0 -> v0.7.0 - -> Release date: 2026-02-20 - -Use this guide to safely upgrade from `v0.6.0` to `v0.7.0`. - -## Scope - -- Release type: minor (`v0.7.0`) -- API compatibility: no `v1` endpoint contract break -- Database migration: no new mandatory migration for this release - -## What Changed - -- Added IP-based token endpoint rate limiting for `POST /v1/token` -- Added new token endpoint throttling configuration (`RATE_LIMIT_TOKEN_*`) -- Token issuance can now return `429 Too Many Requests` with `Retry-After` - -## Env Diff (copy/paste) - -```diff -+ RATE_LIMIT_TOKEN_ENABLED=true -+ RATE_LIMIT_TOKEN_REQUESTS_PER_SEC=5.0 -+ RATE_LIMIT_TOKEN_BURST=10 -``` - -## Recommended Upgrade Steps - -1. Update image/binary to `v0.7.0` -2. Add `RATE_LIMIT_TOKEN_*` variables to runtime configuration -3. Restart API instances with standard rolling rollout process -4. Run baseline checks: - - `GET /health` - - `GET /ready` -5. 
Run token and key-dependent checks: - - `POST /v1/token` - - Secrets write/read - - Transit encrypt/decrypt round-trip - -## Quick Verification Commands - -```bash -curl -sS http://localhost:8080/health -curl -sS http://localhost:8080/ready - -TOKEN_RESPONSE="$(curl -sS -X POST http://localhost:8080/v1/token \ - -H "Content-Type: application/json" \ - -d '{"client_id":"","client_secret":""}')" - -CLIENT_TOKEN="$(printf '%s' "${TOKEN_RESPONSE}" | jq -r '.token')" - -curl -sS -X POST http://localhost:8080/v1/secrets/upgrade/v070 \ - -H "Authorization: Bearer ${CLIENT_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{"value":"djA3MC1zbW9rZQ=="}' - -curl -sS -X GET http://localhost:8080/v1/secrets/upgrade/v070 \ - -H "Authorization: Bearer ${CLIENT_TOKEN}" -``` - -## Optional: Token Endpoint Tuning Guidance - -- If legitimate clients share NAT/proxy egress and hit token endpoint `429`, increase: - - `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC` - - `RATE_LIMIT_TOKEN_BURST` -- Keep limits conservative enough to deter credential stuffing -- Validate trusted proxy configuration so `ClientIP` reflects real caller IPs - -## Rollback Notes - -- If rollback is required, revert API instances to the previous stable image -- Revert app version first; avoid destructive key/data rollback actions without a validated plan -- Re-run health and smoke checks after rollback - -### Rollback matrix - -| Upgrade path | First rollback action | Configuration rollback | Validation | -| --- | --- | --- | --- | -| `v0.6.0 -> v0.7.0` | Roll app image/binary back to previous stable version | Remove or ignore `RATE_LIMIT_TOKEN_*` additions as needed; keep existing crypto/KMS config unchanged | `GET /health`, `GET /ready`, token issuance, and secrets/transit smoke checks | - -## See also - -- [v0.7.0 release notes](v0.7.0.md) -- [Release compatibility matrix](compatibility-matrix.md) -- [API rate limiting](../api/rate-limiting.md) -- [Environment variables](../configuration/environment-variables.md) 
-- [Production rollout golden path](../operations/production-rollout.md) diff --git a/docs/releases/v0.7.0.md b/docs/releases/v0.7.0.md deleted file mode 100644 index 8770131..0000000 --- a/docs/releases/v0.7.0.md +++ /dev/null @@ -1,57 +0,0 @@ -# 🚀 Secrets v0.7.0 Release Notes - -> Release date: 2026-02-20 - -This minor release adds dedicated IP-based rate limiting for unauthenticated token issuance, -expands configuration controls for token endpoint throttling, and updates operator guidance for -credential-stuffing and brute-force protection. - -## Highlights - -- Added IP-based rate limiting for `POST /v1/token` -- Added token endpoint rate-limit configuration via `RATE_LIMIT_TOKEN_*` variables -- Added token endpoint `429 Too Many Requests` behavior with `Retry-After` -- Expanded docs and runbooks for token endpoint abuse protection and rollout validation - -## Runtime Changes - -- New environment variables: - - `RATE_LIMIT_TOKEN_ENABLED` (default `true`) - - `RATE_LIMIT_TOKEN_REQUESTS_PER_SEC` (default `5.0`) - - `RATE_LIMIT_TOKEN_BURST` (default `10`) -- `POST /v1/token` may now return `429 Too Many Requests` when per-IP token limits are exceeded -- Authenticated per-client rate limiting (`RATE_LIMIT_*`) remains unchanged - -## Security and Operations Impact - -- Improves protection against token endpoint credential stuffing and brute-force traffic -- Applies stricter defaults on unauthenticated token issuance than authenticated API routes -- Requires review of proxy/trusted-IP setup when using forwarded headers in production - -## Upgrade Notes - -1. Deploy binaries/images with `v0.7.0` -2. Review and tune `RATE_LIMIT_TOKEN_*` to match expected login/token traffic -3. Validate token issuance flow under normal and burst traffic -4. Confirm `429` + `Retry-After` behavior for token endpoint in controlled load tests - -## Operator Verification Checklist - -1. Confirm `GET /health` and `GET /ready` succeed -2. 
Confirm `POST /v1/token` issues tokens normally for expected request rates -3. Confirm token endpoint returns controlled `429` with `Retry-After` when intentionally exceeded -4. Confirm authenticated route limits and retry behavior still match policy - -## Documentation Updates - -- Added [v0.7.0 upgrade guide](v0.7.0-upgrade.md) -- Updated [API rate limiting](../api/rate-limiting.md) with token endpoint scope -- Updated [Environment variables](../configuration/environment-variables.md) with `RATE_LIMIT_TOKEN_*` -- Updated [Troubleshooting](../getting-started/troubleshooting.md) with token endpoint `429` diagnostics - -## See also - -- [v0.7.0 upgrade guide](v0.7.0-upgrade.md) -- [Release compatibility matrix](compatibility-matrix.md) -- [API rate limiting](../api/rate-limiting.md) -- [Production rollout golden path](../operations/production-rollout.md) diff --git a/docs/tools/check_docs_metadata.py b/docs/tools/check_docs_metadata.py index 8bf35eb..540bda6 100644 --- a/docs/tools/check_docs_metadata.py +++ b/docs/tools/check_docs_metadata.py @@ -23,11 +23,13 @@ def main() -> None: require_contains(Path("docs/README.md"), current_release) # Ensure current release docs links are present in key navigation pages. - current_release_note = f"docs/releases/{current_release}.md" + # After consolidation, all releases are in RELEASES.md with anchor links. + # We only check that RELEASES.md exists and is referenced. 
+ current_release_note = "docs/releases/RELEASES.md" require_contains(Path("README.md"), current_release_note) - require_contains(Path("docs/README.md"), f"releases/{current_release}.md") + require_contains(Path("docs/README.md"), "releases/RELEASES.md") require_contains( - Path("docs/operations/runbook-index.md"), f"../releases/{current_release}.md" + Path("docs/operations/runbooks/README.md"), "../releases/RELEASES.md" ) openapi = Path("docs/openapi.yaml").read_text(encoding="utf-8") @@ -36,7 +38,7 @@ def main() -> None: "docs/openapi.yaml version does not match docs/metadata.json api_version" ) - api_pages = sorted(Path("docs/api").glob("*.md")) + api_pages = sorted(Path("docs/api").rglob("*.md")) missing = [] marker = f"> Applies to: API {api_version}" for page in api_pages: @@ -50,10 +52,10 @@ def main() -> None: # Ensure docs index points to metadata source. require_contains(Path("docs/README.md"), "docs/metadata.json") - # Soft check: release notes for current release should exist. - release_file = Path(f"docs/releases/{current_release}.md") + # Soft check: RELEASES.md should exist (consolidated structure). + release_file = Path("docs/releases/RELEASES.md") if not release_file.exists(): - raise ValueError(f"Missing release notes file: {release_file}") + raise ValueError(f"Missing consolidated release notes file: {release_file}") # Keep date shape simple for maintainers. 
if not re.match(r"^\d{4}-\d{2}-\d{2}$", metadata["last_docs_refresh"]): diff --git a/docs/tools/check_example_shapes.py b/docs/tools/check_example_shapes.py index dca6a1a..589f8c8 100755 --- a/docs/tools/check_example_shapes.py +++ b/docs/tools/check_example_shapes.py @@ -20,8 +20,10 @@ def require_keys(payload: dict, keys: list[str], label: str) -> None: def main() -> None: - response_shapes = Path("docs/api/response-shapes.md").read_text(encoding="utf-8") - transit_api = Path("docs/api/transit.md").read_text(encoding="utf-8") + response_shapes = Path("docs/api/observability/response-shapes.md").read_text( + encoding="utf-8" + ) + transit_api = Path("docs/api/data/transit.md").read_text(encoding="utf-8") token = extract_json_block(response_shapes, "Token issuance:") require_keys(token, ["token", "expires_at"], "Token issuance") diff --git a/docs/tools/check_release_docs_links.py b/docs/tools/check_release_docs_links.py index 8a887a8..63bac3e 100644 --- a/docs/tools/check_release_docs_links.py +++ b/docs/tools/check_release_docs_links.py @@ -1,12 +1,23 @@ #!/usr/bin/env python3 +""" +Validates that new releases are properly added to consolidated RELEASES.md. +This script checks: +1. New release entries are added to RELEASES.md (not individual files) +2. Release appears in compatibility matrix +3. 
Navigation files link to RELEASES.md +""" + +import json import os import re import subprocess from pathlib import Path -RELEASE_RE = re.compile(r"^docs/releases/(v\d+\.\d+\.\d+)\.md$") +# Detect changes to RELEASES.md (modified or new release sections) +RELEASES_FILE = Path("docs/releases/RELEASES.md") +COMPATIBILITY_FILE = Path("docs/releases/compatibility-matrix.md") def run(cmd: list[str]) -> str: @@ -14,55 +25,84 @@ def run(cmd: list[str]) -> str: return out.strip() -def changed_added_release_notes(base_sha: str, head_sha: str) -> list[str]: - output = run(["git", "diff", "--name-status", base_sha, head_sha]) - versions: list[str] = [] +def get_modified_files(base_sha: str, head_sha: str) -> set[str]: + """Return set of files that were modified in this PR.""" + output = run(["git", "diff", "--name-only", base_sha, head_sha]) if not output: - return versions - - for line in output.splitlines(): - parts = line.split("\t", 1) - if len(parts) != 2: - continue - status, path = parts - if status != "A": - continue - match = RELEASE_RE.match(path) - if not match: - continue - versions.append(match.group(1)) - return versions + return set() + return set(output.splitlines()) + + +def extract_version_headers(content: str) -> list[str]: + """Extract all version headers from RELEASES.md content.""" + # Match: ## [0.7.0] - 2026-02-20 + pattern = re.compile(r"^## \[(\d+\.\d+\.\d+)\] - \d{4}-\d{2}-\d{2}$", re.MULTILINE) + return pattern.findall(content) + + +def get_releases_diff(base_sha: str, head_sha: str) -> tuple[list[str], bool]: + """Return list of new version entries and whether this is a consolidation. + + Returns: + tuple: (list of versions to validate, is_consolidation flag) + + During consolidation migrations (when RELEASES.md is newly created with many + versions), only the current release from metadata.json is validated to avoid + requiring historical versions in the compatibility matrix. 
+ """ + try: + # Get old version of RELEASES.md + old_content = run(["git", "show", f"{base_sha}:docs/releases/RELEASES.md"]) + old_versions = set(extract_version_headers(old_content)) + is_consolidation = False + except subprocess.CalledProcessError: + # File might not exist in base (first time / consolidation) + old_versions = set() + is_consolidation = True + + # Get new version of RELEASES.md + new_content = RELEASES_FILE.read_text(encoding="utf-8") + new_versions = set(extract_version_headers(new_content)) + + # Find newly added versions + added = new_versions - old_versions + + # If this looks like a consolidation (many versions added at once), + # only validate the current release from metadata + if is_consolidation and len(added) > 3: + metadata_path = Path("docs/metadata.json") + if metadata_path.exists(): + metadata = json.loads(metadata_path.read_text(encoding="utf-8")) + current = metadata.get("current_release", "").lstrip("v") + if current in added: + # Only validate current release during consolidation + return [current], True + # Fallback: if current_release not found, validate all + return sorted(added), True + + # Normal case: validate all new releases + return sorted(added), False def require_contains(path: Path, needle: str) -> None: + """Verify that path contains needle string.""" content = path.read_text(encoding="utf-8") if needle not in content: raise ValueError(f"{path} missing required link/text: {needle}") -def validate_release(version: str) -> None: - release_path = Path(f"docs/releases/{version}.md") - upgrade_path = Path(f"docs/releases/{version}-upgrade.md") - compatibility_path = Path("docs/releases/compatibility-matrix.md") - - if not release_path.exists(): - raise ValueError(f"Missing release notes file: {release_path}") - if not upgrade_path.exists(): - raise ValueError(f"Missing upgrade guide for new release notes: {upgrade_path}") +def validate_release_in_consolidated(version: str) -> None: + """Validate that new release is 
properly documented in consolidated files.""" + # Check that version appears in RELEASES.md + require_contains(RELEASES_FILE, f"[{version}]") - require_contains(release_path, f"{version}-upgrade.md") - require_contains(release_path, "compatibility-matrix.md") - require_contains(compatibility_path, version) + # Check that version appears in compatibility matrix + require_contains(COMPATIBILITY_FILE, version) - # Ensure entry-point navigation includes both links for this release. - require_contains(Path("docs/README.md"), f"releases/{version}.md") - require_contains(Path("docs/README.md"), f"releases/{version}-upgrade.md") - require_contains( - Path("docs/operations/runbook-index.md"), f"../releases/{version}.md" - ) + # Ensure main navigation points to RELEASES.md + require_contains(Path("docs/README.md"), "releases/RELEASES.md") require_contains( - Path("docs/operations/runbook-index.md"), - f"../releases/{version}-upgrade.md", + Path("docs/operations/runbooks/README.md"), "../../releases/RELEASES.md" ) @@ -78,13 +118,29 @@ def main() -> None: "PR_BASE_SHA and PR_HEAD_SHA must be set for release docs guard" ) - versions = changed_added_release_notes(base_sha, head_sha) - if not versions: - print("release docs guard passed (no new release note files)") + # Check if RELEASES.md was modified + modified_files = get_modified_files(base_sha, head_sha) + if "docs/releases/RELEASES.md" not in modified_files: + print("release docs guard passed (RELEASES.md not modified)") return - for version in versions: - validate_release(version) + # Get new releases added + new_versions, is_consolidation = get_releases_diff(base_sha, head_sha) + if not new_versions: + print("release docs guard passed (no new release entries detected)") + return + + if is_consolidation: + print( + f"Detected consolidation migration, validating current release only: {', '.join(new_versions)}" + ) + else: + print(f"Detected new release(s): {', '.join(new_versions)}") + + # Validate each new release + for 
version in new_versions: + validate_release_in_consolidated(version) + print(f" ✓ {version} properly documented") print("release docs guard passed") diff --git a/docs/tools/check_release_image_tags.py b/docs/tools/check_release_image_tags.py index bed584a..a788dbf 100644 --- a/docs/tools/check_release_image_tags.py +++ b/docs/tools/check_release_image_tags.py @@ -5,7 +5,9 @@ from pathlib import Path +# Updated pattern to allow both pinned and unpinned (latest) references PINNED_IMAGE_PATTERN = re.compile(r"allisson/secrets:v\d+\.\d+\.\d+") +UNPINNED_IMAGE_PATTERN = re.compile(r"allisson/secrets(?::latest)?(?!\:v)") def main() -> None: @@ -16,11 +18,11 @@ def main() -> None: files_to_check = [ Path("README.md"), Path("docs/getting-started/docker.md"), - Path("docs/operations/production-rollout.md"), - Path("docs/cli/commands.md"), - Path("docs/configuration/environment-variables.md"), - Path("docs/operations/key-management.md"), - Path("docs/operations/kms-setup.md"), + Path("docs/operations/deployment/production-rollout.md"), + Path("docs/cli-commands.md"), + Path("docs/configuration.md"), + Path("docs/operations/kms/key-management.md"), + Path("docs/operations/kms/setup.md"), ] errors = [] @@ -31,18 +33,25 @@ def main() -> None: continue content = file_path.read_text(encoding="utf-8") - tags = PINNED_IMAGE_PATTERN.findall(content) + pinned_tags = PINNED_IMAGE_PATTERN.findall(content) + unpinned_refs = UNPINNED_IMAGE_PATTERN.findall(content) - if not tags: - errors.append(f"{file_path} must include pinned image tag {current_tag}") - continue + # Require a pinned tag matching the current release OR an unpinned reference. + # Outdated pinned tags are always rejected; current pinned and unpinned refs may coexist. 
+ has_current_pinned = current_tag in pinned_tags + has_unpinned = bool(unpinned_refs) + has_old_pinned = any(tag != current_tag for tag in pinned_tags) - mismatched = sorted({tag for tag in tags if tag != current_tag}) - if mismatched: + if has_old_pinned: + mismatched = sorted({tag for tag in 
pinned_tags if tag != current_tag}) errors.append( - f"{file_path} contains non-current pinned tags: " + f"{file_path} contains outdated pinned tags: " + ", ".join(mismatched) - + f" (expected only {current_tag})" + + f" (expected {current_tag} or unpinned allisson/secrets)" + ) + elif not has_current_pinned and not has_unpinned: + errors.append( + f"{file_path} must include either {current_tag} or unpinned allisson/secrets" ) if errors: diff --git a/internal/app/README.md b/internal/app/README.md index b3c4a81..0f2dfa7 100644 --- a/internal/app/README.md +++ b/internal/app/README.md @@ -350,4 +350,4 @@ Potential improvements for the container: - [Documentation index](../../docs/README.md) - [Architecture concepts](../../docs/concepts/architecture.md) - [Local development](../../docs/getting-started/local-development.md) -- [Testing guide](../../docs/development/testing.md) +- [Development and testing](../../docs/contributing.md#development-and-testing)