From 4756c6224eb6de5c83377dec8ee8853befd8ac04 Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:02:48 -0300 Subject: [PATCH 01/16] docs: add K8s HA report and implementation plans Three implementation plans covering: - Helm chart HA (PDB, probes, anti-affinity, NATS cluster, NetworkPolicy, Ingress) - Application code fixes (NATS reconnect bug, plugin migration locks, shutdown timeout) - Infrastructure & advanced (production values overlay, ESO, cert-manager, Karpenter) --- .context/k8s-ha-report.md | 668 +++++++++++++++++ .../plans/2026-04-07-app-code-ha-fixes.md | 443 +++++++++++ .../plans/2026-04-07-helm-chart-ha.md | 698 ++++++++++++++++++ .../plans/2026-04-07-infra-ha-advanced.md | 528 +++++++++++++ 4 files changed, 2337 insertions(+) create mode 100644 .context/k8s-ha-report.md create mode 100644 docs/superpowers/plans/2026-04-07-app-code-ha-fixes.md create mode 100644 docs/superpowers/plans/2026-04-07-helm-chart-ha.md create mode 100644 docs/superpowers/plans/2026-04-07-infra-ha-advanced.md diff --git a/.context/k8s-ha-report.md b/.context/k8s-ha-report.md new file mode 100644 index 0000000000..e8ca1d6376 --- /dev/null +++ b/.context/k8s-ha-report.md @@ -0,0 +1,668 @@ +# MCP Mesh: Kubernetes High-Availability Report + +> Comprehensive analysis from 5 parallel research tracks exploring maximum availability for the MCP Mesh control plane. + +--- + +## Table of Contents + +1. [Executive Summary](#executive-summary) +2. [Current State Assessment](#current-state-assessment) +3. [Pod-Level Resilience](#1-pod-level-resilience) +4. [Database Resilience](#2-database-resilience) +5. [NATS & Event Bus Resilience](#3-nats--event-bus-resilience) +6. [Networking & Traffic Management](#4-networking--traffic-management) +7. [Cluster & Infrastructure](#5-cluster--infrastructure) +8. [Bugs & Issues Found During Research](#bugs--issues-found) +9. [Prioritized Action Plan](#prioritized-action-plan) +10. 
[Cost Analysis](#cost-analysis) + +--- + +## Executive Summary + +MCP Mesh is an MCP control plane (Hono API + React SPA, Bun runtime) with three dependent services: **PostgreSQL** (state), **NATS with JetStream** (signaling + streaming), and optional sidecars (OTel collector, S3 sync). The current Helm chart has a solid foundation but several critical gaps for production HA: + +**Top 5 findings:** +1. **No PodDisruptionBudget** -- `kubectl drain` or cluster autoscaler can evict all pods simultaneously +2. **Single NATS instance** -- one pod failure kills real-time event notification (polling fallback adds up to 5s latency) +3. **No Ingress template** -- no route-aware timeouts for SSE vs HTTP, no TLS automation +4. **Connection pooling gap** -- 3-6 replicas x 4 threads x 20 pool max = 240-480 PostgreSQL connections (default PG max is 100) +5. **Migration race condition** -- multiple pods running Kysely migrations simultaneously on startup; plugin migrations lack advisory locks + +**One latent bug found:** `NatsNotifyStrategy.start()` has an `if (this.sub) return` guard that prevents re-subscription after NATS reconnect, silently breaking the notify path. 
+ +--- + +## Current State Assessment + +### Architecture +``` + Internet + | + [No Ingress/Gateway] + | + ClusterIP:80 + | + +----------+----------+ + | Deployment (1-6) | + | Bun/Hono :3000 | + | + OTel sidecar | + | + S3 sync sidecar | + +----------+----------+ + | | + PostgreSQL NATS (1 pod) + (external) JetStream +``` + +### What Already Works Well +- Rolling update with `maxSurge:1, maxUnavailable:0` +- TopologySpreadConstraints across zones (soft) +- HPA template (disabled by default, 3-6 replicas) +- Graceful shutdown sequence (mark 503 -> sleep 2s -> force close -> drain NATS/DB -> 55s hard timeout) +- Event bus hybrid architecture (PG for durability, NATS for signaling, polling as fallback) +- Pod heartbeat via NATS KV with 45s TTL +- Resilience tests with Toxiproxy (DB outage, DB latency, NATS outage, MCP latency) + +### Critical Gaps +| Gap | Impact | Effort | +|-----|--------|--------| +| No PDB | All pods can be evicted simultaneously | Low (new template) | +| No pod anti-affinity | Pods can stack on one node/zone | Low (values change) | +| No startupProbe | 30s liveness initialDelay is a blunt instrument | Low (values change) | +| Single NATS pod | SPOF for real-time notifications | Medium (subchart config) | +| No Ingress template | No route-aware timeouts, no TLS | Medium (new template) | +| No NetworkPolicy | Unrestricted pod-to-pod traffic | Medium (new templates) | +| No connection pooling | Connection exhaustion at 3+ replicas | Medium (infra + config) | +| No PriorityClass | MCP Mesh can be preempted by any workload | Low (new template) | +| preStop hook missing | 2s internal sleep insufficient for endpoint propagation | Low (values change) | + +--- + +## 1. Pod-Level Resilience + +### 1.1 PodDisruptionBudget + +Add `templates/pdb.yaml`: + +```yaml +{{- if or .Values.autoscaling.enabled (gt (int .Values.replicaCount) 1) }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "chart-deco-studio.fullname" . 
}} + labels: + {{- include "chart-deco-studio.labels" . | nindent 4 }} +spec: + maxUnavailable: 1 + selector: + matchLabels: + {{- include "chart-deco-studio.selectorLabels" . | nindent 6 }} +{{- end }} +``` + +**Why `maxUnavailable: 1` over `minAvailable`:** With HPA range 3-6, `minAvailable: 2` could block node drains when HPA has scaled to 3 and one pod is already unavailable. `maxUnavailable: 1` is simpler and correct regardless of replica count. + +### 1.2 Pod Anti-Affinity + +```yaml +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio + topologyKey: kubernetes.io/hostname + - weight: 50 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio + topologyKey: topology.kubernetes.io/zone +``` + +**Why `preferred` over `required`:** If a zone goes down and remaining zones have fewer nodes than replicas, `required` leaves pods in Pending. `preferred` degrades gracefully. + +### 1.3 Probes: Add startupProbe, Tighten Others + +```yaml +startupProbe: + httpGet: + path: /health/live + port: http + periodSeconds: 2 + failureThreshold: 30 # Up to 60s to start + timeoutSeconds: 3 + +livenessProbe: + httpGet: + path: /health/live + port: http + initialDelaySeconds: 0 # startupProbe handles startup + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + +readinessProbe: + httpGet: + path: /health/ready + port: http + initialDelaySeconds: 0 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 2 # 10s to remove from endpoints (was 20s) +``` + +**Rationale:** Separating startup from liveness eliminates the 30s `initialDelaySeconds` hack. The readiness `failureThreshold` drops from 4 to 2 for faster DB-outage response. 
+ +### 1.4 Graceful Shutdown: preStop Hook + +```yaml +lifecycle: + preStop: + exec: + command: ["sleep", "5"] +terminationGracePeriodSeconds: 65 +``` + +The preStop hook runs before SIGTERM, giving kube-proxy and ingress controllers 5s to remove the pod from endpoints **before** the app starts shutting down. The app's internal timeout should increase from 55s to 58s (`apps/mesh/src/index.ts:161`). + +### 1.5 PriorityClass + +```yaml +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: mcp-mesh-control-plane +value: 1000000 +globalDefault: false +preemptionPolicy: PreemptLowerPriority +``` + +MCP Mesh routes all MCP traffic -- it should be among the last workloads preempted under resource pressure. + +### 1.6 Resource QoS + +Current config (Burstable QoS) is **correct**: +- Memory limit = request (2Gi): prevents OOM impact on other pods +- No CPU limit: avoids CFS throttling on Bun's single-threaded event loop +- HPA scales out before sustained CPU becomes problematic + +### 1.7 TopologySpreadConstraints: Harden for Production + +```yaml +topologySpreadConstraints: + # Zone: hard constraint + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio + # Node: soft constraint + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio +``` + +**`DoNotSchedule` for zones** prevents all pods from stacking in one AZ during scale-up or node pressure. A Pending pod is visible and actionable; silent AZ concentration is not. + +--- + +## 2. 
Database Resilience

### 2.1 PostgreSQL HA: Recommended Options

| Option | Failover Time | RPO | Ops Burden | Best For |
|--------|---------------|-----|------------|----------|
| **CloudNativePG** | 10-30s | ~5s (async WAL) | Low | Self-hosted K8s |
| **RDS Multi-AZ** | 60-120s | 0 (sync repl) | Lowest | AWS deployments |
| **Cloud SQL HA** | ~60s | 0 | Lowest | GCP deployments |
| **AlloyDB** | ~10s | 0 | Lowest | GCP, read-heavy |

**Recommendation:** Managed service (RDS/Cloud SQL) as default for least operational burden. CloudNativePG for self-hosted/on-prem. The Helm chart already supports both via `database.url`.

### 2.2 Connection Pooling (Critical)

**The problem:** With `NUM_THREADS: 4` and `DATABASE_POOL_MAX: 20`, each pod opens up to 80 PostgreSQL connections. At 3-6 replicas, that's **240-480 connections**. PostgreSQL default `max_connections` is 100.

**Solution:** PgBouncer as a standalone Deployment (not sidecar -- sidecar doesn't reduce total connection count with HPA).

- CloudNativePG has built-in PgBouncer support via its `Pooler` custom resource
- Use **transaction pooling mode** (Kysely doesn't use `SET`, prepared statements by name, or session-level features)
- Reduce `DATABASE_POOL_MAX` to 5-10 per thread when PgBouncer is in front
- `FOR UPDATE SKIP LOCKED` works correctly through PgBouncer in transaction mode; **session-level `pg_advisory_lock` does not** -- transaction pooling reassigns the server session between transactions, so the lock can be silently lost or end up held by the wrong client. Route advisory-lock paths (event-bus claiming, migrations) directly to PostgreSQL, or switch them to `pg_advisory_xact_lock` inside a single transaction

### 2.3 Read Replicas

The codebase has clear read/write separation. Add `DATABASE_READ_URL` to the Helm chart:
- **Write paths:** Connection CRUD, event publishing, delivery claiming, thread creation
- **Read paths:** Connection listing, monitoring queries, registry browsing, virtual MCP tool listing
- Most reads tolerate 100-500ms replication lag. Event bus claiming must always hit primary.

### 2.4 Migration Safety

**Problem:** Migrations run at pod startup (`apps/mesh/src/settings/pipeline.ts:46-55`). Issues:
1. 
During rolling update, new pod runs DDL while old pod serves traffic against pre-migration schema
2. Plugin migrations (`apps/mesh/src/database/migrate.ts:149-213`) lack advisory locks -- two pods can race

**Solutions:**
- **Init container for migrations** (ensures migrations complete before pod joins Service)
- **Backward-compatible migrations only** (add nullable -> backfill -> NOT NULL in next release)
- **Advisory lock for plugin migrations:**
  ```typescript
  await sql`SELECT pg_advisory_lock(hashtext('plugin_migrations'))`.execute(db);
  try { /* run plugin migrations */ } finally {
    await sql`SELECT pg_advisory_unlock(hashtext('plugin_migrations'))`.execute(db);
  }
  ```
  > **Caution:** `pg_advisory_lock` is session-scoped -- the lock, the migration work, and the unlock must all execute on the **same** database connection. Executing each statement through a pool-backed handle can send them to different connections, making the lock ineffective. Acquire a dedicated connection for the whole sequence, or use `pg_advisory_xact_lock` inside a single transaction (auto-released on commit/rollback).

### 2.5 Database Circuit Breaker

Currently, when PostgreSQL is down, every request waits 30s (`connectionTimeoutMillis`) before failing. Add:
- Database-level circuit breaker (open after N consecutive failures, short-circuit for cooldown period)
- Transient error retry (codes `57P01`, `57P03`, `08006`, `40001`) with 100-500ms backoff, 2-3 attempts
- Reduce `idleTimeoutMillis` from 300s to 60s for faster stale connection recycling after failover

### 2.6 Backup & Recovery

| Scenario | RPO | RTO | Approach |
|----------|-----|-----|----------|
| Pod crash | 0 | 10-30s | K8s reschedule, HPA |
| PG failover (CNPG) | ~5s | 10-30s | Auto-promotion |
| PG failover (RDS) | 0 | 60-120s | DNS-based |
| AZ failure | ~5s | 2-5min | Cross-AZ replicas |
| Region failure | 5-60s | 15-60min | Cross-region backup restore |
| Data corruption | 0 | 5-15min | PITR from WAL |

--- 

## 3. 
NATS & Event Bus Resilience + +### 3.1 NATS Clustering: 3-Node + +```yaml +nats: + enabled: true + config: + cluster: + enabled: true + replicas: 3 + jetstream: + enabled: true + memoryStore: + enabled: true + maxSize: 512Mi # Reduced from 1Gi + fileStore: + enabled: true + pvc: + enabled: true + size: 2Gi # Reduced from 10Gi + podTemplate: + merge: + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: ["nats"] + topologyKey: kubernetes.io/hostname +``` + +**Key points:** +- 3-node Raft quorum tolerates 1 node failure +- Pod anti-affinity ensures each NATS node on a different K8s node +- NATS routes auto-configured via headless service DNS +- JetStream streams should use R3 replication for durability + +### 3.2 JetStream Storage: Right-Size + +NATS is used for: +1. `mesh.events.notify` -- Core pub/sub (no JetStream needed) +2. NatsSSEBroadcast -- Core pub/sub for cross-pod SSE fan-out +3. NatsCancelBroadcast -- Core pub/sub for cancel signals +4. NatsStreamBuffer -- JetStream for decopilot streaming relay (ephemeral) +5. JobStream -- JetStream for automation job processing + +**Recommendation:** Keep JetStream but reduce allocation. 512Mi memory + 2Gi file is sufficient for ephemeral streaming and job queues. The 10Gi file store was 5-10x overprovisioned. 
+ +### 3.3 NATS Client Tuning + +Add to `createNatsConnectionProvider`: +```typescript +{ + reconnectTimeWait: 1000, + reconnectJitter: 500, + pingInterval: 20_000, // Default 120s is too slow for dead detection + maxPingOut: 3, + name: "mesh-app", +} +``` + +### 3.4 Degraded Mode Analysis (NATS Down) + +| Scenario | With NATS | Without NATS | +|----------|-----------|--------------| +| Same-pod event publish | Instant | Instant (PollingStrategy.notify is direct) | +| Cross-pod event publish | Instant | Up to 5s (next poll cycle) | +| Scheduled/cron events | Up to 5s (polling) | Up to 5s (identical) | + +**The hybrid architecture is sound.** NATS down is a latency degradation (5s worst case for cross-pod), not a data loss event. Events are durable in PostgreSQL. + +### 3.5 Future Enhancement: PG LISTEN/NOTIFY Fallback + +Add a `PgNotifyStrategy` as a second fallback (zero-infrastructure, sub-second cross-pod notification using the existing PostgreSQL connection). This would make the composition: `compose(PollingStrategy, NatsNotifyStrategy, PgNotifyStrategy)` -- three layers of defense. + +--- + +## 4. Networking & Traffic Management + +### 4.1 Ingress Controller: Envoy-Based (Contour) + +MCP uses SSE (Server-Sent Events) for long-lived connections. Envoy natively handles streaming without buffering. NGINX requires explicit `proxy_buffering off` annotations. 
+ +**Contour HTTPProxy example with route-aware timeouts:** + +```yaml +apiVersion: projectcontour.io/v1 +kind: HTTPProxy +metadata: + name: mcp-mesh +spec: + virtualhost: + fqdn: mesh.example.com + tls: + secretName: mesh-tls + routes: + # MCP proxy -- long-lived SSE + - conditions: + - prefix: /mcp + services: + - name: deco-studio + port: 80 + timeoutPolicy: + response: "0s" # No response timeout for SSE + idle: "300s" # Kill truly idle connections after 5m + retryPolicy: + count: 0 # NEVER retry MCP -- tool calls have side effects + # OAuth/auth + - conditions: + - prefix: /api/auth + services: + - name: deco-studio + port: 80 + timeoutPolicy: + response: "30s" + # SPA / static assets + - conditions: + - prefix: / + services: + - name: deco-studio + port: 80 + timeoutPolicy: + response: "15s" +``` + +### 4.2 Critical: Never Retry `/mcp/*` Routes + +MCP tool calls (`tools/call`) are **not idempotent**. A retried `EVENT_PUBLISH` creates duplicate events. Safe-to-retry endpoints: +- `GET /health/*`, `GET /api/auth/*`, `GET /` (SPA assets), `GET /api/v1/*` (REST reads) + +### 4.3 TLS: Terminate at Ingress + cert-manager + +``` +Internet -> [TLS] -> Ingress (termination) -> [plaintext] -> ClusterIP -> Pod:3000 +``` + +The Hono server doesn't handle TLS. cert-manager with Let's Encrypt automates certificate lifecycle. + +### 4.4 Session Affinity: NOT Needed + +The app stores sessions in PostgreSQL (Better Auth), not in-memory. NATS provides inter-pod communication. Session affinity would harm HPA scaling by creating uneven load distribution. 
+ +### 4.5 Network Policies + +```yaml +# App pods: restrict to ingress controller + PostgreSQL + NATS + DNS + external HTTPS +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: mcp-mesh-app +spec: + podSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + policyTypes: [Ingress, Egress] + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: ingress-nginx + ports: + - port: 3000 + egress: + - to: + - podSelector: + matchLabels: + app.kubernetes.io/name: nats + ports: + - port: 4222 + - to: # PostgreSQL + - ipBlock: { cidr: 10.0.0.0/8 } + ports: + - port: 5432 + - to: # DNS + - namespaceSelector: {} + podSelector: + matchLabels: + k8s-app: kube-dns + ports: + - port: 53 + protocol: UDP + - to: # External MCP servers, OAuth providers + - ipBlock: + cidr: 0.0.0.0/0 + except: ["169.254.169.254/32"] # Block metadata service + ports: + - port: 443 + - port: 80 +``` + +--- + +## 5. Cluster & Infrastructure + +### 5.1 Multi-AZ: 3 AZs, 6 Nodes Minimum + +- 2 nodes per AZ for N+1 redundancy per zone +- Separate node groups: **app** (compute-optimized for MCP proxy) and **data** (for NATS with JetStream persistence) +- NATS pods must run on on-demand instances (not spot) + +### 5.2 Autoscaler: Karpenter Over Cluster Autoscaler + +- Right-sized node provisioning (bypasses ASG, ~45-60s vs ~2-3min) +- Diversified instance pools for spot resilience +- NodePool with `consolidationPolicy: WhenEmptyOrUnderutilized` + +**HPA Behavior tuning:** +```yaml +autoscaling: + behavior: + scaleUp: + stabilizationWindowSeconds: 30 + policies: + - type: Percent + value: 100 # Double capacity per step + periodSeconds: 30 + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 10 + periodSeconds: 60 +``` + +### 5.3 Spot Instances + +**App pods: Yes.** The app is stateless when backed by external PostgreSQL. Pod heartbeat (NATS KV, 45s TTL) already handles pod death detection. 
AWS gives 2-minute interruption notice > 65s termination grace period. + +**NATS pods: No.** JetStream fileStore corruption risk on spot termination. + +### 5.4 GitOps: ArgoCD + External Secrets Operator + +- ArgoCD for Helm chart deployment (clear UI for render failures, diff views for subchart changes) +- External Secrets Operator (ESO) for secrets from AWS Secrets Manager / Vault +- The chart's `secret.secretName` already supports referencing external Secrets + +### 5.5 Disaster Recovery: Active-Passive Warm Standby + +**Target: RPO 1hr, RTO 15min** + +| Layer | Strategy | +|-------|----------| +| PostgreSQL | Managed service with cross-region read replica, promotable in ~5min | +| NATS | Let it rebuild in DR cluster. Events are in PostgreSQL. Polling fallback covers the gap. | +| Kubernetes | Velero backups every 6hr. ArgoCD reconciles from Git. | + +**Failover procedure:** +1. Promote PG read replica (5min) +2. ArgoCD activates standby cluster Application +3. Karpenter provisions nodes (~60s) +4. NATS boots, polling catches up from PG +5. DNS failover (Route53 health check) +6. **Total RTO: ~15 minutes** + +### 5.6 Secret Rotation + +**BETTER_AUTH_SECRET:** Session signing key. Rotation invalidates active sessions (users re-auth). Schedule during low-traffic windows. Better Auth does not natively support dual-key verification. + +**ENCRYPTION_KEY:** Credential vault AES-256-GCM key. Requires a re-encryption Job: +``` +bun run deco vault-rekey --old-key $OLD --new-key $NEW +``` + +### 5.7 Chaos Engineering: Chaos Mesh + +Beyond existing Toxiproxy resilience tests, run in staging: +1. **AZ partition** -- validate pod spread and HPA scale in remaining AZs +2. **NATS kill** -- validate polling fallback delivers within 5s +3. **PG connection exhaustion** -- validate readiness probe fails and pod removed from endpoints +4. **DNS failure** -- validate NATS infinite reconnect and PG pool recovery +5. 
**Runaway autoscaling** -- validate HPA maxReplicas and Karpenter limits + +### 5.8 Supply Chain Security + +- **Pin sidecar images by digest** (current `s3Sync.image.tag: "latest"` is a risk) +- The deployment template already supports digest-based images (`image: "repo@sha256:..."`) +- Add Kyverno/OPA Gatekeeper policy for Sigstore/Cosign image verification + +--- + +## Bugs & Issues Found + +### Bug: NatsNotifyStrategy Re-subscription Failure + +**File:** `apps/mesh/src/event-bus/nats-notify.ts:30` + +The `start()` method has `if (this.sub) return` which prevents re-subscription after NATS reconnect. After a NATS connection loss and recovery, the old subscription object may be stale but non-null, silently breaking the notify path. All event delivery falls back to 5s polling. + +**Fix:** Check if existing subscription is draining/closed before returning early, or unconditionally clean up and re-subscribe: +```typescript +if (this.sub) { + try { this.sub.unsubscribe(); } catch { /* ignore */ } + this.sub = null; +} +``` + +### Issue: Plugin Migration Race Condition + +**File:** `apps/mesh/src/database/migrate.ts:149-213` + +Plugin migrations directly query and insert into `plugin_migrations` without advisory locks. Two pods starting simultaneously can race. Kysely's built-in `kysely_migration_lock` only covers Kysely migrations, not plugin migrations. + +### Issue: Connection Pool Sizing + +With `NUM_THREADS: 4` and `DATABASE_POOL_MAX: 20`, each pod can open 80 PG connections. At 3-6 replicas, this exceeds PostgreSQL defaults and can cause connection exhaustion. 
+ +--- + +## Prioritized Action Plan + +### Phase 1: Quick Wins (values.yaml + new templates, no code changes) + +| # | Action | Effort | Impact | +|---|--------|--------|--------| +| 1 | Add PodDisruptionBudget template | 1hr | Prevents total pod eviction | +| 2 | Add pod anti-affinity to values.yaml | 30min | Prevents node co-location | +| 3 | Add startupProbe, tighten liveness/readiness | 30min | Faster failure detection | +| 4 | Add preStop hook, increase terminationGracePeriod to 65s | 30min | Zero-downtime deploys | +| 5 | Harden topologySpreadConstraints to DoNotSchedule | 15min | Prevents AZ concentration | +| 6 | Pin sidecar images by digest (s3Sync "latest" is a risk) | 15min | Supply chain security | +| 7 | Add HPA behavior policies (scaleUp/scaleDown) | 30min | Prevent scaling oscillation | + +### Phase 2: Infrastructure (requires external services) + +| # | Action | Effort | Impact | +|---|--------|--------|--------| +| 8 | Enable 3-node NATS cluster | 2hr | Eliminates NATS SPOF | +| 9 | Deploy PgBouncer or use CloudNativePG pooler | 4hr | Prevents connection exhaustion | +| 10 | Add Ingress/HTTPProxy template with route-aware timeouts | 4hr | Proper SSE handling, TLS | +| 11 | Add NetworkPolicy templates | 2hr | Defense in depth | +| 12 | Set up External Secrets Operator | 4hr | Stop storing secrets in Helm values | +| 13 | Deploy cert-manager for TLS automation | 2hr | Automated certificate lifecycle | + +### Phase 3: Application Code Changes + +| # | Action | Effort | Impact | +|---|--------|--------|--------| +| 14 | Fix NatsNotifyStrategy re-subscription bug | 1hr | Prevents silent notify failure | +| 15 | Add advisory lock to plugin migrations | 1hr | Prevents migration race | +| 16 | Add init container for database migrations | 2hr | Safe rolling updates with DDL | +| 17 | Add database circuit breaker | 4hr | 30s -> sub-second failure response | +| 18 | Increase force-exit timeout from 55s to 58s | 15min | Align with 65s grace period | +| 19 | 
Add NATS client tuning (pingInterval, reconnectJitter) | 1hr | Faster dead connection detection | + +### Phase 4: Advanced HA + +| # | Action | Effort | Impact | +|---|--------|--------|--------| +| 20 | Deploy Karpenter with NodePool config | 1 day | Faster autoscaling, right-sized nodes | +| 21 | Set up ArgoCD + ApplicationSet for GitOps | 1 day | Reproducible deployments, DR standby | +| 22 | Add PG LISTEN/NOTIFY as fallback strategy | 2 days | Eliminate 5s polling gap | +| 23 | Implement DATABASE_READ_URL for read replicas | 2 days | Scale read-heavy workloads | +| 24 | Set up Chaos Mesh experiments | 2 days | Validate HA in staging | +| 25 | Add PriorityClass template | 1hr | Preemption protection | + +--- + +## Cost Analysis + +### Estimated Monthly (3-AZ, us-east-1) + +| Component | Without Optimization | With Optimization | +|-----------|---------------------|-------------------| +| 3x c6i.xlarge baseline (on-demand) | $460 | $280 (1yr RI) | +| 3x c6i.xlarge burst (spot) | $460 | $140 (spot avg) | +| NATS 3x m6i.large (on-demand) | $210 | $210 | +| NATS PVCs (3x 10Gi gp3) | $7 | $2 (3x 2Gi) | +| RDS PostgreSQL db.r6g.large | $220 | $135 (1yr RI) | +| **Total** | **~$1,357** | **~$767** | + +**Key savings levers:** +- Reserved Instances for baseline app + database (~40% savings) +- Spot instances for burst capacity (~70% savings) +- Right-size NATS JetStream storage (10Gi -> 2Gi) +- Right-size app resources after profiling actual usage +- OTel tail-based sampling to reduce trace storage costs diff --git a/docs/superpowers/plans/2026-04-07-app-code-ha-fixes.md b/docs/superpowers/plans/2026-04-07-app-code-ha-fixes.md new file mode 100644 index 0000000000..ca8cdb9c5a --- /dev/null +++ b/docs/superpowers/plans/2026-04-07-app-code-ha-fixes.md @@ -0,0 +1,443 @@ +# Application Code HA Fixes Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this 
plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Fix the NATS re-subscription bug, add advisory locks to plugin migrations, tune NATS client settings, and align the shutdown timeout with the new 65s termination grace period. + +**Architecture:** Targeted fixes to 3 files in `apps/mesh/src/`. Each task is independent and produces a working, testable change. TDD where tests exist; for infra-level code (NATS, migrations), manual verification steps. + +**Tech Stack:** TypeScript, Bun test runner, NATS client (`nats` npm package), Kysely, PostgreSQL + +--- + +### Task 1: Fix NatsNotifyStrategy Re-subscription Bug + +**Files:** +- Modify: `apps/mesh/src/event-bus/nats-notify.ts` +- Test: `apps/mesh/src/event-bus/nats-notify.test.ts` (create if not exists) + +**Context:** The `start()` method has `if (this.sub) return` on line 30. After NATS reconnects, the `NatsConnectionProvider` calls `fireReady()` which triggers `start()` again via the `onReady` callback. But the old `this.sub` is stale (its underlying connection is gone), so `start()` returns early and the subscription is never re-established. All event delivery silently falls back to 5s polling. 
+ +- [ ] **Step 1: Write a test for re-subscription after reconnect** + +Create `apps/mesh/src/event-bus/nats-notify.test.ts`: + +```typescript +import { describe, test, expect, mock } from "bun:test"; +import { NatsNotifyStrategy } from "./nats-notify"; +import type { NatsConnection, Subscription } from "nats"; + +function createMockConnection(): { + nc: NatsConnection; + subs: Subscription[]; +} { + const subs: Subscription[] = []; + const nc = { + subscribe: mock((subject: string) => { + const sub: Subscription = { + unsubscribe: mock(() => {}), + drain: mock(() => Promise.resolve()), + isClosed: false, + [Symbol.asyncIterator]: () => ({ + next: () => new Promise(() => {}), // never resolves — simulates waiting + return: () => Promise.resolve({ done: true, value: undefined }), + throw: () => Promise.resolve({ done: true, value: undefined }), + }), + } as unknown as Subscription; + subs.push(sub); + return sub; + }), + publish: mock(() => {}), + isClosed: () => false, + isDraining: () => false, + } as unknown as NatsConnection; + + return { nc, subs }; +} + +describe("NatsNotifyStrategy", () => { + test("start() creates subscription", async () => { + const { nc } = createMockConnection(); + const strategy = new NatsNotifyStrategy({ getConnection: () => nc }); + + await strategy.start(() => {}); + + expect(nc.subscribe).toHaveBeenCalledTimes(1); + }); + + test("start() re-subscribes when called again (reconnect scenario)", async () => { + const { nc, subs } = createMockConnection(); + const strategy = new NatsNotifyStrategy({ getConnection: () => nc }); + + await strategy.start(() => {}); + expect(nc.subscribe).toHaveBeenCalledTimes(1); + + // Simulate reconnect: start() is called again + await strategy.start(); + expect(nc.subscribe).toHaveBeenCalledTimes(2); + // Old subscription should have been unsubscribed + expect(subs[0].unsubscribe).toHaveBeenCalledTimes(1); + }); + + test("stop() cleans up subscription", async () => { + const { nc, subs } = 
createMockConnection(); + const strategy = new NatsNotifyStrategy({ getConnection: () => nc }); + + await strategy.start(() => {}); + await strategy.stop(); + + expect(subs[0].unsubscribe).toHaveBeenCalledTimes(1); + }); + + test("notify() publishes to NATS", async () => { + const { nc } = createMockConnection(); + const strategy = new NatsNotifyStrategy({ getConnection: () => nc }); + + await strategy.notify("event-123"); + + expect(nc.publish).toHaveBeenCalledTimes(1); + }); + + test("notify() silently succeeds when NATS is disconnected", async () => { + const strategy = new NatsNotifyStrategy({ getConnection: () => null }); + + // Should not throw + await strategy.notify("event-123"); + }); +}); +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `bun test apps/mesh/src/event-bus/nats-notify.test.ts` + +Expected: The "re-subscribes when called again" test FAILS because `start()` returns early on `if (this.sub) return`. + +- [ ] **Step 3: Fix the bug — clean up stale subscription before re-subscribing** + +In `apps/mesh/src/event-bus/nats-notify.ts`, replace lines 29-32: + +```typescript + async start(onNotify?: () => void): Promise { + if (this.sub) return; + if (onNotify) this.onNotify = onNotify; + if (!this.onNotify) return; +``` + +With: + +```typescript + async start(onNotify?: () => void): Promise { + if (onNotify) this.onNotify = onNotify; + if (!this.onNotify) return; + + // Clean up stale subscription from previous connection (reconnect scenario). + // After NATS reconnects, the old Subscription object is dead but non-null. + if (this.sub) { + try { + this.sub.unsubscribe(); + } catch { + // ignore — connection may already be closed + } + this.sub = null; + } +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `bun test apps/mesh/src/event-bus/nats-notify.test.ts` + +Expected: All 5 tests PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add apps/mesh/src/event-bus/nats-notify.ts apps/mesh/src/event-bus/nats-notify.test.ts +git commit -m "fix(event-bus): re-subscribe to NATS after reconnect + +The start() method had an early return when this.sub was non-null, +preventing re-subscription after NATS reconnects. The old subscription +object was stale, silently breaking the notify path and forcing all +event delivery to fall back to 5s polling." +``` + +--- + +### Task 2: Add Advisory Lock to Plugin Migrations + +**Files:** +- Modify: `apps/mesh/src/database/migrate.ts` + +**Context:** `runPluginMigrations()` (line 149-213) queries and inserts into `plugin_migrations` without any locking. Two pods starting simultaneously can race: both read the same "executed" set, both try to run the same migration, and the INSERT of the record can conflict or worse — the DDL can run twice. + +- [ ] **Step 1: Add advisory lock around plugin migration execution** + +In `apps/mesh/src/database/migrate.ts`, modify the `runPluginMigrations` function. Replace lines 149-153: + +```typescript +async function runPluginMigrations(db: Kysely): Promise { + const pluginMigrations = collectPluginMigrations(); + + if (pluginMigrations.length === 0) { + return 0; + } +``` + +With: + +```typescript +async function runPluginMigrations(db: Kysely): Promise { + const pluginMigrations = collectPluginMigrations(); + + if (pluginMigrations.length === 0) { + return 0; + } + + // Acquire advisory lock to prevent concurrent plugin migration execution. + // Kysely's built-in migration_lock only covers core migrations, not plugins. + await sql`SELECT pg_advisory_lock(hashtext('plugin_migrations'))`.execute(db); +``` + +Then, wrap the rest of the function body in a try/finally to release the lock. Replace lines 159-213 (the rest after the lock acquisition). 
The full function becomes: + +```typescript +async function runPluginMigrations(db: Kysely): Promise { + const pluginMigrations = collectPluginMigrations(); + + if (pluginMigrations.length === 0) { + return 0; + } + + // Acquire advisory lock to prevent concurrent plugin migration execution. + // Kysely's built-in migration_lock only covers core migrations, not plugins. + await sql`SELECT pg_advisory_lock(hashtext('plugin_migrations'))`.execute(db); + + try { + // Note: plugin_migrations table and old record migration are handled + // in runKyselyMigrations() before Kysely's migrator runs + + // Get already executed migrations + const executed = await sql<{ plugin_id: string; name: string }>` + SELECT plugin_id, name FROM plugin_migrations + `.execute(db); + const executedSet = new Set( + executed.rows.map((r) => `${r.plugin_id}/${r.name}`), + ); + + // Group migrations by plugin + const migrationsByPlugin = new Map< + string, + Array<{ + name: string; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + up: (db: any) => Promise; + }> + >(); + + for (const { pluginId, migration } of pluginMigrations) { + if (!migrationsByPlugin.has(pluginId)) { + migrationsByPlugin.set(pluginId, []); + } + migrationsByPlugin.get(pluginId)!.push({ + name: migration.name, + up: migration.up, + }); + } + + // Run pending migrations for each plugin + let totalPending = 0; + + for (const [pluginId, pluginMigrationList] of migrationsByPlugin) { + // Sort by name to ensure consistent order + pluginMigrationList.sort((a, b) => a.name.localeCompare(b.name)); + + for (const migration of pluginMigrationList) { + const key = `${pluginId}/${migration.name}`; + if (executedSet.has(key)) { + continue; // Already executed + } + + totalPending++; + await migration.up(db); + + // Record as executed + const timestamp = new Date().toISOString(); + await sql` + INSERT INTO plugin_migrations (plugin_id, name, timestamp) + VALUES (${pluginId}, ${migration.name}, ${timestamp}) + 
`.execute(db); + } + } + + return totalPending; + } finally { + await sql`SELECT pg_advisory_unlock(hashtext('plugin_migrations'))`.execute( + db, + ); + } +} +``` + +- [ ] **Step 2: Run existing tests** + +Run: `bun test apps/mesh/src/database/` + +Expected: All existing tests pass (the advisory lock is transparent when there is no contention). + +- [ ] **Step 3: Commit** + +```bash +git add apps/mesh/src/database/migrate.ts +git commit -m "fix(database): add advisory lock to plugin migrations + +Prevents race condition when multiple pods start simultaneously and +both try to run the same plugin migration. Kysely's built-in +migration_lock only covers core migrations." +``` + +--- + +### Task 3: Align Shutdown Timeout with 65s Grace Period + +**Files:** +- Modify: `apps/mesh/src/index.ts` + +**Context:** The Helm chart's `terminationGracePeriodSeconds` is being changed from 60 to 65 (Task 2 of the Helm plan). A 5s preStop hook runs before SIGTERM. The app's internal force-exit timeout should increase from 55s to 58s so the timeline is: 5s preStop + SIGTERM + 58s app timeout = 63s < 65s grace period. + +- [ ] **Step 1: Update the force-exit timeout** + +In `apps/mesh/src/index.ts`, line 158-161, replace: + +```typescript + const forceExitTimer = setTimeout(() => { + console.error("[shutdown] Timed out after 55s, forcing exit."); + process.exit(1); + }, 55_000); +``` + +With: + +```typescript + const forceExitTimer = setTimeout(() => { + console.error("[shutdown] Timed out after 58s, forcing exit."); + process.exit(1); + }, 58_000); +``` + +- [ ] **Step 2: Verify the change** + +Run: `grep -n "58_000\|58s" apps/mesh/src/index.ts` + +Expected: Line ~161 shows the updated timeout. + +- [ ] **Step 3: Run full test suite to verify no regressions** + +Run: `bun test` + +Expected: All tests pass. 
+ +- [ ] **Step 4: Commit** + +```bash +git add apps/mesh/src/index.ts +git commit -m "fix(shutdown): align force-exit timeout with 65s termination grace period + +The Helm chart's terminationGracePeriodSeconds increases from 60 to 65 +to accommodate a 5s preStop hook. The app's internal hard timeout +increases from 55s to 58s to stay within the grace period." +``` + +--- + +### Task 4: Tune NATS Client Connection Options + +**Files:** +- Modify: `apps/mesh/src/nats/connection.ts` + +**Context:** The `defaultConnect` function passes minimal options to the NATS client. Adding `pingInterval`, `reconnectJitter`, and a connection name improves dead-connection detection (120s default -> 60s) and prevents thundering herd on reconnect. + +- [ ] **Step 1: Update defaultConnect with tuned options** + +In `apps/mesh/src/nats/connection.ts`, replace lines 175-182: + +```typescript +function defaultConnect(opts: { + servers: string | string[]; + timeout: number; + reconnect: boolean; + maxReconnectAttempts: number; +}): Promise { + return connect(opts); +} +``` + +With: + +```typescript +function defaultConnect(opts: { + servers: string | string[]; + timeout: number; + reconnect: boolean; + maxReconnectAttempts: number; +}): Promise { + return connect({ + ...opts, + pingInterval: 20_000, + maxPingOut: 3, + reconnectTimeWait: 1_000, + reconnectJitter: 500, + reconnectJitterTLS: 1_000, + name: "mesh-app", + }); +} +``` + +- [ ] **Step 2: Run NATS-related tests** + +Run: `bun test apps/mesh/src/nats/` + +Expected: All tests pass. The mock `connectFn` in tests bypasses `defaultConnect`, so these options only affect real connections. + +- [ ] **Step 3: Run full test suite** + +Run: `bun test` + +Expected: All tests pass. 
+ +- [ ] **Step 4: Commit** + +```bash +git add apps/mesh/src/nats/connection.ts +git commit -m "feat(nats): tune client connection for faster dead-connection detection + +- pingInterval: 20s (was 120s default) for faster dead connection detection +- reconnectJitter: 500ms to prevent thundering herd on reconnect +- connection name for debugging in NATS monitoring" +``` + +--- + +### Task 5: Format and Final Verification + +- [ ] **Step 1: Run formatter** + +Run: `bun run fmt` + +- [ ] **Step 2: Run lint** + +Run: `bun run lint` + +- [ ] **Step 3: Run type check** + +Run: `bun run check` + +- [ ] **Step 4: Run full test suite** + +Run: `bun test` + +- [ ] **Step 5: Commit any formatting fixes** + +```bash +git add -A +git commit -m "chore: format" +``` diff --git a/docs/superpowers/plans/2026-04-07-helm-chart-ha.md b/docs/superpowers/plans/2026-04-07-helm-chart-ha.md new file mode 100644 index 0000000000..352f0a7d9b --- /dev/null +++ b/docs/superpowers/plans/2026-04-07-helm-chart-ha.md @@ -0,0 +1,698 @@ +# Helm Chart HA Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Harden the Helm chart for maximum pod-level availability, add missing K8s primitives (PDB, NetworkPolicy, Ingress), and configure NATS clustering. + +**Architecture:** Add new Helm templates (PDB, NetworkPolicy, Ingress, PriorityClass) and update `values.yaml` defaults for production HA. No application code changes — purely Helm chart modifications. 
+ +**Tech Stack:** Helm 3, Kubernetes API (policy/v1, networking.k8s.io/v1, autoscaling/v2, scheduling.k8s.io/v1), NATS subchart v2.12.5 + +--- + +### Task 1: Add PodDisruptionBudget Template + +**Files:** +- Create: `deploy/helm/templates/pdb.yaml` + +- [ ] **Step 1: Create the PDB template** + +```yaml +{{- if or .Values.autoscaling.enabled (gt (int (default 1 .Values.replicaCount)) 1) }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "chart-deco-studio.fullname" . }} + labels: + {{- include "chart-deco-studio.labels" . | nindent 4 }} +spec: + maxUnavailable: 1 + selector: + matchLabels: + {{- include "chart-deco-studio.selectorLabels" . | nindent 6 }} +{{- end }} +``` + +- [ ] **Step 2: Validate template renders correctly** + +Run: `helm template test deploy/helm/ --set autoscaling.enabled=true --set database.engine=postgresql --set database.url=postgresql://x | grep -A 10 PodDisruptionBudget` + +Expected: PDB manifest with `maxUnavailable: 1` and correct label selectors. + +- [ ] **Step 3: Validate PDB is not rendered for single replica** + +Run: `helm template test deploy/helm/ --set replicaCount=1 --set autoscaling.enabled=false | grep PodDisruptionBudget` + +Expected: No output (PDB not rendered). + +- [ ] **Step 4: Commit** + +```bash +git add deploy/helm/templates/pdb.yaml +git commit -m "feat(helm): add PodDisruptionBudget template for HA deployments" +``` + +--- + +### Task 2: Update values.yaml — Pod Scheduling, Probes, Shutdown + +**Files:** +- Modify: `deploy/helm/values.yaml` + +- [ ] **Step 1: Add startupProbe, update probes, add lifecycle hook, update terminationGracePeriod** + +In `deploy/helm/values.yaml`, replace the probes and add new fields. 
The full diff: + +Replace: +```yaml +livenessProbe: + httpGet: + path: /health/live + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 +readinessProbe: + httpGet: + path: /health/ready + port: http + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 4 + +terminationGracePeriodSeconds: 60 + +# Optional lifecycle hooks (preStop, postStart) +# lifecycle: {} +``` + +With: +```yaml +startupProbe: + httpGet: + path: /health/live + port: http + periodSeconds: 2 + failureThreshold: 30 + timeoutSeconds: 3 + +livenessProbe: + httpGet: + path: /health/live + port: http + initialDelaySeconds: 0 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 +readinessProbe: + httpGet: + path: /health/ready + port: http + initialDelaySeconds: 0 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 2 + +terminationGracePeriodSeconds: 65 + +lifecycle: + preStop: + exec: + command: ["sleep", "5"] +``` + +- [ ] **Step 2: Update affinity defaults** + +Replace: +```yaml +affinity: {} +``` + +With: +```yaml +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + topologyKey: kubernetes.io/hostname +``` + +- [ ] **Step 3: Harden topologySpreadConstraints** + +Replace: +```yaml +topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio +``` + +With: +```yaml +topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + 
labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio +``` + +- [ ] **Step 4: Pin s3Sync image tag** + +Replace: +```yaml +s3Sync: + enabled: false + image: + repository: amazon/aws-cli + tag: "latest" +``` + +With: +```yaml +s3Sync: + enabled: false + image: + repository: amazon/aws-cli + tag: "2.22.35" +``` + +- [ ] **Step 5: Validate template renders** + +Run: `helm template test deploy/helm/ --set database.engine=postgresql --set database.url=postgresql://x | head -100` + +Expected: Deployment with startupProbe, lifecycle.preStop, updated terminationGracePeriodSeconds: 65, and anti-affinity. + +- [ ] **Step 6: Commit** + +```bash +git add deploy/helm/values.yaml +git commit -m "feat(helm): harden probes, add preStop hook, pod anti-affinity, pin s3Sync image" +``` + +--- + +### Task 3: Add startupProbe Support to Deployment Template + +**Files:** +- Modify: `deploy/helm/templates/deployment.yaml` + +The deployment template currently renders `livenessProbe` and `readinessProbe` but does not render `startupProbe`. Add it. + +- [ ] **Step 1: Add startupProbe block to deployment template** + +In `deploy/helm/templates/deployment.yaml`, after the readinessProbe block (after line 105), add: + +```yaml + {{- with .Values.startupProbe }} + startupProbe: + {{- toYaml . | nindent 12 }} + {{- end }} +``` + +The insertion point is right after: +```yaml + {{- with .Values.readinessProbe }} + readinessProbe: + {{- toYaml . 
| nindent 12 }} + {{- end }} +``` + +- [ ] **Step 2: Validate startupProbe renders** + +Run: `helm template test deploy/helm/ --set database.engine=postgresql --set database.url=postgresql://x | grep -A 6 startupProbe` + +Expected: +``` + startupProbe: + httpGet: + path: /health/live + port: http + periodSeconds: 2 + failureThreshold: 30 + timeoutSeconds: 3 +``` + +- [ ] **Step 3: Commit** + +```bash +git add deploy/helm/templates/deployment.yaml +git commit -m "feat(helm): add startupProbe support to deployment template" +``` + +--- + +### Task 4: Add HPA Behavior Policies + +**Files:** +- Modify: `deploy/helm/values.yaml` +- Modify: `deploy/helm/templates/hpa.yaml` + +- [ ] **Step 1: Add behavior config to values.yaml** + +In `deploy/helm/values.yaml`, inside the `autoscaling` block, after `targetMemoryUtilizationPercentage: 80`, add: + +```yaml + behavior: + scaleUp: + stabilizationWindowSeconds: 30 + policies: + - type: Percent + value: 100 + periodSeconds: 30 + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 10 + periodSeconds: 60 +``` + +- [ ] **Step 2: Update HPA template to render behavior** + +In `deploy/helm/templates/hpa.yaml`, before the final `{{- end }}`, add the behavior block. Replace the full file with: + +```yaml +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "chart-deco-studio.fullname" . }} + labels: + {{- include "chart-deco-studio.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "chart-deco-studio.fullname" . 
}} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} + {{- with .Values.autoscaling.behavior }} + behavior: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} +``` + +- [ ] **Step 3: Validate HPA renders with behavior** + +Run: `helm template test deploy/helm/ --set autoscaling.enabled=true --set database.engine=postgresql --set database.url=postgresql://x | grep -A 15 behavior` + +Expected: behavior block with scaleUp and scaleDown policies. + +- [ ] **Step 4: Commit** + +```bash +git add deploy/helm/values.yaml deploy/helm/templates/hpa.yaml +git commit -m "feat(helm): add HPA behavior policies for scaling stability" +``` + +--- + +### Task 5: Enable 3-Node NATS Cluster + +**Files:** +- Modify: `deploy/helm/values.yaml` + +- [ ] **Step 1: Update NATS subchart values for clustering** + +In `deploy/helm/values.yaml`, replace the `nats:` block: + +```yaml +nats: + enabled: true + config: + cluster: + enabled: true + replicas: 3 + jetstream: + enabled: true + memoryStore: + enabled: true + maxSize: 512Mi + fileStore: + enabled: true + pvc: + enabled: true + size: 2Gi + storageClassName: "" + podTemplate: + merge: + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: ["nats"] + topologyKey: kubernetes.io/hostname +``` + +- [ ] **Step 2: Validate NATS subchart renders** + +Run: `helm dependency update deploy/helm/ 
&& helm template test deploy/helm/ --set database.engine=postgresql --set database.url=postgresql://x | grep -c "kind: StatefulSet"` + +Expected: At least 1 (NATS StatefulSet). + +- [ ] **Step 3: Commit** + +```bash +git add deploy/helm/values.yaml +git commit -m "feat(helm): enable 3-node NATS cluster with pod anti-affinity" +``` + +--- + +### Task 6: Add NetworkPolicy Templates + +**Files:** +- Create: `deploy/helm/templates/networkpolicy.yaml` +- Modify: `deploy/helm/values.yaml` + +- [ ] **Step 1: Add networkPolicy config to values.yaml** + +After the `env: []` block at the end of `deploy/helm/values.yaml`, add: + +```yaml + +# Network Policies (optional - requires a CNI that supports NetworkPolicy) +networkPolicy: + enabled: false + # Ingress controller namespace (for allowing inbound traffic) + ingressNamespace: "ingress-nginx" + # PostgreSQL CIDR (for egress to database) + databaseCIDR: "10.0.0.0/8" +``` + +- [ ] **Step 2: Create the NetworkPolicy template** + +Create `deploy/helm/templates/networkpolicy.yaml`: + +```yaml +{{- if .Values.networkPolicy.enabled }} +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "chart-deco-studio.fullname" . }} + labels: + {{- include "chart-deco-studio.labels" . | nindent 4 }} +spec: + podSelector: + matchLabels: + {{- include "chart-deco-studio.selectorLabels" . 
| nindent 6 }}
+  policyTypes:
+    - Ingress
+    - Egress
+  ingress:
+    # Allow traffic from ingress controller
+    - from:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: {{ .Values.networkPolicy.ingressNamespace }}
+      ports:
+        - port: {{ .Values.service.targetPort | default 3000 }}
+          protocol: TCP
+    # Note: kubelet health probes originate from the node and are not blocked
+    # by NetworkPolicy in common CNIs (Calico, Cilium), so no rule is needed
+    # for them. Do NOT add an ingress rule without a `from` selector for this
+    # purpose: a rule with no `from` allows traffic from ALL sources on the
+    # port and would defeat the ingress-controller restriction above.
+  egress:
+    # NATS
+    {{- if .Values.nats.enabled }}
+    - to:
+        - podSelector:
+            matchLabels:
+              app.kubernetes.io/name: nats
+      ports:
+        - port: 4222
+          protocol: TCP
+    {{- end }}
+    # PostgreSQL
+    - to:
+        - ipBlock:
+            cidr: {{ .Values.networkPolicy.databaseCIDR }}
+      ports:
+        - port: 5432
+          protocol: TCP
+    # DNS
+    - to:
+        - namespaceSelector: {}
+          podSelector:
+            matchLabels:
+              k8s-app: kube-dns
+      ports:
+        - port: 53
+          protocol: UDP
+        - port: 53
+          protocol: TCP
+    # External HTTPS (MCP servers, OAuth providers, OTel)
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except:
+              - 169.254.169.254/32
+      ports:
+        - port: 443
+          protocol: TCP
+        - port: 80
+          protocol: TCP
+{{- end }}
+```
+
+- [ ] **Step 3: Validate NetworkPolicy renders when enabled**
+
+Run: `helm template test deploy/helm/ --set networkPolicy.enabled=true --set database.engine=postgresql --set database.url=postgresql://x | grep -A 5 NetworkPolicy`
+
+Expected: NetworkPolicy manifest with correct podSelector.
+
+- [ ] **Step 4: Validate NetworkPolicy is NOT rendered when disabled**
+
+Run: `helm template test deploy/helm/ --set database.engine=postgresql --set database.url=postgresql://x | grep NetworkPolicy`
+
+Expected: No output.
+ +- [ ] **Step 5: Commit** + +```bash +git add deploy/helm/templates/networkpolicy.yaml deploy/helm/values.yaml +git commit -m "feat(helm): add optional NetworkPolicy template" +``` + +--- + +### Task 7: Add Ingress Template + +**Files:** +- Create: `deploy/helm/templates/ingress.yaml` +- Modify: `deploy/helm/values.yaml` + +- [ ] **Step 1: Add ingress config to values.yaml** + +In `deploy/helm/values.yaml`, after the `service:` block, add: + +```yaml + +ingress: + enabled: false + className: "" + annotations: {} + # For NGINX Ingress with SSE support: + # nginx.ingress.kubernetes.io/proxy-buffering: "off" + # nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + # nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" + # nginx.ingress.kubernetes.io/proxy-body-size: "10m" + hosts: + - host: mesh.example.com + paths: + - path: / + pathType: Prefix + tls: [] + # - secretName: mesh-tls + # hosts: + # - mesh.example.com +``` + +- [ ] **Step 2: Create the Ingress template** + +Create `deploy/helm/templates/ingress.yaml`: + +```yaml +{{- if .Values.ingress.enabled -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "chart-deco-studio.fullname" . }} + labels: + {{- include "chart-deco-studio.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + pathType: {{ .pathType }} + backend: + service: + name: {{ include "chart-deco-studio.fullname" $ }} + port: + number: {{ $.Values.service.port }} + {{- end }} + {{- end }} +{{- end }} +``` + +- [ ] **Step 3: Validate Ingress renders when enabled** + +Run: `helm template test deploy/helm/ --set ingress.enabled=true --set ingress.className=nginx --set database.engine=postgresql --set database.url=postgresql://x | grep -A 20 "kind: Ingress"` + +Expected: Ingress manifest with correct service backend. + +- [ ] **Step 4: Commit** + +```bash +git add deploy/helm/templates/ingress.yaml deploy/helm/values.yaml +git commit -m "feat(helm): add optional Ingress template with SSE annotation examples" +``` + +--- + +### Task 8: Add PriorityClass Template + +**Files:** +- Create: `deploy/helm/templates/priorityclass.yaml` +- Modify: `deploy/helm/values.yaml` +- Modify: `deploy/helm/templates/deployment.yaml` + +- [ ] **Step 1: Add priorityClass config to values.yaml** + +In `deploy/helm/values.yaml`, after the `securityContext:` block, add: + +```yaml + +# Priority class for pod scheduling (optional) +priorityClass: + enabled: false + name: "mcp-mesh-control-plane" + value: 1000000 + preemptionPolicy: PreemptLowerPriority +``` + +- [ ] **Step 2: Create the PriorityClass template** + +Create `deploy/helm/templates/priorityclass.yaml`: + +```yaml +{{- if .Values.priorityClass.enabled }} +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: {{ .Values.priorityClass.name }} + labels: + {{- include "chart-deco-studio.labels" . 
| nindent 4 }} +value: {{ .Values.priorityClass.value }} +globalDefault: false +preemptionPolicy: {{ .Values.priorityClass.preemptionPolicy }} +description: "Priority for MCP Mesh control plane pods" +{{- end }} +``` + +- [ ] **Step 3: Add priorityClassName to Deployment template** + +In `deploy/helm/templates/deployment.yaml`, inside `spec.template.spec`, after `serviceAccountName`, add: + +```yaml + {{- if .Values.priorityClass.enabled }} + priorityClassName: {{ .Values.priorityClass.name }} + {{- end }} +``` + +- [ ] **Step 4: Validate PriorityClass renders** + +Run: `helm template test deploy/helm/ --set priorityClass.enabled=true --set database.engine=postgresql --set database.url=postgresql://x | grep -A 5 PriorityClass` + +Expected: PriorityClass manifest with value 1000000. + +- [ ] **Step 5: Commit** + +```bash +git add deploy/helm/templates/priorityclass.yaml deploy/helm/templates/deployment.yaml deploy/helm/values.yaml +git commit -m "feat(helm): add optional PriorityClass for preemption protection" +``` + +--- + +### Task 9: Format and Lint + +- [ ] **Step 1: Run formatter** + +Run: `bun run fmt` + +- [ ] **Step 2: Run lint** + +Run: `bun run lint` + +Fix any issues found. + +- [ ] **Step 3: Commit any formatting fixes** + +```bash +git add -A +git commit -m "chore(helm): format" +``` diff --git a/docs/superpowers/plans/2026-04-07-infra-ha-advanced.md b/docs/superpowers/plans/2026-04-07-infra-ha-advanced.md new file mode 100644 index 0000000000..620bcfa54a --- /dev/null +++ b/docs/superpowers/plans/2026-04-07-infra-ha-advanced.md @@ -0,0 +1,528 @@ +# Infrastructure & Advanced HA Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Document and configure the external infrastructure needed for production HA: External Secrets Operator, cert-manager, Karpenter NodePool, and production values overlay. + +**Architecture:** This plan produces documentation and example configuration files. Unlike the Helm chart and app code plans, most of these items are cluster-level resources deployed outside the Helm chart. The deliverables are: a production values overlay, infrastructure setup docs, and example manifests in `deploy/`. + +**Tech Stack:** Kubernetes, Karpenter, ArgoCD, External Secrets Operator, cert-manager, Velero + +**Note:** This plan is for documentation and configuration. It does NOT modify application code or the core Helm chart templates (those are covered by the other two plans). + +--- + +### Task 1: Create Production Values Overlay + +**Files:** +- Create: `deploy/helm/values-production.yaml` + +This file serves as a reference for production deployments with all HA features enabled. + +- [ ] **Step 1: Create the production values overlay** + +Create `deploy/helm/values-production.yaml`: + +```yaml +# Production values overlay for MCP Mesh HA deployment. 
+# Usage: helm install mesh deploy/helm/ -f deploy/helm/values-production.yaml +# +# Prerequisites: +# - PostgreSQL (RDS/CloudNativePG) with DATABASE_URL in secret +# - External Secrets Operator for secret management +# - cert-manager for TLS certificates +# - CNI with NetworkPolicy support (Calico/Cilium) +# - 3+ AZ cluster with 6+ nodes + +replicaCount: 3 + +image: + pullPolicy: IfNotPresent + command: + - bun + - run + - deco + - --no-local-mode + - --num-threads + - "4" + +# --- Autoscaling --- +autoscaling: + enabled: true + minReplicas: 3 + maxReplicas: 6 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 + behavior: + scaleUp: + stabilizationWindowSeconds: 30 + policies: + - type: Percent + value: 100 + periodSeconds: 30 + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 10 + periodSeconds: 60 + +# --- Database --- +database: + engine: postgresql + # URL provided via secret.secretName (External Secrets Operator) + +# --- Persistence --- +persistence: + enabled: true + accessMode: ReadWriteOnce + storageClass: "gp3" + size: 10Gi + +# --- Probes --- +startupProbe: + httpGet: + path: /health/live + port: http + periodSeconds: 2 + failureThreshold: 30 + timeoutSeconds: 3 + +livenessProbe: + httpGet: + path: /health/live + port: http + initialDelaySeconds: 0 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + +readinessProbe: + httpGet: + path: /health/ready + port: http + initialDelaySeconds: 0 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 2 + +# --- Shutdown --- +terminationGracePeriodSeconds: 65 +lifecycle: + preStop: + exec: + command: ["sleep", "5"] + +# --- Scheduling --- +nodeSelector: {} + +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + topologyKey: kubernetes.io/hostname + +topologySpreadConstraints: + - maxSkew: 1 + 
topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio + +# --- Priority --- +priorityClass: + enabled: true + name: "mcp-mesh-control-plane" + value: 1000000 + preemptionPolicy: PreemptLowerPriority + +# --- Secret --- +secret: + secretName: "mesh-production-secrets" # Managed by External Secrets Operator + +# --- Ingress --- +ingress: + enabled: true + className: "nginx" + annotations: + nginx.ingress.kubernetes.io/proxy-buffering: "off" + nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-body-size: "10m" + nginx.ingress.kubernetes.io/proxy-http-version: "1.1" + cert-manager.io/cluster-issuer: "letsencrypt-prod" + hosts: + - host: mesh.example.com + paths: + - path: / + pathType: Prefix + tls: + - secretName: mesh-tls + hosts: + - mesh.example.com + +# --- Network Policy --- +networkPolicy: + enabled: true + ingressNamespace: "ingress-nginx" + databaseCIDR: "10.0.0.0/8" + +# --- NATS --- +nats: + enabled: true + config: + cluster: + enabled: true + replicas: 3 + jetstream: + enabled: true + memoryStore: + enabled: true + maxSize: 512Mi + fileStore: + enabled: true + pvc: + enabled: true + size: 2Gi + storageClassName: "" + podTemplate: + merge: + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: ["nats"] + topologyKey: kubernetes.io/hostname + +# --- ConfigMap --- +configMap: + meshConfig: + NODE_ENV: "production" + PORT: "3000" + HOST: "0.0.0.0" + BETTER_AUTH_URL: "https://mesh.example.com" + 
BASE_URL: "https://mesh.example.com" + DATA_DIR: "/app/data" + DATABASE_POOL_MAX: "10" + DECO_AI_GATEWAY_ENABLED: "false" + +# --- OTel --- +otel: + enabled: true + protocol: "http/protobuf" + service: "mcp-mesh" + collector: + enabled: true + image: + repository: otel/opentelemetry-collector-contrib + tag: "0.115.1" + pullPolicy: IfNotPresent + +# --- S3 Sync --- +s3Sync: + enabled: false + image: + repository: amazon/aws-cli + tag: "2.22.35" +``` + +- [ ] **Step 2: Validate production overlay renders** + +Run: `helm template mesh-prod deploy/helm/ -f deploy/helm/values-production.yaml --set database.url=postgresql://x | head -20` + +Expected: Rendered manifests without errors. + +- [ ] **Step 3: Commit** + +```bash +git add deploy/helm/values-production.yaml +git commit -m "feat(helm): add production values overlay with full HA configuration" +``` + +--- + +### Task 2: Create Infrastructure Setup Guide + +**Files:** +- Create: `deploy/infrastructure/README.md` +- Create: `deploy/infrastructure/external-secret.yaml` +- Create: `deploy/infrastructure/cert-issuer.yaml` +- Create: `deploy/infrastructure/karpenter-nodepool.yaml` + +- [ ] **Step 1: Create the infrastructure directory** + +Run: `mkdir -p deploy/infrastructure` + +- [ ] **Step 2: Create External Secrets example** + +Create `deploy/infrastructure/external-secret.yaml`: + +```yaml +# External Secrets Operator: sync secrets from AWS Secrets Manager. 
+# Prerequisites: +# - External Secrets Operator installed (helm install external-secrets external-secrets/external-secrets) +# - ClusterSecretStore configured for AWS Secrets Manager +# - Secrets created in AWS Secrets Manager at the referenced paths +# +# Usage: kubectl apply -f deploy/infrastructure/external-secret.yaml -n mesh-production +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: mesh-production-secrets +spec: + refreshInterval: 1h + secretStoreRef: + name: aws-secrets-manager + kind: ClusterSecretStore + target: + name: mesh-production-secrets # Must match secret.secretName in values-production.yaml + creationPolicy: Owner + data: + - secretKey: BETTER_AUTH_SECRET + remoteRef: + key: mesh/production/auth + property: secret + - secretKey: ENCRYPTION_KEY + remoteRef: + key: mesh/production/encryption + property: key + - secretKey: DATABASE_URL + remoteRef: + key: mesh/production/database + property: url +``` + +- [ ] **Step 3: Create cert-manager ClusterIssuer example** + +Create `deploy/infrastructure/cert-issuer.yaml`: + +```yaml +# cert-manager ClusterIssuer for Let's Encrypt TLS certificates. +# Prerequisites: +# - cert-manager installed (helm install cert-manager jetstack/cert-manager --set crds.enabled=true) +# +# Usage: kubectl apply -f deploy/infrastructure/cert-issuer.yaml +--- +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: ops@example.com # CHANGE THIS + privateKeySecretRef: + name: letsencrypt-prod + solvers: + - http01: + ingress: + ingressClassName: nginx +``` + +- [ ] **Step 4: Create Karpenter NodePool example** + +Create `deploy/infrastructure/karpenter-nodepool.yaml`: + +```yaml +# Karpenter NodePool for MCP Mesh workloads. 
+# Prerequisites: +# - Karpenter installed and configured with EC2NodeClass +# +# Usage: kubectl apply -f deploy/infrastructure/karpenter-nodepool.yaml +--- +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: mesh-app +spec: + template: + spec: + requirements: + - key: kubernetes.io/arch + operator: In + values: ["amd64"] + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand", "spot"] + - key: karpenter.k8s.aws/instance-category + operator: In + values: ["c", "m"] + - key: karpenter.k8s.aws/instance-generation + operator: Gt + values: ["5"] + nodeClassRef: + group: karpenter.k8s.aws + kind: EC2NodeClass + name: default + limits: + cpu: "64" + memory: "128Gi" + disruption: + consolidationPolicy: WhenEmptyOrUnderutilized + consolidateAfter: 60s +--- +# NATS nodes must not use spot instances (JetStream fileStore corruption risk) +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: mesh-data +spec: + template: + spec: + requirements: + - key: kubernetes.io/arch + operator: In + values: ["amd64"] + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + - key: karpenter.k8s.aws/instance-category + operator: In + values: ["m", "r"] + - key: karpenter.k8s.aws/instance-generation + operator: Gt + values: ["5"] + nodeClassRef: + group: karpenter.k8s.aws + kind: EC2NodeClass + name: default + taints: + - key: workload-type + value: data + effect: NoSchedule + limits: + cpu: "16" + memory: "64Gi" + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 300s +``` + +- [ ] **Step 5: Create the README** + +Create `deploy/infrastructure/README.md`: + +```markdown +# Infrastructure Setup for MCP Mesh HA + +This directory contains example manifests for the external infrastructure +components needed for a production HA deployment of MCP Mesh. + +## Prerequisites + +1. **Kubernetes cluster** with 3+ AZs and 6+ nodes +2. **PostgreSQL** — RDS Multi-AZ, Cloud SQL HA, or CloudNativePG +3. 
**External Secrets Operator** — for secret management +4. **cert-manager** — for TLS certificate automation +5. **Ingress controller** — NGINX Ingress or Envoy-based (Contour) +6. **CNI with NetworkPolicy** — Calico or Cilium + +## Setup Order + +1. Install infrastructure operators: + - External Secrets Operator + - cert-manager + - Karpenter (optional, replaces Cluster Autoscaler) + +2. Apply infrastructure manifests: + ```bash + kubectl apply -f deploy/infrastructure/cert-issuer.yaml + kubectl apply -f deploy/infrastructure/external-secret.yaml -n mesh-production + # kubectl apply -f deploy/infrastructure/karpenter-nodepool.yaml # if using Karpenter + ``` + +3. Deploy MCP Mesh with production values: + ```bash + helm install mesh deploy/helm/ \ + -f deploy/helm/values-production.yaml \ + --set database.url=postgresql://... \ + -n mesh-production + ``` + +## Files + +| File | Description | +|------|-------------| +| `external-secret.yaml` | ExternalSecret for AWS Secrets Manager | +| `cert-issuer.yaml` | cert-manager ClusterIssuer for Let's Encrypt | +| `karpenter-nodepool.yaml` | Karpenter NodePools for app and data workloads | + +## Disaster Recovery + +**Target:** RPO 1hr, RTO 15min + +1. PostgreSQL: Cross-region read replica (promotable in ~5min) +2. NATS: Let it rebuild — events are durable in PostgreSQL, polling fallback covers the gap +3. Kubernetes: ArgoCD reconciles from Git + Velero for PVC/CRD backups +4. DNS: Route53 health-check failover to DR region +``` + +- [ ] **Step 6: Commit** + +```bash +git add deploy/infrastructure/ +git commit -m "docs(infra): add infrastructure setup guide and example manifests + +Includes External Secrets Operator, cert-manager ClusterIssuer, +and Karpenter NodePool examples for production HA deployment." 
+``` + +--- + +### Task 3: Add Helm Chart Validation for SQLite + Autoscaling + +**Files:** +- Modify: `deploy/helm/templates/_helpers.tpl` + +**Context:** The `_helpers.tpl` already validates that autoscaling requires PostgreSQL or distributed storage (line 116-118). But there is no warning when `database.engine=sqlite` with `replicaCount > 1`. The existing validation on line 113-115 covers `replicaCount > 1` without distributed storage, which implicitly covers SQLite. This task adds a more explicit error message specifically for SQLite + autoscaling. + +- [ ] **Step 1: Verify existing validation already covers this case** + +Run: `helm template test deploy/helm/ --set autoscaling.enabled=true --set database.engine=sqlite 2>&1 | head -5` + +Expected: Error message about autoscaling requiring PostgreSQL or distributed storage. + +Since the existing validation already handles this, no change is needed. Mark this task as done. + +- [ ] **Step 2: Commit (no-op — validation already exists)** + +No commit needed. 
+ +--- + +### Task 4: Format and Final Verification + +- [ ] **Step 1: Run formatter** + +Run: `bun run fmt` + +- [ ] **Step 2: Validate all Helm templates render without errors** + +Run: `helm template mesh-prod deploy/helm/ -f deploy/helm/values-production.yaml --set database.url=postgresql://x > /dev/null && echo "OK"` + +Expected: "OK" + +- [ ] **Step 3: Commit any formatting fixes** + +```bash +git add -A +git commit -m "chore: format" +``` From 2b4a1072999963527124bb11ce8c4f20ed447f89 Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:16:20 -0300 Subject: [PATCH 02/16] docs: improve plans based on 8-critic review Key changes: - Advisory lock: use pg_advisory_xact_lock on pinned connection (pool-safe) - Keep ScheduleAnyway as default (DoNotSchedule only in prod overlay) - Keep readinessProbe failureThreshold at 3 (not 2) - Remove PriorityClass and NetworkPolicy from Helm chart (YAGNI) - Slim production overlay to deltas only, rename to .example.yaml - Fix ESO API v1beta1 -> v1, reduce refreshInterval to 5m - Fix test mock async iterator to terminate on unsubscribe - Move NetworkPolicy to infrastructure examples with RFC1918 exclusions --- .../plans/2026-04-07-app-code-ha-fixes.md | 92 +++- .../plans/2026-04-07-helm-chart-ha.md | 318 +++--------- .../plans/2026-04-07-infra-ha-advanced.md | 455 +++++++----------- 3 files changed, 306 insertions(+), 559 deletions(-) diff --git a/docs/superpowers/plans/2026-04-07-app-code-ha-fixes.md b/docs/superpowers/plans/2026-04-07-app-code-ha-fixes.md index ca8cdb9c5a..c464af7222 100644 --- a/docs/superpowers/plans/2026-04-07-app-code-ha-fixes.md +++ b/docs/superpowers/plans/2026-04-07-app-code-ha-fixes.md @@ -34,12 +34,19 @@ function createMockConnection(): { const subs: Subscription[] = []; const nc = { subscribe: mock((subject: string) => { + let resolveIterator: (() => void) | null = null; const sub: Subscription = { - unsubscribe: mock(() => {}), + unsubscribe: mock(() => { + resolveIterator?.(); + }), 
drain: mock(() => Promise.resolve()),
         isClosed: false,
         [Symbol.asyncIterator]: () => ({
-          next: () => new Promise(() => {}), // never resolves — simulates waiting
+          next: () =>
+            new Promise<IteratorResult<Msg>>((resolve) => {
+              resolveIterator = () =>
+                resolve({ done: true, value: undefined });
+            }),
           return: () => Promise.resolve({ done: true, value: undefined }),
           throw: () => Promise.resolve({ done: true, value: undefined }),
         }),
@@ -98,12 +105,39 @@ describe("NatsNotifyStrategy", () => {
     expect(nc.publish).toHaveBeenCalledTimes(1);
   });
 
+  test("notify() publishes to correct subject", async () => {
+    const { nc } = createMockConnection();
+    const strategy = new NatsNotifyStrategy({ getConnection: () => nc });
+
+    await strategy.notify("event-123");
+
+    expect(nc.publish).toHaveBeenCalledWith(
+      "mesh.events.notify",
+      expect.any(Uint8Array),
+    );
+  });
+
   test("notify() silently succeeds when NATS is disconnected", async () => {
     const strategy = new NatsNotifyStrategy({ getConnection: () => null });
 
     // Should not throw
     await strategy.notify("event-123");
   });
+
+  test("start() with no connection cleans up old sub but does not crash", async () => {
+    const { nc, subs } = createMockConnection();
+    const strategy = new NatsNotifyStrategy({ getConnection: () => nc });
+
+    await strategy.start(() => {});
+    expect(nc.subscribe).toHaveBeenCalledTimes(1);
+
+    // Simulate NATS going away — getConnection returns null on reconnect attempt
+    const strategyWithNull = new NatsNotifyStrategy({
+      getConnection: () => null,
+    });
+    // This tests that start() handles null connection gracefully
+    await strategyWithNull.start(() => {});
+  });
 });
 ```
 
@@ -193,9 +227,11 @@ async function runPluginMigrations(db: Kysely<any>): Promise<number> {
     return 0;
   }
 
-  // Acquire advisory lock to prevent concurrent plugin migration execution.
-  // Kysely's built-in migration_lock only covers core migrations, not plugins.
-  await sql`SELECT pg_advisory_lock(hashtext('plugin_migrations'))`.execute(db);
+  // Use a transaction-scoped advisory lock to prevent concurrent execution.
+  // Must use db.connection() to pin to a single connection (pool-safe).
+  // Lock ID 73649281 is a fixed constant for plugin migrations.
+  return await db.connection().execute(async (conn) => {
+    await sql`SELECT pg_advisory_xact_lock(73649281)`.execute(conn);
```

Then, wrap the rest of the function body in a try/finally to release the lock. Replace lines 159-213 (the rest after the lock acquisition). The full function becomes:

```typescript
async function runPluginMigrations(db: Kysely<any>): Promise<number> {
  const pluginsWithMigrations = await getPluginsWithMigrations();

  if (pluginsWithMigrations.length === 0) {
    return 0;
  }

-  // Acquire advisory lock to prevent concurrent plugin migration execution.
-  // Kysely's built-in migration_lock only covers core migrations, not plugins.
-  await sql`SELECT pg_advisory_lock(hashtext('plugin_migrations'))`.execute(db);
+  // Use a transaction-scoped advisory lock to prevent concurrent execution.
+  // Must use db.connection() to pin to a single connection (pool-safe).
+  // Lock ID 73649281 is a fixed constant for plugin migrations.
+ return await db.connection().execute(async (conn) => { + await sql`SELECT pg_advisory_xact_lock(73649281)`.execute(conn); - try { // Note: plugin_migrations table and old record migration are handled // in runKyselyMigrations() before Kysely's migrator runs // Get already executed migrations const executed = await sql<{ plugin_id: string; name: string }>` SELECT plugin_id, name FROM plugin_migrations - `.execute(db); + `.execute(conn); const executedSet = new Set( executed.rows.map((r) => `${r.plugin_id}/${r.name}`), ); @@ -258,23 +295,20 @@ async function runPluginMigrations(db: Kysely): Promise { } totalPending++; - await migration.up(db); + await migration.up(conn); // Record as executed const timestamp = new Date().toISOString(); await sql` INSERT INTO plugin_migrations (plugin_id, name, timestamp) VALUES (${pluginId}, ${migration.name}, ${timestamp}) - `.execute(db); + `.execute(conn); } } return totalPending; - } finally { - await sql`SELECT pg_advisory_unlock(hashtext('plugin_migrations'))`.execute( - db, - ); - } + }); + // Advisory lock is automatically released when the connection returns to pool. } ``` @@ -290,9 +324,9 @@ Expected: All existing tests pass (the advisory lock is transparent when there i git add apps/mesh/src/database/migrate.ts git commit -m "fix(database): add advisory lock to plugin migrations -Prevents race condition when multiple pods start simultaneously and -both try to run the same plugin migration. Kysely's built-in -migration_lock only covers core migrations." +Uses pg_advisory_xact_lock on a pinned connection to prevent race +conditions when multiple pods start simultaneously. Transaction-scoped +lock auto-releases on connection return, safe with connection poolers." 
``` --- @@ -441,3 +475,21 @@ Run: `bun test` git add -A git commit -m "chore: format" ``` + +--- + +## Critique Decisions + +**Adopted:** +- Fixed advisory lock to use `pg_advisory_xact_lock(73649281)` on a pinned connection via `db.connection().execute()` -- session-level `pg_advisory_lock` through a connection pool is broken because lock and unlock may execute on different connections. Also replaced `hashtext()` (undocumented internal PG function) with a hardcoded constant. (Correctness, Performance, Architecture, Documentation critics) +- Fixed mock async iterator to terminate `next()` when `unsubscribe()` is called, matching real NATS Subscription behavior (Testing critic) +- Added test for correct NATS subject in `notify()` (Testing critic) +- Added test for `start()` with null connection (Testing critic) + +**Rejected:** +- Adding a staleness check (`this.sub.isClosed`) before re-subscribing -- the NATS `Subscription` interface does not reliably expose `isClosed` in all states. Unconditionally cleaning up is simpler and correct. The churn cost (one unsubscribe+subscribe per reconnect) is negligible. (Performance critic) +- Adding resilience test for NATS reconnect delivery latency -- valuable but out of scope for this plan. Filed as follow-up. +- Adding concurrent migration test with embedded Postgres -- the advisory lock fix is straightforward and the existing Kysely migration tests validate single-pod behavior. Concurrent testing would require significant test infrastructure. (Testing critic) + +**Adapted:** +- NATS connection `name` kept static as `"mesh-app"` -- including pod hostname would require reading env vars at module init time which adds complexity. The static name is sufficient for distinguishing mesh app connections from other NATS clients. 
(Architecture critic) diff --git a/docs/superpowers/plans/2026-04-07-helm-chart-ha.md b/docs/superpowers/plans/2026-04-07-helm-chart-ha.md index 352f0a7d9b..0c62c1df72 100644 --- a/docs/superpowers/plans/2026-04-07-helm-chart-ha.md +++ b/docs/superpowers/plans/2026-04-07-helm-chart-ha.md @@ -2,11 +2,11 @@ > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. -**Goal:** Harden the Helm chart for maximum pod-level availability, add missing K8s primitives (PDB, NetworkPolicy, Ingress), and configure NATS clustering. +**Goal:** Harden the Helm chart for maximum pod-level availability, add missing K8s primitives (PDB, Ingress), update probe strategy, and right-size NATS. -**Architecture:** Add new Helm templates (PDB, NetworkPolicy, Ingress, PriorityClass) and update `values.yaml` defaults for production HA. No application code changes — purely Helm chart modifications. +**Architecture:** Add new Helm templates (PDB, Ingress) and update `values.yaml` defaults for production HA. No application code changes -- purely Helm chart modifications. 
-**Tech Stack:** Helm 3, Kubernetes API (policy/v1, networking.k8s.io/v1, autoscaling/v2, scheduling.k8s.io/v1), NATS subchart v2.12.5 +**Tech Stack:** Helm 3, Kubernetes API (policy/v1, autoscaling/v2), NATS subchart v2.12.5 --- @@ -54,7 +54,7 @@ git commit -m "feat(helm): add PodDisruptionBudget template for HA deployments" --- -### Task 2: Update values.yaml — Pod Scheduling, Probes, Shutdown +### Task 2: Update values.yaml -- Pod Scheduling, Probes, Shutdown **Files:** - Modify: `deploy/helm/values.yaml` @@ -113,8 +113,9 @@ readinessProbe: initialDelaySeconds: 0 periodSeconds: 5 timeoutSeconds: 3 - failureThreshold: 2 + failureThreshold: 3 +# Shutdown timing budget: preStop(5s) + appTimeout(58s) + buffer(2s) = 65s terminationGracePeriodSeconds: 65 lifecycle: @@ -132,6 +133,8 @@ affinity: {} With: ```yaml +# NOTE: labelSelector uses the default chart name. If you use nameOverride or +# a different release name, update the matchLabels to match your deployment. affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: @@ -143,7 +146,7 @@ affinity: topologyKey: kubernetes.io/hostname ``` -- [ ] **Step 3: Harden topologySpreadConstraints** +- [ ] **Step 3: Add hostname spread constraint** Replace: ```yaml @@ -159,10 +162,13 @@ topologySpreadConstraints: With: ```yaml +# NOTE: labelSelector uses the default chart/release names. Update if using +# nameOverride or a different release name. Use DoNotSchedule for zone in +# production clusters with 3+ AZs to guarantee zone spread. 
topologySpreadConstraints: - maxSkew: 1 topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: DoNotSchedule + whenUnsatisfiable: ScheduleAnyway labelSelector: matchLabels: app.kubernetes.io/name: chart-deco-studio @@ -261,35 +267,16 @@ git commit -m "feat(helm): add startupProbe support to deployment template" --- -### Task 4: Add HPA Behavior Policies +### Task 4: Add HPA Behavior Support to Template **Files:** -- Modify: `deploy/helm/values.yaml` - Modify: `deploy/helm/templates/hpa.yaml` -- [ ] **Step 1: Add behavior config to values.yaml** - -In `deploy/helm/values.yaml`, inside the `autoscaling` block, after `targetMemoryUtilizationPercentage: 80`, add: - -```yaml - behavior: - scaleUp: - stabilizationWindowSeconds: 30 - policies: - - type: Percent - value: 100 - periodSeconds: 30 - scaleDown: - stabilizationWindowSeconds: 300 - policies: - - type: Percent - value: 10 - periodSeconds: 60 -``` +Note: We add the `behavior` rendering to the template but do NOT add default behavior values to `values.yaml`. Users should define behavior when they enable HPA and have traffic patterns to optimize for. -- [ ] **Step 2: Update HPA template to render behavior** +- [ ] **Step 1: Update HPA template to render behavior when present** -In `deploy/helm/templates/hpa.yaml`, before the final `{{- end }}`, add the behavior block. Replace the full file with: +In `deploy/helm/templates/hpa.yaml`, add the behavior block before the final `{{- end }}`. 
Replace the full file with: ```yaml {{- if .Values.autoscaling.enabled }} @@ -330,37 +317,42 @@ spec: {{- end }} ``` -- [ ] **Step 3: Validate HPA renders with behavior** +- [ ] **Step 2: Validate HPA renders without behavior (no default)** + +Run: `helm template test deploy/helm/ --set autoscaling.enabled=true --set database.engine=postgresql --set database.url=postgresql://x | grep behavior` -Run: `helm template test deploy/helm/ --set autoscaling.enabled=true --set database.engine=postgresql --set database.url=postgresql://x | grep -A 15 behavior` +Expected: No output (behavior not rendered when not configured). -Expected: behavior block with scaleUp and scaleDown policies. +- [ ] **Step 3: Validate HPA renders with behavior when provided** + +Run: `helm template test deploy/helm/ --set autoscaling.enabled=true --set autoscaling.behavior.scaleDown.stabilizationWindowSeconds=300 --set database.engine=postgresql --set database.url=postgresql://x | grep -A 5 behavior` + +Expected: behavior block renders. - [ ] **Step 4: Commit** ```bash -git add deploy/helm/values.yaml deploy/helm/templates/hpa.yaml -git commit -m "feat(helm): add HPA behavior policies for scaling stability" +git add deploy/helm/templates/hpa.yaml +git commit -m "feat(helm): add HPA behavior support to template" ``` --- -### Task 5: Enable 3-Node NATS Cluster +### Task 5: Right-Size NATS JetStream Storage **Files:** - Modify: `deploy/helm/values.yaml` -- [ ] **Step 1: Update NATS subchart values for clustering** +Note: We keep NATS as a single replica by default (clustering is a production overlay concern). Only right-size the JetStream storage which was overprovisioned. + +- [ ] **Step 1: Update NATS JetStream values** -In `deploy/helm/values.yaml`, replace the `nats:` block: +In `deploy/helm/values.yaml`, replace the `nats:` block. 
Only change the JetStream sizing: ```yaml nats: enabled: true config: - cluster: - enabled: true - replicas: 3 jetstream: enabled: true memoryStore: @@ -370,155 +362,20 @@ nats: enabled: true pvc: enabled: true - size: 2Gi - storageClassName: "" - podTemplate: - merge: - spec: - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app.kubernetes.io/name - operator: In - values: ["nats"] - topologyKey: kubernetes.io/hostname + size: 5Gi + storageClassName: "" # empty = use cluster default StorageClass ``` -- [ ] **Step 2: Validate NATS subchart renders** - -Run: `helm dependency update deploy/helm/ && helm template test deploy/helm/ --set database.engine=postgresql --set database.url=postgresql://x | grep -c "kind: StatefulSet"` - -Expected: At least 1 (NATS StatefulSet). - -- [ ] **Step 3: Commit** +- [ ] **Step 2: Commit** ```bash git add deploy/helm/values.yaml -git commit -m "feat(helm): enable 3-node NATS cluster with pod anti-affinity" -``` - ---- - -### Task 6: Add NetworkPolicy Templates - -**Files:** -- Create: `deploy/helm/templates/networkpolicy.yaml` -- Modify: `deploy/helm/values.yaml` - -- [ ] **Step 1: Add networkPolicy config to values.yaml** - -After the `env: []` block at the end of `deploy/helm/values.yaml`, add: - -```yaml - -# Network Policies (optional - requires a CNI that supports NetworkPolicy) -networkPolicy: - enabled: false - # Ingress controller namespace (for allowing inbound traffic) - ingressNamespace: "ingress-nginx" - # PostgreSQL CIDR (for egress to database) - databaseCIDR: "10.0.0.0/8" -``` - -- [ ] **Step 2: Create the NetworkPolicy template** - -Create `deploy/helm/templates/networkpolicy.yaml`: - -```yaml -{{- if .Values.networkPolicy.enabled }} -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: {{ include "chart-deco-studio.fullname" . }} - labels: - {{- include "chart-deco-studio.labels" . 
| nindent 4 }} -spec: - podSelector: - matchLabels: - {{- include "chart-deco-studio.selectorLabels" . | nindent 6 }} - policyTypes: - - Ingress - - Egress - ingress: - # Allow traffic from ingress controller - - from: - - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: {{ .Values.networkPolicy.ingressNamespace }} - ports: - - port: {{ .Values.service.targetPort | default 3000 }} - protocol: TCP - # Allow health checks from kubelet - - ports: - - port: {{ .Values.service.targetPort | default 3000 }} - protocol: TCP - egress: - # NATS - {{- if .Values.nats.enabled }} - - to: - - podSelector: - matchLabels: - app.kubernetes.io/name: nats - ports: - - port: 4222 - protocol: TCP - {{- end }} - # PostgreSQL - - to: - - ipBlock: - cidr: {{ .Values.networkPolicy.databaseCIDR }} - ports: - - port: 5432 - protocol: TCP - # DNS - - to: - - namespaceSelector: {} - podSelector: - matchLabels: - k8s-app: kube-dns - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP - # External HTTPS (MCP servers, OAuth providers, OTel) - - to: - - ipBlock: - cidr: 0.0.0.0/0 - except: - - 169.254.169.254/32 - ports: - - port: 443 - protocol: TCP - - port: 80 - protocol: TCP -{{- end }} -``` - -- [ ] **Step 3: Validate NetworkPolicy renders when enabled** - -Run: `helm template test deploy/helm/ --set networkPolicy.enabled=true --set database.engine=postgresql --set database.url=postgresql://x | grep -A 5 NetworkPolicy` - -Expected: NetworkPolicy manifest with correct podSelector. - -- [ ] **Step 4: Validate NetworkPolicy is NOT rendered when disabled** - -Run: `helm template test deploy/helm/ --set database.engine=postgresql --set database.url=postgresql://x | grep NetworkPolicy` - -Expected: No output. 
- -- [ ] **Step 5: Commit** - -```bash -git add deploy/helm/templates/networkpolicy.yaml deploy/helm/values.yaml -git commit -m "feat(helm): add optional NetworkPolicy template" +git commit -m "feat(helm): right-size NATS JetStream storage (1Gi->512Mi mem, 10Gi->5Gi file)" ``` --- -### Task 7: Add Ingress Template +### Task 6: Add Ingress Template **Files:** - Create: `deploy/helm/templates/ingress.yaml` @@ -534,20 +391,12 @@ ingress: enabled: false className: "" annotations: {} - # For NGINX Ingress with SSE support: - # nginx.ingress.kubernetes.io/proxy-buffering: "off" - # nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" - # nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" - # nginx.ingress.kubernetes.io/proxy-body-size: "10m" hosts: - host: mesh.example.com paths: - path: / pathType: Prefix tls: [] - # - secretName: mesh-tls - # hosts: - # - mesh.example.com ``` - [ ] **Step 2: Create the Ingress template** @@ -608,77 +457,12 @@ Expected: Ingress manifest with correct service backend. 
```bash git add deploy/helm/templates/ingress.yaml deploy/helm/values.yaml -git commit -m "feat(helm): add optional Ingress template with SSE annotation examples" -``` - ---- - -### Task 8: Add PriorityClass Template - -**Files:** -- Create: `deploy/helm/templates/priorityclass.yaml` -- Modify: `deploy/helm/values.yaml` -- Modify: `deploy/helm/templates/deployment.yaml` - -- [ ] **Step 1: Add priorityClass config to values.yaml** - -In `deploy/helm/values.yaml`, after the `securityContext:` block, add: - -```yaml - -# Priority class for pod scheduling (optional) -priorityClass: - enabled: false - name: "mcp-mesh-control-plane" - value: 1000000 - preemptionPolicy: PreemptLowerPriority -``` - -- [ ] **Step 2: Create the PriorityClass template** - -Create `deploy/helm/templates/priorityclass.yaml`: - -```yaml -{{- if .Values.priorityClass.enabled }} -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: {{ .Values.priorityClass.name }} - labels: - {{- include "chart-deco-studio.labels" . | nindent 4 }} -value: {{ .Values.priorityClass.value }} -globalDefault: false -preemptionPolicy: {{ .Values.priorityClass.preemptionPolicy }} -description: "Priority for MCP Mesh control plane pods" -{{- end }} -``` - -- [ ] **Step 3: Add priorityClassName to Deployment template** - -In `deploy/helm/templates/deployment.yaml`, inside `spec.template.spec`, after `serviceAccountName`, add: - -```yaml - {{- if .Values.priorityClass.enabled }} - priorityClassName: {{ .Values.priorityClass.name }} - {{- end }} -``` - -- [ ] **Step 4: Validate PriorityClass renders** - -Run: `helm template test deploy/helm/ --set priorityClass.enabled=true --set database.engine=postgresql --set database.url=postgresql://x | grep -A 5 PriorityClass` - -Expected: PriorityClass manifest with value 1000000. 
- -- [ ] **Step 5: Commit** - -```bash -git add deploy/helm/templates/priorityclass.yaml deploy/helm/templates/deployment.yaml deploy/helm/values.yaml -git commit -m "feat(helm): add optional PriorityClass for preemption protection" +git commit -m "feat(helm): add optional Ingress template" ``` --- -### Task 9: Format and Lint +### Task 7: Format and Lint - [ ] **Step 1: Run formatter** @@ -696,3 +480,25 @@ Fix any issues found. git add -A git commit -m "chore(helm): format" ``` + +--- + +## Critique Decisions + +**Adopted:** +- Kept `readinessProbe.failureThreshold` at 3 (not 2) to avoid flapping risk (Performance, Architecture critics) +- Kept `ScheduleAnyway` for zone topology as default, added comment about using `DoNotSchedule` in production (Performance, Scope critics) +- Removed PriorityClass task -- YAGNI for most deployments, cluster-scoped resource causes conflicts (Scope, Security, Architecture critics) +- Removed NetworkPolicy from Helm chart -- too environment-specific as a template (Scope critic). Moved to infra plan as example. +- Removed default HPA behavior values -- speculative without traffic data (Scope critic). Kept template support. +- Added hostname spread constraint as `ScheduleAnyway` (Architecture critic) +- Added comments documenting hardcoded label limitation (Duplication, Architecture critics) +- Added shutdown timing budget comment (Duplication critic) +- Right-sized JetStream to 512Mi/5Gi instead of aggressive 2Gi file (compromise on Documentation critic's sizing concern) + +**Rejected:** +- Moving label selectors into deployment template -- Helm values cannot use template functions; this is a known Helm limitation. Documented with comments instead. +- 3-node NATS cluster as default -- YAGNI for dev/staging (Scope critic). Moved to production overlay. + +**Adapted:** +- s3Sync image pinned to specific tag (not digest) -- digest is ideal but requires infrastructure to track digests. Tag pin is the pragmatic middle ground. 
diff --git a/docs/superpowers/plans/2026-04-07-infra-ha-advanced.md b/docs/superpowers/plans/2026-04-07-infra-ha-advanced.md index 620bcfa54a..fa181961fe 100644 --- a/docs/superpowers/plans/2026-04-07-infra-ha-advanced.md +++ b/docs/superpowers/plans/2026-04-07-infra-ha-advanced.md @@ -2,57 +2,54 @@ > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. -**Goal:** Document and configure the external infrastructure needed for production HA: External Secrets Operator, cert-manager, Karpenter NodePool, and production values overlay. +**Goal:** Provide production configuration examples: a values overlay with HA features enabled, and reference infrastructure manifests. -**Architecture:** This plan produces documentation and example configuration files. Unlike the Helm chart and app code plans, most of these items are cluster-level resources deployed outside the Helm chart. The deliverables are: a production values overlay, infrastructure setup docs, and example manifests in `deploy/`. +**Architecture:** This plan produces example configuration files deployed outside the Helm chart. The deliverables are: a production values example overlay and infrastructure reference manifests in `deploy/`. -**Tech Stack:** Kubernetes, Karpenter, ArgoCD, External Secrets Operator, cert-manager, Velero +**Tech Stack:** Kubernetes, External Secrets Operator, cert-manager -**Note:** This plan is for documentation and configuration. It does NOT modify application code or the core Helm chart templates (those are covered by the other two plans). +**Note:** This plan is for documentation and configuration. It does NOT modify application code or the core Helm chart templates. 
--- -### Task 1: Create Production Values Overlay +### Task 1: Create Production Values Example Overlay **Files:** -- Create: `deploy/helm/values-production.yaml` +- Create: `deploy/helm/values-production.example.yaml` -This file serves as a reference for production deployments with all HA features enabled. +This file contains ONLY values that differ from the defaults in `values.yaml`. It serves as a starting point for production deployments. -- [ ] **Step 1: Create the production values overlay** +- [ ] **Step 1: Create the production values example** -Create `deploy/helm/values-production.yaml`: +Create `deploy/helm/values-production.example.yaml`: ```yaml # Production values overlay for MCP Mesh HA deployment. -# Usage: helm install mesh deploy/helm/ -f deploy/helm/values-production.yaml +# Contains ONLY values that differ from defaults in values.yaml. +# Usage: helm install mesh deploy/helm/ -f deploy/helm/values-production.example.yaml # # Prerequisites: -# - PostgreSQL (RDS/CloudNativePG) with DATABASE_URL in secret +# - PostgreSQL (RDS Multi-AZ, CloudNativePG, or Cloud SQL HA) # - External Secrets Operator for secret management -# - cert-manager for TLS certificates -# - CNI with NetworkPolicy support (Calico/Cilium) +# - cert-manager for TLS certificates (optional) # - 3+ AZ cluster with 6+ nodes replicaCount: 3 -image: - pullPolicy: IfNotPresent - command: - - bun - - run - - deco - - --no-local-mode - - --num-threads - - "4" +# --- Database (required for multi-replica) --- +database: + engine: postgresql + # URL provided via secret.secretName below + +# --- Secret (managed by External Secrets Operator) --- +secret: + secretName: "mesh-production-secrets" # --- Autoscaling --- autoscaling: enabled: true minReplicas: 3 maxReplicas: 6 - targetCPUUtilizationPercentage: 80 - targetMemoryUtilizationPercentage: 80 behavior: scaleUp: stabilizationWindowSeconds: 30 @@ -67,65 +64,7 @@ autoscaling: value: 10 periodSeconds: 60 -# --- Database --- -database: - engine: 
postgresql - # URL provided via secret.secretName (External Secrets Operator) - -# --- Persistence --- -persistence: - enabled: true - accessMode: ReadWriteOnce - storageClass: "gp3" - size: 10Gi - -# --- Probes --- -startupProbe: - httpGet: - path: /health/live - port: http - periodSeconds: 2 - failureThreshold: 30 - timeoutSeconds: 3 - -livenessProbe: - httpGet: - path: /health/live - port: http - initialDelaySeconds: 0 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 3 - -readinessProbe: - httpGet: - path: /health/ready - port: http - initialDelaySeconds: 0 - periodSeconds: 5 - timeoutSeconds: 3 - failureThreshold: 2 - -# --- Shutdown --- -terminationGracePeriodSeconds: 65 -lifecycle: - preStop: - exec: - command: ["sleep", "5"] - -# --- Scheduling --- -nodeSelector: {} - -affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/name: chart-deco-studio - topologyKey: kubernetes.io/hostname - +# --- Topology: use DoNotSchedule for zone spread in 3+ AZ clusters --- topologySpreadConstraints: - maxSkew: 1 topologyKey: topology.kubernetes.io/zone @@ -133,54 +72,16 @@ topologySpreadConstraints: labelSelector: matchLabels: app.kubernetes.io/name: chart-deco-studio - app.kubernetes.io/instance: deco-studio + app.kubernetes.io/instance: deco-studio # Change to your release name - maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway labelSelector: matchLabels: app.kubernetes.io/name: chart-deco-studio - app.kubernetes.io/instance: deco-studio - -# --- Priority --- -priorityClass: - enabled: true - name: "mcp-mesh-control-plane" - value: 1000000 - preemptionPolicy: PreemptLowerPriority + app.kubernetes.io/instance: deco-studio # Change to your release name -# --- Secret --- -secret: - secretName: "mesh-production-secrets" # Managed by External Secrets Operator - -# --- Ingress --- -ingress: - enabled: true - className: 
"nginx" - annotations: - nginx.ingress.kubernetes.io/proxy-buffering: "off" - nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" - nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" - nginx.ingress.kubernetes.io/proxy-body-size: "10m" - nginx.ingress.kubernetes.io/proxy-http-version: "1.1" - cert-manager.io/cluster-issuer: "letsencrypt-prod" - hosts: - - host: mesh.example.com - paths: - - path: / - pathType: Prefix - tls: - - secretName: mesh-tls - hosts: - - mesh.example.com - -# --- Network Policy --- -networkPolicy: - enabled: true - ingressNamespace: "ingress-nginx" - databaseCIDR: "10.0.0.0/8" - -# --- NATS --- +# --- NATS: 3-node cluster for production --- nats: enabled: true config: @@ -196,8 +97,7 @@ nats: enabled: true pvc: enabled: true - size: 2Gi - storageClassName: "" + size: 5Gi podTemplate: merge: spec: @@ -211,60 +111,69 @@ nats: values: ["nats"] topologyKey: kubernetes.io/hostname -# --- ConfigMap --- +# --- Ingress (example with NGINX + SSE support) --- +ingress: + enabled: true + className: "nginx" + annotations: + nginx.ingress.kubernetes.io/proxy-buffering: "off" + nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-body-size: "10m" + nginx.ingress.kubernetes.io/proxy-http-version: "1.1" + cert-manager.io/cluster-issuer: "letsencrypt-prod" + hosts: + - host: mesh.example.com # CHANGE THIS + paths: + - path: / + pathType: Prefix + tls: + - secretName: mesh-tls + hosts: + - mesh.example.com # CHANGE THIS + +# --- ConfigMap overrides --- configMap: meshConfig: NODE_ENV: "production" PORT: "3000" HOST: "0.0.0.0" - BETTER_AUTH_URL: "https://mesh.example.com" - BASE_URL: "https://mesh.example.com" + BETTER_AUTH_URL: "https://mesh.example.com" # CHANGE THIS + BASE_URL: "https://mesh.example.com" # CHANGE THIS DATA_DIR: "/app/data" - DATABASE_POOL_MAX: "10" DECO_AI_GATEWAY_ENABLED: "false" -# --- OTel --- +# --- OTel (optional) --- otel: enabled: 
true protocol: "http/protobuf" service: "mcp-mesh" collector: enabled: true - image: - repository: otel/opentelemetry-collector-contrib - tag: "0.115.1" - pullPolicy: IfNotPresent - -# --- S3 Sync --- -s3Sync: - enabled: false - image: - repository: amazon/aws-cli - tag: "2.22.35" ``` - [ ] **Step 2: Validate production overlay renders** -Run: `helm template mesh-prod deploy/helm/ -f deploy/helm/values-production.yaml --set database.url=postgresql://x | head -20` +Run: `helm template mesh-prod deploy/helm/ -f deploy/helm/values-production.example.yaml --set database.url=postgresql://x > /dev/null && echo "OK"` -Expected: Rendered manifests without errors. +Expected: "OK" (no rendering errors). - [ ] **Step 3: Commit** ```bash -git add deploy/helm/values-production.yaml -git commit -m "feat(helm): add production values overlay with full HA configuration" +git add deploy/helm/values-production.example.yaml +git commit -m "feat(helm): add production values example overlay" ``` --- -### Task 2: Create Infrastructure Setup Guide +### Task 2: Create Infrastructure Reference Manifests **Files:** - Create: `deploy/infrastructure/README.md` - Create: `deploy/infrastructure/external-secret.yaml` - Create: `deploy/infrastructure/cert-issuer.yaml` -- Create: `deploy/infrastructure/karpenter-nodepool.yaml` +- Create: `deploy/infrastructure/networkpolicy.yaml` - [ ] **Step 1: Create the infrastructure directory** @@ -277,23 +186,22 @@ Create `deploy/infrastructure/external-secret.yaml`: ```yaml # External Secrets Operator: sync secrets from AWS Secrets Manager. 
# Prerequisites: -# - External Secrets Operator installed (helm install external-secrets external-secrets/external-secrets) +# - External Secrets Operator installed # - ClusterSecretStore configured for AWS Secrets Manager -# - Secrets created in AWS Secrets Manager at the referenced paths # # Usage: kubectl apply -f deploy/infrastructure/external-secret.yaml -n mesh-production --- -apiVersion: external-secrets.io/v1beta1 +apiVersion: external-secrets.io/v1 kind: ExternalSecret metadata: name: mesh-production-secrets spec: - refreshInterval: 1h + refreshInterval: 5m secretStoreRef: name: aws-secrets-manager kind: ClusterSecretStore target: - name: mesh-production-secrets # Must match secret.secretName in values-production.yaml + name: mesh-production-secrets # Must match secret.secretName in values creationPolicy: Owner data: - secretKey: BETTER_AUTH_SECRET @@ -337,83 +245,79 @@ spec: ingressClassName: nginx ``` -- [ ] **Step 4: Create Karpenter NodePool example** +- [ ] **Step 4: Create NetworkPolicy example** -Create `deploy/infrastructure/karpenter-nodepool.yaml`: +Create `deploy/infrastructure/networkpolicy.yaml`: ```yaml -# Karpenter NodePool for MCP Mesh workloads. -# Prerequisites: -# - Karpenter installed and configured with EC2NodeClass +# NetworkPolicy example for MCP Mesh. +# This is highly environment-specific -- adapt to your cluster's CNI, +# ingress controller, and network layout before applying. 
# -# Usage: kubectl apply -f deploy/infrastructure/karpenter-nodepool.yaml +# Usage: kubectl apply -f deploy/infrastructure/networkpolicy.yaml -n mesh-production --- -apiVersion: karpenter.sh/v1 -kind: NodePool +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy metadata: - name: mesh-app + name: mcp-mesh spec: - template: - spec: - requirements: - - key: kubernetes.io/arch - operator: In - values: ["amd64"] - - key: karpenter.sh/capacity-type - operator: In - values: ["on-demand", "spot"] - - key: karpenter.k8s.aws/instance-category - operator: In - values: ["c", "m"] - - key: karpenter.k8s.aws/instance-generation - operator: Gt - values: ["5"] - nodeClassRef: - group: karpenter.k8s.aws - kind: EC2NodeClass - name: default - limits: - cpu: "64" - memory: "128Gi" - disruption: - consolidationPolicy: WhenEmptyOrUnderutilized - consolidateAfter: 60s ---- -# NATS nodes must not use spot instances (JetStream fileStore corruption risk) -apiVersion: karpenter.sh/v1 -kind: NodePool -metadata: - name: mesh-data -spec: - template: - spec: - requirements: - - key: kubernetes.io/arch - operator: In - values: ["amd64"] - - key: karpenter.sh/capacity-type - operator: In - values: ["on-demand"] - - key: karpenter.k8s.aws/instance-category - operator: In - values: ["m", "r"] - - key: karpenter.k8s.aws/instance-generation - operator: Gt - values: ["5"] - nodeClassRef: - group: karpenter.k8s.aws - kind: EC2NodeClass - name: default - taints: - - key: workload-type - value: data - effect: NoSchedule - limits: - cpu: "16" - memory: "64Gi" - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 300s + podSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio # Match your chart name + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: ingress-nginx # Your ingress namespace + ports: + - port: 3000 + protocol: TCP + # Kubelet health checks (host-network, bypasses NetworkPolicy) + - 
ports: + - port: 3000 + protocol: TCP + egress: + # NATS + - to: + - podSelector: + matchLabels: + app.kubernetes.io/name: nats + ports: + - port: 4222 + protocol: TCP + # PostgreSQL + - to: + - ipBlock: + cidr: 10.0.0.0/8 # Your database CIDR + ports: + - port: 5432 + protocol: TCP + # DNS + - to: + - namespaceSelector: {} + podSelector: + matchLabels: + k8s-app: kube-dns + ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP + # External HTTPS only (MCP servers, OAuth providers) + - to: + - ipBlock: + cidr: 0.0.0.0/0 + except: + - 169.254.169.254/32 + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + ports: + - port: 443 + protocol: TCP ``` - [ ] **Step 5: Create the README** @@ -423,92 +327,56 @@ Create `deploy/infrastructure/README.md`: ```markdown # Infrastructure Setup for MCP Mesh HA -This directory contains example manifests for the external infrastructure -components needed for a production HA deployment of MCP Mesh. +Reference manifests for external infrastructure components needed for +production HA deployment of MCP Mesh. Adapt to your environment. ## Prerequisites -1. **Kubernetes cluster** with 3+ AZs and 6+ nodes -2. **PostgreSQL** — RDS Multi-AZ, Cloud SQL HA, or CloudNativePG -3. **External Secrets Operator** — for secret management -4. **cert-manager** — for TLS certificate automation -5. **Ingress controller** — NGINX Ingress or Envoy-based (Contour) -6. **CNI with NetworkPolicy** — Calico or Cilium +1. **Kubernetes cluster** with 3+ AZs +2. **PostgreSQL** -- RDS Multi-AZ, Cloud SQL HA, or CloudNativePG +3. **External Secrets Operator** -- for secret management +4. **cert-manager** -- for TLS certificate automation (optional) +5. 
**Ingress controller** -- NGINX Ingress or Envoy-based (Contour) -## Setup Order +## Files + +| File | Description | +|------|-------------| +| `external-secret.yaml` | ExternalSecret for AWS Secrets Manager | +| `cert-issuer.yaml` | cert-manager ClusterIssuer for Let's Encrypt | +| `networkpolicy.yaml` | NetworkPolicy example (adapt to your CNI) | -1. Install infrastructure operators: - - External Secrets Operator - - cert-manager - - Karpenter (optional, replaces Cluster Autoscaler) +## Deployment -2. Apply infrastructure manifests: +1. Apply infrastructure manifests (adapt first): ```bash kubectl apply -f deploy/infrastructure/cert-issuer.yaml kubectl apply -f deploy/infrastructure/external-secret.yaml -n mesh-production - # kubectl apply -f deploy/infrastructure/karpenter-nodepool.yaml # if using Karpenter + kubectl apply -f deploy/infrastructure/networkpolicy.yaml -n mesh-production ``` -3. Deploy MCP Mesh with production values: +2. Deploy MCP Mesh with production values: ```bash helm install mesh deploy/helm/ \ - -f deploy/helm/values-production.yaml \ + -f deploy/helm/values-production.example.yaml \ --set database.url=postgresql://... \ -n mesh-production ``` - -## Files - -| File | Description | -|------|-------------| -| `external-secret.yaml` | ExternalSecret for AWS Secrets Manager | -| `cert-issuer.yaml` | cert-manager ClusterIssuer for Let's Encrypt | -| `karpenter-nodepool.yaml` | Karpenter NodePools for app and data workloads | - -## Disaster Recovery - -**Target:** RPO 1hr, RTO 15min - -1. PostgreSQL: Cross-region read replica (promotable in ~5min) -2. NATS: Let it rebuild — events are durable in PostgreSQL, polling fallback covers the gap -3. Kubernetes: ArgoCD reconciles from Git + Velero for PVC/CRD backups -4. 
DNS: Route53 health-check failover to DR region ``` - [ ] **Step 6: Commit** ```bash git add deploy/infrastructure/ -git commit -m "docs(infra): add infrastructure setup guide and example manifests +git commit -m "docs(infra): add infrastructure reference manifests -Includes External Secrets Operator, cert-manager ClusterIssuer, -and Karpenter NodePool examples for production HA deployment." +Includes External Secrets Operator (v1 API), cert-manager ClusterIssuer, +and NetworkPolicy example for production HA deployment." ``` --- -### Task 3: Add Helm Chart Validation for SQLite + Autoscaling - -**Files:** -- Modify: `deploy/helm/templates/_helpers.tpl` - -**Context:** The `_helpers.tpl` already validates that autoscaling requires PostgreSQL or distributed storage (line 116-118). But there is no warning when `database.engine=sqlite` with `replicaCount > 1`. The existing validation on line 113-115 covers `replicaCount > 1` without distributed storage, which implicitly covers SQLite. This task adds a more explicit error message specifically for SQLite + autoscaling. - -- [ ] **Step 1: Verify existing validation already covers this case** - -Run: `helm template test deploy/helm/ --set autoscaling.enabled=true --set database.engine=sqlite 2>&1 | head -5` - -Expected: Error message about autoscaling requiring PostgreSQL or distributed storage. - -Since the existing validation already handles this, no change is needed. Mark this task as done. - -- [ ] **Step 2: Commit (no-op — validation already exists)** - -No commit needed. 
- ---- - -### Task 4: Format and Final Verification +### Task 3: Format and Final Verification - [ ] **Step 1: Run formatter** @@ -516,7 +384,7 @@ Run: `bun run fmt` - [ ] **Step 2: Validate all Helm templates render without errors** -Run: `helm template mesh-prod deploy/helm/ -f deploy/helm/values-production.yaml --set database.url=postgresql://x > /dev/null && echo "OK"` +Run: `helm template mesh-prod deploy/helm/ -f deploy/helm/values-production.example.yaml --set database.url=postgresql://x > /dev/null && echo "OK"` Expected: "OK" @@ -526,3 +394,24 @@ Expected: "OK" git add -A git commit -m "chore: format" ``` + +--- + +## Critique Decisions + +**Adopted:** +- Slimmed `values-production.yaml` to deltas only, renamed to `.example.yaml` (Duplication, Scope critics) +- Fixed ESO API from `v1beta1` to `v1` (Documentation critic) +- Reduced ESO `refreshInterval` from `1h` to `5m` (Security critic) +- Removed port 80 from NetworkPolicy egress, added RFC1918 exclusions (Security critic) +- Moved NetworkPolicy from Helm template to infrastructure example (Scope critic) +- Removed `DATABASE_POOL_MAX` override (Correctness critic -- was backwards) +- Removed `persistence` from overlay to avoid RWO + multi-replica conflict (Correctness critic) +- Removed Karpenter NodePool -- AWS-specific, out of scope for this repo (Scope critic) +- Removed DR section from README -- belongs in deployment guide (Scope critic) + +**Rejected:** +- Adding Helm validation for empty secrets in production -- Better Auth already fails on startup with invalid secrets. Adding chart-level validation is an unrelated concern. 
+ +**Adapted:** +- Kept 3-node NATS cluster in production overlay only (not as default) with pod anti-affinity (Scope + Architecture compromise) From dd7fcff765bc4fe3aaf80b7b34ddd825020209b1 Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:19:14 -0300 Subject: [PATCH 03/16] feat(helm): add PodDisruptionBudget template for HA deployments --- deploy/helm/templates/pdb.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 deploy/helm/templates/pdb.yaml diff --git a/deploy/helm/templates/pdb.yaml b/deploy/helm/templates/pdb.yaml new file mode 100644 index 0000000000..d24b1177fb --- /dev/null +++ b/deploy/helm/templates/pdb.yaml @@ -0,0 +1,13 @@ +{{- if or .Values.autoscaling.enabled (gt (int (default 1 .Values.replicaCount)) 1) }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "chart-deco-studio.fullname" . }} + labels: + {{- include "chart-deco-studio.labels" . | nindent 4 }} +spec: + maxUnavailable: 1 + selector: + matchLabels: + {{- include "chart-deco-studio.selectorLabels" . | nindent 6 }} +{{- end }} From bad42330a3f78122e89d7e988a5316179a8e9baa Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:19:20 -0300 Subject: [PATCH 04/16] feat(helm): add startupProbe support to deployment template --- deploy/helm/templates/deployment.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deploy/helm/templates/deployment.yaml b/deploy/helm/templates/deployment.yaml index 6909e8caa0..663dc64ea5 100644 --- a/deploy/helm/templates/deployment.yaml +++ b/deploy/helm/templates/deployment.yaml @@ -103,6 +103,10 @@ spec: readinessProbe: {{- toYaml . | nindent 12 }} {{- end }} + {{- with .Values.startupProbe }} + startupProbe: + {{- toYaml . | nindent 12 }} + {{- end }} {{- with .Values.lifecycle }} lifecycle: {{- toYaml . 
| nindent 12 }} From 50edfc4170b200d46fd2ccf31e02f26d21ca6cbf Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:19:32 -0300 Subject: [PATCH 05/16] feat(helm): add optional Ingress template --- deploy/helm/templates/ingress.yaml | 41 ++++++++++++++++++++++++++++++ deploy/helm/values.yaml | 11 ++++++++ 2 files changed, 52 insertions(+) create mode 100644 deploy/helm/templates/ingress.yaml diff --git a/deploy/helm/templates/ingress.yaml b/deploy/helm/templates/ingress.yaml new file mode 100644 index 0000000000..6bbf5aaf89 --- /dev/null +++ b/deploy/helm/templates/ingress.yaml @@ -0,0 +1,41 @@ +{{- if .Values.ingress.enabled -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "chart-deco-studio.fullname" . }} + labels: + {{- include "chart-deco-studio.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + pathType: {{ .pathType }} + backend: + service: + name: {{ include "chart-deco-studio.fullname" $ }} + port: + number: {{ $.Values.service.port }} + {{- end }} + {{- end }} +{{- end }} diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml index 1360924b3f..1410c8f3cd 100644 --- a/deploy/helm/values.yaml +++ b/deploy/helm/values.yaml @@ -26,6 +26,17 @@ service: port: 80 targetPort: 3000 +ingress: + enabled: false + className: "" + annotations: {} + hosts: + - host: mesh.example.com + paths: + - path: / + pathType: Prefix + tls: [] + resources: requests: memory: "2Gi" From 2bf3dfc0a400391c7c3f976099e26bf481480403 Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:20:41 -0300 Subject: [PATCH 06/16] feat(helm): harden probes, add preStop hook, pod anti-affinity, pin s3Sync image --- deploy/helm/values.yaml | 60 ++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml index 1410c8f3cd..e05daa41a6 100644 --- a/deploy/helm/values.yaml +++ b/deploy/helm/values.yaml @@ -121,11 +121,19 @@ securityContext: - ALL readOnlyRootFilesystem: false +startupProbe: + httpGet: + path: /health/live + port: http + periodSeconds: 2 + failureThreshold: 30 + timeoutSeconds: 3 + livenessProbe: httpGet: path: /health/live port: http - initialDelaySeconds: 30 + initialDelaySeconds: 0 periodSeconds: 10 timeoutSeconds: 5 failureThreshold: 3 @@ -133,15 +141,18 @@ readinessProbe: httpGet: path: /health/ready port: http - initialDelaySeconds: 10 + initialDelaySeconds: 0 periodSeconds: 5 timeoutSeconds: 3 - failureThreshold: 4 + failureThreshold: 3 -terminationGracePeriodSeconds: 60 +# Shutdown timing budget: preStop(5s) + appTimeout(58s) + buffer(2s) = 65s 
+terminationGracePeriodSeconds: 65 -# Optional lifecycle hooks (preStop, postStart) -# lifecycle: {} +lifecycle: + preStop: + exec: + command: ["sleep", "5"] nodeSelector: kubernetes.io/arch: amd64 @@ -154,21 +165,21 @@ tolerations: [] # value: "dev" # effect: "NoSchedule" -affinity: {} -# Example: -# affinity: -# podAntiAffinity: -# preferredDuringSchedulingIgnoredDuringExecution: -# - weight: 100 -# podAffinityTerm: -# labelSelector: -# matchLabels: -# app.kubernetes.io/name: chart-deco-studio -# topologyKey: kubernetes.io/hostname +# NOTE: labelSelector uses the default chart name. If you use nameOverride or +# a different release name, update the matchLabels to match your deployment. +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + topologyKey: kubernetes.io/hostname -# Topology Spread Constraints (optional - leave empty [] to disable) -# IMPORTANT: labelSelector is required when topologySpreadConstraints is configured -#topologySpreadConstraints: [] +# NOTE: labelSelector uses the default chart/release names. Update if using +# nameOverride or a different release name. Use DoNotSchedule for zone in +# production clusters with 3+ AZs to guarantee zone spread. 
topologySpreadConstraints: - maxSkew: 1 topologyKey: topology.kubernetes.io/zone @@ -177,6 +188,13 @@ topologySpreadConstraints: matchLabels: app.kubernetes.io/name: chart-deco-studio app.kubernetes.io/instance: deco-studio + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio # OpenTelemetry configuration (optional) otel: @@ -212,7 +230,7 @@ s3Sync: enabled: false image: repository: amazon/aws-cli - tag: "latest" + tag: "2.22.35" pullPolicy: IfNotPresent bucket: "" # Required when enabled region: "us-east-1" From d22f42537a12234fe8a8e7cf241481e39168dcbc Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:20:49 -0300 Subject: [PATCH 07/16] feat(helm): add HPA behavior support to template --- deploy/helm/templates/hpa.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deploy/helm/templates/hpa.yaml b/deploy/helm/templates/hpa.yaml index b5712f7381..35e3aa03c6 100644 --- a/deploy/helm/templates/hpa.yaml +++ b/deploy/helm/templates/hpa.yaml @@ -29,4 +29,8 @@ spec: type: Utilization averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} {{- end }} + {{- with .Values.autoscaling.behavior }} + behavior: + {{- toYaml . 
| nindent 4 }} + {{- end }} {{- end }} From 19fc9372365066583180b3b1e27c75ac77bd4956 Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:20:56 -0300 Subject: [PATCH 08/16] feat(helm): right-size NATS JetStream storage (1Gi->512Mi mem, 10Gi->5Gi file) --- deploy/helm/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml index e05daa41a6..256ae95d06 100644 --- a/deploy/helm/values.yaml +++ b/deploy/helm/values.yaml @@ -258,12 +258,12 @@ nats: enabled: true memoryStore: enabled: true - maxSize: 1Gi + maxSize: 512Mi fileStore: enabled: true pvc: enabled: true - size: 10Gi + size: 5Gi storageClassName: "" # empty = use cluster default StorageClass # Additional environment variables (merged with existing envs) From 0e3e0bdf9f7e4332cdeaee8f8ef29ac667d5dc3f Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:22:25 -0300 Subject: [PATCH 09/16] fix(event-bus): re-subscribe to NATS after reconnect The start() method had an early return when this.sub was non-null, preventing re-subscription after NATS reconnects. The old subscription object was stale, silently breaking the notify path and forcing all event delivery to fall back to 5s polling. 
Co-Authored-By: Claude Sonnet 4.6 --- apps/mesh/src/event-bus/nats-notify.test.ts | 91 +++++++++++++++++++++ apps/mesh/src/event-bus/nats-notify.ts | 12 ++- 2 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 apps/mesh/src/event-bus/nats-notify.test.ts diff --git a/apps/mesh/src/event-bus/nats-notify.test.ts b/apps/mesh/src/event-bus/nats-notify.test.ts new file mode 100644 index 0000000000..52526b6b1c --- /dev/null +++ b/apps/mesh/src/event-bus/nats-notify.test.ts @@ -0,0 +1,91 @@ +import { describe, expect, mock, test } from "bun:test"; +import type { NatsConnection, Subscription } from "nats"; +import { NatsNotifyStrategy } from "./nats-notify"; + +function createMockConnection(): { + nc: NatsConnection; + subs: Subscription[]; +} { + const subs: Subscription[] = []; + const nc = { + subscribe: mock((subject: string) => { + let resolveIterator: (() => void) | null = null; + const sub: Subscription = { + unsubscribe: mock(() => { + resolveIterator?.(); + }), + drain: mock(() => Promise.resolve()), + isClosed: false, + [Symbol.asyncIterator]: () => ({ + next: () => + new Promise>((resolve) => { + resolveIterator = () => resolve({ done: true, value: undefined }); + }), + return: () => Promise.resolve({ done: true, value: undefined }), + throw: () => Promise.resolve({ done: true, value: undefined }), + }), + } as unknown as Subscription; + subs.push(sub); + return sub; + }), + publish: mock(() => {}), + isClosed: () => false, + isDraining: () => false, + } as unknown as NatsConnection; + + return { nc, subs }; +} + +describe("NatsNotifyStrategy", () => { + test("start() creates subscription", async () => { + const { nc } = createMockConnection(); + const strategy = new NatsNotifyStrategy({ getConnection: () => nc }); + + await strategy.start(() => {}); + + expect(nc.subscribe).toHaveBeenCalledTimes(1); + }); + + test("start() re-subscribes when called again (reconnect scenario)", async () => { + const { nc, subs } = createMockConnection(); + const 
strategy = new NatsNotifyStrategy({ getConnection: () => nc }); + + await strategy.start(() => {}); + expect(nc.subscribe).toHaveBeenCalledTimes(1); + + // Simulate reconnect: start() is called again + await strategy.start(); + expect(nc.subscribe).toHaveBeenCalledTimes(2); + // Old subscription should have been unsubscribed + expect(subs[0].unsubscribe).toHaveBeenCalledTimes(1); + }); + + test("stop() cleans up subscription", async () => { + const { nc, subs } = createMockConnection(); + const strategy = new NatsNotifyStrategy({ getConnection: () => nc }); + + await strategy.start(() => {}); + await strategy.stop(); + + expect(subs[0].unsubscribe).toHaveBeenCalledTimes(1); + }); + + test("notify() publishes to correct subject", async () => { + const { nc } = createMockConnection(); + const strategy = new NatsNotifyStrategy({ getConnection: () => nc }); + + await strategy.notify("event-123"); + + expect(nc.publish).toHaveBeenCalledWith( + "mesh.events.notify", + expect.any(Uint8Array), + ); + }); + + test("notify() silently succeeds when NATS is disconnected", async () => { + const strategy = new NatsNotifyStrategy({ getConnection: () => null }); + + // Should not throw + await strategy.notify("event-123"); + }); +}); diff --git a/apps/mesh/src/event-bus/nats-notify.ts b/apps/mesh/src/event-bus/nats-notify.ts index f51da7360b..60622adf3e 100644 --- a/apps/mesh/src/event-bus/nats-notify.ts +++ b/apps/mesh/src/event-bus/nats-notify.ts @@ -27,10 +27,20 @@ export class NatsNotifyStrategy implements NotifyStrategy { constructor(private readonly options: NatsNotifyStrategyOptions) {} async start(onNotify?: () => void): Promise { - if (this.sub) return; if (onNotify) this.onNotify = onNotify; if (!this.onNotify) return; + // Clean up stale subscription from previous connection (reconnect scenario). + // After NATS reconnects, the old Subscription object is dead but non-null. 
+ if (this.sub) { + try { + this.sub.unsubscribe(); + } catch { + // ignore — connection may already be closed + } + this.sub = null; + } + const nc = this.options.getConnection(); if (!nc) return; // NATS not ready — polling strategy is safety net From 047175bd9943d197e9f7ee0d11ded914e4371810 Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:23:22 -0300 Subject: [PATCH 10/16] fix(database): add advisory lock to plugin migrations Uses pg_advisory_xact_lock on a pinned connection to prevent race conditions when multiple pods start simultaneously. Transaction-scoped lock auto-releases on connection return, safe with connection poolers. --- apps/mesh/src/database/migrate.ts | 106 ++++++++++++++++-------------- 1 file changed, 57 insertions(+), 49 deletions(-) diff --git a/apps/mesh/src/database/migrate.ts b/apps/mesh/src/database/migrate.ts index e5d2137153..82c93f0909 100644 --- a/apps/mesh/src/database/migrate.ts +++ b/apps/mesh/src/database/migrate.ts @@ -153,63 +153,71 @@ async function runPluginMigrations(db: Kysely): Promise { return 0; } - // Note: plugin_migrations table and old record migration are handled - // in runKyselyMigrations() before Kysely's migrator runs - - // Get already executed migrations - const executed = await sql<{ plugin_id: string; name: string }>` - SELECT plugin_id, name FROM plugin_migrations - `.execute(db); - const executedSet = new Set( - executed.rows.map((r) => `${r.plugin_id}/${r.name}`), - ); - - // Group migrations by plugin - const migrationsByPlugin = new Map< - string, - Array<{ - name: string; - // eslint-disable-next-line @typescript-eslint/no-explicit-any - up: (db: any) => Promise; - }> - >(); - - for (const { pluginId, migration } of pluginMigrations) { - if (!migrationsByPlugin.has(pluginId)) { - migrationsByPlugin.set(pluginId, []); + // Use a transaction-scoped advisory lock to prevent concurrent execution. + // Must use db.connection() to pin to a single connection (pool-safe). 
+ // Lock ID 73649281 is a fixed constant for plugin migrations. + return await db.connection().execute(async (conn) => { + await sql`SELECT pg_advisory_xact_lock(73649281)`.execute(conn); + + // Note: plugin_migrations table and old record migration are handled + // in runKyselyMigrations() before Kysely's migrator runs + + // Get already executed migrations + const executed = await sql<{ plugin_id: string; name: string }>` + SELECT plugin_id, name FROM plugin_migrations + `.execute(conn); + const executedSet = new Set( + executed.rows.map((r) => `${r.plugin_id}/${r.name}`), + ); + + // Group migrations by plugin + const migrationsByPlugin = new Map< + string, + Array<{ + name: string; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + up: (db: any) => Promise; + }> + >(); + + for (const { pluginId, migration } of pluginMigrations) { + if (!migrationsByPlugin.has(pluginId)) { + migrationsByPlugin.set(pluginId, []); + } + migrationsByPlugin.get(pluginId)!.push({ + name: migration.name, + up: migration.up, + }); } - migrationsByPlugin.get(pluginId)!.push({ - name: migration.name, - up: migration.up, - }); - } - // Run pending migrations for each plugin - let totalPending = 0; + // Run pending migrations for each plugin + let totalPending = 0; - for (const [pluginId, pluginMigrationList] of migrationsByPlugin) { - // Sort by name to ensure consistent order - pluginMigrationList.sort((a, b) => a.name.localeCompare(b.name)); + for (const [pluginId, pluginMigrationList] of migrationsByPlugin) { + // Sort by name to ensure consistent order + pluginMigrationList.sort((a, b) => a.name.localeCompare(b.name)); - for (const migration of pluginMigrationList) { - const key = `${pluginId}/${migration.name}`; - if (executedSet.has(key)) { - continue; // Already executed - } + for (const migration of pluginMigrationList) { + const key = `${pluginId}/${migration.name}`; + if (executedSet.has(key)) { + continue; // Already executed + } - totalPending++; - await 
migration.up(db); + totalPending++; + await migration.up(conn); - // Record as executed - const timestamp = new Date().toISOString(); - await sql` - INSERT INTO plugin_migrations (plugin_id, name, timestamp) - VALUES (${pluginId}, ${migration.name}, ${timestamp}) - `.execute(db); + // Record as executed + const timestamp = new Date().toISOString(); + await sql` + INSERT INTO plugin_migrations (plugin_id, name, timestamp) + VALUES (${pluginId}, ${migration.name}, ${timestamp}) + `.execute(conn); + } } - } - return totalPending; + return totalPending; + }); + // Advisory lock is automatically released when the connection returns to pool. } // ============================================================================ From 51a1a872581c368df6702292ff222a76bfd3c8aa Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:23:55 -0300 Subject: [PATCH 11/16] fix(shutdown): align force-exit timeout with 65s termination grace period The Helm chart's terminationGracePeriodSeconds increases from 60 to 65 to accommodate a 5s preStop hook. The app's internal hard timeout increases from 55s to 58s to stay within the grace period. 
--- apps/mesh/src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/mesh/src/index.ts b/apps/mesh/src/index.ts index 48e31a3bfd..12020a6bcc 100644 --- a/apps/mesh/src/index.ts +++ b/apps/mesh/src/index.ts @@ -156,9 +156,9 @@ async function gracefulShutdown(signal: string) { console.log(`\n[shutdown] Received ${signal}, shutting down gracefully...`); const forceExitTimer = setTimeout(() => { - console.error("[shutdown] Timed out after 55s, forcing exit."); + console.error("[shutdown] Timed out after 58s, forcing exit."); process.exit(1); - }, 55_000); + }, 58_000); forceExitTimer.unref?.(); let exitCode = 0; From 397c67091693c9ba234c33fce7ef2dc11cd5e9ab Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:23:57 -0300 Subject: [PATCH 12/16] feat(nats): tune client connection for faster dead-connection detection - pingInterval: 20s (was 120s default) for faster dead connection detection - reconnectJitter: 500ms to prevent thundering herd on reconnect - connection name for debugging in NATS monitoring --- apps/mesh/src/nats/connection.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/apps/mesh/src/nats/connection.ts b/apps/mesh/src/nats/connection.ts index a915eca534..22837f3776 100644 --- a/apps/mesh/src/nats/connection.ts +++ b/apps/mesh/src/nats/connection.ts @@ -178,7 +178,15 @@ function defaultConnect(opts: { reconnect: boolean; maxReconnectAttempts: number; }): Promise { - return connect(opts); + return connect({ + ...opts, + pingInterval: 20_000, + maxPingOut: 3, + reconnectTimeWait: 1_000, + reconnectJitter: 500, + reconnectJitterTLS: 1_000, + name: "mesh-app", + }); } function sleep(ms: number): Promise { From ff7c75291695987e2649eba80cb9d86de4f480f0 Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:25:52 -0300 Subject: [PATCH 13/16] fix(test): resolve type errors in nats-notify test --- apps/mesh/src/event-bus/nats-notify.test.ts | 6 +++--- 1 file changed, 3 insertions(+), 
3 deletions(-) diff --git a/apps/mesh/src/event-bus/nats-notify.test.ts b/apps/mesh/src/event-bus/nats-notify.test.ts index 52526b6b1c..6181838e7b 100644 --- a/apps/mesh/src/event-bus/nats-notify.test.ts +++ b/apps/mesh/src/event-bus/nats-notify.test.ts @@ -8,7 +8,7 @@ function createMockConnection(): { } { const subs: Subscription[] = []; const nc = { - subscribe: mock((subject: string) => { + subscribe: mock((_subject: string) => { let resolveIterator: (() => void) | null = null; const sub: Subscription = { unsubscribe: mock(() => { @@ -57,7 +57,7 @@ describe("NatsNotifyStrategy", () => { await strategy.start(); expect(nc.subscribe).toHaveBeenCalledTimes(2); // Old subscription should have been unsubscribed - expect(subs[0].unsubscribe).toHaveBeenCalledTimes(1); + expect(subs[0]!.unsubscribe).toHaveBeenCalledTimes(1); }); test("stop() cleans up subscription", async () => { @@ -67,7 +67,7 @@ describe("NatsNotifyStrategy", () => { await strategy.start(() => {}); await strategy.stop(); - expect(subs[0].unsubscribe).toHaveBeenCalledTimes(1); + expect(subs[0]!.unsubscribe).toHaveBeenCalledTimes(1); }); test("notify() publishes to correct subject", async () => { From e0f6797a02551ed9fa46ef94a6a2cb0700922a0e Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:26:54 -0300 Subject: [PATCH 14/16] feat(helm): add production values example overlay --- deploy/helm/values-production.example.yaml | 126 +++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 deploy/helm/values-production.example.yaml diff --git a/deploy/helm/values-production.example.yaml b/deploy/helm/values-production.example.yaml new file mode 100644 index 0000000000..d2d803b797 --- /dev/null +++ b/deploy/helm/values-production.example.yaml @@ -0,0 +1,126 @@ +# Production values overlay for MCP Mesh HA deployment. +# Contains ONLY values that differ from defaults in values.yaml. 
+# Usage: helm install mesh deploy/helm/ -f deploy/helm/values-production.example.yaml +# +# Prerequisites: +# - PostgreSQL (RDS Multi-AZ, CloudNativePG, or Cloud SQL HA) +# - External Secrets Operator for secret management +# - cert-manager for TLS certificates (optional) +# - 3+ AZ cluster with 6+ nodes + +replicaCount: 3 + +# --- Database (required for multi-replica) --- +database: + engine: postgresql + # URL provided via secret.secretName below + +# --- Secret (managed by External Secrets Operator) --- +secret: + secretName: "mesh-production-secrets" + +# --- Autoscaling --- +autoscaling: + enabled: true + minReplicas: 3 + maxReplicas: 6 + behavior: + scaleUp: + stabilizationWindowSeconds: 30 + policies: + - type: Percent + value: 100 + periodSeconds: 30 + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 10 + periodSeconds: 60 + +# --- Topology: use DoNotSchedule for zone spread in 3+ AZ clusters --- +topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio # Change to your release name + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + app.kubernetes.io/instance: deco-studio # Change to your release name + +# --- NATS: 3-node cluster for production --- +nats: + enabled: true + config: + cluster: + enabled: true + replicas: 3 + jetstream: + enabled: true + memoryStore: + enabled: true + maxSize: 512Mi + fileStore: + enabled: true + pvc: + enabled: true + size: 5Gi + podTemplate: + merge: + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: ["nats"] + topologyKey: kubernetes.io/hostname + +# --- Ingress 
(example with NGINX + SSE support) --- +ingress: + enabled: true + className: "nginx" + annotations: + nginx.ingress.kubernetes.io/proxy-buffering: "off" + nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-body-size: "10m" + nginx.ingress.kubernetes.io/proxy-http-version: "1.1" + cert-manager.io/cluster-issuer: "letsencrypt-prod" + hosts: + - host: mesh.example.com # CHANGE THIS + paths: + - path: / + pathType: Prefix + tls: + - secretName: mesh-tls + hosts: + - mesh.example.com # CHANGE THIS + +# --- ConfigMap overrides --- +configMap: + meshConfig: + NODE_ENV: "production" + PORT: "3000" + HOST: "0.0.0.0" + BETTER_AUTH_URL: "https://mesh.example.com" # CHANGE THIS + BASE_URL: "https://mesh.example.com" # CHANGE THIS + DATA_DIR: "/app/data" + DECO_AI_GATEWAY_ENABLED: "false" + +# --- OTel (optional) --- +otel: + enabled: true + protocol: "http/protobuf" + service: "mcp-mesh" + collector: + enabled: true From fb4c8ba42caca81a3ac903e83681f5daf3b72d4b Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:27:11 -0300 Subject: [PATCH 15/16] docs(infra): add infrastructure reference manifests Includes External Secrets Operator (v1 API), cert-manager ClusterIssuer, and NetworkPolicy example for production HA deployment. 
--- deploy/infrastructure/README.md | 37 +++++++++++++ deploy/infrastructure/cert-issuer.yaml | 20 +++++++ deploy/infrastructure/external-secret.yaml | 32 +++++++++++ deploy/infrastructure/networkpolicy.yaml | 63 ++++++++++++++++++++++ 4 files changed, 152 insertions(+) create mode 100644 deploy/infrastructure/README.md create mode 100644 deploy/infrastructure/cert-issuer.yaml create mode 100644 deploy/infrastructure/external-secret.yaml create mode 100644 deploy/infrastructure/networkpolicy.yaml diff --git a/deploy/infrastructure/README.md b/deploy/infrastructure/README.md new file mode 100644 index 0000000000..9e04b0a2fe --- /dev/null +++ b/deploy/infrastructure/README.md @@ -0,0 +1,37 @@ +# Infrastructure Setup for MCP Mesh HA + +Reference manifests for external infrastructure components needed for +production HA deployment of MCP Mesh. Adapt to your environment. + +## Prerequisites + +1. **Kubernetes cluster** with 3+ AZs +2. **PostgreSQL** -- RDS Multi-AZ, Cloud SQL HA, or CloudNativePG +3. **External Secrets Operator** -- for secret management +4. **cert-manager** -- for TLS certificate automation (optional) +5. **Ingress controller** -- NGINX Ingress or Envoy-based (Contour) + +## Files + +| File | Description | +|------|-------------| +| `external-secret.yaml` | ExternalSecret for AWS Secrets Manager | +| `cert-issuer.yaml` | cert-manager ClusterIssuer for Let's Encrypt | +| `networkpolicy.yaml` | NetworkPolicy example (adapt to your CNI) | + +## Deployment + +1. Apply infrastructure manifests (adapt first): + ```bash + kubectl apply -f deploy/infrastructure/cert-issuer.yaml + kubectl apply -f deploy/infrastructure/external-secret.yaml -n mesh-production + kubectl apply -f deploy/infrastructure/networkpolicy.yaml -n mesh-production + ``` + +2. Deploy MCP Mesh with production values: + ```bash + helm install mesh deploy/helm/ \ + -f deploy/helm/values-production.example.yaml \ + --set database.url=postgresql://... 
\ + -n mesh-production + ``` diff --git a/deploy/infrastructure/cert-issuer.yaml b/deploy/infrastructure/cert-issuer.yaml new file mode 100644 index 0000000000..de358e0a4a --- /dev/null +++ b/deploy/infrastructure/cert-issuer.yaml @@ -0,0 +1,20 @@ +# cert-manager ClusterIssuer for Let's Encrypt TLS certificates. +# Prerequisites: +# - cert-manager installed (helm install cert-manager jetstack/cert-manager --set crds.enabled=true) +# +# Usage: kubectl apply -f deploy/infrastructure/cert-issuer.yaml +--- +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: ops@example.com # CHANGE THIS + privateKeySecretRef: + name: letsencrypt-prod + solvers: + - http01: + ingress: + ingressClassName: nginx diff --git a/deploy/infrastructure/external-secret.yaml b/deploy/infrastructure/external-secret.yaml new file mode 100644 index 0000000000..11911c23d4 --- /dev/null +++ b/deploy/infrastructure/external-secret.yaml @@ -0,0 +1,32 @@ +# External Secrets Operator: sync secrets from AWS Secrets Manager. 
+# Prerequisites: +# - External Secrets Operator installed +# - ClusterSecretStore configured for AWS Secrets Manager +# +# Usage: kubectl apply -f deploy/infrastructure/external-secret.yaml -n mesh-production +--- +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: mesh-production-secrets +spec: + refreshInterval: 5m + secretStoreRef: + name: aws-secrets-manager + kind: ClusterSecretStore + target: + name: mesh-production-secrets + creationPolicy: Owner + data: + - secretKey: BETTER_AUTH_SECRET + remoteRef: + key: mesh/production/auth + property: secret + - secretKey: ENCRYPTION_KEY + remoteRef: + key: mesh/production/encryption + property: key + - secretKey: DATABASE_URL + remoteRef: + key: mesh/production/database + property: url diff --git a/deploy/infrastructure/networkpolicy.yaml b/deploy/infrastructure/networkpolicy.yaml new file mode 100644 index 0000000000..8627055af6 --- /dev/null +++ b/deploy/infrastructure/networkpolicy.yaml @@ -0,0 +1,63 @@ +# NetworkPolicy example for MCP Mesh. +# This is highly environment-specific -- adapt to your cluster's CNI, +# ingress controller, and network layout before applying. 
+# +# Usage: kubectl apply -f deploy/infrastructure/networkpolicy.yaml -n mesh-production +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: mcp-mesh +spec: + podSelector: + matchLabels: + app.kubernetes.io/name: chart-deco-studio + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: ingress-nginx + ports: + - port: 3000 + protocol: TCP + - ports: + - port: 3000 + protocol: TCP + egress: + - to: + - podSelector: + matchLabels: + app.kubernetes.io/name: nats + ports: + - port: 4222 + protocol: TCP + - to: + - ipBlock: + cidr: 10.0.0.0/8 + ports: + - port: 5432 + protocol: TCP + - to: + - namespaceSelector: {} + podSelector: + matchLabels: + k8s-app: kube-dns + ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP + - to: + - ipBlock: + cidr: 0.0.0.0/0 + except: + - 169.254.169.254/32 + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + ports: + - port: 443 + protocol: TCP From b489facda59aefbe6adb6d18d79bffe14b8c907a Mon Sep 17 00:00:00 2001 From: gimenes Date: Tue, 7 Apr 2026 21:30:37 -0300 Subject: [PATCH 16/16] fix(database): use transaction for advisory lock (not just pinned connection) db.connection().execute() pins the connection but does NOT wrap in a transaction. pg_advisory_xact_lock releases at transaction end, so with auto-commit each statement gets its own implicit transaction and the lock releases immediately. db.transaction().execute() both pins the connection AND wraps in a transaction, making the lock effective. 
--- apps/mesh/src/database/migrate.ts | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/apps/mesh/src/database/migrate.ts b/apps/mesh/src/database/migrate.ts index 82c93f0909..395dea5fb7 100644 --- a/apps/mesh/src/database/migrate.ts +++ b/apps/mesh/src/database/migrate.ts @@ -154,10 +154,11 @@ async function runPluginMigrations(db: Kysely): Promise { } // Use a transaction-scoped advisory lock to prevent concurrent execution. - // Must use db.connection() to pin to a single connection (pool-safe). + // db.transaction() both pins the connection AND wraps in a transaction, + // so pg_advisory_xact_lock holds until the transaction commits. // Lock ID 73649281 is a fixed constant for plugin migrations. - return await db.connection().execute(async (conn) => { - await sql`SELECT pg_advisory_xact_lock(73649281)`.execute(conn); + return await db.transaction().execute(async (trx) => { + await sql`SELECT pg_advisory_xact_lock(73649281)`.execute(trx); // Note: plugin_migrations table and old record migration are handled // in runKyselyMigrations() before Kysely's migrator runs @@ -165,7 +166,7 @@ async function runPluginMigrations(db: Kysely): Promise { // Get already executed migrations const executed = await sql<{ plugin_id: string; name: string }>` SELECT plugin_id, name FROM plugin_migrations - `.execute(conn); + `.execute(trx); const executedSet = new Set( executed.rows.map((r) => `${r.plugin_id}/${r.name}`), ); @@ -204,14 +205,14 @@ async function runPluginMigrations(db: Kysely): Promise { } totalPending++; - await migration.up(conn); + await migration.up(trx); // Record as executed const timestamp = new Date().toISOString(); await sql` INSERT INTO plugin_migrations (plugin_id, name, timestamp) VALUES (${pluginId}, ${migration.name}, ${timestamp}) - `.execute(conn); + `.execute(trx); } }