From 0c02a2d7feb056cf603b2ac4f3767776727a7902 Mon Sep 17 00:00:00 2001
From: gaurav0107 <gauravdubey0107@gmail.com>
Date: Sun, 7 Jun 2026 17:29:54 +0530
Subject: [PATCH] fix(deploy): Recreate strategy for api+ingest-worker, bump
 helm timeout to 20m

The previous deploy hit ``Error: UPGRADE FAILED ... context deadline
exceeded`` after 10m. Cluster events showed:

    FailedScheduling pod/tracebility-web-...
    0/2 nodes are available: 1 Insufficient memory ...

GKE Autopilot's just-in-time node scaling kicks in when a rolling
update needs 2x the capacity of one pod (the default maxSurge of 25%
rounds up to 1 pod for a single replica). Scale-up takes ~5 min on
top of image pull + probes, so helm --wait at 10m wasn't enough. The
--atomic rollback then reverted everything to the previous SHA.

Two changes:

1. ``api`` and ``ingest-worker`` deployments switch to ``strategy:
   type: Recreate`` so a single-replica rollout doesn't briefly need
   2x capacity. ``ingest-api`` already uses Recreate (predates this
   PR). ``web`` keeps RollingUpdate because it fronts the LB and a
   brief outage there is user-visible.

2. ``--timeout 10m`` -> ``20m`` in the workflow so an Autopilot
   scale-up (rare now but possible) doesn't trip the gate.

Verified: ``helm template`` renders all three deployments (``api``,
``ingest-worker`` and the pre-existing ``ingest-api``) with the
Recreate strategy block.

Signed-off-by: Gaurav Dubey <gauravdubey0107@gmail.com>
Signed-off-by: gaurav0107 <gauravdubey0107@gmail.com>
---
 .github/workflows/deploy.yml                              | 2 +-
 deploy/helm/tracebility/templates/api-deployment.yaml     | 8 ++++++++
 .../tracebility/templates/ingest-worker-deployment.yaml   | 6 ++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 0aa629a..64865ea 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -140,7 +140,7 @@ jobs:
             --set image.tag="${{ steps.sha.outputs.sha }}" \
             --atomic \
             --wait \
-            --timeout 10m
+            --timeout 20m
 
       - name: Rollout status (sanity)
         run: |
diff --git a/deploy/helm/tracebility/templates/api-deployment.yaml b/deploy/helm/tracebility/templates/api-deployment.yaml
index 9b19b3e..b339d94 100644
--- a/deploy/helm/tracebility/templates/api-deployment.yaml
+++ b/deploy/helm/tracebility/templates/api-deployment.yaml
@@ -8,6 +8,14 @@ metadata:
     app.kubernetes.io/component: api
 spec:
   replicas: {{ .Values.api.replicaCount }}
+  # Recreate so a single-replica rollout doesn't briefly need 2x the
+  # capacity of one pod. With Autopilot's just-in-time node scaling,
+  # RollingUpdate's default maxSurge (25%, rounded up to 1 pod) forces a
+  # node scale-up on every deploy; that takes ~5min and pushes helm
+  # --wait past its timeout. A brief outage during rollout is acceptable
+  # for the api service: the LB routes to web, not to api directly.
+  strategy:
+    type: Recreate
   selector:
     matchLabels:
       {{- include "tracebility.api.selectorLabels" . | nindent 6 }}
diff --git a/deploy/helm/tracebility/templates/ingest-worker-deployment.yaml b/deploy/helm/tracebility/templates/ingest-worker-deployment.yaml
index ffb4ffb..1a30553 100644
--- a/deploy/helm/tracebility/templates/ingest-worker-deployment.yaml
+++ b/deploy/helm/tracebility/templates/ingest-worker-deployment.yaml
@@ -8,6 +8,12 @@ metadata:
     app.kubernetes.io/component: ingest-worker
 spec:
   replicas: {{ .Values.ingestWorker.replicaCount }}
+  # Recreate so a single-replica rollout doesn't need 2x capacity. The
+  # worker is a Redis-stream consumer; redelivery on a brief outage is
+  # already in the failure model (XACK after successful insert), so
+  # killing the old pod before starting the new one is safe.
+  strategy:
+    type: Recreate
   selector:
     matchLabels:
       {{- include "tracebility.ingestWorker.selectorLabels" . | nindent 6 }}