-
Notifications
You must be signed in to change notification settings - Fork 1
235 lines (216 loc) · 11 KB
/
deploy.yml
File metadata and controls
235 lines (216 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
name: Deploy (prod)
# Deploy pipeline for the Azure VM stack. Runs on pushes to main that touch
# deploy-relevant paths, and on manual dispatch (workflow_dispatch).
# - backend/**, docker-compose*.yml or infra/scripts/** changes -> build + push
#   codetutor-backend to GHCR, pull it on the VM, and bring the backend up via
#   infra/scripts/vm-deploy-backend.sh
# - runner-image/** changes -> build + push codetutor-runner to GHCR, pull on
#   the VM (no restart; new sessions use it)
#
# Auth: OIDC federation with an Azure SP scoped to codetutor-ai-prod-rg.
# No long-lived Azure creds; GHCR push uses the workflow's built-in GITHUB_TOKEN.
#
# First deploy creates the GHCR packages as private. Flip them public manually
# (UI or `gh api --method PATCH /user/packages/container/<name>/visibility -f visibility=public`)
# so the VM can pull anonymously.
on:
  # Automatic trigger: a push to main that touches anything deploy-relevant.
  # Which job actually runs is decided later by the path filter in the
  # `changes` job.
  push:
    branches:
      - main
    paths:
      - "backend/**"
      - "runner-image/**"
      - "docker-compose.yml"
      - "docker-compose.prod.yml"
      - "infra/scripts/**"
      - ".github/workflows/deploy.yml"
  # Manual trigger: the operator picks which image to rebuild and redeploy.
  workflow_dispatch:
    inputs:
      service:
        description: "Which image to rebuild + redeploy"
        type: choice
        required: true
        default: backend
        options:
          - backend
          - runner-image
          - both
concurrency:
  # S-15 (bucket 7): every run of this workflow joins one named group, and
  # cancel-in-progress is off, so a newer run queues behind the one in flight
  # instead of cancelling it. The point is to keep two deploys from racing on
  # the same VM: both the backend and the runner deploy steps execute
  # `git reset --hard origin/main` there, and one could reset the tree out
  # from under the other while it is still deploying.
  # Note this is workflow-level: it orders whole runs. It does not by itself
  # order the backend and runner jobs inside a single run.
  group: "deploy-prod"
  cancel-in-progress: false
permissions:
  # Read-only checkout of the repository.
  contents: read
  # Required by azure/login to perform the OIDC token exchange.
  id-token: write
  # Required to push images to GHCR.
  packages: write
env:
  # GHCR repositories the build steps push to.
  BACKEND_IMAGE: "ghcr.io/msrivas-7/codetutor-backend"
  RUNNER_IMAGE: "ghcr.io/msrivas-7/codetutor-runner"
  # Azure resource group and VM targeted by `az vm run-command invoke`.
  AZURE_RG: "codetutor-ai-prod-rg"
  AZURE_VM: "codetutor-ai-vm"
jobs:
  # Works out which image(s) a push touched; the two deploy jobs below read
  # these outputs in their `if:` conditions.
  changes:
    name: Detect changed paths
    runs-on: ubuntu-latest
    outputs:
      backend: ${{ steps.filter.outputs.backend }}
      runner: ${{ steps.filter.outputs.runner }}
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
      - uses: dorny/paths-filter@d1c1ffe0248fe513906c8e24db8ea791d46f8590  # v3
        id: filter
        with:
          # The backend filter is wider than backend/**: compose files and
          # infra/scripts also count as backend changes.
          filters: |
            backend:
              - 'backend/**'
              - 'docker-compose.yml'
              - 'docker-compose.prod.yml'
              - 'infra/scripts/**'
            runner:
              - 'runner-image/**'

  # Builds and pushes the backend image, then deploys it on the VM.
  build-backend:
    name: Build backend + deploy
    needs: changes
    if: |
      needs.changes.outputs.backend == 'true' ||
      (github.event_name == 'workflow_dispatch' && contains(fromJSON('["backend", "both"]'), inputs.service))
    runs-on: ubuntu-latest
    # Job-level group shared with build-runner. Both jobs depend only on
    # `changes`, so within one run (a push touching both trees, or a dispatch
    # with service=both) they would otherwise start in parallel and both run
    # `git reset --hard` / `az vm run-command` against the same VM. The
    # workflow-level concurrency group orders runs, not jobs inside a run.
    # The name must differ from the workflow-level group (deploy-prod) so the
    # job does not queue behind its own run.
    concurrency:
      group: deploy-prod-vm
      cancel-in-progress: false
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
      - name: Verify Supabase migrations applied to prod
        # SRE-C1 from audit-v2. Prevents the repeat incident "merge a migration,
        # forget to run `supabase db push` against prod, deploy, backend 500s on
        # the first write that touches the missing table." Compares the max
        # version in supabase_migrations.schema_migrations against the newest
        # file in supabase/migrations/. Fails the deploy on mismatch so the
        # operator has to push the migration before re-running.
        #
        # Requires PROD_DATABASE_URL repo secret (direct Supabase connection
        # string, NOT the pooler URL — schema_migrations is in the internal
        # supabase schema). If the secret isn't set, skips with a warning so
        # the deploy still works until it's wired up.
        env:
          PROD_DATABASE_URL: ${{ secrets.PROD_DATABASE_URL }}
        run: |
          if [ -z "$PROD_DATABASE_URL" ]; then
            echo "::warning::PROD_DATABASE_URL secret not set — skipping migration drift check. Add it under repo Settings → Secrets → Actions to enable."
            exit 0
          fi
          # Newest migration in the repo = largest leading numeric prefix.
          latest=$(ls -1 supabase/migrations 2>/dev/null | grep -oE '^[0-9]+' | sort | tail -1)
          if [ -z "$latest" ]; then
            echo "No migrations in repo — nothing to verify."
            exit 0
          fi
          # The default run shell is `bash -e`, so a bare `applied=$(psql ...)`
          # would abort the step as soon as psql exits non-zero — before the
          # error below is printed, and with psql's message (captured in
          # $applied) never shown. Testing the assignment inside `if` keeps
          # that failure path reachable.
          if ! applied=$(PGCONNECT_TIMEOUT=10 psql "$PROD_DATABASE_URL" -At -c "SELECT coalesce(max(version), '0') FROM supabase_migrations.schema_migrations" 2>&1) \
            || ! echo "$applied" | grep -qE '^[0-9]+$'; then
            echo "::error::Failed to query prod schema_migrations: $applied"
            exit 1
          fi
          if [ "$applied" != "$latest" ]; then
            echo "::error::Migration drift: applied=$applied, latest in repo=$latest. Run: npx supabase db push --db-url \"\$PROD_DATABASE_URL\" — then re-run this workflow."
            exit 1
          fi
          echo "✓ Migrations up-to-date (version $applied)"
      - uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
      - name: Log in to GHCR
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9  # v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build + push backend image
        # Context is repo root (not ./backend) because the Dockerfile COPYs
        # paths like `backend/package.json` — matching how docker-compose
        # builds on the VM (context: .).
        uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8  # v6
        with:
          context: .
          file: ./backend/Dockerfile
          push: true
          tags: |
            ${{ env.BACKEND_IMAGE }}:${{ github.sha }}
            ${{ env.BACKEND_IMAGE }}:latest
          cache-from: type=gha,scope=backend
          cache-to: type=gha,scope=backend,mode=max
      - name: Azure login (OIDC)
        uses: azure/login@a457da9ea143d694b1b9c7c869ebb04ebe844ef5  # v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
      - name: Sync repo, pull image, deploy with rollback
        # az vm run-command reports "Succeeded" on invocation even if the
        # inner script fails, so we grep for a sentinel at the end. The VM-
        # side script (infra/scripts/vm-deploy-backend.sh) does a targeted
        # `docker compose up -d backend` instead of systemctl restart — Caddy
        # stays up so TLS connections aren't dropped — and rolls back to the
        # prior SHA + prior :latest image if /api/health doesn't come green.
        # PREV_SHA is captured BEFORE the forward reset so the script can
        # reverse-reset to it on failure. NEW_SHA is the git HEAD AFTER reset
        # (not github.sha), i.e. whatever origin/main points at when the VM
        # fetches. The script pulls that tag directly instead of the floating
        # :latest.
        # NOTE(review): the build step above tags the image with github.sha
        # only. If main has moved on to a commit for which no backend image
        # was built, NEW_SHA has no matching tag — confirm how
        # vm-deploy-backend.sh behaves in that case.
        # SRE-H2: (1) the `git diff --quiet HEAD` precheck refuses to deploy if
        # tracked files on the VM differ from HEAD (an oncall hotfix). The
        # prior behaviour silently `git reset --hard`-ed them away — fine for
        # a routine deploy, disastrous if the "hotfix" was an active mitigation
        # for an ongoing incident. Abort loudly instead; operator can `git
        # stash` or commit + push before re-running. Untracked files are not
        # detected by this check. (2) the step is capped at 12 minutes so an
        # unreachable VM fails the workflow promptly instead of hanging on the
        # default `az vm run-command` client timeout.
        timeout-minutes: 12
        run: |
          out=$(az vm run-command invoke \
            -g "$AZURE_RG" -n "$AZURE_VM" \
            --command-id RunShellScript \
            --scripts 'set -e; cd /opt/codetutor && if ! sudo -u codetutor git diff --quiet HEAD; then echo "DEPLOY_ABORT: dirty working tree on VM. SSH in and stash/commit/reset manually before redeploying."; sudo -u codetutor git status --short; exit 1; fi && sudo -u codetutor git fetch origin main && PREV_SHA=$(sudo -u codetutor git rev-parse HEAD) && sudo -u codetutor git reset --hard origin/main && NEW_SHA=$(sudo -u codetutor git rev-parse HEAD) && bash infra/scripts/vm-deploy-backend.sh "$PREV_SHA" "$NEW_SHA"' \
            --query "value[0].message" -o tsv)
          echo "$out"
          # The sentinel is not emitted in this file; the VM-side script is
          # expected to print DEPLOY_OK on success.
          echo "$out" | grep -q DEPLOY_OK || { echo "::error::deploy sentinel missing - deploy failed or rolled back"; exit 1; }

  # Builds and pushes the runner image, then pre-pulls it on the VM.
  build-runner:
    name: Build runner-image + push
    needs: changes
    if: |
      needs.changes.outputs.runner == 'true' ||
      (github.event_name == 'workflow_dispatch' && contains(fromJSON('["runner-image", "both"]'), inputs.service))
    runs-on: ubuntu-latest
    # Same job-level group as build-backend so the two jobs of one run never
    # touch the VM at the same time. The name must differ from the
    # workflow-level group (deploy-prod).
    concurrency:
      group: deploy-prod-vm
      cancel-in-progress: false
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
      - uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
      - name: Log in to GHCR
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9  # v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build + push runner image
        uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8  # v6
        with:
          context: ./runner-image
          push: true
          tags: |
            ${{ env.RUNNER_IMAGE }}:${{ github.sha }}
            ${{ env.RUNNER_IMAGE }}:latest
          cache-from: type=gha,scope=runner
          cache-to: type=gha,scope=runner,mode=max
      - name: Azure login (OIDC)
        uses: azure/login@a457da9ea143d694b1b9c7c869ebb04ebe844ef5  # v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
      - name: Sync repo, pull runner image (no restart; new sessions pick it up)
        # Pulls the image tagged with the VM's post-reset HEAD (NEW_SHA) and
        # retags it to :latest locally so the compose env file (which refers
        # to :latest) reuses an immutable SHA image instead of whatever GHCR's
        # :latest currently points at. Matches the backend job's pinning
        # approach. New sessions pick up the retagged :latest on their next
        # create.
        # The image name is spelled out inside the single-quoted VM script, so
        # it must be kept in sync with RUNNER_IMAGE by hand.
        # NOTE(review): the build step tags with github.sha; if origin/main has
        # moved to a commit with no runner image, `docker pull ...:$NEW_SHA`
        # fails and DEPLOY_OK is never printed — confirm this is intended.
        # SRE-H2: same dirty-tree precheck (`git diff --quiet HEAD`) and
        # 12-minute step timeout as the backend job.
        timeout-minutes: 12
        run: |
          out=$(az vm run-command invoke \
            -g "$AZURE_RG" -n "$AZURE_VM" \
            --command-id RunShellScript \
            --scripts 'set -e; cd /opt/codetutor && if ! sudo -u codetutor git diff --quiet HEAD; then echo "DEPLOY_ABORT: dirty working tree on VM."; sudo -u codetutor git status --short; exit 1; fi && sudo -u codetutor git fetch origin main && sudo -u codetutor git reset --hard origin/main && NEW_SHA=$(sudo -u codetutor git rev-parse HEAD) && sudo -u codetutor docker pull ghcr.io/msrivas-7/codetutor-runner:$NEW_SHA && sudo -u codetutor docker tag ghcr.io/msrivas-7/codetutor-runner:$NEW_SHA ghcr.io/msrivas-7/codetutor-runner:latest && echo DEPLOY_OK' \
            --query "value[0].message" -o tsv)
          echo "$out"
          echo "$out" | grep -q DEPLOY_OK || { echo "::error::deploy sentinel missing - deploy failed on VM"; exit 1; }