From 5aa6eaa60b0591e2ffbda2aeea5d419db1d3fc98 Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Sun, 7 Jun 2026 17:09:51 +0530 Subject: [PATCH] fix(migrator): version-track ClickHouse migrations (unblock helm-deploy) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CH migration runner used to re-apply every file every deploy under the assumption that all SQL was idempotent (CREATE IF NOT EXISTS). That contract broke with 0006_tenant_columns.sql, which does CREATE-INSERT-RENAME — not safe to run twice. The result was helm-deploy failing the pre-upgrade migrator hook with ``BackoffLimitExceeded`` because the rename targeted an already-existing ``run_v1``. Mirror the postgres pattern: - 0000_schema_migrations.sql creates a CH ``schema_migrations`` table (sorted first via the 0000 prefix). The CREATE IF NOT EXISTS is the one piece that still has to be self-idempotent. - run.sh applies 0000 unconditionally, then queries ``schema_migrations`` and skips any other file whose basename is already present. After a successful apply, it inserts the version. - Each migration file may now use any DDL — CREATE / ALTER / RENAME / DROP — without needing to be self-idempotent. For an existing cluster that's already past 0001-0007, manually backfill before deploying: insert into schema_migrations (version) values ('0001_runs_and_spans'), ('0002_eval_scores'), ('0003_replay_captures'), ('0004_dataset_items'), ('0005_billing_meters'), ('0006_tenant_columns'), ('0007_audit_log'); (Already done on the live GKE cluster as part of this fix.) Verified locally: built the migrator image, reproduced the ``run_v1 already exists`` failure on first run, backfilled tracking, ran twice in a row — both pass with ``applied 0, skipped 7``. Signed-off-by: Gaurav Dubey Signed-off-by: gaurav0107 --- schemas/clickhouse/0000_schema_migrations.sql | 22 +++++ services/migrator/run.sh | 85 ++++++++++++++----- 2 files changed, 85 insertions(+), 22 deletions(-) create mode 100644 schemas/clickhouse/0000_schema_migrations.sql diff --git a/schemas/clickhouse/0000_schema_migrations.sql b/schemas/clickhouse/0000_schema_migrations.sql new file mode 100644 index 0000000..f446b58 --- /dev/null +++ b/schemas/clickhouse/0000_schema_migrations.sql @@ -0,0 +1,22 @@ +-- 0000_schema_migrations.sql +-- Bootstrap version-tracking table for ClickHouse migrations. +-- +-- Numbered 0000 (instead of 0001+) so it sorts first under glob ordering +-- and is always applied before any other migration. The migrator runner +-- (services/migrator/run.sh) treats this file specially: it always runs, +-- consults the resulting `schema_migrations` table to learn which other +-- migrations have already been applied, and skips those. +-- +-- This unblocks non-idempotent migrations (CREATE-INSERT-RENAME, ALTER, +-- DROP), which the previous "always re-apply with CREATE IF NOT EXISTS" +-- contract couldn't support. See 0006_tenant_columns.sql for the case +-- that motivated this. + +create table if not exists schema_migrations +( + version String, + applied_at DateTime64(9, 'UTC') default now64(9) +) +engine = MergeTree +order by version +settings index_granularity = 8192; diff --git a/services/migrator/run.sh b/services/migrator/run.sh index c9f59d0..6df7d0d 100755 --- a/services/migrator/run.sh +++ b/services/migrator/run.sh @@ -22,9 +22,13 @@ for f in /schemas/postgres/*.sql; do done echo "==> Postgres: applied $count_applied, skipped $count_skipped" -# All ClickHouse migrations must be idempotent (CREATE TABLE IF NOT EXISTS). -# ALTER migrations require a version-tracking table before being added. -echo "==> ClickHouse: applying all migrations (CREATE IF NOT EXISTS)" +# ClickHouse migrations are version-tracked via the `schema_migrations` +# table (created by 0000_schema_migrations.sql, always run first; its +# CREATE IF NOT EXISTS is itself idempotent). Subsequent migration files +# only run if their version isn't already in schema_migrations. Each file +# may use any DDL — CREATE, ALTER, RENAME, DROP — without needing to be +# self-idempotent. +echo "==> ClickHouse: applying migrations with version tracking" # We talk to ClickHouse over HTTP rather than the native binary protocol, # because the DSN we get (TRACEBILITY_CLICKHOUSE_URL) already targets the @@ -41,30 +45,67 @@ else ch_endpoint="$CH_URL" # already has /?... form, send as-is fi -# ClickHouse's HTTP endpoint doesn't accept multi-statement bodies (no -# `multi_statements` setting in 24.x), so split each .sql file into -# individual statements on `;` followed by newline, drop empty/whitespace- -# only chunks, and POST each one separately. -ch_total=0 -for f in /schemas/clickhouse/*.sql; do - echo " apply: $(basename "$f" .sql)" - # awk produces a NUL-delimited stream of statements; while-loop reads them - # without subshell issues. +# Helper: POST one statement to the CH HTTP endpoint, fail loud on non-200. +# Usage: ch_post "" "" +ch_post() { + local stmt="$1" + local ctx="$2" + local http_status + http_status=$(printf '%s' "$stmt" | curl -sS -o /tmp/ch_response -w "%{http_code}" \ + --data-binary @- "$ch_endpoint" || true) + if [ "$http_status" != "200" ]; then + echo "ERROR: ClickHouse returned HTTP $http_status on statement from $ctx" + cat /tmp/ch_response + exit 1 + fi +} + +# Apply 0000 unconditionally so the tracking table exists. +# ClickHouse HTTP endpoint doesn't accept multi-statement bodies (no +# `multi_statements` setting in 24.x), so split files on `;` followed by +# newline, drop empty/comment-only chunks, and POST each statement. +apply_file() { + local f="$1" while IFS= read -r -d '' stmt; do - # Skip pure-whitespace / comment-only chunks. case "$(printf '%s' "$stmt" | tr -d '[:space:]' | sed -E 's/--[^\\n]*//g')" in "") continue ;; esac - http_status=$(printf '%s' "$stmt" | curl -sS -o /tmp/ch_response -w "%{http_code}" \ - --data-binary @- "$ch_endpoint" || true) - if [ "$http_status" != "200" ]; then - echo "ERROR: ClickHouse returned HTTP $http_status on statement from $(basename "$f")" - cat /tmp/ch_response - exit 1 - fi + ch_post "$stmt" "$(basename "$f")" done < <(awk 'BEGIN{RS=";\n"; ORS="\0"} { print }' "$f") - ch_total=$((ch_total + 1)) +} + +bootstrap_file="/schemas/clickhouse/0000_schema_migrations.sql" +if [ -f "$bootstrap_file" ]; then + echo " bootstrap: $(basename "$bootstrap_file" .sql)" + apply_file "$bootstrap_file" +fi + +# Read which versions are already applied. The query format is TabSeparated; +# one version per line. An empty result is fine (fresh DB, just bootstrapped). +ch_applied="$(printf '%s' "select version from schema_migrations" | curl -sS \ + --data-binary @- "$ch_endpoint" || true)" + +ch_applied_count=0 +ch_skipped_count=0 +for f in /schemas/clickhouse/*.sql; do + v="$(basename "$f" .sql)" + # 0000 was bootstrapped above; record-if-missing so subsequent runs skip. + if [ "$v" = "0000_schema_migrations" ]; then + if ! printf '%s\n' "$ch_applied" | grep -qx "$v"; then + ch_post "insert into schema_migrations (version) values ('$v')" "$v (record bootstrap)" + fi + continue + fi + if printf '%s\n' "$ch_applied" | grep -qx "$v"; then + echo " skip: $v" + ch_skipped_count=$((ch_skipped_count + 1)) + continue + fi + echo " apply: $v" + apply_file "$f" + ch_post "insert into schema_migrations (version) values ('$v')" "$v (record apply)" + ch_applied_count=$((ch_applied_count + 1)) done -echo "==> ClickHouse: applied $ch_total file(s)" +echo "==> ClickHouse: applied $ch_applied_count, skipped $ch_skipped_count" echo "==> migrator: done"