From 0e5d29286ab622f2b2109204762b234cc0692ea6 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 15 Jun 2026 06:40:04 -0700 Subject: [PATCH 1/2] engine/direct: Fix WAL corruption after two consecutive failed deploys Recovering a header-only WAL (a deploy that opened the WAL but committed nothing, e.g. a crash between UpgradeToWrite and Finalize) advanced the in-memory serial without persisting it. After a second such failure the next WAL header was written two serials ahead of the committed state, and every later command failed WAL recovery until the WAL was deleted by hand. Only advance the serial when the WAL carried entries, i.e. when the merged state is actually written back. Co-authored-by: Isaac --- NEXT_CHANGELOG.md | 1 + .../wal/two-crashed-deploys/databricks.yml | 14 +++++++ .../wal/two-crashed-deploys/out.test.toml | 3 ++ .../deploy/wal/two-crashed-deploys/output.txt | 39 +++++++++++++++++++ .../deploy/wal/two-crashed-deploys/script | 22 +++++++++++ .../deploy/wal/two-crashed-deploys/test.py | 1 + bundle/direct/dstate/state.go | 17 +++++++- bundle/direct/dstate/state_test.go | 35 +++++++++++++++++ 8 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 acceptance/bundle/deploy/wal/two-crashed-deploys/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/two-crashed-deploys/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/two-crashed-deploys/output.txt create mode 100644 acceptance/bundle/deploy/wal/two-crashed-deploys/script create mode 100644 acceptance/bundle/deploy/wal/two-crashed-deploys/test.py diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index f023b573438..0156a189e61 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -14,6 +14,7 @@ * Bundle variable references now accept Unicode letters in path segments (e.g. `${var.变量}`). ([#5532](https://github.com/databricks/cli/pull/5532)) * Ignore remote changes for vector search direct_access_index_spec.schema_json to prevent drift when the backend normalizes the schema ([#5481](https://github.com/databricks/cli/pull/5481)). * Remove hidden, never-functional `--existing-dashboard-id`, `--existing-dashboard-path`, `--existing-alert-id`, and `--existing-genie-space-id` alias flags from `bundle generate`; use the documented `--existing-id` / `--existing-path` flags instead ([#5591](https://github.com/databricks/cli/pull/5591)). +* engine/direct: Fix WAL corruption after two consecutive failed deploys ([#5557](https://github.com/databricks/cli/issues/5557)). ### Dependency updates diff --git a/acceptance/bundle/deploy/wal/two-crashed-deploys/databricks.yml b/acceptance/bundle/deploy/wal/two-crashed-deploys/databricks.yml new file mode 100644 index 00000000000..3d65ac2bfca --- /dev/null +++ b/acceptance/bundle/deploy/wal/two-crashed-deploys/databricks.yml @@ -0,0 +1,14 @@ +bundle: + name: wal-two-crashed-deploys + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge diff --git a/acceptance/bundle/deploy/wal/two-crashed-deploys/out.test.toml b/acceptance/bundle/deploy/wal/two-crashed-deploys/out.test.toml new file mode 100644 index 00000000000..e90b6d5d1ba --- /dev/null +++ b/acceptance/bundle/deploy/wal/two-crashed-deploys/out.test.toml @@ -0,0 +1,3 @@ +Local = true +Cloud = false +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/two-crashed-deploys/output.txt b/acceptance/bundle/deploy/wal/two-crashed-deploys/output.txt new file mode 100644 index 00000000000..1c70b4ae2ba --- /dev/null +++ b/acceptance/bundle/deploy/wal/two-crashed-deploys/output.txt @@ -0,0 +1,39 @@ + +=== First deploy (killed before recording the job, leaves a header-only WAL) +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-two-crashed-deploys/default/files... +Deploying resources... +[PROCESS_KILLED] + +Exit code: [KILLED] + +>>> cat .databricks/bundle/default/resources.json.wal +{"state_version":2,"cli_version":"[DEV_VERSION]","lineage":"[UUID]","serial":1} + +=== Second deploy (killed again, leaves another header-only WAL) +>>> errcode [CLI] bundle deploy --force-lock +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-two-crashed-deploys/default/files... +Deploying resources... +[PROCESS_KILLED] + +Exit code: [KILLED] + +>>> cat .databricks/bundle/default/resources.json.wal +{"state_version":2,"cli_version":"[DEV_VERSION]","lineage":"[UUID]","serial":1} + +=== Third deploy (must recover and succeed, not blocked by the leftover WAL) +>>> errcode [CLI] bundle deploy --force-lock +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-two-crashed-deploys/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> errcode assert_not_exists.py .databricks/bundle/default/resources.json.wal + +>>> errcode cat .databricks/bundle/default/resources.json +{ + "serial": 1, + "state_keys": [ + "resources.jobs.test_job" + ] +} diff --git a/acceptance/bundle/deploy/wal/two-crashed-deploys/script b/acceptance/bundle/deploy/wal/two-crashed-deploys/script new file mode 100644 index 00000000000..89da7e1a277 --- /dev/null +++ b/acceptance/bundle/deploy/wal/two-crashed-deploys/script @@ -0,0 +1,22 @@ +# Two consecutive deploys are killed mid-apply, after UpgradeToWrite has written +# the WAL header but before Finalize runs (killed on the jobs/create call, before +# the job's state is recorded). A kill cannot be prevented by bailing out early, +# so each crash leaves a header-only WAL behind. Recovery must discard those +# no-op WALs without advancing the serial; otherwise the second crash would write +# its WAL header two serials ahead of the committed state and block every later +# command. Regression test for the dstate recovery fix. +kill_after.py "POST /api/2.2/jobs/create" 0 2 + +title "First deploy (killed before recording the job, leaves a header-only WAL)" +trace errcode $CLI bundle deploy +trace cat .databricks/bundle/default/resources.json.wal + +title "Second deploy (killed again, leaves another header-only WAL)" +trace errcode $CLI bundle deploy --force-lock +trace cat .databricks/bundle/default/resources.json.wal + +title "Third deploy (must recover and succeed, not blocked by the leftover WAL)" +trace errcode $CLI bundle deploy --force-lock + +trace errcode assert_not_exists.py .databricks/bundle/default/resources.json.wal +trace errcode cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/two-crashed-deploys/test.py b/acceptance/bundle/deploy/wal/two-crashed-deploys/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/two-crashed-deploys/test.py @@ -0,0 +1 @@ +print("test") diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 54505677663..dff484ca30c 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -286,6 +286,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) scanner.Buffer(make([]byte, 0, initialBufferSize), maxWalEntrySize) lineNumber := 0 var corruptedLines [][]byte + var newSerial int for scanner.Scan() { lineNumber++ @@ -309,7 +310,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) if header.Serial > expectedSerial { return false, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) } - db.Data.Serial = expectedSerial + newSerial = header.Serial } else { var entry WALEntry if err := json.Unmarshal(line, &entry); err != nil { @@ -344,7 +345,19 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) } } - return lineNumber > 1, nil + hasEntries := lineNumber > 1 + + // Only advance the serial when the WAL carried entries, because the caller + // (replayWAL) persists the new state file only in that case. A header-only + // WAL is a deploy that started but committed nothing; advancing the serial + // for it leaves the in-memory serial ahead of the persisted one, so the + // next deploy writes its WAL header at serial+2 and recovery rejects it as + // "ahead of expected". See acceptance/bundle/deploy/wal/two-crashed-deploys. + if hasEntries { + db.Data.Serial = newSerial + } + + return hasEntries, nil } // Finalize replays the WAL (if open for write), captures the resulting state, and resets. diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index bbfd2559951..b2d13c0a6c7 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -1,6 +1,7 @@ package dstate import ( + "encoding/json" "os" "path/filepath" "testing" @@ -55,6 +56,40 @@ func TestPanicOnDoubleOpen(t *testing.T) { mustFinalize(t, &db) } +func TestHeaderOnlyWALRecoveryDoesNotAdvanceSerial(t *testing.T) { + path := filepath.Join(t.TempDir(), "state.json") + walPath := path + walSuffix + + // Commit serial 1 with one resource. + var db DeploymentState + require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) + require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{}, nil)) + mustFinalize(t, &db) + + var committed DeploymentState + require.NoError(t, committed.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) + lineage := committed.Data.Lineage + require.Equal(t, 1, committed.Data.Serial) + mustFinalize(t, &committed) + + // A deploy that opens the WAL for write but commits nothing (e.g. planning + // fails after UpgradeToWrite) leaves a header-only WAL behind, here at the + // expected serial 2. Recovering it must not advance the serial past the + // committed 1, otherwise a second such failed deploy would write its header + // at serial 3 and be rejected as ahead of the committed state. + header := Header{Lineage: lineage, Serial: 2, StateVersion: currentStateVersion} + headerLine, err := json.Marshal(header) + require.NoError(t, err) + require.NoError(t, os.WriteFile(walPath, append(headerLine, '\n'), 0o600)) + + var recovered DeploymentState + require.NoError(t, recovered.Open(t.Context(), path, WithRecovery(true), WithWrite(false))) + assert.Equal(t, 1, recovered.Data.Serial) + assert.Equal(t, "123", recovered.GetResourceID("jobs.my_job")) + assert.NoFileExists(t, walPath) + mustFinalize(t, &recovered) +} + func TestDeleteState(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") From ec65a6bf9472b10c26e22fa97c91fe9f2e26c544 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 15 Jun 2026 06:46:36 -0700 Subject: [PATCH 2/2] acceptance: Rename WAL recovery test to header-only-wal Name it after the WAL condition it recovers, matching sibling tests (empty-wal, stale-wal, future-serial-wal). Use a generic test-bundle name. Co-authored-by: Isaac --- .../{two-crashed-deploys => header-only-wal}/databricks.yml | 2 +- .../{two-crashed-deploys => header-only-wal}/out.test.toml | 0 .../wal/{two-crashed-deploys => header-only-wal}/output.txt | 6 +++--- .../wal/{two-crashed-deploys => header-only-wal}/script | 0 .../wal/{two-crashed-deploys => header-only-wal}/test.py | 0 bundle/direct/dstate/state.go | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) rename acceptance/bundle/deploy/wal/{two-crashed-deploys => header-only-wal}/databricks.yml (89%) rename acceptance/bundle/deploy/wal/{two-crashed-deploys => header-only-wal}/out.test.toml (100%) rename acceptance/bundle/deploy/wal/{two-crashed-deploys => header-only-wal}/output.txt (76%) rename acceptance/bundle/deploy/wal/{two-crashed-deploys => header-only-wal}/script (100%) rename acceptance/bundle/deploy/wal/{two-crashed-deploys => header-only-wal}/test.py (100%) diff --git a/acceptance/bundle/deploy/wal/two-crashed-deploys/databricks.yml b/acceptance/bundle/deploy/wal/header-only-wal/databricks.yml similarity index 89% rename from acceptance/bundle/deploy/wal/two-crashed-deploys/databricks.yml rename to acceptance/bundle/deploy/wal/header-only-wal/databricks.yml index 3d65ac2bfca..b0d3c1cbca3 100644 --- a/acceptance/bundle/deploy/wal/two-crashed-deploys/databricks.yml +++ b/acceptance/bundle/deploy/wal/header-only-wal/databricks.yml @@ -1,5 +1,5 @@ bundle: - name: wal-two-crashed-deploys + name: test-bundle resources: jobs: diff --git a/acceptance/bundle/deploy/wal/two-crashed-deploys/out.test.toml b/acceptance/bundle/deploy/wal/header-only-wal/out.test.toml similarity index 100% rename from acceptance/bundle/deploy/wal/two-crashed-deploys/out.test.toml rename to acceptance/bundle/deploy/wal/header-only-wal/out.test.toml diff --git a/acceptance/bundle/deploy/wal/two-crashed-deploys/output.txt b/acceptance/bundle/deploy/wal/header-only-wal/output.txt similarity index 76% rename from acceptance/bundle/deploy/wal/two-crashed-deploys/output.txt rename to acceptance/bundle/deploy/wal/header-only-wal/output.txt index 1c70b4ae2ba..4048098e51c 100644 --- a/acceptance/bundle/deploy/wal/two-crashed-deploys/output.txt +++ b/acceptance/bundle/deploy/wal/header-only-wal/output.txt @@ -1,7 +1,7 @@ === First deploy (killed before recording the job, leaves a header-only WAL) >>> errcode [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-two-crashed-deploys/default/files... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... Deploying resources... [PROCESS_KILLED] @@ -12,7 +12,7 @@ Exit code: [KILLED] === Second deploy (killed again, leaves another header-only WAL) >>> errcode [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-two-crashed-deploys/default/files... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... Deploying resources... [PROCESS_KILLED] @@ -23,7 +23,7 @@ Exit code: [KILLED] === Third deploy (must recover and succeed, not blocked by the leftover WAL) >>> errcode [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-two-crashed-deploys/default/files... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... Deploying resources... Updating deployment state... Deployment complete! diff --git a/acceptance/bundle/deploy/wal/two-crashed-deploys/script b/acceptance/bundle/deploy/wal/header-only-wal/script similarity index 100% rename from acceptance/bundle/deploy/wal/two-crashed-deploys/script rename to acceptance/bundle/deploy/wal/header-only-wal/script diff --git a/acceptance/bundle/deploy/wal/two-crashed-deploys/test.py b/acceptance/bundle/deploy/wal/header-only-wal/test.py similarity index 100% rename from acceptance/bundle/deploy/wal/two-crashed-deploys/test.py rename to acceptance/bundle/deploy/wal/header-only-wal/test.py diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index dff484ca30c..0bf50809e16 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -352,7 +352,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) // WAL is a deploy that started but committed nothing; advancing the serial // for it leaves the in-memory serial ahead of the persisted one, so the // next deploy writes its WAL header at serial+2 and recovery rejects it as - // "ahead of expected". See acceptance/bundle/deploy/wal/two-crashed-deploys. + // "ahead of expected". See acceptance/bundle/deploy/wal/header-only-wal. if hasEntries { db.Data.Serial = newSerial }