diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 0156a189e6..5050be6a0a 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -14,7 +14,8 @@ * Bundle variable references now accept Unicode letters in path segments (e.g. `${var.变量}`). ([#5532](https://github.com/databricks/cli/pull/5532)) * Ignore remote changes for vector search direct_access_index_spec.schema_json to prevent drift when the backend normalizes the schema ([#5481](https://github.com/databricks/cli/pull/5481)). * Remove hidden, never-functional `--existing-dashboard-id`, `--existing-dashboard-path`, `--existing-alert-id`, and `--existing-genie-space-id` alias flags from `bundle generate`; use the documented `--existing-id` / `--existing-path` flags instead ([#5591](https://github.com/databricks/cli/pull/5591)). -* engine/direct: Fix WAL corruption after two consecutive failed deploys ([#5557](https://github.com/databricks/cli/issues/5557)). +* engine/direct: Fix WAL corruption after two consecutive failed deploys ([#5606](https://github.com/databricks/cli/pull/5606)). +* engine/direct: Don't open the deployment state WAL when a deploy's plan fails ([#5607](https://github.com/databricks/cli/pull/5607)). ### Dependency updates diff --git a/acceptance/bundle/deploy/wal/failed-plan-no-wal/databricks.yml b/acceptance/bundle/deploy/wal/failed-plan-no-wal/databricks.yml new file mode 100644 index 0000000000..b0d3c1cbca --- /dev/null +++ b/acceptance/bundle/deploy/wal/failed-plan-no-wal/databricks.yml @@ -0,0 +1,14 @@ +bundle: + name: test-bundle + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge diff --git a/acceptance/bundle/deploy/wal/failed-plan-no-wal/out.test.toml b/acceptance/bundle/deploy/wal/failed-plan-no-wal/out.test.toml new file mode 100644 index 0000000000..e90b6d5d1b --- /dev/null +++ b/acceptance/bundle/deploy/wal/failed-plan-no-wal/out.test.toml @@ -0,0 +1,3 @@ +Local = true +Cloud = false +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/failed-plan-no-wal/output.txt b/acceptance/bundle/deploy/wal/failed-plan-no-wal/output.txt new file mode 100644 index 0000000000..7d830c172b --- /dev/null +++ b/acceptance/bundle/deploy/wal/failed-plan-no-wal/output.txt @@ -0,0 +1,48 @@ + +=== Deploy 1 (normal: creates the job and the committed state) +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +=== Deploy 2 (planning fails, must not leave a WAL) +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Error: cannot plan resources.jobs.test_job: reading id="[NUMID]": Fault injected by test. (403 INJECTED) + +Endpoint: GET [DATABRICKS_URL]/api/2.2/jobs/get?job_id=[NUMID] +HTTP Status: 403 Forbidden +API error_code: INJECTED +API message: Fault injected by test. + +Error: planning failed + + +Exit code: 1 + +>>> assert_not_exists.py .databricks/bundle/default/resources.json.wal + +=== Deploy 3 (planning fails again, must not leave a WAL) +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Error: cannot plan resources.jobs.test_job: reading id="[NUMID]": Fault injected by test. (403 INJECTED) + +Endpoint: GET [DATABRICKS_URL]/api/2.2/jobs/get?job_id=[NUMID] +HTTP Status: 403 Forbidden +API error_code: INJECTED +API message: Fault injected by test. + +Error: planning failed + + +Exit code: 1 + +>>> assert_not_exists.py .databricks/bundle/default/resources.json.wal + +=== Deploy 4 (fault expired: recovers and succeeds) +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/deploy/wal/failed-plan-no-wal/script b/acceptance/bundle/deploy/wal/failed-plan-no-wal/script new file mode 100644 index 0000000000..b4fd487862 --- /dev/null +++ b/acceptance/bundle/deploy/wal/failed-plan-no-wal/script @@ -0,0 +1,29 @@ +# A failed plan must not leave a write-ahead log behind, so repeated planning +# failures never block a later, healthy deploy. Previously a failed plan still +# opened the WAL for write (UpgradeToWrite) and returned without finalizing, +# leaving a header-only WAL; after two failures the WAL serial drifted two ahead +# of the committed serial and every later command failed WAL recovery until the +# WAL was deleted by hand. +# +# A first deploy creates the job normally. An injected fault then makes the next +# two deploys fail while planning (planning refreshes the existing job via +# jobs/get). The final deploy, with the fault expired, must recover and succeed. +# A non-retried 403 is used so the failure is immediate; a 5xx would be retried +# with backoff. + +title "Deploy 1 (normal: creates the job and the committed state)" +trace $CLI bundle deploy + +# Fail the plan-stage refresh GET for the next two deploys only. +fault.py "GET /api/2.2/jobs/get" 403 0 2 + +title "Deploy 2 (planning fails, must not leave a WAL)" +trace errcode $CLI bundle deploy +trace assert_not_exists.py .databricks/bundle/default/resources.json.wal + +title "Deploy 3 (planning fails again, must not leave a WAL)" +trace errcode $CLI bundle deploy +trace assert_not_exists.py .databricks/bundle/default/resources.json.wal + +title "Deploy 4 (fault expired: recovers and succeeds)" +trace $CLI bundle deploy diff --git a/acceptance/bundle/deploy/wal/failed-plan-no-wal/test.py b/acceptance/bundle/deploy/wal/failed-plan-no-wal/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/failed-plan-no-wal/test.py @@ -0,0 +1 @@ +print("test") diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 8518230770..e3b0b777eb 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -170,6 +170,13 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand plan = RunPlan(ctx, b, engine) } + // Stop before opening the WAL for write if planning failed. UpgradeToWrite + // writes a WAL header that only deployCore's Finalize commits or discards; + // returning past it without finalizing leaves a header-only WAL behind. + if logdiag.HasError(ctx) { + return + } + if engine.IsDirect() { // Upgrade from read (opened by process.go) to write mode if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { @@ -187,6 +194,9 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand } } + // InitForApply receives ctx and could log a diagnostic without returning an + // error, so re-check before deploying. (UpgradeToWrite above takes no ctx and + // thus cannot log, so the earlier check is enough to guard the WAL open.) if logdiag.HasError(ctx) { return }