diff --git a/acceptance/internal/config.go b/acceptance/internal/config.go index 2129fa5573..3910107eb9 100644 --- a/acceptance/internal/config.go +++ b/acceptance/internal/config.go @@ -156,6 +156,12 @@ type ServerStub struct { // Configure as "1ms", "2s", "3m", etc. // See [time.ParseDuration] for details. Delay time.Duration + + // Number of times to kill the caller process before returning normal responses. + // 0 = never kill (default), 1 = kill once then allow, 2 = kill twice then allow, etc. + // Useful for testing crash recovery scenarios where first deploy crashes but retry succeeds. + // Requires DATABRICKS_CLI_TEST_PID=1 to be set in the test environment. + KillCaller int } // FindConfigs finds all the config relevant for this test, diff --git a/acceptance/internal/prepare_server.go b/acceptance/internal/prepare_server.go index a401a1cac8..5ce41378e7 100644 --- a/acceptance/internal/prepare_server.go +++ b/acceptance/internal/prepare_server.go @@ -184,6 +184,9 @@ func startLocalServer(t *testing.T, s.ResponseCallback = logResponseCallback(t) } + // Track remaining kill counts per pattern (for KillCaller > 0) + killCounters := make(map[string]int) + for ind := range stubs { // We want later stubs takes precedence, because then leaf configs take precedence over parent directory configs // In gorilla/mux earlier handlers take precedence, so we need to reverse the order @@ -191,6 +194,12 @@ func startLocalServer(t *testing.T, require.NotEmpty(t, stub.Pattern) items := strings.Split(stub.Pattern, " ") require.Len(t, items, 2) + + // Initialize kill counter for this pattern + if stub.KillCaller > 0 { + killCounters[stub.Pattern] = stub.KillCaller + } + s.Handle(items[0], items[1], func(req testserver.Request) any { if stub.Delay > 0 { ctx := req.Context @@ -209,6 +218,11 @@ func startLocalServer(t *testing.T, } } + if shouldKillCaller(stub, killCounters) { + killCaller(t, stub.Pattern, req.Headers) + return testserver.Response{StatusCode: http.StatusOK} + } + 
return stub.Response }) } @@ -218,6 +232,37 @@ func startLocalServer(t *testing.T, return s.URL } +func shouldKillCaller(stub ServerStub, killCounters map[string]int) bool { + if stub.KillCaller <= 0 || killCounters[stub.Pattern] <= 0 { + return false + } + killCounters[stub.Pattern]-- + return true +} + +func killCaller(t *testing.T, pattern string, headers http.Header) { + pid := testserver.ExtractPidFromHeaders(headers) + if pid == 0 { + t.Errorf("KillCaller configured but test-pid not found in User-Agent") + return + } + + process, err := os.FindProcess(pid) + if err != nil { + t.Errorf("Failed to find process %d: %s", pid, err) + return + } + + // Use process.Kill() for cross-platform compatibility. + // On Unix, this sends SIGKILL. On Windows, this calls TerminateProcess. + if err := process.Kill(); err != nil { + t.Errorf("Failed to kill process %d: %s", pid, err) + return + } + + t.Logf("KillCaller: killed PID %d (pattern: %s)", pid, pattern) +} + func startProxyServer(t *testing.T, logRequests bool, includeHeaders []string, diff --git a/acceptance/selftest/kill_caller/currentuser/out.test.toml b/acceptance/selftest/kill_caller/currentuser/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/selftest/kill_caller/currentuser/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/selftest/kill_caller/currentuser/output.txt b/acceptance/selftest/kill_caller/currentuser/output.txt new file mode 100644 index 0000000000..637ea530b4 --- /dev/null +++ b/acceptance/selftest/kill_caller/currentuser/output.txt @@ -0,0 +1,6 @@ + +>>> errcode [CLI] current-user me +[PROCESS_KILLED] + +Exit code: [KILLED] +Script continued after kill diff --git a/acceptance/selftest/kill_caller/currentuser/script b/acceptance/selftest/kill_caller/currentuser/script new file mode 100644 index 0000000000..821c42d8cf --- /dev/null +++ 
b/acceptance/selftest/kill_caller/currentuser/script @@ -0,0 +1,2 @@ +trace errcode $CLI current-user me +echo "Script continued after kill" diff --git a/acceptance/selftest/kill_caller/currentuser/test.toml b/acceptance/selftest/kill_caller/currentuser/test.toml new file mode 100644 index 0000000000..b76fe401fc --- /dev/null +++ b/acceptance/selftest/kill_caller/currentuser/test.toml @@ -0,0 +1,4 @@ +# Kill the CLI when it calls /Me endpoint (once, then allow) +[[Server]] +Pattern = "GET /api/2.0/preview/scim/v2/Me" +KillCaller = 1 diff --git a/acceptance/selftest/kill_caller/multiple/out.test.toml b/acceptance/selftest/kill_caller/multiple/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/selftest/kill_caller/multiple/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/selftest/kill_caller/multiple/output.txt b/acceptance/selftest/kill_caller/multiple/output.txt new file mode 100644 index 0000000000..538672bf86 --- /dev/null +++ b/acceptance/selftest/kill_caller/multiple/output.txt @@ -0,0 +1,25 @@ + +>>> errcode [CLI] current-user me +[PROCESS_KILLED] + +Exit code: [KILLED] +Attempt 1 done + +>>> errcode [CLI] current-user me +[PROCESS_KILLED] + +Exit code: [KILLED] +Attempt 2 done + +>>> errcode [CLI] current-user me +[PROCESS_KILLED] + +Exit code: [KILLED] +Attempt 3 done + +>>> [CLI] current-user me +{ + "id":"123", + "userName":"test@example.com" +} +Attempt 4 done - success! 
diff --git a/acceptance/selftest/kill_caller/multiple/script b/acceptance/selftest/kill_caller/multiple/script new file mode 100644 index 0000000000..03628e203e --- /dev/null +++ b/acceptance/selftest/kill_caller/multiple/script @@ -0,0 +1,13 @@ +# First 3 attempts should be killed +trace errcode $CLI current-user me +echo "Attempt 1 done" + +trace errcode $CLI current-user me +echo "Attempt 2 done" + +trace errcode $CLI current-user me +echo "Attempt 3 done" + +# 4th attempt should succeed +trace $CLI current-user me +echo "Attempt 4 done - success!" diff --git a/acceptance/selftest/kill_caller/multiple/test.toml b/acceptance/selftest/kill_caller/multiple/test.toml new file mode 100644 index 0000000000..5485fc6a6b --- /dev/null +++ b/acceptance/selftest/kill_caller/multiple/test.toml @@ -0,0 +1,10 @@ +# Kill the CLI 3 times, then allow the 4th request to succeed +[[Server]] +Pattern = "GET /api/2.0/preview/scim/v2/Me" +KillCaller = 3 +Response.Body = ''' +{ + "id": "123", + "userName": "test@example.com" +} +''' diff --git a/acceptance/selftest/kill_caller/test.toml b/acceptance/selftest/kill_caller/test.toml new file mode 100644 index 0000000000..faa7caf7b0 --- /dev/null +++ b/acceptance/selftest/kill_caller/test.toml @@ -0,0 +1,30 @@ +# KillCaller tests verify the test server's ability to terminate CLI processes mid-request. +# This enables testing crash recovery scenarios, e.g., "bundle deploy" fails on first attempt +# but succeeds on retry. Each subdirectory tests a different endpoint or retry count. 
+ +Local = true +Env.DATABRICKS_CLI_TEST_PID = "1" + +[[Repls]] +# macOS bash shows "Killed: 9" (with signal number), Linux shows "Killed" +# Normalize the whole killed line to a placeholder +Old = 'script: line \d+:\s+\d+ Killed(: 9)?\s+"\$@"' +New = '[PROCESS_KILLED]' + +[[Repls]] +# On Windows, there's no "Killed" message - just empty line before Exit code +# Insert [PROCESS_KILLED] placeholder for consistency +Old = '(\n>>> errcode [^\n]+\n)\nExit code:' +New = """${1}[PROCESS_KILLED] + +Exit code:""" + +[[Repls]] +# Normalize exit code: 137 on Unix (128 + SIGKILL), 1 on Windows +Old = 'Exit code: (137|1)' +New = 'Exit code: [KILLED]' + +[[Repls]] +# Normalize Windows line endings (CRLF -> LF) - must be LAST +Old = "\r" +New = '' diff --git a/acceptance/selftest/kill_caller/workspace/out.test.toml b/acceptance/selftest/kill_caller/workspace/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/selftest/kill_caller/workspace/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/selftest/kill_caller/workspace/output.txt b/acceptance/selftest/kill_caller/workspace/output.txt new file mode 100644 index 0000000000..57eb88c9ad --- /dev/null +++ b/acceptance/selftest/kill_caller/workspace/output.txt @@ -0,0 +1,6 @@ + +>>> errcode [CLI] workspace list / +[PROCESS_KILLED] + +Exit code: [KILLED] +Script continued after kill diff --git a/acceptance/selftest/kill_caller/workspace/script b/acceptance/selftest/kill_caller/workspace/script new file mode 100644 index 0000000000..076972136c --- /dev/null +++ b/acceptance/selftest/kill_caller/workspace/script @@ -0,0 +1,2 @@ +trace errcode $CLI workspace list / +echo "Script continued after kill" diff --git a/acceptance/selftest/kill_caller/workspace/test.toml b/acceptance/selftest/kill_caller/workspace/test.toml new file mode 100644 index 0000000000..eac10a6329 --- /dev/null +++ 
b/acceptance/selftest/kill_caller/workspace/test.toml @@ -0,0 +1,4 @@ +# Kill the CLI when it calls workspace list endpoint (once, then allow) +[[Server]] +Pattern = "GET /api/2.0/workspace/list" +KillCaller = 1 diff --git a/cmd/pipelines/root/root.go b/cmd/pipelines/root/root.go index 72d0b52664..ade98d899e 100644 --- a/cmd/pipelines/root/root.go +++ b/cmd/pipelines/root/root.go @@ -8,6 +8,7 @@ import ( "os" "strings" + "github.com/databricks/cli/cmd/root" "github.com/databricks/cli/internal/build" "github.com/databricks/cli/libs/log" "github.com/spf13/cobra" @@ -72,6 +73,7 @@ func New(ctx context.Context) *cobra.Command { ctx = withCommandInUserAgent(ctx, cmd) ctx = withCommandExecIdInUserAgent(ctx) ctx = withUpstreamInUserAgent(ctx) + ctx = root.InjectTestPidToUserAgent(ctx) cmd.SetContext(ctx) return nil } diff --git a/cmd/root/root.go b/cmd/root/root.go index 96fde20846..39c36e86d3 100644 --- a/cmd/root/root.go +++ b/cmd/root/root.go @@ -79,6 +79,7 @@ func New(ctx context.Context) *cobra.Command { ctx = withCommandInUserAgent(ctx, cmd) ctx = withCommandExecIdInUserAgent(ctx) ctx = withUpstreamInUserAgent(ctx) + ctx = InjectTestPidToUserAgent(ctx) cmd.SetContext(ctx) return nil } diff --git a/cmd/root/user_agent_test_pid.go b/cmd/root/user_agent_test_pid.go new file mode 100644 index 0000000000..7148fb3674 --- /dev/null +++ b/cmd/root/user_agent_test_pid.go @@ -0,0 +1,28 @@ +package root + +import ( + "context" + "os" + "strconv" + + "github.com/databricks/cli/libs/env" + "github.com/databricks/databricks-sdk-go/useragent" +) + +const ( + // TestPidEnvVar is the environment variable that enables PID injection into the user agent. + // When set to "1", the CLI will include its process ID in the user agent string. + // This is used by the test server to identify and signal the CLI process. 
// testPidKey is the User-Agent product key whose value is a process ID.
const testPidKey = "test-pid"

// testPidRegex matches a complete "test-pid/<digits>" token. The token must be
// delimited by whitespace or by the start/end of the header value, so that
// keys which merely end in "test-pid" (e.g. "other-test-pid/42") or values
// with trailing non-digits (e.g. "test-pid/12abc") are not mistaken for a PID.
var testPidRegex = regexp.MustCompile(`(?:^|\s)` + testPidKey + `/(\d+)(?:\s|$)`)

// ExtractPidFromHeaders returns the process ID advertised in the User-Agent
// header as a "test-pid/<pid>" token.
//
// It returns 0 when the header is absent, when it contains no well-formed
// test-pid token, or when the digits do not fit in an int. Callers treat 0 as
// "no PID available", which matters because the result may be used to kill a
// process.
func ExtractPidFromHeaders(headers http.Header) int {
	ua := headers.Get("User-Agent")

	matches := testPidRegex.FindStringSubmatch(ua)
	if len(matches) < 2 {
		return 0
	}

	pid, err := strconv.Atoi(matches[1])
	if err != nil {
		// The regex guarantees digits only, so this is an out-of-range value.
		return 0
	}
	return pid
}