From 550374cb989015e2cd29fcd40e12b63c133d001c Mon Sep 17 00:00:00 2001 From: Anwell Wang Date: Fri, 12 Jun 2026 17:27:48 +0000 Subject: [PATCH 1/4] sandbox: Show a clear error when Sandboxes is unavailable in the region A 503 from the /api/2.0/lakebox routes means the sandbox service is not deployed for the workspace's region, but the raw gateway error gives users no hint of that. Translate it at the API-wrapper level so every sandbox command reports the actual cause. Co-authored-by: Isaac --- cmd/sandbox/api.go | 35 +++++++++++++++++++++++++---------- cmd/sandbox/api_test.go | 16 ++++++++++++++++ 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/cmd/sandbox/api.go b/cmd/sandbox/api.go index 1fc05957e3..33fe516ba9 100644 --- a/cmd/sandbox/api.go +++ b/cmd/sandbox/api.go @@ -2,6 +2,7 @@ package sandbox import ( "context" + "errors" "fmt" "net/http" "net/url" @@ -10,6 +11,7 @@ import ( "github.com/databricks/cli/libs/auth" "github.com/databricks/databricks-sdk-go" + "github.com/databricks/databricks-sdk-go/apierr" "github.com/databricks/databricks-sdk-go/client" ) @@ -36,6 +38,19 @@ const ( sandboxKeysAPIPath = sandboxAPIRoot + "/ssh-keys" ) +// translateError rewrites API errors that have a better sandbox-specific +// explanation. A 503 on the sandbox routes means the sandbox service is not +// deployed for this workspace's region — and by the time it reaches us the +// SDK has already spent its retry budget on it, ruling out transient +// unavailability. The gateway-level 503 body adds nothing for the user, so +// it is dropped rather than wrapped. +func translateError(err error) error { + if apiErr, ok := errors.AsType[*apierr.APIError](err); ok && apiErr.StatusCode == http.StatusServiceUnavailable { + return errors.New("the Databricks Sandboxes feature is not available in your region") + } + return err +} + // orgIDHeader scopes the credential to a workspace on multi-workspace // gateways. 
Without it, requests fail with "Credential was not sent or was // of an unsupported type for this API." @@ -202,7 +217,7 @@ func (a *sandboxAPI) create(ctx context.Context, name string) (*createResponse, var resp createResponse err := a.c.Do(ctx, http.MethodPost, sandboxAPIPath, a.headers(), nil, body, &resp) if err != nil { - return nil, err + return nil, translateError(err) } return &resp, nil } @@ -241,7 +256,7 @@ func (a *sandboxAPI) listPage(ctx context.Context, pageToken string) (*listRespo var resp listResponse err := a.c.Do(ctx, http.MethodGet, sandboxAPIPath, a.headers(), nil, query, &resp) if err != nil { - return nil, err + return nil, translateError(err) } return &resp, nil } @@ -251,7 +266,7 @@ func (a *sandboxAPI) get(ctx context.Context, id string) (*sandboxEntry, error) var resp sandboxEntry err := a.c.Do(ctx, http.MethodGet, sandboxPath(id), a.headers(), nil, nil, &resp) if err != nil { - return nil, err + return nil, translateError(err) } return &resp, nil } @@ -275,14 +290,14 @@ func (a *sandboxAPI) update(ctx context.Context, id string, name *string, idleTi var resp sandboxEntry err := a.c.Do(ctx, http.MethodPatch, sandboxPath(id), a.headers(), nil, body, &resp) if err != nil { - return nil, err + return nil, translateError(err) } return &resp, nil } // delete calls DELETE /api/2.0/lakebox/sandboxes/{id}. 
func (a *sandboxAPI) delete(ctx context.Context, id string) error { - return a.c.Do(ctx, http.MethodDelete, sandboxPath(id), a.headers(), nil, nil, nil) + return translateError(a.c.Do(ctx, http.MethodDelete, sandboxPath(id), a.headers(), nil, nil, nil)) } // stop calls POST /api/2.0/lakebox/sandboxes/{id}/stop and returns the @@ -292,7 +307,7 @@ func (a *sandboxAPI) stop(ctx context.Context, id string) (*sandboxEntry, error) var resp sandboxEntry err := a.c.Do(ctx, http.MethodPost, sandboxPath(id)+"/stop", a.headers(), nil, body, &resp) if err != nil { - return nil, err + return nil, translateError(err) } return &resp, nil } @@ -304,7 +319,7 @@ func (a *sandboxAPI) start(ctx context.Context, id string) (*sandboxEntry, error var resp sandboxEntry err := a.c.Do(ctx, http.MethodPost, sandboxPath(id)+"/start", a.headers(), nil, body, &resp) if err != nil { - return nil, err + return nil, translateError(err) } return &resp, nil } @@ -318,7 +333,7 @@ func (a *sandboxAPI) registerKey(ctx context.Context, publicKey, name string) (* var resp sshKeyEntry err := a.c.Do(ctx, http.MethodPost, sandboxKeysAPIPath, a.headers(), nil, registerKeyRequest{PublicKey: publicKey, Name: name}, &resp) if err != nil { - return nil, err + return nil, translateError(err) } return &resp, nil } @@ -347,12 +362,12 @@ func (a *sandboxAPI) listKeys(ctx context.Context) ([]sshKeyEntry, error) { var resp listKeysResponse err := a.c.Do(ctx, http.MethodGet, sandboxKeysAPIPath, a.headers(), nil, nil, &resp) if err != nil { - return nil, err + return nil, translateError(err) } return resp.SshKeys, nil } // deleteKey calls DELETE /api/2.0/lakebox/ssh-keys/{key_hash}. 
func (a *sandboxAPI) deleteKey(ctx context.Context, keyHash string) error { - return a.c.Do(ctx, http.MethodDelete, sandboxKeysAPIPath+"/"+url.PathEscape(keyHash), a.headers(), nil, nil, nil) + return translateError(a.c.Do(ctx, http.MethodDelete, sandboxKeysAPIPath+"/"+url.PathEscape(keyHash), a.headers(), nil, nil, nil)) } diff --git a/cmd/sandbox/api_test.go b/cmd/sandbox/api_test.go index 97025c9c7c..2fa57e1e6f 100644 --- a/cmd/sandbox/api_test.go +++ b/cmd/sandbox/api_test.go @@ -1,9 +1,11 @@ package sandbox import ( + "net/http" "strings" "testing" + "github.com/databricks/databricks-sdk-go/apierr" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -21,6 +23,20 @@ func TestValidateNameRejectsOversize(t *testing.T) { assert.Contains(t, err.Error(), "256") } +func TestTranslateErrorRewrites503(t *testing.T) { + orig := &apierr.APIError{StatusCode: http.StatusServiceUnavailable, Message: "Service Unavailable"} + err := translateError(orig) + require.Error(t, err) + assert.Equal(t, "the Databricks Sandboxes feature is not available in your region", err.Error()) +} + +func TestTranslateErrorPassesThroughOthers(t *testing.T) { + require.NoError(t, translateError(nil)) + + notFound := &apierr.APIError{StatusCode: http.StatusNotFound, Message: "Sandbox not found"} + assert.Equal(t, error(notFound), translateError(notFound)) +} + func TestValidateNameCountsBytesNotRunes(t *testing.T) { // 64 panda emoji = 64 × 4 bytes = 256 bytes — at the limit, OK. require.NoError(t, validateName(strings.Repeat("🐼", 64))) From 8e29a43fd214a0a02b68bd3d493c981bc3bb1c9d Mon Sep 17 00:00:00 2001 From: Anwell Wang Date: Fri, 12 Jun 2026 18:02:14 +0000 Subject: [PATCH 2/4] sandbox: Cap 503 retries by count and hedge the unavailability message The SDK retries 503s for up to five minutes, so in regions without the sandbox service every command hung before erroring. 
Counting attempts in the ErrorRetriable hook (rather than racing a context deadline) halts the retry loop with the real APIError, so the translated message is deterministic. Since a few transient 503s are still retried, the message hedges between region unavailability and a temporary outage. Co-authored-by: Isaac --- .../list/region-unavailable/out.test.toml | 3 + .../list/region-unavailable/output.txt | 3 + .../sandbox/list/region-unavailable/script | 1 + .../sandbox/list/region-unavailable/test.toml | 7 ++ cmd/sandbox/api.go | 96 ++++++++++++++----- cmd/sandbox/api_test.go | 15 ++- 6 files changed, 98 insertions(+), 27 deletions(-) create mode 100644 acceptance/cmd/sandbox/list/region-unavailable/out.test.toml create mode 100644 acceptance/cmd/sandbox/list/region-unavailable/output.txt create mode 100644 acceptance/cmd/sandbox/list/region-unavailable/script create mode 100644 acceptance/cmd/sandbox/list/region-unavailable/test.toml diff --git a/acceptance/cmd/sandbox/list/region-unavailable/out.test.toml b/acceptance/cmd/sandbox/list/region-unavailable/out.test.toml new file mode 100644 index 0000000000..d6187dcb04 --- /dev/null +++ b/acceptance/cmd/sandbox/list/region-unavailable/out.test.toml @@ -0,0 +1,3 @@ +Local = true +Cloud = false +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = [] diff --git a/acceptance/cmd/sandbox/list/region-unavailable/output.txt b/acceptance/cmd/sandbox/list/region-unavailable/output.txt new file mode 100644 index 0000000000..e296e2c8e8 --- /dev/null +++ b/acceptance/cmd/sandbox/list/region-unavailable/output.txt @@ -0,0 +1,3 @@ +Error: failed to list sandboxes: the Databricks Sandboxes feature is not available in your region, or the service is temporarily unavailable + +Exit code: 1 diff --git a/acceptance/cmd/sandbox/list/region-unavailable/script b/acceptance/cmd/sandbox/list/region-unavailable/script new file mode 100644 index 0000000000..62cc0f91e4 --- /dev/null +++ b/acceptance/cmd/sandbox/list/region-unavailable/script @@ -0,0 +1 @@ 
+errcode $CLI sandbox list diff --git a/acceptance/cmd/sandbox/list/region-unavailable/test.toml b/acceptance/cmd/sandbox/list/region-unavailable/test.toml new file mode 100644 index 0000000000..bfb96a3b4d --- /dev/null +++ b/acceptance/cmd/sandbox/list/region-unavailable/test.toml @@ -0,0 +1,7 @@ +# Workspaces in regions without the sandbox service answer every lakebox +# request with a gateway-level 503. The CLI retries it max503Attempts times, +# then reports the hedged region/availability message. +[[Server]] +Pattern = "GET /api/2.0/lakebox/sandboxes" +Response.StatusCode = 503 +Response.Body = 'no healthy upstream' diff --git a/cmd/sandbox/api.go b/cmd/sandbox/api.go index 33fe516ba9..0642c38ae4 100644 --- a/cmd/sandbox/api.go +++ b/cmd/sandbox/api.go @@ -13,6 +13,8 @@ import ( "github.com/databricks/databricks-sdk-go" "github.com/databricks/databricks-sdk-go/apierr" "github.com/databricks/databricks-sdk-go/client" + "github.com/databricks/databricks-sdk-go/config" + "github.com/databricks/databricks-sdk-go/httpclient" ) // sandboxPath returns the URL path for a single sandbox resource. The ID is @@ -38,15 +40,37 @@ const ( sandboxKeysAPIPath = sandboxAPIRoot + "/ssh-keys" ) -// translateError rewrites API errors that have a better sandbox-specific -// explanation. A 503 on the sandbox routes means the sandbox service is not -// deployed for this workspace's region — and by the time it reaches us the -// SDK has already spent its retry budget on it, ruling out transient -// unavailability. The gateway-level 503 body adds nothing for the user, so -// it is dropped rather than wrapped. +// max503Attempts caps attempts when the server keeps answering 503: regions +// without the sandbox service 503 forever, and the SDK's default 5-minute +// retry budget would hang the command. A couple of retries still absorbs +// transient blips. 
+const max503Attempts = 3 + +type attempt503CounterKey struct{} + +// arm503Budget attaches a fresh 503 attempt counter to the request context. +// Retries of one request run sequentially, so a plain *int suffices. +func arm503Budget(ctx context.Context) context.Context { + return context.WithValue(ctx, attempt503CounterKey{}, new(int)) +} + +// allow503Retry consumes one unit of the request's 503 budget. Unarmed +// contexts get no retries. +func allow503Retry(ctx context.Context) bool { + n, ok := ctx.Value(attempt503CounterKey{}).(*int) + if !ok { + return false + } + *n++ + return *n < max503Attempts +} + +// translateError rewrites a 503 — after max503Attempts it usually means the +// sandbox service is not deployed in this region. The gateway 503 body adds +// nothing for the user, so it is dropped rather than wrapped. func translateError(err error) error { if apiErr, ok := errors.AsType[*apierr.APIError](err); ok && apiErr.StatusCode == http.StatusServiceUnavailable { - return errors.New("the Databricks Sandboxes feature is not available in your region") + return errors.New("the Databricks Sandboxes feature is not available in your region, or the service is temporarily unavailable") } return err } @@ -191,7 +215,21 @@ type registerKeyRequest struct { // newSandboxAPI returns a sandboxAPI bound to the workspace client's config. func newSandboxAPI(w *databricks.WorkspaceClient) (*sandboxAPI, error) { - c, err := client.New(w.Config) + clientCfg, err := config.HTTPClientConfigFromConfig(w.Config) + if err != nil { + return nil, fmt.Errorf("failed to create sandbox API client: %w", err) + } + defaultRetriable := clientCfg.ErrorRetriable + // Cap 503 retries by count, not deadline: the final 503 halts the retry + // loop with the APIError itself, so translateError always sees the real + // status code instead of a racy context.DeadlineExceeded. 
+ clientCfg.ErrorRetriable = func(ctx context.Context, err error) bool { + if apiErr, ok := errors.AsType[*apierr.APIError](err); ok && apiErr.StatusCode == http.StatusServiceUnavailable { + return allow503Retry(ctx) + } + return defaultRetriable(ctx, err) + } + c, err := client.NewWithClient(w.Config, httpclient.NewApiClient(clientCfg)) if err != nil { return nil, fmt.Errorf("failed to create sandbox API client: %w", err) } @@ -210,14 +248,20 @@ func (a *sandboxAPI) headers() map[string]string { return map[string]string{orgIDHeader: wsID} } +// do issues one sandbox API request: arms the 503 retry budget, attaches +// the workspace routing headers, and translates terminal errors. +func (a *sandboxAPI) do(ctx context.Context, method, path string, request, response any) error { + return translateError(a.c.Do(arm503Budget(ctx), method, path, a.headers(), nil, request, response)) +} + // create calls POST /api/2.0/lakebox/sandboxes. An empty `name` is omitted // so the server treats it as "unset" rather than "explicit empty string". 
func (a *sandboxAPI) create(ctx context.Context, name string) (*createResponse, error) { body := createRequest{Sandbox: sandboxCreateBody{Name: name}} var resp createResponse - err := a.c.Do(ctx, http.MethodPost, sandboxAPIPath, a.headers(), nil, body, &resp) + err := a.do(ctx, http.MethodPost, sandboxAPIPath, body, &resp) if err != nil { - return nil, translateError(err) + return nil, err } return &resp, nil } @@ -254,9 +298,9 @@ func (a *sandboxAPI) listPage(ctx context.Context, pageToken string) (*listRespo query["page_token"] = pageToken } var resp listResponse - err := a.c.Do(ctx, http.MethodGet, sandboxAPIPath, a.headers(), nil, query, &resp) + err := a.do(ctx, http.MethodGet, sandboxAPIPath, query, &resp) if err != nil { - return nil, translateError(err) + return nil, err } return &resp, nil } @@ -264,9 +308,9 @@ func (a *sandboxAPI) listPage(ctx context.Context, pageToken string) (*listRespo // get calls GET /api/2.0/lakebox/sandboxes/{id}. func (a *sandboxAPI) get(ctx context.Context, id string) (*sandboxEntry, error) { var resp sandboxEntry - err := a.c.Do(ctx, http.MethodGet, sandboxPath(id), a.headers(), nil, nil, &resp) + err := a.do(ctx, http.MethodGet, sandboxPath(id), nil, &resp) if err != nil { - return nil, translateError(err) + return nil, err } return &resp, nil } @@ -288,16 +332,16 @@ func (a *sandboxAPI) update(ctx context.Context, id string, name *string, idleTi NoAutostop: noAutostop, } var resp sandboxEntry - err := a.c.Do(ctx, http.MethodPatch, sandboxPath(id), a.headers(), nil, body, &resp) + err := a.do(ctx, http.MethodPatch, sandboxPath(id), body, &resp) if err != nil { - return nil, translateError(err) + return nil, err } return &resp, nil } // delete calls DELETE /api/2.0/lakebox/sandboxes/{id}. 
func (a *sandboxAPI) delete(ctx context.Context, id string) error { - return translateError(a.c.Do(ctx, http.MethodDelete, sandboxPath(id), a.headers(), nil, nil, nil)) + return a.do(ctx, http.MethodDelete, sandboxPath(id), nil, nil) } // stop calls POST /api/2.0/lakebox/sandboxes/{id}/stop and returns the @@ -305,9 +349,9 @@ func (a *sandboxAPI) delete(ctx context.Context, id string) error { func (a *sandboxAPI) stop(ctx context.Context, id string) (*sandboxEntry, error) { body := map[string]string{"sandbox_id": id} var resp sandboxEntry - err := a.c.Do(ctx, http.MethodPost, sandboxPath(id)+"/stop", a.headers(), nil, body, &resp) + err := a.do(ctx, http.MethodPost, sandboxPath(id)+"/stop", body, &resp) if err != nil { - return nil, translateError(err) + return nil, err } return &resp, nil } @@ -317,9 +361,9 @@ func (a *sandboxAPI) stop(ctx context.Context, id string) (*sandboxEntry, error) func (a *sandboxAPI) start(ctx context.Context, id string) (*sandboxEntry, error) { body := map[string]string{"sandbox_id": id} var resp sandboxEntry - err := a.c.Do(ctx, http.MethodPost, sandboxPath(id)+"/start", a.headers(), nil, body, &resp) + err := a.do(ctx, http.MethodPost, sandboxPath(id)+"/start", body, &resp) if err != nil { - return nil, translateError(err) + return nil, err } return &resp, nil } @@ -331,9 +375,9 @@ func (a *sandboxAPI) start(ctx context.Context, id string) (*sandboxEntry, error // `create` call. 
func (a *sandboxAPI) registerKey(ctx context.Context, publicKey, name string) (*sshKeyEntry, error) { var resp sshKeyEntry - err := a.c.Do(ctx, http.MethodPost, sandboxKeysAPIPath, a.headers(), nil, registerKeyRequest{PublicKey: publicKey, Name: name}, &resp) + err := a.do(ctx, http.MethodPost, sandboxKeysAPIPath, registerKeyRequest{PublicKey: publicKey, Name: name}, &resp) if err != nil { - return nil, translateError(err) + return nil, err } return &resp, nil } @@ -360,14 +404,14 @@ type listKeysResponse struct { // listKeys calls GET /api/2.0/lakebox/ssh-keys. func (a *sandboxAPI) listKeys(ctx context.Context) ([]sshKeyEntry, error) { var resp listKeysResponse - err := a.c.Do(ctx, http.MethodGet, sandboxKeysAPIPath, a.headers(), nil, nil, &resp) + err := a.do(ctx, http.MethodGet, sandboxKeysAPIPath, nil, &resp) if err != nil { - return nil, translateError(err) + return nil, err } return resp.SshKeys, nil } // deleteKey calls DELETE /api/2.0/lakebox/ssh-keys/{key_hash}. func (a *sandboxAPI) deleteKey(ctx context.Context, keyHash string) error { - return translateError(a.c.Do(ctx, http.MethodDelete, sandboxKeysAPIPath+"/"+url.PathEscape(keyHash), a.headers(), nil, nil, nil)) + return a.do(ctx, http.MethodDelete, sandboxKeysAPIPath+"/"+url.PathEscape(keyHash), nil, nil) } diff --git a/cmd/sandbox/api_test.go b/cmd/sandbox/api_test.go index 2fa57e1e6f..21d58436a6 100644 --- a/cmd/sandbox/api_test.go +++ b/cmd/sandbox/api_test.go @@ -27,7 +27,20 @@ func TestTranslateErrorRewrites503(t *testing.T) { orig := &apierr.APIError{StatusCode: http.StatusServiceUnavailable, Message: "Service Unavailable"} err := translateError(orig) require.Error(t, err) - assert.Equal(t, "the Databricks Sandboxes feature is not available in your region", err.Error()) + assert.Equal(t, "the Databricks Sandboxes feature is not available in your region, or the service is temporarily unavailable", err.Error()) +} + +func TestAllow503RetryConsumesBudget(t *testing.T) { + ctx := 
arm503Budget(t.Context()) + // max503Attempts-1 retries are allowed, then the budget is exhausted. + for range max503Attempts - 1 { + assert.True(t, allow503Retry(ctx)) + } + assert.False(t, allow503Retry(ctx)) +} + +func TestAllow503RetryUnarmedContext(t *testing.T) { + assert.False(t, allow503Retry(t.Context())) } func TestTranslateErrorPassesThroughOthers(t *testing.T) { From 53cf62b07a034416487e8c639918d69c199366fc Mon Sep 17 00:00:00 2001 From: Anwell Wang Date: Fri, 12 Jun 2026 19:00:01 +0000 Subject: [PATCH 3/4] sandbox: Use singular "Databricks Sandbox" in the unavailability message Co-authored-by: Isaac --- acceptance/cmd/sandbox/list/region-unavailable/output.txt | 2 +- cmd/sandbox/api.go | 2 +- cmd/sandbox/api_test.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/acceptance/cmd/sandbox/list/region-unavailable/output.txt b/acceptance/cmd/sandbox/list/region-unavailable/output.txt index e296e2c8e8..564ddd8a4f 100644 --- a/acceptance/cmd/sandbox/list/region-unavailable/output.txt +++ b/acceptance/cmd/sandbox/list/region-unavailable/output.txt @@ -1,3 +1,3 @@ -Error: failed to list sandboxes: the Databricks Sandboxes feature is not available in your region, or the service is temporarily unavailable +Error: failed to list sandboxes: the Databricks Sandbox feature is not available in your region, or the service is temporarily unavailable Exit code: 1 diff --git a/cmd/sandbox/api.go b/cmd/sandbox/api.go index 0642c38ae4..892883b361 100644 --- a/cmd/sandbox/api.go +++ b/cmd/sandbox/api.go @@ -70,7 +70,7 @@ func allow503Retry(ctx context.Context) bool { // nothing for the user, so it is dropped rather than wrapped. 
func translateError(err error) error { if apiErr, ok := errors.AsType[*apierr.APIError](err); ok && apiErr.StatusCode == http.StatusServiceUnavailable { - return errors.New("the Databricks Sandboxes feature is not available in your region, or the service is temporarily unavailable") + return errors.New("the Databricks Sandbox feature is not available in your region, or the service is temporarily unavailable") } return err } diff --git a/cmd/sandbox/api_test.go b/cmd/sandbox/api_test.go index 21d58436a6..e9bf139c0a 100644 --- a/cmd/sandbox/api_test.go +++ b/cmd/sandbox/api_test.go @@ -27,7 +27,7 @@ func TestTranslateErrorRewrites503(t *testing.T) { orig := &apierr.APIError{StatusCode: http.StatusServiceUnavailable, Message: "Service Unavailable"} err := translateError(orig) require.Error(t, err) - assert.Equal(t, "the Databricks Sandboxes feature is not available in your region, or the service is temporarily unavailable", err.Error()) + assert.Equal(t, "the Databricks Sandbox feature is not available in your region, or the service is temporarily unavailable", err.Error()) } func TestAllow503RetryConsumesBudget(t *testing.T) { From b30a09daad24aaea931008f97c5590a37d2521af Mon Sep 17 00:00:00 2001 From: Anwell Wang Date: Fri, 12 Jun 2026 19:05:23 +0000 Subject: [PATCH 4/4] sandbox: Tighten comments Co-authored-by: Isaac --- .../sandbox/list/region-unavailable/test.toml | 5 ++--- cmd/sandbox/api.go | 20 ++++++++----------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/acceptance/cmd/sandbox/list/region-unavailable/test.toml b/acceptance/cmd/sandbox/list/region-unavailable/test.toml index bfb96a3b4d..fc08c98d7c 100644 --- a/acceptance/cmd/sandbox/list/region-unavailable/test.toml +++ b/acceptance/cmd/sandbox/list/region-unavailable/test.toml @@ -1,6 +1,5 @@ -# Workspaces in regions without the sandbox service answer every lakebox -# request with a gateway-level 503. 
The CLI retries it max503Attempts times, -# then reports the hedged region/availability message. +# A persistent 503 is attempted max503Attempts times, then reported as a +# user-facing unavailability error. [[Server]] Pattern = "GET /api/2.0/lakebox/sandboxes" Response.StatusCode = 503 diff --git a/cmd/sandbox/api.go b/cmd/sandbox/api.go index 892883b361..74cc444a58 100644 --- a/cmd/sandbox/api.go +++ b/cmd/sandbox/api.go @@ -40,16 +40,14 @@ const ( sandboxKeysAPIPath = sandboxAPIRoot + "/ssh-keys" ) -// max503Attempts caps attempts when the server keeps answering 503: regions -// without the sandbox service 503 forever, and the SDK's default 5-minute -// retry budget would hang the command. A couple of retries still absorbs -// transient blips. +// max503Attempts caps attempts when the server keeps answering 503; the +// SDK's default budget would otherwise retry for up to 5 minutes. const max503Attempts = 3 type attempt503CounterKey struct{} -// arm503Budget attaches a fresh 503 attempt counter to the request context. -// Retries of one request run sequentially, so a plain *int suffices. +// arm503Budget attaches the request's 503 attempt counter; retries of one +// request run sequentially, so a plain *int suffices. func arm503Budget(ctx context.Context) context.Context { return context.WithValue(ctx, attempt503CounterKey{}, new(int)) } @@ -65,9 +63,8 @@ func allow503Retry(ctx context.Context) bool { return *n < max503Attempts } -// translateError rewrites a 503 — after max503Attempts it usually means the -// sandbox service is not deployed in this region. The gateway 503 body adds -// nothing for the user, so it is dropped rather than wrapped. +// translateError replaces a 503 with a user-facing message; the gateway +// body adds nothing, so it is dropped rather than wrapped. 
func translateError(err error) error { if apiErr, ok := errors.AsType[*apierr.APIError](err); ok && apiErr.StatusCode == http.StatusServiceUnavailable { return errors.New("the Databricks Sandbox feature is not available in your region, or the service is temporarily unavailable") @@ -220,9 +217,8 @@ func newSandboxAPI(w *databricks.WorkspaceClient) (*sandboxAPI, error) { return nil, fmt.Errorf("failed to create sandbox API client: %w", err) } defaultRetriable := clientCfg.ErrorRetriable - // Cap 503 retries by count, not deadline: the final 503 halts the retry - // loop with the APIError itself, so translateError always sees the real - // status code instead of a racy context.DeadlineExceeded. + // Cap 503 retries by count, not deadline, so the final 503 surfaces as + // the APIError rather than a racy context.DeadlineExceeded. clientCfg.ErrorRetriable = func(ctx context.Context, err error) bool { if apiErr, ok := errors.AsType[*apierr.APIError](err); ok && apiErr.StatusCode == http.StatusServiceUnavailable { return allow503Retry(ctx)