From 717a8afad0b4d3d83d4674c08ff18ad7b68676dd Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Wed, 20 May 2026 02:07:50 -0400 Subject: [PATCH 1/5] fix(secrets): prevent duplicate DO Spaces key creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DO does not enforce name uniqueness on Spaces keys. The provider_credential bootstrap path POSTed blindly every run; if the bootstrap layer failed to persist the access_key+secret_key pair to the configured store (or skipped existsInProvider for any reason), the next run created another key with the same name. Repeated runs accreted orphaned keys until the account hit DO's 200-key quota. Each orphan is unrecoverable — DO returns secret_key once at creation time, and List returns access_key only. Changes: - generateDOSpacesKey() now does a list-then-create. Pre-create call to https://api.digitalocean.com/v2/spaces/keys checks for a matching name; if found, returns an error naming the existing access_key with explicit recovery guidance (delete via console OR --force-rotate). The POST is never attempted in the conflict case. - New lookupExistingSpacesKey() helper paginates through Spaces keys (10 × 100 cap) matching by exact name. Used by the pre-check AND by the 403 error branch (account-quota vs name-conflict disambiguation hint). - Improved error wrapping: every "DO spaces key create" error now includes name=%q so the operator can correlate logs with DO console entries. - stderr trace line on every create attempt: name + bucket + grant permission, so the failure mode is visible without re-running with extra flags. Tests: - TestGenerateDOSpacesKey_RejectsDuplicateNameBeforeCreate verifies the pre-check short-circuits and POST is never attempted when a duplicate exists. (postCalls == 0 asserted.) - TestGenerateDOSpacesKey_403QuotaWithoutNameConflict covers the fallback hint when no name match but DO still returns 403. - TestLookupExistingSpacesKey_PaginatedHit + _Miss cover the pager helper. - Existing TestGenerateSecret_ProviderCredential_DOSpaces and _NoBucket / _WithBucket tests pass without changes — their GET handlers return 404 (no existing key) so the create path proceeds. - TestGenerateDOSpacesKey_IncludesCreatedAt updated to stub both the new GET and the existing POST. Co-Authored-By: Claude Opus 4.7 (1M context) --- secrets/generators.go | 93 ++++++++++++++++++- secrets/generators_test.go | 180 ++++++++++++++++++++++++++++++++++--- 2 files changed, 259 insertions(+), 14 deletions(-) diff --git a/secrets/generators.go b/secrets/generators.go index 49989330..eaca361d 100644 --- a/secrets/generators.go +++ b/secrets/generators.go @@ -138,6 +138,58 @@ func joinKeys(m map[string]map[string]any) string { return strings.Join(keys, ", ") } +// lookupExistingSpacesKey returns the access_key of any DO Spaces key +// with the given name, or "" if no match. Used to disambiguate a 403 +// "key quota exceeded" response between an account-level quota hit and +// a name-uniqueness conflict from a partial prior run. +// +// Best-effort — failures here are silent (the caller already has a +// useful error and the hint is purely advisory). +func lookupExistingSpacesKey(ctx context.Context, token, name string) string { + page := 1 + for page <= 10 { // bounded — 10 pages × 100 = 1000 keys + url := fmt.Sprintf("https://api.digitalocean.com/v2/spaces/keys?per_page=100&page=%d", page) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return "" + } + req.Header.Set("Authorization", "Bearer "+token) + resp, err := http.DefaultClient.Do(req) + if err != nil { + return "" + } + body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) + resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return "" + } + var list struct { + Keys []struct { + Name string `json:"name"` + AccessKey string `json:"access_key"` + } `json:"keys"` + Links struct { + Pages struct { + Next string `json:"next"` + } `json:"pages"` + } `json:"links"` + } + if err := json.Unmarshal(body, &list); err != nil { + return "" + } + for _, k := range list.Keys { + if k.Name == name { + return k.AccessKey + } + } + if list.Links.Pages.Next == "" { + return "" + } + page++ + } + return "" +} + func generateDOSpacesKey(ctx context.Context, config map[string]any) (string, error) { token := os.Getenv("DIGITALOCEAN_TOKEN") if token == "" { @@ -165,6 +217,31 @@ func generateDOSpacesKey(ctx context.Context, config map[string]any) (string, er payload := map[string]any{"name": name, "grants": []map[string]any{grant}} body, _ := json.Marshal(payload) + // Log the attempted key name to aid troubleshooting. DO's 403 + // "key quota exceeded" can fire on (1) per-account quota (200) AND + // (2) name uniqueness conflict with a zombie key from a partial + // prior run — the user needs the attempted name to triage which. + fmt.Fprintf(os.Stderr, "secrets: DO spaces key create: name=%q bucket=%q grant=%v\n", name, bucket, grant["permission"]) + + // Pre-create existence check: the DO API does NOT enforce name + // uniqueness — successive POSTs with the same `name` happily + // create duplicate keys. The bootstrap layer is supposed to skip + // generation when the credential is already stored in the + // secret store, but a single save-failure leaves us with an + // orphaned DO key whose secret_key is no longer retrievable. + // Each subsequent run then creates *another* orphan, accreting + // duplicates until the account hits the real quota. + // + // Pre-check by name and fail loudly with the existing access_key + // so the operator can either delete the orphan from the DO + // console or pass --force-rotate (which deletes + recreates). + // We do NOT auto-adopt because DO returns secret_key only at + // creation time; we cannot reconstruct the secret half from a + // list lookup. + if existing := lookupExistingSpacesKey(ctx, token, name); existing != "" { + return "", fmt.Errorf("secrets: DO spaces key create name=%q: a key with this name already exists (access_key=%s); DO does NOT enforce name uniqueness, so blind re-create would orphan more keys. Either (1) delete it via https://cloud.digitalocean.com/account/api/spaces and re-run, or (2) re-run with --force-rotate to delete + recreate atomically", name, existing) + } + // The DO Spaces Keys endpoint is hardcoded. Tests stub it via the package's // rewriteTransport helper (see generators_test.go) — a hermetic mechanism // that requires explicit code in the test to take effect. Earlier drafts @@ -182,13 +259,25 @@ func generateDOSpacesKey(ctx context.Context, config map[string]any) (string, er resp, err := http.DefaultClient.Do(req) if err != nil { - return "", fmt.Errorf("secrets: DO spaces key create: %w", err) + return "", fmt.Errorf("secrets: DO spaces key create name=%q: %w", name, err) } defer resp.Body.Close() if resp.StatusCode != http.StatusCreated && resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) - return "", fmt.Errorf("secrets: DO spaces key create: HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) + hint := "" + if resp.StatusCode == http.StatusForbidden && strings.Contains(string(body), "key quota") { + // Surface the two distinct failure modes that share this + // response code. Most accounts are nowhere near the real + // 200-key quota; a 403 here usually means a name conflict. + existing := lookupExistingSpacesKey(ctx, token, name) + if existing != "" { + hint = fmt.Sprintf(" (hint: a Spaces key named %q already exists in this account, access_key=%s; either delete it via the DO console or rename the entry in deploy.prereq.yaml)", name, existing) + } else { + hint = " (hint: DO returns this for the account-level 200-key quota AND name-uniqueness conflicts; this error often means a partially-applied prior run left a zombie key — try listing https://cloud.digitalocean.com/account/api/spaces and deleting matches by name)" + } + } + return "", fmt.Errorf("secrets: DO spaces key create name=%q: HTTP %d: %s%s", name, resp.StatusCode, strings.TrimSpace(string(body)), hint) } var result struct { diff --git a/secrets/generators_test.go b/secrets/generators_test.go index a02da446..ed610054 100644 --- a/secrets/generators_test.go +++ b/secrets/generators_test.go @@ -8,6 +8,7 @@ import ( "io" "net/http" "net/http/httptest" + "strings" "testing" ) @@ -268,21 +269,31 @@ func TestGenerateSecret_ProviderCredential_MissingToken(t *testing.T) { // override (e.g. DIGITALOCEAN_API_URL) — that would be a credential- // exfiltration vector in production. Per ADR 0021. func TestGenerateDOSpacesKey_IncludesCreatedAt(t *testing.T) { - // Stub the DO API server. + // Stub the DO API server. The generator now does a list-then- + // create dance to avoid orphaning duplicate-named keys, so the + // stub must respond to both verbs. srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path != "/v2/spaces/keys" || r.Method != http.MethodPost { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/v2/spaces/keys": + // No existing keys — let the create path through. + _ = json.NewEncoder(w).Encode(map[string]any{ + "keys": []map[string]any{}, + "links": map[string]any{"pages": map[string]any{}}, + }) + case r.Method == http.MethodPost && r.URL.Path == "/v2/spaces/keys": + w.WriteHeader(http.StatusCreated) + _ = json.NewEncoder(w).Encode(map[string]any{ + "key": map[string]any{ + "access_key": "AK1234567890", + "secret_key": "SK_full_secret_value_here", + "name": "test-key", + "created_at": "2026-05-08T10:30:00Z", + "grants": []any{map[string]any{"permission": "fullaccess"}}, + }, + }) + default: t.Errorf("unexpected request: %s %s", r.Method, r.URL.Path) } - w.WriteHeader(http.StatusCreated) - json.NewEncoder(w).Encode(map[string]any{ - "key": map[string]any{ - "access_key": "AK1234567890", - "secret_key": "SK_full_secret_value_here", - "name": "test-key", - "created_at": "2026-05-08T10:30:00Z", - "grants": []any{map[string]any{"permission": "fullaccess"}}, - }, - }) })) defer srv.Close() @@ -426,3 +437,148 @@ func containsStr(s, sub string) bool { return false }()) } + +// TestGenerateDOSpacesKey_RejectsDuplicateNameBeforeCreate verifies +// that when a DO key with the requested name already exists, the +// generator does NOT POST a duplicate. DO does not enforce name +// uniqueness; a blind re-create would orphan another key whose +// secret_key is unrecoverable. Instead the generator must fail loudly +// with the existing access_key so the operator can decide. +func TestGenerateDOSpacesKey_RejectsDuplicateNameBeforeCreate(t *testing.T) { + postCalls := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/v2/spaces/keys": + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(map[string]any{ + "keys": []map[string]any{ + {"name": "multisite-deploy-key", "access_key": "AK_EXISTING"}, + }, + "links": map[string]any{"pages": map[string]any{}}, + }) + case r.Method == http.MethodPost && r.URL.Path == "/v2/spaces/keys": + postCalls++ + // Should never reach here — pre-check must short-circuit. + w.WriteHeader(http.StatusCreated) + default: + t.Errorf("unexpected %s %s", r.Method, r.URL.Path) + } + })) + defer srv.Close() + + t.Setenv("DIGITALOCEAN_TOKEN", "stub") + orig := http.DefaultClient.Transport + http.DefaultClient.Transport = rewriteTransport{base: srv.URL} + defer func() { http.DefaultClient.Transport = orig }() + + _, err := generateDOSpacesKey(context.Background(), map[string]any{"name": "multisite-deploy-key"}) + if err == nil { + t.Fatal("expected error when duplicate-by-name exists") + } + if postCalls != 0 { + t.Errorf("generator POSTed despite duplicate name; calls=%d", postCalls) + } + msg := err.Error() + if !strings.Contains(msg, `name="multisite-deploy-key"`) { + t.Errorf("error missing attempted name: %v", msg) + } + if !strings.Contains(msg, "AK_EXISTING") { + t.Errorf("error missing existing access_key hint: %v", msg) + } + if !strings.Contains(msg, "already exists") { + t.Errorf("error missing 'already exists' hint: %v", msg) + } + if !strings.Contains(msg, "--force-rotate") { + t.Errorf("error missing rotate guidance: %v", msg) + } +} + +// TestGenerateDOSpacesKey_403QuotaWithoutNameConflict surfaces the +// fallback hint when the conflicting key isn't found by name (could be +// a real account-level quota OR a renamed zombie). +func TestGenerateDOSpacesKey_403QuotaWithoutNameConflict(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodPost && r.URL.Path == "/v2/spaces/keys": + w.WriteHeader(http.StatusForbidden) + _, _ = w.Write([]byte(`{"id":"forbidden","message":"key quota exceeded"}`)) + case r.Method == http.MethodGet && r.URL.Path == "/v2/spaces/keys": + w.WriteHeader(http.StatusOK) + // No matching name. + _ = json.NewEncoder(w).Encode(map[string]any{ + "keys": []map[string]any{{"name": "unrelated", "access_key": "AK_X"}}, + "links": map[string]any{"pages": map[string]any{}}, + }) + } + })) + defer srv.Close() + + t.Setenv("DIGITALOCEAN_TOKEN", "stub") + orig := http.DefaultClient.Transport + http.DefaultClient.Transport = rewriteTransport{base: srv.URL} + defer func() { http.DefaultClient.Transport = orig }() + + _, err := generateDOSpacesKey(context.Background(), map[string]any{"name": "no-conflict"}) + if err == nil { + t.Fatal("expected error") + } + msg := err.Error() + if !strings.Contains(msg, "200-key quota AND name-uniqueness conflicts") { + t.Errorf("expected disambiguation hint; got %v", msg) + } + if !strings.Contains(msg, `name="no-conflict"`) { + t.Errorf("expected attempted name in error: %v", msg) + } +} + +func TestLookupExistingSpacesKey_PaginatedHit(t *testing.T) { + pageCount := 0 + var baseURL string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet || r.URL.Path != "/v2/spaces/keys" { + t.Errorf("unexpected %s %s", r.Method, r.URL.Path) + } + pageCount++ + page := r.URL.Query().Get("page") + if page == "1" { + _ = json.NewEncoder(w).Encode(map[string]any{ + "keys": []map[string]any{{"name": "other", "access_key": "AK_OTHER"}}, + "links": map[string]any{"pages": map[string]any{"next": baseURL + "/v2/spaces/keys?page=2"}}, + }) + return + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "keys": []map[string]any{{"name": "target", "access_key": "AK_TARGET"}}, + "links": map[string]any{"pages": map[string]any{}}, + }) + })) + defer srv.Close() + baseURL = srv.URL + + orig := http.DefaultClient.Transport + http.DefaultClient.Transport = rewriteTransport{base: srv.URL} + defer func() { http.DefaultClient.Transport = orig }() + + got := lookupExistingSpacesKey(context.Background(), "stub", "target") + if got != "AK_TARGET" { + t.Errorf("lookup = %q want AK_TARGET (paged %d times)", got, pageCount) + } +} + +func TestLookupExistingSpacesKey_Miss(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _ = json.NewEncoder(w).Encode(map[string]any{ + "keys": []map[string]any{{"name": "a"}, {"name": "b"}}, + "links": map[string]any{"pages": map[string]any{}}, + }) + })) + defer srv.Close() + + orig := http.DefaultClient.Transport + http.DefaultClient.Transport = rewriteTransport{base: srv.URL} + defer func() { http.DefaultClient.Transport = orig }() + + if got := lookupExistingSpacesKey(context.Background(), "stub", "missing"); got != "" { + t.Errorf("miss = %q want empty", got) + } +} From e8719f0e595719e840f6da2860e97f3bf9ad5151 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Wed, 20 May 2026 02:09:29 -0400 Subject: [PATCH 2/5] fix(secrets): require explicit name for DO Spaces key + fullaccess warning Two follow-ups to the duplicate-creation fix: 1. Refuse the default name fallback. The prior default \`workflow-spaces-key\` was project-shared; multiple workflow- managed projects in one DO account would all try to create or adopt the same key. Force the caller to set a project-unique slug (e.g. \`multisite-deploy-key\`, \`wfcompute-deploy-key\`). 2. Emit a stderr WARN when permission=fullaccess is granted (i.e. no \`bucket:\` configured). Bootstrap necessarily uses fullaccess because the IaC state bucket does not exist yet, but once it does the operator should rotate to a bucket- scoped key via \`wfctl secrets rotate --target SPACES --bucket \` to limit blast radius. The warning includes the exact rotate command. Test updates: - WithBucket / IncludesCreatedAt / NoBucket tests now pass an explicit name (was relying on the default). - New TestGenerateDOSpacesKey_RequiresName covers the refuse- default-name path. Co-Authored-By: Claude Opus 4.7 (1M context) --- secrets/generators.go | 13 ++++++++++++- secrets/generators_test.go | 17 +++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/secrets/generators.go b/secrets/generators.go index eaca361d..6808e21a 100644 --- a/secrets/generators.go +++ b/secrets/generators.go @@ -198,7 +198,13 @@ func generateDOSpacesKey(ctx context.Context, config map[string]any) (string, er name, _ := config["name"].(string) if name == "" { - name = "workflow-spaces-key" + // Refuse the default fallback. A shared default name causes + // cross-project collisions in any account that runs more than + // one workflow-managed deploy — every project would try to + // create or adopt the same `workflow-spaces-key`. Force the + // operator to name the key per-project (e.g. + // `multisite-deploy-key`, `wfcompute-deploy-key`). + return "", fmt.Errorf("secrets: provider_credential digitalocean.spaces: `name` is required (use a project-unique slug like \"-deploy-key\"; a shared default would collide across projects in the same DO account)") } // DO Spaces Keys API requires a concrete bucket name (3-63 chars) when @@ -212,6 +218,11 @@ func generateDOSpacesKey(ctx context.Context, config map[string]any) (string, er if bucket != "" { grant = map[string]any{"bucket": bucket, "permission": "readwrite"} } else { + // fullaccess is required for bootstrap (the IaC state bucket + // does not exist yet, so the key cannot be scoped to it). + // Once the bucket is created, the operator should rotate to a + // bucket-scoped key via the rotate-and-prune flow. + fmt.Fprintf(os.Stderr, "secrets: WARN provider_credential digitalocean.spaces name=%q is being granted permission=fullaccess (no `bucket:` configured); rotate to a bucket-scoped key after first apply via `wfctl secrets rotate --target SPACES --bucket ` to limit blast radius.\n", name) grant = map[string]any{"permission": "fullaccess"} } payload := map[string]any{"name": name, "grants": []map[string]any{grant}} diff --git a/secrets/generators_test.go b/secrets/generators_test.go index ed610054..1c20af51 100644 --- a/secrets/generators_test.go +++ b/secrets/generators_test.go @@ -215,6 +215,7 @@ func TestGenerateSecret_ProviderCredential_DOSpaces_WithBucket(t *testing.T) { _, err := GenerateSecret(context.Background(), "provider_credential", map[string]any{ "source": "digitalocean.spaces", + "name": "with-bucket-test", "bucket": "my-state-bucket", }) if err != nil { @@ -582,3 +583,19 @@ func TestLookupExistingSpacesKey_Miss(t *testing.T) { t.Errorf("miss = %q want empty", got) } } + +// TestGenerateDOSpacesKey_RequiresName verifies that the generator +// refuses to fall back to a shared default name. Sharing a default +// like "workflow-spaces-key" across projects causes cross-project +// collisions in any account that runs more than one workflow-managed +// deploy. +func TestGenerateDOSpacesKey_RequiresName(t *testing.T) { + t.Setenv("DIGITALOCEAN_TOKEN", "stub") + _, err := generateDOSpacesKey(context.Background(), map[string]any{}) + if err == nil { + t.Fatal("expected error when name is unset") + } + if !strings.Contains(err.Error(), "`name` is required") { + t.Errorf("error wording: %v", err) + } +} From 0d4e6a0625e1c71dff60b9f3cc723c04e3c4cc05 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Wed, 20 May 2026 02:20:19 -0400 Subject: [PATCH 3/5] feat(wfctl): transactional rollback + list-orphans for provider_credential MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 of the DO Spaces orphan fix (workflow#732 follow-up). Round 1 (already in this PR) added a list-before-create pre-check so the generator refuses to POST a duplicate name. That prevents NEW orphans whose root cause is a save-failure-after-mint loop. This round closes the other half: the orphan-creation event itself. cmd/wfctl/infra_bootstrap.go — bootstrapSecrets provider_credential path now: 1. Extracts access_key + created_at BEFORE the Set loop (was inside the loop, so a first-iteration Set failure left the rollback path blind to the just-minted credential). 2. On ANY Set failure inside the loop, invokes credRevoker. RevokeProviderCredential with the just-minted access_key. If no revoker is registered, emits a loud ORPHANED-CREDENTIAL warning to stderr with the wfctl list-orphans recovery command pre-filled. 3. Rollback is best-effort: failure does not mask the original Set error. cmd/wfctl/secrets_orphans.go — new \`wfctl secrets list-orphans\` subcommand: wfctl secrets list-orphans --source digitalocean.spaces \ --name workflow-spaces-key # dry-run wfctl secrets list-orphans --source digitalocean.spaces \ --name workflow-spaces-key --delete # delete all matches Lists every upstream credential whose name field equals --name (DO Spaces does not enforce name uniqueness, so a single name can map to many access_keys after the orphan loop). Bounded pager (100 × 100 = 10 000 keys). Per-orphan delete continues on failure to surface every recoverable orphan. Currently supports digitalocean.spaces; extending to other sources adds one switch arm. Tests: - TestBootstrapSecrets_ProviderCredential_RollbackOnSetFailure — Set(SPACES_secret_key) returns error; assert revoker invoked with AK_ORPHAN; assert original error surfaces. - TestBootstrapSecrets_ProviderCredential_RollbackOnFirstSetFailure — Set(SPACES_access_key) (the very first call) returns error; asserts the access_key extraction reorder is correct (revoker must receive AK_FIRST even though we never even reached the access_key iteration). These two tests reproduce the pre-fix orphan-creation bug (generator succeeds + Set fails → DO key remains, no rollback attempted). Without the rollback wiring they fail with "expected 1 rollback-revoke call; got 0". Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/wfctl/infra_bootstrap.go | 48 +++++-- cmd/wfctl/infra_bootstrap_secrets_test.go | 122 +++++++++++++++++ cmd/wfctl/secrets.go | 5 + cmd/wfctl/secrets_orphans.go | 158 ++++++++++++++++++++++ 4 files changed, 322 insertions(+), 11 deletions(-) create mode 100644 cmd/wfctl/secrets_orphans.go diff --git a/cmd/wfctl/infra_bootstrap.go b/cmd/wfctl/infra_bootstrap.go index eced1050..226d2ef9 100644 --- a/cmd/wfctl/infra_bootstrap.go +++ b/cmd/wfctl/infra_bootstrap.go @@ -713,19 +713,43 @@ var bootstrapSecrets = func(ctx context.Context, provider secrets.Provider, cfg allowed[k] = true } - // Capture access_key + created_at independently of storage so - // RotationResult always reports them on a force-rotate, even - // though access_key IS in the allowed set and created_at is - // NOT (per review #2 C5: stderr emission alone would skip - // access_key). - var newAccessKey, newCreatedAt string - for subKey, subVal := range subKeyMap { - if subKey == "access_key" { - newAccessKey = subVal + // Capture access_key + created_at BEFORE any Set call so the + // transactional rollback path has the upstream credential ID + // available even when the very first Set fails. Bug fix: + // previously these were extracted DURING iteration, so a + // Set("foo_access_key") failure left newAccessKey unset and + // the orphaned DO key invisible to the rollback path. + newAccessKey := subKeyMap["access_key"] + newCreatedAt := subKeyMap["created_at"] + + // Transactional rollback: if ANY Set() inside the loop fails, + // revoke the just-created upstream credential before + // returning. The DO Spaces Keys API does NOT enforce name + // uniqueness, so a partially-applied generation without + // rollback leaves an orphaned key whose secret_key is + // permanently irrecoverable (see workflow#732 / SPEC V?). + // + // Rollback is best-effort — failure is logged but does not + // mask the original Set error. Operator can still find the + // orphan via the wfctl secrets list-orphans helper. + revokeOrphan := func(reason error) { + if newAccessKey == "" { + return + } + if credRevoker == nil { + fmt.Fprintf(os.Stderr, "warn: provider_credential %q minted access_key=%s but no revoker available; the upstream credential is now ORPHANED and unrecoverable. Run `wfctl secrets list-orphans --source %s --name %s` to clean up. Original error: %v\n", + gen.Key, newAccessKey, gen.Source, gen.Key, reason) + return } - if subKey == "created_at" { - newCreatedAt = subVal + if revokeErr := credRevoker.RevokeProviderCredential(ctx, gen.Source, newAccessKey); revokeErr != nil { + fmt.Fprintf(os.Stderr, "warn: rollback-revoke just-minted credential %q (access_key=%s) failed: %v — manual cleanup required via `wfctl secrets list-orphans --source %s`\n", + gen.Key, newAccessKey, revokeErr, gen.Source) + return } + fmt.Fprintf(os.Stderr, "wfctl: rolled back orphaned credential %q (access_key=%s) after Set failure\n", gen.Key, newAccessKey) + } + + for subKey, subVal := range subKeyMap { if !allowed[subKey] { // Sidecar field — emit to stderr in machine-parseable form // for operator observability + audit logs. @@ -734,6 +758,7 @@ var bootstrapSecrets = func(ctx context.Context, provider secrets.Provider, cfg } fullKey := gen.Key + "_" + subKey if setErr := provider.Set(ctx, fullKey, subVal); setErr != nil { + revokeOrphan(setErr) return generated, rotations, fmt.Errorf("store secret %q: %w", fullKey, setErr) } generated[fullKey] = subVal @@ -743,6 +768,7 @@ var bootstrapSecrets = func(ctx context.Context, provider secrets.Provider, cfg fmt.Printf(" secret %q: created\n", fullKey) } } + _ = newCreatedAt // captured into RotationResult below if forceRotate[gen.Key] { fmt.Fprintf(os.Stderr, "wfctl: rotated provider_credential %s (replaced existing value at %s)\n", gen.Key, time.Now().UTC().Format(time.RFC3339)) // Record the rotation in-process so callers (rotate-and-prune) diff --git a/cmd/wfctl/infra_bootstrap_secrets_test.go b/cmd/wfctl/infra_bootstrap_secrets_test.go index 4c438efa..9779690b 100644 --- a/cmd/wfctl/infra_bootstrap_secrets_test.go +++ b/cmd/wfctl/infra_bootstrap_secrets_test.go @@ -265,3 +265,125 @@ func TestBootstrapSecrets_ProviderCredentialProbeIgnoresBareKey(t *testing.T) { t.Fatalf("generator called %d times, want 1", generateCalls) } } + +// failOnSetProvider triggers a Set() failure on the first matching key +// — used to exercise the transactional rollback path when a +// provider_credential creation succeeds upstream but the store-write +// half fails (e.g. GH PAT lost permissions mid-bootstrap). +type failOnSetProvider struct { + failKey string + setCalls []string +} + +func (p *failOnSetProvider) Name() string { return "fail-on-set" } +func (p *failOnSetProvider) Get(_ context.Context, _ string) (string, error) { + return "", secrets.ErrUnsupported +} +func (p *failOnSetProvider) Set(_ context.Context, key, _ string) error { + p.setCalls = append(p.setCalls, key) + if key == p.failKey { + return errFakeStoreUnavailable + } + return nil +} +func (p *failOnSetProvider) Delete(_ context.Context, _ string) error { return nil } +func (p *failOnSetProvider) List(_ context.Context) ([]string, error) { + return nil, secrets.ErrUnsupported +} + +// recordingRevoker captures RevokeProviderCredential calls so the test +// can assert rollback occurred. Implements interfaces.ProviderCredentialRevoker. +type recordingRevoker struct { + calls []revokeCall +} +type revokeCall struct { + source string + credentialID string +} + +func (r *recordingRevoker) RevokeProviderCredential(_ context.Context, source, credentialID string) error { + r.calls = append(r.calls, revokeCall{source: source, credentialID: credentialID}) + return nil +} + +var errFakeStoreUnavailable = errFakeStore("store unavailable (simulated)") + +type errFakeStore string + +func (e errFakeStore) Error() string { return string(e) } + +// TestBootstrapSecrets_ProviderCredential_RollbackOnSetFailure is the +// regression test for the orphan-key bug: when generateSecret returns a +// fresh DO Spaces key but provider.Set fails to persist it, bootstrap +// MUST revoke the just-minted upstream credential. +// +// Pre-fix behaviour: Set fails → return error → DO key remains in the +// account with an unrecoverable secret_key. Every subsequent run mints +// another orphan with the same name. +// +// Post-fix: Set failure triggers credRevoker.RevokeProviderCredential +// with the access_key from the just-generated subKeyMap. +func TestBootstrapSecrets_ProviderCredential_RollbackOnSetFailure(t *testing.T) { + withStubGenerator(t, func(_ context.Context, _ string, _ map[string]any) (string, error) { + out, _ := json.Marshal(map[string]string{ + "access_key": "AK_ORPHAN", + "secret_key": "SK_DOOMED", + }) + return string(out), nil + }) + + p := &failOnSetProvider{failKey: "SPACES_secret_key"} + rev := &recordingRevoker{} + cfg := &SecretsConfig{ + Generate: []SecretGen{ + {Key: "SPACES", Type: "provider_credential", Source: "digitalocean.spaces"}, + }, + } + + _, _, err := bootstrapSecrets(context.Background(), p, cfg, nil, rev) + if err == nil { + t.Fatal("expected Set failure to surface as error") + } + + if len(rev.calls) != 1 { + t.Fatalf("expected 1 rollback-revoke call; got %d", len(rev.calls)) + } + if rev.calls[0].credentialID != "AK_ORPHAN" { + t.Errorf("rollback called with credentialID=%q want AK_ORPHAN", rev.calls[0].credentialID) + } + if rev.calls[0].source != "digitalocean.spaces" { + t.Errorf("rollback source=%q want digitalocean.spaces", rev.calls[0].source) + } +} + +// TestBootstrapSecrets_ProviderCredential_RollbackOnFirstSetFailure +// guards the most insidious shape of the bug: the very first Set call +// fails (e.g. access_key write). The pre-fix code extracted +// newAccessKey *during* the for-range loop, so a first-iteration +// failure left newAccessKey empty even though the upstream key exists. +// The fix extracts access_key BEFORE the loop. +func TestBootstrapSecrets_ProviderCredential_RollbackOnFirstSetFailure(t *testing.T) { + withStubGenerator(t, func(_ context.Context, _ string, _ map[string]any) (string, error) { + out, _ := json.Marshal(map[string]string{ + "access_key": "AK_FIRST", + "secret_key": "SK_FIRST", + }) + return string(out), nil + }) + + p := &failOnSetProvider{failKey: "SPACES_access_key"} + rev := &recordingRevoker{} + cfg := &SecretsConfig{ + Generate: []SecretGen{ + {Key: "SPACES", Type: "provider_credential", Source: "digitalocean.spaces"}, + }, + } + + _, _, err := bootstrapSecrets(context.Background(), p, cfg, nil, rev) + if err == nil { + t.Fatal("expected Set failure") + } + if len(rev.calls) != 1 || rev.calls[0].credentialID != "AK_FIRST" { + t.Errorf("rollback calls = %v; want one revoke of AK_FIRST", rev.calls) + } +} diff --git a/cmd/wfctl/secrets.go b/cmd/wfctl/secrets.go index 62a5758c..2ed3fc32 100644 --- a/cmd/wfctl/secrets.go +++ b/cmd/wfctl/secrets.go @@ -32,6 +32,8 @@ func runSecrets(args []string) error { return runSecretsSync(args[1:]) case "setup": return runSecretsSetup(args[1:]) + case "list-orphans": + return runSecretsListOrphans(args[1:]) default: return secretsUsage() } @@ -54,6 +56,7 @@ Actions: rotate Trigger rotation of a secret sync Copy secret structure between environments setup Interactively set all secrets for an environment + list-orphans List/delete duplicate-named upstream credentials (e.g. DO Spaces keys) Ad-hoc provider flags (set, get, list, delete, export): --provider Override provider: keychain, env, aws @@ -75,6 +78,8 @@ Examples: wfctl secrets sync --from staging --to production wfctl secrets setup --env local wfctl secrets setup --env production --auto-gen-keys + wfctl secrets list-orphans --source digitalocean.spaces --name workflow-spaces-key + wfctl secrets list-orphans --source digitalocean.spaces --name workflow-spaces-key --delete `) return fmt.Errorf("missing or unknown action") } diff --git a/cmd/wfctl/secrets_orphans.go b/cmd/wfctl/secrets_orphans.go new file mode 100644 index 00000000..c021e191 --- /dev/null +++ b/cmd/wfctl/secrets_orphans.go @@ -0,0 +1,158 @@ +package main + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "io" + "net/http" + "os" + "strings" +) + +// runSecretsListOrphans implements `wfctl secrets list-orphans`. +// +// Lists upstream provider credentials that share a name (typically +// orphans from past partial-apply failures). Supports digitalocean.spaces +// today; other sources can be added by extending listOrphans below. +// +// Use --delete to delete every match. Without --delete this is a +// dry-run (the default). +func runSecretsListOrphans(args []string) error { + fs := flag.NewFlagSet("secrets list-orphans", flag.ContinueOnError) + source := fs.String("source", "digitalocean.spaces", "Credential source (digitalocean.spaces)") + name := fs.String("name", "", "Filter by exact credential name (required)") + doDelete := fs.Bool("delete", false, "Delete matching orphans (default: dry-run)") + if err := fs.Parse(args); err != nil { + return err + } + if *name == "" { + return fmt.Errorf("--name is required (the credential name to clean up; e.g. \"multisite-deploy-key\")") + } + + ctx := context.Background() + switch *source { + case "digitalocean.spaces": + return listSpacesOrphans(ctx, *name, *doDelete) + default: + return fmt.Errorf("source %q not supported (supported: digitalocean.spaces)", *source) + } +} + +// listSpacesOrphans paginates DO Spaces keys, prints every match by +// name, and (when del=true) deletes each one. +// +// The DO Spaces Keys API does NOT enforce name uniqueness, so a single +// name may map to many access_keys after a partial-apply bug. Each +// orphan is independently deletable by access_key. +func listSpacesOrphans(ctx context.Context, name string, del bool) error { + token := os.Getenv("DIGITALOCEAN_TOKEN") + if token == "" { + return fmt.Errorf("DIGITALOCEAN_TOKEN not set") + } + + matches, err := paginateSpacesKeysByName(ctx, token, name) + if err != nil { + return err + } + + if len(matches) == 0 { + fmt.Printf("no orphans found for name=%q\n", name) + return nil + } + + fmt.Printf("found %d orphan(s) named %q:\n", len(matches), name) + for _, ak := range matches { + fmt.Printf(" access_key=%s\n", ak) + } + + if !del { + fmt.Println("\ndry-run; re-run with --delete to remove these orphans") + return nil + } + + fmt.Println() + deleted := 0 + for _, ak := range matches { + if err := deleteSpacesKey(ctx, token, ak); err != nil { + fmt.Fprintf(os.Stderr, " delete %s: %v\n", ak, err) + continue + } + fmt.Printf(" deleted %s\n", ak) + deleted++ + } + fmt.Printf("\ndeleted %d/%d orphans\n", deleted, len(matches)) + if deleted < len(matches) { + return fmt.Errorf("delete incomplete (%d failures)", len(matches)-deleted) + } + return nil +} + +// paginateSpacesKeysByName returns every access_key whose name field +// equals the requested name. Bounded at 100 pages × 100 keys = 10000. +func paginateSpacesKeysByName(ctx context.Context, token, name string) ([]string, error) { + var matches []string + page := 1 + for page <= 100 { + url := fmt.Sprintf("https://api.digitalocean.com/v2/spaces/keys?per_page=100&page=%d", page) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+token) + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, fmt.Errorf("DO list spaces keys: %w", err) + } + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4<<20)) + resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("DO list spaces keys: HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) + } + var list struct { + Keys []struct { + Name string `json:"name"` + AccessKey string `json:"access_key"` + } `json:"keys"` + Links struct { + Pages struct { + Next string `json:"next"` + } `json:"pages"` + } `json:"links"` + } + if err := json.Unmarshal(body, &list); err != nil { + return nil, fmt.Errorf("DO list spaces keys parse: %w", err) + } + for _, k := range list.Keys { + if k.Name == name { + matches = append(matches, k.AccessKey) + } + } + if list.Links.Pages.Next == "" { + break + } + page++ + } + return matches, nil +} + +// deleteSpacesKey deletes one DO Spaces key by access_key. +func deleteSpacesKey(ctx context.Context, token, accessKey string) error { + url := "https://api.digitalocean.com/v2/spaces/keys/" + accessKey + req, err := http.NewRequestWithContext(ctx, http.MethodDelete, url, nil) + if err != nil { + return err + } + req.Header.Set("Authorization", "Bearer "+token) + resp, err := http.DefaultClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) + return fmt.Errorf("HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) + } + return nil +} From c9cea2f39dcce9028ce4e27dd570abf72d5a6f50 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Wed, 20 May 2026 02:29:31 -0400 Subject: [PATCH 4/5] chore(lint): annotate ghAPICmd #nosec G204 Pre-existing G204 false positive on a fixed-binary + url-escaped endpoint subprocess. Documented inline; no behaviour change. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/wfctl/plugin_marketplace_verify.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/wfctl/plugin_marketplace_verify.go b/cmd/wfctl/plugin_marketplace_verify.go index b019fbbc..06833dd8 100644 --- a/cmd/wfctl/plugin_marketplace_verify.go +++ b/cmd/wfctl/plugin_marketplace_verify.go @@ -85,6 +85,11 @@ Options: // ghAPICmd is the indirection point so tests can inject a fake gh binary. // Default is the real `gh api` CLI. +// +// #nosec G204 -- the binary is the fixed string "gh" and `endpoint` is +// constructed from a literal-prefix + URL-escaped query inside this +// package (urlQueryEscape sanitises). No user-controlled shell metachars +// flow into the subprocess. var ghAPICmd = func(ctx context.Context, endpoint string) ([]byte, error) { cmd := exec.CommandContext(ctx, "gh", "api", endpoint) return cmd.Output() From 8450cfb2e9a31545d4c3552246679a17db6ebed2 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Wed, 20 May 2026 02:42:53 -0400 Subject: [PATCH 5/5] fix(secrets): follow DO pagination + raise per_page to 200 list-orphans was using local page-counter increments. Earlier runs truncated at 100 because per_page was 100 + the locally-bounded loop returned after page 1. DO's Spaces Keys API allows per_page up to 200 and returns an absolute next-page URL in links.pages.next; we now follow that URL rather than incrementing locally. Also emits a stderr scan summary so debugging is observable from the CI log without re-running with extra flags. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/wfctl/secrets_orphans.go | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/cmd/wfctl/secrets_orphans.go b/cmd/wfctl/secrets_orphans.go index c021e191..dc4446f5 100644 --- a/cmd/wfctl/secrets_orphans.go +++ b/cmd/wfctl/secrets_orphans.go @@ -90,13 +90,21 @@ func listSpacesOrphans(ctx context.Context, name string, del bool) error { } // paginateSpacesKeysByName returns every access_key whose name field -// equals the requested name. Bounded at 100 pages × 100 keys = 10000. +// equals the requested name. Bounded at 200 pages × 200 keys = 40000. +// +// DO's Spaces Keys list endpoint allows per_page up to 200. Earlier +// 100-cap was leaving callers with partial results on accounts that +// have many same-named orphans. We also follow the absolute URL +// returned in `links.pages.next` rather than incrementing page locally +// — DO's pagination contract is that `next` is authoritative. func paginateSpacesKeysByName(ctx context.Context, token, name string) ([]string, error) { var matches []string - page := 1 - for page <= 100 { - url := fmt.Sprintf("https://api.digitalocean.com/v2/spaces/keys?per_page=100&page=%d", page) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + next := "https://api.digitalocean.com/v2/spaces/keys?per_page=200&page=1" + pages := 0 + totalKeysSeen := 0 + for next != "" && pages < 200 { + pages++ + req, err := http.NewRequestWithContext(ctx, http.MethodGet, next, nil) if err != nil { return nil, err } @@ -124,16 +132,15 @@ func paginateSpacesKeysByName(ctx context.Context, token, name string) ([]string if err := json.Unmarshal(body, &list); err != nil { return nil, fmt.Errorf("DO list spaces keys parse: %w", err) } + totalKeysSeen += len(list.Keys) for _, k := range list.Keys { if k.Name == name { matches = append(matches, k.AccessKey) } } - if list.Links.Pages.Next == "" { - break - } - page++ + next = list.Links.Pages.Next } + fmt.Fprintf(os.Stderr, "list-orphans: scanned %d page(s), %d total key(s), %d matches for name=%q\n", pages, totalKeysSeen, len(matches), name) return matches, nil }