From 5b4d3b8ea6c2a751c94face1005684aaa7e627b1 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 7 May 2026 12:37:43 -0400 Subject: [PATCH 1/2] fix(iac): refresh-outputs merges live Outputs to preserve write-only fields Replace semantics in refreshOne clobbered create-time fields absent from the cloud Read response (e.g., DO Droplet user_data). After refresh, state held user_data="" causing planner to emit a REPLACE, which hit a 422 when the Volume was already attached (TC2 run 25508442022). New merge semantics: start from src.Outputs clone, overlay live.Outputs keys, so fields not returned by Read are preserved while cloud truth wins for fields that are returned. Adds three new tests (MergePreservesFieldsNotInRead, LiveOverridesExisting, NewFieldsFromLiveAdded) and ADR 011. Co-Authored-By: Claude Sonnet 4.6 --- .../011-refresh-outputs-merge-semantics.md | 59 +++++++++++++ iac/refreshoutputs/refresh.go | 18 +++- iac/refreshoutputs/refresh_test.go | 88 +++++++++++++++++++ 3 files changed, 163 insertions(+), 2 deletions(-) create mode 100644 docs/adr/011-refresh-outputs-merge-semantics.md diff --git a/docs/adr/011-refresh-outputs-merge-semantics.md b/docs/adr/011-refresh-outputs-merge-semantics.md new file mode 100644 index 00000000..4c117df4 --- /dev/null +++ b/docs/adr/011-refresh-outputs-merge-semantics.md @@ -0,0 +1,59 @@ +# ADR 011: refresh-outputs merge semantics + +**Status:** Accepted +**Date:** 2026-05-06 + +## Context + +`iac/refreshoutputs/refresh.go::refreshOne` previously **replaced** `dst.Outputs` +entirely with the cloud Read response (`live.Outputs`). This was correct for fields +that cloud Read returns, but caused silent data loss for fields that a provider's +Read endpoint does not return. + +The canonical example is DigitalOcean Droplets: `user_data` (cloud-init script) is +accepted at creation time but is **never returned by the Read endpoint**. After a +refresh-outputs pass, state would contain `user_data: ""`. The planner's next +plan/apply cycle would compare `state.user_data=""` against `config.user_data=""` +and emit a REPLACE action. Apply would attempt delete+create; with a Volume already +attached the DO API returned a 422, blocking the TC2 cutover (run 25508442022). + +The previous TC2 run (25507244491) succeeded only because that Droplet was a GHOST +at refresh time — the ghost-tolerance fix (PR #572) skipped it, preserving `user_data` +in state. Once the Droplet was created the next refresh clobbered the field. + +## Decision + +`refreshOne` now **merges** `live.Outputs` into `src.Outputs` rather than replacing +it: + +1. Start from a clone of `src.Outputs` (preserves all create-time fields). +2. For each field `k` in `live.Outputs`: if it is absent or differs from `merged[k]`, + set `merged[k] = v` and mark `needsUpdate = true`. +3. If any field changed, assign `dst.Outputs = merged`. + +Semantics: +- **Cloud truth wins** for any field present in the live Read response. +- **Create-time fields are preserved** for fields absent from the live Read response. +- The existing "skip write when nothing changed" optimisation is retained. + +## Trade-offs + +If a cloud provider truly removes a field from a live resource's outputs (rare for +IaC-managed resources), refresh-outputs will keep the stale value in state. The +planner may not detect the removal unless it also re-reads outputs. + +**This is acceptable** because: +- Provider-managed removal of a previously-set output field is uncommon for + IaC-controlled resources. +- The remediation path is well-defined: `wfctl infra apply --refresh` performs a + full reconcile and will surface the discrepancy as a plan diff. +- The alternative (replace semantics) causes false REPLACE storms for write-only + fields, which is a far more disruptive failure mode. + +## References + +- TC2 run 25508442022 — failure: 422 `storage already associated with another droplet` + caused by `user_data` clobber. +- TC2 run 25507244491 — success: ghost-skip preserved `user_data`. +- PR #572 — ghost-tolerance fix (`ErrResourceNotFound` skip in `refreshOne`). +- DO Droplet API docs: `user_data` is a create-time-only attribute. diff --git a/iac/refreshoutputs/refresh.go b/iac/refreshoutputs/refresh.go index e3c2b914..36edab77 100644 --- a/iac/refreshoutputs/refresh.go +++ b/iac/refreshoutputs/refresh.go @@ -105,8 +105,22 @@ func refreshOne(ctx context.Context, p interfaces.IaCProvider, dst *interfaces.R } return fmt.Errorf("could not refresh %q: %w", src.Name, err) } - if !reflect.DeepEqual(live.Outputs, src.Outputs) { - dst.Outputs = cloneMap(live.Outputs) + // Merge: preserve fields in src.Outputs that don't appear in live.Outputs. + // Some cloud Read endpoints don't return all fields that were captured at + // create-time (e.g., DO Droplet's user_data is write-only on Read). A naive + // replace clobbers those fields, causing plan to falsely detect drift on the + // next plan/apply cycle. Merge ensures refresh-outputs is idempotent for + // fields beyond cloud's Read scope. + needsUpdate := false + merged := cloneMap(src.Outputs) + for k, v := range live.Outputs { + if existing, ok := merged[k]; !ok || !reflect.DeepEqual(existing, v) { + merged[k] = v + needsUpdate = true + } + } + if needsUpdate { + dst.Outputs = merged } return nil } diff --git a/iac/refreshoutputs/refresh_test.go b/iac/refreshoutputs/refresh_test.go index ca1a2067..77d2bf0d 100644 --- a/iac/refreshoutputs/refresh_test.go +++ b/iac/refreshoutputs/refresh_test.go @@ -213,3 +213,91 @@ func TestRefresh_PropagateNonGhostError(t *testing.T) { t.Errorf("error should propagate underlying cause; got: %v", err) } } + +// TestRefresh_MergePreservesFieldsNotInRead verifies that fields present in +// src.Outputs but absent from the live Read response are preserved in +// dst.Outputs. This covers cloud providers whose Read endpoints are write-only +// for some fields (e.g., DO Droplet user_data). +func TestRefresh_MergePreservesFieldsNotInRead(t *testing.T) { + states := []interfaces.ResourceState{ + { + Name: "coredump-staging-droplet", + Type: "infra.droplet", + ProviderID: "droplet-1", + // user_data was captured at create-time; Read won't return it. + Outputs: map[string]any{"id": "x", "user_data": ""}, + }, + } + fakeProvider := &fakeIaCProvider{readOutputs: map[string]map[string]any{ + // provider Read only returns id — user_data is omitted (write-only on Read) + "droplet-1": {"id": "x"}, + }} + + refreshed, err := Refresh(context.Background(), fakeProvider, states, Options{Concurrency: 1}) + if err != nil { + t.Fatal(err) + } + + if got := refreshed[0].Outputs["id"]; got != "x" { + t.Errorf("id should be present: %v", got) + } + if got := refreshed[0].Outputs["user_data"]; got != "" { + t.Errorf("user_data should be preserved from src (not in Read response): %v", got) + } +} + +// TestRefresh_LiveOverridesExisting verifies that when the cloud Read response +// returns a field that also exists in src.Outputs with a different value, the +// live (cloud) value wins. +func TestRefresh_LiveOverridesExisting(t *testing.T) { + states := []interfaces.ResourceState{ + { + Name: "coredump-staging-droplet", + Type: "infra.droplet", + ProviderID: "droplet-2", + Outputs: map[string]any{"id": "x"}, + }, + } + fakeProvider := &fakeIaCProvider{readOutputs: map[string]map[string]any{ + // provider returns updated id — cloud truth wins + "droplet-2": {"id": "y"}, + }} + + refreshed, err := Refresh(context.Background(), fakeProvider, states, Options{Concurrency: 1}) + if err != nil { + t.Fatal(err) + } + + if got := refreshed[0].Outputs["id"]; got != "y" { + t.Errorf("id should be updated to cloud value 'y', got: %v", got) + } +} + +// TestRefresh_NewFieldsFromLiveAdded verifies that new fields returned by the +// cloud Read response (not present in src.Outputs) are added to dst.Outputs. +func TestRefresh_NewFieldsFromLiveAdded(t *testing.T) { + states := []interfaces.ResourceState{ + { + Name: "coredump-staging-droplet", + Type: "infra.droplet", + ProviderID: "droplet-3", + Outputs: map[string]any{"id": "x"}, + }, + } + fakeProvider := &fakeIaCProvider{readOutputs: map[string]map[string]any{ + // provider now also returns private_ip (newly available after provisioning) + "droplet-3": {"id": "x", "private_ip": "10.0.0.5"}, + }} + + refreshed, err := Refresh(context.Background(), fakeProvider, states, Options{Concurrency: 1}) + if err != nil { + t.Fatal(err) + } + + if got := refreshed[0].Outputs["id"]; got != "x" { + t.Errorf("id should remain: %v", got) + } + if got := refreshed[0].Outputs["private_ip"]; got != "10.0.0.5" { + t.Errorf("private_ip from live Read should be added: %v", got) + } +} From fbda503f06d6fdcd41a8e0662a2cfb8c2b5f4fd6 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 7 May 2026 12:45:09 -0400 Subject: [PATCH 2/2] fix(iac): nil-safe copy-on-write merge in refreshOne; add nil-src test Address Copilot review findings: - nil src.Outputs + non-empty live.Outputs panicked on map assignment. Switch to copy-on-write: allocate the merged map only on first detected difference, seeding it with maps.Copy(merged, src.Outputs) which is nil-safe (copies nothing from nil). - Remove per-resource unconditional clone (now only allocated when a change is detected). - Remove the now-unused cloneMap helper. - Add TestRefresh_NilSrcOutputs_LiveFieldsPopulated (would have caught the nil-map panic). Co-Authored-By: Claude Sonnet 4.6 --- iac/refreshoutputs/refresh.go | 30 +++++++++++++----------------- iac/refreshoutputs/refresh_test.go | 30 ++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/iac/refreshoutputs/refresh.go b/iac/refreshoutputs/refresh.go index 36edab77..162d2304 100644 --- a/iac/refreshoutputs/refresh.go +++ b/iac/refreshoutputs/refresh.go @@ -111,27 +111,23 @@ func refreshOne(ctx context.Context, p interfaces.IaCProvider, dst *interfaces.R // replace clobbers those fields, causing plan to falsely detect drift on the // next plan/apply cycle. Merge ensures refresh-outputs is idempotent for // fields beyond cloud's Read scope. - needsUpdate := false - merged := cloneMap(src.Outputs) + // + // Copy-on-write: the clone is only allocated on the first detected change, + // so resources that haven't changed incur no per-resource allocation. + var merged map[string]any for k, v := range live.Outputs { - if existing, ok := merged[k]; !ok || !reflect.DeepEqual(existing, v) { - merged[k] = v - needsUpdate = true + if existing, ok := src.Outputs[k]; ok && reflect.DeepEqual(existing, v) { + continue } + // First change detected: clone src.Outputs (nil-safe) and start merging. + if merged == nil { + merged = make(map[string]any, len(src.Outputs)+len(live.Outputs)) + maps.Copy(merged, src.Outputs) + } + merged[k] = v } - if needsUpdate { + if merged != nil { dst.Outputs = merged } return nil } - -// cloneMap returns an independent shallow copy of m. Callers receive a map -// they can mutate without aliasing the live driver output. -func cloneMap(m map[string]any) map[string]any { - if m == nil { - return nil - } - c := make(map[string]any, len(m)) - maps.Copy(c, m) - return c -} diff --git a/iac/refreshoutputs/refresh_test.go b/iac/refreshoutputs/refresh_test.go index 77d2bf0d..345f1d28 100644 --- a/iac/refreshoutputs/refresh_test.go +++ b/iac/refreshoutputs/refresh_test.go @@ -301,3 +301,33 @@ func TestRefresh_NewFieldsFromLiveAdded(t *testing.T) { t.Errorf("private_ip from live Read should be added: %v", got) } } + +// TestRefresh_NilSrcOutputs_LiveFieldsPopulated verifies that when a resource +// has no previously-persisted Outputs (nil map), and the provider Read returns +// non-empty Outputs, the merge correctly populates dst.Outputs without panicking. +// This covers resources that enter state before any outputs were captured. +func TestRefresh_NilSrcOutputs_LiveFieldsPopulated(t *testing.T) { + states := []interfaces.ResourceState{ + { + Name: "new-droplet", + Type: "infra.droplet", + ProviderID: "droplet-nil", + Outputs: nil, // no persisted outputs yet + }, + } + fakeProvider := &fakeIaCProvider{readOutputs: map[string]map[string]any{ + "droplet-nil": {"id": "abc123", "ip": "1.2.3.4"}, + }} + + refreshed, err := Refresh(context.Background(), fakeProvider, states, Options{Concurrency: 1}) + if err != nil { + t.Fatal(err) + } + + if got := refreshed[0].Outputs["id"]; got != "abc123" { + t.Errorf("id should be populated from live Read: %v", got) + } + if got := refreshed[0].Outputs["ip"]; got != "1.2.3.4" { + t.Errorf("ip should be populated from live Read: %v", got) + } +}