From d2cd4c50c36487bdc60719c08439d24eabd290cb Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 00:38:41 -0400 Subject: [PATCH 01/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20workflow=20core=20=E2=86=92=20strict-contra?= =?UTF-8?q?ct=20plugins?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Design for removing aws-sdk-go-v2, Azure/azure-sdk-for-go, and cloud.google.com/go + google.golang.org/api direct deps from workflow core's module/ package. Architecture: 3 extension surfaces, 3 strategies: - IaC state backends → new IaCStateBackend strict proto contract; iac.state stays core, config.backend dispatches to plugin gRPC client. - platform.* provisioners → new PlatformBackend strict proto contract; module types + provider: key stay core, kind backend stays in-core, cloud backends (eks/gke/ecs/route53/ec2/autoscaling) extract. - standalone modules/steps (apigateway, codebuild, dynamodb, s3_upload, storage.s3, storage.gcs) → plugin-native module/step types via the existing ModuleFactories/StepFactories SDK — no new contract. Credentials (Option 1): each plugin-native module carries its own credentials: block + builds aws.Config in-process; optional in-plugin credentials_ref for DRY. cloud_account_aws*.go deleted; azure/gcp cloud_account files have no SDK import and stay. 4 phases: A azure (validates IaCStateBackend), B aws (largest), C gcp, D digitalocean (spaces backend, minor bump + migration doc). Includes Assumptions + Rollback sections + self-challenge top-3 doubts (PlatformBackend over-generality, provider-separability fragility, benchmark-could-invalidate-unary-default — all with mitigations deferred to writing-plans). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 docs/plans/2026-05-14-cloud-sdk-extraction-design.md diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md new file mode 100644 index 00000000..ec7b4c2c --- /dev/null +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -0,0 +1,164 @@ +# Cloud-SDK Extraction: workflow core → strict-contract plugins + +**Date:** 2026-05-14 +**Status:** Design — approved in brainstorm, pending adversarial review +**Owner:** autonomous pipeline (workflow#TBD) + +## Problem + +Workflow core's `module/` package imports three cloud SDK trees directly: + +| SDK | go.mod weight | Files (real imports) | +|-----|---------------|----------------------| +| `github.com/aws/aws-sdk-go-v2/*` | ~15 service packages | 13 | +| `github.com/Azure/azure-sdk-for-go/sdk/*` | azcore + azblob | 2 | +| `cloud.google.com/go/storage` + `google.golang.org/api/*` | storage + container | 3 | + +Every dependabot bump of a cloud SDK (PRs #400/#419/#421/#635 as of this writing) churns workflow core's `go.sum`, inflates the binary, and couples core release cadence to vendor SDK release cadence. The `workflow-plugin-{aws,azure,gcp,digitalocean}` plugins already exist and already carry these SDKs for their IaC *resource provider* role — core's direct usage is redundant surface. + +Precedent: workflow#617 removed the legacy DigitalOcean IaC modules + `godo` from core; IaC resource provisioning moved entirely to `workflow-plugin-digitalocean`. This design extends the same principle to the *remaining* cloud functionality that never went through that extraction: IaC **state backends**, managed-service **platform** provisioners, and a handful of standalone modules/steps. 
+ +## Goals + +- workflow core `go.mod` drops `aws-sdk-go-v2/*`, `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, `google.golang.org/api/*` entirely. +- Cloud functionality remains available, loaded via strict-contract gRPC plugins (the existing sidecar model). +- `kind` Kubernetes backend (no SDK) stays in core — local-dev/test path must not require a plugin. + +## Non-Goals + +- Re-homing the IaC *resource provider* contract (`IaCProviderRequired`) — already extracted, not touched here. +- Changing how plugins are discovered/installed (`wfctl plugin install` flow unchanged). +- Backwards-compatible yaml — this is a **clean break** with a migration guide (per workflow#617 precedent). + +## Architecture + +Three extension surfaces, three handling strategies: + +### 1. IaC state backends → new `IaCStateBackend` strict proto contract + +`iac.state` **stays a core module type**. The state store is engine infrastructure — the orchestrator reads/writes it during every plan/apply cycle — so it keeps a stable core seam. What changes: `config.backend` no longer dispatches a hardcoded `switch` in `module/iac_module.go`; instead core resolves an `IaCStateBackend` gRPC client from whichever loaded plugin registered that backend name. + +```proto +// workflow/plugin/external/proto/iac_state.proto +service IaCStateBackend { + rpc GetState (GetStateRequest) returns (GetStateResponse); + rpc PutState (PutStateRequest) returns (PutStateResponse); + rpc DeleteState(DeleteStateRequest) returns (DeleteStateResponse); + rpc ListStates (ListStatesRequest) returns (ListStatesResponse); + rpc AcquireLease(AcquireLeaseRequest) returns (AcquireLeaseResponse); + rpc ReleaseLease(ReleaseLeaseRequest) returns (ReleaseLeaseResponse); +} +message GetStateResponse { bytes state = 1; bool exists = 2; } +message PutStateRequest { string key = 1; bytes state = 2; string content_type = 3; } +// ... 
lease messages carry lease_id + duration_seconds +``` + +Backend ownership — every cloud plugin implements the contract for its native storage: + +| backend name | plugin | storage | +|--------------|--------|---------| +| `s3` | workflow-plugin-aws | AWS S3 | +| `azure_blob` | workflow-plugin-azure | Azure Blob | +| `gcs` | workflow-plugin-gcp | Google Cloud Storage | +| `spaces` | workflow-plugin-digitalocean | DO Spaces (S3-compatible) | + +`memory`, `filesystem`, `postgres` backends stay **in core** — no cloud SDK, no reason to extract. + +**Unary GET+PUT vs streaming:** decided by benchmark, not assumption. The writing-plans phase includes a task that drives a 1 MB synthetic state blob through a full plan→apply cycle (Get + Put + AcquireLease + ReleaseLease per resource batch) over unary RPC, measures p50/p99 added latency vs the in-process baseline, and only adopts chunked streaming if unary clears no acceptable bar. Default build target: **unary**, because (a) gRPC's default 4 MB message cap covers typical state files, (b) streaming adds protocol complexity that must be justified by data, and (c) the in-process baseline this replaces was itself a single blob read/write. + +### 2. Managed-service platform provisioners → new `PlatformBackend` strict proto contract + +The `platform.*` module family (`platform.kubernetes`, `platform.ecs`, `platform.networking`, `platform.dns`, `platform.autoscaling`) keeps its module types **and its `provider:` config key** in core — no yaml UX break. Each `platform.*` module currently dispatches to a provider-specific backend via an in-process interface (`kubernetesBackend`, etc.). The cloud-backed implementations (EKS, GKE, ECS, Route53, EC2, ApplicationAutoScaling) move behind the `PlatformBackend` gRPC contract; the `kind` backend stays in-core. 
+ +```proto +// workflow/plugin/external/proto/platform_backend.proto +service PlatformBackend { + rpc Plan (PlatformPlanRequest) returns (PlatformPlanResponse); + rpc Apply (PlatformApplyRequest) returns (PlatformApplyResponse); + rpc Destroy(PlatformDestroyRequest) returns (PlatformDestroyResponse); +} +// Request carries: platform_type (kubernetes|ecs|...), provider (eks|gke|...), +// desired-state struct, current-state struct. +// Response carries: plan actions / applied state / errors. +``` + +When `provider != kind` (or any other in-core backend), core's `platform.*` module resolves a `PlatformBackend` client from the plugin that registered `(platform_type, provider)`. + +### 3. Standalone modules / steps → plugin-native types (existing SDK surface, no new contract) + +These are user-facing pipeline functionality, not engine infrastructure. They become **plugin-native module/step types** via the existing `ModuleFactories` / `StepFactories` plugin SDK — which is *already* a gRPC sidecar path (`RemoteModule`). No new contract. + +| core file | becomes | plugin | +|-----------|---------|--------| +| `aws_api_gateway.go` | `aws.apigateway` module | aws | +| `codebuild.go` | `aws.codebuild` module | aws | +| `nosql_dynamodb.go` | `nosql.dynamodb` module | aws | +| `pipeline_step_s3_upload.go` | `step.s3_upload` | aws | +| `s3_storage.go` / `storage_artifact_s3.go` | `storage.s3` module | aws | +| `storage_gcs.go` | `storage.gcs` module | gcp | + +Credential handling (Option 1, approved): the deleted `cloud_account_aws.go` + `_creds.go` (`AWSConfigProvider` / `AWSConfig()`) is **not** replaced by a core contract. Each plugin-native AWS module carries its own `credentials:` config block and builds `aws.Config` in-process via a shared in-plugin `buildAWSConfig` helper — exactly the workflow-plugin-digitalocean model. To avoid yaml redundancy when a config declares many AWS modules, each plugin offers an optional in-plugin `aws.credentials` (resp. 
`gcp.credentials`) module + a `credentials_ref:` key — DRY handled entirely inside the plugin, still no core contract. `cloud_account_azure.go` and `cloud_account_gcp.go` have **no SDK imports** (pure config-map parsing) and stay in core untouched. + +## Phases + +Each phase is one workflow-core PR (deleting files + wiring the contract dispatch) plus one PR per affected plugin. Phases are independent — a plugin can ship its half ahead of core deleting its half, because core keeps the old in-process path until the plugin contract is wired. + +**Phase A — Azure** (smallest, validates the `IaCStateBackend` contract end-to-end): +- New `IaCStateBackend` proto contract in core. +- workflow-plugin-azure implements `azure_blob` backend. +- Core: delete `iac_state_azure.go`, strip the `azure_blob` case + `newAzureSharedKeyCredential` from `iac_module.go`, drop `Azure/azure-sdk-for-go` from go.mod. + +**Phase B — AWS** (largest — 13 files, 3 surfaces): +- New `PlatformBackend` proto contract in core. +- workflow-plugin-aws implements: `IaCStateBackend` (`s3`), `PlatformBackend` (eks/ecs/networking/dns/autoscaling), and plugin-native `aws.apigateway` / `aws.codebuild` / `nosql.dynamodb` / `step.s3_upload` / `storage.s3` types. +- Core: delete the 13 AWS files (+ the EKS half of `platform_kubernetes_kind.go`), drop `aws-sdk-go-v2` from go.mod. + +**Phase C — GCP** (3 files): +- workflow-plugin-gcp implements `IaCStateBackend` (`gcs`), `PlatformBackend` (gke), plugin-native `storage.gcs`. +- Core: delete `iac_state_gcs.go`, `storage_gcs.go`, the GKE half of `platform_kubernetes_kind.go`; drop `cloud.google.com/go` + `google.golang.org/api`. + +**Phase D — DigitalOcean compat:** +- workflow-plugin-digitalocean implements `IaCStateBackend` for `spaces` (S3-compatible — pulls `aws-sdk-go-v2/service/s3`, the one service package, not the whole tree). 
+- **Minor version bump** (compatibility break marker) + migration doc: wfctl users with `iac.state` `backend: spaces` must now have workflow-plugin-digitalocean ≥ that minor loaded. No app.yaml change for them — the backend name `spaces` is unchanged — but the plugin must be present in `data/plugins/` / `wfctl.yaml`. + +`platform_kubernetes_kind.go` is touched by both B (EKS) and C (GKE) — it gets split: `kind` backend stays, EKS+GKE backends extracted. Phase B and C must coordinate on that file (B lands first, leaves a GKE shim; C removes the shim). + +## Migration (user-facing) + +Published in each plugin's CHANGELOG + a consolidated `docs/migrations/2026-05-14-cloud-sdk-extraction.md`: + +- `iac.state` with `backend: s3|azure_blob|gcs|spaces` → load the matching plugin (`wfctl plugin install workflow-plugin-{aws,azure,gcp,digitalocean}`). yaml `backend:` value unchanged. +- `platform.kubernetes` `provider: eks|gke` etc. → load the matching plugin. yaml `provider:` value unchanged. +- `aws.apigateway` / former `cloud.account`-brokered AWS modules → module type renamed to plugin-native form; `credentials:` block moves inline (or `credentials_ref:` an `aws.credentials` module). **This is the only yaml-shape change.** +- `memory` / `filesystem` / `postgres` state backends, `kind` k8s backend → no change, still core. + +## Assumptions + +1. **gRPC's 4 MB default message cap covers real-world IaC state files.** If a deployment's state exceeds 4 MB the unary `IaCStateBackend` contract needs streaming — the benchmark task validates the typical case but a hostile-large state is out of initial scope (documented limitation, not a silent failure: `PutState` returns a clear "state exceeds transport limit" error). +2. **The `platform.*` backend interfaces are cleanly provider-separable.** The design assumes `kubernetesBackend` / `ecsBackend` / etc. are already interface-segregated such that the `kind` impl can stay while cloud impls extract. 
If a backend interface leaks SDK types into the core module shell, that shell needs an interface-extraction refactor first. +3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive, core ignores unknown backend registrations until its own half lands. +4. **`aws-sdk-go-v2/service/s3` in workflow-plugin-digitalocean is acceptable.** DO Spaces is S3-API; there is no godo-native Spaces client. The DO plugin already carries `godo`; adding one AWS service package is the minimal cost of self-contained `spaces` state support (vs. forcing DO users to also load workflow-plugin-aws). +5. **`cloud_account_azure.go` / `cloud_account_gcp.go` genuinely have zero SDK imports.** Verified by grep at design time; if a future change adds an SDK import there, that file joins its phase's extraction. +6. **No core code outside `module/` imports these SDKs.** Verified: the only `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` imports are under `module/`. `cmd/`, `engine.go`, `schema/`, `plugin/` are clean. + +## Rollback + +This design changes **plugin loading paths** and **go.mod dependency trees** — runtime-affecting per the `runtime-launch-validation` trigger list. + +- **Per-phase revert:** each phase is an isolated core PR + plugin PR(s). Reverting the core PR restores the in-process backend `switch` / `platform.*` cloud backends and re-adds the SDK to `go.mod` — the deleted files are recoverable from git. The plugin PRs are additive (new contract impls / module types) and can stay merged harmlessly even if core reverts. +- **Forward-fix preferred over revert:** because core keeps the old in-process path until the contract dispatch is wired *in the same PR*, a broken phase fails at PR CI (image-launch / strict-contracts gates), not in production. The revert path exists but the gate is the primary safety. 
+- **DO `spaces` break (Phase D):** the only change with an external-user-visible compat break. Rollback = revert the DO plugin minor bump; `spaces` users on the prior plugin version are unaffected because the prior version still has the in-core path's expectations. The break only bites users who upgrade core past the phase-D core PR *without* upgrading the DO plugin — the migration doc + `minEngineVersion` bump in the plugin manifest is the guard. + +## Self-challenge — top doubts surfaced + +1. **`PlatformBackend` may be over-general.** Five `platform.*` types (kubernetes/ecs/networking/dns/autoscaling) behind one `Plan/Apply/Destroy` contract with a `platform_type` discriminator — this risks a lowest-common-denominator contract that fits none of them well. *Mitigation:* writing-plans should validate the contract shape against all five backend interfaces before locking the proto; if they don't unify cleanly, split into per-family contracts or fold the cloud platform provisioners into the existing `IaCProviderRequired` resource model instead. +2. **Assumption 2 (clean provider-separability) is the most fragile.** If `platform_kubernetes_kind.go`'s `kubernetesBackend` interface returns SDK-typed values, "keep kind, extract EKS/GKE" requires a non-trivial interface refactor that this design hand-waves. *Mitigation:* the first task of Phase B/C is an interface-audit spike — if it fails, the phase re-scopes. +3. **The state-backend benchmark could come back "streaming required"** and invalidate the unary default, reshaping the `IaCStateBackend` proto after Phase A has already shipped it. *Mitigation:* run the benchmark *before* finalizing the Phase A proto — it's a writing-plans task ordered ahead of the contract lock, not after. + +## Open items deferred to writing-plans + +- Exact proto field layouts for all three contracts (sketches above are directional). +- Whether `PlatformBackend` is one contract or per-family (gated on self-challenge doubt #1). 
+- The `platform.*` backend interface-audit spike (gated on self-challenge doubt #2). +- Benchmark harness location + acceptance threshold (gated on self-challenge doubt #3). +- Per-plugin CHANGELOG + the consolidated migration doc wording. From 98448d5d66a82961485fb30db120f7f6606aacf4 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 00:45:25 -0400 Subject: [PATCH 02/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20adversarial=20review=20cycle=201=20revision?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 2 Critical + 5 Important findings from adversarial-design-review: Critical: - iac_state_spaces.go (core file importing aws-sdk s3) now has an explicit home: deleted by Phase B's core PR; Phase D reframed from soft-compat to a real clean-break for the `spaces` backend. Goal "core drops aws-sdk-go-v2 entirely" is now actually achieved by the phases as written + enforced by a go list -deps CI gate. - kinesis: added Non-Goals entry explaining it's a transitive dep of modular/modules/eventbus/v2, not a direct workflow import — out of scope, with the go mod why chain documented so the literal ask is fully answered. Important: - Full grep-verified 13-file AWS inventory table in Phase B with per-file destinations; reconciled aws_api_gateway.go (route-sync module) vs platform_apigateway.go (provisioner) as two distinct files. - aksBackend assigned to Phase A (Azure gets the PlatformBackend half too); platform_kubernetes_kind.go split now spans 3 phases (aks/eks/gke) with explicit always-compiles coordination. - Proto contracts fold into existing plugin/external/proto/iac.proto (8 services already) instead of new files — matches precedent. 
- New Security section: secret-redaction in config-version-store/tracing + gRPC interceptor logging are blocking writing-plans tasks; credentials_ref blast radius documented as strictly narrower than today's cloud.account. Minor: - IaCStateBackend RPC set now maps 1:1 to the real module.IaCStateStore interface (GetState/SaveState/ListStates/DeleteState/Lock/Unlock) — no speculative surface. - Phase D rollback restated as a matched pair (Phase B core PR + DO plugin PR). - IaCProviderRequired/ResourceDriver reuse promoted to a first-class Alternatives Considered entry with accept/reject rationale + retained as the gated fallback for PlatformBackend. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 182 +++++++++++------- 1 file changed, 114 insertions(+), 68 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index ec7b4c2c..23bb37b3 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -1,18 +1,18 @@ # Cloud-SDK Extraction: workflow core → strict-contract plugins **Date:** 2026-05-14 -**Status:** Design — approved in brainstorm, pending adversarial review +**Status:** Design — revised after adversarial review cycle 1 **Owner:** autonomous pipeline (workflow#TBD) ## Problem -Workflow core's `module/` package imports three cloud SDK trees directly: +Workflow core's `module/` package imports three cloud SDK trees directly. 
File counts are grep-verified (`awk` over import blocks, comment-only matches excluded): -| SDK | go.mod weight | Files (real imports) | -|-----|---------------|----------------------| -| `github.com/aws/aws-sdk-go-v2/*` | ~15 service packages | 13 | -| `github.com/Azure/azure-sdk-for-go/sdk/*` | azcore + azblob | 2 | -| `cloud.google.com/go/storage` + `google.golang.org/api/*` | storage + container | 3 | +| SDK | Files (real imports) | +|-----|----------------------| +| `github.com/aws/aws-sdk-go-v2/*` | **13** | +| `github.com/Azure/azure-sdk-for-go/sdk/*` (azcore + azblob) | **2** | +| `cloud.google.com/go/storage` + `google.golang.org/api/*` | **3** | Every dependabot bump of a cloud SDK (PRs #400/#419/#421/#635 as of this writing) churns workflow core's `go.sum`, inflates the binary, and couples core release cadence to vendor SDK release cadence. The `workflow-plugin-{aws,azure,gcp,digitalocean}` plugins already exist and already carry these SDKs for their IaC *resource provider* role — core's direct usage is redundant surface. @@ -20,7 +20,7 @@ Precedent: workflow#617 removed the legacy DigitalOcean IaC modules + `godo` fro ## Goals -- workflow core `go.mod` drops `aws-sdk-go-v2/*`, `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, `google.golang.org/api/*` entirely. +- workflow core `go.mod` drops `aws-sdk-go-v2/*`, `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, `google.golang.org/api/*` **entirely** — verified by a `go list -deps` gate in the final phase's CI. - Cloud functionality remains available, loaded via strict-contract gRPC plugins (the existing sidecar model). - `kind` Kubernetes backend (no SDK) stays in core — local-dev/test path must not require a plugin. @@ -29,6 +29,7 @@ Precedent: workflow#617 removed the legacy DigitalOcean IaC modules + `godo` fro - Re-homing the IaC *resource provider* contract (`IaCProviderRequired`) — already extracted, not touched here. 
- Changing how plugins are discovered/installed (`wfctl plugin install` flow unchanged). - Backwards-compatible yaml — this is a **clean break** with a migration guide (per workflow#617 precedent). +- **Removing `aws-sdk-go-v2/service/kinesis`.** The user's original ask said "kinesis and azcore." `go mod why github.com/aws/aws-sdk-go-v2/service/kinesis` resolves to `workflow → workflow/module → github.com/GoCodeAlone/modular/modules/eventbus/v2 → kinesis` — kinesis is a **transitive dependency of `modular/modules/eventbus/v2`**, not a direct workflow import (the only `module/` reference is a string literal `"kinesis-provider"` in a test). Removing it is an upstream `modular` concern, not addressable by extracting workflow's own SDK usage. Out of scope here; tracked separately if `modular` eventbus ever needs the same treatment. ## Architecture @@ -38,127 +39,172 @@ Three extension surfaces, three handling strategies: `iac.state` **stays a core module type**. The state store is engine infrastructure — the orchestrator reads/writes it during every plan/apply cycle — so it keeps a stable core seam. What changes: `config.backend` no longer dispatches a hardcoded `switch` in `module/iac_module.go`; instead core resolves an `IaCStateBackend` gRPC client from whichever loaded plugin registered that backend name. +The contract maps **1:1 onto the existing `module.IaCStateStore` interface** (`module/iac_state.go:21`) — six methods, no speculative surface: + ```proto -// workflow/plugin/external/proto/iac_state.proto +// Added as a new service INSIDE plugin/external/proto/iac.proto — matches the +// established precedent (iac.proto already holds 8 services / 598 lines; +// state + platform contracts version alongside the resource-provider contract). 
service IaCStateBackend { - rpc GetState (GetStateRequest) returns (GetStateResponse); - rpc PutState (PutStateRequest) returns (PutStateResponse); - rpc DeleteState(DeleteStateRequest) returns (DeleteStateResponse); - rpc ListStates (ListStatesRequest) returns (ListStatesResponse); - rpc AcquireLease(AcquireLeaseRequest) returns (AcquireLeaseResponse); - rpc ReleaseLease(ReleaseLeaseRequest) returns (ReleaseLeaseResponse); + rpc GetState (GetStateRequest) returns (GetStateResponse); // → IaCStateStore.GetState + rpc SaveState (SaveStateRequest) returns (SaveStateResponse); // → IaCStateStore.SaveState + rpc ListStates (ListStatesRequest) returns (ListStatesResponse); // → IaCStateStore.ListStates(filter) + rpc DeleteState(DeleteStateRequest) returns (DeleteStateResponse);// → IaCStateStore.DeleteState + rpc Lock (LockRequest) returns (LockResponse); // → IaCStateStore.Lock + rpc Unlock (UnlockRequest) returns (UnlockResponse); // → IaCStateStore.Unlock } -message GetStateResponse { bytes state = 1; bool exists = 2; } -message PutStateRequest { string key = 1; bytes state = 2; string content_type = 3; } -// ... lease messages carry lease_id + duration_seconds +message GetStateResponse { IaCState state = 1; bool exists = 2; } +message SaveStateRequest { IaCState state = 1; } +message ListStatesRequest { map<string, string> filter = 1; } +// IaCState mirrors module.IaCState; Lock/Unlock carry resource_id only +// (the in-core IaCStateStore.Lock signature takes no lease token/duration). 
``` Backend ownership — every cloud plugin implements the contract for its native storage: -| backend name | plugin | storage | -|--------------|--------|---------| -| `s3` | workflow-plugin-aws | AWS S3 | -| `azure_blob` | workflow-plugin-azure | Azure Blob | -| `gcs` | workflow-plugin-gcp | Google Cloud Storage | -| `spaces` | workflow-plugin-digitalocean | DO Spaces (S3-compatible) | +| backend name | plugin | storage | core file deleted | +|--------------|--------|---------|-------------------| +| `s3` | workflow-plugin-aws | AWS S3 | `iac_state_spaces.go` (the S3-compatible store; also the `spaces` impl) | +| `azure_blob` | workflow-plugin-azure | Azure Blob | `iac_state_azure.go` | +| `gcs` | workflow-plugin-gcp | Google Cloud Storage | `iac_state_gcs.go` | +| `spaces` | workflow-plugin-digitalocean | DO Spaces (S3-compatible) | (shares `iac_state_spaces.go` deletion — see Phase D) | `memory`, `filesystem`, `postgres` backends stay **in core** — no cloud SDK, no reason to extract. -**Unary GET+PUT vs streaming:** decided by benchmark, not assumption. The writing-plans phase includes a task that drives a 1 MB synthetic state blob through a full plan→apply cycle (Get + Put + AcquireLease + ReleaseLease per resource batch) over unary RPC, measures p50/p99 added latency vs the in-process baseline, and only adopts chunked streaming if unary clears no acceptable bar. Default build target: **unary**, because (a) gRPC's default 4 MB message cap covers typical state files, (b) streaming adds protocol complexity that must be justified by data, and (c) the in-process baseline this replaces was itself a single blob read/write. +**Unary GET+SAVE vs streaming:** decided by benchmark, not assumption. 
The writing-plans phase includes a task that drives a 1 MB synthetic state blob through a full plan→apply cycle (GetState + SaveState + Lock + Unlock per resource batch) over unary RPC, measures p50/p99 added latency vs the in-process baseline, and only adopts chunked streaming if unary fails to clear the acceptance bar. Default build target: **unary**, because (a) gRPC's default 4 MB message cap covers typical state files, (b) streaming adds protocol complexity that must be justified by data, and (c) the in-process baseline this replaces was itself a single blob read/write. This task is ordered **before** the Phase A proto is locked (per self-challenge doubt #3). ### 2. Managed-service platform provisioners → new `PlatformBackend` strict proto contract -The `platform.*` module family (`platform.kubernetes`, `platform.ecs`, `platform.networking`, `platform.dns`, `platform.autoscaling`) keeps its module types **and its `provider:` config key** in core — no yaml UX break. Each `platform.*` module currently dispatches to a provider-specific backend via an in-process interface (`kubernetesBackend`, etc.). The cloud-backed implementations (EKS, GKE, ECS, Route53, EC2, ApplicationAutoScaling) move behind the `PlatformBackend` gRPC contract; the `kind` backend stays in-core. +The `platform.*` module family (`platform.kubernetes`, `platform.ecs`, `platform.networking`, `platform.dns`, `platform.autoscaling`) keeps its module types **and its `provider:` config key** in core — no yaml UX break. Each `platform.*` module currently dispatches to a provider-specific backend via an in-process interface (`kubernetesBackend`, etc.). The cloud-backed implementations (EKS, GKE, AKS, ECS, Route53, EC2, ApplicationAutoScaling) move behind the `PlatformBackend` gRPC contract; the `kind` backend stays in-core. 
```proto -// workflow/plugin/external/proto/platform_backend.proto +// Added as a new service INSIDE plugin/external/proto/iac.proto (same rationale +// as IaCStateBackend — co-versioned with the resource-provider contract). service PlatformBackend { - rpc Plan (PlatformPlanRequest) returns (PlatformPlanResponse); - rpc Apply (PlatformApplyRequest) returns (PlatformApplyResponse); + rpc Plan (PlatformPlanRequest) returns (PlatformPlanResponse); + rpc Apply (PlatformApplyRequest) returns (PlatformApplyResponse); rpc Destroy(PlatformDestroyRequest) returns (PlatformDestroyResponse); } -// Request carries: platform_type (kubernetes|ecs|...), provider (eks|gke|...), +// Request carries: platform_type (kubernetes|ecs|...), provider (eks|gke|aks|...), // desired-state struct, current-state struct. // Response carries: plan actions / applied state / errors. ``` When `provider != kind` (or any other in-core backend), core's `platform.*` module resolves a `PlatformBackend` client from the plugin that registered `(platform_type, provider)`. +**The `PlatformBackend` shape is gated** — see Alternatives Considered #1 and self-challenge doubt #1. The first writing-plans task for Phase B is an interface-audit spike that validates one unified `Plan/Apply/Destroy` contract against all five `platform.*` backend interfaces *before* the proto is locked. If they don't unify cleanly, the fallback is folding the cloud platform provisioners into the existing `IaCProviderRequired` / `ResourceDriver` model instead of inventing `PlatformBackend`. + ### 3. Standalone modules / steps → plugin-native types (existing SDK surface, no new contract) These are user-facing pipeline functionality, not engine infrastructure. They become **plugin-native module/step types** via the existing `ModuleFactories` / `StepFactories` plugin SDK — which is *already* a gRPC sidecar path (`RemoteModule`). No new contract. 
| core file | becomes | plugin | |-----------|---------|--------| -| `aws_api_gateway.go` | `aws.apigateway` module | aws | +| `aws_api_gateway.go` (`AWSAPIGateway` — route-sync module) | `aws.apigateway` module | aws | +| `platform_apigateway.go` (`Platform*Gateway*` — provisioner) | folds into `PlatformBackend` (`platform.apigateway` provider) **or** `aws.apigateway` — resolved by the interface-audit spike | aws | | `codebuild.go` | `aws.codebuild` module | aws | | `nosql_dynamodb.go` | `nosql.dynamodb` module | aws | | `pipeline_step_s3_upload.go` | `step.s3_upload` | aws | -| `s3_storage.go` / `storage_artifact_s3.go` | `storage.s3` module | aws | +| `s3_storage.go` | `storage.s3` module | aws | | `storage_gcs.go` | `storage.gcs` module | gcp | -Credential handling (Option 1, approved): the deleted `cloud_account_aws.go` + `_creds.go` (`AWSConfigProvider` / `AWSConfig()`) is **not** replaced by a core contract. Each plugin-native AWS module carries its own `credentials:` config block and builds `aws.Config` in-process via a shared in-plugin `buildAWSConfig` helper — exactly the workflow-plugin-digitalocean model. To avoid yaml redundancy when a config declares many AWS modules, each plugin offers an optional in-plugin `aws.credentials` (resp. `gcp.credentials`) module + a `credentials_ref:` key — DRY handled entirely inside the plugin, still no core contract. `cloud_account_azure.go` and `cloud_account_gcp.go` have **no SDK imports** (pure config-map parsing) and stay in core untouched. +(`storage_artifact_s3.go` references the AWS SDK only in comments — verified comment-only, **not** a real import, stays in core.) -## Phases +Credential handling (Option 1, approved): the deleted `cloud_account_aws.go` + `_creds.go` (`AWSConfigProvider` / `AWSConfig()`) is **not** replaced by a core contract. 
Each plugin-native AWS module carries its own `credentials:` config block and builds `aws.Config` in-process via a shared in-plugin `buildAWSConfig` helper — exactly the workflow-plugin-digitalocean model. To avoid yaml redundancy when a config declares many AWS modules, each plugin offers an optional in-plugin `aws.credentials` (resp. `gcp.credentials`) module + a `credentials_ref:` key — DRY handled entirely inside the plugin, still no core contract. `cloud_account_azure.go` and `cloud_account_gcp.go` reference the SDKs **only in comments** (verified — they are pure config-map parsing) and stay in core untouched. + +## Security -Each phase is one workflow-core PR (deleting files + wiring the contract dispatch) plus one PR per affected plugin. Phases are independent — a plugin can ship its half ahead of core deleting its half, because core keeps the old in-process path until the plugin contract is wired. +Option 1 moves raw cloud secrets (`accessKey`/`secretKey`/`account_key`/etc.) inline into every plugin-native module's `credentials:` config block — multiplying the number of config sites holding plaintext secrets versus today's single `cloud.account` module. This is not unprecedented (`iac_module.go`'s current `spaces` case already inlines `accessKey`/`secretKey`), but the multiplication needs explicit handling: -**Phase A — Azure** (smallest, validates the `IaCStateBackend` contract end-to-end): -- New `IaCStateBackend` proto contract in core. -- workflow-plugin-azure implements `azure_blob` backend. -- Core: delete `iac_state_azure.go`, strip the `azure_blob` case + `newAzureSharedKeyCredential` from `iac_module.go`, drop `Azure/azure-sdk-for-go` from go.mod. +- **Config-version store + execution tracing.** Workflow's config-version store (SHA-256 content-addressed) and execution-tracing layer marshal module config. Plugin-native module config carrying inline credentials MUST be redacted before persistence/tracing. 
Writing-plans task: extend the existing PII/secret redaction (already per-tenant-toggleable per `workflow-cloud`) to recognise the `credentials:` / `credentials_ref:` keys on plugin module config, OR confirm the existing redaction already covers any key matching a secret-pattern. This is a **blocking** task — it ships in the same phase as the first plugin-native AWS module, not after. +- **gRPC sidecar request logging.** The `IaCStateBackend` / `PlatformBackend` requests cross the engine↔plugin gRPC boundary. State payloads are not secrets, but `credentials:` blocks passed in `CreateModule` requests to the plugin ARE. Confirm the plugin SDK's gRPC interceptors do not log full request bodies at info level; if they do, add a redacting interceptor. Writing-plans task in Phase A (first contract wired). +- **`credentials_ref:` blast radius.** A `credentials_ref` resolves to an in-plugin `aws.credentials` module within the *same plugin process* — it does not broaden which process can read the secret (engine never sees the resolved `aws.Config`, only the plugin does). This is strictly *narrower* than today's `cloud.account` (which builds `aws.Config` in the engine process). Documented as an improvement, not a risk. -**Phase B — AWS** (largest — 13 files, 3 surfaces): -- New `PlatformBackend` proto contract in core. -- workflow-plugin-aws implements: `IaCStateBackend` (`s3`), `PlatformBackend` (eks/ecs/networking/dns/autoscaling), and plugin-native `aws.apigateway` / `aws.codebuild` / `nosql.dynamodb` / `step.s3_upload` / `storage.s3` types. -- Core: delete the 13 AWS files (+ the EKS half of `platform_kubernetes_kind.go`), drop `aws-sdk-go-v2` from go.mod. +## Phases + +Each phase is one workflow-core PR (deleting files + wiring the contract dispatch) plus one PR per affected plugin. 
Within a phase, the plugin PR may merge ahead of the core PR — core keeps the old in-process path until the contract dispatch is wired in the core PR, so a plugin implementing the published proto is harmless to load early. + +**Phase A — Azure** (smallest, validates BOTH new contracts end-to-end): +- Run the state-backend benchmark task; lock the `IaCStateBackend` proto shape. +- Run the `platform.*` interface-audit spike; lock or re-scope the `PlatformBackend` proto shape. +- Add `IaCStateBackend` + `PlatformBackend` services to `plugin/external/proto/iac.proto`. +- Add the secret-redaction + gRPC-interceptor security tasks (blocking). +- workflow-plugin-azure implements `azure_blob` `IaCStateBackend` + `aks` `PlatformBackend`. +- Core: delete `iac_state_azure.go`; strip the `azure_blob` case + `newAzureSharedKeyCredential` from `iac_module.go`; extract the `aksBackend` from `platform_kubernetes_kind.go` (leave a registration shim); drop `Azure/azure-sdk-for-go` from go.mod. + +**Phase B — AWS** (largest — 13 files, 3 surfaces). 
Complete file inventory + destination: + +| core file | destination | +|-----------|-------------| +| `iac_state_spaces.go` | aws plugin — `s3` `IaCStateBackend` (DELETE from core; also removes the `spaces` case dependency — see Phase D) | +| `cloud_account_aws.go` | DELETE (Option 1 — no replacement) | +| `cloud_account_aws_creds.go` | DELETE (Option 1 — no replacement) | +| `aws_api_gateway.go` | aws plugin — `aws.apigateway` module | +| `platform_apigateway.go` | aws plugin — `PlatformBackend` or `aws.apigateway` (gated on interface-audit spike) | +| `codebuild.go` | aws plugin — `aws.codebuild` module | +| `pipeline_step_s3_upload.go` | aws plugin — `step.s3_upload` | +| `s3_storage.go` | aws plugin — `storage.s3` module | +| `platform_autoscaling.go` | aws plugin — `PlatformBackend` (`autoscaling`) | +| `platform_dns_backends.go` | aws plugin — `PlatformBackend` (`dns`/route53) | +| `platform_ecs.go` | aws plugin — `PlatformBackend` (`ecs`) | +| `platform_networking.go` | aws plugin — `PlatformBackend` (`networking`/ec2) | +| `platform_kubernetes_kind.go` | SPLIT — `kind` stays core; `eksBackend` → aws plugin `PlatformBackend` | + +- Core: delete the AWS files per the table, drop `aws-sdk-go-v2` from go.mod. **Phase C — GCP** (3 files): -- workflow-plugin-gcp implements `IaCStateBackend` (`gcs`), `PlatformBackend` (gke), plugin-native `storage.gcs`. -- Core: delete `iac_state_gcs.go`, `storage_gcs.go`, the GKE half of `platform_kubernetes_kind.go`; drop `cloud.google.com/go` + `google.golang.org/api`. +- workflow-plugin-gcp implements `IaCStateBackend` (`gcs`), `PlatformBackend` (`gke`), plugin-native `storage.gcs`. +- Core: delete `iac_state_gcs.go`, `storage_gcs.go`; extract `gkeBackend` from `platform_kubernetes_kind.go`; drop `cloud.google.com/go` + `google.golang.org/api`. 
-**Phase D — DigitalOcean compat:** +**Phase D — DigitalOcean (`spaces` clean-break):** - workflow-plugin-digitalocean implements `IaCStateBackend` for `spaces` (S3-compatible — pulls `aws-sdk-go-v2/service/s3`, the one service package, not the whole tree). -- **Minor version bump** (compatibility break marker) + migration doc: wfctl users with `iac.state` `backend: spaces` must now have workflow-plugin-digitalocean ≥ that minor loaded. No app.yaml change for them — the backend name `spaces` is unchanged — but the plugin must be present in `data/plugins/` / `wfctl.yaml`. +- **This is a clean break, not soft-compat.** `iac_state_spaces.go` and the `spaces` case in `iac_module.go` are deleted by **Phase B's core PR** (the file is shared — `iac_state_spaces.go` is the S3-compatible store that backs *both* `s3` and `spaces`). After Phase B's core PR merges, `iac.state` with `backend: spaces` fails to build unless workflow-plugin-digitalocean (the version implementing `IaCStateBackend`) is loaded. +- **Minor version bump** on workflow-plugin-digitalocean (compatibility-break marker) + `minEngineVersion` set to the core version that drops the in-core `spaces` case + migration doc. -`platform_kubernetes_kind.go` is touched by both B (EKS) and C (GKE) — it gets split: `kind` backend stays, EKS+GKE backends extracted. Phase B and C must coordinate on that file (B lands first, leaves a GKE shim; C removes the shim). +**`platform_kubernetes_kind.go` is split across three phases** — A (`aksBackend`), B (`eksBackend`), C (`gkeBackend`); `kind` stays. Coordination: Phase A lands first and extracts `aksBackend` behind a registration shim; B and C each remove their backend + their shim entry; whichever of B/C lands last (C in the planned order) removes the final shim scaffolding. In whichever order B and C land, the file must always compile with `kind` + whatever backends haven't been extracted yet. 
## Migration (user-facing) Published in each plugin's CHANGELOG + a consolidated `docs/migrations/2026-05-14-cloud-sdk-extraction.md`: -- `iac.state` with `backend: s3|azure_blob|gcs|spaces` → load the matching plugin (`wfctl plugin install workflow-plugin-{aws,azure,gcp,digitalocean}`). yaml `backend:` value unchanged. -- `platform.kubernetes` `provider: eks|gke` etc. → load the matching plugin. yaml `provider:` value unchanged. -- `aws.apigateway` / former `cloud.account`-brokered AWS modules → module type renamed to plugin-native form; `credentials:` block moves inline (or `credentials_ref:` an `aws.credentials` module). **This is the only yaml-shape change.** +- `iac.state` with `backend: s3|azure_blob|gcs|spaces` → load the matching plugin (`wfctl plugin install workflow-plugin-{aws,azure,gcp,digitalocean}`). yaml `backend:` value unchanged. **Hard requirement after the relevant phase merges** — the in-core backend is deleted, not deprecated. +- `platform.kubernetes` / `platform.ecs` / etc. with a cloud `provider:` → load the matching plugin. yaml `provider:` value unchanged. Hard requirement after the relevant phase. +- `aws.apigateway` and other former `cloud.account`-brokered AWS modules → module type renamed to plugin-native form; `credentials:` block moves inline (or `credentials_ref:` an `aws.credentials` module). **This is the only yaml-shape change.** - `memory` / `filesystem` / `postgres` state backends, `kind` k8s backend → no change, still core. ## Assumptions -1. **gRPC's 4 MB default message cap covers real-world IaC state files.** If a deployment's state exceeds 4 MB the unary `IaCStateBackend` contract needs streaming — the benchmark task validates the typical case but a hostile-large state is out of initial scope (documented limitation, not a silent failure: `PutState` returns a clear "state exceeds transport limit" error). -2. 
**The `platform.*` backend interfaces are cleanly provider-separable.** The design assumes `kubernetesBackend` / `ecsBackend` / etc. are already interface-segregated such that the `kind` impl can stay while cloud impls extract. If a backend interface leaks SDK types into the core module shell, that shell needs an interface-extraction refactor first. -3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive, core ignores unknown backend registrations until its own half lands. +1. **gRPC's 4 MB default message cap covers real-world IaC state files.** If a deployment's state exceeds 4 MB the unary `IaCStateBackend` contract needs streaming — the benchmark task validates the typical case but a hostile-large state is out of initial scope (documented limitation, not a silent failure: `SaveState` returns a clear "state exceeds transport limit" error). The benchmark runs before the proto is locked. +2. **The `platform.*` backend interfaces are cleanly provider-separable.** The design assumes `kubernetesBackend` / `ecsBackend` / etc. are interface-segregated such that the `kind` impl can stay while cloud impls extract. **This is the most fragile assumption** — Phase A/B's first task is an interface-audit spike that validates it; if a backend interface leaks SDK types into the core module shell, that shell needs an interface-extraction refactor first and the phase re-scopes. +3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend`/`PlatformBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive, core ignores unknown backend registrations until its own half lands. 4. **`aws-sdk-go-v2/service/s3` in workflow-plugin-digitalocean is acceptable.** DO Spaces is S3-API; there is no godo-native Spaces client. 
The DO plugin already carries `godo`; adding one AWS service package is the minimal cost of self-contained `spaces` state support (vs. forcing DO users to also load workflow-plugin-aws). -5. **`cloud_account_azure.go` / `cloud_account_gcp.go` genuinely have zero SDK imports.** Verified by grep at design time; if a future change adds an SDK import there, that file joins its phase's extraction. -6. **No core code outside `module/` imports these SDKs.** Verified: the only `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` imports are under `module/`. `cmd/`, `engine.go`, `schema/`, `plugin/` are clean. +5. **`cloud_account_azure.go` / `cloud_account_gcp.go` genuinely have zero real SDK imports.** Verified by `awk` over import blocks at design time — they reference the SDKs only in comments. If a future change adds a real SDK import there, that file joins its phase's extraction. +6. **No core code outside `module/` imports these SDKs.** Verified: the only real `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` imports are under `module/`. `cmd/`, `engine.go`, `schema/`, `plugin/` are clean. A `go list -deps` CI gate in the final phase enforces this permanently. ## Rollback This design changes **plugin loading paths** and **go.mod dependency trees** — runtime-affecting per the `runtime-launch-validation` trigger list. -- **Per-phase revert:** each phase is an isolated core PR + plugin PR(s). Reverting the core PR restores the in-process backend `switch` / `platform.*` cloud backends and re-adds the SDK to `go.mod` — the deleted files are recoverable from git. The plugin PRs are additive (new contract impls / module types) and can stay merged harmlessly even if core reverts. -- **Forward-fix preferred over revert:** because core keeps the old in-process path until the contract dispatch is wired *in the same PR*, a broken phase fails at PR CI (image-launch / strict-contracts gates), not in production. 
The revert path exists but the gate is the primary safety. -- **DO `spaces` break (Phase D):** the only change with an external-user-visible compat break. Rollback = revert the DO plugin minor bump; `spaces` users on the prior plugin version are unaffected because the prior version still has the in-core path's expectations. The break only bites users who upgrade core past the phase-D core PR *without* upgrading the DO plugin — the migration doc + `minEngineVersion` bump in the plugin manifest is the guard. +- **Per-phase revert:** each phase is an isolated core PR + plugin PR(s). Reverting the **core PR** restores the in-process backend `switch` / `platform.*` cloud backends and re-adds the SDK to `go.mod` — the deleted files are recoverable from git. The plugin PRs are additive (new contract impls / module types) and can stay merged harmlessly even if core reverts. **Phase D has no separate core PR** — its core deletion *is* Phase B's core PR — so a Phase D rollback means reverting Phase B's core PR + the DO plugin PR together. +- **Forward-fix preferred over revert:** because core keeps the old in-process path until the contract dispatch is wired *in the same core PR*, a broken phase fails at PR CI (image-launch / strict-contracts gates), not in production. The revert path exists but the gate is the primary safety. +- **`spaces` clean-break (Phase B core PR + Phase D plugin PR):** the only change with an external-user-visible compat break. Rollback = revert Phase B's core PR (restores `iac_state_spaces.go` + the `spaces` case) **and** revert the DO plugin minor bump, together — they are a matched pair. The migration doc + the DO plugin's `minEngineVersion` bump are the forward guard: a user on a core version past Phase B without the new DO plugin gets a clear build-time "backend spaces requires workflow-plugin-digitalocean ≥ X" error, not a silent failure. + +## Alternatives Considered + +1. 
**Fold cloud platform provisioners into the existing `IaCProviderRequired` / `ResourceDriver` contracts instead of inventing `PlatformBackend`.** An EKS/GKE/AKS cluster — and arguably an ECS service, a Route53 zone, an EC2 VPC — is structurally a managed resource with create/plan/apply/destroy/status, which is exactly what the battle-tested `ResourceDriver` contract already models (8 services in `iac.proto`, multiple ADRs through the strict-contracts cutover). Inventing `PlatformBackend` risks the lowest-common-denominator problem (self-challenge doubt #1). **Rejected as the default** because the `platform.*` modules have a distinct plan/apply *lifecycle surface* (they sync against live cloud state continuously, not just declaratively reconcile) and a distinct `provider:` UX the user explicitly asked to preserve — but **retained as the gated fallback**: Phase A/B's interface-audit spike decides. If the five `platform.*` backend interfaces don't unify behind one `Plan/Apply/Destroy`, the implementation folds them into `ResourceDriver` rather than shipping a bad `PlatformBackend`. +2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the Goal from "core drops `aws-sdk-go-v2/*` entirely" to "drops the AWS *service-provider* tree, keeps one S3 client." The S3 client is small and stable; DO Spaces + AWS S3 are the same API; keeping one shared S3-compatible store in core avoids forcing *both* the AWS and DO plugins to each carry an S3 client and avoids a clean-break for existing `spaces` users. **Rejected** because it leaves dependabot churning one AWS package indefinitely and weakens the "core has zero cloud SDKs" invariant the `go list -deps` gate is meant to enforce — a partial extraction is a maintenance trap. The cost (both aws + DO plugins carry an S3 client) is real but bounded: it's one service package, and each plugin is independently versioned anyway. +3. 
**In-process Go-module plugin loading (build-tag imports) instead of gRPC sidecars.** Rejected in brainstorm by explicit user decision — strict gRPC sidecar model only. -## Self-challenge — top doubts surfaced +## Self-challenge — top doubts surfaced (carried forward, with mitigations now wired into phases) -1. **`PlatformBackend` may be over-general.** Five `platform.*` types (kubernetes/ecs/networking/dns/autoscaling) behind one `Plan/Apply/Destroy` contract with a `platform_type` discriminator — this risks a lowest-common-denominator contract that fits none of them well. *Mitigation:* writing-plans should validate the contract shape against all five backend interfaces before locking the proto; if they don't unify cleanly, split into per-family contracts or fold the cloud platform provisioners into the existing `IaCProviderRequired` resource model instead. -2. **Assumption 2 (clean provider-separability) is the most fragile.** If `platform_kubernetes_kind.go`'s `kubernetesBackend` interface returns SDK-typed values, "keep kind, extract EKS/GKE" requires a non-trivial interface refactor that this design hand-waves. *Mitigation:* the first task of Phase B/C is an interface-audit spike — if it fails, the phase re-scopes. -3. **The state-backend benchmark could come back "streaming required"** and invalidate the unary default, reshaping the `IaCStateBackend` proto after Phase A has already shipped it. *Mitigation:* run the benchmark *before* finalizing the Phase A proto — it's a writing-plans task ordered ahead of the contract lock, not after. +1. **`PlatformBackend` may be over-general.** Mitigation: interface-audit spike is Phase A/B task 1, ordered before the proto lock; Alternatives Considered #1 is the documented fallback. +2. **Assumption 2 (clean provider-separability) is the most fragile.** Mitigation: same interface-audit spike; if it fails, the phase re-scopes to do the interface-extraction refactor first. +3. 
**The state-backend benchmark could come back "streaming required"** and reshape the `IaCStateBackend` proto. Mitigation: benchmark is a Phase A task ordered *before* the proto lock — the proto is not committed until the benchmark result is in. ## Open items deferred to writing-plans -- Exact proto field layouts for all three contracts (sketches above are directional). -- Whether `PlatformBackend` is one contract or per-family (gated on self-challenge doubt #1). -- The `platform.*` backend interface-audit spike (gated on self-challenge doubt #2). -- Benchmark harness location + acceptance threshold (gated on self-challenge doubt #3). -- Per-plugin CHANGELOG + the consolidated migration doc wording. +- Exact proto field layouts for both new contracts (sketches above are directional; field-level layout follows the benchmark + interface-audit results). +- Whether `PlatformBackend` ships as designed or folds into `ResourceDriver` (gated on the interface-audit spike — Alternatives Considered #1). +- Benchmark harness location + the concrete acceptance threshold (p99 added latency bar). +- Exact wording of the secret-redaction extension + whether existing redaction already covers `credentials:` keys. +- Per-plugin CHANGELOG entries + the consolidated migration doc wording. From 18534e8f47162a788148ba13ca730afd5c2380f5 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 00:52:11 -0400 Subject: [PATCH 03/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20adversarial=20review=20cycle=202=20revision?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 2 Critical + 3 Important from cycle-2 review: Critical: - platform_kubernetes_kind.go handling reworked. Added Phase 0: a pure mechanical precursor file-split (kind/eks/gke/aks → 4 files, each with its own import block). The "always compiles across phases" property is now structural, not asserted. 
Added a verified per-file import-ownership table. - Corrected the false Phase A rationale: aksBackend uses raw net/http REST, NOT the Azure SDK (verified — no azure-sdk symbol in the aksBackend region). The Azure go.mod drop comes entirely from iac_state_azure.go deletion + iac_module.go edit; aksBackend extraction is code-organisation, not a dependency change. - Documented the eksBackend → cloud_account_aws.go call-graph edge as a hard same-commit atomicity constraint (verified: eksBackend calls awsProviderFrom + AWSConfig at platform_kubernetes_kind.go:96,105,138). Important: - Phase B core-PR bullet now explicitly lists "strip the spaces case from iac_module.go" (was only obliquely referenced). - New §Failure modes section: orphaned-lock-on-plugin-crash → lease_ttl_seconds contract field; SaveState lost-response retry → documented idempotent (full-state replace, last-writer-wins); plugin-unreachable → abort before mutation; PlatformBackend mid-Apply crash → identical to today's in-process risk, no new mitigation. - §Security gRPC-logging bullet concretized: VERIFIED plugin SDK adds no body-logging interceptor (grpc.NewServer(opts...) passthrough; only callback_server.go logs, never module config). Writing-plans adds a guard test instead of a conditional interceptor. Minor: file-count table footnoted (count = importers, not deletions); shared s3compat module added as Alternatives Considered #3 (deferred, not rejected); self-challenge doubt numbering tidied (2 mitigations cover 3 doubts, intentionally). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 113 ++++++++++++------ 1 file changed, 75 insertions(+), 38 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index 23bb37b3..9cafbdd5 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -6,13 +6,13 @@ ## Problem -Workflow core's `module/` package imports three cloud SDK trees directly. File counts are grep-verified (`awk` over import blocks, comment-only matches excluded): +Workflow core's `module/` package imports three cloud SDK trees directly. File counts are grep-verified (`awk` over import blocks, comment-only matches excluded). "Files" = files with a real import — not all are *deleted* (e.g. `iac_module.go` is *edited* to strip a `case`, not deleted; see Phases): -| SDK | Files (real imports) | -|-----|----------------------| -| `github.com/aws/aws-sdk-go-v2/*` | **13** | -| `github.com/Azure/azure-sdk-for-go/sdk/*` (azcore + azblob) | **2** | -| `cloud.google.com/go/storage` + `google.golang.org/api/*` | **3** | +| SDK | Files (real imports) | how core sheds it | +|-----|----------------------|-------------------| +| `github.com/aws/aws-sdk-go-v2/*` | **13** | 11 deleted, `iac_module.go` edited (strip `spaces` case), `platform_kubernetes_eks.go` deleted (post Phase-0 split) | +| `github.com/Azure/azure-sdk-for-go/sdk/*` (azcore + azblob) | **2** | `iac_state_azure.go` deleted, `iac_module.go` edited (strip `azure_blob` case) | +| `cloud.google.com/go/storage` + `google.golang.org/api/*` | **3** | `iac_state_gcs.go` + `storage_gcs.go` deleted, `platform_kubernetes_gke.go` deleted (post Phase-0 split) | Every dependabot bump of a cloud SDK (PRs #400/#419/#421/#635 as of this writing) churns workflow core's `go.sum`, inflates the binary, and couples core release cadence to vendor SDK release cadence. 
The `workflow-plugin-{aws,azure,gcp,digitalocean}` plugins already exist and already carry these SDKs for their IaC *resource provider* role — core's direct usage is redundant surface. @@ -54,10 +54,11 @@ service IaCStateBackend { rpc Unlock (UnlockRequest) returns (UnlockResponse); // → IaCStateStore.Unlock } message GetStateResponse { IaCState state = 1; bool exists = 2; } -message SaveStateRequest { IaCState state = 1; } +message SaveStateRequest { IaCState state = 1; } // idempotent: full-state replace, last-writer-wins message ListStatesRequest { map<string, string> filter = 1; } -// IaCState mirrors module.IaCState; Lock/Unlock carry resource_id only -// (the in-core IaCStateStore.Lock signature takes no lease token/duration). +message LockRequest { string resource_id = 1; int64 lease_ttl_seconds = 2; } // TTL: plugin-backed locks self-clear on orphan; in-core backends ignore it +// IaCState mirrors module.IaCState. lease_ttl_seconds is contract-only — the +// module.IaCStateStore interface gains no method; the core dispatcher defaults it. ``` Backend ownership — every cloud plugin implements the contract for its native storage: @@ -117,51 +118,84 @@ Credential handling (Option 1, approved): the deleted `cloud_account_aws.go` + ` Option 1 moves raw cloud secrets (`accessKey`/`secretKey`/`account_key`/etc.) inline into every plugin-native module's `credentials:` config block — multiplying the number of config sites holding plaintext secrets versus today's single `cloud.account` module. This is not unprecedented (`iac_module.go`'s current `spaces` case already inlines `accessKey`/`secretKey`), but the multiplication needs explicit handling: - **Config-version store + execution tracing.** Workflow's config-version store (SHA-256 content-addressed) and execution-tracing layer marshal module config. Plugin-native module config carrying inline credentials MUST be redacted before persistence/tracing. 
Writing-plans task: extend the existing PII/secret redaction (already per-tenant-toggleable per `workflow-cloud`) to recognise the `credentials:` / `credentials_ref:` keys on plugin module config, OR confirm the existing redaction already covers any key matching a secret-pattern. This is a **blocking** task — it ships in the same phase as the first plugin-native AWS module, not after. -- **gRPC sidecar request logging.** The `IaCStateBackend` / `PlatformBackend` requests cross the engine↔plugin gRPC boundary. State payloads are not secrets, but `credentials:` blocks passed in `CreateModule` requests to the plugin ARE. Confirm the plugin SDK's gRPC interceptors do not log full request bodies at info level; if they do, add a redacting interceptor. Writing-plans task in Phase A (first contract wired). +- **gRPC sidecar request logging.** The `IaCStateBackend` / `PlatformBackend` requests cross the engine↔plugin gRPC boundary, and `credentials:` blocks ride in `CreateModule` requests. **Verified at design time:** `plugin/external/grpc_plugin.go:39` constructs the server as `grpc.NewServer(opts...)` with `opts` passed straight through from the go-plugin broker — workflow's plugin SDK adds **no body-logging interceptor**. The only request-body logging anywhere in `plugin/external/` is `callback_server.go:85,118` (the plugin→host callback path: a `Log` RPC's `req.Message`, and a subscribe RPC's topic byte-count) — neither touches module config. `CreateModule` is dispatched at `adapter.go:477` with no logging of the request. **Conclusion: no redacting interceptor is needed today.** Writing-plans adds a guard test asserting no interceptor logs `CreateModule` bodies, so a future SDK change that adds one fails CI rather than silently leaking. 
- **`credentials_ref:` blast radius.** A `credentials_ref` resolves to an in-plugin `aws.credentials` module within the *same plugin process* — it does not broaden which process can read the secret (engine never sees the resolved `aws.Config`, only the plugin does). This is strictly *narrower* than today's `cloud.account` (which builds `aws.Config` in the engine process). Documented as an improvement, not a risk. +## Failure modes + +Moving the IaC state store behind a gRPC sidecar introduces a partial-failure surface on the engine's hottest path (every plan/apply does `Lock` → `GetState` → ... → `SaveState` → `Unlock`). The in-process store had none of these: + +- **Plugin crashes between `Lock` and `Unlock` → orphaned lock.** An in-process lock dies with the process; a gRPC-plugin lock can outlive a plugin crash if the plugin persisted it (S3/Blob lock objects do persist). Mitigation, **wired into the contract**: `LockRequest` carries an optional `lease_ttl_seconds` field. Plugin-backed implementations write the lock with that TTL (S3 object with expiry metadata, Blob lease duration, etc.) so an orphaned lock self-clears. In-core backends (`memory`/`filesystem`/`postgres`) ignore the field — their `Lock` is process-scoped or transactional and cannot orphan across a crash. The `module.IaCStateStore` interface gains no new method; `lease_ttl_seconds` is contract-only, defaulted by the core dispatcher. +- **`SaveState` succeeds plugin-side but the gRPC response is lost → engine retries → double-write.** `SaveState` MUST be idempotent: it is a full-state replace keyed by `resource_id` (the existing `IaCStateStore.SaveState` is already "insert or replace"), so a retried identical `SaveState` is a no-op-equivalent. The contract documents `SaveState` as idempotent; the plugin implementations use unconditional PUT (overwrite), not append. No sequence number needed — IaC state is last-writer-wins by design. 
+- **Plugin unreachable at plan/apply start.** Core's `iac.state` dispatch returns a clear `"iac.state backend %q: plugin unreachable"` error and the plan/apply aborts *before* mutating anything — no partial state. This matches today's behavior when a misconfigured backend fails to construct in `IaCModule.Init()`. +- **`PlatformBackend` plugin crash mid-`Apply`.** A `platform.*` apply that crashes mid-flight leaves real cloud resources in an indeterminate state — but this is **identical to today's in-process risk** (an in-process `eksBackend.apply()` panic leaves the same indeterminate cloud state). The gRPC boundary does not worsen it; the next `Plan` reconciles against live cloud state as it does today. No new mitigation needed — documented as unchanged. + +## Per-file import ownership (verified) + +`module/platform_kubernetes_kind.go` is the one file shared across phases. Verified import ownership (`grep` per SDK symbol against the single import block at lines 3-19): + +| backend | cloud SDK imports it owns | extracted in | +|---------|--------------------------|--------------| +| `kindBackend` | none (in-memory) | — stays in core | +| `eksBackend` | `aws-sdk-go-v2/aws`, `service/eks`, `eks/types`; also **calls `awsProviderFrom` + `AWSConfig()` from `cloud_account_aws.go`** | Phase B | +| `gkeBackend` | `google.golang.org/api/container/v1`, `google.golang.org/api/option` | Phase C | +| `aksBackend` | **none** — uses raw `net/http` REST against the Azure management API (the file header comment "Requires the Azure SDK" is stale; verified no `azure-sdk-for-go` symbol in the `aksBackend` region) | Phase A | + +Two consequences this corrects from earlier drafts: +- **`aksBackend` extraction does NOT drop the Azure SDK** — `aksBackend` never imported it. The Azure go.mod drop is achieved entirely by deleting `iac_state_azure.go` + editing `iac_module.go`. 
Moving `aksBackend` to the plugin is still done (cloud-platform code belongs in the plugin, and `PlatformBackend` needs an Azure impl) but it is a *code-organisation* change, not a *dependency* change. +- **`eksBackend` has a hard call-graph edge to `cloud_account_aws.go`** — they MUST be removed in the same commit, or core fails to compile. + +## Phase 0 — precursor: split `platform_kubernetes_kind.go` by backend + +A pure mechanical refactor, no behavior change, landed **before** Phase A. Split the one shared file into four: +- `platform_kubernetes_kind.go` — `kindBackend` + the shared `kubernetesBackend` interface + `PlatformKubernetes` module shell. No cloud SDK imports after the split. +- `platform_kubernetes_eks.go` — `eksBackend` only; owns the `aws-sdk-go-v2` imports. +- `platform_kubernetes_gke.go` — `gkeBackend` only; owns the `google.golang.org/api` imports. +- `platform_kubernetes_aks.go` — `aksBackend` only; owns `net/http` (no cloud SDK). + +After Phase 0, each subsequent phase deletes *its own backend file* with its own self-contained import block — the "always compiles" property is then structural, not asserted. Verification: `go build ./... && go test ./module/...` green, zero behavior diff (the split moves code, touches no logic). This is the single cheapest de-risking move in the plan — it converts the fragile "extract-from-shared-file-in-place" path into four trivially-reviewable deletions. + ## Phases -Each phase is one workflow-core PR (deleting files + wiring the contract dispatch) plus one PR per affected plugin. Within a phase, the plugin PR may merge ahead of the core PR — core keeps the old in-process path until the contract dispatch is wired in the core PR, so a plugin implementing the published proto is harmless to load early. +Each phase is one workflow-core PR (deleting files + wiring the contract dispatch) plus one PR per affected plugin. 
Within a phase, the plugin PR may merge ahead of the core PR — core keeps the old in-process path until the contract dispatch is wired in the core PR, so a plugin implementing the published proto is harmless to load early. **Atomicity rule:** within a core PR, a deleted file and every file that references its symbols are removed in the *same commit* (the build gate enforces this — a dangling reference fails CI). **Phase A — Azure** (smallest, validates BOTH new contracts end-to-end): - Run the state-backend benchmark task; lock the `IaCStateBackend` proto shape. -- Run the `platform.*` interface-audit spike; lock or re-scope the `PlatformBackend` proto shape. +- Run the `platform.*` interface-audit spike; lock or re-scope the `PlatformBackend` proto shape (Alternatives Considered #1). - Add `IaCStateBackend` + `PlatformBackend` services to `plugin/external/proto/iac.proto`. -- Add the secret-redaction + gRPC-interceptor security tasks (blocking). +- Add the secret-redaction task + the gRPC-interceptor guard test (security tasks, blocking). - workflow-plugin-azure implements `azure_blob` `IaCStateBackend` + `aks` `PlatformBackend`. -- Core: delete `iac_state_azure.go`; strip the `azure_blob` case + `newAzureSharedKeyCredential` from `iac_module.go`; extract the `aksBackend` from `platform_kubernetes_kind.go` (leave a registration shim); drop `Azure/azure-sdk-for-go` from go.mod. +- Core PR: delete `iac_state_azure.go`; strip the `azure_blob` case + `newAzureSharedKeyCredential` from `iac_module.go` **(this + the deletion is what drops `Azure/azure-sdk-for-go` from go.mod)**; delete `platform_kubernetes_aks.go` (from Phase 0) and wire its `PlatformBackend` dispatch. **Phase B — AWS** (largest — 13 files, 3 surfaces). 
Complete file inventory + destination: -| core file | destination | -|-----------|-------------| -| `iac_state_spaces.go` | aws plugin — `s3` `IaCStateBackend` (DELETE from core; also removes the `spaces` case dependency — see Phase D) | -| `cloud_account_aws.go` | DELETE (Option 1 — no replacement) | -| `cloud_account_aws_creds.go` | DELETE (Option 1 — no replacement) | -| `aws_api_gateway.go` | aws plugin — `aws.apigateway` module | -| `platform_apigateway.go` | aws plugin — `PlatformBackend` or `aws.apigateway` (gated on interface-audit spike) | -| `codebuild.go` | aws plugin — `aws.codebuild` module | -| `pipeline_step_s3_upload.go` | aws plugin — `step.s3_upload` | -| `s3_storage.go` | aws plugin — `storage.s3` module | -| `platform_autoscaling.go` | aws plugin — `PlatformBackend` (`autoscaling`) | -| `platform_dns_backends.go` | aws plugin — `PlatformBackend` (`dns`/route53) | -| `platform_ecs.go` | aws plugin — `PlatformBackend` (`ecs`) | -| `platform_networking.go` | aws plugin — `PlatformBackend` (`networking`/ec2) | -| `platform_kubernetes_kind.go` | SPLIT — `kind` stays core; `eksBackend` → aws plugin `PlatformBackend` | - -- Core: delete the AWS files per the table, drop `aws-sdk-go-v2` from go.mod. 
+| core file | destination | atomicity note | +|-----------|-------------|----------------| +| `iac_state_spaces.go` | aws plugin — `s3` `IaCStateBackend` (DELETE from core) | shared with `spaces` — see Phase D | +| `cloud_account_aws.go` | DELETE (Option 1 — no replacement) | **same commit as `platform_kubernetes_eks.go`** (call-graph edge) | +| `cloud_account_aws_creds.go` | DELETE (Option 1 — no replacement) | same commit as above | +| `platform_kubernetes_eks.go` (from Phase 0) | aws plugin — `eks` `PlatformBackend` | **same commit as `cloud_account_aws*.go`** | +| `aws_api_gateway.go` | aws plugin — `aws.apigateway` module | — | +| `platform_apigateway.go` | aws plugin — `PlatformBackend` or `aws.apigateway` (gated on interface-audit spike) | — | +| `codebuild.go` | aws plugin — `aws.codebuild` module | — | +| `pipeline_step_s3_upload.go` | aws plugin — `step.s3_upload` | — | +| `s3_storage.go` | aws plugin — `storage.s3` module | — | +| `platform_autoscaling.go` | aws plugin — `PlatformBackend` (`autoscaling`) | — | +| `platform_dns_backends.go` | aws plugin — `PlatformBackend` (`dns`/route53) | — | +| `platform_ecs.go` | aws plugin — `PlatformBackend` (`ecs`) | — | +| `platform_networking.go` | aws plugin — `PlatformBackend` (`networking`/ec2) | — | + +- Core PR also: **strip the `spaces` case from `iac_module.go`** (it calls `NewSpacesIaCStateStore` from the deleted `iac_state_spaces.go` — same compile-dependency pattern as Phase A's `azure_blob` strip). Drop `aws-sdk-go-v2` from go.mod. **Phase C — GCP** (3 files): - workflow-plugin-gcp implements `IaCStateBackend` (`gcs`), `PlatformBackend` (`gke`), plugin-native `storage.gcs`. -- Core: delete `iac_state_gcs.go`, `storage_gcs.go`; extract `gkeBackend` from `platform_kubernetes_kind.go`; drop `cloud.google.com/go` + `google.golang.org/api`. +- Core PR: delete `iac_state_gcs.go`, `storage_gcs.go`, `platform_kubernetes_gke.go` (from Phase 0); drop `cloud.google.com/go` + `google.golang.org/api`. 
After Phase C, `go list -deps ./...` shows zero cloud-SDK packages — the permanent CI gate is added here. **Phase D — DigitalOcean (`spaces` clean-break):** - workflow-plugin-digitalocean implements `IaCStateBackend` for `spaces` (S3-compatible — pulls `aws-sdk-go-v2/service/s3`, the one service package, not the whole tree). -- **This is a clean break, not soft-compat.** `iac_state_spaces.go` and the `spaces` case in `iac_module.go` are deleted by **Phase B's core PR** (the file is shared — `iac_state_spaces.go` is the S3-compatible store that backs *both* `s3` and `spaces`). After Phase B's core PR merges, `iac.state` with `backend: spaces` fails to build unless workflow-plugin-digitalocean (the version implementing `IaCStateBackend`) is loaded. +- **This is a clean break, not soft-compat.** `iac_state_spaces.go` + the `spaces` case in `iac_module.go` are deleted by **Phase B's core PR** (`iac_state_spaces.go` is the one S3-compatible store backing *both* `s3` and `spaces`). After Phase B's core PR merges, `iac.state` with `backend: spaces` fails to build unless the DO plugin version implementing `IaCStateBackend` is loaded. - **Minor version bump** on workflow-plugin-digitalocean (compatibility-break marker) + `minEngineVersion` set to the core version that drops the in-core `spaces` case + migration doc. - -**`platform_kubernetes_kind.go` is split across three phases** — A (`aksBackend`), B (`eksBackend`), C (`gkeBackend`); `kind` stays. Coordination: Phase A lands first and extracts `aksBackend` behind a registration shim; B and C each remove their backend + their shim entry; the last phase to land (C) removes the final shim scaffolding. Whichever order B/C land, the file must always compile with `kind` + whatever backends haven't extracted yet. +- **Sequencing:** the DO plugin PR (implementing `spaces` `IaCStateBackend`) MUST merge + release before Phase B's core PR merges — otherwise there is a window where `backend: spaces` has no implementation anywhere. 
Writing-plans orders the DO plugin PR as a Phase-B blocker. ## Migration (user-facing) @@ -193,12 +227,14 @@ This design changes **plugin loading paths** and **go.mod dependency trees** — 1. **Fold cloud platform provisioners into the existing `IaCProviderRequired` / `ResourceDriver` contracts instead of inventing `PlatformBackend`.** An EKS/GKE/AKS cluster — and arguably an ECS service, a Route53 zone, an EC2 VPC — is structurally a managed resource with create/plan/apply/destroy/status, which is exactly what the battle-tested `ResourceDriver` contract already models (8 services in `iac.proto`, multiple ADRs through the strict-contracts cutover). Inventing `PlatformBackend` risks the lowest-common-denominator problem (self-challenge doubt #1). **Rejected as the default** because the `platform.*` modules have a distinct plan/apply *lifecycle surface* (they sync against live cloud state continuously, not just declaratively reconcile) and a distinct `provider:` UX the user explicitly asked to preserve — but **retained as the gated fallback**: Phase A/B's interface-audit spike decides. If the five `platform.*` backend interfaces don't unify behind one `Plan/Apply/Destroy`, the implementation folds them into `ResourceDriver` rather than shipping a bad `PlatformBackend`. 2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the Goal from "core drops `aws-sdk-go-v2/*` entirely" to "drops the AWS *service-provider* tree, keeps one S3 client." The S3 client is small and stable; DO Spaces + AWS S3 are the same API; keeping one shared S3-compatible store in core avoids forcing *both* the AWS and DO plugins to each carry an S3 client and avoids a clean-break for existing `spaces` users. **Rejected** because it leaves dependabot churning one AWS package indefinitely and weakens the "core has zero cloud SDKs" invariant the `go list -deps` gate is meant to enforce — a partial extraction is a maintenance trap. 
The cost (both aws + DO plugins carry an S3 client) is real but bounded: it's one service package, and each plugin is independently versioned anyway. -3. **In-process Go-module plugin loading (build-tag imports) instead of gRPC sidecars.** Rejected in brainstorm by explicit user decision — strict gRPC sidecar model only. +3. **A shared `s3compat` Go module consumed by both the aws and DO plugins** (instead of each independently re-implementing the S3-compatible state store + `buildAWSConfig`). Keeps "core has zero cloud SDKs" intact while eliminating the cross-plugin duplication Alternative #2 dismisses as "bounded." **Deferred, not rejected:** it is a *plugin-side* optimisation that doesn't affect the core contract or any phase boundary, so it can land as a follow-up after the extraction is proven. Forcing it into the critical path now couples the aws and DO plugin release cadences; the duplication is a small, well-understood `buildAWSConfig` + thin S3 wrapper. Writing-plans logs it as a post-extraction cleanup candidate. +4. **In-process Go-module plugin loading (build-tag imports) instead of gRPC sidecars.** Rejected in brainstorm by explicit user decision — strict gRPC sidecar model only. ## Self-challenge — top doubts surfaced (carried forward, with mitigations now wired into phases) -1. **`PlatformBackend` may be over-general.** Mitigation: interface-audit spike is Phase A/B task 1, ordered before the proto lock; Alternatives Considered #1 is the documented fallback. -2. **Assumption 2 (clean provider-separability) is the most fragile.** Mitigation: same interface-audit spike; if it fails, the phase re-scopes to do the interface-extraction refactor first. +Two distinct mitigations cover three doubts (#1 and #2 share the interface-audit spike — that is intentional, not redundant coverage theatre): + +1. **`PlatformBackend` may be over-general** AND **2. 
clean provider-separability (Assumption 2) is fragile.** Both are settled by the *one* interface-audit spike — Phase 0/A task 1, ordered before the proto lock. If the five `platform.*` backend interfaces don't unify behind one `Plan/Apply/Destroy`, the fallback is folding cloud platform provisioners into `ResourceDriver` (Alternatives Considered #1); if a backend interface leaks SDK types into its core module shell, the phase re-scopes to do the interface-extraction refactor first. Phase 0's mechanical file-split also de-risks #2 structurally — each backend's imports are isolated before any extraction. 3. **The state-backend benchmark could come back "streaming required"** and reshape the `IaCStateBackend` proto. Mitigation: benchmark is a Phase A task ordered *before* the proto lock — the proto is not committed until the benchmark result is in. ## Open items deferred to writing-plans @@ -207,4 +243,5 @@ This design changes **plugin loading paths** and **go.mod dependency trees** — - Whether `PlatformBackend` ships as designed or folds into `ResourceDriver` (gated on the interface-audit spike — Alternatives Considered #1). - Benchmark harness location + the concrete acceptance threshold (p99 added latency bar). - Exact wording of the secret-redaction extension + whether existing redaction already covers `credentials:` keys. +- The `s3compat` shared-module cleanup (Alternatives Considered #3) — logged as a post-extraction follow-up candidate, not in the critical path. - Per-plugin CHANGELOG entries + the consolidated migration doc wording. 
From 5fb20003f50922f31239a36c49d4f701195c5a67 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 00:52:26 -0400 Subject: [PATCH 04/39] docs(plans): fix stale Phase A/B refs + Status line post-cycle-2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sed in the cycle-2 commit ran from the wrong cwd — Status line still said "cycle 1" and two interface-audit-spike references still said "Phase A/B" instead of "Phase 0/A". Pure text cleanup, no design change. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/plans/2026-05-14-cloud-sdk-extraction-design.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index 9cafbdd5..a9847c85 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -1,7 +1,7 @@ # Cloud-SDK Extraction: workflow core → strict-contract plugins **Date:** 2026-05-14 -**Status:** Design — revised after adversarial review cycle 1 +**Status:** Design — revised after adversarial review cycle 2 **Owner:** autonomous pipeline (workflow#TBD) ## Problem @@ -209,7 +209,7 @@ Published in each plugin's CHANGELOG + a consolidated `docs/migrations/2026-05-1 ## Assumptions 1. **gRPC's 4 MB default message cap covers real-world IaC state files.** If a deployment's state exceeds 4 MB the unary `IaCStateBackend` contract needs streaming — the benchmark task validates the typical case but a hostile-large state is out of initial scope (documented limitation, not a silent failure: `SaveState` returns a clear "state exceeds transport limit" error). The benchmark runs before the proto is locked. -2. **The `platform.*` backend interfaces are cleanly provider-separable.** The design assumes `kubernetesBackend` / `ecsBackend` / etc. are interface-segregated such that the `kind` impl can stay while cloud impls extract. 
**This is the most fragile assumption** — Phase A/B's first task is an interface-audit spike that validates it; if a backend interface leaks SDK types into the core module shell, that shell needs an interface-extraction refactor first and the phase re-scopes. +2. **The `platform.*` backend interfaces are cleanly provider-separable.** The design assumes `kubernetesBackend` / `ecsBackend` / etc. are interface-segregated such that the `kind` impl can stay while cloud impls extract. **This is the most fragile assumption** — the Phase 0/A interface-audit spike (first writing-plans task) validates it; if a backend interface leaks SDK types into the core module shell, that shell needs an interface-extraction refactor first and the phase re-scopes. 3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend`/`PlatformBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive, core ignores unknown backend registrations until its own half lands. 4. **`aws-sdk-go-v2/service/s3` in workflow-plugin-digitalocean is acceptable.** DO Spaces is S3-API; there is no godo-native Spaces client. The DO plugin already carries `godo`; adding one AWS service package is the minimal cost of self-contained `spaces` state support (vs. forcing DO users to also load workflow-plugin-aws). 5. **`cloud_account_azure.go` / `cloud_account_gcp.go` genuinely have zero real SDK imports.** Verified by `awk` over import blocks at design time — they reference the SDKs only in comments. If a future change adds a real SDK import there, that file joins its phase's extraction. @@ -225,7 +225,7 @@ This design changes **plugin loading paths** and **go.mod dependency trees** — ## Alternatives Considered -1. 
**Fold cloud platform provisioners into the existing `IaCProviderRequired` / `ResourceDriver` contracts instead of inventing `PlatformBackend`.** An EKS/GKE/AKS cluster — and arguably an ECS service, a Route53 zone, an EC2 VPC — is structurally a managed resource with create/plan/apply/destroy/status, which is exactly what the battle-tested `ResourceDriver` contract already models (8 services in `iac.proto`, multiple ADRs through the strict-contracts cutover). Inventing `PlatformBackend` risks the lowest-common-denominator problem (self-challenge doubt #1). **Rejected as the default** because the `platform.*` modules have a distinct plan/apply *lifecycle surface* (they sync against live cloud state continuously, not just declaratively reconcile) and a distinct `provider:` UX the user explicitly asked to preserve — but **retained as the gated fallback**: Phase A/B's interface-audit spike decides. If the five `platform.*` backend interfaces don't unify behind one `Plan/Apply/Destroy`, the implementation folds them into `ResourceDriver` rather than shipping a bad `PlatformBackend`. +1. **Fold cloud platform provisioners into the existing `IaCProviderRequired` / `ResourceDriver` contracts instead of inventing `PlatformBackend`.** An EKS/GKE/AKS cluster — and arguably an ECS service, a Route53 zone, an EC2 VPC — is structurally a managed resource with create/plan/apply/destroy/status, which is exactly what the battle-tested `ResourceDriver` contract already models (8 services in `iac.proto`, multiple ADRs through the strict-contracts cutover). Inventing `PlatformBackend` risks the lowest-common-denominator problem (self-challenge doubt #1). 
**Rejected as the default** because the `platform.*` modules have a distinct plan/apply *lifecycle surface* (they sync against live cloud state continuously, not just declaratively reconcile) and a distinct `provider:` UX the user explicitly asked to preserve — but **retained as the gated fallback**: the Phase 0/A interface-audit spike decides. If the five `platform.*` backend interfaces don't unify behind one `Plan/Apply/Destroy`, the implementation folds them into `ResourceDriver` rather than shipping a bad `PlatformBackend`. 2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the Goal from "core drops `aws-sdk-go-v2/*` entirely" to "drops the AWS *service-provider* tree, keeps one S3 client." The S3 client is small and stable; DO Spaces + AWS S3 are the same API; keeping one shared S3-compatible store in core avoids forcing *both* the AWS and DO plugins to each carry an S3 client and avoids a clean-break for existing `spaces` users. **Rejected** because it leaves dependabot churning one AWS package indefinitely and weakens the "core has zero cloud SDKs" invariant the `go list -deps` gate is meant to enforce — a partial extraction is a maintenance trap. The cost (both aws + DO plugins carry an S3 client) is real but bounded: it's one service package, and each plugin is independently versioned anyway. 3. **A shared `s3compat` Go module consumed by both the aws and DO plugins** (instead of each independently re-implementing the S3-compatible state store + `buildAWSConfig`). Keeps "core has zero cloud SDKs" intact while eliminating the cross-plugin duplication Alternative #2 dismisses as "bounded." **Deferred, not rejected:** it is a *plugin-side* optimisation that doesn't affect the core contract or any phase boundary, so it can land as a follow-up after the extraction is proven. 
Forcing it into the critical path now couples the aws and DO plugin release cadences; the duplication is a small, well-understood `buildAWSConfig` + thin S3 wrapper. Writing-plans logs it as a post-extraction cleanup candidate. 4. **In-process Go-module plugin loading (build-tag imports) instead of gRPC sidecars.** Rejected in brainstorm by explicit user decision — strict gRPC sidecar model only. From 39a3b434fa9ba531ebb2e85e78308fa1551bff40 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 00:59:02 -0400 Subject: [PATCH 05/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20adversarial=20review=20cycle=203=20revision?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 2 Critical + 2 Important from cycle-3 review: Critical (same root — symbol-level coupling the import-block audit missed): - parseStringSlice (in cloud_account_aws.go, which Phase B deletes) and safeIntToInt32 (in core-staying platform_kubernetes.go) are pure helpers the plugin-bound backend files call. An import-block audit is symbol-blind. Fix: Phase 0 now does TWO moves — the file split AND relocating both helpers into a new SDK-free core module/cloud_helpers.go. Per-file table gains a "cross-file symbol deps (the trap)" column listing every helper edge per backend. Phase 0 acceptance criteria now include a grep that no core file references the helpers from their old homes. - §Phase 0 corrected: platform_kubernetes.go is a SEPARATE existing file (module shell + kubernetesBackend interface + safeIntToInt32) — NOT touched by the split; only platform_kubernetes_kind.go (holds all 4 backends) is split. Earlier draft conflated the two files. Important: - Per-file ownership table relabelled "intended post-split — verified by the Phase 0 build gate" (was asserted-as-verified against an unsplit file — same hand-waving class cycle-2 flagged for "always compiles"). 
- lease_ttl_seconds DROPPED from the Phase A proto. It was a contract field with no enforced semantics and no implementing backend in scope — YAGNI. §Failure-modes orphaned-lock reworked: documented limitation + operator-side lock-object delete for recovery; TTL is a planned ADDITIVE follow-up paired with a conformance test, shipped with the first backend that honors expiry. Added explicit Lock-contention behavior (immediate error, matches today's in-process IaCStateStore.Lock — no new waiting state). Minor: Phase 0 rollback sentence added; garbled §Assumptions 2 sentence fixed; §Assumptions 2 notes Phase 0 de-risks it structurally. Also: removed a stray stale cycle-1 copy of this doc that was sitting untracked in the main workflow checkout (the canonical doc is here in the feat/cloud-sdk-extraction worktree). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 52 ++++++++++++------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index a9847c85..7270f6d8 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -56,9 +56,11 @@ service IaCStateBackend { message GetStateResponse { IaCState state = 1; bool exists = 2; } message SaveStateRequest { IaCState state = 1; } // idempotent: full-state replace, last-writer-wins message ListStatesRequest { map<string, string> filter = 1; } -message LockRequest { string resource_id = 1; int64 lease_ttl_seconds = 2; } // TTL: plugin-backed locks self-clear on orphan; in-core backends ignore it -// IaCState mirrors module.IaCState. lease_ttl_seconds is contract-only — the -// module.IaCStateStore interface gains no method; the core dispatcher defaults it. +message LockRequest { string resource_id = 1; } // 1:1 with IaCStateStore.Lock — no TTL field (see Failure modes) +// IaCState mirrors module.IaCState. 
The proto is exactly the 6-method interface, +// nothing speculative — a lock-lease/TTL field is a planned ADDITIVE follow-up +// (Failure modes §), deferred until the first plugin backend implements honored +// expiry so it ships with a conformance test instead of as a no-op field. ``` Backend ownership — every cloud plugin implements the contract for its native storage: @@ -125,35 +127,49 @@ Option 1 moves raw cloud secrets (`accessKey`/`secretKey`/`account_key`/etc.) in Moving the IaC state store behind a gRPC sidecar introduces a partial-failure surface on the engine's hottest path (every plan/apply does `Lock` → `GetState` → ... → `SaveState` → `Unlock`). The in-process store had none of these: -- **Plugin crashes between `Lock` and `Unlock` → orphaned lock.** An in-process lock dies with the process; a gRPC-plugin lock can outlive a plugin crash if the plugin persisted it (S3/Blob lock objects do persist). Mitigation, **wired into the contract**: `LockRequest` carries an optional `lease_ttl_seconds` field. Plugin-backed implementations write the lock with that TTL (S3 object with expiry metadata, Blob lease duration, etc.) so an orphaned lock self-clears. In-core backends (`memory`/`filesystem`/`postgres`) ignore the field — their `Lock` is process-scoped or transactional and cannot orphan across a crash. The `module.IaCStateStore` interface gains no new method; `lease_ttl_seconds` is contract-only, defaulted by the core dispatcher. +- **Plugin crashes between `Lock` and `Unlock` → orphaned lock.** An in-process lock dies with the process; a gRPC-plugin lock can outlive a plugin crash if the plugin persisted it (S3/Blob lock objects do persist). **Initial scope:** this is a *documented limitation*, not silently broken. 
The `IaCStateBackend` contract ships as exactly the 6-method `IaCStateStore` interface — no TTL field — because no plugin backend in Phases A–D implements honored expiry yet, and a no-op TTL field is worse than none (it implies a guarantee that isn't enforced). Recovery for an orphaned lock is operator-side: delete the backend's lock object directly (it is a plain object/blob in the user's own bucket — `aws s3 rm`, `az storage blob delete`, etc.; the lock key format is documented per backend). **Planned additive follow-up:** once the first plugin backend implements honored expiry (S3 object-expiry metadata, Blob lease duration), `LockRequest` gains an optional `lease_ttl_seconds` field *paired with a contract conformance test* that asserts the plugin's lock object actually carries expiry — shipped with semantics, not as a field. Tracked as an open item. +- **`Lock` contention against a still-held lock.** Core's `iac.state` dispatch returns an immediate error on `Lock` contention — it does **not** block waiting for the lock to free. This matches today's in-process `IaCStateStore.Lock` ("Returns an error if the resource is already locked"). The gRPC boundary does not change this: a held lock — whether held by a live plan or orphaned by a dead plugin — surfaces the same immediate "resource locked" error, and orphaned-lock recovery is the operator-side delete above. No new waiting/lease-timeout state is introduced. - **`SaveState` succeeds plugin-side but the gRPC response is lost → engine retries → double-write.** `SaveState` MUST be idempotent: it is a full-state replace keyed by `resource_id` (the existing `IaCStateStore.SaveState` is already "insert or replace"), so a retried identical `SaveState` is a no-op-equivalent. The contract documents `SaveState` as idempotent; the plugin implementations use unconditional PUT (overwrite), not append. No sequence number needed — IaC state is last-writer-wins by design. 
- **Plugin unreachable at plan/apply start.** Core's `iac.state` dispatch returns a clear `"iac.state backend %q: plugin unreachable"` error and the plan/apply aborts *before* mutating anything — no partial state. This matches today's behavior when a misconfigured backend fails to construct in `IaCModule.Init()`. - **`PlatformBackend` plugin crash mid-`Apply`.** A `platform.*` apply that crashes mid-flight leaves real cloud resources in an indeterminate state — but this is **identical to today's in-process risk** (an in-process `eksBackend.apply()` panic leaves the same indeterminate cloud state). The gRPC boundary does not worsen it; the next `Plan` reconciles against live cloud state as it does today. No new mitigation needed — documented as unchanged. -## Per-file import ownership (verified) +## Per-file import + symbol ownership (intended post-split — verified by the Phase 0 build gate) `module/platform_kubernetes_kind.go` is the one file shared across phases. Verified import ownership (`grep` per SDK symbol against the single import block at lines 3-19): -| backend | cloud SDK imports it owns | extracted in | -|---------|--------------------------|--------------| -| `kindBackend` | none (in-memory) | — stays in core | -| `eksBackend` | `aws-sdk-go-v2/aws`, `service/eks`, `eks/types`; also **calls `awsProviderFrom` + `AWSConfig()` from `cloud_account_aws.go`** | Phase B | -| `gkeBackend` | `google.golang.org/api/container/v1`, `google.golang.org/api/option` | Phase C | -| `aksBackend` | **none** — uses raw `net/http` REST against the Azure management API (the file header comment "Requires the Azure SDK" is stale; verified no `azure-sdk-for-go` symbol in the `aksBackend` region) | Phase A | +**Intended post-split layout** (asserted as the Phase 0 target — *verified* by the Phase 0 PR's `go build` gate, not by this design doc; the current file is one unsplit block, so per-file ownership cannot be a present-tense verified fact): -Two consequences this corrects 
from earlier drafts: +| backend | cloud SDK it will own | cross-file symbol deps (the trap) | extracted in | +|---------|----------------------|-----------------------------------|--------------| +| `kindBackend` | none (in-memory) | none | — stays in core | +| `eksBackend` | `aws-sdk-go-v2/aws`, `service/eks`, `eks/types` | calls `awsProviderFrom` + `AWSConfig()` (in `cloud_account_aws.go`); `parseStringSlice` (in `cloud_account_aws.go`); `safeIntToInt32` (in `platform_kubernetes.go`) | Phase B | +| `gkeBackend` | `google.golang.org/api/container/v1`, `google.golang.org/api/option` | `safeIntToInt32` (in `platform_kubernetes.go`) | Phase C | +| `aksBackend` | **none** — raw `net/http` REST against the Azure management API (the file header comment "Requires the Azure SDK" is stale; verified no `azure-sdk-for-go` symbol in the `aksBackend` region) | none | Phase A | + +Three consequences this corrects from earlier drafts: - **`aksBackend` extraction does NOT drop the Azure SDK** — `aksBackend` never imported it. The Azure go.mod drop is achieved entirely by deleting `iac_state_azure.go` + editing `iac_module.go`. Moving `aksBackend` to the plugin is still done (cloud-platform code belongs in the plugin, and `PlatformBackend` needs an Azure impl) but it is a *code-organisation* change, not a *dependency* change. -- **`eksBackend` has a hard call-graph edge to `cloud_account_aws.go`** — they MUST be removed in the same commit, or core fails to compile. +- **`eksBackend` has a hard call-graph edge to `cloud_account_aws.go`** — they MUST be removed in the same Phase B commit. +- **Symbol-level coupling, not just import-level.** The extracted backends call two pure helper funcs that live in core files: `parseStringSlice` (in `cloud_account_aws.go`, which Phase B *deletes*) and `safeIntToInt32` (in `platform_kubernetes.go`, which *stays* in core). An import-block audit misses these — a file can be "import-clean" and still fail to compile standalone. 
Phase 0 must relocate them (below). + +## Phase 0 — precursor: split `platform_kubernetes_kind.go` + relocate shared helpers -## Phase 0 — precursor: split `platform_kubernetes_kind.go` by backend +A pure mechanical refactor, no behavior change, landed **before** Phase A. Two moves: -A pure mechanical refactor, no behavior change, landed **before** Phase A. Split the one shared file into four: -- `platform_kubernetes_kind.go` — `kindBackend` + the shared `kubernetesBackend` interface + `PlatformKubernetes` module shell. No cloud SDK imports after the split. +**1. Split the one shared backend file into four** (note: `platform_kubernetes.go` is a *separate, already-existing* file holding the `PlatformKubernetes` module shell + the `kubernetesBackend` interface + `safeIntToInt32` — it is **not** touched by the split; only `platform_kubernetes_kind.go`, which currently holds all four backends, is split): +- `platform_kubernetes_kind.go` — `kindBackend` only. No cloud SDK, no cross-file helper deps. - `platform_kubernetes_eks.go` — `eksBackend` only; owns the `aws-sdk-go-v2` imports. - `platform_kubernetes_gke.go` — `gkeBackend` only; owns the `google.golang.org/api` imports. - `platform_kubernetes_aks.go` — `aksBackend` only; owns `net/http` (no cloud SDK). -After Phase 0, each subsequent phase deletes *its own backend file* with its own self-contained import block — the "always compiles" property is then structural, not asserted. Verification: `go build ./... && go test ./module/...` green, zero behavior diff (the split moves code, touches no logic). This is the single cheapest de-risking move in the plan — it converts the fragile "extract-from-shared-file-in-place" path into four trivially-reviewable deletions. +**2. 
Relocate the two shared pure helpers into a new SDK-free core file** `module/cloud_helpers.go`: +- `parseStringSlice` moves out of `cloud_account_aws.go` (which Phase B deletes) — otherwise its plugin-bound consumers (`platform_ecs.go`, `platform_kubernetes_eks.go`) lose it. +- `safeIntToInt32` moves out of `platform_kubernetes.go` — it is used by `platform_autoscaling.go`, `platform_ecs.go`, `platform_networking.go`, `platform_kubernetes_eks.go`, `platform_kubernetes_gke.go` (all plugin-bound) *and* by `platform_kubernetes.go` itself (core-resident). A neutral home means both sides keep working. + +Both helpers are tiny pure functions (no SDK, no state). `cloud_helpers.go` stays in core permanently. When a plugin-bound file moves to its plugin in Phases A–D, that plugin gets its own copy of whichever helpers it uses (≤15 lines each — duplication of a pure stdlib-only helper across process boundaries is correct, not a smell; a shared plugin-side util module is the Alternatives-Considered-#3 follow-up). + +After Phase 0, each subsequent phase deletes *its own backend file* — self-contained at both the import-block AND symbol level. **Phase 0 acceptance criteria:** `go build ./... && go vet ./... && go test ./module/...` green; `git diff` shows pure code movement (zero logic change); a grep confirms no remaining core file references `parseStringSlice`/`safeIntToInt32` from their old homes. This is the single cheapest de-risking move in the plan — it converts the fragile "extract-from-shared-file-in-place" path into trivially-reviewable deletions. + +**Phase 0 rollback:** a pure file-split + helper-relocation with zero behavior diff — revert is a single `git revert` with no contract, no go.mod, and no runtime impact. It is the one phase with a trivial rollback story. ## Phases @@ -209,7 +225,7 @@ Published in each plugin's CHANGELOG + a consolidated `docs/migrations/2026-05-1 ## Assumptions 1. 
**gRPC's 4 MB default message cap covers real-world IaC state files.** If a deployment's state exceeds 4 MB the unary `IaCStateBackend` contract needs streaming — the benchmark task validates the typical case but a hostile-large state is out of initial scope (documented limitation, not a silent failure: `SaveState` returns a clear "state exceeds transport limit" error). The benchmark runs before the proto is locked. -2. **The `platform.*` backend interfaces are cleanly provider-separable.** The design assumes `kubernetesBackend` / `ecsBackend` / etc. are interface-segregated such that the `kind` impl can stay while cloud impls extract. **This is the most fragile assumption** — the Phase 0/A interface-audit spike (first writing-plans task) validates this that validates it; if a backend interface leaks SDK types into the core module shell, that shell needs an interface-extraction refactor first and the phase re-scopes. +2. **The `platform.*` backend interfaces are cleanly provider-separable.** The design assumes `kubernetesBackend` / `ecsBackend` / etc. are interface-segregated such that the `kind` impl can stay while cloud impls extract. **This is the most fragile assumption** — the Phase 0/A interface-audit spike (first writing-plans task) validates it; if a backend interface leaks SDK types into the core module shell, that shell needs an interface-extraction refactor first and the phase re-scopes. Phase 0's mechanical split + helper relocation de-risks this structurally: after Phase 0, the audit operates on already-separated files, not an assertion about an unsplit one. 3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend`/`PlatformBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive, core ignores unknown backend registrations until its own half lands. 4. 
**`aws-sdk-go-v2/service/s3` in workflow-plugin-digitalocean is acceptable.** DO Spaces is S3-API; there is no godo-native Spaces client. The DO plugin already carries `godo`; adding one AWS service package is the minimal cost of self-contained `spaces` state support (vs. forcing DO users to also load workflow-plugin-aws). 5. **`cloud_account_azure.go` / `cloud_account_gcp.go` genuinely have zero real SDK imports.** Verified by `awk` over import blocks at design time — they reference the SDKs only in comments. If a future change adds a real SDK import there, that file joins its phase's extraction. From a6d639668499b824391610e9fab8448a7279feba Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 01:05:23 -0400 Subject: [PATCH 06/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20adversarial=20review=20cycle=204=20revision?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 2 Critical + 2 Important from cycle-4 review: Critical 1 — the per-file symbol-ownership table was wrong AGAIN (3rd cycle running): claimed gkeBackend depends on safeIntToInt32 (it doesn't — that's eksBackend) and aksBackend has no cross-file deps (it does — CloudCredentials/CloudCredentialProvider from cloud_account.go, same as gke). STRUCTURAL FIX: deleted the hand-maintained table entirely. The symbol-ownership map is now a Phase 0 build artifact — scripts/audit-cloud-symbols.sh, committed + re-run in CI — not a design-doc claim that rots on every edit. The design commits to the *method* + the *known shape* (cloud_account.go stays core; all 3 cloud backends bind to it via k.provider.GetCredentials; eksBackend additionally binds to the Phase-B-deleted cloud_account_aws.go; aksBackend imports no cloud SDK). Critical 2 — Phase 0's "split into four, zero logic change" silently dropped the single func init() that registers kind/k3s/eks/gke/aks. 
Splitting REQUIRES partitioning init() per-file (a distribution, not zero-change). Phase 0 now has an explicit step 2 for the init() partition; relabelled "behavior-equivalent" not "zero logic change"; k3s documented as reusing kindBackend (both stay core). Important 1 — platform.* cloud credential flow across PlatformBackend was unspecified (aksBackend needs CloudCredentials — how does it reach the plugin?). Added: PlatformBackend requests carry a CloudCredentials proto message; engine resolves k.provider.GetCredentials() in-core (config-map parsing, no SDK) and serialises it. Unified with the Architecture-3 credentials story — ONE CloudCredentials proto shape for both surfaces, so secret-redaction has one shape to redact. Important 2 — core actually imports FOUR cloud SDK trees, not three: godo is still in cloud_account_do.go + 5 platform_do_*.go files. §Problem now acknowledges godo as a 4th tree, explicitly scopes it OUT (user's ask was 3 trees), and the go list -deps gate is reworded to assert "zero packages from the three in-scope trees" not "zero cloud SDKs". All "zero cloud SDKs" phrasing reconciled throughout. Minor: ListStates filter + remaining-proto-messages notes folded in. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 65 +++++++++---------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index 7270f6d8..55b37bec 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -1,7 +1,7 @@ # Cloud-SDK Extraction: workflow core → strict-contract plugins **Date:** 2026-05-14 -**Status:** Design — revised after adversarial review cycle 2 +**Status:** Design — revised after adversarial review cycle 4 **Owner:** autonomous pipeline (workflow#TBD) ## Problem @@ -16,11 +16,13 @@ Workflow core's `module/` package imports three cloud SDK trees directly. File c Every dependabot bump of a cloud SDK (PRs #400/#419/#421/#635 as of this writing) churns workflow core's `go.sum`, inflates the binary, and couples core release cadence to vendor SDK release cadence. The `workflow-plugin-{aws,azure,gcp,digitalocean}` plugins already exist and already carry these SDKs for their IaC *resource provider* role — core's direct usage is redundant surface. -Precedent: workflow#617 removed the legacy DigitalOcean IaC modules + `godo` from core; IaC resource provisioning moved entirely to `workflow-plugin-digitalocean`. This design extends the same principle to the *remaining* cloud functionality that never went through that extraction: IaC **state backends**, managed-service **platform** provisioners, and a handful of standalone modules/steps. +Precedent: workflow#617 removed the legacy DigitalOcean IaC *resource* modules — and, with them, those modules' `godo` imports — from core; IaC resource provisioning moved to `workflow-plugin-digitalocean`.
This design extends the same principle to the *remaining* cloud functionality that never went through that extraction: IaC **state backends**, managed-service **platform** provisioners, and a handful of standalone modules/steps. + +**A fourth tree — `github.com/digitalocean/godo` — is still in core but out of scope here.** `module/cloud_account_do.go` + five `module/platform_do_*.go` files (`platform_do_app.go`, `platform_do_dns.go`, `platform_do_networking.go`, `platform_doks.go`, `platform_do_database.go`) still import `godo` — workflow#617 removed the DO *IaC resource* path but these `platform.do_*` modules survived it. The user's ask scoped this work to three SDK trees (aws/azure/gcp); `godo` extraction is a structurally-identical follow-up (the `platform.do_*` modules would extract via the same `PlatformBackend` contract this design introduces) but is **not** in this design's scope. Consequence: the `go list -deps` CI gate added in the final phase asserts **zero `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` / `google.golang.org/api` packages** — it does *not* assert "zero cloud SDKs" while `godo` remains. The design's phrasing is corrected throughout to "the three in-scope SDK trees," not "all cloud SDKs." ## Goals -- workflow core `go.mod` drops `aws-sdk-go-v2/*`, `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, `google.golang.org/api/*` **entirely** — verified by a `go list -deps` gate in the final phase's CI. +- workflow core `go.mod` drops `aws-sdk-go-v2/*`, `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, `google.golang.org/api/*` (the three in-scope trees) **entirely** — verified by a `go list -deps` gate in the final phase's CI asserting zero packages from those three trees. `godo` is out of scope (see Problem). - Cloud functionality remains available, loaded via strict-contract gRPC plugins (the existing sidecar model). - `kind` Kubernetes backend (no SDK) stays in core — local-dev/test path must not require a plugin. 
@@ -88,12 +90,15 @@ service PlatformBackend { rpc Apply (PlatformApplyRequest) returns (PlatformApplyResponse); rpc Destroy(PlatformDestroyRequest) returns (PlatformDestroyResponse); } -// Request carries: platform_type (kubernetes|ecs|...), provider (eks|gke|aks|...), -// desired-state struct, current-state struct. +// Each request carries: platform_type (kubernetes|ecs|...), provider (eks|gke|aks|...), +// desired-state struct, current-state struct, AND a CloudCredentials message. // Response carries: plan actions / applied state / errors. +// (remaining request/response message field layouts: deferred to writing-plans.) ``` -When `provider != kind` (or any other in-core backend), core's `platform.*` module resolves a `PlatformBackend` client from the plugin that registered `(platform_type, provider)`. +**Credential flow across the boundary.** Every cloud `platform.*` backend today reaches credentials via `k.provider.GetCredentials()` returning a `module.CloudCredentials` struct (`module/cloud_account.go:18`) — `eksBackend`, `gkeBackend`, and `aksBackend` all bind to it; `aksBackend.azureToken` even takes `*CloudCredentials` directly. `cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) **stays in core** — it is the provider-agnostic credential abstraction, not cloud-SDK code. When a backend moves to a plugin, the engine resolves `k.provider.GetCredentials()` *in-core* (no SDK needed — it's config-map parsing) and serialises the resulting `CloudCredentials` into a proto `CloudCredentials` message carried on every `PlatformBackend` request. The plugin builds its cloud SDK client from that message. **This is the same shape as the §Architecture-3 `credentials:` story** — one `CloudCredentials` proto message serves both the `PlatformBackend` contract and the plugin-native module path, so the secret-redaction task (§Security) has exactly one shape to redact, not two. 
+ +When `provider != kind` (and `!= k3s` — `k3s` also maps to the in-core `kindBackend`), core's `platform.*` module resolves a `PlatformBackend` client from the plugin that registered `(platform_type, provider)`. **The `PlatformBackend` shape is gated** — see Alternatives Considered #1 and self-challenge doubt #1. The first writing-plans task for Phase B is an interface-audit spike that validates one unified `Plan/Apply/Destroy` contract against all five `platform.*` backend interfaces *before* the proto is locked. If they don't unify cleanly, the fallback is folding the cloud platform provisioners into the existing `IaCProviderRequired` / `ResourceDriver` model instead of inventing `PlatformBackend`. @@ -133,43 +138,35 @@ Moving the IaC state store behind a gRPC sidecar introduces a partial-failure su - **Plugin unreachable at plan/apply start.** Core's `iac.state` dispatch returns a clear `"iac.state backend %q: plugin unreachable"` error and the plan/apply aborts *before* mutating anything — no partial state. This matches today's behavior when a misconfigured backend fails to construct in `IaCModule.Init()`. - **`PlatformBackend` plugin crash mid-`Apply`.** A `platform.*` apply that crashes mid-flight leaves real cloud resources in an indeterminate state — but this is **identical to today's in-process risk** (an in-process `eksBackend.apply()` panic leaves the same indeterminate cloud state). The gRPC boundary does not worsen it; the next `Plan` reconciles against live cloud state as it does today. No new mitigation needed — documented as unchanged. -## Per-file import + symbol ownership (intended post-split — verified by the Phase 0 build gate) +## Cross-file coupling: the symbol-ownership audit is a Phase 0 deliverable, not a design-doc table -`module/platform_kubernetes_kind.go` is the one file shared across phases. 
Verified import ownership (`grep` per SDK symbol against the single import block at lines 3-19): +Three prior review cycles each found a hand-maintained per-file ownership table in this design *wrong* — the design doc is the wrong place for a precise symbol map, because the map is derived data that rots on every edit. **The map is therefore a Phase 0 build artifact, not a design claim.** What the design commits to is the *method* and the *known shape*: -**Intended post-split layout** (asserted as the Phase 0 target — *verified* by the Phase 0 PR's `go build` gate, not by this design doc; the current file is one unsplit block, so per-file ownership cannot be a present-tense verified fact): +**Known shape (the parts that survive any audit):** +- `module/platform_kubernetes_kind.go` currently holds **four** backends (`kindBackend`, `eksBackend`, `gkeBackend`, `aksBackend`) plus one shared `func init()` registering five names — `kind`, `k3s`, `eks`, `gke`, `aks` (`k3s` reuses `kindBackend`). `module/platform_kubernetes.go` is a **separate, already-existing** file holding the `PlatformKubernetes` module shell + the `kubernetesBackend` interface — untouched by the split. +- **All three cloud backends bind to `module/cloud_account.go`** via `k.provider.GetCredentials() → CloudCredentials`. `cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) is the provider-agnostic credential abstraction — **it stays in core**, is never deleted by any phase, and is the symbol home all cloud platform code binds to. The `PlatformBackend` contract carries `CloudCredentials` across the boundary (§Architecture-2). +- `eksBackend` *additionally* binds to `cloud_account_aws.go` (`awsProviderFrom`, `AWSConfig`, `parseStringSlice`) — and `cloud_account_aws.go` is **deleted by Phase B**. `eksBackend` and `cloud_account_aws.go` therefore leave core in the same Phase B commit. 
+- `aksBackend` imports **no cloud SDK** — raw `net/http` REST against the Azure management API (the stale file-header comment "Requires the Azure SDK" notwithstanding). Its extraction is code-organisation, not a dependency change; the Azure go.mod drop comes entirely from `iac_state_azure.go` deletion + `iac_module.go` edit. -| backend | cloud SDK it will own | cross-file symbol deps (the trap) | extracted in | -|---------|----------------------|-----------------------------------|--------------| -| `kindBackend` | none (in-memory) | none | — stays in core | -| `eksBackend` | `aws-sdk-go-v2/aws`, `service/eks`, `eks/types` | calls `awsProviderFrom` + `AWSConfig()` (in `cloud_account_aws.go`); `parseStringSlice` (in `cloud_account_aws.go`); `safeIntToInt32` (in `platform_kubernetes.go`) | Phase B | -| `gkeBackend` | `google.golang.org/api/container/v1`, `google.golang.org/api/option` | `safeIntToInt32` (in `platform_kubernetes.go`) | Phase C | -| `aksBackend` | **none** — raw `net/http` REST against the Azure management API (the file header comment "Requires the Azure SDK" is stale; verified no `azure-sdk-for-go` symbol in the `aksBackend` region) | none | Phase A | +**The method — `scripts/audit-cloud-symbols.sh`, produced as Phase 0 task 1:** a script that, for each backend region and each plugin-bound `module/*.go` file, greps every cross-file function/type reference and emits the authoritative ownership map. Its output is committed alongside Phase 0 and re-run in CI on every subsequent phase PR. The design does not transcribe its output — the script *is* the source of truth, eliminating the recurring transcription defect. Two helper funcs are already known to need relocation (below); the script catches any the eye missed. -Three consequences this corrects from earlier drafts: -- **`aksBackend` extraction does NOT drop the Azure SDK** — `aksBackend` never imported it. 
The Azure go.mod drop is achieved entirely by deleting `iac_state_azure.go` + editing `iac_module.go`. Moving `aksBackend` to the plugin is still done (cloud-platform code belongs in the plugin, and `PlatformBackend` needs an Azure impl) but it is a *code-organisation* change, not a *dependency* change. -- **`eksBackend` has a hard call-graph edge to `cloud_account_aws.go`** — they MUST be removed in the same Phase B commit. -- **Symbol-level coupling, not just import-level.** The extracted backends call two pure helper funcs that live in core files: `parseStringSlice` (in `cloud_account_aws.go`, which Phase B *deletes*) and `safeIntToInt32` (in `platform_kubernetes.go`, which *stays* in core). An import-block audit misses these — a file can be "import-clean" and still fail to compile standalone. Phase 0 must relocate them (below). +## Phase 0 — precursor: split `platform_kubernetes_kind.go`, partition `init()`, relocate shared helpers -## Phase 0 — precursor: split `platform_kubernetes_kind.go` + relocate shared helpers +A mechanical, behavior-equivalent refactor landed **before** Phase A. Three moves: -A pure mechanical refactor, no behavior change, landed **before** Phase A. Two moves: +**1. Split the one shared backend file into four.** `platform_kubernetes_kind.go` (currently all four backends) → `platform_kubernetes_kind.go` (`kindBackend` only), `platform_kubernetes_eks.go`, `platform_kubernetes_gke.go`, `platform_kubernetes_aks.go`. Each new file owns its own import block. -**1. Split the one shared backend file into four** (note: `platform_kubernetes.go` is a *separate, already-existing* file holding the `PlatformKubernetes` module shell + the `kubernetesBackend` interface + `safeIntToInt32` — it is **not** touched by the split; only `platform_kubernetes_kind.go`, which currently holds all four backends, is split): -- `platform_kubernetes_kind.go` — `kindBackend` only. No cloud SDK, no cross-file helper deps. 
-- `platform_kubernetes_eks.go` — `eksBackend` only; owns the `aws-sdk-go-v2` imports. -- `platform_kubernetes_gke.go` — `gkeBackend` only; owns the `google.golang.org/api` imports. -- `platform_kubernetes_aks.go` — `aksBackend` only; owns `net/http` (no cloud SDK). +**2. Partition the shared `func init()` per-file.** The one `init()` registering `kind`/`k3s`/`eks`/`gke`/`aks` **cannot** be split untouched — each new file gets its own `init()` registering only its backend(s) (`kind` *and* `k3s` both register from `platform_kubernetes_kind.go`, since `k3s` reuses `kindBackend`). This is a *distribution* of the registration, not a behavior change — the same five names are registered after the split — but it is **not** "zero logic change," and the design says so plainly. The payoff: when Phase A deletes `platform_kubernetes_aks.go`, the `aks` registration goes with it; no dangling `RegisterKubernetesBackend("aks", …)` is left behind for the build gate to catch as a late surprise. -**2. Relocate the two shared pure helpers into a new SDK-free core file** `module/cloud_helpers.go`: -- `parseStringSlice` moves out of `cloud_account_aws.go` (which Phase B deletes) — otherwise its plugin-bound consumers (`platform_ecs.go`, `platform_kubernetes_eks.go`) lose it. -- `safeIntToInt32` moves out of `platform_kubernetes.go` — it is used by `platform_autoscaling.go`, `platform_ecs.go`, `platform_networking.go`, `platform_kubernetes_eks.go`, `platform_kubernetes_gke.go` (all plugin-bound) *and* by `platform_kubernetes.go` itself (core-resident). A neutral home means both sides keep working. +**3. Relocate the two shared pure helpers into a new SDK-free core file** `module/cloud_helpers.go`: +- `parseStringSlice` moves out of `cloud_account_aws.go` (Phase B deletes that file) — its plugin-bound consumers (`platform_ecs.go`, `platform_kubernetes_eks.go`) would otherwise lose it. 
+- `safeIntToInt32` moves out of `platform_kubernetes.go` — used by `platform_autoscaling.go`, `platform_ecs.go`, `platform_networking.go`, `platform_kubernetes_eks.go` (all plugin-bound) *and* by core-resident `platform_kubernetes.go`. A neutral home keeps both sides compiling. -Both helpers are tiny pure functions (no SDK, no state). `cloud_helpers.go` stays in core permanently. When a plugin-bound file moves to its plugin in Phases A–D, that plugin gets its own copy of whichever helpers it uses (≤15 lines each — duplication of a pure stdlib-only helper across process boundaries is correct, not a smell; a shared plugin-side util module is the Alternatives-Considered-#3 follow-up). +Both helpers are tiny pure functions (no SDK, no state). `cloud_helpers.go` stays in core permanently. When a plugin-bound file moves to its plugin, that plugin gets its own copy of whichever helpers it uses (≤15 lines each — duplicating a pure stdlib-only helper across a process boundary is correct, not a smell; the shared plugin-side util module is the Alternatives-Considered-#3 follow-up). -After Phase 0, each subsequent phase deletes *its own backend file* — self-contained at both the import-block AND symbol level. **Phase 0 acceptance criteria:** `go build ./... && go vet ./... && go test ./module/...` green; `git diff` shows pure code movement (zero logic change); a grep confirms no remaining core file references `parseStringSlice`/`safeIntToInt32` from their old homes. This is the single cheapest de-risking move in the plan — it converts the fragile "extract-from-shared-file-in-place" path into trivially-reviewable deletions. +**Phase 0 acceptance criteria:** `go build ./... && go vet ./... 
&& go test ./module/...` green; `scripts/audit-cloud-symbols.sh` committed and its output shows zero cross-file symbol dep from any plugin-bound file into a to-be-deleted file *except* the known `eksBackend → cloud_account_aws.go` edge (which Phase B handles atomically); `git diff` is pure code movement + the mechanical `init()` partition, no logic edits. After Phase 0, each subsequent phase deletes *its own* backend file — self-contained at import-block AND symbol level. -**Phase 0 rollback:** a pure file-split + helper-relocation with zero behavior diff — revert is a single `git revert` with no contract, no go.mod, and no runtime impact. It is the one phase with a trivial rollback story. +**Phase 0 rollback:** a file-split + `init()` partition + helper-relocation with no behavior diff — revert is a single `git revert`, no contract, no go.mod, no runtime impact. The one phase with a trivial rollback story. ## Phases @@ -242,8 +239,8 @@ This design changes **plugin loading paths** and **go.mod dependency trees** — ## Alternatives Considered 1. **Fold cloud platform provisioners into the existing `IaCProviderRequired` / `ResourceDriver` contracts instead of inventing `PlatformBackend`.** An EKS/GKE/AKS cluster — and arguably an ECS service, a Route53 zone, an EC2 VPC — is structurally a managed resource with create/plan/apply/destroy/status, which is exactly what the battle-tested `ResourceDriver` contract already models (8 services in `iac.proto`, multiple ADRs through the strict-contracts cutover). Inventing `PlatformBackend` risks the lowest-common-denominator problem (self-challenge doubt #1). **Rejected as the default** because the `platform.*` modules have a distinct plan/apply *lifecycle surface* (they sync against live cloud state continuously, not just declaratively reconcile) and a distinct `provider:` UX the user explicitly asked to preserve — but **retained as the gated fallback**: the Phase 0/A interface-audit spike decides. 
If the five `platform.*` backend interfaces don't unify behind one `Plan/Apply/Destroy`, the implementation folds them into `ResourceDriver` rather than shipping a bad `PlatformBackend`. -2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the Goal from "core drops `aws-sdk-go-v2/*` entirely" to "drops the AWS *service-provider* tree, keeps one S3 client." The S3 client is small and stable; DO Spaces + AWS S3 are the same API; keeping one shared S3-compatible store in core avoids forcing *both* the AWS and DO plugins to each carry an S3 client and avoids a clean-break for existing `spaces` users. **Rejected** because it leaves dependabot churning one AWS package indefinitely and weakens the "core has zero cloud SDKs" invariant the `go list -deps` gate is meant to enforce — a partial extraction is a maintenance trap. The cost (both aws + DO plugins carry an S3 client) is real but bounded: it's one service package, and each plugin is independently versioned anyway. -3. **A shared `s3compat` Go module consumed by both the aws and DO plugins** (instead of each independently re-implementing the S3-compatible state store + `buildAWSConfig`). Keeps "core has zero cloud SDKs" intact while eliminating the cross-plugin duplication Alternative #2 dismisses as "bounded." **Deferred, not rejected:** it is a *plugin-side* optimisation that doesn't affect the core contract or any phase boundary, so it can land as a follow-up after the extraction is proven. Forcing it into the critical path now couples the aws and DO plugin release cadences; the duplication is a small, well-understood `buildAWSConfig` + thin S3 wrapper. Writing-plans logs it as a post-extraction cleanup candidate. +2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the Goal from "core drops `aws-sdk-go-v2/*` entirely" to "drops the AWS *service-provider* tree, keeps one S3 client." 
The S3 client is small and stable; DO Spaces + AWS S3 are the same API; keeping one shared S3-compatible store in core avoids forcing *both* the AWS and DO plugins to each carry an S3 client and avoids a clean-break for existing `spaces` users. **Rejected** because it leaves dependabot churning one AWS package indefinitely and weakens the "core drops the three in-scope SDK trees" invariant the `go list -deps` gate enforces — a partial extraction is a maintenance trap. The cost (both aws + DO plugins carry an S3 client) is real but bounded: it's one service package, and each plugin is independently versioned anyway. +3. **A shared `s3compat` Go module consumed by both the aws and DO plugins** (instead of each independently re-implementing the S3-compatible state store + `buildAWSConfig`). Keeps the three-in-scope-trees invariant intact while eliminating the cross-plugin duplication Alternative #2 dismisses as "bounded." **Deferred, not rejected:** it is a *plugin-side* optimisation that doesn't affect the core contract or any phase boundary, so it can land as a follow-up after the extraction is proven. Forcing it into the critical path now couples the aws and DO plugin release cadences; the duplication is a small, well-understood `buildAWSConfig` + thin S3 wrapper. Writing-plans logs it as a post-extraction cleanup candidate. 4. **In-process Go-module plugin loading (build-tag imports) instead of gRPC sidecars.** Rejected in brainstorm by explicit user decision — strict gRPC sidecar model only. 
## Self-challenge — top doubts surfaced (carried forward, with mitigations now wired into phases) From e71fa1068f426438e4045ab0c9147fe392a1e94c Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 01:12:11 -0400 Subject: [PATCH 07/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20adversarial=20review=20cycle=205=20revision?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 1 Critical + 2 Important from cycle-5 review: Critical — the init()-partition fix (cycle-4) was kubernetes-only, but the SAME defect class exists in platform_dns.go / platform_ecs.go / platform_networking.go / platform_autoscaling.go: each has a single func init() registering BOTH a core-staying `mock` backend AND a plugin-bound `aws` backend. The old Phase B inventory moved those files wholesale → would exile the mock backends + dangle the route53 registration. FIX: Phase 0 generalized from "split platform_kubernetes_kind.go" to a repo-wide uniform `_core.go` / `_.go` convention across the WHOLE platform.* family. Every mixed init() is partitioned; the audit script flags any init() registering a mix of core-staying + plugin-bound factories as a CI failure. Phase B inventory rewritten to delete only `_aws.go`/`_eks.go` files, never a mixed file. Important 1 — the cycle-4 "Known shape" prose reintroduced hand-maintained cross-file symbol claims (one already incomplete: parseStringSlice consumers). FIX: cut all per-file symbol enumerations; the section now states only invariants the script VERIFIES (not discovers) + the method. No transcribed symbol lists remain. Important 2 + own finding — cycle-4 said the engine resolves credentials in-core "no SDK needed." VERIFIED FALSE: cloud_account_aws_creds.go's awsProfileResolver calls config.LoadDefaultConfig(WithSharedConfigProfile) and awsRoleARNResolver calls sts.AssumeRole — both need the AWS SDK. 
FIX: §Architecture-2 corrected — engine passes the DECLARED credential config (plain strings) in the CloudCredentials proto; the PLUGIN resolves (incl. the SDK-bearing profile/role_arn paths). Both cloud_account_aws.go AND cloud_account_aws_creds.go deleted by Phase B, no core replacement — all AWS cred resolution moves plugin-side. azure/gcp resolver files stay (their resolvers are genuinely SDK-free). Minor — backend-name collision: core-reserved names (memory/filesystem/ postgres/kind/k3s/mock) cause a load-time error if a plugin collides, not silent shadowing. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 66 +++++++++---------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index 55b37bec..3e2f3743 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -1,7 +1,7 @@ # Cloud-SDK Extraction: workflow core → strict-contract plugins **Date:** 2026-05-14 -**Status:** Design — revised after adversarial review cycle 4 +**Status:** Design — revised after adversarial review cycle 5 **Owner:** autonomous pipeline (workflow#TBD) ## Problem @@ -96,7 +96,7 @@ service PlatformBackend { // (remaining request/response message field layouts: deferred to writing-plans.) ``` -**Credential flow across the boundary.** Every cloud `platform.*` backend today reaches credentials via `k.provider.GetCredentials()` returning a `module.CloudCredentials` struct (`module/cloud_account.go:18`) — `eksBackend`, `gkeBackend`, and `aksBackend` all bind to it; `aksBackend.azureToken` even takes `*CloudCredentials` directly. `cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) **stays in core** — it is the provider-agnostic credential abstraction, not cloud-SDK code. 
When a backend moves to a plugin, the engine resolves `k.provider.GetCredentials()` *in-core* (no SDK needed — it's config-map parsing) and serialises the resulting `CloudCredentials` into a proto `CloudCredentials` message carried on every `PlatformBackend` request. The plugin builds its cloud SDK client from that message. **This is the same shape as the §Architecture-3 `credentials:` story** — one `CloudCredentials` proto message serves both the `PlatformBackend` contract and the plugin-native module path, so the secret-redaction task (§Security) has exactly one shape to redact, not two. +**Credential flow across the boundary — engine passes *declared* config, plugin *resolves*.** Every cloud `platform.*` backend today reaches credentials via `k.provider.GetCredentials()`; `aksBackend.azureToken` takes `*CloudCredentials` directly. Critical correction from earlier drafts: the AWS credential *resolvers* are **not** SDK-free — `module/cloud_account_aws_creds.go`'s `awsProfileResolver` calls `config.LoadDefaultConfig(WithSharedConfigProfile)` and `awsRoleARNResolver` calls `sts.AssumeRole`; both genuinely need the AWS SDK. So the engine **cannot** "resolve credentials in-core" for AWS. The model is therefore: `cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) **stays in core** as the provider-agnostic holder of the *declared* credential config (provider, region, `credentials.{type,accessKey,secretKey,sessionToken,roleArn,profile,...}` — all plain strings, no SDK); the engine serialises that **declared config** into a proto `CloudCredentials` message on every `PlatformBackend` request; the **plugin** performs the actual resolution (static/env/profile/role_arn — including the SDK-bearing profile-chain and STS-AssumeRole paths) in-process with its own SDK. Consequence: `cloud_account_aws.go` *and* `cloud_account_aws_creds.go` are both **deleted by Phase B with no core replacement** — all AWS credential resolution moves plugin-side. 
This *simplifies* the design (no in-core resolution path, no `AWSConfigProvider` interface) and is the same shape as the §Architecture-3 `credentials:` story — one `CloudCredentials` proto message of *declared* config serves both the `PlatformBackend` contract and the plugin-native module path, so the secret-redaction task (§Security) has exactly one shape to redact. When `provider != kind` (and `!= k3s` — `k3s` also maps to the in-core `kindBackend`), core's `platform.*` module resolves a `PlatformBackend` client from the plugin that registered `(platform_type, provider)`. @@ -118,7 +118,7 @@ These are user-facing pipeline functionality, not engine infrastructure. They be (`storage_artifact_s3.go` references the AWS SDK only in comments — verified comment-only, **not** a real import, stays in core.) -Credential handling (Option 1, approved): the deleted `cloud_account_aws.go` + `_creds.go` (`AWSConfigProvider` / `AWSConfig()`) is **not** replaced by a core contract. Each plugin-native AWS module carries its own `credentials:` config block and builds `aws.Config` in-process via a shared in-plugin `buildAWSConfig` helper — exactly the workflow-plugin-digitalocean model. To avoid yaml redundancy when a config declares many AWS modules, each plugin offers an optional in-plugin `aws.credentials` (resp. `gcp.credentials`) module + a `credentials_ref:` key — DRY handled entirely inside the plugin, still no core contract. `cloud_account_azure.go` and `cloud_account_gcp.go` reference the SDKs **only in comments** (verified — they are pure config-map parsing) and stay in core untouched. +Credential handling (Option 1, approved): the deleted `cloud_account_aws.go` + `cloud_account_aws_creds.go` (`AWSConfigProvider`, `AWSConfig()`, and the four `aws*Resolver` types + their `RegisterCredentialResolver` `init()`) are **not** replaced by a core contract. 
Each plugin-native AWS module carries its own `credentials:` config block and resolves it in-process via a shared in-plugin `buildAWSConfig` helper that owns the static/env/profile/role_arn logic — exactly the workflow-plugin-digitalocean model. To avoid yaml redundancy when a config declares many AWS modules, each plugin offers an optional in-plugin `aws.credentials` (resp. `gcp.credentials`) module + a `credentials_ref:` key — DRY handled entirely inside the plugin, still no core contract. `cloud_account_azure.go` and `cloud_account_gcp.go` reference the SDKs **only in comments** (verified — pure config-map parsing, their resolvers populate the `CloudCredentials` struct with declared values and never call an SDK) and **stay in core untouched** — they remain valid `CloudCredentialProvider` resolvers for the declared-config-passthrough path. Only the *AWS* resolvers move, because only they carry SDK calls. ## Security @@ -137,36 +137,36 @@ Moving the IaC state store behind a gRPC sidecar introduces a partial-failure su - **`SaveState` succeeds plugin-side but the gRPC response is lost → engine retries → double-write.** `SaveState` MUST be idempotent: it is a full-state replace keyed by `resource_id` (the existing `IaCStateStore.SaveState` is already "insert or replace"), so a retried identical `SaveState` is a no-op-equivalent. The contract documents `SaveState` as idempotent; the plugin implementations use unconditional PUT (overwrite), not append. No sequence number needed — IaC state is last-writer-wins by design. - **Plugin unreachable at plan/apply start.** Core's `iac.state` dispatch returns a clear `"iac.state backend %q: plugin unreachable"` error and the plan/apply aborts *before* mutating anything — no partial state. This matches today's behavior when a misconfigured backend fails to construct in `IaCModule.Init()`. 
- **`PlatformBackend` plugin crash mid-`Apply`.** A `platform.*` apply that crashes mid-flight leaves real cloud resources in an indeterminate state — but this is **identical to today's in-process risk** (an in-process `eksBackend.apply()` panic leaves the same indeterminate cloud state). The gRPC boundary does not worsen it; the next `Plan` reconciles against live cloud state as it does today. No new mitigation needed — documented as unchanged. +- **A plugin registers a backend/provider name that collides with a core-reserved one.** Core-registered names (`iac.state`: `memory`/`filesystem`/`postgres`; `platform.kubernetes`: `kind`/`k3s`; the `mock` backend of every `platform.*` family) are **reserved**. A plugin registration that collides with a reserved name is a **load-time error** — core fails to start with `"plugin %q registered reserved backend name %q"` rather than silently shadowing (in either direction). This makes a malformed or adversarial plugin manifest a hard, immediate failure, not a confusing runtime mis-dispatch. -## Cross-file coupling: the symbol-ownership audit is a Phase 0 deliverable, not a design-doc table +## Cross-file coupling: the symbol-ownership map is a Phase 0 build artifact, not a design-doc claim -Three prior review cycles each found a hand-maintained per-file ownership table in this design *wrong* — the design doc is the wrong place for a precise symbol map, because the map is derived data that rots on every edit. **The map is therefore a Phase 0 build artifact, not a design claim.** What the design commits to is the *method* and the *known shape*: +Four prior review cycles each found a hand-maintained per-file ownership claim in this design *wrong* — first as a table, then (cycle 5) as prose. 
The lesson is structural: **a precise symbol map is derived data; it rots on every edit and the design doc is the wrong place for it.** The design therefore commits to a *method* and a small set of *invariants*, and delegates the exact map to a script. -**Known shape (the parts that survive any audit):** -- `module/platform_kubernetes_kind.go` currently holds **four** backends (`kindBackend`, `eksBackend`, `gkeBackend`, `aksBackend`) plus one shared `func init()` registering five names — `kind`, `k3s`, `eks`, `gke`, `aks` (`k3s` reuses `kindBackend`). `module/platform_kubernetes.go` is a **separate, already-existing** file holding the `PlatformKubernetes` module shell + the `kubernetesBackend` interface — untouched by the split. -- **All three cloud backends bind to `module/cloud_account.go`** via `k.provider.GetCredentials() → CloudCredentials`. `cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) is the provider-agnostic credential abstraction — **it stays in core**, is never deleted by any phase, and is the symbol home all cloud platform code binds to. The `PlatformBackend` contract carries `CloudCredentials` across the boundary (§Architecture-2). -- `eksBackend` *additionally* binds to `cloud_account_aws.go` (`awsProviderFrom`, `AWSConfig`, `parseStringSlice`) — and `cloud_account_aws.go` is **deleted by Phase B**. `eksBackend` and `cloud_account_aws.go` therefore leave core in the same Phase B commit. -- `aksBackend` imports **no cloud SDK** — raw `net/http` REST against the Azure management API (the stale file-header comment "Requires the Azure SDK" notwithstanding). Its extraction is code-organisation, not a dependency change; the Azure go.mod drop comes entirely from `iac_state_azure.go` deletion + `iac_module.go` edit. 
+**Invariants (the parts that survive any audit — these are load-bearing and the script verifies them, it doesn't discover them):** +- `module/cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) is the provider-agnostic *declared-config* holder — **it stays in core, is never deleted by any phase**, and is the credential symbol-home all cloud platform code binds to. The `PlatformBackend` contract carries the declared `CloudCredentials` across the boundary (§Architecture-2). +- The `platform.*` family files each currently co-locate a **core-staying** backend (`mock`, plus `kind`/`k3s` for kubernetes) and one or more **plugin-bound** cloud backends behind a *single shared `func init()`*. This is true for `platform_kubernetes_kind.go`, `platform_dns.go`, `platform_ecs.go`, `platform_networking.go`, `platform_autoscaling.go` — verified. Splitting any one of them requires partitioning that `init()`; moving a file wholesale would either exile the `mock` backend or dangle a cloud registration. **Phase 0 fixes this for the whole family, not just kubernetes.** +- `cloud_account_aws.go` + `cloud_account_aws_creds.go` are **deleted by Phase B** (§Architecture-2, -3) — all AWS credential resolution moves plugin-side. Any core symbol they define and a *staying* file needs (the pure helper `parseStringSlice`; see Phase 0) must be relocated first. -**The method — `scripts/audit-cloud-symbols.sh`, produced as Phase 0 task 1:** a script that, for each backend region and each plugin-bound `module/*.go` file, greps every cross-file function/type reference and emits the authoritative ownership map. Its output is committed alongside Phase 0 and re-run in CI on every subsequent phase PR. The design does not transcribe its output — the script *is* the source of truth, eliminating the recurring transcription defect. Two helper funcs are already known to need relocation (below); the script catches any the eye missed. 
+**The method — `scripts/audit-cloud-symbols.sh`, Phase 0 task 1:** for each `platform.*` backend region and each plugin-bound `module/*.go` file, it greps every cross-file function/type reference and every `init()` that registers a *mix* of core-staying and plugin-bound factories, and emits the authoritative ownership + `init()`-partition map. Committed with Phase 0, re-run in CI on every subsequent phase PR. The design never transcribes its output — the script *is* the source of truth. A mixed-`init()` or a cross-file symbol edge into a to-be-deleted file is a Phase 0 (or phase-PR) **CI failure**, not a reviewer's catch. -## Phase 0 — precursor: split `platform_kubernetes_kind.go`, partition `init()`, relocate shared helpers +## Phase 0 — precursor: isolate every cloud backend behind a uniform file convention -A mechanical, behavior-equivalent refactor landed **before** Phase A. Three moves: +A mechanical, behavior-equivalent refactor landed **before** Phase A. It establishes — repo-wide across the `platform.*` family — the convention that makes every later phase a clean deletion: -**1. Split the one shared backend file into four.** `platform_kubernetes_kind.go` (currently all four backends) → `platform_kubernetes_kind.go` (`kindBackend` only), `platform_kubernetes_eks.go`, `platform_kubernetes_gke.go`, `platform_kubernetes_aks.go`. Each new file owns its own import block. +**1. Uniform `<family>_core.go` / `<family>_<provider>.go` file convention.** For each `platform.*` family, Phase 0 mechanically splits so that: +- `platform_<family>_core.go` (or the existing shell file) holds the module shell, the backend interface, the `mock` backend (and `kind`/`k3s` for kubernetes), and an `init()` registering **only** the core-staying backends. +- `platform_<family>_<provider>.go` holds exactly one cloud backend + its own import block + its own `init()` registering **only** that provider. -**2. 
Partition the shared `func init()` per-file.** The one `init()` registering `kind`/`k3s`/`eks`/`gke`/`aks` **cannot** be split untouched — each new file gets its own `init()` registering only its backend(s) (`kind` *and* `k3s` both register from `platform_kubernetes_kind.go`, since `k3s` reuses `kindBackend`). This is a *distribution* of the registration, not a behavior change — the same five names are registered after the split — but it is **not** "zero logic change," and the design says so plainly. The payoff: when Phase A deletes `platform_kubernetes_aks.go`, the `aks` registration goes with it; no dangling `RegisterKubernetesBackend("aks", …)` is left behind for the build gate to catch as a late surprise. +Concretely: `platform_kubernetes_kind.go` (currently all four k8s backends) splits into `platform_kubernetes_kind.go` (kind+k3s) / `_eks.go` / `_gke.go` / `_aks.go`; `platform_dns_backends.go` (currently `mockDNSBackend` + `route53Backend`) splits into a mock-stays file / `platform_dns_aws.go`; `platform_ecs.go` / `platform_networking.go` / `platform_autoscaling.go` each split their `mock`+`aws` backends + their shared `init()` the same way. The exact file list is the audit-script's output, not enumerated here — but the *rule* is fixed: after Phase 0, no `init()` registers both a core-staying and a plugin-bound factory, and no file holds both. -**3. Relocate the two shared pure helpers into a new SDK-free core file** `module/cloud_helpers.go`: -- `parseStringSlice` moves out of `cloud_account_aws.go` (Phase B deletes that file) — its plugin-bound consumers (`platform_ecs.go`, `platform_kubernetes_eks.go`) would otherwise lose it. -- `safeIntToInt32` moves out of `platform_kubernetes.go` — used by `platform_autoscaling.go`, `platform_ecs.go`, `platform_networking.go`, `platform_kubernetes_eks.go` (all plugin-bound) *and* by core-resident `platform_kubernetes.go`. A neutral home keeps both sides compiling. +**2. 
Relocate shared pure helpers into a new SDK-free core file** `module/cloud_helpers.go`: `parseStringSlice` (out of the Phase-B-deleted `cloud_account_aws.go`) and `safeIntToInt32` (out of `platform_kubernetes.go`, used by core-resident *and* plugin-bound files). Both are ≤15-line pure functions, no SDK, no state. `cloud_helpers.go` stays in core permanently; when a plugin-bound file moves to its plugin it gets its own copy of whichever helpers it uses (duplicating a pure stdlib-only helper across a process boundary is correct, not a smell — the shared plugin-side util module is the Alternatives-Considered-#3 follow-up). The audit script's job is to confirm the relocation is *complete* — no staying file references the helpers from their old homes. -Both helpers are tiny pure functions (no SDK, no state). `cloud_helpers.go` stays in core permanently. When a plugin-bound file moves to its plugin, that plugin gets its own copy of whichever helpers it uses (≤15 lines each — duplicating a pure stdlib-only helper across a process boundary is correct, not a smell; the shared plugin-side util module is the Alternatives-Considered-#3 follow-up). +This is **not** "zero logic change" — partitioning a shared `init()` distributes registration calls across files. It is *behavior-equivalent*: the same backend names are registered after the split as before. The design states this plainly rather than mislabelling it. -**Phase 0 acceptance criteria:** `go build ./... && go vet ./... && go test ./module/...` green; `scripts/audit-cloud-symbols.sh` committed and its output shows zero cross-file symbol dep from any plugin-bound file into a to-be-deleted file *except* the known `eksBackend → cloud_account_aws.go` edge (which Phase B handles atomically); `git diff` is pure code movement + the mechanical `init()` partition, no logic edits. After Phase 0, each subsequent phase deletes *its own* backend file — self-contained at import-block AND symbol level. 
+**Phase 0 acceptance criteria:** `go build ./... && go vet ./... && go test ./module/...` green; `scripts/audit-cloud-symbols.sh` committed, and its output shows (a) no `init()` mixing core-staying + plugin-bound registrations, (b) no cross-file symbol edge from a plugin-bound file into a to-be-deleted file, (c) the helper relocation complete; `git diff` is pure code movement + mechanical `init()` partition, no logic edits. After Phase 0, every subsequent phase deletes *only* `<family>_<provider>.go` files — self-contained at import-block, `init()`, AND symbol level. -**Phase 0 rollback:** a file-split + `init()` partition + helper-relocation with no behavior diff — revert is a single `git revert`, no contract, no go.mod, no runtime impact. The one phase with a trivial rollback story. +**Phase 0 rollback:** a file-split + `init()` partition + helper relocation with no behavior diff — revert is a single `git revert`, no contract, no go.mod, no runtime impact. The one phase with a trivial rollback story. ## Phases @@ -180,29 +180,29 @@ Each phase is one workflow-core PR (deleting files + wiring the contract dispatc - workflow-plugin-azure implements `azure_blob` `IaCStateBackend` + `aks` `PlatformBackend`. - Core PR: delete `iac_state_azure.go`; strip the `azure_blob` case + `newAzureSharedKeyCredential` from `iac_module.go` **(this + the deletion is what drops `Azure/azure-sdk-for-go` from go.mod)**; delete `platform_kubernetes_aks.go` (from Phase 0) and wire its `PlatformBackend` dispatch. -**Phase B — AWS** (largest — 13 files, 3 surfaces). Complete file inventory + destination: +**Phase B — AWS** (largest — 13 SDK-importing files, 3 surfaces). After Phase 0's split, every `platform.*` AWS backend lives in its own `_aws.go` file with its own `init()` — so Phase B deletes `_aws.go` / `_eks.go` files cleanly, never a mixed file. 
Inventory + destination (file names post-Phase-0; the authoritative list is the audit-script output): -| core file | destination | atomicity note | -|-----------|-------------|----------------| +| core file (post Phase 0) | destination | atomicity note | +|--------------------------|-------------|----------------| | `iac_state_spaces.go` | aws plugin — `s3` `IaCStateBackend` (DELETE from core) | shared with `spaces` — see Phase D | -| `cloud_account_aws.go` | DELETE (Option 1 — no replacement) | **same commit as `platform_kubernetes_eks.go`** (call-graph edge) | -| `cloud_account_aws_creds.go` | DELETE (Option 1 — no replacement) | same commit as above | -| `platform_kubernetes_eks.go` (from Phase 0) | aws plugin — `eks` `PlatformBackend` | **same commit as `cloud_account_aws*.go`** | +| `cloud_account_aws.go` | DELETE (Option 1 — no replacement) | **same commit as `platform_kubernetes_eks.go`** (call-graph edge: `awsProviderFrom`/`AWSConfig`) | +| `cloud_account_aws_creds.go` | DELETE (Option 1 — the 4 `aws*Resolver` types move plugin-side) | same commit as above | +| `platform_kubernetes_eks.go` | aws plugin — `eks` `PlatformBackend` | **same commit as `cloud_account_aws*.go`** | +| `platform_ecs_aws.go` | aws plugin — `PlatformBackend` (`ecs`) | `_core.go` with the `mock` ECS backend stays | +| `platform_networking_aws.go` | aws plugin — `PlatformBackend` (`networking`/ec2) | `_core.go` with the `mock` networking backend stays | +| `platform_autoscaling_aws.go` | aws plugin — `PlatformBackend` (`autoscaling`) | `_core.go` with the `mock` autoscaling backend stays | +| `platform_dns_aws.go` | aws plugin — `PlatformBackend` (`dns`/route53) | `_core.go` with `mockDNSBackend` stays | | `aws_api_gateway.go` | aws plugin — `aws.apigateway` module | — | | `platform_apigateway.go` | aws plugin — `PlatformBackend` or `aws.apigateway` (gated on interface-audit spike) | — | | `codebuild.go` | aws plugin — `aws.codebuild` module | — | | `pipeline_step_s3_upload.go` | aws 
plugin — `step.s3_upload` | — | | `s3_storage.go` | aws plugin — `storage.s3` module | — | -| `platform_autoscaling.go` | aws plugin — `PlatformBackend` (`autoscaling`) | — | -| `platform_dns_backends.go` | aws plugin — `PlatformBackend` (`dns`/route53) | — | -| `platform_ecs.go` | aws plugin — `PlatformBackend` (`ecs`) | — | -| `platform_networking.go` | aws plugin — `PlatformBackend` (`networking`/ec2) | — | -- Core PR also: **strip the `spaces` case from `iac_module.go`** (it calls `NewSpacesIaCStateStore` from the deleted `iac_state_spaces.go` — same compile-dependency pattern as Phase A's `azure_blob` strip). Drop `aws-sdk-go-v2` from go.mod. +- Core PR also: **strip the `spaces` case from `iac_module.go`** (it calls `NewSpacesIaCStateStore` from the deleted `iac_state_spaces.go` — same compile-dependency pattern as Phase A's `azure_blob` strip). Drop `aws-sdk-go-v2` from go.mod. (The `_core.go` files holding the `mock` ECS/networking/autoscaling/DNS backends + their interfaces + module shells **stay in core** — only the `_aws.go` files leave.) **Phase C — GCP** (3 files): - workflow-plugin-gcp implements `IaCStateBackend` (`gcs`), `PlatformBackend` (`gke`), plugin-native `storage.gcs`. -- Core PR: delete `iac_state_gcs.go`, `storage_gcs.go`, `platform_kubernetes_gke.go` (from Phase 0); drop `cloud.google.com/go` + `google.golang.org/api`. After Phase C, `go list -deps ./...` shows zero cloud-SDK packages — the permanent CI gate is added here. +- Core PR: delete `iac_state_gcs.go`, `storage_gcs.go`, `platform_kubernetes_gke.go` (from Phase 0); drop `cloud.google.com/go` + `google.golang.org/api`. After Phase C, `go list -deps ./...` shows zero packages from the three in-scope SDK trees (`aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com`+`google.golang.org/api`) — the permanent CI gate is added here. (`godo` remains — out of scope, see Problem.) 
**Phase D — DigitalOcean (`spaces` clean-break):** - workflow-plugin-digitalocean implements `IaCStateBackend` for `spaces` (S3-compatible — pulls `aws-sdk-go-v2/service/s3`, the one service package, not the whole tree). From ac10b67d9036882cd6b883c93175b27bdcad3a5b Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 01:18:32 -0400 Subject: [PATCH 08/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20adversarial=20review=20cycle=206=20revision?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 1 Critical + 2 Important from cycle-6 review: Critical — cycle-5's credential-flow fix replaced one false claim with another: it said the CloudCredentials struct already holds "declared config (plain strings incl. profile)". VERIFIED FALSE — the struct (cloud_account.go:18) has no Profile field (profile lives in Extra map) and the resolvers mutate it in-place with RESOLVED values. FIX, cleaner than the struct change the reviewer proposed: the struct needs NO change (Extra map already carries markers, RoleARN field exists). Instead, cloud_account_aws_creds.go is EDITED not deleted — the SDK-bearing tails of awsProfileResolver/awsRoleARNResolver (config.LoadDefaultConfig, sts.AssumeRole) are removed; they keep their SDK-free heads (record declared inputs + an Extra["credential_source"] marker, exactly as awsStaticResolver already does). After the edit the file is SDK-free and stays in core alongside the azure/gcp resolver files. Only cloud_account_aws.go (the pure-SDK AWSConfig() builder + AWSConfigProvider + awsProviderFrom) is deleted; its profile-chain/STS logic moves into the plugin's buildAWSConfig. Every in-core resolver becomes uniformly "declare, don't resolve"; the plugin honors the markers. No unregistered- resolver failure mode — the resolver init() registrations stay. Important 1 — §Phase-0 misidentified the DNS file with the mixed init(). 
VERIFIED: platform_dns.go:66 has the init() (+ interface + factory registry); platform_dns_backends.go has both impls + the route53 SDK import, NO init(). DNS is a TWO-file split, unlike single-file ecs/networking/autoscaling. §Phase-0 now states the per-family layout explicitly (kubernetes one-file, dns two-file, ecs/networking/autoscaling one-file) and notes the audit script determines it. Important 2 — azure/gcp resolvers (and now aws profile/role_arn) emit deferred-resolution markers for env/CLI/managed-identity/workload-identity/ profile/role_arn — NOT plain-string passthrough. §Architecture-3 + Assumption 5 now state the plugin MUST implement marker handling for every deferred type, not just AWS profile/role_arn. Minor — safeIntToInt32 relocation rationale clarified (it's a clean copy-source for the plugin-bound files, not a hard core necessity); parseStringSlice IS a hard necessity (its file is deleted). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index 3e2f3743..aa57811e 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -1,7 +1,7 @@ # Cloud-SDK Extraction: workflow core → strict-contract plugins **Date:** 2026-05-14 -**Status:** Design — revised after adversarial review cycle 5 +**Status:** Design — revised after adversarial review cycle 6 **Owner:** autonomous pipeline (workflow#TBD) ## Problem @@ -96,7 +96,11 @@ service PlatformBackend { // (remaining request/response message field layouts: deferred to writing-plans.) 
``` -**Credential flow across the boundary — engine passes *declared* config, plugin *resolves*.** Every cloud `platform.*` backend today reaches credentials via `k.provider.GetCredentials()`; `aksBackend.azureToken` takes `*CloudCredentials` directly. Critical correction from earlier drafts: the AWS credential *resolvers* are **not** SDK-free — `module/cloud_account_aws_creds.go`'s `awsProfileResolver` calls `config.LoadDefaultConfig(WithSharedConfigProfile)` and `awsRoleARNResolver` calls `sts.AssumeRole`; both genuinely need the AWS SDK. So the engine **cannot** "resolve credentials in-core" for AWS. The model is therefore: `cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) **stays in core** as the provider-agnostic holder of the *declared* credential config (provider, region, `credentials.{type,accessKey,secretKey,sessionToken,roleArn,profile,...}` — all plain strings, no SDK); the engine serialises that **declared config** into a proto `CloudCredentials` message on every `PlatformBackend` request; the **plugin** performs the actual resolution (static/env/profile/role_arn — including the SDK-bearing profile-chain and STS-AssumeRole paths) in-process with its own SDK. Consequence: `cloud_account_aws.go` *and* `cloud_account_aws_creds.go` are both **deleted by Phase B with no core replacement** — all AWS credential resolution moves plugin-side. This *simplifies* the design (no in-core resolution path, no `AWSConfigProvider` interface) and is the same shape as the §Architecture-3 `credentials:` story — one `CloudCredentials` proto message of *declared* config serves both the `PlatformBackend` contract and the plugin-native module path, so the secret-redaction task (§Security) has exactly one shape to redact. 
+**Credential flow across the boundary — in-core resolvers *declare*, the plugin *resolves*.** Every cloud `platform.*` backend today reaches credentials via `k.provider.GetCredentials() → module.CloudCredentials` (`module/cloud_account.go:18`); `aksBackend.azureToken` takes `*CloudCredentials` directly. Verified shape of the existing pieces: +- `CloudCredentials` is a **plain-field struct** — `Provider/Region/AccessKey/SecretKey/SessionToken/RoleARN/ProjectID/TenantID/ClientID/.../Token` plus `Extra map[string]string`. No `Profile` field; `profile` already lives in `Extra["profile"]`. It is cleanly proto-serialisable as-is — **no struct change is needed**. +- The credential *resolvers* split two ways. `awsStaticResolver` / `awsEnvResolver` (and **all** azure/gcp resolvers) are **already SDK-free** — they read declared config / env vars / emit an `Extra["credential_source"]` marker, and never call an SDK. Only `awsProfileResolver` and `awsRoleARNResolver` have an **SDK-bearing tail** (`config.LoadDefaultConfig(WithSharedConfigProfile)`, `sts.AssumeRole`) that resolves the profile/role into `AccessKey/SecretKey` *in-core*. + +The model: make **every** in-core resolver uniformly *declare, don't resolve*. Phase B **edits** `cloud_account_aws_creds.go` — the `awsProfileResolver`/`awsRoleARNResolver` SDK tails are removed; they keep their SDK-free heads (record `Extra["profile"]` / `RoleARN` + a `credential_source` marker, exactly as `awsStaticResolver` already records `AccessKey`). After the edit the file imports no SDK and **stays in core**, alongside the untouched azure/gcp resolver files. The engine serialises the resolver-populated `CloudCredentials` struct (resolved values for static/env; declared values + `credential_source` marker for profile/role_arn/managed-identity/cli/workload-identity) into a proto `CloudCredentials` message on every `PlatformBackend` request. 
The **plugin** performs any SDK-bearing resolution (profile-chain, STS AssumeRole, managed-identity, ADC) in-process. Only `cloud_account_aws.go` — the `AWSConfig()` builder + `AWSConfigProvider` interface + `awsProviderFrom`, which *is* pure SDK — is **deleted by Phase B**; its profile-chain/STS logic moves into the plugin's `buildAWSConfig`. This is the same shape as the §Architecture-3 `credentials:` story — one `CloudCredentials` proto message serves both the `PlatformBackend` contract and the plugin-native module path, so the secret-redaction task (§Security) has exactly one shape to redact. When `provider != kind` (and `!= k3s` — `k3s` also maps to the in-core `kindBackend`), core's `platform.*` module resolves a `PlatformBackend` client from the plugin that registered `(platform_type, provider)`. @@ -118,7 +122,9 @@ These are user-facing pipeline functionality, not engine infrastructure. They be (`storage_artifact_s3.go` references the AWS SDK only in comments — verified comment-only, **not** a real import, stays in core.) -Credential handling (Option 1, approved): the deleted `cloud_account_aws.go` + `cloud_account_aws_creds.go` (`AWSConfigProvider`, `AWSConfig()`, and the four `aws*Resolver` types + their `RegisterCredentialResolver` `init()`) are **not** replaced by a core contract. Each plugin-native AWS module carries its own `credentials:` config block and resolves it in-process via a shared in-plugin `buildAWSConfig` helper that owns the static/env/profile/role_arn logic — exactly the workflow-plugin-digitalocean model. To avoid yaml redundancy when a config declares many AWS modules, each plugin offers an optional in-plugin `aws.credentials` (resp. `gcp.credentials`) module + a `credentials_ref:` key — DRY handled entirely inside the plugin, still no core contract. 
`cloud_account_azure.go` and `cloud_account_gcp.go` reference the SDKs **only in comments** (verified — pure config-map parsing, their resolvers populate the `CloudCredentials` struct with declared values and never call an SDK) and **stay in core untouched** — they remain valid `CloudCredentialProvider` resolvers for the declared-config-passthrough path. Only the *AWS* resolvers move, because only they carry SDK calls. +Credential handling (Option 1, approved): only `cloud_account_aws.go` (`AWSConfigProvider`, `AWSConfig()`, `awsProviderFrom` — pure SDK config-building) is **deleted**, with no core replacement. `cloud_account_aws_creds.go` is **edited** (its two SDK-bearing resolver tails removed — see §Architecture-2) and stays in core; `cloud_account_azure.go` / `cloud_account_gcp.go` are **untouched** — all three are SDK-free declare-don't-resolve resolver files after the edit. Each plugin-native AWS module carries its own `credentials:` config block and resolves it in-process via a shared in-plugin `buildAWSConfig` helper that owns the static/env/profile/role_arn logic (the logic deleted from `cloud_account_aws.go`) — exactly the workflow-plugin-digitalocean model. To avoid yaml redundancy when a config declares many AWS modules, each plugin offers an optional in-plugin `aws.credentials` (resp. `gcp.credentials`) module + a `credentials_ref:` key — DRY handled entirely inside the plugin, still no core contract. + +**Resolvers emit *markers*, not always plain values.** For credential types `static` / `env`, the in-core resolver records concrete declared values into `CloudCredentials`. For `profile` / `role_arn` (AWS) and `managed_identity` / `azure_cli` / `workload_identity` / `application_default` (azure/gcp), the resolver records the *declared inputs* (`Extra["profile"]`, `RoleARN`, etc.) **plus** an `Extra["credential_source"]` marker — it does **not** resolve to concrete keys. The plugin reads the marker and performs the SDK-bearing resolution. 
This is not a "no-op passthrough": the plugin **must** implement marker handling for every deferred type, exactly as it implements profile/role_arn for AWS. ## Security @@ -146,7 +152,7 @@ Four prior review cycles each found a hand-maintained per-file ownership claim i **Invariants (the parts that survive any audit — these are load-bearing and the script verifies them, it doesn't discover them):** - `module/cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) is the provider-agnostic *declared-config* holder — **it stays in core, is never deleted by any phase**, and is the credential symbol-home all cloud platform code binds to. The `PlatformBackend` contract carries the declared `CloudCredentials` across the boundary (§Architecture-2). - The `platform.*` family files each currently co-locate a **core-staying** backend (`mock`, plus `kind`/`k3s` for kubernetes) and one or more **plugin-bound** cloud backends behind a *single shared `func init()`*. This is true for `platform_kubernetes_kind.go`, `platform_dns.go`, `platform_ecs.go`, `platform_networking.go`, `platform_autoscaling.go` — verified. Splitting any one of them requires partitioning that `init()`; moving a file wholesale would either exile the `mock` backend or dangle a cloud registration. **Phase 0 fixes this for the whole family, not just kubernetes.** -- `cloud_account_aws.go` + `cloud_account_aws_creds.go` are **deleted by Phase B** (§Architecture-2, -3) — all AWS credential resolution moves plugin-side. Any core symbol they define and a *staying* file needs (the pure helper `parseStringSlice`; see Phase 0) must be relocated first. +- `cloud_account_aws.go` is **deleted by Phase B** (the SDK `AWSConfig()` builder); `cloud_account_aws_creds.go` is **edited, not deleted** (SDK-bearing resolver tails removed — §Architecture-2, -3). 
Any core symbol `cloud_account_aws.go` defines and a *staying* file needs (the pure helper `parseStringSlice`; see Phase 0) must be relocated first. **The method — `scripts/audit-cloud-symbols.sh`, Phase 0 task 1:** for each `platform.*` backend region and each plugin-bound `module/*.go` file, it greps every cross-file function/type reference and every `init()` that registers a *mix* of core-staying and plugin-bound factories, and emits the authoritative ownership + `init()`-partition map. Committed with Phase 0, re-run in CI on every subsequent phase PR. The design never transcribes its output — the script *is* the source of truth. A mixed-`init()` or a cross-file symbol edge into a to-be-deleted file is a Phase 0 (or phase-PR) **CI failure**, not a reviewer's catch. @@ -158,9 +164,18 @@ A mechanical, behavior-equivalent refactor landed **before** Phase A. It establi - `platform__core.go` (or the existing shell file) holds the module shell, the backend interface, the `mock` backend (and `kind`/`k3s` for kubernetes), and an `init()` registering **only** the core-staying backends. - `platform__.go` holds exactly one cloud backend + its own import block + its own `init()` registering **only** that provider. -Concretely: `platform_kubernetes_kind.go` (currently all four k8s backends) splits into `platform_kubernetes_kind.go` (kind+k3s) / `_eks.go` / `_gke.go` / `_aks.go`; `platform_dns_backends.go` (currently `mockDNSBackend` + `route53Backend`) splits into a mock-stays file / `platform_dns_aws.go`; `platform_ecs.go` / `platform_networking.go` / `platform_autoscaling.go` each split their `mock`+`aws` backends + their shared `init()` the same way. The exact file list is the audit-script's output, not enumerated here — but the *rule* is fixed: after Phase 0, no `init()` registers both a core-staying and a plugin-bound factory, and no file holds both. 
+The families are **not** uniform in current layout — the audit script (Phase 0 task 1) determines per-family whether it is a one-file or two-file split, but the verified shapes are: +- **kubernetes** — one file (`platform_kubernetes_kind.go`) holds all four backends + the shared `init()`; splits into `platform_kubernetes_kind.go` (kind+k3s, keeps a core `init()`) / `_eks.go` / `_gke.go` / `_aks.go`, each with its own `init()`. +- **dns** — *two* files: `platform_dns.go` holds the shared `init()` + `dnsBackend` interface + `DNSBackendFactory` registry; `platform_dns_backends.go` holds both `mockDNSBackend` and `route53Backend` impls + the route53 SDK import. Phase 0 partitions the `init()` in `platform_dns.go` (mock-registration stays, aws-registration moves) **and** moves `route53Backend` + the SDK import out of `platform_dns_backends.go` into a new `platform_dns_aws.go`. +- **ecs / networking / autoscaling** — each a single file with `mock`+`aws` backends + a shared `init()`; split like kubernetes (`_core.go` keeps mock + core `init()`, `_aws.go` takes the cloud backend + its own `init()`). + +The exact post-split file list is the audit-script's output, not hand-enumerated — but the *rule* is fixed: after Phase 0, no `init()` registers both a core-staying and a plugin-bound factory, and no file holds both a core-staying and a plugin-bound backend impl. + +**2. Relocate shared pure helpers into a new SDK-free core file** `module/cloud_helpers.go`: +- `parseStringSlice` — currently in `cloud_account_aws.go` (Phase-B-*deleted*). It **must** relocate or its staying/plugin-bound consumers break. +- `safeIntToInt32` — currently in `platform_kubernetes.go` (a *core-staying* file). Relocation here is not a hard necessity for core's sake (core could keep it) — it is done so the soon-to-extract `_eks.go`/`_gke.go`/`_aks.go` files have a clean, SDK-free copy-source when they move to the plugin. 
Because both files are in the same Go package (`module`), the call sites in `platform_kubernetes.go` need no change — only the definition moves to `cloud_helpers.go`. -**2. Relocate shared pure helpers into a new SDK-free core file** `module/cloud_helpers.go`: `parseStringSlice` (out of the Phase-B-deleted `cloud_account_aws.go`) and `safeIntToInt32` (out of `platform_kubernetes.go`, used by core-resident *and* plugin-bound files). Both are ≤15-line pure functions, no SDK, no state. `cloud_helpers.go` stays in core permanently; when a plugin-bound file moves to its plugin it gets its own copy of whichever helpers it uses (duplicating a pure stdlib-only helper across a process boundary is correct, not a smell — the shared plugin-side util module is the Alternatives-Considered-#3 follow-up). The audit script's job is to confirm the relocation is *complete* — no staying file references the helpers from their old homes. +Both are ≤15-line pure functions, no SDK, no state. `cloud_helpers.go` stays in core permanently; when a plugin-bound file moves to its plugin it gets its own copy of whichever helpers it uses (duplicating a pure stdlib-only helper across a process boundary is correct, not a smell — the shared plugin-side util module is the Alternatives-Considered-#3 follow-up). The audit script confirms the relocation is *complete* — no file references the helpers from their old homes. This is **not** "zero logic change" — partitioning a shared `init()` distributes registration calls across files. It is *behavior-equivalent*: the same backend names are registered after the split as before. The design states this plainly rather than mislabelling it.
@@ -186,7 +201,7 @@ Each phase is one workflow-core PR (deleting files + wiring the contract dispatc |--------------------------|-------------|----------------| | `iac_state_spaces.go` | aws plugin — `s3` `IaCStateBackend` (DELETE from core) | shared with `spaces` — see Phase D | | `cloud_account_aws.go` | DELETE (Option 1 — no replacement) | **same commit as `platform_kubernetes_eks.go`** (call-graph edge: `awsProviderFrom`/`AWSConfig`) | -| `cloud_account_aws_creds.go` | DELETE (Option 1 — the 4 `aws*Resolver` types move plugin-side) | same commit as above | +| `cloud_account_aws_creds.go` | **EDIT** (not delete) — remove the SDK-bearing tails of `awsProfileResolver`/`awsRoleARNResolver`; file becomes SDK-free, stays in core (§Architecture-2) | the resolver `init()` registrations stay — `provider: aws` credential resolution still works in-core, now declare-only | | `platform_kubernetes_eks.go` | aws plugin — `eks` `PlatformBackend` | **same commit as `cloud_account_aws*.go`** | | `platform_ecs_aws.go` | aws plugin — `PlatformBackend` (`ecs`) | `_core.go` with the `mock` ECS backend stays | | `platform_networking_aws.go` | aws plugin — `PlatformBackend` (`networking`/ec2) | `_core.go` with the `mock` networking backend stays | @@ -225,7 +240,7 @@ Published in each plugin's CHANGELOG + a consolidated `docs/migrations/2026-05-1 2. **The `platform.*` backend interfaces are cleanly provider-separable.** The design assumes `kubernetesBackend` / `ecsBackend` / etc. are interface-segregated such that the `kind` impl can stay while cloud impls extract. **This is the most fragile assumption** — the Phase 0/A interface-audit spike (first writing-plans task) validates it; if a backend interface leaks SDK types into the core module shell, that shell needs an interface-extraction refactor first and the phase re-scopes. 
Phase 0's mechanical split + helper relocation de-risks this structurally: after Phase 0, the audit operates on already-separated files, not an assertion about an unsplit one. 3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend`/`PlatformBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive, core ignores unknown backend registrations until its own half lands. 4. **`aws-sdk-go-v2/service/s3` in workflow-plugin-digitalocean is acceptable.** DO Spaces is S3-API; there is no godo-native Spaces client. The DO plugin already carries `godo`; adding one AWS service package is the minimal cost of self-contained `spaces` state support (vs. forcing DO users to also load workflow-plugin-aws). -5. **`cloud_account_azure.go` / `cloud_account_gcp.go` genuinely have zero real SDK imports.** Verified by `awk` over import blocks at design time — they reference the SDKs only in comments. If a future change adds a real SDK import there, that file joins its phase's extraction. +5. **The credential resolvers can all be made SDK-free in-core.** `cloud_account_azure.go` / `cloud_account_gcp.go` are *already* SDK-free (verified — SDK references only in comments); `cloud_account_aws_creds.go`'s `awsStaticResolver`/`awsEnvResolver` are already SDK-free, and `awsProfileResolver`/`awsRoleARNResolver` become SDK-free once their resolution tails are removed (Phase B edit). The load-bearing assumption is that **a resolver does not *need* to resolve in-core** — for the deferred credential types (profile/role_arn/managed-identity/cli/workload-identity/ADC) it is sufficient for the in-core resolver to record the declared inputs + an `Extra["credential_source"]` marker, and for the plugin to honor the marker. If some credential type genuinely cannot be expressed as "declared inputs + marker" (none identified — even ADC is just a marker), that type would need a different mechanism. 
The plugin **must** implement marker handling for every deferred type, not just AWS profile/role_arn. 6. **No core code outside `module/` imports these SDKs.** Verified: the only real `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` imports are under `module/`. `cmd/`, `engine.go`, `schema/`, `plugin/` are clean. A `go list -deps` CI gate in the final phase enforces this permanently. ## Rollback From 808ec51428a245884c519bc8af38b3732a686cc5 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 01:24:04 -0400 Subject: [PATCH 09/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20adversarial=20review=20cycle=207=20revision?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 2 Critical from cycle-7 review (architecture confirmed sound; these are the last two extraction-mechanic precision gaps): C1 — "remove the SDK tail, file becomes SDK-free" mischaracterized the awsRoleARNResolver edit. VERIFIED: awsProfileResolver's SDK calls ARE a clean contiguous tail, but awsRoleARNResolver's SDK block (base-config build + sts.AssumeRole, ~45 lines) is the larger half of the method, after the declared-input recording. FIX: §Architecture-2 re-characterizes the edit as a deliberate Resolve() body REWRITE (not a one-line snip) — explicitly per-resolver. Added a Phase B CI invariant: an import-block grep (folded into audit-cloud-symbols.sh) asserts cloud_account_aws_creds.go has zero aws-sdk-go-v2 imports post-rewrite — mechanically enforced, not prose-asserted. C2 — cloud_account_aws.go defines FOUR symbols, not one; the symbol-ownership invariant named only parseStringSlice. VERIFIED + fixed: - AWSConfigProvider interface signature names aws.Config → CANNOT stay in core, deleted with the file. - awsProviderFrom → deleted with the interface. - ValidateCredentials → verified NO real caller (only a comment ref in cmd/wfctl/deploy.go:866) → deletes cleanly. 
- The 8 awsProviderFrom consumers are all verified plugin-bound — but each currently does awsProviderFrom(k.provider).AWSConfig(ctx); in the plugin there's no cloud.account to type-assert. §Cross-file-coupling invariant 3 now states Phase B must REWRITE all 8 consumers to obtain creds from the CloudCredentials proto + buildAWSConfig — explicit Phase B scope, not a footnote. Phase B table atomicity column updated. Minor (M1) — platform_dns_backends.go renamed → platform_dns_core.go in Phase 0 so the dns family conforms to the uniform _core.go/_aws.go naming; no special-case three-file layout. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index aa57811e..3dc6f2df 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -1,7 +1,7 @@ # Cloud-SDK Extraction: workflow core → strict-contract plugins **Date:** 2026-05-14 -**Status:** Design — revised after adversarial review cycle 6 +**Status:** Design — revised after adversarial review cycle 7 **Owner:** autonomous pipeline (workflow#TBD) ## Problem @@ -100,7 +100,13 @@ service PlatformBackend { - `CloudCredentials` is a **plain-field struct** — `Provider/Region/AccessKey/SecretKey/SessionToken/RoleARN/ProjectID/TenantID/ClientID/.../Token` plus `Extra map[string]string`. No `Profile` field; `profile` already lives in `Extra["profile"]`. It is cleanly proto-serialisable as-is — **no struct change is needed**. - The credential *resolvers* split two ways. `awsStaticResolver` / `awsEnvResolver` (and **all** azure/gcp resolvers) are **already SDK-free** — they read declared config / env vars / emit an `Extra["credential_source"]` marker, and never call an SDK. 
Only `awsProfileResolver` and `awsRoleARNResolver` have an **SDK-bearing tail** (`config.LoadDefaultConfig(WithSharedConfigProfile)`, `sts.AssumeRole`) that resolves the profile/role into `AccessKey/SecretKey` *in-core*. -The model: make **every** in-core resolver uniformly *declare, don't resolve*. Phase B **edits** `cloud_account_aws_creds.go` — the `awsProfileResolver`/`awsRoleARNResolver` SDK tails are removed; they keep their SDK-free heads (record `Extra["profile"]` / `RoleARN` + a `credential_source` marker, exactly as `awsStaticResolver` already records `AccessKey`). After the edit the file imports no SDK and **stays in core**, alongside the untouched azure/gcp resolver files. The engine serialises the resolver-populated `CloudCredentials` struct (resolved values for static/env; declared values + `credential_source` marker for profile/role_arn/managed-identity/cli/workload-identity) into a proto `CloudCredentials` message on every `PlatformBackend` request. The **plugin** performs any SDK-bearing resolution (profile-chain, STS AssumeRole, managed-identity, ADC) in-process. Only `cloud_account_aws.go` — the `AWSConfig()` builder + `AWSConfigProvider` interface + `awsProviderFrom`, which *is* pure SDK — is **deleted by Phase B**; its profile-chain/STS logic moves into the plugin's `buildAWSConfig`. This is the same shape as the §Architecture-3 `credentials:` story — one `CloudCredentials` proto message serves both the `PlatformBackend` contract and the plugin-native module path, so the secret-redaction task (§Security) has exactly one shape to redact. +The model: make **every** in-core resolver uniformly *declare, don't resolve*. 
Phase B **rewrites** the two SDK-bearing resolver bodies in `cloud_account_aws_creds.go` — this is a deliberate `Resolve()` body rewrite, **not** a one-line "snip the tail": +- `awsProfileResolver.Resolve` — its SDK calls (`config.LoadDefaultConfig(WithSharedConfigProfile)`, `cfg.Credentials.Retrieve`) *are* a clean contiguous tail after the marker-record (`Extra["profile"] = profile`); the rewrite ends the method right after the marker-record. +- `awsRoleARNResolver.Resolve` — the SDK block (base-config build + `sts.NewFromConfig` + `AssumeRole`) is contiguous *after* the declared-input recording (`RoleARN`, `Extra["external_id"]`, `roleArn`-required validation, `sessionName` parse), but it is the larger half of the method. The rewrite **deletes that entire block** and ends the method after the declared-input recording + `credential_source` marker. Calling this "remove a tail" understates it — it is a body rewrite, and the design says so. + +After both rewrites, `cloud_account_aws_creds.go` imports **no** `aws-sdk-go-v2` package (verified: those imports are used *only* by the two resolver bodies being rewritten) and **stays in core**, alongside the untouched azure/gcp resolver files. **Phase B CI invariant:** an import-block grep (folded into `scripts/audit-cloud-symbols.sh`) asserts `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports post-rewrite — the "stays SDK-free in core" claim is mechanically enforced, not asserted in prose. + +The engine serialises the resolver-populated `CloudCredentials` struct (resolved values for static/env; declared values + `credential_source` marker for profile/role_arn/managed-identity/cli/workload-identity) into a proto `CloudCredentials` message on every `PlatformBackend` request. The **plugin** performs any SDK-bearing resolution (profile-chain, STS AssumeRole, managed-identity, ADC) in-process. 
`cloud_account_aws.go` — the `AWSConfig()` builder + `AWSConfigProvider` interface + `awsProviderFrom` + `ValidateCredentials`, all pure SDK — is **deleted by Phase B**; its profile-chain/STS logic moves into the plugin's `buildAWSConfig`. (See §Cross-file-coupling invariant 3 for the `AWSConfigProvider`-consumer rewrite this forces.) This is the same shape as the §Architecture-3 `credentials:` story — one `CloudCredentials` proto message serves both the `PlatformBackend` contract and the plugin-native module path, so the secret-redaction task (§Security) has exactly one shape to redact. When `provider != kind` (and `!= k3s` — `k3s` also maps to the in-core `kindBackend`), core's `platform.*` module resolves a `PlatformBackend` client from the plugin that registered `(platform_type, provider)`. @@ -152,7 +158,12 @@ Four prior review cycles each found a hand-maintained per-file ownership claim i **Invariants (the parts that survive any audit — these are load-bearing and the script verifies them, it doesn't discover them):** - `module/cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) is the provider-agnostic *declared-config* holder — **it stays in core, is never deleted by any phase**, and is the credential symbol-home all cloud platform code binds to. The `PlatformBackend` contract carries the declared `CloudCredentials` across the boundary (§Architecture-2). - The `platform.*` family files each currently co-locate a **core-staying** backend (`mock`, plus `kind`/`k3s` for kubernetes) and one or more **plugin-bound** cloud backends behind a *single shared `func init()`*. This is true for `platform_kubernetes_kind.go`, `platform_dns.go`, `platform_ecs.go`, `platform_networking.go`, `platform_autoscaling.go` — verified. Splitting any one of them requires partitioning that `init()`; moving a file wholesale would either exile the `mock` backend or dangle a cloud registration. 
**Phase 0 fixes this for the whole family, not just kubernetes.** -- `cloud_account_aws.go` is **deleted by Phase B** (the SDK `AWSConfig()` builder); `cloud_account_aws_creds.go` is **edited, not deleted** (SDK-bearing resolver tails removed — §Architecture-2, -3). Any core symbol `cloud_account_aws.go` defines and a *staying* file needs (the pure helper `parseStringSlice`; see Phase 0) must be relocated first. +- `cloud_account_aws.go` is **deleted by Phase B**. It defines **four** symbols, not one: the `AWSConfig()` builder, the `AWSConfigProvider` interface, `awsProviderFrom()`, and `ValidateCredentials()` — plus the pure helper `parseStringSlice`. Dispositions: + - `parseStringSlice` — pure, no SDK → **relocated to `cloud_helpers.go` in Phase 0** (a staying file, `cloud_account.go`-adjacent, needs it; see Phase 0). + - `AWSConfigProvider` interface — its method signature is `AWSConfig(ctx) (aws.Config, error)`, which **names the SDK type `aws.Config`** → it **cannot stay in core** and has no SDK-free equivalent. It is deleted with the file. + - `awsProviderFrom()` — a type-assertion helper returning `AWSConfigProvider` → deleted with the interface. + - `ValidateCredentials()` — verified: **no real caller** outside `cloud_account_aws.go` (the only repo match is a *comment* in `cmd/wfctl/deploy.go:866`) → deleted cleanly with the file. + - **Consumer rewrite (explicit Phase B scope):** `awsProviderFrom`/`AWSConfigProvider` are referenced by 8 files — `aws_api_gateway.go`, `codebuild.go`, `platform_apigateway.go`, `platform_autoscaling.go`, `platform_dns_backends.go`, `platform_ecs.go`, `platform_kubernetes_kind.go`, `platform_networking.go` — **all verified plugin-bound** (each is in the Phase B inventory or splits so only its `_aws.go`/`_eks.go` half references the symbol; no core-staying `_core.go` shell touches it). 
But "plugin-bound" is not "free" — each of those consumers currently does `awsProviderFrom(k.provider).AWSConfig(ctx)` to get a live `aws.Config`. In the plugin they have no `cloud.account` to type-assert. **Phase B must rewrite every one of those 8 consumers** to obtain credentials from the `CloudCredentials` proto message (passed on the `PlatformBackend` request / carried in the plugin-native module's `credentials:` config) and build `aws.Config` via the plugin's `buildAWSConfig` helper. This is real Phase B work, not a localized symbol deletion — the design states it as scope, not a footnote. **The method — `scripts/audit-cloud-symbols.sh`, Phase 0 task 1:** for each `platform.*` backend region and each plugin-bound `module/*.go` file, it greps every cross-file function/type reference and every `init()` that registers a *mix* of core-staying and plugin-bound factories, and emits the authoritative ownership + `init()`-partition map. Committed with Phase 0, re-run in CI on every subsequent phase PR. The design never transcribes its output — the script *is* the source of truth. A mixed-`init()` or a cross-file symbol edge into a to-be-deleted file is a Phase 0 (or phase-PR) **CI failure**, not a reviewer's catch. @@ -166,7 +177,7 @@ A mechanical, behavior-equivalent refactor landed **before** Phase A. It establi The families are **not** uniform in current layout — the audit script (Phase 0 task 1) determines per-family whether it is a one-file or two-file split, but the verified shapes are: - **kubernetes** — one file (`platform_kubernetes_kind.go`) holds all four backends + the shared `init()`; splits into `platform_kubernetes_kind.go` (kind+k3s, keeps a core `init()`) / `_eks.go` / `_gke.go` / `_aks.go`, each with its own `init()`. -- **dns** — *two* files: `platform_dns.go` holds the shared `init()` + `dnsBackend` interface + `DNSBackendFactory` registry; `platform_dns_backends.go` holds both `mockDNSBackend` and `route53Backend` impls + the route53 SDK import. 
Phase 0 partitions the `init()` in `platform_dns.go` (mock-registration stays, aws-registration moves) **and** moves `route53Backend` + the SDK import out of `platform_dns_backends.go` into a new `platform_dns_aws.go`. +- **dns** — *two* files: `platform_dns.go` holds the shared `init()` + `dnsBackend` interface + `DNSBackendFactory` registry; `platform_dns_backends.go` holds both `mockDNSBackend` and `route53Backend` impls + the route53 SDK import. Phase 0 partitions the `init()` in `platform_dns.go` (mock-registration stays, aws-registration moves), moves `route53Backend` + the SDK import out of `platform_dns_backends.go` into a new `platform_dns_aws.go`, **and renames the now-mock-only `platform_dns_backends.go` → `platform_dns_core.go`** so the dns family conforms to the same `_core.go` / `_aws.go` naming as the ecs / networking / autoscaling families (kubernetes is the one exception: it keeps `platform_kubernetes_kind.go` as its core-staying file) — no special-case three-file layout. - **ecs / networking / autoscaling** — each a single file with `mock`+`aws` backends + a shared `init()`; split like kubernetes (`_core.go` keeps mock + core `init()`, `_aws.go` takes the cloud backend + its own `init()`). The exact post-split file list is the audit-script's output, not hand-enumerated — but the *rule* is fixed: after Phase 0, no `init()` registers both a core-staying and a plugin-bound factory, and no file holds both a core-staying and a plugin-bound backend impl.
@@ -200,13 +211,13 @@ Each phase is one workflow-core PR (deleting files + wiring the contract dispatc | core file (post Phase 0) | destination | atomicity note | |--------------------------|-------------|----------------| | `iac_state_spaces.go` | aws plugin — `s3` `IaCStateBackend` (DELETE from core) | shared with `spaces` — see Phase D | -| `cloud_account_aws.go` | DELETE (Option 1 — no replacement) | **same commit as `platform_kubernetes_eks.go`** (call-graph edge: `awsProviderFrom`/`AWSConfig`) | +| `cloud_account_aws.go` | DELETE — `AWSConfig()` / `AWSConfigProvider` / `awsProviderFrom` / `ValidateCredentials` (all pure SDK; `parseStringSlice` relocated in Phase 0). **Forces the 8-consumer rewrite** — see §Cross-file-coupling invariant 3 | **same commit as all 8 `awsProviderFrom` consumers' rewrites** — deleting the interface without rewriting its consumers fails the build | | `cloud_account_aws_creds.go` | **EDIT** (not delete) — remove the SDK-bearing tails of `awsProfileResolver`/`awsRoleARNResolver`; file becomes SDK-free, stays in core (§Architecture-2) | the resolver `init()` registrations stay — `provider: aws` credential resolution still works in-core, now declare-only | | `platform_kubernetes_eks.go` | aws plugin — `eks` `PlatformBackend` | **same commit as `cloud_account_aws*.go`** | | `platform_ecs_aws.go` | aws plugin — `PlatformBackend` (`ecs`) | `_core.go` with the `mock` ECS backend stays | | `platform_networking_aws.go` | aws plugin — `PlatformBackend` (`networking`/ec2) | `_core.go` with the `mock` networking backend stays | | `platform_autoscaling_aws.go` | aws plugin — `PlatformBackend` (`autoscaling`) | `_core.go` with the `mock` autoscaling backend stays | -| `platform_dns_aws.go` | aws plugin — `PlatformBackend` (`dns`/route53) | `_core.go` with `mockDNSBackend` stays | +| `platform_dns_aws.go` (created by Phase 0 from `platform_dns_backends.go`) | aws plugin — `PlatformBackend` (`dns`/route53) | `platform_dns_core.go` (Phase-0 rename of 
`platform_dns_backends.go`) with `mockDNSBackend` + the `platform_dns.go` shell stay | | `aws_api_gateway.go` | aws plugin — `aws.apigateway` module | — | | `platform_apigateway.go` | aws plugin — `PlatformBackend` or `aws.apigateway` (gated on interface-audit spike) | — | | `codebuild.go` | aws plugin — `aws.codebuild` module | — | From 337d0e035d7529a45b8020b0dd4c7877a210df56 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 01:37:51 -0400 Subject: [PATCH 10/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20cycle-8=20re-baseline=20against=20post-#653?= =?UTF-8?q?=20main?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cycle-8 adversarial review caught the design's file/symbol inventory as stale: it predated issue #653 (closed 2026-05-13), which already removed the AWS IaC modules, platform/providers/aws/, and stubbed the codebuild + EKS backends. Re-baselined every file/symbol claim against origin/main HEAD (worktree confirmed 0 commits behind origin/main): - Added "Relationship to issue #653" section — this design is #653's named successor, extracting the AWS surface #653 scoped out ("RBAC/secrets/artifact stay") plus the untouched Azure/GCP surfaces. - Problem table corrected: AWS 6 real-import files (not 13), Azure 3, GCP 3. storage_artifact_s3.go is comment-only — stays in core. - cloud_account_aws.go is dead code — zero non-test consumers verified; deleted outright, no 8-consumer rewrite (awsProviderFrom + consumers removed by #653). - Phase 0 shrunk to a single-file split (platform_kubernetes_kind.go); parseStringSlice + safeIntToInt32 no longer exist — helper-relocation task deleted. - PlatformBackend now serves only aks + gke (eks already a #653 SDK-free stub); interface-audit spike audits one interface, not five. - Phase B inventory rewritten; Phase A/C file lists corrected. 
- Self-challenge doubt #4 + Assumption 7 added: inventory staleness is the cycle-8 defect class; audit script makes it CI-enforced. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 297 +++++++++--------- 1 file changed, 151 insertions(+), 146 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index 3dc6f2df..b8299eb3 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -1,37 +1,50 @@ # Cloud-SDK Extraction: workflow core → strict-contract plugins **Date:** 2026-05-14 -**Status:** Design — revised after adversarial review cycle 7 +**Status:** Design — re-baselined after adversarial review cycle 8 (design verified against current `origin/main`, post-#653) **Owner:** autonomous pipeline (workflow#TBD) +## Relationship to issue #653 (predecessor — read this first) + +Issue **#653** ("Audit AWS SDK usage in workflow core") closed 2026-05-13. Its three merged PRs already did a large slice of AWS-side extraction: + +- **PR #657** — removed the AWS *IaC modules* from workflow core. +- **PR #659** — stripped the AWS SDK from the `codebuild` and `EKS` backends (replaced `codebuildAWSBackend` and `eksBackend` with SDK-free migration-error stubs). +- **PR #662** — tombstoned `platform/providers/aws/` and promoted the EKS absent-package CI gate. + +Consequence: files this design's *earlier drafts* (cycles 1–7) enumerated — `aws_api_gateway.go`, `codebuild.go`, `platform_apigateway.go`, `platform_autoscaling.go`, `platform_ecs.go`, `platform_networking.go`, the `awsProviderFrom` helper, the `platform_*_aws` backends — **no longer exist in `module/`**. #653 explicitly scoped *out* "RBAC/secrets/artifact" AWS usage ("RBAC/secrets/artifact stay"). 
**This design is #653's successor: it extracts the AWS SDK surface #653 deliberately left, plus the entirely-untouched Azure and GCP surfaces.** Every file/symbol claim below is grep-verified against `origin/main` HEAD (the worktree branch is 0 commits behind `origin/main`). + ## Problem -Workflow core's `module/` package imports three cloud SDK trees directly. File counts are grep-verified (`awk` over import blocks, comment-only matches excluded). "Files" = files with a real import — not all are *deleted* (e.g. `iac_module.go` is *edited* to strip a `case`, not deleted; see Phases): +Workflow core's `module/` package still imports three cloud SDK trees directly. Real-import counts (comment-only matches excluded — e.g. `storage_artifact_s3.go` *names* `aws-sdk-go-v2` only in a doc comment and stays in core): -| SDK | Files (real imports) | how core sheds it | -|-----|----------------------|-------------------| -| `github.com/aws/aws-sdk-go-v2/*` | **13** | 11 deleted, `iac_module.go` edited (strip `spaces` case), `platform_kubernetes_eks.go` deleted (post Phase-0 split) | -| `github.com/Azure/azure-sdk-for-go/sdk/*` (azcore + azblob) | **2** | `iac_state_azure.go` deleted, `iac_module.go` edited (strip `azure_blob` case) | -| `cloud.google.com/go/storage` + `google.golang.org/api/*` | **3** | `iac_state_gcs.go` + `storage_gcs.go` deleted, `platform_kubernetes_gke.go` deleted (post Phase-0 split) | +| SDK tree | Files with a real import | how core sheds it | +|----------|--------------------------|-------------------| +| `github.com/aws/aws-sdk-go-v2/*` | **6** — `cloud_account_aws.go`, `cloud_account_aws_creds.go`, `iac_state_spaces.go`, `nosql_dynamodb.go`, `pipeline_step_s3_upload.go`, `s3_storage.go` | 4 deleted, `cloud_account_aws_creds.go` edited (resolver bodies rewritten SDK-free, stays in core), `iac_module.go` edited (strip `spaces` case) | +| `github.com/Azure/azure-sdk-for-go/sdk/*` (azcore + azblob) | **3** — `iac_state_azure.go`, `iac_module.go`, 
`platform_kubernetes_kind.go` (the `aksBackend`) | `iac_state_azure.go` deleted, `iac_module.go` edited (strip `azure_blob` case + `newAzureSharedKeyCredential`), `platform_kubernetes_aks.go` deleted (post Phase-0 split) | +| `cloud.google.com/go/storage` + `google.golang.org/api/*` | **3** — `iac_state_gcs.go`, `storage_gcs.go`, `platform_kubernetes_kind.go` (the `gkeBackend`) | `iac_state_gcs.go` + `storage_gcs.go` deleted, `platform_kubernetes_gke.go` deleted (post Phase-0 split) | -Every dependabot bump of a cloud SDK (PRs #400/#419/#421/#635 as of this writing) churns workflow core's `go.sum`, inflates the binary, and couples core release cadence to vendor SDK release cadence. The `workflow-plugin-{aws,azure,gcp,digitalocean}` plugins already exist and already carry these SDKs for their IaC *resource provider* role — core's direct usage is redundant surface. +`cloud_account_azure.go` and `cloud_account_gcp.go` are **already SDK-free** (verified: 0 SDK imports — they are pure declare-don't-resolve resolver files). Only the AWS credential resolvers carry SDK. -Precedent: workflow#617 removed the legacy DigitalOcean IaC *resource* modules + `godo` from those; IaC resource provisioning moved to `workflow-plugin-digitalocean`. This design extends the same principle to the *remaining* cloud functionality that never went through that extraction: IaC **state backends**, managed-service **platform** provisioners, and a handful of standalone modules/steps. +Every dependabot bump of a cloud SDK churns workflow core's `go.sum`, inflates the binary, and couples core release cadence to vendor SDK release cadence. The `workflow-plugin-{aws,azure,gcp,digitalocean}` plugins already exist and already carry these SDKs for their IaC *resource provider* role — core's direct usage is redundant surface. 
-**A fourth tree — `github.com/digitalocean/godo` — is still in core but out of scope here.** `module/cloud_account_do.go` + five `module/platform_do_*.go` files (`platform_do_app.go`, `platform_do_dns.go`, `platform_do_networking.go`, `platform_doks.go`, `platform_do_database.go`) still import `godo` — workflow#617 removed the DO *IaC resource* path but these `platform.do_*` modules survived it. The user's ask scoped this work to three SDK trees (aws/azure/gcp); `godo` extraction is a structurally-identical follow-up (the `platform.do_*` modules would extract via the same `PlatformBackend` contract this design introduces) but is **not** in this design's scope. Consequence: the `go list -deps` CI gate added in the final phase asserts **zero `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` / `google.golang.org/api` packages** — it does *not* assert "zero cloud SDKs" while `godo` remains. The design's phrasing is corrected throughout to "the three in-scope SDK trees," not "all cloud SDKs." +Precedent: workflow#617 removed the legacy DigitalOcean IaC *resource* modules + `godo`; #653 removed the AWS IaC *resource* modules + AWS `platform/providers/`. This design extends the same principle to the *remaining* cloud functionality neither extraction touched: IaC **state backends**, the managed-Kubernetes **platform** provisioners, and a handful of standalone modules/steps. + +**A fourth tree — `github.com/digitalocean/godo` — is still in core but out of scope here.** `module/cloud_account_do.go` + the `module/platform_do_*.go` files still import `godo`; workflow#617 removed the DO *IaC resource* path but the `platform.do_*` modules survived it. The user's ask scoped this work to three SDK trees (aws/azure/gcp); `godo` extraction is a structurally-identical follow-up but is **not** in this design's scope. 
Consequence: the `go list -deps` CI gate added in the final phase asserts **zero `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` / `google.golang.org/api` packages** — it does *not* assert "zero cloud SDKs" while `godo` remains. ## Goals -- workflow core `go.mod` drops `aws-sdk-go-v2/*`, `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, `google.golang.org/api/*` (the three in-scope trees) **entirely** — verified by a `go list -deps` gate in the final phase's CI asserting zero packages from those three trees. `godo` is out of scope (see Problem). +- workflow core `go.mod` drops `aws-sdk-go-v2/*`, `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, `google.golang.org/api/*` (the three in-scope trees) **entirely** — verified by a `go list -deps` gate in the final phase's CI asserting zero packages from those three trees. `godo` is out of scope. - Cloud functionality remains available, loaded via strict-contract gRPC plugins (the existing sidecar model). -- `kind` Kubernetes backend (no SDK) stays in core — local-dev/test path must not require a plugin. +- `kind` / `k3s` Kubernetes backends (no SDK) stay in core — local-dev/test path must not require a plugin. The `eks` backend is *already* an SDK-free migration-error stub (`eksErrorBackend`, courtesy #653) and stays in core unchanged. ## Non-Goals -- Re-homing the IaC *resource provider* contract (`IaCProviderRequired`) — already extracted, not touched here. +- Re-homing the IaC *resource provider* contract (`IaCProviderRequired`) — already extracted (#617, #653), not touched here. - Changing how plugins are discovered/installed (`wfctl plugin install` flow unchanged). -- Backwards-compatible yaml — this is a **clean break** with a migration guide (per workflow#617 precedent). -- **Removing `aws-sdk-go-v2/service/kinesis`.** The user's original ask said "kinesis and azcore." 
`go mod why github.com/aws/aws-sdk-go-v2/service/kinesis` resolves to `workflow → workflow/module → github.com/GoCodeAlone/modular/modules/eventbus/v2 → kinesis` — kinesis is a **transitive dependency of `modular/modules/eventbus/v2`**, not a direct workflow import (the only `module/` reference is a string literal `"kinesis-provider"` in a test). Removing it is an upstream `modular` concern, not addressable by extracting workflow's own SDK usage. Out of scope here; tracked separately if `modular` eventbus ever needs the same treatment. +- Backwards-compatible yaml — this is a **clean break** with a migration guide (per workflow#617 / #653 precedent). +- **Removing `aws-sdk-go-v2/service/kinesis`.** `go mod why` resolves it to `workflow → workflow/module → modular/modules/eventbus/v2 → kinesis` — a **transitive dependency of `modular`**, not a direct workflow import. Out of scope; an upstream `modular` concern. +- **Re-doing #653's work.** The AWS IaC modules, `platform/providers/aws/`, and the codebuild/EKS backends are already gone. This design does not re-extract them. ## Architecture @@ -41,12 +54,12 @@ Three extension surfaces, three handling strategies: `iac.state` **stays a core module type**. The state store is engine infrastructure — the orchestrator reads/writes it during every plan/apply cycle — so it keeps a stable core seam. What changes: `config.backend` no longer dispatches a hardcoded `switch` in `module/iac_module.go`; instead core resolves an `IaCStateBackend` gRPC client from whichever loaded plugin registered that backend name. 
-The contract maps **1:1 onto the existing `module.IaCStateStore` interface** (`module/iac_state.go:21`) — six methods, no speculative surface: +The contract maps **1:1 onto the existing `module.IaCStateStore` interface** (`module/iac_state.go`) — six methods, no speculative surface: ```proto // Added as a new service INSIDE plugin/external/proto/iac.proto — matches the -// established precedent (iac.proto already holds 8 services / 598 lines; -// state + platform contracts version alongside the resource-provider contract). +// established precedent (iac.proto already holds multiple services; state + +// platform contracts version alongside the resource-provider contract). service IaCStateBackend { rpc GetState (GetStateRequest) returns (GetStateResponse); // → IaCStateStore.GetState rpc SaveState (SaveStateRequest) returns (SaveStateResponse); // → IaCStateStore.SaveState @@ -59,28 +72,38 @@ message GetStateResponse { IaCState state = 1; bool exists = 2; } message SaveStateRequest { IaCState state = 1; } // idempotent: full-state replace, last-writer-wins message ListStatesRequest { map<string, string> filter = 1; } message LockRequest { string resource_id = 1; } // 1:1 with IaCStateStore.Lock — no TTL field (see Failure modes) -// IaCState mirrors module.IaCState. The proto is exactly the 6-method interface, -// nothing speculative — a lock-lease/TTL field is a planned ADDITIVE follow-up -// (Failure modes §), deferred until the first plugin backend implements honored -// expiry so it ships with a conformance test instead of as a no-op field. +// IaCState mirrors module.IaCState. A lock-lease/TTL field is a planned ADDITIVE +// follow-up (Failure modes §), deferred until the first plugin backend implements +// honored expiry so it ships with a conformance test instead of as a no-op field.
``` Backend ownership — every cloud plugin implements the contract for its native storage: | backend name | plugin | storage | core file deleted | |--------------|--------|---------|-------------------| -| `s3` | workflow-plugin-aws | AWS S3 | `iac_state_spaces.go` (the S3-compatible store; also the `spaces` impl) | +| `s3` | workflow-plugin-aws | AWS S3 | `iac_state_spaces.go` (the `SpacesIaCStateStore` — an S3-compatible store backing *both* `s3` and `spaces`) | | `azure_blob` | workflow-plugin-azure | Azure Blob | `iac_state_azure.go` | | `gcs` | workflow-plugin-gcp | Google Cloud Storage | `iac_state_gcs.go` | | `spaces` | workflow-plugin-digitalocean | DO Spaces (S3-compatible) | (shares `iac_state_spaces.go` deletion — see Phase D) | `memory`, `filesystem`, `postgres` backends stay **in core** — no cloud SDK, no reason to extract. -**Unary GET+SAVE vs streaming:** decided by benchmark, not assumption. The writing-plans phase includes a task that drives a 1 MB synthetic state blob through a full plan→apply cycle (GetState + SaveState + Lock + Unlock per resource batch) over unary RPC, measures p50/p99 added latency vs the in-process baseline, and only adopts chunked streaming if unary clears no acceptable bar. Default build target: **unary**, because (a) gRPC's default 4 MB message cap covers typical state files, (b) streaming adds protocol complexity that must be justified by data, and (c) the in-process baseline this replaces was itself a single blob read/write. This task is ordered **before** the Phase A proto is locked (per self-challenge doubt #3). +**Unary GET+SAVE vs streaming:** decided by benchmark, not assumption. The writing-plans phase includes a task that drives a 1 MB synthetic state blob through a full plan→apply cycle (GetState + SaveState + Lock + Unlock per resource batch) over unary RPC, measures p50/p99 added latency vs the in-process baseline, and only adopts chunked streaming if unary fails to clear an acceptable bar.
Default build target: **unary** — (a) gRPC's default 4 MB message cap covers typical state files, (b) streaming adds protocol complexity that must be justified by data, (c) the in-process baseline this replaces was itself a single blob read/write. This task is ordered **before** the Phase A proto is locked. + +### 2. Managed-Kubernetes platform provisioners → new `PlatformBackend` strict proto contract + +**Post-#653 this surface is small.** The only `module/platform_*.go` file that still imports a cloud SDK is `platform_kubernetes_kind.go`, which holds **four** `kubernetesBackend` implementations behind one shared `init()`: + +| backend | SDK | disposition | +|---------|-----|-------------| +| `kindBackend` (serves `kind` *and* `k3s`) | none | **stays in core** | +| `eksErrorBackend` (serves `eks`) | none (already a #653 migration-error stub) | **stays in core** unchanged | +| `gkeBackend` (serves `gke`) | `google.golang.org/api/container` | → workflow-plugin-gcp | +| `aksBackend` (serves `aks`) | `Azure/azure-sdk-for-go` | → workflow-plugin-azure | -### 2. Managed-service platform provisioners → new `PlatformBackend` strict proto contract +There is **no** `platform.ecs` / `platform.networking` / `platform.autoscaling` / `platform.dns` / `platform.apigateway` cloud-SDK surface left — #653 removed it. `platform_dns.go` / `platform_dns_backends.go` still exist but carry **no cloud SDK import** (verified). So `PlatformBackend` serves exactly two cloud backends: `aks` and `gke`. -The `platform.*` module family (`platform.kubernetes`, `platform.ecs`, `platform.networking`, `platform.dns`, `platform.autoscaling`) keeps its module types **and its `provider:` config key** in core — no yaml UX break. Each `platform.*` module currently dispatches to a provider-specific backend via an in-process interface (`kubernetesBackend`, etc.). 
The cloud-backed implementations (EKS, GKE, AKS, ECS, Route53, EC2, ApplicationAutoScaling) move behind the `PlatformBackend` gRPC contract; the `kind` backend stays in-core. +`platform.kubernetes` keeps its module type **and its `provider:` config key** in core — no yaml UX break. The `kubernetesBackend` interface (`module/platform_kubernetes.go`) stays in core; the cloud impls move behind the `PlatformBackend` gRPC contract. ```proto // Added as a new service INSIDE plugin/external/proto/iac.proto (same rationale @@ -90,197 +113,179 @@ service PlatformBackend { rpc Apply (PlatformApplyRequest) returns (PlatformApplyResponse); rpc Destroy(PlatformDestroyRequest) returns (PlatformDestroyResponse); } -// Each request carries: platform_type (kubernetes|ecs|...), provider (eks|gke|aks|...), -// desired-state struct, current-state struct, AND a CloudCredentials message. -// Response carries: plan actions / applied state / errors. -// (remaining request/response message field layouts: deferred to writing-plans.) +// Each request carries: platform_type (currently always "kubernetes"), +// provider (gke|aks), desired-state struct, current-state struct, AND a +// CloudCredentials message. Response: plan actions / applied state / errors. +// (remaining message field layouts: deferred to writing-plans.) ``` -**Credential flow across the boundary — in-core resolvers *declare*, the plugin *resolves*.** Every cloud `platform.*` backend today reaches credentials via `k.provider.GetCredentials() → module.CloudCredentials` (`module/cloud_account.go:18`); `aksBackend.azureToken` takes `*CloudCredentials` directly. Verified shape of the existing pieces: -- `CloudCredentials` is a **plain-field struct** — `Provider/Region/AccessKey/SecretKey/SessionToken/RoleARN/ProjectID/TenantID/ClientID/.../Token` plus `Extra map[string]string`. No `Profile` field; `profile` already lives in `Extra["profile"]`. It is cleanly proto-serialisable as-is — **no struct change is needed**. 
-- The credential *resolvers* split two ways. `awsStaticResolver` / `awsEnvResolver` (and **all** azure/gcp resolvers) are **already SDK-free** — they read declared config / env vars / emit an `Extra["credential_source"]` marker, and never call an SDK. Only `awsProfileResolver` and `awsRoleARNResolver` have an **SDK-bearing tail** (`config.LoadDefaultConfig(WithSharedConfigProfile)`, `sts.AssumeRole`) that resolves the profile/role into `AccessKey/SecretKey` *in-core*. +**The `PlatformBackend` shape is gated, but the gate is now nearly trivial** (cycle-8 note): the design's earlier draft worried about unifying *five* `platform.*` backend interfaces. Post-#653 there is **one** interface to audit — `kubernetesBackend` (4 methods: `plan`/`apply`/`status`/`destroy`) — and only two cloud impls behind it (`gkeBackend`, `aksBackend`), both managed-Kubernetes clusters. The Phase 0/A interface-audit spike validates that `kubernetesBackend`'s 4 methods map cleanly onto `Plan/Apply/Destroy/Status` *before* the proto is locked. The risk that drove Alternatives Considered #1 (lowest-common-denominator across heterogeneous platform families) is largely gone with the heterogeneous families gone — but the fallback (fold into `ResourceDriver`) is retained for the spike's decision. -The model: make **every** in-core resolver uniformly *declare, don't resolve*. Phase B **rewrites** the two SDK-bearing resolver bodies in `cloud_account_aws_creds.go` — this is a deliberate `Resolve()` body rewrite, **not** a one-line "snip the tail": -- `awsProfileResolver.Resolve` — its SDK calls (`config.LoadDefaultConfig(WithSharedConfigProfile)`, `cfg.Credentials.Retrieve`) *are* a clean contiguous tail after the marker-record (`Extra["profile"] = profile`); the rewrite ends the method right after the marker-record. 
-- `awsRoleARNResolver.Resolve` — the SDK block (base-config build + `sts.NewFromConfig` + `AssumeRole`) is contiguous *after* the declared-input recording (`RoleARN`, `Extra["external_id"]`, `roleArn`-required validation, `sessionName` parse), but it is the larger half of the method. The rewrite **deletes that entire block** and ends the method after the declared-input recording + `credential_source` marker. Calling this "remove a tail" understates it — it is a body rewrite, and the design says so. +**Credential flow across the boundary — in-core resolvers *declare*, the plugin *resolves*.** The cloud backends reach credentials via `module.CloudCredentials` (`module/cloud_account.go`); `aksBackend.azureToken(creds *CloudCredentials)` takes it directly (verified). Verified shape of the existing pieces: +- `CloudCredentials` is a **plain-field struct** — `Provider/Region/AccessKey/SecretKey/SessionToken/RoleARN/ProjectID/TenantID/ClientID/.../Token` plus `Extra map[string]string`. No `Profile` field; `profile` lives in `Extra["profile"]`. Cleanly proto-serialisable as-is — **no struct change needed**. +- The AWS credential *resolvers* split two ways. `awsStaticResolver` / `awsEnvResolver` are **already SDK-free**. `awsProfileResolver` and `awsRoleARNResolver` (verified, `cloud_account_aws_creds.go`) have an **SDK-bearing block** (`config.LoadDefaultConfig(WithSharedConfigProfile)`, `sts.AssumeRole`) that resolves the profile/role into `AccessKey/SecretKey` *in-core*. The azure/gcp resolvers (`cloud_account_azure.go`, `cloud_account_gcp.go`) are **already SDK-free**. -After both rewrites, `cloud_account_aws_creds.go` imports **no** `aws-sdk-go-v2` package (verified: those imports are used *only* by the two resolver bodies being rewritten) and **stays in core**, alongside the untouched azure/gcp resolver files. 
**Phase B CI invariant:** an import-block grep (folded into `scripts/audit-cloud-symbols.sh`) asserts `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports post-rewrite — the "stays SDK-free in core" claim is mechanically enforced, not asserted in prose. +The model: make **every** in-core resolver uniformly *declare, don't resolve*. Phase B **rewrites** the two SDK-bearing AWS resolver bodies — a deliberate `Resolve()` body rewrite, **not** a one-line "snip the tail": +- `awsProfileResolver.Resolve` — its SDK calls (`config.LoadDefaultConfig(WithSharedConfigProfile)`, `cfg.Credentials.Retrieve`) *are* a clean contiguous tail after the marker-record (`m.creds.Extra["profile"] = profile`); the rewrite ends the method right after the marker-record. +- `awsRoleARNResolver.Resolve` — the SDK block (base-config build + `sts.NewFromConfig` + `AssumeRole` + result-record) is contiguous *after* the declared-input recording (`RoleARN`, `Extra["external_id"]`, `roleArn`-required validation, `sessionName` parse) but is the **larger half** of the method. The rewrite **deletes that entire block** and ends the method after the declared-input recording + a `credential_source` marker. Calling this "remove a tail" understates it — it is a body rewrite. -The engine serialises the resolver-populated `CloudCredentials` struct (resolved values for static/env; declared values + `credential_source` marker for profile/role_arn/managed-identity/cli/workload-identity) into a proto `CloudCredentials` message on every `PlatformBackend` request. The **plugin** performs any SDK-bearing resolution (profile-chain, STS AssumeRole, managed-identity, ADC) in-process. `cloud_account_aws.go` — the `AWSConfig()` builder + `AWSConfigProvider` interface + `awsProviderFrom` + `ValidateCredentials`, all pure SDK — is **deleted by Phase B**; its profile-chain/STS logic moves into the plugin's `buildAWSConfig`. 
(See §Cross-file-coupling invariant 3 for the `AWSConfigProvider`-consumer rewrite this forces.) This is the same shape as the §Architecture-3 `credentials:` story — one `CloudCredentials` proto message serves both the `PlatformBackend` contract and the plugin-native module path, so the secret-redaction task (§Security) has exactly one shape to redact. +After both rewrites, `cloud_account_aws_creds.go` imports **no** `aws-sdk-go-v2` package (verified: the 4 SDK imports — `aws`, `config`, `credentials`, `sts` — are used *only* by those two resolver bodies; `init()` + `awsStaticResolver` + `awsEnvResolver` are SDK-free) and **stays in core**. **Phase B CI invariant:** an import-block grep (folded into `scripts/audit-cloud-symbols.sh`) asserts `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports post-rewrite. -When `provider != kind` (and `!= k3s` — `k3s` also maps to the in-core `kindBackend`), core's `platform.*` module resolves a `PlatformBackend` client from the plugin that registered `(platform_type, provider)`. +The engine serialises the resolver-populated `CloudCredentials` struct into a proto `CloudCredentials` message on every `PlatformBackend` request. The **plugin** performs any SDK-bearing resolution (profile-chain, STS AssumeRole, managed-identity, ADC) in-process. -**The `PlatformBackend` shape is gated** — see Alternatives Considered #1 and self-challenge doubt #1. The first writing-plans task for Phase B is an interface-audit spike that validates one unified `Plan/Apply/Destroy` contract against all five `platform.*` backend interfaces *before* the proto is locked. If they don't unify cleanly, the fallback is folding the cloud platform provisioners into the existing `IaCProviderRequired` / `ResourceDriver` model instead of inventing `PlatformBackend`. +When `provider ∉ {kind, k3s, eks}`, core's `platform.kubernetes` module resolves a `PlatformBackend` client from the plugin that registered `(kubernetes, provider)`. ### 3.
Standalone modules / steps → plugin-native types (existing SDK surface, no new contract) -These are user-facing pipeline functionality, not engine infrastructure. They become **plugin-native module/step types** via the existing `ModuleFactories` / `StepFactories` plugin SDK — which is *already* a gRPC sidecar path (`RemoteModule`). No new contract. +These are user-facing pipeline functionality, not engine infrastructure. They become **plugin-native module/step types** via the existing `ModuleFactories` / `StepFactories` plugin SDK — already a gRPC sidecar path (`RemoteModule`). No new contract. **Note on current registration site:** these types are today registered by *built-in in-process engine plugins* under `plugins/` (which import `module.*` directly), not by `engine.go`. Extracting each one means the built-in plugin's factory map drops that entry and the impl moves to the external gRPC plugin. -| core file | becomes | plugin | -|-----------|---------|--------| -| `aws_api_gateway.go` (`AWSAPIGateway` — route-sync module) | `aws.apigateway` module | aws | -| `platform_apigateway.go` (`Platform*Gateway*` — provisioner) | folds into `PlatformBackend` (`platform.apigateway` provider) **or** `aws.apigateway` — resolved by the interface-audit spike | aws | -| `codebuild.go` | `aws.codebuild` module | aws | -| `nosql_dynamodb.go` | `nosql.dynamodb` module | aws | -| `pipeline_step_s3_upload.go` | `step.s3_upload` | aws | -| `s3_storage.go` | `storage.s3` module | aws | -| `storage_gcs.go` | `storage.gcs` module | gcp | +| core file | current built-in registration | becomes | plugin | +|-----------|-------------------------------|---------|--------| +| `nosql_dynamodb.go` (`DynamoDBNoSQL`) | `plugins/datastores/plugin.go` `"nosql.dynamodb"` | `nosql.dynamodb` module | aws | +| `pipeline_step_s3_upload.go` (`S3UploadStep`) | `plugins/pipelinesteps/plugin.go` `"step.s3_upload"` | `step.s3_upload` | aws | +| `s3_storage.go` (`S3Storage`) | `plugins/storage/plugin.go` 
`"storage.s3"` (factory at :90) | `storage.s3` module | aws | +| `storage_gcs.go` (`GCSStorage`) | `plugins/storage/plugin.go` `"storage.gcs"` (factory at :109) | `storage.gcs` module | gcp | -(`storage_artifact_s3.go` references the AWS SDK only in comments — verified comment-only, **not** a real import, stays in core.) +`storage_artifact_s3.go` references the AWS SDK **only in a doc comment** (verified — its actual imports are `context`/`fmt`/`io`/`modular`; the real impl is a filesystem fallback) — **not a real import, stays in core untouched.** -Credential handling (Option 1, approved): only `cloud_account_aws.go` (`AWSConfigProvider`, `AWSConfig()`, `awsProviderFrom` — pure SDK config-building) is **deleted**, with no core replacement. `cloud_account_aws_creds.go` is **edited** (its two SDK-bearing resolver tails removed — see §Architecture-2) and stays in core; `cloud_account_azure.go` / `cloud_account_gcp.go` are **untouched** — all three are SDK-free declare-don't-resolve resolver files after the edit. Each plugin-native AWS module carries its own `credentials:` config block and resolves it in-process via a shared in-plugin `buildAWSConfig` helper that owns the static/env/profile/role_arn logic (the logic deleted from `cloud_account_aws.go`) — exactly the workflow-plugin-digitalocean model. To avoid yaml redundancy when a config declares many AWS modules, each plugin offers an optional in-plugin `aws.credentials` (resp. `gcp.credentials`) module + a `credentials_ref:` key — DRY handled entirely inside the plugin, still no core contract. +`cloud_account_aws.go` — defines `AWSConfigProvider` interface + `AWSConfig()` method + `ValidateCredentials()` method, all pure SDK — is **dead code**: a repo-wide grep for `AWSConfigProvider` / `awsProviderFrom` / `.AWSConfig(` returns **zero non-test consumers** (the `awsProviderFrom` helper and every consumer were removed by #653). 
It is **deleted outright by Phase B with no consumer rewrite and no core replacement** — this is a trivial dead-code deletion, not the multi-consumer refactor earlier drafts described. -**Resolvers emit *markers*, not always plain values.** For credential types `static` / `env`, the in-core resolver records concrete declared values into `CloudCredentials`. For `profile` / `role_arn` (AWS) and `managed_identity` / `azure_cli` / `workload_identity` / `application_default` (azure/gcp), the resolver records the *declared inputs* (`Extra["profile"]`, `RoleARN`, etc.) **plus** an `Extra["credential_source"]` marker — it does **not** resolve to concrete keys. The plugin reads the marker and performs the SDK-bearing resolution. This is not a "no-op passthrough": the plugin **must** implement marker handling for every deferred type, exactly as it implements profile/role_arn for AWS. +Credential handling (Option 1, approved): each plugin-native AWS module carries its own `credentials:` config block and resolves it in-process via a shared in-plugin `buildAWSConfig` helper that owns the static/env/profile/role_arn logic — exactly the workflow-plugin-digitalocean model. To avoid yaml redundancy when a config declares many AWS modules, each plugin offers an optional in-plugin `aws.credentials` (resp. `gcp.credentials`) module + a `credentials_ref:` key — DRY handled entirely inside the plugin, still no core contract. + +**Resolvers emit *markers*, not always plain values.** For credential types `static` / `env`, the in-core resolver records concrete declared values into `CloudCredentials`. For `profile` / `role_arn` (AWS), `managed_identity` / `client_credentials` / `cli` (azure), and the gcp equivalents, the resolver records the *declared inputs* (`Extra["profile"]`, `RoleARN`, etc.) **plus** an `Extra["credential_source"]` marker — it does **not** resolve to concrete keys. The plugin reads the marker and performs the SDK-bearing resolution. 
This is not a "no-op passthrough": the plugin **must** implement marker handling for every deferred type. ## Security -Option 1 moves raw cloud secrets (`accessKey`/`secretKey`/`account_key`/etc.) inline into every plugin-native module's `credentials:` config block — multiplying the number of config sites holding plaintext secrets versus today's single `cloud.account` module. This is not unprecedented (`iac_module.go`'s current `spaces` case already inlines `accessKey`/`secretKey`), but the multiplication needs explicit handling: +Option 1 moves raw cloud secrets (`accessKey`/`secretKey`/`account_key`/etc.) inline into every plugin-native module's `credentials:` config block — multiplying the number of config sites holding plaintext secrets versus today's single `cloud.account` module. Not unprecedented (`iac_module.go`'s current `spaces` case already inlines `accessKey`/`secretKey`), but the multiplication needs explicit handling: -- **Config-version store + execution tracing.** Workflow's config-version store (SHA-256 content-addressed) and execution-tracing layer marshal module config. Plugin-native module config carrying inline credentials MUST be redacted before persistence/tracing. Writing-plans task: extend the existing PII/secret redaction (already per-tenant-toggleable per `workflow-cloud`) to recognise the `credentials:` / `credentials_ref:` keys on plugin module config, OR confirm the existing redaction already covers any key matching a secret-pattern. This is a **blocking** task — it ships in the same phase as the first plugin-native AWS module, not after. -- **gRPC sidecar request logging.** The `IaCStateBackend` / `PlatformBackend` requests cross the engine↔plugin gRPC boundary, and `credentials:` blocks ride in `CreateModule` requests. 
**Verified at design time:** `plugin/external/grpc_plugin.go:39` constructs the server as `grpc.NewServer(opts...)` with `opts` passed straight through from the go-plugin broker — workflow's plugin SDK adds **no body-logging interceptor**. The only request-body logging anywhere in `plugin/external/` is `callback_server.go:85,118` (the plugin→host callback path: a `Log` RPC's `req.Message`, and a subscribe RPC's topic byte-count) — neither touches module config. `CreateModule` is dispatched at `adapter.go:477` with no logging of the request. **Conclusion: no redacting interceptor is needed today.** Writing-plans adds a guard test asserting no interceptor logs `CreateModule` bodies, so a future SDK change that adds one fails CI rather than silently leaking. -- **`credentials_ref:` blast radius.** A `credentials_ref` resolves to an in-plugin `aws.credentials` module within the *same plugin process* — it does not broaden which process can read the secret (engine never sees the resolved `aws.Config`, only the plugin does). This is strictly *narrower* than today's `cloud.account` (which builds `aws.Config` in the engine process). Documented as an improvement, not a risk. +- **Config-version store + execution tracing.** Workflow's config-version store (SHA-256 content-addressed) and execution-tracing layer marshal module config. Plugin-native module config carrying inline credentials MUST be redacted before persistence/tracing. Writing-plans task: extend the existing PII/secret redaction (already per-tenant-toggleable per `workflow-cloud`) to recognise `credentials:` / `credentials_ref:` keys on plugin module config, OR confirm the existing redaction already covers any key matching a secret-pattern. **Blocking** — ships in the same phase as the first plugin-native AWS module. +- **gRPC sidecar request logging.** `IaCStateBackend` / `PlatformBackend` requests cross the engine↔plugin gRPC boundary, and `credentials:` blocks ride in `CreateModule` requests. 
**Verified at design time:** `plugin/external/grpc_plugin.go:39` constructs the server as `grpc.NewServer(opts...)` with `opts` passed straight through from the go-plugin broker — workflow's plugin SDK adds **no body-logging interceptor**. The only request-body logging in `plugin/external/` is `callback_server.go:85,118` (plugin→host callback path) — neither touches module config. `CreateModule` is dispatched at `adapter.go:477` with no logging. **Conclusion: no redacting interceptor needed today.** Writing-plans adds a guard test asserting no interceptor logs `CreateModule` bodies, so a future SDK change that adds one fails CI. +- **`credentials_ref:` blast radius.** A `credentials_ref` resolves to an in-plugin `aws.credentials` module within the *same plugin process* — it does not broaden which process can read the secret (engine never sees the resolved `aws.Config`, only the plugin does). Strictly *narrower* than today's `cloud.account` (which builds `aws.Config` in the engine process). Documented as an improvement. ## Failure modes -Moving the IaC state store behind a gRPC sidecar introduces a partial-failure surface on the engine's hottest path (every plan/apply does `Lock` → `GetState` → ... → `SaveState` → `Unlock`). The in-process store had none of these: +Moving the IaC state store behind a gRPC sidecar introduces a partial-failure surface on the engine's hottest path (every plan/apply does `Lock` → `GetState` → ... → `SaveState` → `Unlock`): -- **Plugin crashes between `Lock` and `Unlock` → orphaned lock.** An in-process lock dies with the process; a gRPC-plugin lock can outlive a plugin crash if the plugin persisted it (S3/Blob lock objects do persist). **Initial scope:** this is a *documented limitation*, not silently broken. 
The `IaCStateBackend` contract ships as exactly the 6-method `IaCStateStore` interface — no TTL field — because no plugin backend in Phases A–D implements honored expiry yet, and a no-op TTL field is worse than none (it implies a guarantee that isn't enforced). Recovery for an orphaned lock is operator-side: delete the backend's lock object directly (it is a plain object/blob in the user's own bucket — `aws s3 rm`, `az storage blob delete`, etc.; the lock key format is documented per backend). **Planned additive follow-up:** once the first plugin backend implements honored expiry (S3 object-expiry metadata, Blob lease duration), `LockRequest` gains an optional `lease_ttl_seconds` field *paired with a contract conformance test* that asserts the plugin's lock object actually carries expiry — shipped with semantics, not as a field. Tracked as an open item. -- **`Lock` contention against a still-held lock.** Core's `iac.state` dispatch returns an immediate error on `Lock` contention — it does **not** block waiting for the lock to free. This matches today's in-process `IaCStateStore.Lock` ("Returns an error if the resource is already locked"). The gRPC boundary does not change this: a held lock — whether held by a live plan or orphaned by a dead plugin — surfaces the same immediate "resource locked" error, and orphaned-lock recovery is the operator-side delete above. No new waiting/lease-timeout state is introduced. -- **`SaveState` succeeds plugin-side but the gRPC response is lost → engine retries → double-write.** `SaveState` MUST be idempotent: it is a full-state replace keyed by `resource_id` (the existing `IaCStateStore.SaveState` is already "insert or replace"), so a retried identical `SaveState` is a no-op-equivalent. The contract documents `SaveState` as idempotent; the plugin implementations use unconditional PUT (overwrite), not append. No sequence number needed — IaC state is last-writer-wins by design. 
-- **Plugin unreachable at plan/apply start.** Core's `iac.state` dispatch returns a clear `"iac.state backend %q: plugin unreachable"` error and the plan/apply aborts *before* mutating anything — no partial state. This matches today's behavior when a misconfigured backend fails to construct in `IaCModule.Init()`. -- **`PlatformBackend` plugin crash mid-`Apply`.** A `platform.*` apply that crashes mid-flight leaves real cloud resources in an indeterminate state — but this is **identical to today's in-process risk** (an in-process `eksBackend.apply()` panic leaves the same indeterminate cloud state). The gRPC boundary does not worsen it; the next `Plan` reconciles against live cloud state as it does today. No new mitigation needed — documented as unchanged. -- **A plugin registers a backend/provider name that collides with a core-reserved one.** Core-registered names (`iac.state`: `memory`/`filesystem`/`postgres`; `platform.kubernetes`: `kind`/`k3s`; the `mock` backend of every `platform.*` family) are **reserved**. A plugin registration that collides with a reserved name is a **load-time error** — core fails to start with `"plugin %q registered reserved backend name %q"` rather than silently shadowing (in either direction). This makes a malformed or adversarial plugin manifest a hard, immediate failure, not a confusing runtime mis-dispatch. +- **Plugin crashes between `Lock` and `Unlock` → orphaned lock.** An in-process lock dies with the process; a gRPC-plugin lock can outlive a plugin crash if persisted (S3/Blob lock objects persist). **Initial scope:** documented limitation, not silently broken. The contract ships as exactly the 6-method `IaCStateStore` interface — no TTL field — because no Phase A–D plugin backend implements honored expiry yet, and a no-op TTL field implies a guarantee that isn't enforced. Recovery: operator deletes the backend's lock object directly (a plain object/blob in the user's own bucket; lock key format documented per backend). 
**Planned additive follow-up:** once a backend implements honored expiry, `LockRequest` gains an optional `lease_ttl_seconds` field *paired with a conformance test*. Tracked as an open item. +- **`Lock` contention against a still-held lock.** Core's `iac.state` dispatch returns an immediate error — it does not block. Matches today's in-process `IaCStateStore.Lock`. The gRPC boundary doesn't change this; orphaned-lock recovery is the operator-side delete above. +- **`SaveState` succeeds plugin-side but the gRPC response is lost → engine retries → double-write.** `SaveState` MUST be idempotent: full-state replace keyed by `resource_id` (existing `IaCStateStore.SaveState` is already insert-or-replace), so a retried identical `SaveState` is no-op-equivalent. Plugin implementations use unconditional PUT (overwrite), not append. IaC state is last-writer-wins by design. +- **Plugin unreachable at plan/apply start.** Core's `iac.state` dispatch returns a clear `"iac.state backend %q: plugin unreachable"` error and the plan/apply aborts *before* mutating anything. Matches today's behavior when a misconfigured backend fails to construct in `IaCModule.Init()`. +- **`PlatformBackend` plugin crash mid-`Apply`.** A `platform.kubernetes` apply crashing mid-flight leaves a real cloud cluster in an indeterminate state — but this is **identical to today's in-process risk** (an in-process `aksBackend.apply()` panic leaves the same indeterminate state). The next `Plan` reconciles against live cloud state as today. Documented as unchanged. +- **A plugin registers a backend/provider name colliding with a core-reserved one.** Core-registered names (`iac.state`: `memory`/`filesystem`/`postgres`; `platform.kubernetes`: `kind`/`k3s`/`eks`; the `mock` backend of every `platform.*` family) are **reserved**. A colliding plugin registration is a **load-time error** — core fails to start with `"plugin %q registered reserved backend name %q"` rather than silently shadowing. 
## Cross-file coupling: the symbol-ownership map is a Phase 0 build artifact, not a design-doc claim -Four prior review cycles each found a hand-maintained per-file ownership claim in this design *wrong* — first as a table, then (cycle 5) as prose. The lesson is structural: **a precise symbol map is derived data; it rots on every edit and the design doc is the wrong place for it.** The design therefore commits to a *method* and a small set of *invariants*, and delegates the exact map to a script. - -**Invariants (the parts that survive any audit — these are load-bearing and the script verifies them, it doesn't discover them):** -- `module/cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) is the provider-agnostic *declared-config* holder — **it stays in core, is never deleted by any phase**, and is the credential symbol-home all cloud platform code binds to. The `PlatformBackend` contract carries the declared `CloudCredentials` across the boundary (§Architecture-2). -- The `platform.*` family files each currently co-locate a **core-staying** backend (`mock`, plus `kind`/`k3s` for kubernetes) and one or more **plugin-bound** cloud backends behind a *single shared `func init()`*. This is true for `platform_kubernetes_kind.go`, `platform_dns.go`, `platform_ecs.go`, `platform_networking.go`, `platform_autoscaling.go` — verified. Splitting any one of them requires partitioning that `init()`; moving a file wholesale would either exile the `mock` backend or dangle a cloud registration. **Phase 0 fixes this for the whole family, not just kubernetes.** -- `cloud_account_aws.go` is **deleted by Phase B**. It defines **four** symbols, not one: the `AWSConfig()` builder, the `AWSConfigProvider` interface, `awsProviderFrom()`, and `ValidateCredentials()` — plus the pure helper `parseStringSlice`. 
Dispositions: - - `parseStringSlice` — pure, no SDK → **relocated to `cloud_helpers.go` in Phase 0** (a staying file, `cloud_account.go`-adjacent, needs it; see Phase 0). - - `AWSConfigProvider` interface — its method signature is `AWSConfig(ctx) (aws.Config, error)`, which **names the SDK type `aws.Config`** → it **cannot stay in core** and has no SDK-free equivalent. It is deleted with the file. - - `awsProviderFrom()` — a type-assertion helper returning `AWSConfigProvider` → deleted with the interface. - - `ValidateCredentials()` — verified: **no real caller** outside `cloud_account_aws.go` (the only repo match is a *comment* in `cmd/wfctl/deploy.go:866`) → deleted cleanly with the file. - - **Consumer rewrite (explicit Phase B scope):** `awsProviderFrom`/`AWSConfigProvider` are referenced by 8 files — `aws_api_gateway.go`, `codebuild.go`, `platform_apigateway.go`, `platform_autoscaling.go`, `platform_dns_backends.go`, `platform_ecs.go`, `platform_kubernetes_kind.go`, `platform_networking.go` — **all verified plugin-bound** (each is in the Phase B inventory or splits so only its `_aws.go`/`_eks.go` half references the symbol; no core-staying `_core.go` shell touches it). But "plugin-bound" is not "free" — each of those consumers currently does `awsProviderFrom(k.provider).AWSConfig(ctx)` to get a live `aws.Config`. In the plugin they have no `cloud.account` to type-assert. **Phase B must rewrite every one of those 8 consumers** to obtain credentials from the `CloudCredentials` proto message (passed on the `PlatformBackend` request / carried in the plugin-native module's `credentials:` config) and build `aws.Config` via the plugin's `buildAWSConfig` helper. This is real Phase B work, not a localized symbol deletion — the design states it as scope, not a footnote. +Prior review cycles each found a hand-maintained per-file ownership claim in this design *wrong* — and cycle 8 found the whole inventory stale because it predated #653. 
The lesson is structural: **a precise symbol map is derived data; it rots on every upstream merge and the design doc is the wrong place for it.** The design commits to a *method* and a small set of *invariants*, and delegates the exact map to a script that runs in CI. -**The method — `scripts/audit-cloud-symbols.sh`, Phase 0 task 1:** for each `platform.*` backend region and each plugin-bound `module/*.go` file, it greps every cross-file function/type reference and every `init()` that registers a *mix* of core-staying and plugin-bound factories, and emits the authoritative ownership + `init()`-partition map. Committed with Phase 0, re-run in CI on every subsequent phase PR. The design never transcribes its output — the script *is* the source of truth. A mixed-`init()` or a cross-file symbol edge into a to-be-deleted file is a Phase 0 (or phase-PR) **CI failure**, not a reviewer's catch. +**Invariants (load-bearing; the script verifies them, it doesn't discover them):** +- `module/cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) is the provider-agnostic *declared-config* holder — **it stays in core, is never deleted by any phase**, and is the credential symbol-home all cloud platform code binds to. The `PlatformBackend` contract carries the declared `CloudCredentials` across the boundary. +- `module/platform_kubernetes_kind.go` co-locates **core-staying** backends (`kindBackend` serving kind+k3s, `eksErrorBackend` serving eks) and **plugin-bound** cloud backends (`gkeBackend`, `aksBackend`) behind a *single shared `func init()`* (verified — `init()` registers all five names). Splitting it requires partitioning that `init()`. **Phase 0 does exactly this** — and it is the *only* `platform.*` file needing a split, because #653 already removed the rest. 
+- `cloud_account_aws.go` is **dead code, deleted outright by Phase B.** It defines `AWSConfig()` / `AWSConfigProvider` / `ValidateCredentials` — all pure SDK — and a repo-wide grep confirms **zero non-test consumers** of any of them (`awsProviderFrom` and its consumers were removed by #653). No consumer rewrite, no helper relocation: earlier drafts' "8-consumer rewrite" and "`parseStringSlice` relocation" are obsolete — `parseStringSlice` and `safeIntToInt32` **no longer exist anywhere in `module/`** (verified). There is no shared-helper-relocation work in this design. -## Phase 0 — precursor: isolate every cloud backend behind a uniform file convention +**The method — `scripts/audit-cloud-symbols.sh`, Phase 0 task 1:** for the `platform_kubernetes_kind.go` split and each plugin-bound `module/*.go` file, it greps every cross-file function/type reference and asserts (a) no `init()` registers a *mix* of core-staying and plugin-bound factories, (b) no cross-file symbol edge from a core-staying file into a to-be-deleted file, (c) `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports after the Phase B resolver rewrite. Committed with Phase 0, re-run in CI on every subsequent phase PR. The design never transcribes its output — the script *is* the source of truth. -A mechanical, behavior-equivalent refactor landed **before** Phase A. It establishes — repo-wide across the `platform.*` family — the convention that makes every later phase a clean deletion: +## Phase 0 — precursor: split the one remaining mixed cloud-backend file -**1. Uniform `_core.go` / `_.go` file convention.** For each `platform.*` family, Phase 0 mechanically splits so that: -- `platform__core.go` (or the existing shell file) holds the module shell, the backend interface, the `mock` backend (and `kind`/`k3s` for kubernetes), and an `init()` registering **only** the core-staying backends. 
-- `platform__.go` holds exactly one cloud backend + its own import block + its own `init()` registering **only** that provider. +Post-#653, Phase 0 is small: a mechanical, behavior-equivalent split of the **single** `module/` file that still co-locates core-staying and plugin-bound cloud backends. -The families are **not** uniform in current layout — the audit script (Phase 0 task 1) determines per-family whether it is a one-file or two-file split, but the verified shapes are: -- **kubernetes** — one file (`platform_kubernetes_kind.go`) holds all four backends + the shared `init()`; splits into `platform_kubernetes_kind.go` (kind+k3s, keeps a core `init()`) / `_eks.go` / `_gke.go` / `_aks.go`, each with its own `init()`. -- **dns** — *two* files: `platform_dns.go` holds the shared `init()` + `dnsBackend` interface + `DNSBackendFactory` registry; `platform_dns_backends.go` holds both `mockDNSBackend` and `route53Backend` impls + the route53 SDK import. Phase 0 partitions the `init()` in `platform_dns.go` (mock-registration stays, aws-registration moves), moves `route53Backend` + the SDK import out of `platform_dns_backends.go` into a new `platform_dns_aws.go`, **and renames the now-mock-only `platform_dns_backends.go` → `platform_dns_core.go`** so the dns family conforms to the same `_core.go` / `_aws.go` naming as every other family — no special-case three-file layout. -- **ecs / networking / autoscaling** — each a single file with `mock`+`aws` backends + a shared `init()`; split like kubernetes (`_core.go` keeps mock + core `init()`, `_aws.go` takes the cloud backend + its own `init()`). +**1. Split `platform_kubernetes_kind.go`** into: +- `platform_kubernetes_core.go` — holds `kindBackend` (serves `kind` + `k3s`), `eksErrorBackend` (serves `eks`), and an `init()` registering **only** those three core-staying names. +- `platform_kubernetes_gke.go` — holds `gkeBackend` + its `google.golang.org/api/container` import + an `init()` registering only `gke`. 
+- `platform_kubernetes_aks.go` — holds `aksBackend` (incl. `azureToken`) + its `Azure/azure-sdk-for-go` import + an `init()` registering only `aks`. -The exact post-split file list is the audit-script's output, not hand-enumerated — but the *rule* is fixed: after Phase 0, no `init()` registers both a core-staying and a plugin-bound factory, and no file holds both a core-staying and a plugin-bound backend impl. +After the split, no `init()` registers both a core-staying and a plugin-bound factory, and no file holds both a core-staying and a plugin-bound backend impl. `platform_kubernetes.go` (the `PlatformKubernetes` shell, `kubernetesBackend` interface, `RegisterKubernetesBackend`, `intFromAny` helper) is untouched and stays in core. -**2. Relocate shared pure helpers into a new SDK-free core file** `module/cloud_helpers.go`: -- `parseStringSlice` — currently in `cloud_account_aws.go` (Phase-B-*deleted*). It **must** relocate or its staying/plugin-bound consumers break. -- `safeIntToInt32` — currently in `platform_kubernetes.go` (a *core-staying* file). Relocation here is not a hard necessity for core's sake (core could keep it) — it is done so the soon-to-extract `_eks.go`/`_gke.go`/`_aks.go` files have a clean, SDK-free copy-source when they move to the plugin. `platform_kubernetes.go` updates its own reference to the new `cloud_helpers.go` home. +**2. Create `scripts/audit-cloud-symbols.sh`** (the Cross-file-coupling method above). No shared-helper-relocation step — there are no shared helpers to relocate (verified). -Both are ≤15-line pure functions, no SDK, no state. `cloud_helpers.go` stays in core permanently; when a plugin-bound file moves to its plugin it gets its own copy of whichever helpers it uses (duplicating a pure stdlib-only helper across a process boundary is correct, not a smell — the shared plugin-side util module is the Alternatives-Considered-#3 follow-up). 
The audit script confirms the relocation is *complete* — no file references the helpers from their old homes. +This is **not** "zero logic change" — partitioning a shared `init()` distributes registration calls across files. It is *behavior-equivalent*: the same five backend names are registered after the split as before. -This is **not** "zero logic change" — partitioning a shared `init()` distributes registration calls across files. It is *behavior-equivalent*: the same backend names are registered after the split as before. The design states this plainly rather than mislabelling it. +**Phase 0 acceptance criteria:** `go build ./... && go vet ./... && go test ./module/...` green; `scripts/audit-cloud-symbols.sh` committed, output shows no mixed `init()` and no cross-file edge into a to-be-deleted file; `git diff` is pure code movement + mechanical `init()` partition, no logic edits. -**Phase 0 acceptance criteria:** `go build ./... && go vet ./... && go test ./module/...` green; `scripts/audit-cloud-symbols.sh` committed, and its output shows (a) no `init()` mixing core-staying + plugin-bound registrations, (b) no cross-file symbol edge from a plugin-bound file into a to-be-deleted file, (c) the helper relocation complete; `git diff` is pure code movement + mechanical `init()` partition, no logic edits. After Phase 0, every subsequent phase deletes *only* `_.go` files — self-contained at import-block, `init()`, AND symbol level. - -**Phase 0 rollback:** a file-split + `init()` partition + helper relocation with no behavior diff — revert is a single `git revert`, no contract, no go.mod, no runtime impact. The one phase with a trivial rollback story. +**Phase 0 rollback:** a file-split + `init()` partition with no behavior diff — revert is a single `git revert`, no contract, no go.mod, no runtime impact. ## Phases -Each phase is one workflow-core PR (deleting files + wiring the contract dispatch) plus one PR per affected plugin. 
Within a phase, the plugin PR may merge ahead of the core PR — core keeps the old in-process path until the contract dispatch is wired in the core PR, so a plugin implementing the published proto is harmless to load early. **Atomicity rule:** within a core PR, a deleted file and every file that references its symbols are removed in the *same commit* (the build gate enforces this — a dangling reference fails CI). +Each phase is one workflow-core PR (deleting/editing files + wiring the contract dispatch) plus one PR per affected plugin. Within a phase, the plugin PR may merge ahead of the core PR — core keeps the old in-process path until the contract dispatch is wired in the core PR. **Atomicity rule:** within a core PR, a deleted file and every file referencing its symbols are removed in the *same commit* (the build gate enforces this). -**Phase A — Azure** (smallest, validates BOTH new contracts end-to-end): +**Phase A — Azure** (smallest; validates BOTH new contracts end-to-end): - Run the state-backend benchmark task; lock the `IaCStateBackend` proto shape. -- Run the `platform.*` interface-audit spike; lock or re-scope the `PlatformBackend` proto shape (Alternatives Considered #1). +- Run the `kubernetesBackend` interface-audit spike; lock or re-scope the `PlatformBackend` proto shape. - Add `IaCStateBackend` + `PlatformBackend` services to `plugin/external/proto/iac.proto`. -- Add the secret-redaction task + the gRPC-interceptor guard test (security tasks, blocking). +- Add the secret-redaction task + the gRPC-interceptor guard test (blocking). - workflow-plugin-azure implements `azure_blob` `IaCStateBackend` + `aks` `PlatformBackend`. -- Core PR: delete `iac_state_azure.go`; strip the `azure_blob` case + `newAzureSharedKeyCredential` from `iac_module.go` **(this + the deletion is what drops `Azure/azure-sdk-for-go` from go.mod)**; delete `platform_kubernetes_aks.go` (from Phase 0) and wire its `PlatformBackend` dispatch. 
- -**Phase B — AWS** (largest — 13 SDK-importing files, 3 surfaces). After Phase 0's split, every `platform.*` AWS backend lives in its own `_aws.go` file with its own `init()` — so Phase B deletes `_aws.go` / `_eks.go` files cleanly, never a mixed file. Inventory + destination (file names post-Phase-0; the authoritative list is the audit-script output): - -| core file (post Phase 0) | destination | atomicity note | -|--------------------------|-------------|----------------| -| `iac_state_spaces.go` | aws plugin — `s3` `IaCStateBackend` (DELETE from core) | shared with `spaces` — see Phase D | -| `cloud_account_aws.go` | DELETE — `AWSConfig()` / `AWSConfigProvider` / `awsProviderFrom` / `ValidateCredentials` (all pure SDK; `parseStringSlice` relocated in Phase 0). **Forces the 8-consumer rewrite** — see §Cross-file-coupling invariant 3 | **same commit as all 8 `awsProviderFrom` consumers' rewrites** — deleting the interface without rewriting its consumers fails the build | -| `cloud_account_aws_creds.go` | **EDIT** (not delete) — remove the SDK-bearing tails of `awsProfileResolver`/`awsRoleARNResolver`; file becomes SDK-free, stays in core (§Architecture-2) | the resolver `init()` registrations stay — `provider: aws` credential resolution still works in-core, now declare-only | -| `platform_kubernetes_eks.go` | aws plugin — `eks` `PlatformBackend` | **same commit as `cloud_account_aws*.go`** | -| `platform_ecs_aws.go` | aws plugin — `PlatformBackend` (`ecs`) | `_core.go` with the `mock` ECS backend stays | -| `platform_networking_aws.go` | aws plugin — `PlatformBackend` (`networking`/ec2) | `_core.go` with the `mock` networking backend stays | -| `platform_autoscaling_aws.go` | aws plugin — `PlatformBackend` (`autoscaling`) | `_core.go` with the `mock` autoscaling backend stays | -| `platform_dns_aws.go` (created by Phase 0 from `platform_dns_backends.go`) | aws plugin — `PlatformBackend` (`dns`/route53) | `platform_dns_core.go` (Phase-0 rename of 
`platform_dns_backends.go`) with `mockDNSBackend` + the `platform_dns.go` shell stay | -| `aws_api_gateway.go` | aws plugin — `aws.apigateway` module | — | -| `platform_apigateway.go` | aws plugin — `PlatformBackend` or `aws.apigateway` (gated on interface-audit spike) | — | -| `codebuild.go` | aws plugin — `aws.codebuild` module | — | -| `pipeline_step_s3_upload.go` | aws plugin — `step.s3_upload` | — | -| `s3_storage.go` | aws plugin — `storage.s3` module | — | - -- Core PR also: **strip the `spaces` case from `iac_module.go`** (it calls `NewSpacesIaCStateStore` from the deleted `iac_state_spaces.go` — same compile-dependency pattern as Phase A's `azure_blob` strip). Drop `aws-sdk-go-v2` from go.mod. (The `_core.go` files holding the `mock` ECS/networking/autoscaling/DNS backends + their interfaces + module shells **stay in core** — only the `_aws.go` files leave.) - -**Phase C — GCP** (3 files): +- Core PR: delete `iac_state_azure.go`; strip the `azure_blob` case + `newAzureSharedKeyCredential` from `iac_module.go`; delete `platform_kubernetes_aks.go` (the Phase-0 split file) and wire its `PlatformBackend` dispatch. This drops `Azure/azure-sdk-for-go` from `go.mod`. 
+ +**Phase B — AWS.** Inventory + destination (the authoritative list is the audit-script output): + +| core file | disposition | atomicity note | +|-----------|-------------|----------------| +| `iac_state_spaces.go` | DELETE → aws plugin `s3` `IaCStateBackend` | shared with `spaces` — see Phase D | +| `cloud_account_aws.go` | DELETE outright — dead code, **zero non-test consumers verified** | no consumer rewrite; trivial deletion | +| `cloud_account_aws_creds.go` | **EDIT** — rewrite `awsProfileResolver`/`awsRoleARNResolver` bodies SDK-free; file stays in core | the resolver `init()` registrations stay — `provider: aws` credential resolution still works in-core, now declare-only | +| `nosql_dynamodb.go` | DELETE → aws plugin `nosql.dynamodb`; drop the entry from `plugins/datastores/plugin.go` | same commit as the built-in-plugin factory-map edit | +| `pipeline_step_s3_upload.go` | DELETE → aws plugin `step.s3_upload`; drop from `plugins/pipelinesteps/plugin.go` | same commit | +| `s3_storage.go` | DELETE → aws plugin `storage.s3`; drop from `plugins/storage/plugin.go` | same commit | + +- Core PR also: **strip the `spaces` case from `iac_module.go`** (it calls `NewSpacesIaCStateStore` from the deleted `iac_state_spaces.go`). Drop `aws-sdk-go-v2` from `go.mod`. +- **No AWS `platform.*` work** — #653 already stubbed `eks` (`eksErrorBackend` stays in core) and removed `platform/providers/aws/`. +- `storage_artifact_s3.go` stays in core (comment-only SDK reference). + +**Phase C — GCP:** - workflow-plugin-gcp implements `IaCStateBackend` (`gcs`), `PlatformBackend` (`gke`), plugin-native `storage.gcs`. -- Core PR: delete `iac_state_gcs.go`, `storage_gcs.go`, `platform_kubernetes_gke.go` (from Phase 0); drop `cloud.google.com/go` + `google.golang.org/api`. After Phase C, `go list -deps ./...` shows zero packages from the three in-scope SDK trees (`aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com`+`google.golang.org/api`) — the permanent CI gate is added here. 
(`godo` remains — out of scope, see Problem.) +- Core PR: delete `iac_state_gcs.go`, `storage_gcs.go` (drop the entry from `plugins/storage/plugin.go`), `platform_kubernetes_gke.go` (the Phase-0 split file); strip the `gcs` case from `iac_module.go`; drop `cloud.google.com/go` + `google.golang.org/api`. After Phase C, `go list -deps ./...` shows zero packages from the three in-scope SDK trees — the permanent CI gate is added here. (`godo` remains — out of scope.) **Phase D — DigitalOcean (`spaces` clean-break):** - workflow-plugin-digitalocean implements `IaCStateBackend` for `spaces` (S3-compatible — pulls `aws-sdk-go-v2/service/s3`, the one service package, not the whole tree). -- **This is a clean break, not soft-compat.** `iac_state_spaces.go` + the `spaces` case in `iac_module.go` are deleted by **Phase B's core PR** (`iac_state_spaces.go` is the one S3-compatible store backing *both* `s3` and `spaces`). After Phase B's core PR merges, `iac.state` with `backend: spaces` fails to build unless the DO plugin version implementing `IaCStateBackend` is loaded. +- **Clean break, not soft-compat.** `iac_state_spaces.go` + the `spaces` case in `iac_module.go` are deleted by **Phase B's core PR** (`iac_state_spaces.go` is the one S3-compatible store backing *both* `s3` and `spaces`). After Phase B's core PR merges, `iac.state` with `backend: spaces` fails to build unless the DO plugin version implementing `IaCStateBackend` is loaded. - **Minor version bump** on workflow-plugin-digitalocean (compatibility-break marker) + `minEngineVersion` set to the core version that drops the in-core `spaces` case + migration doc. -- **Sequencing:** the DO plugin PR (implementing `spaces` `IaCStateBackend`) MUST merge + release before Phase B's core PR merges — otherwise there is a window where `backend: spaces` has no implementation anywhere. Writing-plans orders the DO plugin PR as a Phase-B blocker. 
+- **Sequencing:** the DO plugin PR (implementing `spaces` `IaCStateBackend`) MUST merge + release **before** Phase B's core PR merges — otherwise `backend: spaces` has no implementation anywhere. Writing-plans orders the DO plugin PR as a Phase-B blocker. ## Migration (user-facing) Published in each plugin's CHANGELOG + a consolidated `docs/migrations/2026-05-14-cloud-sdk-extraction.md`: -- `iac.state` with `backend: s3|azure_blob|gcs|spaces` → load the matching plugin (`wfctl plugin install workflow-plugin-{aws,azure,gcp,digitalocean}`). yaml `backend:` value unchanged. **Hard requirement after the relevant phase merges** — the in-core backend is deleted, not deprecated. -- `platform.kubernetes` / `platform.ecs` / etc. with a cloud `provider:` → load the matching plugin. yaml `provider:` value unchanged. Hard requirement after the relevant phase. -- `aws.apigateway` and other former `cloud.account`-brokered AWS modules → module type renamed to plugin-native form; `credentials:` block moves inline (or `credentials_ref:` an `aws.credentials` module). **This is the only yaml-shape change.** -- `memory` / `filesystem` / `postgres` state backends, `kind` k8s backend → no change, still core. +- `iac.state` with `backend: s3|azure_blob|gcs|spaces` → load the matching plugin (`wfctl plugin install workflow-plugin-{aws,azure,gcp,digitalocean}`). yaml `backend:` value unchanged. **Hard requirement after the relevant phase merges.** +- `platform.kubernetes` with `provider: gke|aks` → load the matching plugin. yaml `provider:` value unchanged. (`kind`/`k3s`/`eks` unchanged — still core.) +- `nosql.dynamodb`, `step.s3_upload`, `storage.s3`, `storage.gcs` → load the matching plugin. Module/step type names unchanged; `credentials:` block moves inline (or `credentials_ref:` points to an in-plugin `aws.credentials`/`gcp.credentials` module).
**This inline-credentials move is the only yaml-shape change.** +- `memory` / `filesystem` / `postgres` state backends, `kind`/`k3s`/`eks` k8s backends, `storage.artifact` (`storage_artifact_s3.go`) → no change, still core. ## Assumptions -1. **gRPC's 4 MB default message cap covers real-world IaC state files.** If a deployment's state exceeds 4 MB the unary `IaCStateBackend` contract needs streaming — the benchmark task validates the typical case but a hostile-large state is out of initial scope (documented limitation, not a silent failure: `SaveState` returns a clear "state exceeds transport limit" error). The benchmark runs before the proto is locked. -2. **The `platform.*` backend interfaces are cleanly provider-separable.** The design assumes `kubernetesBackend` / `ecsBackend` / etc. are interface-segregated such that the `kind` impl can stay while cloud impls extract. **This is the most fragile assumption** — the Phase 0/A interface-audit spike (first writing-plans task) validates it; if a backend interface leaks SDK types into the core module shell, that shell needs an interface-extraction refactor first and the phase re-scopes. Phase 0's mechanical split + helper relocation de-risks this structurally: after Phase 0, the audit operates on already-separated files, not an assertion about an unsplit one. -3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend`/`PlatformBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive, core ignores unknown backend registrations until its own half lands. -4. **`aws-sdk-go-v2/service/s3` in workflow-plugin-digitalocean is acceptable.** DO Spaces is S3-API; there is no godo-native Spaces client. The DO plugin already carries `godo`; adding one AWS service package is the minimal cost of self-contained `spaces` state support (vs. forcing DO users to also load workflow-plugin-aws). -5. 
**The credential resolvers can all be made SDK-free in-core.** `cloud_account_azure.go` / `cloud_account_gcp.go` are *already* SDK-free (verified — SDK references only in comments); `cloud_account_aws_creds.go`'s `awsStaticResolver`/`awsEnvResolver` are already SDK-free, and `awsProfileResolver`/`awsRoleARNResolver` become SDK-free once their resolution tails are removed (Phase B edit). The load-bearing assumption is that **a resolver does not *need* to resolve in-core** — for the deferred credential types (profile/role_arn/managed-identity/cli/workload-identity/ADC) it is sufficient for the in-core resolver to record the declared inputs + an `Extra["credential_source"]` marker, and for the plugin to honor the marker. If some credential type genuinely cannot be expressed as "declared inputs + marker" (none identified — even ADC is just a marker), that type would need a different mechanism. The plugin **must** implement marker handling for every deferred type, not just AWS profile/role_arn. -6. **No core code outside `module/` imports these SDKs.** Verified: the only real `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` imports are under `module/`. `cmd/`, `engine.go`, `schema/`, `plugin/` are clean. A `go list -deps` CI gate in the final phase enforces this permanently. +1. **gRPC's 4 MB default message cap covers real-world IaC state files.** If a deployment's state exceeds 4 MB the unary `IaCStateBackend` contract needs streaming — the benchmark task validates the typical case; a hostile-large state is out of initial scope (`SaveState` returns a clear "state exceeds transport limit" error). The benchmark runs before the proto is locked. +2. **`kubernetesBackend` is cleanly provider-separable.** The design assumes the `kubernetesBackend` interface is segregated such that `kindBackend`/`eksErrorBackend` can stay while `gkeBackend`/`aksBackend` extract. 
Post-#653 this is **much less fragile than earlier drafts** — there is one interface, not five, and `eksErrorBackend` already proves a core-staying SDK-free impl coexists with cloud impls behind the same interface. The Phase 0/A interface-audit spike still validates it formally before the proto lock. +3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend`/`PlatformBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive. +4. **`aws-sdk-go-v2/service/s3` in workflow-plugin-digitalocean is acceptable.** DO Spaces is S3-API; there is no godo-native Spaces client. Adding one AWS service package is the minimal cost of self-contained `spaces` state support. +5. **The credential resolvers can all be made SDK-free in-core.** `cloud_account_azure.go` / `cloud_account_gcp.go` are *already* SDK-free (verified — 0 SDK imports); `cloud_account_aws_creds.go`'s `awsStaticResolver`/`awsEnvResolver` are already SDK-free, and `awsProfileResolver`/`awsRoleARNResolver` become SDK-free once their SDK blocks are rewritten out (Phase B). The load-bearing assumption: a resolver does not *need* to resolve in-core — for deferred credential types it records declared inputs + an `Extra["credential_source"]` marker, and the plugin honors the marker. The plugin **must** implement marker handling for every deferred type. +6. **No core code outside `module/` imports these SDKs.** Verified: the only real `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` / `google.golang.org/api` imports are under `module/`. A `go list -deps` CI gate in Phase C enforces this permanently. +7. **#653 is final and merged.** This design builds on `origin/main` post-#653. If #653 work were reverted, this design's file inventory would need re-baselining — but #653's issue is *closed* and all three PRs are merged, so this is a stable foundation. 
## Rollback This design changes **plugin loading paths** and **go.mod dependency trees** — runtime-affecting per the `runtime-launch-validation` trigger list. -- **Per-phase revert:** each phase is an isolated core PR + plugin PR(s). Reverting the **core PR** restores the in-process backend `switch` / `platform.*` cloud backends and re-adds the SDK to `go.mod` — the deleted files are recoverable from git. The plugin PRs are additive (new contract impls / module types) and can stay merged harmlessly even if core reverts. **Phase D has no separate core PR** — its core deletion *is* Phase B's core PR — so a Phase D rollback means reverting Phase B's core PR + the DO plugin PR together. +- **Per-phase revert:** each phase is an isolated core PR + plugin PR(s). Reverting the **core PR** restores the in-process backend `switch` / cloud backends and re-adds the SDK to `go.mod` — deleted files recoverable from git. Plugin PRs are additive and can stay merged harmlessly even if core reverts. **Phase D has no separate core PR** — its core deletion *is* Phase B's core PR — so a Phase D rollback means reverting Phase B's core PR + the DO plugin PR together. - **Forward-fix preferred over revert:** because core keeps the old in-process path until the contract dispatch is wired *in the same core PR*, a broken phase fails at PR CI (image-launch / strict-contracts gates), not in production. The revert path exists but the gate is the primary safety. -- **`spaces` clean-break (Phase B core PR + Phase D plugin PR):** the only change with an external-user-visible compat break. Rollback = revert Phase B's core PR (restores `iac_state_spaces.go` + the `spaces` case) **and** revert the DO plugin minor bump, together — they are a matched pair. 
The migration doc + the DO plugin's `minEngineVersion` bump is the forward guard: a user on a core version past Phase B without the new DO plugin gets a clear build-time "backend spaces requires workflow-plugin-digitalocean ≥ X" error, not a silent failure. +- **`spaces` clean-break (Phase B core PR + Phase D plugin PR):** the only change with an external-user-visible compat break. Rollback = revert Phase B's core PR (restores `iac_state_spaces.go` + the `spaces` case) **and** revert the DO plugin minor bump, together — a matched pair. The migration doc + the DO plugin's `minEngineVersion` bump together are the forward guard. ## Alternatives Considered -1. **Fold cloud platform provisioners into the existing `IaCProviderRequired` / `ResourceDriver` contracts instead of inventing `PlatformBackend`.** An EKS/GKE/AKS cluster — and arguably an ECS service, a Route53 zone, an EC2 VPC — is structurally a managed resource with create/plan/apply/destroy/status, which is exactly what the battle-tested `ResourceDriver` contract already models (8 services in `iac.proto`, multiple ADRs through the strict-contracts cutover). Inventing `PlatformBackend` risks the lowest-common-denominator problem (self-challenge doubt #1). **Rejected as the default** because the `platform.*` modules have a distinct plan/apply *lifecycle surface* (they sync against live cloud state continuously, not just declaratively reconcile) and a distinct `provider:` UX the user explicitly asked to preserve — but **retained as the gated fallback**: the Phase 0/A interface-audit spike decides. If the five `platform.*` backend interfaces don't unify behind one `Plan/Apply/Destroy`, the implementation folds them into `ResourceDriver` rather than shipping a bad `PlatformBackend`. -2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the Goal from "core drops `aws-sdk-go-v2/*` entirely" to "drops the AWS *service-provider* tree, keeps one S3 client."
The S3 client is small and stable; DO Spaces + AWS S3 are the same API; keeping one shared S3-compatible store in core avoids forcing *both* the AWS and DO plugins to each carry an S3 client and avoids a clean-break for existing `spaces` users. **Rejected** because it leaves dependabot churning one AWS package indefinitely and weakens the "core drops the three in-scope SDK trees" invariant the `go list -deps` gate enforces — a partial extraction is a maintenance trap. The cost (both aws + DO plugins carry an S3 client) is real but bounded: it's one service package, and each plugin is independently versioned anyway. -3. **A shared `s3compat` Go module consumed by both the aws and DO plugins** (instead of each independently re-implementing the S3-compatible state store + `buildAWSConfig`). Keeps the three-in-scope-trees invariant intact while eliminating the cross-plugin duplication Alternative #2 dismisses as "bounded." **Deferred, not rejected:** it is a *plugin-side* optimisation that doesn't affect the core contract or any phase boundary, so it can land as a follow-up after the extraction is proven. Forcing it into the critical path now couples the aws and DO plugin release cadences; the duplication is a small, well-understood `buildAWSConfig` + thin S3 wrapper. Writing-plans logs it as a post-extraction cleanup candidate. +1. **Fold the cloud Kubernetes provisioners into the existing `IaCProviderRequired` / `ResourceDriver` contract instead of inventing `PlatformBackend`.** A GKE/AKS cluster is structurally a managed resource with create/plan/apply/destroy/status — exactly what `ResourceDriver` already models. **Rejected as the default** because `platform.kubernetes` has a distinct `provider:` UX the user explicitly asked to preserve, and a continuous-reconciliation lifecycle surface — but **retained as the gated fallback**: the Phase 0/A `kubernetesBackend` interface-audit spike decides. 
Post-#653 the case for a dedicated `PlatformBackend` is *weaker* (only 2 cloud backends, both Kubernetes) — the spike may well conclude `ResourceDriver` suffices. The design defers to the spike rather than pre-committing. +2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the Goal from "core drops `aws-sdk-go-v2/*` entirely" to "keeps one S3 client." **Rejected** because it leaves dependabot churning one AWS package indefinitely and weakens the `go list -deps` gate. The cost (both aws + DO plugins carry an S3 client) is real but bounded — one service package, independently versioned. +3. **A shared `s3compat` Go module consumed by both the aws and DO plugins** (instead of each re-implementing the S3-compatible state store + `buildAWSConfig`). **Deferred, not rejected:** a *plugin-side* optimisation that doesn't affect the core contract or any phase boundary — lands as a follow-up after the extraction is proven. Writing-plans logs it as a post-extraction cleanup candidate. 4. **In-process Go-module plugin loading (build-tag imports) instead of gRPC sidecars.** Rejected in brainstorm by explicit user decision — strict gRPC sidecar model only. +5. **Wait for / extend #653 to also extract state backends + `platform.kubernetes`.** #653's issue is closed with an explicit scope boundary ("RBAC/secrets/artifact stay"). Extending a closed issue rather than opening a clearly-scoped successor would muddy the audit trail. **Rejected** — this design is the named successor and cites #653 as predecessor. -## Self-challenge — top doubts surfaced (carried forward, with mitigations now wired into phases) - -Two distinct mitigations cover three doubts (#1 and #2 share the interface-audit spike — that is intentional, not redundant coverage theatre): +## Self-challenge — top doubts surfaced (carried forward, with mitigations wired into phases) -1. **`PlatformBackend` may be over-general** AND **2. 
clean provider-separability (Assumption 2) is fragile.** Both are settled by the *one* interface-audit spike — Phase 0/A task 1, ordered before the proto lock. If the five `platform.*` backend interfaces don't unify behind one `Plan/Apply/Destroy`, the fallback is folding cloud platform provisioners into `ResourceDriver` (Alternatives Considered #1); if a backend interface leaks SDK types into its core module shell, the phase re-scopes to do the interface-extraction refactor first. Phase 0's mechanical file-split also de-risks #2 structurally — each backend's imports are isolated before any extraction. -3. **The state-backend benchmark could come back "streaming required"** and reshape the `IaCStateBackend` proto. Mitigation: benchmark is a Phase A task ordered *before* the proto lock — the proto is not committed until the benchmark result is in. +1. **`PlatformBackend` may be over-general** AND **2. clean provider-separability (Assumption 2) is fragile.** Both are settled by the *one* `kubernetesBackend` interface-audit spike — Phase 0/A task 1, ordered before the proto lock. Post-#653 both doubts are materially smaller: one interface, two cloud impls, and `eksErrorBackend` already demonstrates an SDK-free core impl behind that interface. If `kubernetesBackend`'s 4 methods don't map cleanly onto `Plan/Apply/Destroy/Status`, the fallback is folding into `ResourceDriver` (Alternatives #1). +3. **The state-backend benchmark could come back "streaming required"** and reshape the `IaCStateBackend` proto. Mitigation: benchmark is a Phase A task ordered *before* the proto lock. +4. **The inventory could be stale again** — cycle 8 caught exactly this (the design predated #653). 
Mitigation: every file/symbol claim in this revision is grep-verified against `origin/main` HEAD, the worktree is confirmed 0 commits behind `origin/main`, and `scripts/audit-cloud-symbols.sh` (Phase 0 task 1) makes the inventory a CI-enforced build artifact from Phase 0 onward — not a prose claim that can rot. ## Open items deferred to writing-plans - Exact proto field layouts for both new contracts (sketches above are directional; field-level layout follows the benchmark + interface-audit results). -- Whether `PlatformBackend` ships as designed or folds into `ResourceDriver` (gated on the interface-audit spike — Alternatives Considered #1). +- Whether `PlatformBackend` ships as designed or folds into `ResourceDriver` (gated on the `kubernetesBackend` interface-audit spike — Alternatives Considered #1). - Benchmark harness location + the concrete acceptance threshold (p99 added latency bar). - Exact wording of the secret-redaction extension + whether existing redaction already covers `credentials:` keys. -- The `s3compat` shared-module cleanup (Alternatives Considered #3) — logged as a post-extraction follow-up candidate, not in the critical path. +- The `s3compat` shared-module cleanup (Alternatives Considered #3) — logged as a post-extraction follow-up candidate. - Per-plugin CHANGELOG entries + the consolidated migration doc wording. From b9facf5f07b8d9f74939397316cb137e7604681d Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 01:47:05 -0400 Subject: [PATCH 11/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20cycle-9=20re-baseline=20+=20audit=20script?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cycle-9 adversarial review caught aksBackend mis-classified as an azure-sdk importer: platform_kubernetes_kind.go's azure-sdk-for-go match is a stale doc comment (line 332) — aksBackend.azureToken is a plain net/http OAuth2 client. 
An import-block-disciplined re-survey found a second comment-only false positive: nosql_dynamodb.go. Structural fix for the recurring "grep matched a comment" defect class: added scripts/audit-cloud-symbols.sh, which parses Go import(...) blocks (never comments) and emits the comment-immune real-import map. Its output now populates every file table in the design — prose claims replaced by a build artifact. Formalized + CI-wired in Phase 0. Corrected inventory (audit-script output): AWS 5 real-import files (not 6), Azure 2 (not 3), GCP 3. nosql_dynamodb.go + storage_artifact_s3.go are comment-only stubs — out of scope, stay in core. Design consequences of aksBackend being SDK-free: - Only gkeBackend carries a cloud platform SDK. kind/k3s/eks/aks all stay in core. - Architecture §2 no longer proposes a new PlatformBackend contract. The gke cross-process mechanism is gated on an interface-audit spike whose preferred outcome is folding into the existing ResourceDriver contract — a dedicated contract for one backend is YAGNI. - Phase A (Azure) is now pure IaCStateBackend — touches no platform file. - Phase 0 splits platform_kubernetes_kind.go into _core.go (kind/k3s/ eks/aks — all SDK-free) + _gke.go (the lone SDK-bearing backend), and fixes the stale line-332 comment. - The gke platform extraction + its contract decision move to Phase C. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 248 +++++++++--------- scripts/audit-cloud-symbols.sh | 92 +++++++ 2 files changed, 211 insertions(+), 129 deletions(-) create mode 100755 scripts/audit-cloud-symbols.sh diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index b8299eb3..77e8dd1d 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -1,7 +1,7 @@ # Cloud-SDK Extraction: workflow core → strict-contract plugins **Date:** 2026-05-14 -**Status:** Design — re-baselined after adversarial review cycle 8 (design verified against current `origin/main`, post-#653) +**Status:** Design — re-baselined after adversarial review cycle 9 (inventory regenerated by `scripts/audit-cloud-symbols.sh`, import-block-disciplined) **Owner:** autonomous pipeline (workflow#TBD) ## Relationship to issue #653 (predecessor — read this first) @@ -12,54 +12,58 @@ Issue **#653** ("Audit AWS SDK usage in workflow core") closed 2026-05-13. Its t - **PR #659** — stripped the AWS SDK from the `codebuild` and `EKS` backends (replaced `codebuildAWSBackend` and `eksBackend` with SDK-free migration-error stubs). - **PR #662** — tombstoned `platform/providers/aws/` and promoted the EKS absent-package CI gate. -Consequence: files this design's *earlier drafts* (cycles 1–7) enumerated — `aws_api_gateway.go`, `codebuild.go`, `platform_apigateway.go`, `platform_autoscaling.go`, `platform_ecs.go`, `platform_networking.go`, the `awsProviderFrom` helper, the `platform_*_aws` backends — **no longer exist in `module/`**. #653 explicitly scoped *out* "RBAC/secrets/artifact" AWS usage ("RBAC/secrets/artifact stay"). 
**This design is #653's successor: it extracts the AWS SDK surface #653 deliberately left, plus the entirely-untouched Azure and GCP surfaces.** Every file/symbol claim below is grep-verified against `origin/main` HEAD (the worktree branch is 0 commits behind `origin/main`). +Consequence: files this design's *earlier drafts* (cycles 1–7) enumerated — `aws_api_gateway.go`, `codebuild.go`, `platform_apigateway.go`, `platform_autoscaling.go`, `platform_ecs.go`, `platform_networking.go`, the `awsProviderFrom` helper, the `platform_*_aws` backends — **no longer exist in `module/`**. #653 explicitly scoped *out* "RBAC/secrets/artifact" AWS usage. **This design is #653's successor: it extracts the AWS SDK surface #653 deliberately left, plus the entirely-untouched Azure and GCP surfaces.** -## Problem +## Inventory provenance — `scripts/audit-cloud-symbols.sh` -Workflow core's `module/` package still imports three cloud SDK trees directly. Real-import counts (comment-only matches excluded — e.g. `storage_artifact_s3.go` *names* `aws-sdk-go-v2` only in a doc comment and stays in core): +Cycles 2–9 each found a hand-maintained file/symbol claim wrong. The recurring defect was *grep matching an SDK name inside a doc comment, not a real import* (cycle 9 caught `platform_kubernetes_kind.go`; the same pass found `nosql_dynamodb.go` is also comment-only). The fix is structural: a script — `scripts/audit-cloud-symbols.sh`, committed alongside this design and formalized/extended in Phase 0 — parses the Go `import (...)` block of every `module/*.go` file and emits the comment-immune real-import map. 
**Every file claim below is that script's output, not a prose assertion.** Current output: -| SDK tree | Files with a real import | how core sheds it | -|----------|--------------------------|-------------------| -| `github.com/aws/aws-sdk-go-v2/*` | **6** — `cloud_account_aws.go`, `cloud_account_aws_creds.go`, `iac_state_spaces.go`, `nosql_dynamodb.go`, `pipeline_step_s3_upload.go`, `s3_storage.go` | 4 deleted, `cloud_account_aws_creds.go` edited (resolver bodies rewritten SDK-free, stays in core), `iac_module.go` edited (strip `spaces` case) | -| `github.com/Azure/azure-sdk-for-go/sdk/*` (azcore + azblob) | **3** — `iac_state_azure.go`, `iac_module.go`, `platform_kubernetes_kind.go` (the `aksBackend`) | `iac_state_azure.go` deleted, `iac_module.go` edited (strip `azure_blob` case + `newAzureSharedKeyCredential`), `platform_kubernetes_aks.go` deleted (post Phase-0 split) | -| `cloud.google.com/go/storage` + `google.golang.org/api/*` | **3** — `iac_state_gcs.go`, `storage_gcs.go`, `platform_kubernetes_kind.go` (the `gkeBackend`) | `iac_state_gcs.go` + `storage_gcs.go` deleted, `platform_kubernetes_gke.go` deleted (post Phase-0 split) | +| SDK tree | Real-import files (`module/`, `*_test.go` excluded) | comment-only (false positives — stay in core) | +|----------|------------------------------------------------------|-----------------------------------------------| +| `github.com/aws/aws-sdk-go-v2/*` | **5** — `cloud_account_aws.go`, `cloud_account_aws_creds.go`, `iac_state_spaces.go`, `pipeline_step_s3_upload.go`, `s3_storage.go` | `nosql_dynamodb.go`, `storage_artifact_s3.go` | +| `github.com/Azure/azure-sdk-for-go/sdk/*` | **2** — `iac_module.go`, `iac_state_azure.go` | `cloud_account_azure.go`, `platform_kubernetes_kind.go` | +| `cloud.google.com/go/storage` + `google.golang.org/api/*` | **3** — `iac_state_gcs.go`, `storage_gcs.go`, `platform_kubernetes_kind.go` (the `gkeBackend`) | — | -`cloud_account_azure.go` and `cloud_account_gcp.go` are **already 
SDK-free** (verified: 0 SDK imports — they are pure declare-don't-resolve resolver files). Only the AWS credential resolvers carry SDK. +Key facts this regenerated inventory establishes (all verified against `origin/main` HEAD; the worktree is 0 commits behind `origin/main`): +- **`aksBackend` is SDK-free.** `platform_kubernetes_kind.go`'s azure-sdk match is a *stale doc comment* (line 332); `aksBackend.azureToken()` is a plain `net/http` OAuth2 client-credentials POST against `login.microsoftonline.com`. No `azure-sdk-for-go` import. It stays in core, exactly like `kindBackend` and `eksErrorBackend`. +- **The only SDK-bearing `platform.*` backend is `gkeBackend`** (`google.golang.org/api/container/v1`). `kind`/`k3s`/`eks`/`aks` are all SDK-free. +- **`nosql_dynamodb.go` and `storage_artifact_s3.go` are comment-only stubs** (in-memory / filesystem fallbacks; the SDK names appear only in "Full implementation would use…" comments). They carry no cloud SDK and are **out of scope** — they stay in core untouched. +- `cloud_account_azure.go` / `cloud_account_gcp.go` are already SDK-free declare-don't-resolve resolver files. Only the AWS credential resolvers carry an SDK import. -Every dependabot bump of a cloud SDK churns workflow core's `go.sum`, inflates the binary, and couples core release cadence to vendor SDK release cadence. The `workflow-plugin-{aws,azure,gcp,digitalocean}` plugins already exist and already carry these SDKs for their IaC *resource provider* role — core's direct usage is redundant surface. +## Problem -Precedent: workflow#617 removed the legacy DigitalOcean IaC *resource* modules + `godo`; #653 removed the AWS IaC *resource* modules + AWS `platform/providers/`. This design extends the same principle to the *remaining* cloud functionality neither extraction touched: IaC **state backends**, the managed-Kubernetes **platform** provisioners, and a handful of standalone modules/steps.
+Every dependabot bump of a cloud SDK churns workflow core's `go.sum`, inflates the binary, and couples core release cadence to vendor SDK release cadence. The `workflow-plugin-{aws,azure,gcp,digitalocean}` plugins already exist and already carry these SDKs for their IaC *resource provider* role — core's direct usage is redundant surface. workflow#617 removed the DO IaC *resource* path + `godo`; #653 removed the AWS IaC *resource* modules + `platform/providers/aws/`. This design extends the same principle to the *remaining* cloud functionality neither extraction touched: IaC **state backends**, the one managed-Kubernetes **platform** backend that still carries an SDK (`gke`), and two standalone S3/GCS modules + one step. -**A fourth tree — `github.com/digitalocean/godo` — is still in core but out of scope here.** `module/cloud_account_do.go` + the `module/platform_do_*.go` files still import `godo`; workflow#617 removed the DO *IaC resource* path but the `platform.do_*` modules survived it. The user's ask scoped this work to three SDK trees (aws/azure/gcp); `godo` extraction is a structurally-identical follow-up but is **not** in this design's scope. Consequence: the `go list -deps` CI gate added in the final phase asserts **zero `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` / `google.golang.org/api` packages** — it does *not* assert "zero cloud SDKs" while `godo` remains. +**A fourth tree — `github.com/digitalocean/godo` — is still in core but out of scope here.** `module/cloud_account_do.go` + the `module/platform_do_*.go` files still import `godo`. The user's ask scoped this work to three SDK trees (aws/azure/gcp); `godo` extraction is a structurally-identical follow-up. Consequence: the `go list -deps` CI gate added in the final phase asserts **zero `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` / `google.golang.org/api` packages** — it does *not* assert "zero cloud SDKs" while `godo` remains. 
## Goals - workflow core `go.mod` drops `aws-sdk-go-v2/*`, `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, `google.golang.org/api/*` (the three in-scope trees) **entirely** — verified by a `go list -deps` gate in the final phase's CI asserting zero packages from those three trees. `godo` is out of scope. - Cloud functionality remains available, loaded via strict-contract gRPC plugins (the existing sidecar model). -- `kind` / `k3s` Kubernetes backends (no SDK) stay in core — local-dev/test path must not require a plugin. The `eks` backend is *already* an SDK-free migration-error stub (`eksErrorBackend`, courtesy #653) and stays in core unchanged. +- The SDK-free Kubernetes backends — `kind`, `k3s`, `eks` (a #653 migration-error stub), **and `aks`** (a `net/http` OAuth2 client) — stay in core unchanged. Local-dev/test paths must not require a plugin. ## Non-Goals -- Re-homing the IaC *resource provider* contract (`IaCProviderRequired`) — already extracted (#617, #653), not touched here. +- Re-homing the IaC *resource provider* contract (`IaCProviderRequired`) — already extracted (#617, #653). - Changing how plugins are discovered/installed (`wfctl plugin install` flow unchanged). -- Backwards-compatible yaml — this is a **clean break** with a migration guide (per workflow#617 / #653 precedent). -- **Removing `aws-sdk-go-v2/service/kinesis`.** `go mod why` resolves it to `workflow → workflow/module → modular/modules/eventbus/v2 → kinesis` — a **transitive dependency of `modular`**, not a direct workflow import. Out of scope; an upstream `modular` concern. -- **Re-doing #653's work.** The AWS IaC modules, `platform/providers/aws/`, and the codebuild/EKS backends are already gone. This design does not re-extract them. +- Backwards-compatible yaml — this is a **clean break** with a migration guide (per #617 / #653 precedent). +- **Removing `aws-sdk-go-v2/service/kinesis`** — a transitive dependency of `modular`, not a direct workflow import. 
An upstream `modular` concern. +- **Touching the comment-only stubs.** `nosql_dynamodb.go` and `storage_artifact_s3.go` carry no SDK — they are not part of an SDK-extraction effort. (That their "real" implementations are unbuilt stubs is a separate concern, not this design's.) +- **Re-doing #653's work** — the AWS IaC modules, `platform/providers/aws/`, and the codebuild/EKS backends are already gone. ## Architecture -Three extension surfaces, three handling strategies: +Two extension surfaces drive new work; a third is the existing plugin-native path. ### 1. IaC state backends → new `IaCStateBackend` strict proto contract -`iac.state` **stays a core module type**. The state store is engine infrastructure — the orchestrator reads/writes it during every plan/apply cycle — so it keeps a stable core seam. What changes: `config.backend` no longer dispatches a hardcoded `switch` in `module/iac_module.go`; instead core resolves an `IaCStateBackend` gRPC client from whichever loaded plugin registered that backend name. +`iac.state` **stays a core module type**. The state store is engine infrastructure — the orchestrator reads/writes it during every plan/apply cycle — so it keeps a stable core seam. What changes: `config.backend` no longer dispatches a hardcoded `switch` in `module/iac_module.go` (current cases: `memory`, `filesystem`, `spaces`, `gcs`, `azure_blob`, `postgres`); instead core resolves an `IaCStateBackend` gRPC client from whichever loaded plugin registered that backend name. The contract maps **1:1 onto the existing `module.IaCStateStore` interface** (`module/iac_state.go`) — six methods, no speculative surface: ```proto // Added as a new service INSIDE plugin/external/proto/iac.proto — matches the -// established precedent (iac.proto already holds multiple services; state + -// platform contracts version alongside the resource-provider contract). +// established precedent (iac.proto already holds multiple services). 
service IaCStateBackend { rpc GetState (GetStateRequest) returns (GetStateResponse); // → IaCStateStore.GetState rpc SaveState (SaveStateRequest) returns (SaveStateResponse); // → IaCStateStore.SaveState @@ -72,12 +76,9 @@ message GetStateResponse { IaCState state = 1; bool exists = 2; } message SaveStateRequest { IaCState state = 1; } // idempotent: full-state replace, last-writer-wins message ListStatesRequest { map<string, string> filter = 1; } message LockRequest { string resource_id = 1; } // 1:1 with IaCStateStore.Lock — no TTL field (see Failure modes) -// IaCState mirrors module.IaCState. A lock-lease/TTL field is a planned ADDITIVE -// follow-up (Failure modes §), deferred until the first plugin backend implements -// honored expiry so it ships with a conformance test instead of as a no-op field. ``` -Backend ownership — every cloud plugin implements the contract for its native storage: +Backend ownership: | backend name | plugin | storage | core file deleted | |--------------|--------|---------|-------------------| @@ -86,205 +87,194 @@ Backend ownership — every cloud plugin implements the contract for its native | `gcs` | workflow-plugin-gcp | Google Cloud Storage | `iac_state_gcs.go` | | `spaces` | workflow-plugin-digitalocean | DO Spaces (S3-compatible) | (shares `iac_state_spaces.go` deletion — see Phase D) | -`memory`, `filesystem`, `postgres` backends stay **in core** — no cloud SDK, no reason to extract. +`memory`, `filesystem`, `postgres` backends stay **in core** — no cloud SDK. -**Unary GET+SAVE vs streaming:** decided by benchmark, not assumption. The writing-plans phase includes a task that drives a 1 MB synthetic state blob through a full plan→apply cycle (GetState + SaveState + Lock + Unlock per resource batch) over unary RPC, measures p50/p99 added latency vs the in-process baseline, and only adopts chunked streaming if unary clears no acceptable bar.
Default build target: **unary** — (a) gRPC's default 4 MB message cap covers typical state files, (b) streaming adds protocol complexity that must be justified by data, (c) the in-process baseline this replaces was itself a single blob read/write. This task is ordered **before** the Phase A proto is locked. +**Unary GET+SAVE vs streaming:** decided by benchmark, not assumption. The writing-plans phase includes a task that drives a 1 MB synthetic state blob through a full plan→apply cycle over unary RPC, measures p50/p99 added latency vs the in-process baseline, and adopts chunked streaming only if unary fails to clear an acceptable latency bar. Default build target: **unary** (gRPC's 4 MB default cap covers typical state files; the in-process baseline this replaces was itself a single blob read/write). This benchmark task is ordered **before** the Phase A proto is locked. -### 2. Managed-Kubernetes platform provisioners → new `PlatformBackend` strict proto contract +### 2. The `gke` Kubernetes backend → gRPC extraction (contract shape gated on a spike) -**Post-#653 this surface is small.** The only `module/platform_*.go` file that still imports a cloud SDK is `platform_kubernetes_kind.go`, which holds **four** `kubernetesBackend` implementations behind one shared `init()`: +**Post-#653 and post-cycle-9-re-baseline, this surface is one backend.** `platform_kubernetes_kind.go` holds four `kubernetesBackend` implementations behind one shared `init()`: | backend | SDK | disposition | |---------|-----|-------------| | `kindBackend` (serves `kind` *and* `k3s`) | none | **stays in core** | -| `eksErrorBackend` (serves `eks`) | none (already a #653 migration-error stub) | **stays in core** unchanged | -| `gkeBackend` (serves `gke`) | `google.golang.org/api/container` | → workflow-plugin-gcp | -| `aksBackend` (serves `aks`) | `Azure/azure-sdk-for-go` | → workflow-plugin-azure | +| `eksErrorBackend` (serves `eks`) | none (#653 migration-error stub) | **stays in core** unchanged | +| `aksBackend` (serves `aks`) |
**none** — `net/http` OAuth2, not azure-sdk | **stays in core** | +| `gkeBackend` (serves `gke`) | `google.golang.org/api/container/v1` | → workflow-plugin-gcp | -There is **no** `platform.ecs` / `platform.networking` / `platform.autoscaling` / `platform.dns` / `platform.apigateway` cloud-SDK surface left — #653 removed it. `platform_dns.go` / `platform_dns_backends.go` still exist but carry **no cloud SDK import** (verified). So `PlatformBackend` serves exactly two cloud backends: `aks` and `gke`. +So **exactly one** `platform.*` backend carries a cloud SDK: `gkeBackend`. `platform.kubernetes` keeps its module type **and its `provider:` config key** in core — no yaml UX break — and the `kubernetesBackend` interface + the `kind`/`k3s`/`eks`/`aks` impls all stay in core. Only `gke`'s SDK-bearing implementation moves to a plugin. -`platform.kubernetes` keeps its module type **and its `provider:` config key** in core — no yaml UX break. The `kubernetesBackend` interface (`module/platform_kubernetes.go`) stays in core; the cloud impls move behind the `PlatformBackend` gRPC contract. +**A *new* `PlatformBackend` gRPC contract for a single backend is a YAGNI risk, and the design does not pre-commit to one.** Instead, the cross-process mechanism for `gke` is **gated on an interface-audit spike** (a writing-plans task, ordered before Phase C). The spike audits the 4-method `kubernetesBackend` interface (`plan`/`apply`/`status`/`destroy`) against the existing contracts and picks, in preference order: -```proto -// Added as a new service INSIDE plugin/external/proto/iac.proto (same rationale -// as IaCStateBackend — co-versioned with the resource-provider contract). 
-service PlatformBackend { - rpc Plan (PlatformPlanRequest) returns (PlatformPlanResponse); - rpc Apply (PlatformApplyRequest) returns (PlatformApplyResponse); - rpc Destroy(PlatformDestroyRequest) returns (PlatformDestroyResponse); -} -// Each request carries: platform_type (currently always "kubernetes"), -// provider (gke|aks), desired-state struct, current-state struct, AND a -// CloudCredentials message. Response: plan actions / applied state / errors. -// (remaining message field layouts: deferred to writing-plans.) -``` +1. **Fold `gke` into the existing `IaCProviderRequired` / `ResourceDriver` contract.** A GKE cluster is a managed resource with create/plan/apply/destroy/status — exactly what `ResourceDriver` already models (battle-tested, multiple ADRs through the strict-contracts cutover). *Preferred* — reuses a proven contract, adds zero new proto surface. +2. **Make `gke` a plugin-native `kubernetesBackend`** via the existing `ModuleFactories`/`RemoteModule` plugin SDK, the same path §3 uses — if `ResourceDriver`'s lifecycle shape doesn't fit the continuous-reconciliation behavior of `platform.kubernetes`. +3. **A minimal new `PlatformBackend` service** (3–4 RPCs) — *only* if neither (1) nor (2) fits cleanly. With one backend this is the least likely outcome; the design records it as the fallback, not the plan. -**The `PlatformBackend` shape is gated, but the gate is now nearly trivial** (cycle-8 note): the design's earlier draft worried about unifying *five* `platform.*` backend interfaces. Post-#653 there is **one** interface to audit — `kubernetesBackend` (4 methods: `plan`/`apply`/`status`/`destroy`) — and only two cloud impls behind it (`gkeBackend`, `aksBackend`), both managed-Kubernetes clusters. The Phase 0/A interface-audit spike validates that `kubernetesBackend`'s 4 methods map cleanly onto `Plan/Apply/Destroy/Status` *before* the proto is locked. 
The risk that drove Alternatives Considered #1 (lowest-common-denominator across heterogeneous platform families) is largely gone with the heterogeneous families gone — but the fallback (fold into `ResourceDriver`) is retained for the spike's decision. +Whatever the spike picks, `platform.kubernetes` core dispatches on `provider:` exactly as today; when `provider == gke` it resolves a gRPC client of the chosen contract from workflow-plugin-gcp; for `kind`/`k3s`/`eks`/`aks` it uses the in-core backend unchanged. -**Credential flow across the boundary — in-core resolvers *declare*, the plugin *resolves*.** The cloud backends reach credentials via `module.CloudCredentials` (`module/cloud_account.go`); `aksBackend.azureToken(creds *CloudCredentials)` takes it directly (verified). Verified shape of the existing pieces: -- `CloudCredentials` is a **plain-field struct** — `Provider/Region/AccessKey/SecretKey/SessionToken/RoleARN/ProjectID/TenantID/ClientID/.../Token` plus `Extra map[string]string`. No `Profile` field; `profile` lives in `Extra["profile"]`. Cleanly proto-serialisable as-is — **no struct change needed**. -- The AWS credential *resolvers* split two ways. `awsStaticResolver` / `awsEnvResolver` are **already SDK-free**. `awsProfileResolver` and `awsRoleARNResolver` (verified, `cloud_account_aws_creds.go`) have an **SDK-bearing block** (`config.LoadDefaultConfig(WithSharedConfigProfile)`, `sts.AssumeRole`) that resolves the profile/role into `AccessKey/SecretKey` *in-core*. The azure/gcp resolvers (`cloud_account_azure.go`, `cloud_account_gcp.go`) are **already SDK-free**. +**Credential flow.** `gkeBackend` reaches credentials via `module.CloudCredentials` (`module/cloud_account.go`) — a plain-field struct (`Provider/Region/AccessKey/SecretKey/.../ProjectID/...` + `Extra map[string]string`), cleanly proto-serialisable as-is, **no struct change needed**. 
The chosen contract's request carries a serialised `CloudCredentials`; the plugin performs any SDK-bearing resolution in-process. The gcp credential resolvers (`cloud_account_gcp.go`) are already SDK-free. -The model: make **every** in-core resolver uniformly *declare, don't resolve*. Phase B **rewrites** the two SDK-bearing AWS resolver bodies — a deliberate `Resolve()` body rewrite, **not** a one-line "snip the tail": -- `awsProfileResolver.Resolve` — its SDK calls (`config.LoadDefaultConfig(WithSharedConfigProfile)`, `cfg.Credentials.Retrieve`) *are* a clean contiguous tail after the marker-record (`m.creds.Extra["profile"] = profile`); the rewrite ends the method right after the marker-record. -- `awsRoleARNResolver.Resolve` — the SDK block (base-config build + `sts.NewFromConfig` + `AssumeRole` + result-record) is contiguous *after* the declared-input recording (`RoleARN`, `Extra["external_id"]`, `roleArn`-required validation, `sessionName` parse) but is the **larger half** of the method. The rewrite **deletes that entire block** and ends the method after the declared-input recording + a `credential_source` marker. Calling this "remove a tail" understates it — it is a body rewrite. +### 3. Standalone S3/GCS modules + step → plugin-native types (existing SDK surface, no new contract) -After both rewrites, `cloud_account_aws_creds.go` imports **no** `aws-sdk-go-v2` package (verified: the 4 SDK imports — `aws`, `config`, `credentials`, `sts` — are used *only* by those two resolver bodies; `init()` + `awsStaticResolver` + `awsEnvResolver` are SDK-free) and **stays in core**. **Phase B CI invariant:** an import-block grep (folded into `scripts/audit-cloud-symbols.sh`) asserts `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports post-rewrite. +These are user-facing pipeline functionality, not engine infrastructure. 
They become **plugin-native module/step types** via the existing `ModuleFactories` / `StepFactories` plugin SDK — already a gRPC sidecar path (`RemoteModule`). No new contract. **Current registration site:** these types are registered today by *built-in in-process engine plugins* under `plugins/` (which import `module.*` directly), not by `engine.go`. Extracting each one means the built-in plugin's factory map drops that entry and the impl moves to the external gRPC plugin. -The engine serialises the resolver-populated `CloudCredentials` struct into a proto `CloudCredentials` message on every `PlatformBackend` request. The **plugin** performs any SDK-bearing resolution (profile-chain, STS AssumeRole, managed-identity, ADC) in-process. +| core file | current built-in registration | becomes | plugin | +|-----------|-------------------------------|---------|--------| +| `pipeline_step_s3_upload.go` (`S3UploadStep`) | `plugins/pipelinesteps/plugin.go` `"step.s3_upload"` (:183) | `step.s3_upload` | aws | +| `s3_storage.go` (`S3Storage`) | `plugins/storage/plugin.go` `"storage.s3"` (:89) | `storage.s3` module | aws | +| `storage_gcs.go` (`GCSStorage`) | `plugins/storage/plugin.go` `"storage.gcs"` (:109) | `storage.gcs` module | gcp | -When `provider ∉ {kind, k3s, eks}` core's `platform.kubernetes` module resolves a `PlatformBackend` client from the plugin that registered `(kubernetes, provider)`. +(`nosql_dynamodb.go` and `storage_artifact_s3.go` are **not** in this table — they carry no SDK, see Inventory provenance. They stay in core.) -### 3. Standalone modules / steps → plugin-native types (existing SDK surface, no new contract) +**Credential handling (Option 1, approved):** each plugin-native module carries its own `credentials:` config block and resolves it in-process via a shared in-plugin `buildAWSConfig` (resp. gcp) helper — exactly the workflow-plugin-digitalocean model. 
To avoid yaml redundancy when a config declares many such modules, each plugin offers an optional in-plugin `aws.credentials` / `gcp.credentials` module + a `credentials_ref:` key — DRY handled entirely inside the plugin, no core contract. -These are user-facing pipeline functionality, not engine infrastructure. They become **plugin-native module/step types** via the existing `ModuleFactories` / `StepFactories` plugin SDK — already a gRPC sidecar path (`RemoteModule`). No new contract. **Note on current registration site:** these types are today registered by *built-in in-process engine plugins* under `plugins/` (which import `module.*` directly), not by `engine.go`. Extracting each one means the built-in plugin's factory map drops that entry and the impl moves to the external gRPC plugin. +### Credential resolvers — in-core resolvers *declare*, the plugin *resolves* -| core file | current built-in registration | becomes | plugin | -|-----------|-------------------------------|---------|--------| -| `nosql_dynamodb.go` (`DynamoDBNoSQL`) | `plugins/datastores/plugin.go` `"nosql.dynamodb"` | `nosql.dynamodb` module | aws | -| `pipeline_step_s3_upload.go` (`S3UploadStep`) | `plugins/pipelinesteps/plugin.go` `"step.s3_upload"` | `step.s3_upload` | aws | -| `s3_storage.go` (`S3Storage`) | `plugins/storage/plugin.go` `"storage.s3"` (factory at :90) | `storage.s3` module | aws | -| `storage_gcs.go` (`GCSStorage`) | `plugins/storage/plugin.go` `"storage.gcs"` (factory at :109) | `storage.gcs` module | gcp | +The AWS credential resolvers (`cloud_account_aws_creds.go`) split two ways. `awsStaticResolver` / `awsEnvResolver` are **already SDK-free**. `awsProfileResolver` and `awsRoleARNResolver` (verified) have an **SDK-bearing block** (`config.LoadDefaultConfig(WithSharedConfigProfile)`, `sts.AssumeRole`) that resolves the profile/role into `AccessKey/SecretKey` *in-core*. The azure/gcp resolvers are already SDK-free. 
-`storage_artifact_s3.go` references the AWS SDK **only in a doc comment** (verified — its actual imports are `context`/`fmt`/`io`/`modular`; the real impl is a filesystem fallback) — **not a real import, stays in core untouched.** +The model: make **every** in-core resolver uniformly *declare, don't resolve*. Phase B **rewrites** the two SDK-bearing AWS resolver bodies — a deliberate `Resolve()` body rewrite, **not** a one-line "snip the tail": +- `awsProfileResolver.Resolve` — its SDK calls are a clean contiguous tail after the marker-record (`m.creds.Extra["profile"] = profile`); the rewrite ends the method right after the marker-record. +- `awsRoleARNResolver.Resolve` — the SDK block (base-config build + `sts.NewFromConfig` + `AssumeRole` + result-record) is contiguous *after* the declared-input recording but is the **larger half** of the method. The rewrite **deletes that entire block** and ends the method after the declared-input recording + a `credential_source` marker. -`cloud_account_aws.go` — defines `AWSConfigProvider` interface + `AWSConfig()` method + `ValidateCredentials()` method, all pure SDK — is **dead code**: a repo-wide grep for `AWSConfigProvider` / `awsProviderFrom` / `.AWSConfig(` returns **zero non-test consumers** (the `awsProviderFrom` helper and every consumer were removed by #653). It is **deleted outright by Phase B with no consumer rewrite and no core replacement** — this is a trivial dead-code deletion, not the multi-consumer refactor earlier drafts described. +After both rewrites, `cloud_account_aws_creds.go` imports **no** `aws-sdk-go-v2` package (the 4 SDK imports — `aws`, `config`, `credentials`, `sts` — are used *only* by those two resolver bodies) and **stays in core**. **Phase B CI invariant:** `scripts/audit-cloud-symbols.sh --check` (with the Phase-B marker present) asserts `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports post-rewrite. 
-Credential handling (Option 1, approved): each plugin-native AWS module carries its own `credentials:` config block and resolves it in-process via a shared in-plugin `buildAWSConfig` helper that owns the static/env/profile/role_arn logic — exactly the workflow-plugin-digitalocean model. To avoid yaml redundancy when a config declares many AWS modules, each plugin offers an optional in-plugin `aws.credentials` (resp. `gcp.credentials`) module + a `credentials_ref:` key — DRY handled entirely inside the plugin, still no core contract. +`cloud_account_aws.go` — `AWSConfigProvider` interface + `AWSConfig()` method + `ValidateCredentials()` method, all pure SDK — is **dead code**: a repo-wide grep for `AWSConfigProvider` / `awsProviderFrom` / `.AWSConfig(` returns **zero non-test consumers** (the `awsProviderFrom` helper and every consumer were removed by #653). It is **deleted outright by Phase B with no consumer rewrite and no core replacement.** -**Resolvers emit *markers*, not always plain values.** For credential types `static` / `env`, the in-core resolver records concrete declared values into `CloudCredentials`. For `profile` / `role_arn` (AWS) and `managed_identity` / `client_credentials` / `cli` (azure) and the gcp equivalents, the resolver records the *declared inputs* (`Extra["profile"]`, `RoleARN`, etc.) **plus** an `Extra["credential_source"]` marker — it does **not** resolve to concrete keys. The plugin reads the marker and performs the SDK-bearing resolution. This is not a "no-op passthrough": the plugin **must** implement marker handling for every deferred type. +**Resolvers emit *markers*, not always plain values.** For `static` / `env`, the in-core resolver records concrete declared values. For `profile` / `role_arn` (AWS) and the azure/gcp deferred types, the resolver records the *declared inputs* + an `Extra["credential_source"]` marker — it does **not** resolve to concrete keys. The plugin reads the marker and performs the SDK-bearing resolution. 
The plugin **must** implement marker handling for every deferred type. ## Security -Option 1 moves raw cloud secrets (`accessKey`/`secretKey`/`account_key`/etc.) inline into every plugin-native module's `credentials:` config block — multiplying the number of config sites holding plaintext secrets versus today's single `cloud.account` module. Not unprecedented (`iac_module.go`'s current `spaces` case already inlines `accessKey`/`secretKey`), but the multiplication needs explicit handling: +Option 1 moves raw cloud secrets inline into every plugin-native module's `credentials:` config block — multiplying the config sites holding plaintext secrets versus today's single `cloud.account` module. Not unprecedented (`iac_module.go`'s `spaces` case already inlines `accessKey`/`secretKey`), but needs explicit handling: -- **Config-version store + execution tracing.** Workflow's config-version store (SHA-256 content-addressed) and execution-tracing layer marshal module config. Plugin-native module config carrying inline credentials MUST be redacted before persistence/tracing. Writing-plans task: extend the existing PII/secret redaction (already per-tenant-toggleable per `workflow-cloud`) to recognise `credentials:` / `credentials_ref:` keys on plugin module config, OR confirm the existing redaction already covers any key matching a secret-pattern. **Blocking** — ships in the same phase as the first plugin-native AWS module. -- **gRPC sidecar request logging.** `IaCStateBackend` / `PlatformBackend` requests cross the engine↔plugin gRPC boundary, and `credentials:` blocks ride in `CreateModule` requests. **Verified at design time:** `plugin/external/grpc_plugin.go:39` constructs the server as `grpc.NewServer(opts...)` with `opts` passed straight through from the go-plugin broker — workflow's plugin SDK adds **no body-logging interceptor**. 
The only request-body logging in `plugin/external/` is `callback_server.go:85,118` (plugin→host callback path) — neither touches module config. `CreateModule` is dispatched at `adapter.go:477` with no logging. **Conclusion: no redacting interceptor needed today.** Writing-plans adds a guard test asserting no interceptor logs `CreateModule` bodies, so a future SDK change that adds one fails CI. -- **`credentials_ref:` blast radius.** A `credentials_ref` resolves to an in-plugin `aws.credentials` module within the *same plugin process* — it does not broaden which process can read the secret (engine never sees the resolved `aws.Config`, only the plugin does). Strictly *narrower* than today's `cloud.account` (which builds `aws.Config` in the engine process). Documented as an improvement. +- **Config-version store + execution tracing.** Workflow's config-version store (SHA-256 content-addressed) and execution-tracing layer marshal module config. Plugin-native module config carrying inline credentials MUST be redacted before persistence/tracing. Writing-plans task: extend the existing PII/secret redaction (already per-tenant-toggleable per `workflow-cloud`) to recognise `credentials:` / `credentials_ref:` keys, OR confirm the existing redaction already covers any key matching a secret-pattern. **Blocking** — ships in the same phase as the first plugin-native module. +- **gRPC sidecar request logging.** `IaCStateBackend` requests (and the `gke` contract's requests) cross the engine↔plugin gRPC boundary; `credentials:` blocks ride in `CreateModule` requests. **Verified at design time:** `plugin/external/grpc_plugin.go:39` constructs the server as `grpc.NewServer(opts...)` with `opts` passed straight through — no body-logging interceptor. The only request-body logging in `plugin/external/` is `callback_server.go:85,118` (plugin→host callback path) — neither touches module config. `CreateModule` is dispatched at `adapter.go:477` with no logging. 
**Conclusion: no redacting interceptor needed today.** Writing-plans adds a guard test asserting no interceptor logs `CreateModule` bodies. +- **`credentials_ref:` blast radius.** A `credentials_ref` resolves to an in-plugin `aws.credentials` module within the *same plugin process* — it does not broaden which process can read the secret. Strictly *narrower* than today's `cloud.account` (which builds `aws.Config` in the engine process). ## Failure modes Moving the IaC state store behind a gRPC sidecar introduces a partial-failure surface on the engine's hottest path (every plan/apply does `Lock` → `GetState` → ... → `SaveState` → `Unlock`): -- **Plugin crashes between `Lock` and `Unlock` → orphaned lock.** An in-process lock dies with the process; a gRPC-plugin lock can outlive a plugin crash if persisted (S3/Blob lock objects persist). **Initial scope:** documented limitation, not silently broken. The contract ships as exactly the 6-method `IaCStateStore` interface — no TTL field — because no Phase A–D plugin backend implements honored expiry yet, and a no-op TTL field implies a guarantee that isn't enforced. Recovery: operator deletes the backend's lock object directly (a plain object/blob in the user's own bucket; lock key format documented per backend). **Planned additive follow-up:** once a backend implements honored expiry, `LockRequest` gains an optional `lease_ttl_seconds` field *paired with a conformance test*. Tracked as an open item. -- **`Lock` contention against a still-held lock.** Core's `iac.state` dispatch returns an immediate error — it does not block. Matches today's in-process `IaCStateStore.Lock`. The gRPC boundary doesn't change this; orphaned-lock recovery is the operator-side delete above. 
-- **`SaveState` succeeds plugin-side but the gRPC response is lost → engine retries → double-write.** `SaveState` MUST be idempotent: full-state replace keyed by `resource_id` (existing `IaCStateStore.SaveState` is already insert-or-replace), so a retried identical `SaveState` is no-op-equivalent. Plugin implementations use unconditional PUT (overwrite), not append. IaC state is last-writer-wins by design. +- **Plugin crashes between `Lock` and `Unlock` → orphaned lock.** An in-process lock dies with the process; a gRPC-plugin lock can outlive a crash if persisted (S3/Blob lock objects persist). **Initial scope:** documented limitation. The contract ships as exactly the 6-method `IaCStateStore` interface — no TTL field — because no Phase A–D backend implements honored expiry yet, and a no-op TTL field implies an unenforced guarantee. Recovery: operator deletes the backend's lock object directly (a plain object/blob in the user's own bucket; lock key format documented per backend). **Planned additive follow-up:** once a backend implements honored expiry, `LockRequest` gains an optional `lease_ttl_seconds` field *paired with a conformance test*. Tracked as an open item. +- **`Lock` contention against a still-held lock.** Core's `iac.state` dispatch returns an immediate error — it does not block. Matches today's in-process `IaCStateStore.Lock`. Orphaned-lock recovery is the operator-side delete above. +- **`SaveState` succeeds plugin-side but the gRPC response is lost → engine retries → double-write.** `SaveState` MUST be idempotent: full-state replace keyed by `resource_id` (existing `IaCStateStore.SaveState` is already insert-or-replace). Plugin implementations use unconditional PUT (overwrite), not append. Last-writer-wins by design. - **Plugin unreachable at plan/apply start.** Core's `iac.state` dispatch returns a clear `"iac.state backend %q: plugin unreachable"` error and the plan/apply aborts *before* mutating anything. 
Matches today's behavior when a misconfigured backend fails to construct in `IaCModule.Init()`. -- **`PlatformBackend` plugin crash mid-`Apply`.** A `platform.kubernetes` apply crashing mid-flight leaves a real cloud cluster in an indeterminate state — but this is **identical to today's in-process risk** (an in-process `aksBackend.apply()` panic leaves the same indeterminate state). The next `Plan` reconciles against live cloud state as today. Documented as unchanged. -- **A plugin registers a backend/provider name colliding with a core-reserved one.** Core-registered names (`iac.state`: `memory`/`filesystem`/`postgres`; `platform.kubernetes`: `kind`/`k3s`/`eks`; the `mock` backend of every `platform.*` family) are **reserved**. A colliding plugin registration is a **load-time error** — core fails to start with `"plugin %q registered reserved backend name %q"` rather than silently shadowing. +- **`gke` plugin crash mid-`apply`.** A `platform.kubernetes` apply (provider `gke`) crashing mid-flight leaves a real cloud cluster in an indeterminate state — **identical to today's in-process risk** (an in-process `gkeBackend.apply()` panic leaves the same state). The next `plan` reconciles against live cloud state as today. Unchanged. +- **A plugin registers a backend/provider name colliding with a core-reserved one.** Core-registered names (`iac.state`: `memory`/`filesystem`/`postgres`; `platform.kubernetes`: `kind`/`k3s`/`eks`/`aks`; the `mock` backends) are **reserved**. A colliding plugin registration is a **load-time error** — core fails to start with `"plugin %q registered reserved backend name %q"` rather than silently shadowing. 
-## Cross-file coupling: the symbol-ownership map is a Phase 0 build artifact, not a design-doc claim +## Cross-file coupling: the symbol-ownership map is a build artifact, not a design-doc claim -Prior review cycles each found a hand-maintained per-file ownership claim in this design *wrong* — and cycle 8 found the whole inventory stale because it predated #653. The lesson is structural: **a precise symbol map is derived data; it rots on every upstream merge and the design doc is the wrong place for it.** The design commits to a *method* and a small set of *invariants*, and delegates the exact map to a script that runs in CI. +Cycles 2–9 each found a hand-maintained ownership claim wrong — cycle 8 (whole inventory predated #653), cycle 9 (comment-only false positives). The lesson is structural: **a precise symbol map is derived data; it rots on every upstream merge and on every comment.** `scripts/audit-cloud-symbols.sh` (committed with this design, formalized in Phase 0) is now the source of truth — it parses `import (...)` blocks, never comments, and runs in CI on every phase PR. **Invariants (load-bearing; the script verifies them, it doesn't discover them):** -- `module/cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) is the provider-agnostic *declared-config* holder — **it stays in core, is never deleted by any phase**, and is the credential symbol-home all cloud platform code binds to. The `PlatformBackend` contract carries the declared `CloudCredentials` across the boundary. -- `module/platform_kubernetes_kind.go` co-locates **core-staying** backends (`kindBackend` serving kind+k3s, `eksErrorBackend` serving eks) and **plugin-bound** cloud backends (`gkeBackend`, `aksBackend`) behind a *single shared `func init()`* (verified — `init()` registers all five names). Splitting it requires partitioning that `init()`. 
**Phase 0 does exactly this** — and it is the *only* `platform.*` file needing a split, because #653 already removed the rest. -- `cloud_account_aws.go` is **dead code, deleted outright by Phase B.** It defines `AWSConfig()` / `AWSConfigProvider` / `ValidateCredentials` — all pure SDK — and a repo-wide grep confirms **zero non-test consumers** of any of them (`awsProviderFrom` and its consumers were removed by #653). No consumer rewrite, no helper relocation: earlier drafts' "8-consumer rewrite" and "`parseStringSlice` relocation" are obsolete — `parseStringSlice` and `safeIntToInt32` **no longer exist anywhere in `module/`** (verified). There is no shared-helper-relocation work in this design. +- `module/cloud_account.go` (`CloudCredentials` / `CloudCredentialProvider` / `CloudAccount`) is the provider-agnostic *declared-config* holder — **it stays in core, is never deleted by any phase**, and is the credential symbol-home all cloud platform code binds to. +- `module/platform_kubernetes_kind.go` holds **four** `kubernetesBackend` impls behind **one shared `init()`** registering `kind`/`k3s`/`eks`/`gke`/`aks` (verified). Of the four impls, only `gkeBackend` carries a cloud SDK. Phase 0 splits this file; the `init()` must be partitioned so no `init()` registers both a core-staying and a plugin-bound factory. +- `cloud_account_aws.go` is **dead code, deleted outright by Phase B** — `AWSConfig()` / `AWSConfigProvider` / `ValidateCredentials`, all pure SDK, with **zero non-test consumers verified**. No consumer rewrite. `parseStringSlice` and `safeIntToInt32` (referenced by earlier drafts) **no longer exist anywhere in `module/`** — there is no shared-helper-relocation work in this design. 
-**The method — `scripts/audit-cloud-symbols.sh`, Phase 0 task 1:** for the `platform_kubernetes_kind.go` split and each plugin-bound `module/*.go` file, it greps every cross-file function/type reference and asserts (a) no `init()` registers a *mix* of core-staying and plugin-bound factories, (b) no cross-file symbol edge from a core-staying file into a to-be-deleted file, (c) `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports after the Phase B resolver rewrite. Committed with Phase 0, re-run in CI on every subsequent phase PR. The design never transcribes its output — the script *is* the source of truth. +**The script's checks:** (a) the comment-immune real-import map per SDK tree; (b) `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports post-Phase-B (gated on a `.phase-b-complete` marker); (c) advisory readout of `platform_kubernetes_kind.go`'s backend-type count and shared-`init()` count for the Phase 0 split. Phase 0 extends it with the full `init()`-partition assertion. ## Phase 0 — precursor: split the one remaining mixed cloud-backend file -Post-#653, Phase 0 is small: a mechanical, behavior-equivalent split of the **single** `module/` file that still co-locates core-staying and plugin-bound cloud backends. +Post-#653 and post-cycle-9, Phase 0 is small: a mechanical, behavior-equivalent split of the **single** `module/` file that co-locates a plugin-bound cloud backend with core-staying ones. **1. Split `platform_kubernetes_kind.go`** into: -- `platform_kubernetes_core.go` — holds `kindBackend` (serves `kind` + `k3s`), `eksErrorBackend` (serves `eks`), and an `init()` registering **only** those three core-staying names. -- `platform_kubernetes_gke.go` — holds `gkeBackend` + its `google.golang.org/api/container` import + an `init()` registering only `gke`. -- `platform_kubernetes_aks.go` — holds `aksBackend` (incl. `azureToken`) + its `Azure/azure-sdk-for-go` import + an `init()` registering only `aks`. 
+- `platform_kubernetes_core.go` — holds `kindBackend` (serves `kind` + `k3s`), `eksErrorBackend` (serves `eks`), **`aksBackend` (serves `aks` — SDK-free `net/http` OAuth2)**, and an `init()` registering **only** those four core-staying names. +- `platform_kubernetes_gke.go` — holds `gkeBackend` + its `google.golang.org/api/container` + `option` imports + an `init()` registering only `gke`. + +After the split, no `init()` registers both a core-staying and a plugin-bound factory, and only `platform_kubernetes_gke.go` holds a plugin-bound backend. `platform_kubernetes.go` (the `PlatformKubernetes` shell, `kubernetesBackend` interface, `RegisterKubernetesBackend`, `intFromAny` helper) is untouched. -After the split, no `init()` registers both a core-staying and a plugin-bound factory, and no file holds both a core-staying and a plugin-bound backend impl. `platform_kubernetes.go` (the `PlatformKubernetes` shell, `kubernetesBackend` interface, `RegisterKubernetesBackend`, `intFromAny` helper) is untouched and stays in core. +**2. Fix the stale doc comment** at `platform_kubernetes_kind.go:332` ("Requires the Azure SDK…") — it describes an implementation that doesn't exist (`azureToken` is pure `net/http`) and is the exact thing that fooled cycle-8's re-baseline. Phase 0 is already touching this file; correct the comment so neither the audit script's future readers nor a human reviewer is misled again. -**2. Create `scripts/audit-cloud-symbols.sh`** (the Cross-file-coupling method above). No shared-helper-relocation step — there are no shared helpers to relocate (verified). +**3. Formalize `scripts/audit-cloud-symbols.sh`** — already drafted and committed with this design; Phase 0 extends it with the full `init()`-partition assertion and wires it into CI (run on every subsequent phase PR). No shared-helper-relocation step — there are no shared helpers to relocate (verified). 
This is **not** "zero logic change" — partitioning a shared `init()` distributes registration calls across files. It is *behavior-equivalent*: the same five backend names are registered after the split as before. -**Phase 0 acceptance criteria:** `go build ./... && go vet ./... && go test ./module/...` green; `scripts/audit-cloud-symbols.sh` committed, output shows no mixed `init()` and no cross-file edge into a to-be-deleted file; `git diff` is pure code movement + mechanical `init()` partition, no logic edits. +**Phase 0 acceptance criteria:** `go build ./... && go vet ./... && go test ./module/...` green; `scripts/audit-cloud-symbols.sh` output shows no `init()` mixing core-staying + plugin-bound registrations; `git diff` is pure code movement + mechanical `init()` partition + the one comment fix, no logic edits. -**Phase 0 rollback:** a file-split + `init()` partition with no behavior diff — revert is a single `git revert`, no contract, no go.mod, no runtime impact. +**Phase 0 rollback:** a file-split + `init()` partition + comment fix with no behavior diff — revert is a single `git revert`, no contract, no go.mod, no runtime impact. ## Phases Each phase is one workflow-core PR (deleting/editing files + wiring the contract dispatch) plus one PR per affected plugin. Within a phase, the plugin PR may merge ahead of the core PR — core keeps the old in-process path until the contract dispatch is wired in the core PR. **Atomicity rule:** within a core PR, a deleted file and every file referencing its symbols are removed in the *same commit* (the build gate enforces this). -**Phase A — Azure** (smallest; validates BOTH new contracts end-to-end): +**Phase A — Azure** (smallest; validates the `IaCStateBackend` contract end-to-end): - Run the state-backend benchmark task; lock the `IaCStateBackend` proto shape. -- Run the `kubernetesBackend` interface-audit spike; lock or re-scope the `PlatformBackend` proto shape. 
-- Add `IaCStateBackend` + `PlatformBackend` services to `plugin/external/proto/iac.proto`. +- Add the `IaCStateBackend` service to `plugin/external/proto/iac.proto`. - Add the secret-redaction task + the gRPC-interceptor guard test (blocking). -- workflow-plugin-azure implements `azure_blob` `IaCStateBackend` + `aks` `PlatformBackend`. -- Core PR: delete `iac_state_azure.go`; strip the `azure_blob` case + `newAzureSharedKeyCredential` from `iac_module.go`; delete `platform_kubernetes_aks.go` (the Phase-0 split file) and wire its `PlatformBackend` dispatch. This drops `Azure/azure-sdk-for-go` from `go.mod`. +- workflow-plugin-azure implements `azure_blob` `IaCStateBackend`. +- Core PR: delete `iac_state_azure.go`; strip the `azure_blob` case + `newAzureSharedKeyCredential` wrapper from `iac_module.go`. **This (the deletion + the `iac_module.go` edit — the only two real azure-sdk importers) is what drops `Azure/azure-sdk-for-go` from `go.mod`.** Phase A touches **no `platform.*` file** — `aksBackend` is SDK-free and stays in core. 
-**Phase B — AWS.** Inventory + destination (the authoritative list is the audit-script output): +**Phase B — AWS.** Inventory + destination (authoritative list = `audit-cloud-symbols.sh` output): | core file | disposition | atomicity note | |-----------|-------------|----------------| | `iac_state_spaces.go` | DELETE → aws plugin `s3` `IaCStateBackend` | shared with `spaces` — see Phase D | | `cloud_account_aws.go` | DELETE outright — dead code, **zero non-test consumers verified** | no consumer rewrite; trivial deletion | -| `cloud_account_aws_creds.go` | **EDIT** — rewrite `awsProfileResolver`/`awsRoleARNResolver` bodies SDK-free; file stays in core | the resolver `init()` registrations stay — `provider: aws` credential resolution still works in-core, now declare-only | -| `nosql_dynamodb.go` | DELETE → aws plugin `nosql.dynamodb`; drop the entry from `plugins/datastores/plugin.go` | same commit as the built-in-plugin factory-map edit | -| `pipeline_step_s3_upload.go` | DELETE → aws plugin `step.s3_upload`; drop from `plugins/pipelinesteps/plugin.go` | same commit | +| `cloud_account_aws_creds.go` | **EDIT** — rewrite `awsProfileResolver`/`awsRoleARNResolver` bodies SDK-free; file stays in core | resolver `init()` registrations stay — `provider: aws` credential resolution still works in-core, now declare-only | +| `pipeline_step_s3_upload.go` | DELETE → aws plugin `step.s3_upload`; drop the entry from `plugins/pipelinesteps/plugin.go` | same commit as the built-in-plugin factory-map edit | | `s3_storage.go` | DELETE → aws plugin `storage.s3`; drop from `plugins/storage/plugin.go` | same commit | -- Core PR also: **strip the `spaces` case from `iac_module.go`** (it calls `NewSpacesIaCStateStore` from the deleted `iac_state_spaces.go`). Drop `aws-sdk-go-v2` from `go.mod`. -- **No AWS `platform.*` work** — #653 already stubbed `eks` (`eksErrorBackend` stays in core) and removed `platform/providers/aws/`. 
-- `storage_artifact_s3.go` stays in core (comment-only SDK reference). +- Core PR also: **strip the `spaces` case from `iac_module.go`** (it calls `NewSpacesIaCStateStore` from the deleted `iac_state_spaces.go`). Drop `aws-sdk-go-v2` from `go.mod`; set the `.phase-b-complete` marker so `audit-cloud-symbols.sh --check` enforces the `cloud_account_aws_creds.go` zero-import invariant. +- **No AWS `platform.*` work** — #653 already stubbed `eks`, and `aks` is SDK-free. +- `nosql_dynamodb.go` and `storage_artifact_s3.go` stay in core (comment-only — see Inventory provenance). -**Phase C — GCP:** -- workflow-plugin-gcp implements `IaCStateBackend` (`gcs`), `PlatformBackend` (`gke`), plugin-native `storage.gcs`. -- Core PR: delete `iac_state_gcs.go`, `storage_gcs.go` (drop the entry from `plugins/storage/plugin.go`), `platform_kubernetes_gke.go` (the Phase-0 split file); strip the `gcs` case from `iac_module.go`; drop `cloud.google.com/go` + `google.golang.org/api`. After Phase C, `go list -deps ./...` shows zero packages from the three in-scope SDK trees — the permanent CI gate is added here. (`godo` remains — out of scope.) +**Phase C — GCP** (includes the one platform-backend extraction): +- Run the `kubernetesBackend` interface-audit spike (Architecture §2) — picks the `gke` cross-process contract: `ResourceDriver` fold (preferred), plugin-native `kubernetesBackend`, or a minimal new contract (fallback). Locks that decision *before* the gcp plugin work. +- workflow-plugin-gcp implements `IaCStateBackend` (`gcs`), the chosen `gke` contract, and plugin-native `storage.gcs`. +- Core PR: delete `iac_state_gcs.go`; delete `storage_gcs.go` (drop the entry from `plugins/storage/plugin.go`); delete `platform_kubernetes_gke.go` (the Phase-0 split file) and wire its `gke` dispatch; strip the `gcs` case from `iac_module.go`; drop `cloud.google.com/go` + `google.golang.org/api`. 
+- After Phase C, `go list -deps ./...` shows zero packages from the three in-scope SDK trees — the permanent CI gate is added here. (`godo` remains — out of scope.) **Phase D — DigitalOcean (`spaces` clean-break):** - workflow-plugin-digitalocean implements `IaCStateBackend` for `spaces` (S3-compatible — pulls `aws-sdk-go-v2/service/s3`, the one service package, not the whole tree). - **Clean break, not soft-compat.** `iac_state_spaces.go` + the `spaces` case in `iac_module.go` are deleted by **Phase B's core PR** (`iac_state_spaces.go` is the one S3-compatible store backing *both* `s3` and `spaces`). After Phase B's core PR merges, `iac.state` with `backend: spaces` fails to build unless the DO plugin version implementing `IaCStateBackend` is loaded. - **Minor version bump** on workflow-plugin-digitalocean (compatibility-break marker) + `minEngineVersion` set to the core version that drops the in-core `spaces` case + migration doc. -- **Sequencing:** the DO plugin PR (implementing `spaces` `IaCStateBackend`) MUST merge + release **before** Phase B's core PR merges — otherwise `backend: spaces` has no implementation anywhere. Writing-plans orders the DO plugin PR as a Phase-B blocker. +- **Sequencing:** the DO plugin PR MUST merge + release **before** Phase B's core PR merges — otherwise `backend: spaces` has no implementation anywhere. Writing-plans orders the DO plugin PR as a Phase-B blocker. ## Migration (user-facing) Published in each plugin's CHANGELOG + a consolidated `docs/migrations/2026-05-14-cloud-sdk-extraction.md`: - `iac.state` with `backend: s3|azure_blob|gcs|spaces` → load the matching plugin (`wfctl plugin install workflow-plugin-{aws,azure,gcp,digitalocean}`). yaml `backend:` value unchanged. **Hard requirement after the relevant phase merges.** -- `platform.kubernetes` with `provider: gke|aks` → load the matching plugin. yaml `provider:` value unchanged. (`kind`/`k3s`/`eks` unchanged — still core.) 
-- `nosql.dynamodb`, `step.s3_upload`, `storage.s3`, `storage.gcs` → load the matching plugin. Module/step type names unchanged; `credentials:` block moves inline (or `credentials_ref:` an in-plugin `aws.credentials`/`gcp.credentials` module). **This inline-credentials move is the only yaml-shape change.** -- `memory` / `filesystem` / `postgres` state backends, `kind`/`k3s`/`eks` k8s backends, `storage.artifact` (`storage_artifact_s3.go`) → no change, still core. +- `platform.kubernetes` with `provider: gke` → load workflow-plugin-gcp. yaml `provider:` value unchanged. **`provider: kind|k3s|eks|aks` are unchanged — still core, no plugin needed.** +- `step.s3_upload`, `storage.s3`, `storage.gcs` → load the matching plugin. Step/module type names unchanged; `credentials:` block moves inline (or `credentials_ref:` an in-plugin `aws.credentials`/`gcp.credentials` module). **This inline-credentials move is the only yaml-shape change.** +- `memory` / `filesystem` / `postgres` state backends, `kind`/`k3s`/`eks`/`aks` k8s backends, `nosql.dynamodb`, `storage.artifact` → no change, still core. ## Assumptions -1. **gRPC's 4 MB default message cap covers real-world IaC state files.** If a deployment's state exceeds 4 MB the unary `IaCStateBackend` contract needs streaming — the benchmark task validates the typical case; a hostile-large state is out of initial scope (`SaveState` returns a clear "state exceeds transport limit" error). The benchmark runs before the proto is locked. -2. **`kubernetesBackend` is cleanly provider-separable.** The design assumes the `kubernetesBackend` interface is segregated such that `kindBackend`/`eksErrorBackend` can stay while `gkeBackend`/`aksBackend` extract. Post-#653 this is **much less fragile than earlier drafts** — there is one interface, not five, and `eksErrorBackend` already proves a core-staying SDK-free impl coexists with cloud impls behind the same interface. 
The Phase 0/A interface-audit spike still validates it formally before the proto lock. -3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend`/`PlatformBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive. -4. **`aws-sdk-go-v2/service/s3` in workflow-plugin-digitalocean is acceptable.** DO Spaces is S3-API; there is no godo-native Spaces client. Adding one AWS service package is the minimal cost of self-contained `spaces` state support. -5. **The credential resolvers can all be made SDK-free in-core.** `cloud_account_azure.go` / `cloud_account_gcp.go` are *already* SDK-free (verified — 0 SDK imports); `cloud_account_aws_creds.go`'s `awsStaticResolver`/`awsEnvResolver` are already SDK-free, and `awsProfileResolver`/`awsRoleARNResolver` become SDK-free once their SDK blocks are rewritten out (Phase B). The load-bearing assumption: a resolver does not *need* to resolve in-core — for deferred credential types it records declared inputs + an `Extra["credential_source"]` marker, and the plugin honors the marker. The plugin **must** implement marker handling for every deferred type. -6. **No core code outside `module/` imports these SDKs.** Verified: the only real `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` / `google.golang.org/api` imports are under `module/`. A `go list -deps` CI gate in Phase C enforces this permanently. -7. **#653 is final and merged.** This design builds on `origin/main` post-#653. If #653 work were reverted, this design's file inventory would need re-baselining — but #653's issue is *closed* and all three PRs are merged, so this is a stable foundation. +1. 
**gRPC's 4 MB default message cap covers real-world IaC state files.** If a deployment's state exceeds 4 MB the unary `IaCStateBackend` contract needs streaming — the benchmark validates the typical case; `SaveState` returns a clear "state exceeds transport limit" error for the hostile-large case. Benchmark runs before the proto is locked. +2. **`kubernetesBackend` is cleanly provider-separable.** The interface must be segregated such that `kind`/`k3s`/`eks`/`aks` impls stay while `gke` extracts. Post-cycle-9 this is **low-risk** — three SDK-free impls (`kindBackend`, `eksErrorBackend`, `aksBackend`) already coexist behind the interface, proving it doesn't leak SDK types into the core module shell. The interface-audit spike (Phase C precursor) confirms formally. +3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive. +4. **`aws-sdk-go-v2/service/s3` in workflow-plugin-digitalocean is acceptable.** DO Spaces is S3-API; there is no godo-native Spaces client. One AWS service package is the minimal cost of self-contained `spaces` state support. +5. **The credential resolvers can all be made SDK-free in-core.** `cloud_account_azure.go` / `cloud_account_gcp.go` are *already* SDK-free (verified); `awsStaticResolver`/`awsEnvResolver` are already SDK-free; `awsProfileResolver`/`awsRoleARNResolver` become SDK-free once their SDK blocks are rewritten out (Phase B). Load-bearing claim: a resolver does not *need* to resolve in-core — for deferred types it records declared inputs + an `Extra["credential_source"]` marker, and the plugin honors the marker. +6. **No core code outside `module/` imports these SDKs.** Verified: the only real imports are under `module/`. A `go list -deps` CI gate in Phase C enforces this permanently. +7. 
**#653 is final and merged.** This design builds on `origin/main` post-#653 (issue closed, all three PRs merged). A stable foundation. ## Rollback This design changes **plugin loading paths** and **go.mod dependency trees** — runtime-affecting per the `runtime-launch-validation` trigger list. -- **Per-phase revert:** each phase is an isolated core PR + plugin PR(s). Reverting the **core PR** restores the in-process backend `switch` / cloud backends and re-adds the SDK to `go.mod` — deleted files recoverable from git. Plugin PRs are additive and can stay merged harmlessly even if core reverts. **Phase D has no separate core PR** — its core deletion *is* Phase B's core PR — so a Phase D rollback means reverting Phase B's core PR + the DO plugin PR together. +- **Per-phase revert:** each phase is an isolated core PR + plugin PR(s). Reverting the **core PR** restores the in-process backend `switch` / `gke` backend and re-adds the SDK to `go.mod` — deleted files recoverable from git. Plugin PRs are additive and can stay merged harmlessly even if core reverts. **Phase D has no separate core PR** — its core deletion *is* Phase B's core PR — so a Phase D rollback means reverting Phase B's core PR + the DO plugin PR together. - **Forward-fix preferred over revert:** because core keeps the old in-process path until the contract dispatch is wired *in the same core PR*, a broken phase fails at PR CI (image-launch / strict-contracts gates), not in production. The revert path exists but the gate is the primary safety. -- **`spaces` clean-break (Phase B core PR + Phase D plugin PR):** the only change with an external-user-visible compat break. Rollback = revert Phase B's core PR (restores `iac_state_spaces.go` + the `spaces` case) **and** revert the DO plugin minor bump, together — a matched pair. The migration doc + the DO plugin's `minEngineVersion` bump is the forward guard. 
+- **`spaces` clean-break (Phase B core PR + Phase D plugin PR):** the only change with an external-user-visible compat break. Rollback = revert Phase B's core PR + the DO plugin minor bump, together — a matched pair. The migration doc + the DO plugin's `minEngineVersion` bump is the forward guard. ## Alternatives Considered -1. **Fold the cloud Kubernetes provisioners into the existing `IaCProviderRequired` / `ResourceDriver` contract instead of inventing `PlatformBackend`.** A GKE/AKS cluster is structurally a managed resource with create/plan/apply/destroy/status — exactly what `ResourceDriver` already models. **Rejected as the default** because `platform.kubernetes` has a distinct `provider:` UX the user explicitly asked to preserve, and a continuous-reconciliation lifecycle surface — but **retained as the gated fallback**: the Phase 0/A `kubernetesBackend` interface-audit spike decides. Post-#653 the case for a dedicated `PlatformBackend` is *weaker* (only 2 cloud backends, both Kubernetes) — the spike may well conclude `ResourceDriver` suffices. The design defers to the spike rather than pre-committing. -2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the Goal from "core drops `aws-sdk-go-v2/*` entirely" to "keeps one S3 client." **Rejected** because it leaves dependabot churning one AWS package indefinitely and weakens the `go list -deps` gate. The cost (both aws + DO plugins carry an S3 client) is real but bounded — one service package, independently versioned. -3. **A shared `s3compat` Go module consumed by both the aws and DO plugins** (instead of each re-implementing the S3-compatible state store + `buildAWSConfig`). **Deferred, not rejected:** a *plugin-side* optimisation that doesn't affect the core contract or any phase boundary — lands as a follow-up after the extraction is proven. Writing-plans logs it as a post-extraction cleanup candidate. +1. 
**Fold the `gke` provisioner into `ResourceDriver` instead of inventing `PlatformBackend`.** This is no longer an "alternative" — post-cycle-9 it is the **preferred path** inside Architecture §2 (the spike's option 1). A GKE cluster is structurally a managed resource; with only one SDK-bearing platform backend, a dedicated new contract is YAGNI. Retained here only to record that the dedicated-contract option was considered and demoted to a gated fallback. +2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the Goal from "core drops `aws-sdk-go-v2/*` entirely" to "keeps one S3 client." **Rejected** — leaves dependabot churning one AWS package indefinitely and weakens the `go list -deps` gate. The cost (both aws + DO plugins carry an S3 client) is real but bounded. +3. **A shared `s3compat` Go module consumed by both the aws and DO plugins.** **Deferred, not rejected:** a *plugin-side* optimisation that doesn't affect the core contract or any phase boundary — lands as a follow-up after the extraction is proven. 4. **In-process Go-module plugin loading (build-tag imports) instead of gRPC sidecars.** Rejected in brainstorm by explicit user decision — strict gRPC sidecar model only. -5. **Wait for / extend #653 to also extract state backends + `platform.kubernetes`.** #653's issue is closed with an explicit scope boundary ("RBAC/secrets/artifact stay"). Extending a closed issue rather than opening a clearly-scoped successor would muddy the audit trail. **Rejected** — this design is the named successor and cites #653 as predecessor. +5. **Wait for / extend #653 to also extract state backends + `gke`.** #653's issue is closed with an explicit scope boundary. Extending a closed issue rather than opening a clearly-scoped successor would muddy the audit trail. **Rejected** — this design is the named successor. ## Self-challenge — top doubts surfaced (carried forward, with mitigations wired into phases) -1. 
**`PlatformBackend` may be over-general** AND **2. clean provider-separability (Assumption 2) is fragile.** Both are settled by the *one* `kubernetesBackend` interface-audit spike — Phase 0/A task 1, ordered before the proto lock. Post-#653 both doubts are materially smaller: one interface, two cloud impls, and `eksErrorBackend` already demonstrates an SDK-free core impl behind that interface. If `kubernetesBackend`'s 4 methods don't map cleanly onto `Plan/Apply/Destroy/Status`, the fallback is folding into `ResourceDriver` (Alternatives #1). +1. **A new `PlatformBackend` contract would be over-general for one backend.** Settled: Architecture §2 does **not** propose a new contract — it gates `gke`'s cross-process mechanism on an interface-audit spike whose *preferred* outcome is folding into the existing `ResourceDriver`. The dedicated-contract option is the explicit fallback, not the plan. +2. **Clean provider-separability (Assumption 2) could be fragile.** Low-risk post-cycle-9: three SDK-free `kubernetesBackend` impls already coexist behind the interface. The spike confirms formally before the Phase C `gke` work. 3. **The state-backend benchmark could come back "streaming required"** and reshape the `IaCStateBackend` proto. Mitigation: benchmark is a Phase A task ordered *before* the proto lock. -4. **The inventory could be stale again** — cycle 8 caught exactly this (the design predated #653). Mitigation: every file/symbol claim in this revision is grep-verified against `origin/main` HEAD, the worktree is confirmed 0 commits behind `origin/main`, and `scripts/audit-cloud-symbols.sh` (Phase 0 task 1) makes the inventory a CI-enforced build artifact from Phase 0 onward — not a prose claim that can rot. +4. **The inventory could be stale or comment-fooled again** — cycles 8 and 9 both hit exactly this (cycle 8: predated #653; cycle 9: comment-only false positives in `platform_kubernetes_kind.go` *and* `nosql_dynamodb.go`). 
Mitigation: `scripts/audit-cloud-symbols.sh` is committed *with this design*, parses `import (...)` blocks (never comments), and its output — not prose — populates every file table above. From Phase 0 it runs in CI on every phase PR. The defect class is now structurally closed, not just patched. ## Open items deferred to writing-plans -- Exact proto field layouts for both new contracts (sketches above are directional; field-level layout follows the benchmark + interface-audit results). -- Whether `PlatformBackend` ships as designed or folds into `ResourceDriver` (gated on the `kubernetesBackend` interface-audit spike — Alternatives Considered #1). +- Exact proto field layouts for `IaCStateBackend` (sketch above is directional; field-level layout follows the benchmark). +- The `gke` cross-process contract decision (`ResourceDriver` fold / plugin-native / minimal new contract) — gated on the `kubernetesBackend` interface-audit spike, a Phase C precursor task. - Benchmark harness location + the concrete acceptance threshold (p99 added latency bar). - Exact wording of the secret-redaction extension + whether existing redaction already covers `credentials:` keys. - The `s3compat` shared-module cleanup (Alternatives Considered #3) — logged as a post-extraction follow-up candidate. diff --git a/scripts/audit-cloud-symbols.sh b/scripts/audit-cloud-symbols.sh new file mode 100755 index 00000000..94355957 --- /dev/null +++ b/scripts/audit-cloud-symbols.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# audit-cloud-symbols.sh — authoritative cloud-SDK ownership map for module/. +# +# Drafted as a verification aid for the cloud-SDK-extraction design +# (docs/plans/2026-05-14-cloud-sdk-extraction-design.md); formalized and +# extended in that design's Phase 0. It exists because four review cycles +# kept finding hand-maintained inventory claims wrong — the recurring +# defect was grep matching SDK names inside doc comments, not real imports. 
+# +# This script answers, mechanically and comment-immune: +# 1. Which module/*.go files have a REAL import of each in-scope SDK tree +# (parsed from the `import (...)` block — never from comments). +# 2. Which files name an SDK only in comments (false positives to ignore). +# 3. Whether cloud_account_aws_creds.go still imports aws-sdk-go-v2 +# (Phase B rewrite invariant: must be zero post-rewrite). +# 4. Whether any single file's import block mixes core-staying and +# plugin-bound concerns (advisory — full init()-partition check is a +# Phase 0 extension). +# +# Exit non-zero if invoked with --check and an invariant is violated. + +set -euo pipefail + +cd "$(dirname "$0")/.." + +SDK_TREES=( + 'aws-sdk-go-v2' + 'azure-sdk-for-go' + 'cloud.google.com/go' + 'google.golang.org/api' +) + +# Extract just the Go import block of a file (handles single `import (...)`). +import_block() { + awk '/^import \(/{f=1} f{print} /^\)/{if(f)exit}' "$1" +} + +real_import() { # file, sdk → 0 if sdk appears in the import block + import_block "$1" | grep -q "$2" +} + +CHECK=0 +[[ "${1:-}" == "--check" ]] && CHECK=1 +FAIL=0 + +echo "== Cloud-SDK real-import map (module/, *_test.go excluded) ==" +for sdk in "${SDK_TREES[@]}"; do + echo + echo "### $sdk" + # Every file that names the SDK anywhere (import or comment): + while IFS= read -r f; do + [[ -z "$f" ]] && continue + if real_import "$f" "$sdk"; then + echo " REAL $f" + else + echo " comment-only $f (false positive — ignore)" + fi + done < <(grep -rl "$sdk" module --include='*.go' | grep -v '_test\.go' | sort) +done + +echo +echo "== Invariant: cloud_account_aws_creds.go imports of aws-sdk-go-v2 ==" +CREDS=module/cloud_account_aws_creds.go +if [[ -f "$CREDS" ]]; then + n=$(import_block "$CREDS" | grep -c 'aws-sdk-go-v2' || true) + echo " aws-sdk-go-v2 import lines in $CREDS: $n" + echo " (pre-extraction: nonzero is expected; Phase B rewrite invariant: MUST be 0)" + # Only enforced once the design's Phase B marker file exists. 
+ if [[ $CHECK -eq 1 && -f .phase-b-complete && $n -ne 0 ]]; then + echo " INVARIANT VIOLATED: cloud_account_aws_creds.go still imports aws-sdk-go-v2 post-Phase-B" + FAIL=1 + fi +fi + +echo +echo "== Advisory: platform_kubernetes_kind.go backend split readiness ==" +KIND=module/platform_kubernetes_kind.go +if [[ -f "$KIND" ]]; then + echo " backend types: $(grep -cE '^type .*[Bb]ackend struct' "$KIND") (expect kind/eksError/gke/aks pre-Phase-0)" + echo " shared init(): $(grep -c '^func init()' "$KIND") (expect 1 pre-Phase-0; 0 here post-split — each _provider.go gets its own)" + echo " real SDK imports here:" + for sdk in "${SDK_TREES[@]}"; do + real_import "$KIND" "$sdk" && echo " REAL: $sdk" + done +fi + +echo +if [[ $CHECK -eq 1 ]]; then + [[ $FAIL -eq 0 ]] && echo "audit-cloud-symbols: OK" || { echo "audit-cloud-symbols: FAIL"; exit 1; } +else + echo "audit-cloud-symbols: report-only (pass --check to enforce invariants)" +fi From 264791bc35fe2b43f2affad59f92a02b9c55b348 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 01:55:59 -0400 Subject: [PATCH 12/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20cycle-10=20re-baseline,=20AWS=20scope=20bou?= =?UTF-8?q?ndary=20explicit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cycle-10 adversarial review caught Assumption 6 as false: the cycle-9 audit script scanned only module/, missing five aws-sdk-go-v2 importers under provider/aws/, plugin/rbac/, iam/, artifact/. The design's Goal ("go.mod drops aws-sdk-go-v2 entirely") was therefore unachievable by the four phases as written. Structural fix — third defect-class variant closed: - audit-cloud-symbols.sh now scans the WHOLE REPO (not just module/) and splits results module/ vs. elsewhere. Comment-immune (cycle 9) + scope-complete (cycle 10) + CI-enforced (Phase 0). 
Whole-repo inventory result: - Azure + GCP SDK usage is entirely module/-resident → Phases A and C drop those trees from go.mod ENTIRELY (whole-graph go list -deps gate). - aws-sdk-go-v2 is split: 5 module/ files (in scope, Phase B) + 6 files in provider/aws/, plugin/rbac/aws.go, iam/aws.go, artifact/s3.go. Scope decision: the out-of-module/ AWS surface is exactly #653's deliberately-retained "RBAC/secrets/artifact stay" scope (plus the provider/aws deploy provider). This design does NOT unilaterally override #653's recent documented decision — it scopes that surface OUT (new Non-Goal, parallel to godo) and logs a recommended successor issue. Consequences threaded through the doc: - Goals section is now asymmetric: Azure/GCP full go.mod removal; AWS is module/-scoped removal (aws-sdk-go-v2 stays in go.mod for the out-of-scope surface). - Phase C CI gate is asymmetric: whole-graph zero for Azure/GCP, module/-scoped zero for AWS. - Assumption 6 rewritten to the verified truth; Assumption 7 notes #653's scope decision is respected, not contested. - Minors: I2 (awsRoleARNResolver rewrite — non-SDK required-check + sessionName extraction sit between declared-input recording and the SDK block; spelled out), M1 (Phase A also fixes iac_module.go's stale line-18 backend-list comment), M2 (internal/legacyaws noted). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 46 +++++++++++-------- scripts/audit-cloud-symbols.sh | 36 +++++++++------ 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index 77e8dd1d..35055e64 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -1,7 +1,7 @@ # Cloud-SDK Extraction: workflow core → strict-contract plugins **Date:** 2026-05-14 -**Status:** Design — re-baselined after adversarial review cycle 9 (inventory regenerated by `scripts/audit-cloud-symbols.sh`, import-block-disciplined) +**Status:** Design — re-baselined after adversarial review cycle 10 (inventory regenerated whole-repo by `scripts/audit-cloud-symbols.sh`; AWS scope boundary made explicit) **Owner:** autonomous pipeline (workflow#TBD) ## Relationship to issue #653 (predecessor — read this first) @@ -16,34 +16,40 @@ Consequence: files this design's *earlier drafts* (cycles 1–7) enumerated — ## Inventory provenance — `scripts/audit-cloud-symbols.sh` -Cycles 2–9 each found a hand-maintained file/symbol claim wrong. The recurring defect was *grep matching an SDK name inside a doc comment, not a real import* (cycle 9 caught `platform_kubernetes_kind.go`; the same pass found `nosql_dynamodb.go` is also comment-only). The fix is structural: a script — `scripts/audit-cloud-symbols.sh`, committed alongside this design and formalized/extended in Phase 0 — parses the Go `import (...)` block of every `module/*.go` file and emits the comment-immune real-import map. **Every file claim below is that script's output, not a prose assertion.** Current output: +Cycles 2–10 each found a hand-maintained file/symbol claim wrong. 
Two distinct defect classes recurred: (a) *grep matching an SDK name inside a doc comment, not a real import* (cycle 9: `platform_kubernetes_kind.go` + `nosql_dynamodb.go`); (b) *a survey scoped too narrowly* — cycle 10 found the cycle-9 fix only scanned `module/` and missed six `aws-sdk-go-v2` importers under `provider/`, `plugin/rbac/`, `iam/`, `artifact/`. The fix is structural: `scripts/audit-cloud-symbols.sh` (committed alongside this design, formalized in Phase 0) parses the Go `import (...)` block of every `*.go` file **repo-wide** (`*_test.go` excluded) and emits the comment-immune, scope-complete real-import map, split by directory. **Every file claim below is that script's output, not a prose assertion.** Current output: -| SDK tree | Real-import files (`module/`, `*_test.go` excluded) | comment-only (false positives — stay in core) | -|----------|------------------------------------------------------|-----------------------------------------------| -| `github.com/aws/aws-sdk-go-v2/*` | **5** — `cloud_account_aws.go`, `cloud_account_aws_creds.go`, `iac_state_spaces.go`, `pipeline_step_s3_upload.go`, `s3_storage.go` | `nosql_dynamodb.go`, `storage_artifact_s3.go` | -| `github.com/Azure/azure-sdk-for-go/sdk/*` | **2** — `iac_module.go`, `iac_state_azure.go` | `cloud_account_azure.go`, `platform_kubernetes_kind.go` | -| `cloud.google.com/go/storage` + `google.golang.org/api/*` | **3** — `iac_state_gcs.go`, `storage_gcs.go`, `platform_kubernetes_kind.go` (the `gkeBackend`) | — | +| SDK tree | In `module/` (this design's scope) | Elsewhere (OUT of scope — see Non-Goals) | comment-only (false positives) | +|----------|------------------------------------|------------------------------------------|--------------------------------| +| `github.com/aws/aws-sdk-go-v2/*` | **5** — `cloud_account_aws.go`, `cloud_account_aws_creds.go`, `iac_state_spaces.go`, `pipeline_step_s3_upload.go`, `s3_storage.go` | **6** — `artifact/s3.go`, `iam/aws.go`, `plugin/rbac/aws.go`, 
`provider/aws/{clients,deploy,plugin}.go` | `module/nosql_dynamodb.go`, `module/storage_artifact_s3.go` | +| `github.com/Azure/azure-sdk-for-go/sdk/*` | **2** — `iac_module.go`, `iac_state_azure.go` | **0** — Azure SDK is entirely `module/`-resident | `module/cloud_account_azure.go`, `module/platform_kubernetes_kind.go` | +| `cloud.google.com/go/storage` + `google.golang.org/api/*` | **3** — `iac_state_gcs.go`, `storage_gcs.go`, `platform_kubernetes_kind.go` (`gkeBackend`) | **0** — GCP SDK is entirely `module/`-resident | — | -Key facts this regenerated inventory establishes (all verified against `origin/main` HEAD; the worktree is 0 commits behind `origin/main`): +**The AWS surface is split; Azure and GCP are not.** This drives the asymmetric Goal below: extracting the `module/` surface drops `Azure/azure-sdk-for-go` and the GCP trees from `go.mod` **entirely**, but `aws-sdk-go-v2` **remains** in `go.mod` after Phase B because `provider/aws/`, `plugin/rbac/aws.go`, `iam/aws.go`, and `artifact/s3.go` still import it — that surface is the deliberately-retained "RBAC/secrets/artifact stay" scope from #653 (plus the `provider/aws` deploy provider), and this design scopes it **out**, parallel to `godo`. See Non-Goals. + +Other key facts this regenerated inventory establishes (all verified against `origin/main` HEAD; the worktree is 0 commits behind `origin/main`): - **`aksBackend` is SDK-free.** `platform_kubernetes_kind.go`'s azure-sdk match is a *stale doc comment* (line 332); `aksBackend.azureToken()` is a plain `net/http` OAuth2 client-credentials POST against `login.microsoftonline.com`. No `azure-sdk-for-go` import. It stays in core, exactly like `kindBackend` and `eksErrorBackend`. -- **The only SDK-bearing `platform.*` backend is `gkeBackend`** (`google.golang.org/api/container/v1`). `kind`/`k3s`/`eks`/`aks` are all SDK-free. 
-- **`nosql_dynamodb.go` and `storage_artifact_s3.go` are comment-only stubs** (in-memory / filesystem fallbacks; the SDK names appear only in "Full implementation would use…" comments). They carry no cloud SDK and are **out of scope** — they stay in core untouched. +- **The only SDK-bearing `platform.*` backend is `gkeBackend`** (`google.golang.org/api/container/v1`). `kind`/`k3s`/`eks`/`aks` are all SDK-free. (`platform_kubernetes_kind.go` also imports `internal/legacyaws` — a #653 stdlib-only artifact backing the `eks` stub; not a cloud SDK, not in scope.) +- **`nosql_dynamodb.go` and `storage_artifact_s3.go` are comment-only stubs** (in-memory / filesystem fallbacks; the SDK names appear only in "Full implementation would use…" comments). They carry no cloud SDK and stay in core untouched. - `cloud_account_azure.go` / `cloud_account_gcp.go` are already SDK-free declare-don't-resolve resolver files. Only the AWS credential resolvers carry SDK. ## Problem Every dependabot bump of a cloud SDK churns workflow core's `go.sum`, inflates the binary, and couples core release cadence to vendor SDK release cadence. The `workflow-plugin-{aws,azure,gcp,digitalocean}` plugins already exist and already carry these SDKs for their IaC *resource provider* role — core's direct usage is redundant surface. workflow#617 removed the DO IaC *resource* path + `godo`; #653 removed the AWS IaC *resource* modules + `platform/providers/aws/`. This design extends the same principle to the *remaining* cloud functionality neither extraction touched: IaC **state backends**, the one managed-Kubernetes **platform** backend that still carries an SDK (`gke`), and two standalone S3/GCS modules + one step. -**A fourth tree — `github.com/digitalocean/godo` — is still in core but out of scope here.** `module/cloud_account_do.go` + the `module/platform_do_*.go` files still import `godo`. 
The user's ask scoped this work to three SDK trees (aws/azure/gcp); `godo` extraction is a structurally-identical follow-up. Consequence: the `go list -deps` CI gate added in the final phase asserts **zero `aws-sdk-go-v2` / `azure-sdk-for-go` / `cloud.google.com` / `google.golang.org/api` packages** — it does *not* assert "zero cloud SDKs" while `godo` remains. +**A fourth tree — `github.com/digitalocean/godo` — is still in core but out of scope here.** `module/cloud_account_do.go` + the `module/platform_do_*.go` files still import `godo`. The user's ask scoped this work to three SDK trees (aws/azure/gcp); `godo` extraction is a structurally-identical follow-up. The final-phase CI gate is therefore scoped (see Goals for the AWS asymmetry): it asserts whole-graph zero for `azure-sdk-for-go` / `cloud.google.com` / `google.golang.org/api`, and `module/`-scoped zero for `aws-sdk-go-v2` — it does *not* assert "zero cloud SDKs" while `godo` and the out-of-scope AWS surface remain. ## Goals -- workflow core `go.mod` drops `aws-sdk-go-v2/*`, `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, `google.golang.org/api/*` (the three in-scope trees) **entirely** — verified by a `go list -deps` gate in the final phase's CI asserting zero packages from those three trees. `godo` is out of scope. +The Goal is **asymmetric** by SDK tree, because the inventory above shows AWS SDK usage straddles `module/` and out-of-scope packages while Azure and GCP are `module/`-only: + +- **Azure + GCP — full removal.** workflow core `go.mod` drops `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, and `google.golang.org/api/*` **entirely**. Verified by a `go list -deps ./...` gate in the final phase's CI asserting zero packages from those trees anywhere in core's build graph. +- **AWS — `module/` surface removal.** Phase B removes every `aws-sdk-go-v2` import from `module/` (the IaC-state store + the S3 modules/step + the dead `cloud_account_aws.go` + the resolver SDK tails). 
`aws-sdk-go-v2` **remains a `go.mod` entry** afterward because `provider/aws/`, `plugin/rbac/aws.go`, `iam/aws.go`, `artifact/s3.go` still import it — that surface is scoped out (see Non-Goals). The Phase C gate therefore asserts, for AWS, **zero `aws-sdk-go-v2` imports under `module/`** (`scripts/audit-cloud-symbols.sh --check`), not zero in the whole graph. This still delivers the user's "external libs out of the workflow module" intent for the `module/` package and removes `service/s3` as a *direct* `go.mod` require once the S3 modules leave. - Cloud functionality remains available, loaded via strict-contract gRPC plugins (the existing sidecar model). - The SDK-free Kubernetes backends — `kind`, `k3s`, `eks` (a #653 migration-error stub), **and `aks`** (a `net/http` OAuth2 client) — stay in core unchanged. Local-dev/test paths must not require a plugin. ## Non-Goals +- **The out-of-`module/` AWS SDK surface — `provider/aws/`, `plugin/rbac/aws.go`, `iam/aws.go`, `artifact/s3.go`.** These six files import `aws-sdk-go-v2` (cloudwatch/ecs/eks/iam/config/credentials/sts) and are **explicitly out of scope**, parallel to `godo`. Rationale: `plugin/rbac/aws.go` + `iam/aws.go` + `artifact/s3.go` are precisely the "RBAC/secrets/artifact stay" surface issue **#653 deliberately retained in core**; `provider/aws/` is the AWS deploy provider (referenced by `cmd/server/main.go`, `api/router.go`). Overriding #653's recent, documented scope decision is not something this design should do unilaterally. Consequence: `aws-sdk-go-v2` stays in `go.mod` after all four phases. **Recommended follow-up** (logged, not actioned here): a separate successor issue — like this design is #653's successor — to extract the AWS RBAC/IAM/artifact/deploy-provider surface into `workflow-plugin-aws` / a security plugin, at which point `aws-sdk-go-v2` can leave `go.mod` entirely. - Re-homing the IaC *resource provider* contract (`IaCProviderRequired`) — already extracted (#617, #653). 
- Changing how plugins are discovered/installed (`wfctl plugin install` flow unchanged). - Backwards-compatible yaml — this is a **clean break** with a migration guide (per #617 / #653 precedent). @@ -134,7 +140,7 @@ The AWS credential resolvers (`cloud_account_aws_creds.go`) split two ways. `aws The model: make **every** in-core resolver uniformly *declare, don't resolve*. Phase B **rewrites** the two SDK-bearing AWS resolver bodies — a deliberate `Resolve()` body rewrite, **not** a one-line "snip the tail": - `awsProfileResolver.Resolve` — its SDK calls are a clean contiguous tail after the marker-record (`m.creds.Extra["profile"] = profile`); the rewrite ends the method right after the marker-record. -- `awsRoleARNResolver.Resolve` — the SDK block (base-config build + `sts.NewFromConfig` + `AssumeRole` + result-record) is contiguous *after* the declared-input recording but is the **larger half** of the method. The rewrite **deletes that entire block** and ends the method after the declared-input recording + a `credential_source` marker. +- `awsRoleARNResolver.Resolve` — the SDK block (base-config build + `sts.NewFromConfig` + `AssumeRole` + result-record) is the **larger half** of the method. It is *not* perfectly contiguous after the declared-input recording: between the recording (`m.creds.RoleARN`, `Extra["external_id"]`) and the SDK block sit two non-SDK statements — the `roleARN == ""` required-check (`return fmt.Errorf(...)`) and the `sessionName` extraction. The rewrite **keeps the required-check, drops the `sessionName` extraction** (only the SDK `AssumeRole` consumed it), **deletes the SDK block**, and ends the method after the declared-input recording + a `credential_source` marker. Writing-plans spells out the exact line dispositions. After both rewrites, `cloud_account_aws_creds.go` imports **no** `aws-sdk-go-v2` package (the 4 SDK imports — `aws`, `config`, `credentials`, `sts` — are used *only* by those two resolver bodies) and **stays in core**. 
**Phase B CI invariant:** `scripts/audit-cloud-symbols.sh --check` (with the Phase-B marker present) asserts `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports post-rewrite. @@ -170,7 +176,7 @@ Cycles 2–9 each found a hand-maintained ownership claim wrong — cycle 8 (who - `module/platform_kubernetes_kind.go` holds **four** `kubernetesBackend` impls behind **one shared `init()`** registering `kind`/`k3s`/`eks`/`gke`/`aks` (verified). Of the four impls, only `gkeBackend` carries a cloud SDK. Phase 0 splits this file; the `init()` must be partitioned so no `init()` registers both a core-staying and a plugin-bound factory. - `cloud_account_aws.go` is **dead code, deleted outright by Phase B** — `AWSConfig()` / `AWSConfigProvider` / `ValidateCredentials`, all pure SDK, with **zero non-test consumers verified**. No consumer rewrite. `parseStringSlice` and `safeIntToInt32` (referenced by earlier drafts) **no longer exist anywhere in `module/`** — there is no shared-helper-relocation work in this design. -**The script's checks:** (a) the comment-immune real-import map per SDK tree; (b) `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports post-Phase-B (gated on a `.phase-b-complete` marker); (c) advisory readout of `platform_kubernetes_kind.go`'s backend-type count and shared-`init()` count for the Phase 0 split. Phase 0 extends it with the full `init()`-partition assertion. +**The script's checks:** (a) the comment-immune, **whole-repo** real-import map per SDK tree, split into `module/` (in-scope) vs. elsewhere (out-of-scope — `provider/`, `plugin/rbac/`, `iam/`, `artifact/`); (b) `cloud_account_aws_creds.go` has zero `aws-sdk-go-v2` imports post-Phase-B (gated on a `.phase-b-complete` marker); (c) advisory readout of `platform_kubernetes_kind.go`'s backend-type count and shared-`init()` count for the Phase 0 split. Phase 0 extends it with the full `init()`-partition assertion. 
## Phase 0 — precursor: split the one remaining mixed cloud-backend file @@ -201,7 +207,7 @@ Each phase is one workflow-core PR (deleting/editing files + wiring the contract - Add the `IaCStateBackend` service to `plugin/external/proto/iac.proto`. - Add the secret-redaction task + the gRPC-interceptor guard test (blocking). - workflow-plugin-azure implements `azure_blob` `IaCStateBackend`. -- Core PR: delete `iac_state_azure.go`; strip the `azure_blob` case + `newAzureSharedKeyCredential` wrapper from `iac_module.go`. **This (the deletion + the `iac_module.go` edit — the only two real azure-sdk importers) is what drops `Azure/azure-sdk-for-go` from `go.mod`.** Phase A touches **no `platform.*` file** — `aksBackend` is SDK-free and stays in core. +- Core PR: delete `iac_state_azure.go`; strip the `azure_blob` case + `newAzureSharedKeyCredential` wrapper from `iac_module.go`; while editing `iac_module.go`, also correct its stale line-18 doc comment ("Supported backends: 'memory' … 'filesystem' … 'spaces'" — the switch actually handles six). **The deletion + the `iac_module.go` edit — the only two real azure-sdk importers — is what drops `Azure/azure-sdk-for-go` from `go.mod` entirely.** Phase A touches **no `platform.*` file** — `aksBackend` is SDK-free and stays in core. **Phase B — AWS.** Inventory + destination (authoritative list = `audit-cloud-symbols.sh` output): @@ -221,7 +227,7 @@ Each phase is one workflow-core PR (deleting/editing files + wiring the contract - Run the `kubernetesBackend` interface-audit spike (Architecture §2) — picks the `gke` cross-process contract: `ResourceDriver` fold (preferred), plugin-native `kubernetesBackend`, or a minimal new contract (fallback). Locks that decision *before* the gcp plugin work. - workflow-plugin-gcp implements `IaCStateBackend` (`gcs`), the chosen `gke` contract, and plugin-native `storage.gcs`. 
- Core PR: delete `iac_state_gcs.go`; delete `storage_gcs.go` (drop the entry from `plugins/storage/plugin.go`); delete `platform_kubernetes_gke.go` (the Phase-0 split file) and wire its `gke` dispatch; strip the `gcs` case from `iac_module.go`; drop `cloud.google.com/go` + `google.golang.org/api`. -- After Phase C, `go list -deps ./...` shows zero packages from the three in-scope SDK trees — the permanent CI gate is added here. (`godo` remains — out of scope.) +- **Permanent CI gate added here, asymmetric per the Goals section:** (a) `go list -deps ./...` asserts **zero** `Azure/azure-sdk-for-go` and **zero** `cloud.google.com/go` / `google.golang.org/api` packages anywhere in core's build graph — Azure and GCP are fully gone; (b) `scripts/audit-cloud-symbols.sh --check` asserts **zero** `aws-sdk-go-v2` imports under `module/` — AWS is gone from the `module/` package, but `aws-sdk-go-v2` remains a `go.mod` entry for the out-of-scope `provider/aws/` / `plugin/rbac/` / `iam/` / `artifact/` surface (Non-Goals). `godo` remains — out of scope. **Phase D — DigitalOcean (`spaces` clean-break):** - workflow-plugin-digitalocean implements `IaCStateBackend` for `spaces` (S3-compatible — pulls `aws-sdk-go-v2/service/s3`, the one service package, not the whole tree). @@ -245,8 +251,8 @@ Published in each plugin's CHANGELOG + a consolidated `docs/migrations/2026-05-1 3. **Plugins may ship ahead of core.** A plugin implementing `IaCStateBackend` against the published proto is harmless to load on a core version that doesn't yet dispatch to it — the contract is additive. 4. **`aws-sdk-go-v2/service/s3` in workflow-plugin-digitalocean is acceptable.** DO Spaces is S3-API; there is no godo-native Spaces client. One AWS service package is the minimal cost of self-contained `spaces` state support. 5. 
**The credential resolvers can all be made SDK-free in-core.** `cloud_account_azure.go` / `cloud_account_gcp.go` are *already* SDK-free (verified); `awsStaticResolver`/`awsEnvResolver` are already SDK-free; `awsProfileResolver`/`awsRoleARNResolver` become SDK-free once their SDK blocks are rewritten out (Phase B). Load-bearing claim: a resolver does not *need* to resolve in-core — for deferred types it records declared inputs + an `Extra["credential_source"]` marker, and the plugin honors the marker. -6. **No core code outside `module/` imports these SDKs.** Verified: the only real imports are under `module/`. A `go list -deps` CI gate in Phase C enforces this permanently. -7. **#653 is final and merged.** This design builds on `origin/main` post-#653 (issue closed, all three PRs merged). A stable foundation. +6. **Azure and GCP SDK usage is `module/`-resident; AWS is not.** Verified whole-repo by `scripts/audit-cloud-symbols.sh`: `Azure/azure-sdk-for-go`, `cloud.google.com/go`, and `google.golang.org/api` real imports appear **only** under `module/` — so extracting the `module/` surface drops them from `go.mod` entirely. `aws-sdk-go-v2`, by contrast, is *also* imported by `provider/aws/`, `plugin/rbac/aws.go`, `iam/aws.go`, `artifact/s3.go` — the #653-retained surface this design scopes out (Non-Goals). The load-bearing consequence: the Phase C `go list -deps` gate can assert *whole-graph* zero for Azure/GCP but only *`module/`-scoped* zero for AWS. (An earlier draft of this design asserted "no core code outside `module/` imports these SDKs" — cycle 10 proved that false for AWS; this assumption now states the verified truth.) +7. 
**#653 is final and merged, and its "RBAC/secrets/artifact stay" scope decision is respected.** This design builds on `origin/main` post-#653 (issue closed, all three PRs merged) and does **not** override #653's decision to keep the AWS RBAC/IAM/artifact surface in core — it scopes that surface out and logs a successor follow-up (Non-Goals). A stable foundation; no contested scope. ## Rollback @@ -259,7 +265,7 @@ This design changes **plugin loading paths** and **go.mod dependency trees** — ## Alternatives Considered 1. **Fold the `gke` provisioner into `ResourceDriver` instead of inventing `PlatformBackend`.** This is no longer an "alternative" — post-cycle-9 it is the **preferred path** inside Architecture §2 (the spike's option 1). A GKE cluster is structurally a managed resource; with only one SDK-bearing platform backend, a dedicated new contract is YAGNI. Retained here only to record that the dedicated-contract option was considered and demoted to a gated fallback. -2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the Goal from "core drops `aws-sdk-go-v2/*` entirely" to "keeps one S3 client." **Rejected** — leaves dependabot churning one AWS package indefinitely and weakens the `go list -deps` gate. The cost (both aws + DO plugins carry an S3 client) is real but bounded. +2. **Leave `iac_state_spaces.go` in core, accept one `aws-sdk-go-v2/service/s3` dependency.** Downgrades the AWS Goal from "zero `aws-sdk-go-v2` under `module/`" to "keeps one S3 client in `module/`" — and would defeat the Phase C `module/`-scoped audit-script gate. **Rejected** — leaves an S3 client in the `module/` package the design is explicitly clearing, and `iac_state_spaces.go` is the one store backing both `s3` and `spaces` so it has to move for Phase D anyway. The cost (both aws + DO plugins carry an S3 client) is real but bounded. 3. 
**A shared `s3compat` Go module consumed by both the aws and DO plugins.** **Deferred, not rejected:** a *plugin-side* optimisation that doesn't affect the core contract or any phase boundary — lands as a follow-up after the extraction is proven. 4. **In-process Go-module plugin loading (build-tag imports) instead of gRPC sidecars.** Rejected in brainstorm by explicit user decision — strict gRPC sidecar model only. 5. **Wait for / extend #653 to also extract state backends + `gke`.** #653's issue is closed with an explicit scope boundary. Extending a closed issue rather than opening a clearly-scoped successor would muddy the audit trail. **Rejected** — this design is the named successor. @@ -269,7 +275,7 @@ This design changes **plugin loading paths** and **go.mod dependency trees** — 1. **A new `PlatformBackend` contract would be over-general for one backend.** Settled: Architecture §2 does **not** propose a new contract — it gates `gke`'s cross-process mechanism on an interface-audit spike whose *preferred* outcome is folding into the existing `ResourceDriver`. The dedicated-contract option is the explicit fallback, not the plan. 2. **Clean provider-separability (Assumption 2) could be fragile.** Low-risk post-cycle-9: three SDK-free `kubernetesBackend` impls already coexist behind the interface. The spike confirms formally before the Phase C `gke` work. 3. **The state-backend benchmark could come back "streaming required"** and reshape the `IaCStateBackend` proto. Mitigation: benchmark is a Phase A task ordered *before* the proto lock. -4. **The inventory could be stale or comment-fooled again** — cycles 8 and 9 both hit exactly this (cycle 8: predated #653; cycle 9: comment-only false positives in `platform_kubernetes_kind.go` *and* `nosql_dynamodb.go`). Mitigation: `scripts/audit-cloud-symbols.sh` is committed *with this design*, parses `import (...)` blocks (never comments), and its output — not prose — populates every file table above. 
From Phase 0 it runs in CI on every phase PR. The defect class is now structurally closed, not just patched. +4. **The inventory could be stale, comment-fooled, or scope-clipped again** — cycles 8, 9, and 10 each hit a variant (cycle 8: predated #653; cycle 9: comment-only false positives; cycle 10: a `module/`-only survey missed `aws-sdk-go-v2` importers in `provider/`, `plugin/rbac/`, `iam/`, `artifact/`). Mitigation: `scripts/audit-cloud-symbols.sh` is committed *with this design*, parses `import (...)` blocks (never comments), scans **the whole repo** (not just `module/`), and splits results by directory so the in-scope/out-of-scope boundary is mechanical. Its output — not prose — populates every file table above. From Phase 0 it runs in CI on every phase PR. All three defect-class variants are now structurally closed: comment-immune (parser), scope-complete (whole-repo), and rot-proof (CI-enforced). ## Open items deferred to writing-plans diff --git a/scripts/audit-cloud-symbols.sh b/scripts/audit-cloud-symbols.sh index 94355957..e9d31ef4 100755 --- a/scripts/audit-cloud-symbols.sh +++ b/scripts/audit-cloud-symbols.sh @@ -1,21 +1,23 @@ #!/usr/bin/env bash -# audit-cloud-symbols.sh — authoritative cloud-SDK ownership map for module/. +# audit-cloud-symbols.sh — authoritative cloud-SDK ownership map for the +# WHOLE workflow-core repo (not just module/). # # Drafted as a verification aid for the cloud-SDK-extraction design # (docs/plans/2026-05-14-cloud-sdk-extraction-design.md); formalized and -# extended in that design's Phase 0. It exists because four review cycles -# kept finding hand-maintained inventory claims wrong — the recurring -# defect was grep matching SDK names inside doc comments, not real imports. +# extended in that design's Phase 0. 
It exists because review cycles kept +# finding hand-maintained inventory claims wrong — first grep matching SDK +# names inside doc comments (cycle 9), then a survey scoped to module/ that +# missed aws-sdk importers in provider/, plugin/rbac/, iam/, artifact/ +# (cycle 10). This script is comment-immune AND whole-repo by construction. # -# This script answers, mechanically and comment-immune: -# 1. Which module/*.go files have a REAL import of each in-scope SDK tree -# (parsed from the `import (...)` block — never from comments). +# This script answers, mechanically: +# 1. Which *.go files (repo-wide, *_test.go excluded) have a REAL import +# of each in-scope SDK tree (parsed from the `import (...)` block — +# never from comments), split into module/ vs. elsewhere. # 2. Which files name an SDK only in comments (false positives to ignore). # 3. Whether cloud_account_aws_creds.go still imports aws-sdk-go-v2 # (Phase B rewrite invariant: must be zero post-rewrite). -# 4. Whether any single file's import block mixes core-staying and -# plugin-bound concerns (advisory — full init()-partition check is a -# Phase 0 extension). +# 4. platform_kubernetes_kind.go backend-split readiness (advisory). # # Exit non-zero if invoked with --check and an invariant is violated. @@ -43,19 +45,25 @@ CHECK=0 [[ "${1:-}" == "--check" ]] && CHECK=1 FAIL=0 -echo "== Cloud-SDK real-import map (module/, *_test.go excluded) ==" +echo "== Cloud-SDK real-import map (WHOLE REPO, *_test.go excluded) ==" +echo " module/ = this design's IaC-state/platform/standalone scope" +echo " elsewhere = out-of-scope surface (see design Non-Goals): provider/," +echo " plugin/rbac/, iam/, artifact/ — the #653 'RBAC/secrets/artifact stay'" +echo " surface, parallel to godo." 
for sdk in "${SDK_TREES[@]}"; do echo echo "### $sdk" # Every file that names the SDK anywhere (import or comment): while IFS= read -r f; do [[ -z "$f" ]] && continue + loc="module/ " + [[ "$f" != ./module/* && "$f" != module/* ]] && loc="elsewhere" if real_import "$f" "$sdk"; then - echo " REAL $f" + echo " REAL [$loc] $f" else - echo " comment-only $f (false positive — ignore)" + echo " comment-only $f (false positive — ignore)" fi - done < <(grep -rl "$sdk" module --include='*.go' | grep -v '_test\.go' | sort) + done < <(grep -rl "$sdk" . --include='*.go' | grep -v '_test\.go' | sort) done echo From 1b8fb3881168efa5677204eafab28fdb4f8c3f8a Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 01:59:08 -0400 Subject: [PATCH 13/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20d?= =?UTF-8?q?esign=20=E2=80=94=20cycle-11=20PASS,=20minor=20cleanups?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adversarial review cycle 11: PASS (zero Critical, zero Important). Two Minor nits applied: - audit-cloud-symbols.sh: real_import now also matches single-line `import "..."` form, not just parenthesized blocks — closes the one latent parser false-negative the reviewer flagged. - §Goals: clarified that the module/-scoped AWS-zero `--check` assertion is deferred-implementation added in Phase C (the committed script only enforces the cloud_account_aws_creds.go post-Phase-B invariant today), parallel to the Phase 0 init()-partition deferral. Design phase complete — proceeding to writing-plans. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/plans/2026-05-14-cloud-sdk-extraction-design.md | 2 +- scripts/audit-cloud-symbols.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index 35055e64..d9a142f9 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -43,7 +43,7 @@ Every dependabot bump of a cloud SDK churns workflow core's `go.sum`, inflates t The Goal is **asymmetric** by SDK tree, because the inventory above shows AWS SDK usage straddles `module/` and out-of-scope packages while Azure and GCP are `module/`-only: - **Azure + GCP — full removal.** workflow core `go.mod` drops `Azure/azure-sdk-for-go/*`, `cloud.google.com/go/*`, and `google.golang.org/api/*` **entirely**. Verified by a `go list -deps ./...` gate in the final phase's CI asserting zero packages from those trees anywhere in core's build graph. -- **AWS — `module/` surface removal.** Phase B removes every `aws-sdk-go-v2` import from `module/` (the IaC-state store + the S3 modules/step + the dead `cloud_account_aws.go` + the resolver SDK tails). `aws-sdk-go-v2` **remains a `go.mod` entry** afterward because `provider/aws/`, `plugin/rbac/aws.go`, `iam/aws.go`, `artifact/s3.go` still import it — that surface is scoped out (see Non-Goals). The Phase C gate therefore asserts, for AWS, **zero `aws-sdk-go-v2` imports under `module/`** (`scripts/audit-cloud-symbols.sh --check`), not zero in the whole graph. This still delivers the user's "external libs out of the workflow module" intent for the `module/` package and removes `service/s3` as a *direct* `go.mod` require once the S3 modules leave. +- **AWS — `module/` surface removal.** Phase B removes every `aws-sdk-go-v2` import from `module/` (the IaC-state store + the S3 modules/step + the dead `cloud_account_aws.go` + the resolver SDK tails). 
`aws-sdk-go-v2` **remains a `go.mod` entry** afterward because `provider/aws/`, `plugin/rbac/aws.go`, `iam/aws.go`, `artifact/s3.go` still import it — that surface is scoped out (see Non-Goals). The Phase C gate therefore asserts, for AWS, **zero `aws-sdk-go-v2` imports under `module/`**, not zero in the whole graph. (This `module/`-scoped AWS-zero assertion is *added to* `scripts/audit-cloud-symbols.sh --check` in Phase C — the script as committed with this design only enforces the `cloud_account_aws_creds.go` post-Phase-B invariant; the broader inventory-drift gate is deferred-implementation, parallel to how Phase 0 adds the `init()`-partition assertion.) This still delivers the user's "external libs out of the workflow module" intent for the `module/` package and removes `service/s3` as a *direct* `go.mod` require once the S3 modules leave. - Cloud functionality remains available, loaded via strict-contract gRPC plugins (the existing sidecar model). - The SDK-free Kubernetes backends — `kind`, `k3s`, `eks` (a #653 migration-error stub), **and `aks`** (a `net/http` OAuth2 client) — stay in core unchanged. Local-dev/test paths must not require a plugin. 
diff --git a/scripts/audit-cloud-symbols.sh b/scripts/audit-cloud-symbols.sh index e9d31ef4..a8e70feb 100755 --- a/scripts/audit-cloud-symbols.sh +++ b/scripts/audit-cloud-symbols.sh @@ -37,8 +37,8 @@ import_block() { awk '/^import \(/{f=1} f{print} /^\)/{if(f)exit}' "$1" } -real_import() { # file, sdk → 0 if sdk appears in the import block - import_block "$1" | grep -q "$2" +real_import() { # file, sdk → 0 if sdk appears in a real import (block OR single-line) + { import_block "$1"; grep -E '^import "' "$1" 2>/dev/null; } | grep -q "$2" } CHECK=0 From a81f0b6ea4e45ce48cd59ab002d631e24e2a4d93 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 01:59:45 -0400 Subject: [PATCH 14/39] fix(scripts): audit-cloud-symbols single-line-import grep poisoned the pipe The cycle-11 single-line-import hardening added an inner `grep -E '^import "'` whose no-match exit 1 poisoned the `| grep -q` pipe under `set -o pipefail`, making real_import() return false for every file lacking a single-line import. Added `|| true` on the inner grep. Verified: full report restored, all REAL/comment-only classifications correct. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/audit-cloud-symbols.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/audit-cloud-symbols.sh b/scripts/audit-cloud-symbols.sh index a8e70feb..eade7e22 100755 --- a/scripts/audit-cloud-symbols.sh +++ b/scripts/audit-cloud-symbols.sh @@ -38,7 +38,9 @@ import_block() { } real_import() { # file, sdk → 0 if sdk appears in a real import (block OR single-line) - { import_block "$1"; grep -E '^import "' "$1" 2>/dev/null; } | grep -q "$2" + # `|| true` on the inner grep: a no-match exit 1 must not poison the pipe + # under `set -o pipefail`. 
+ { import_block "$1"; grep -E '^import "' "$1" 2>/dev/null || true; } | grep -q "$2" } CHECK=0 From 02fe6217a38e858aa318abe50b02674e6c3310a7 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:11:16 -0400 Subject: [PATCH 15/39] docs(plans): cloud-SDK extraction implementation plan (Phase 0 + Phase A) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bite-sized TDD plan for the first executable increment: Phase 0 (split platform_kubernetes_kind.go, fix the stale comment, wire the audit script into CI) + Phase A (IaCStateBackend proto + benchmark-gated proto-lock, host-side gRPC resolution, secret-redaction, gRPC-logging guard, workflow-plugin-azure implementation, core deletion dropping azure-sdk from go.mod). 14 tasks across 5 PRs. Phases B/C/D are explicitly scoped to a follow-on plan — their concrete tasks depend on Phase A's outputs (the benchmark-validated proto shape, the host-resolution pattern, the plugin-side serve path), so planning them now would be fiction. The design doc remains the authoritative B/C/D spec. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/plans/2026-05-14-cloud-sdk-extraction.md | 1135 +++++++++++++++++ 1 file changed, 1135 insertions(+) create mode 100644 docs/plans/2026-05-14-cloud-sdk-extraction.md diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md b/docs/plans/2026-05-14-cloud-sdk-extraction.md new file mode 100644 index 00000000..46e55e7f --- /dev/null +++ b/docs/plans/2026-05-14-cloud-sdk-extraction.md @@ -0,0 +1,1135 @@ +# Cloud-SDK Extraction Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Extract the Azure SDK (and establish the reusable `IaCStateBackend` gRPC contract + host-resolution pattern) out of workflow core's `module/` package into the `workflow-plugin-azure` sidecar, so `Azure/azure-sdk-for-go` drops from core's `go.mod` entirely. 
+ +**Architecture:** A new strict `IaCStateBackend` gRPC service is added to `plugin/external/proto/iac.proto`, mapping 1:1 onto the existing 6-method `module.IaCStateStore` interface. Core's `iac.state` module stays, but its hardcoded backend `switch` gains a path that resolves an `IaCStateBackend` gRPC client from a loaded plugin. Phase 0 is a mechanical precursor that splits the one remaining mixed cloud-backend file. Phase A implements the contract end-to-end for the `azure_blob` backend — the pattern every later phase reuses. + +**Tech Stack:** Go 1.26+, `buf` for proto generation, `hashicorp/go-plugin` gRPC sidecars, the `modular` framework, `superpowers:executing-plans` TDD loop. + +**Base branch:** main (worktree branch `feat/cloud-sdk-extraction` already carries the committed design doc + `scripts/audit-cloud-symbols.sh`) + +**Design:** `docs/plans/2026-05-14-cloud-sdk-extraction-design.md` (adversarial review PASS, cycle 11) + +--- + +## Scope Manifest + +**PR Count:** 5 +**Tasks:** 14 +**Estimated Lines of Change:** ~1800 (informational; not enforced) + +**Out of scope:** +- **Phases B (AWS), C (GCP), D (DigitalOcean)** — deferred to a follow-on plan authored *after* Phase A merges. Their concrete tasks genuinely depend on Phase A's outputs: the benchmark-validated `IaCStateBackend` proto shape, the host-side gRPC-client resolution pattern, and the plugin-side state-backend serve path. Planning them now would be fiction. The design (`docs/plans/2026-05-14-cloud-sdk-extraction-design.md`) is the authoritative spec for B/C/D; this plan delivers Phase 0 + Phase A, which the design explicitly designates as the "validates the contract end-to-end" increment. +- The out-of-`module/` AWS SDK surface (`provider/aws/`, `plugin/rbac/aws.go`, `iam/aws.go`, `artifact/s3.go`) — per design Non-Goals (the #653-retained "RBAC/secrets/artifact stay" surface). +- `github.com/digitalocean/godo` extraction — per design Non-Goals. 
+- `aws-sdk-go-v2/service/kinesis` — transitive via `modular`, per design Non-Goals. +- Touching the comment-only stubs `nosql_dynamodb.go` / `storage_artifact_s3.go` — they carry no SDK. +- Changing `wfctl plugin install` discovery flow. + +**PR Grouping:** + +| PR # | Title | Tasks | Branch | +|------|-------|-------|--------| +| 1 | Phase 0: split platform_kubernetes_kind.go + wire audit script into CI | Task 1, Task 2, Task 3 | feat/cloud-sdk-extraction-p0 | +| 2 | Phase A: IaCStateBackend proto + benchmark harness + proto lock | Task 4, Task 5, Task 6 | feat/cloud-sdk-extraction-pa-proto | +| 3 | Phase A: host-side IaCStateBackend resolution + secret-redaction + gRPC-logging guard | Task 7, Task 8, Task 9, Task 10 | feat/cloud-sdk-extraction-pa-host | +| 4 | Phase A: workflow-plugin-azure implements azure_blob IaCStateBackend | Task 11, Task 12 | (cross-repo: workflow-plugin-azure `feat/azure-blob-state-backend`) | +| 5 | Phase A: core deletes iac_state_azure.go + strips azure_blob case → drops azure-sdk from go.mod | Task 13, Task 14 | feat/cloud-sdk-extraction-pa-core | + +**Status:** Draft + +--- + +## Cross-repo note + +PR 4 lands in a **different repository** (`/Users/jon/workspace/workflow-plugin-azure`), not the `workflow` worktree. The executing pipeline must create a branch + PR there separately. PR 4's plugin release (a tagged version implementing the published proto) **must merge and tag before PR 5** — PR 5's core deletion makes `backend: azure_blob` fail to build unless the plugin version implementing `IaCStateBackend` is loadable. PRs 2 and 3 can land in either order relative to each other but both precede PR 4 (the plugin needs the published proto) and PR 5. + +--- + +## PR 1 — Phase 0: split `platform_kubernetes_kind.go` + wire audit script into CI + +Mechanical, behavior-equivalent precursor. 
After this PR, no `init()` registers both a core-staying and a plugin-bound Kubernetes backend, and the single SDK-bearing platform file (`platform_kubernetes_gke.go`) is isolated for a later clean deletion. + +### Task 1: Split `platform_kubernetes_kind.go` into `_core.go` + `_gke.go` + +**Files:** +- Create: `module/platform_kubernetes_core.go` +- Create: `module/platform_kubernetes_gke.go` +- Modify: `module/platform_kubernetes_kind.go` (becomes empty → delete) — `git rm` it at the end +- Test: `module/platform_kubernetes_test.go` (existing — must stay green; no new test file, this is pure code movement verified by the existing suite + build) + +**Step 1: Establish the baseline — run the existing suite green before touching anything** + +Run: `go test ./module/ -run 'Kubernetes|Platform' -v` +Expected: PASS (all existing kubernetes/platform tests green — this is the behavior-equivalence baseline) + +**Step 2: Create `module/platform_kubernetes_core.go`** + +Move into this new file, verbatim, from `platform_kubernetes_kind.go`: +- the `kindBackend` type + all its methods (`plan`/`apply`/`status`/`destroy` and helpers) +- the `eksErrorBackend` type + all its methods +- the `aksBackend` type + all its methods (incl. 
`azureToken`, `aksResourceGroup`, `aksLocation`, `aksSubscriptionID`, `buildAgentPools`) — `aksBackend` is SDK-free (`net/http` OAuth2), it stays in core +- a new `func init()` registering **only** the four core-staying names: + +```go +func init() { + RegisterKubernetesBackend("kind", func(_ map[string]any) (kubernetesBackend, error) { + return &kindBackend{}, nil + }) + RegisterKubernetesBackend("k3s", func(_ map[string]any) (kubernetesBackend, error) { + return &kindBackend{}, nil + }) + RegisterKubernetesBackend("eks", func(_ map[string]any) (kubernetesBackend, error) { + return &eksErrorBackend{}, nil + }) + RegisterKubernetesBackend("aks", func(_ map[string]any) (kubernetesBackend, error) { + return &aksBackend{}, nil + }) +} +``` + +The import block for `_core.go` is exactly the imports those three backends use: `bytes`, `context`, `encoding/json`, `fmt`, `io`, `net/http`, `net/url`, `strings`, `time`, and `github.com/GoCodeAlone/workflow/internal/legacyaws` (the `eksErrorBackend` stub dependency). **No `google.golang.org/api` import** — that belongs only in `_gke.go`. + +**Step 3: Create `module/platform_kubernetes_gke.go`** + +Move into this new file, verbatim, from `platform_kubernetes_kind.go`: +- the `gkeBackend` type + all its methods (`gkeLocation`, `gkeProject`, `plan`, `apply`, `status`, `destroy`, `containerService`) +- a new `func init()` registering **only** `gke`: + +```go +func init() { + RegisterKubernetesBackend("gke", func(_ map[string]any) (kubernetesBackend, error) { + return &gkeBackend{}, nil + }) +} +``` + +The import block for `_gke.go` is exactly what `gkeBackend` uses, including `container "google.golang.org/api/container/v1"` and `"google.golang.org/api/option"`. + +**Step 4: Delete the now-empty original file** + +Run: `git rm module/platform_kubernetes_kind.go` +(All four backend types + the old `init()` have been moved out; the file is empty.) + +**Step 5: Build + vet** + +Run: `go build ./... 
&& go vet ./module/...` +Expected: exit 0, no errors (pure code movement — every symbol still resolves, the SDK imports just live in different files) + +**Step 6: Run the kubernetes/platform suite — behavior equivalence** + +Run: `go test ./module/ -run 'Kubernetes|Platform' -v` +Expected: PASS — identical result to Step 1. The same five backend names (`kind`/`k3s`/`eks`/`gke`/`aks`) are registered after the split as before. + +**Step 7: Confirm the audit script sees the split correctly** + +Run: `bash scripts/audit-cloud-symbols.sh` +Expected: under "azure-sdk-for-go" the only `module/` entries are `iac_module.go` + `iac_state_azure.go` (both REAL) plus `cloud_account_azure.go` and `platform_kubernetes_core.go` (both comment-only — the latter inherits the stale `aksBackend` doc comment moved verbatim from the deleted file, which Task 2 corrects); `platform_kubernetes_kind.go` no longer appears (file deleted); under "google.golang.org/api" the gke real-import file is now `module/platform_kubernetes_gke.go`. + +**Step 8: Commit** + +```bash +git add module/platform_kubernetes_core.go module/platform_kubernetes_gke.go +git rm module/platform_kubernetes_kind.go +git commit -m "refactor(module): split platform_kubernetes_kind.go into _core + _gke + +Phase 0 precursor for cloud-SDK extraction. kindBackend/eksErrorBackend/ +aksBackend (all SDK-free) move to platform_kubernetes_core.go with a core +init(); gkeBackend (the only SDK-bearing k8s backend) moves to +platform_kubernetes_gke.go with its own init(). Behavior-equivalent: same +five backend names registered. Isolates the lone SDK-bearing platform +file for a later clean deletion." +``` + +Rollback: `git revert` — pure code movement, no behavior diff, no contract, no go.mod change. 
+ +--- + +### Task 2: Fix the stale Azure-SDK doc comment + +**Files:** +- Modify: `module/platform_kubernetes_core.go` (the comment moved here with `aksBackend` in Task 1) + +**Step 1: Locate the stale comment** + +Run: `grep -n 'Requires the Azure SDK' module/platform_kubernetes_core.go` +Expected: one match — a doc comment above `aksBackend` reading approximately `// Requires the Azure SDK (github.com/Azure/azure-sdk-for-go) to be available.` + +**Step 2: Correct the comment** + +`aksBackend.azureToken` is a plain `net/http` OAuth2 client-credentials POST against `login.microsoftonline.com` — it does **not** import the Azure SDK. Replace the stale line with an accurate one that does not spell out the SDK import path (the audit script greps for that path, so naming it would keep the file listed as a comment-only match), e.g.: + +```go +// aksBackend provisions AKS clusters via the Azure Resource Manager REST API. +// It authenticates with a net/http OAuth2 client-credentials flow against +// login.microsoftonline.com — it has no Azure SDK dependency. +``` + +**Step 3: Verify the audit script no longer flags the file as a comment-only Azure match** + +Run: `bash scripts/audit-cloud-symbols.sh | grep -A6 'azure-sdk-for-go'` +Expected: `module/platform_kubernetes_core.go` does **not** appear in the azure-sdk section at all (the SDK import path is no longer even mentioned in the file). + +**Step 4: Build** + +Run: `go build ./module/...` +Expected: exit 0 (comment-only change). + +**Step 5: Commit** + +```bash +git add module/platform_kubernetes_core.go +git commit -m "docs(module): fix stale 'Requires the Azure SDK' comment on aksBackend + +aksBackend.azureToken is a net/http OAuth2 client, not an azure-sdk +consumer. The stale comment is what fooled an earlier inventory pass into +mis-counting platform_kubernetes_kind.go as an azure-sdk importer." +``` + +Rollback: `git revert` — comment-only. 
+ +--- + +### Task 3: Extend `audit-cloud-symbols.sh` with the `init()`-partition assertion + wire it into CI + +**Files:** +- Modify: `scripts/audit-cloud-symbols.sh` +- Modify: `.github/workflows/ci.yml` (or the repo's primary CI workflow — confirm the exact filename with `ls .github/workflows/`) +- Test: `scripts/audit-cloud-symbols.sh --check` (the script self-verifies) + +**Step 1: Identify the CI workflow file** + +Run: `ls .github/workflows/` +Expected: a primary build/test workflow (e.g. `ci.yml`, `test.yml`, `go.yml`). Note its name for Step 4. + +**Step 2: Add the `init()`-partition assertion to the script** + +In `scripts/audit-cloud-symbols.sh`, extend the `--check` path so it fails if any post-Phase-0 file registers both a core-staying and a plugin-bound Kubernetes backend in one `init()`. Add, after the existing `platform_kubernetes_kind.go` advisory block (which becomes moot once the file is gone — guard it with a file-existence check): + +```bash +echo +echo "== Invariant: no init() mixes core-staying + plugin-bound k8s backends ==" +# Post-Phase-0, platform_kubernetes_core.go must register ONLY kind/k3s/eks/aks +# and platform_kubernetes_gke.go must register ONLY gke. A file registering a +# name from the other set is a partition violation. 
+CORE_K8S=module/platform_kubernetes_core.go +GKE_K8S=module/platform_kubernetes_gke.go +if [[ -f "$CORE_K8S" && -f "$GKE_K8S" ]]; then + if grep -qE 'RegisterKubernetesBackend\("gke"' "$CORE_K8S"; then + echo " VIOLATION: $CORE_K8S registers the plugin-bound 'gke' backend"; FAIL=1 + fi + for n in kind k3s eks aks; do + if grep -qE "RegisterKubernetesBackend\\(\"$n\"" "$GKE_K8S"; then + echo " VIOLATION: $GKE_K8S registers the core-staying '$n' backend"; FAIL=1 + fi + done + [[ $FAIL -eq 0 ]] && echo " OK — init() partition clean" +fi +``` + +Also guard the existing `platform_kubernetes_kind.go` advisory block with `[[ -f module/platform_kubernetes_kind.go ]]` so it silently skips post-Phase-0 (the file is gone). + +**Step 3: Run the script's check mode locally** + +Run: `bash scripts/audit-cloud-symbols.sh --check` +Expected: prints the real-import map, the new "init() partition clean" line shows `OK`, final line `audit-cloud-symbols: OK`, exit 0. + +**Step 4: Wire it into CI** + +Add a step to the CI workflow identified in Step 1, in the existing build/test job, after checkout + Go setup: + +```yaml + - name: Cloud-SDK inventory + partition audit + run: bash scripts/audit-cloud-symbols.sh --check +``` + +**Step 5: Verify the workflow YAML is valid** + +Run: `bash -n scripts/audit-cloud-symbols.sh` (script syntax) and visually confirm the YAML indentation matches the surrounding steps in the workflow file. +Expected: script syntax OK; YAML step nested at the same level as sibling steps. + +**Step 6: Commit** + +```bash +git add scripts/audit-cloud-symbols.sh .github/workflows/ +git commit -m "ci(audit): enforce k8s-backend init() partition + run audit on every PR + +Extends audit-cloud-symbols.sh --check with an init()-partition assertion +(platform_kubernetes_core.go registers only kind/k3s/eks/aks; _gke.go only +gke) and wires the script into CI so the cloud-SDK inventory becomes a +build-enforced artifact rather than a prose claim." 
+``` + +Rollback: `git revert` — CI-config + script change; reverting restores the prior (report-only) script and drops the CI step. Re-run `bash scripts/audit-cloud-symbols.sh` to confirm report-only mode after revert. + +--- + +## PR 2 — Phase A: `IaCStateBackend` proto + benchmark harness + proto lock + +Defines the new strict contract and validates the unary-transport decision with a real 1 MB-state benchmark *before* the proto is locked (design self-challenge doubt #3). + +### Task 4: Add the `IaCStateBackend` service + messages to `iac.proto` + +**Files:** +- Modify: `plugin/external/proto/iac.proto` +- Create (generated): `plugin/external/proto/iac.pb.go`, `plugin/external/proto/iac_grpc.pb.go` (regenerated by `buf`) +- Test: `plugin/external/proto/iac_statebackend_test.go` (a compile-level test that the generated Go types exist with the expected shape) + +**Step 1: Write the failing test** + +Create `plugin/external/proto/iac_statebackend_test.go`: + +```go +package proto + +import "testing" + +// Compile-level guard: the IaCStateBackend service + its messages must exist +// in the generated package with the IaCStateStore-mirroring shape. +func TestIaCStateBackendGeneratedTypesExist(t *testing.T) { + var _ IaCStateBackendServer // service interface generated + var _ IaCStateBackendClient // client interface generated + _ = &GetStateRequest{ResourceId: "r"} + _ = &GetStateResponse{Exists: true, State: &IaCState{}} + _ = &SaveStateRequest{State: &IaCState{}} + _ = &ListStatesRequest{Filter: map[string]string{"k": "v"}} + _ = &LockRequest{ResourceId: "r"} + _ = &UnlockRequest{ResourceId: "r"} + // IaCState mirrors module.IaCState's JSON-serialisable fields. 
+ s := &IaCState{ResourceId: "r", ResourceType: "kubernetes", Provider: "azure", Status: "active"} + if s.GetResourceId() != "r" { + t.Fatalf("IaCState.ResourceId accessor missing") + } +} +``` + +**Step 2: Run the test to verify it fails** + +Run: `go test ./plugin/external/proto/ -run TestIaCStateBackendGeneratedTypesExist -v` +Expected: FAIL — build error, `IaCStateBackendServer` / `GetStateRequest` etc. undefined. + +**Step 3: Add the service + messages to `iac.proto`** + +Append to `plugin/external/proto/iac.proto` (after the `ResourceDriver` service, before EOF). Mirror `module.IaCState` field-for-field (see `module/iac_state.go:4-18`): + +```proto +// IaCStateBackend — strict contract for IaC state storage backends served by a +// plugin sidecar. Maps 1:1 onto module.IaCStateStore (6 methods). Unary RPCs: +// the PR 2 benchmark validated unary transport for 1 MB state blobs against the +// in-process baseline. No lock-lease/TTL field — added additively only once a +// plugin backend implements honored expiry with a conformance test. +service IaCStateBackend { + rpc GetState (GetStateRequest) returns (GetStateResponse); + rpc SaveState (SaveStateRequest) returns (SaveStateResponse); + rpc ListStates (ListStatesRequest) returns (ListStatesResponse); + rpc DeleteState(DeleteStateRequest) returns (DeleteStateResponse); + rpc Lock (LockRequest) returns (LockResponse); + rpc Unlock (UnlockRequest) returns (UnlockResponse); +} + +// IaCState mirrors module.IaCState (module/iac_state.go). Maps used there +// (Outputs, Config) carry arbitrary JSON — represented here as google.protobuf.Struct. 
+message IaCState { + string resource_id = 1; + string resource_type = 2; + string provider = 3; + string provider_ref = 4; + string provider_id = 5; + string config_hash = 6; + string status = 7; + google.protobuf.Struct outputs = 8; + google.protobuf.Struct config = 9; + repeated string dependencies = 10; + string created_at = 11; + string updated_at = 12; + string error = 13; +} + +message GetStateRequest { string resource_id = 1; } +message GetStateResponse { IaCState state = 1; bool exists = 2; } +message SaveStateRequest { IaCState state = 1; } // idempotent: full-state replace, last-writer-wins +message SaveStateResponse {} +message ListStatesRequest { map<string, string> filter = 1; } +message ListStatesResponse { repeated IaCState states = 1; } +message DeleteStateRequest { string resource_id = 1; } +message DeleteStateResponse {} +message LockRequest { string resource_id = 1; } +message LockResponse {} +message UnlockRequest { string resource_id = 1; } +message UnlockResponse {} +``` + +Confirm `iac.proto` already imports `google/protobuf/struct.proto` (it uses `Struct` elsewhere); if not, add `import "google/protobuf/struct.proto";` near the top with the other imports. + +**Step 4: Regenerate the Go bindings** + +Run: `cd plugin/external/proto && buf generate` (per `plugin/external/proto/README.md`) +Expected: `iac.pb.go` + `iac_grpc.pb.go` regenerated, now containing `IaCStateBackendServer`, `IaCStateBackendClient`, and the message types. `git diff --stat` shows only the two `*.pb.go` files changed plus `iac.proto`. + +**Step 5: Run the test to verify it passes** + +Run: `go test ./plugin/external/proto/ -run TestIaCStateBackendGeneratedTypesExist -v` +Expected: PASS. + +**Step 6: Full build** + +Run: `go build ./...` +Expected: exit 0. 
+ +**Step 7: Commit** + +```bash +git add plugin/external/proto/iac.proto plugin/external/proto/iac.pb.go plugin/external/proto/iac_grpc.pb.go plugin/external/proto/iac_statebackend_test.go +git commit -m "feat(proto): add IaCStateBackend service to iac.proto + +Strict 6-method contract mirroring module.IaCStateStore 1:1, with an +IaCState message mirroring module.IaCState. Unary RPCs. No TTL field +(additive follow-up, gated on a backend honoring expiry). Regenerated +bindings via buf." +``` + +Rollback: `git revert` — proto + generated code only, no runtime wiring yet; reverting leaves core building exactly as before. + +--- + +### Task 5: Build the `IaCStateBackend` round-trip benchmark harness + +**Files:** +- Create: `module/benchmark_iac_state_backend_test.go` +- Test: itself (a `Benchmark*` function) + +**Step 1: Write the benchmark** + +Create `module/benchmark_iac_state_backend_test.go`. It drives a synthetic ~1 MB `IaCState` through a full `Lock → GetState → SaveState → Unlock` cycle two ways: (a) directly against an in-process `IaCStateStore` (the `memory` backend — the baseline this design replaces), (b) against the same store wrapped behind a real in-memory gRPC `IaCStateBackend` server+client pair (the post-extraction path). It reports `b.ReportMetric` for added latency. + +```go +package module + +import ( + "context" + "net" + "strings" + "testing" + + pb "github.com/GoCodeAlone/workflow/plugin/external/proto" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/test/bufconn" +) + +// oneMBState builds an IaCState whose JSON payload is ~1 MB (Outputs map padded). 
+func oneMBState() *IaCState { + big := strings.Repeat("x", 1024) + outputs := make(map[string]any, 1024) + for i := 0; i < 1024; i++ { + outputs["k"+strings.Repeat("0", 3)+itoa(i)] = big + } + return &IaCState{ + ResourceID: "bench-resource", ResourceType: "kubernetes", Provider: "azure", + Status: "active", Outputs: outputs, Config: map[string]any{"size": "large"}, + CreatedAt: "2026-05-14T00:00:00Z", UpdatedAt: "2026-05-14T00:00:00Z", + } +} + +func itoa(i int) string { // tiny local helper; avoid strconv import noise in bench file + if i == 0 { + return "0" + } + var b []byte + for i > 0 { + b = append([]byte{byte('0' + i%10)}, b...) + i /= 10 + } + return string(b) +} + +// BenchmarkIaCStateBackend_InProcess is the baseline: direct IaCStateStore calls. +func BenchmarkIaCStateBackend_InProcess(b *testing.B) { + store := NewMemoryIaCStateStore() + st := oneMBState() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if err := store.Lock(st.ResourceID); err != nil { + b.Fatal(err) + } + if _, err := store.GetState(st.ResourceID); err != nil { + b.Fatal(err) + } + if err := store.SaveState(st); err != nil { + b.Fatal(err) + } + if err := store.Unlock(st.ResourceID); err != nil { + b.Fatal(err) + } + } +} + +// BenchmarkIaCStateBackend_GRPC is the post-extraction path: same store, same +// cycle, but every call crosses a real (in-memory bufconn) gRPC boundary. 
+func BenchmarkIaCStateBackend_GRPC(b *testing.B) { + lis := bufconn.Listen(4 << 20) // 4 MiB — gRPC default message cap + srv := grpc.NewServer() + pb.RegisterIaCStateBackendServer(srv, newBenchStateBackendServer(NewMemoryIaCStateStore())) + go func() { _ = srv.Serve(lis) }() + defer srv.Stop() + + conn, err := grpc.NewClient("passthrough:///bufnet", + grpc.WithContextDialer(func(ctx context.Context, _ string) (net.Conn, error) { return lis.DialContext(ctx) }), + grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + b.Fatal(err) + } + defer conn.Close() + client := pb.NewIaCStateBackendClient(conn) + st := oneMBState() + pbState := iacStateToProto(st) // helper introduced in Task 7; for the bench, inline a minimal conversion + ctx := context.Background() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := client.Lock(ctx, &pb.LockRequest{ResourceId: st.ResourceID}); err != nil { + b.Fatal(err) + } + if _, err := client.GetState(ctx, &pb.GetStateRequest{ResourceId: st.ResourceID}); err != nil { + b.Fatal(err) + } + if _, err := client.SaveState(ctx, &pb.SaveStateRequest{State: pbState}); err != nil { + b.Fatal(err) + } + if _, err := client.Unlock(ctx, &pb.UnlockRequest{ResourceId: st.ResourceID}); err != nil { + b.Fatal(err) + } + } +} +``` + +> **Note for the executor:** `newBenchStateBackendServer` and `iacStateToProto` do not exist yet — Task 5 introduces a minimal local `newBenchStateBackendServer` in this same `_test.go` file (a thin adapter wrapping an `IaCStateStore` behind the generated `IaCStateBackendServer` interface, plus inline proto⇄struct conversion). Task 7 promotes the *production* converters into non-test code; the benchmark file may then be simplified to reuse them. Keep the bench file self-contained for this task. 
+ +Add to the same file a minimal `newBenchStateBackendServer` implementing `pb.IaCStateBackendServer` by delegating to an `IaCStateStore`, with inline `IaCState`⇄`pb.IaCState` conversion (use `structpb.NewStruct` for the `Outputs`/`Config` maps). + +**Step 2: Run the benchmark to verify it builds + runs** + +Run: `go test ./module/ -bench BenchmarkIaCStateBackend -benchmem -run '^$' -count=6 | tee /tmp/iac-state-bench.txt` +Expected: both `BenchmarkIaCStateBackend_InProcess` and `_GRPC` run and report ns/op + B/op. (No assertion yet — Task 6 evaluates the numbers.) + +**Step 3: Commit** + +```bash +git add module/benchmark_iac_state_backend_test.go +git commit -m "test(module): add IaCStateBackend gRPC-vs-in-process benchmark harness + +Drives a ~1 MB synthetic IaCState through Lock/GetState/SaveState/Unlock +both in-process (baseline) and over a real bufconn gRPC boundary +(post-extraction path). Feeds the proto-transport decision in the next +task." +``` + +Rollback: `git revert` — test-only file. + +--- + +### Task 6: Run the benchmark, record the result, lock the proto-transport decision + +**Files:** +- Create: `docs/plans/2026-05-14-iac-state-backend-benchmark.md` (the recorded result + decision) +- Modify: `plugin/external/proto/iac.proto` (only if the benchmark forces a streaming redesign — expected: no change) + +**Step 1: Run the benchmark with statistical rigor** + +Run: `go test ./module/ -bench BenchmarkIaCStateBackend -benchmem -run '^$' -count=10 | tee /tmp/iac-state-bench.txt` +Expected: 10 samples each for `_InProcess` and `_GRPC`. + +**Step 2: Compute the added latency** + +Run: `go run golang.org/x/perf/cmd/benchstat /tmp/iac-state-bench.txt` (or `benchstat` if already on PATH per the Makefile's `bench-compare` target) +Expected: a side-by-side of `_InProcess` vs `_GRPC` ns/op with variance. 
+ +**Step 3: Evaluate against the acceptance bar** + +Acceptance bar (set here, per design open-item "concrete acceptance threshold"): **unary transport is accepted if the gRPC path's p50 added latency for the full 4-call cycle is < 5 ms over the in-process baseline.** Rationale: an IaC plan/apply does one Lock/Get/Save/Unlock cycle per resource batch; sub-5 ms per cycle is negligible against real cloud-provider API latency (hundreds of ms). +- **If the bar is met** (expected — bufconn gRPC round-trips are tens of µs): the unary proto from Task 4 is **locked as-is**. No proto change. +- **If the bar is NOT met:** do NOT proceed. The proto needs a streaming redesign for `GetState`/`SaveState` — revise Task 4's proto, regenerate, re-run this task. This is the design's self-challenge doubt #3 gate. + +**Step 4: Record the result + decision** + +Create `docs/plans/2026-05-14-iac-state-backend-benchmark.md` with: the raw benchstat output, the computed p50 added latency, the 5 ms bar, and the verdict (`unary LOCKED` or `streaming required — proto revised`). This file is the durable evidence the design's "benchmark before proto lock" gate was honored. + +**Step 5: Commit** + +```bash +git add docs/plans/2026-05-14-iac-state-backend-benchmark.md +git commit -m "docs(plans): record IaCStateBackend transport benchmark — unary locked + +Benchmark result: gRPC bufconn round-trip adds <N> µs p50 over the +in-process baseline for the full 1 MB-state Lock/Get/Save/Unlock cycle, +under the 5 ms acceptance bar. Unary IaCStateBackend proto locked; no +streaming redesign needed." +``` + +(Replace `<N>` with the measured value from Step 4. If streaming was required, the commit also includes the revised `iac.proto` + regenerated bindings and the message reflects that.) + +Rollback: `git revert` — documentation; if a proto revision was included, reverting also reverts that (back to the Task 4 unary shape). 
+ +--- + +## PR 3 — Phase A: host-side `IaCStateBackend` resolution + secret-redaction + gRPC-logging guard + +Wires the engine so `iac.state` can dispatch to a plugin-served backend, and lands the two blocking security tasks from the design's Security section. + +### Task 7: Production `IaCState` ⇄ `pb.IaCState` converters + an `IaCStateStore` gRPC client adapter + +**Files:** +- Create: `module/iac_state_grpc_client.go` +- Test: `module/iac_state_grpc_client_test.go` + +**Step 1: Write the failing test** + +Create `module/iac_state_grpc_client_test.go`. It stands up an in-memory `IaCStateBackend` server (delegating to `NewMemoryIaCStateStore()`), wraps the client end in the new `grpcIaCStateStore` adapter, and asserts the adapter satisfies `IaCStateStore` and round-trips a state correctly: + +```go +package module + +import ( + "context" + "net" + "testing" + + pb "github.com/GoCodeAlone/workflow/plugin/external/proto" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/test/bufconn" +) + +func TestGRPCIaCStateStoreRoundTrip(t *testing.T) { + lis := bufconn.Listen(4 << 20) + srv := grpc.NewServer() + pb.RegisterIaCStateBackendServer(srv, newBenchStateBackendServer(NewMemoryIaCStateStore())) + go func() { _ = srv.Serve(lis) }() + defer srv.Stop() + + conn, err := grpc.NewClient("passthrough:///bufnet", + grpc.WithContextDialer(func(ctx context.Context, _ string) (net.Conn, error) { return lis.DialContext(ctx) }), + grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + t.Fatal(err) + } + defer conn.Close() + + var store IaCStateStore = newGRPCIaCStateStore(pb.NewIaCStateBackendClient(conn)) + + want := &IaCState{ResourceID: "r1", ResourceType: "kubernetes", Provider: "azure", Status: "active", + Outputs: map[string]any{"endpoint": "https://x"}, Config: map[string]any{"size": "L"}} + if err := store.SaveState(want); err != nil { + t.Fatalf("SaveState: %v", err) + } + got, err := 
store.GetState("r1") + if err != nil || got == nil { + t.Fatalf("GetState: %v (got=%v)", err, got) + } + if got.ResourceID != "r1" || got.Status != "active" || got.Outputs["endpoint"] != "https://x" { + t.Fatalf("round-trip mismatch: %+v", got) + } + if err := store.Lock("r1"); err != nil { + t.Fatalf("Lock: %v", err) + } + missing, err := store.GetState("nope") + if err != nil || missing != nil { + t.Fatalf("GetState(missing) should be nil,nil — got %v,%v", missing, err) + } + if err := store.Unlock("r1"); err != nil { + t.Fatalf("Unlock: %v", err) + } +} +``` + +**Step 2: Run the test to verify it fails** + +Run: `go test ./module/ -run TestGRPCIaCStateStoreRoundTrip -v` +Expected: FAIL — `newGRPCIaCStateStore` undefined. + +**Step 3: Implement the converters + adapter** + +Create `module/iac_state_grpc_client.go` with: `iacStateToProto(*IaCState) (*pb.IaCState, error)` and `iacStateFromProto(*pb.IaCState) (*IaCState, error)` (using `structpb.NewStruct` / `.AsMap()` for the `Outputs`/`Config` maps), and `grpcIaCStateStore` — a struct holding a `pb.IaCStateBackendClient` that implements all six `IaCStateStore` methods by delegating over gRPC. `GetState` maps a `GetStateResponse{Exists:false}` to `(nil, nil)` per the interface contract ("Returns nil, nil when not found"). Constructor: `newGRPCIaCStateStore(c pb.IaCStateBackendClient) *grpcIaCStateStore`. Use `context.Background()` for now (a context-plumbing follow-up can thread a real ctx later — out of scope here). + +Also: move the `newBenchStateBackendServer` helper out of the Task 5 `_test.go` file into this file as `iacStateBackendServer` (a production type — core itself does not use the *server* half anywhere yet, but the Azure plugin's Task 11 needs the exact same delegation shape; keeping one canonical copy avoids drift). Update **both** `module/benchmark_iac_state_backend_test.go` and the Step 1 test `module/iac_state_grpc_client_test.go` (which also constructs its server via `newBenchStateBackendServer`) to use the promoted `iacStateToProto` + `iacStateBackendServer`, and delete the benchmark file's inline copies. Note that the promoted `iacStateToProto` returns `(*pb.IaCState, error)`, so the benchmark's single-value call site `pbState := iacStateToProto(st)` must be changed to check the returned error. 
+ +**Step 4: Run the test to verify it passes** + +Run: `go test ./module/ -run TestGRPCIaCStateStoreRoundTrip -v` +Expected: PASS. + +**Step 5: Re-run the benchmark to confirm the refactor didn't break it** + +Run: `go test ./module/ -bench BenchmarkIaCStateBackend -benchmem -run '^$' -count=1` +Expected: both benchmarks still run cleanly. + +**Step 6: Commit** + +```bash +git add module/iac_state_grpc_client.go module/iac_state_grpc_client_test.go module/benchmark_iac_state_backend_test.go +git commit -m "feat(module): IaCState proto converters + grpcIaCStateStore client adapter + +grpcIaCStateStore implements module.IaCStateStore over an +IaCStateBackendClient — the host-side half of the new contract. Promotes +the proto<->struct converters and the delegating server shape out of the +benchmark test file so the plugin side (Phase A PR4) reuses one canonical +copy." +``` + +Rollback: `git revert` — new file + test; no engine wiring yet, core builds unchanged. + +--- + +### Task 8: Engine-side plugin backend registry — resolve `iac.state` backends from loaded plugins + +**Files:** +- Modify: `module/iac_module.go` +- Create: `module/iac_state_plugin_registry.go` +- Test: `module/iac_state_plugin_registry_test.go` + +**Step 1: Spike — confirm the engine's external-plugin-manager handle (≤15 min, no code)** + +Read `engine.go` (`BuildFromConfig`) and `plugin/external/` to confirm how the engine loads external plugins at startup and where a module's `Init(app modular.Application)` can reach the set of loaded `ExternalPluginAdapter`s. The deploy path (`cmd/wfctl/deploy_providers.go`) scans `./data/plugins` directly; the engine likely already has an `ExternalPluginManager` in `BuildFromConfig`. Record the handle path in a one-paragraph comment at the top of `iac_state_plugin_registry.go`. 
**If the engine has no such handle reachable from module Init**, the fallback (design Architecture §1) is a package-level registry that the engine populates at plugin-load time — implement that instead. Pick whichever the spike confirms; both satisfy the design. + +**Step 2: Write the failing test** + +Create `module/iac_state_plugin_registry_test.go`. Test the registry: registering a backend name → `pb.IaCStateBackendClient` factory, and looking it up. Use a fake client. Assert: an unknown backend name returns `(nil, false)`; a registered name returns the client; registering a **reserved** name (`memory`/`filesystem`/`postgres`) returns an error (design Failure-modes "reserved-name collision"). + +```go +package module + +import "testing" + +func TestIaCStateBackendRegistry(t *testing.T) { + reg := newIaCStateBackendRegistry() + if _, ok := reg.resolve("azure_blob"); ok { + t.Fatal("empty registry should not resolve azure_blob") + } + fake := &fakeStateBackendClient{} + if err := reg.register("azure_blob", fake); err != nil { + t.Fatalf("register: %v", err) + } + got, ok := reg.resolve("azure_blob") + if !ok || got != fake { + t.Fatalf("resolve azure_blob: ok=%v got=%v", ok, got) + } + for _, reserved := range []string{"memory", "filesystem", "postgres"} { + if err := reg.register(reserved, fake); err == nil { + t.Fatalf("register(%q) must fail — reserved core backend name", reserved) + } + } +} +``` + +(Define a minimal `fakeStateBackendClient` satisfying `pb.IaCStateBackendClient` in the test file.) + +**Step 3: Run the test to verify it fails** + +Run: `go test ./module/ -run TestIaCStateBackendRegistry -v` +Expected: FAIL — `newIaCStateBackendRegistry` undefined. + +**Step 4: Implement the registry** + +Create `module/iac_state_plugin_registry.go`: an `iacStateBackendRegistry` struct wrapping a `map[string]pb.IaCStateBackendClient` + a mutex. 
`register(name, client)` rejects the reserved names `memory`/`filesystem`/`postgres` with a clear error (`"plugin registered reserved iac.state backend name %q"`). `resolve(name)` returns `(client, ok)`. Provide a package-level default registry instance the engine populates at plugin-load time, plus `newIaCStateBackendRegistry()` for tests. + +**Step 5: Run the test to verify it passes** + +Run: `go test ./module/ -run TestIaCStateBackendRegistry -v` +Expected: PASS. + +**Step 6: Wire `IaCModule.Init()` to consult the registry** + +Modify `module/iac_module.go` `Init()`: in the backend `switch`, for any backend name **not** in the core set (`memory`/`filesystem`/`postgres` — and, until later phases, still `spaces`/`gcs`/`azure_blob` keep their in-process cases for now), add a `default:` arm that consults the plugin registry: if `reg.resolve(m.backend)` succeeds, `m.store = newGRPCIaCStateStore(client)`; if not, return the existing `"unsupported backend"` error **extended** with `" (or load the plugin that provides it)"`. Crucially: the `default` arm must run *before* the final error return. The in-process `azure_blob` case stays untouched in this PR — PR 5 deletes it. The point of this task is the *plumbing* exists and is tested; PR 5 flips `azure_blob` to use it. + +Add a focused test in `iac_state_plugin_registry_test.go` constructing an `IaCModule` with `backend: "azure_blob_test_only"`, a registry pre-populated with a fake client for that name, and asserting `Init()` sets `m.store` to a `*grpcIaCStateStore`. + +**Step 7: Build + test** + +Run: `go build ./... && go test ./module/ -run 'IaCStateBackend|IaCModule' -v` +Expected: exit 0, PASS. 
+ +**Step 8: Commit** + +```bash +git add module/iac_module.go module/iac_state_plugin_registry.go module/iac_state_plugin_registry_test.go +git commit -m "feat(module): engine-side iac.state plugin-backend registry + dispatch + +IaCModule.Init() now resolves non-core backend names from a registry the +engine populates at plugin-load time, constructing a grpcIaCStateStore +client. Reserved core names (memory/filesystem/postgres) are rejected at +registration. The in-process azure_blob case is untouched here — the +plumbing exists and is tested; Phase A PR5 flips azure_blob onto it." +``` + +Rollback: `git revert` — the registry is additive and the `azure_blob` in-process path is unchanged, so reverting leaves `iac.state` working exactly as before. Rollback note: revert commit + `go test ./module/...` to confirm in-process backends still construct. + +--- + +### Task 9: Extend secret redaction to recognise `credentials:` / `credentials_ref:` keys + +**Files:** +- Modify: `module/step_output_redactor.go` +- Test: `module/step_output_redactor_test.go` (existing — add cases) + +**Step 1: Write the failing test** + +Add to `module/step_output_redactor_test.go`: + +```go +func TestRedactCredentialsBlock(t *testing.T) { + in := map[string]any{ + "credentials": map[string]any{ + "accessKey": "AKIAEXAMPLE", + "secretKey": "supersecret", + }, + "credentials_ref": "aws-creds-module", + "bucket": "public-bucket-name", + } + out := RedactStepOutput(in) + creds := out["credentials"].(map[string]any) + if creds["accessKey"] != RedactionPlaceholder || creds["secretKey"] != RedactionPlaceholder { + t.Fatalf("credentials block not redacted: %+v", creds) + } + // credentials_ref is a module NAME, not a secret — must NOT be redacted. 
+ if out["credentials_ref"] != "aws-creds-module" { + t.Fatalf("credentials_ref should not be redacted (it is a module reference)") + } + if out["bucket"] != "public-bucket-name" { + t.Fatalf("non-sensitive field wrongly redacted") + } +} +``` + +**Step 2: Run the test to verify it fails** + +Run: `go test ./module/ -run TestRedactCredentialsBlock -v` +Expected: FAIL — `accessKey` already matches `access_key`? Check: `SensitiveFieldPatterns` has `access_key` (underscore) but the key here is `accessKey` (camelCase). The existing substring match is case-insensitive but `accessKey` does not contain `access_key`. So `accessKey`/`secretKey` are **not** currently redacted → test fails. + +**Step 3: Implement** + +In `module/step_output_redactor.go`, add to `SensitiveFieldPatterns` the camelCase / bare forms that a `credentials:` block uses: `"accesskey"`, `"secretkey"`, `"sessiontoken"`, `"account_key"`, `"accountkey"`, `"clientsecret"`, `"client_secret"`. Because matching is case-insensitive substring, `"accesskey"` matches `accessKey`. Also ensure a key literally named `credentials` whose value is a map gets its children recursively redacted — the existing `redactMap` recursion already covers nested maps, so adding the leaf patterns is sufficient. Do **not** add `credentials_ref` to the patterns — it is a module reference, not a secret (the test guards this). + +**Step 4: Run the test to verify it passes** + +Run: `go test ./module/ -run 'Redact' -v` +Expected: PASS (the new test + all existing redaction tests still green). + +**Step 5: Commit** + +```bash +git add module/step_output_redactor.go module/step_output_redactor_test.go +git commit -m "feat(module): redact inline credentials: block keys (accessKey/secretKey/etc.) + +Option-1 credentials move raw cloud secrets inline into plugin-native +module config. Extends SensitiveFieldPatterns with the camelCase forms a +credentials: block uses so the config-version store + execution tracing +redact them. 
credentials_ref: (a module reference, not a secret) is +deliberately left un-redacted." +``` + +Rollback: `git revert` — redaction is additive; reverting only narrows what's redacted (no functional break, but re-widening is the forward fix). + +--- + +### Task 10: gRPC-interceptor guard test — assert no interceptor logs `CreateModule` bodies + +**Files:** +- Create: `plugin/external/grpc_logging_guard_test.go` +- Test: itself + +**Step 1: Write the guard test** + +The design's Security section verified at design time that `plugin/external/` adds no body-logging gRPC interceptor (only `callback_server.go:85,118` logs, and neither touches module config). This test makes that a permanent CI guard: if a future change adds an interceptor that could log `CreateModule` request bodies (which carry `credentials:` blocks), CI fails. + +Create `plugin/external/grpc_logging_guard_test.go`: + +```go +package external + +import ( + "os" + "regexp" + "testing" +) + +// The plugin SDK must NOT install a gRPC interceptor that logs request bodies — +// CreateModule requests carry inline credentials: blocks. This test fails if +// grpc.NewServer / grpc.NewClient anywhere in plugin/external/ is constructed +// with a *UnaryInterceptor option, forcing a reviewer to look. See the +// cloud-sdk-extraction design, Security section. +func TestNoBodyLoggingInterceptor(t *testing.T) { + interceptorOpt := regexp.MustCompile(`(Chain)?Unary(Server|Client)?Interceptor`) + entries, err := os.ReadDir(".") + if err != nil { + t.Fatal(err) + } + for _, e := range entries { + name := e.Name() + if e.IsDir() || !match(name, ".go") || match(name, "_test.go") { + continue + } + b, err := os.ReadFile(name) + if err != nil { + t.Fatal(err) + } + if interceptorOpt.Match(b) { + t.Fatalf("%s references a gRPC interceptor option — if it logs request "+ + "bodies it can leak inline credentials: blocks. 
Audit it and, if safe, "+ + "add an explicit allowlist entry to this test.", name) + } + } +} + +func match(s, suffix string) bool { return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix } +``` + +**Step 2: Run the test** + +Run: `go test ./plugin/external/ -run TestNoBodyLoggingInterceptor -v` +Expected: PASS (no interceptor exists today — design verified this). + +**Step 3: Commit** + +```bash +git add plugin/external/grpc_logging_guard_test.go +git commit -m "test(plugin/external): guard against gRPC body-logging interceptors + +CreateModule requests carry inline credentials: blocks. This guard fails +CI if any plugin/external/ file gains a gRPC interceptor option, forcing +a reviewer to confirm it cannot log request bodies. Implements the +cloud-sdk-extraction design's Security guard-test requirement." +``` + +Rollback: `git revert` — test-only. + +--- + +## PR 4 — Phase A: `workflow-plugin-azure` implements `azure_blob` `IaCStateBackend` (cross-repo) + +**Repository:** `/Users/jon/workspace/workflow-plugin-azure` (NOT the workflow worktree). Branch: `feat/azure-blob-state-backend`. This PR depends on PR 2 (published proto) and is a prerequisite for PR 5. 
+ +### Task 11: Port `AzureBlobIaCStateStore` into workflow-plugin-azure + serve it as `IaCStateBackend` + +**Files (in `/Users/jon/workspace/workflow-plugin-azure`):** +- Create: `internal/statebackend/azure_blob.go` (the ported store — copy from workflow's `module/iac_state_azure.go`) +- Create: `internal/statebackend/server.go` (the `IaCStateBackendServer` gRPC impl delegating to the store) +- Modify: the plugin's main entrypoint + `plugin.json` to advertise the `azure_blob` `IaCStateBackend` +- Test: `internal/statebackend/azure_blob_test.go` (port the existing tests from workflow's `module/iac_state_azure_test.go` if present; otherwise test against the `AzureBlobClient` interface with a fake) + +**Step 1: Inspect the current plugin structure** + +Run: `ls -R /Users/jon/workspace/workflow-plugin-azure/{cmd,internal,provider,drivers} 2>/dev/null; cat /Users/jon/workspace/workflow-plugin-azure/plugin.json` +Expected: understand where `sdk.ServeIaCPlugin` is called and how `plugin.json` declares capabilities. + +**Step 2: Port the store** + +Copy `module/iac_state_azure.go` from the workflow worktree into `internal/statebackend/azure_blob.go` in the plugin repo. It already carries its own `AzureBlobClient` interface + `azureRealClient` (azblob-backed) impl — it is self-contained. Adjust the package name. The plugin repo *gains* the `Azure/azure-sdk-for-go/sdk/storage/azblob` dependency (it likely already has it for its IaC resource-provider role — confirm with `grep azblob go.mod`). + +**Step 3: Port the tests, run them** + +Copy `module/iac_state_azure_test.go` (if it exists in the worktree) into `internal/statebackend/azure_blob_test.go`. Run: `go test ./internal/statebackend/ -v` +Expected: PASS — the store's logic is unchanged, only its home moved. 
+ +**Step 4: Write the `IaCStateBackendServer` impl** + +Create `internal/statebackend/server.go` implementing `proto.IaCStateBackendServer` (from `github.com/GoCodeAlone/workflow/plugin/external/proto`) by delegating each RPC to an `AzureBlobIaCStateStore`, with the same `IaCState`⇄`pb.IaCState` conversion shape promoted in workflow Task 7. Mirror that converter exactly (the plugin imports the same `proto` package, so the wire types are identical). + +**Step 5: Wire it into the plugin's serve path + manifest** + +Register the `IaCStateBackend` service on the plugin's gRPC server alongside its existing `IaCProviderRequired` service, and add `azure_blob` to the plugin's advertised state-backend capabilities in `plugin.json` (mirror how the existing `iacProvider` capability is declared — the host's registry-population step in workflow Task 8 reads this). + +**Step 6: Build + load-test the plugin** + +Run: `go build ./... && go test ./...` in the plugin repo. +Expected: exit 0, PASS. +Then load-test: build the plugin binary, point a minimal workflow config with `iac.state` `backend: azure_blob` at it (using the workflow worktree's `server` binary built from PR 3's branch), and confirm the engine resolves the plugin-served backend. **Verification (plugin change class — load into host + exercise):** the engine logs the `iac.state` module constructing a `grpcIaCStateStore` for `azure_blob`, and a `SaveState`/`GetState` round-trips. Capture the transcript. + +**Step 7: Commit (in the plugin repo)** + +```bash +git add internal/statebackend/ plugin.json cmd/ +git commit -m "feat: serve azure_blob IaCStateBackend + +Ports AzureBlobIaCStateStore from workflow core and serves it behind the +new proto.IaCStateBackend gRPC contract. Advertises azure_blob in +plugin.json so the workflow engine resolves it at plugin-load time. This +plugin version is the prerequisite for workflow dropping its in-core +azure_blob backend." 
+``` + +Rollback: `git revert` in the plugin repo — additive (new service + capability); the plugin's existing IaC-provider role is untouched, so reverting leaves the plugin fully functional minus the new state backend. + +--- + +### Task 12: Tag a `workflow-plugin-azure` release implementing `IaCStateBackend` + +**Files (in `/Users/jon/workspace/workflow-plugin-azure`):** +- Modify: `CHANGELOG.md` + +**Step 1: Add the CHANGELOG entry** + +Document the new `azure_blob` `IaCStateBackend` capability and the migration note: workflow configs using `iac.state` `backend: azure_blob` on a workflow-core version that has dropped the in-core backend (the PR 5 core version) **must** load this plugin version or newer. + +**Step 2: Commit + tag** + +```bash +git add CHANGELOG.md +git commit -m "docs: changelog for azure_blob IaCStateBackend support" +# Minor version bump — new capability, additive. Confirm current latest tag first: +git tag -a vX.Y.0 -m "azure_blob IaCStateBackend support" +``` + +**Step 3: Push branch + open the plugin PR; after merge, push the tag** + +Run: `git push -u origin feat/azure-blob-state-backend` then `gh pr create ...`. After the plugin PR merges, push the tag so workflow Task 14 can pin to it. **Verification (version pin — the tag must be resolvable):** `git ls-remote --tags origin | grep vX.Y.0` returns the tag. + +Rollback: delete the tag (`git push origin :refs/tags/vX.Y.0`) + revert the CHANGELOG commit. The tag is the externally-visible artifact; deleting it before any consumer pins is clean. + +--- + +## PR 5 — Phase A: core deletes `iac_state_azure.go` + strips `azure_blob` case → drops azure-sdk from go.mod + +The payoff PR. **Prerequisite: PR 4's plugin version is merged + tagged** — after this PR, `backend: azure_blob` has no in-core implementation. 
+ +### Task 13: Delete `iac_state_azure.go` + strip the `azure_blob` case from `iac_module.go` + +**Files:** +- Delete: `module/iac_state_azure.go` +- Delete: `module/iac_state_azure_test.go` (if it exists — its logic now lives + is tested in the plugin repo, Task 11) +- Modify: `module/iac_module.go` +- Modify: `go.mod`, `go.sum` +- Test: `module/iac_module_test.go` (existing + a new case) + +**Step 1: Write the failing test** + +Add to `module/iac_module_test.go` a test asserting that `backend: azure_blob` with **no plugin registered** now returns the plugin-guidance error (not a successful in-process construction), and that with a fake plugin client registered it constructs a `grpcIaCStateStore`: + +```go +func TestIaCModuleAzureBlobRequiresPlugin(t *testing.T) { + m := NewIaCModule("st", map[string]any{"backend": "azure_blob", "container": "c", + "account_url": "https://x", "account_name": "n", "account_key": "k"}) + err := m.Init(newTestApp(t)) + if err == nil { + t.Fatal("azure_blob with no plugin loaded must error — in-core backend is gone") + } + if !strings.Contains(err.Error(), "azure_blob") || !strings.Contains(err.Error(), "plugin") { + t.Fatalf("error should point at the missing plugin: %v", err) + } +} +``` + +(Reuse whatever test-app constructor `iac_module_test.go` already uses; `newTestApp` is a placeholder for the existing helper.) + +**Step 2: Run the test to verify it fails** + +Run: `go test ./module/ -run TestIaCModuleAzureBlobRequiresPlugin -v` +Expected: FAIL — the in-process `azure_blob` case still constructs an `AzureBlobIaCStateStore` successfully, so `Init()` returns nil. + +**Step 3: Strip the `azure_blob` case + `newAzureSharedKeyCredential`** + +In `module/iac_module.go`: remove the entire `case "azure_blob":` block (lines ~86-106) and the `newAzureSharedKeyCredential` helper + the `azblob` import. 
The `default:` arm added in Task 8 now handles `azure_blob` — it consults the plugin registry and returns the plugin-guidance error if unregistered. Also: while in this file, **fix the stale line-18 doc comment** (`"Supported backends: 'memory' … 'filesystem' … 'spaces'"`) to list all currently-supported backends accurately (`memory`, `filesystem`, `gcs`, `spaces`, `postgres`, plus "and any backend provided by a loaded plugin"). + +**Step 4: Delete `iac_state_azure.go`** + +Run: `git rm module/iac_state_azure.go` (and `git rm module/iac_state_azure_test.go` if present). + +**Step 5: Tidy go.mod** + +Run: `go mod tidy` +Expected: `go.mod` + `go.sum` lose `github.com/Azure/azure-sdk-for-go/sdk/azcore` and `.../sdk/storage/azblob` (and any now-unused transitive azure deps). Confirm with `git diff go.mod`. + +**Step 6: Run the audit script — Azure is gone** + +Run: `bash scripts/audit-cloud-symbols.sh | grep -A8 'azure-sdk-for-go'` +Expected: the `azure-sdk-for-go` section is **empty** (no REAL, no comment-only) — zero azure-sdk references anywhere in the repo. + +**Step 7: Build + test** + +Run: `go build ./... && go test ./module/ -run 'IaCModule|IaCStateBackend' -v` +Expected: exit 0; PASS including the new `TestIaCModuleAzureBlobRequiresPlugin`. + +**Step 8: Runtime-launch validation** + +This task changes plugin loading paths + `go.mod` — a `runtime-launch-validation` trigger. Build the server, launch it with a config that uses `iac.state` `backend: azure_blob` **with the Task 11 plugin available**, and confirm it reaches healthy startup + the backend resolves over gRPC. Then launch with the plugin **absent** and confirm a clean, actionable error (not a panic). Capture both transcripts. + +Run: `go build -o /tmp/server ./cmd/server && /tmp/server -config ...` +Expected: with plugin → engine ready, `iac.state` backend resolved; without plugin → clean `"iac.state backend \"azure_blob\": ... load the plugin"` error, exit non-zero, no panic. 
+ +**Step 9: Commit** + +```bash +git add module/iac_module.go go.mod go.sum +git rm module/iac_state_azure.go +# git rm module/iac_state_azure_test.go # if it existed +git commit -m "feat(module)!: drop in-core azure_blob IaC state backend + +Deletes iac_state_azure.go and strips the azure_blob case + +newAzureSharedKeyCredential from iac_module.go. backend: azure_blob now +resolves an IaCStateBackend gRPC client from workflow-plugin-azure +(>= vX.Y.0). go mod tidy removes Azure/azure-sdk-for-go entirely — the +audit script confirms zero azure-sdk references repo-wide. + +BREAKING: iac.state with backend: azure_blob now requires +workflow-plugin-azure to be loaded. See docs/migrations. + +Rollback: revert this commit + go mod tidy restores the in-core backend +and re-adds azure-sdk to go.mod; smoke-check with an azure_blob config." +``` + +Rollback: revert the commit + `go mod tidy` (restores `iac_state_azure.go`, the in-core case, and the azure-sdk deps) + relaunch the server with an `azure_blob` config to confirm the in-core path works again. + +--- + +### Task 14: Migration doc + pin the engine's plugin-registry population to advertise `azure_blob` + +**Files:** +- Create: `docs/migrations/2026-05-14-cloud-sdk-extraction.md` +- Modify: the engine plugin-load wiring confirmed in Task 8's spike (where plugins are loaded → populate `iacStateBackendRegistry`) +- Test: `module/iac_state_plugin_registry_test.go` (extend) + a launch check + +**Step 1: Write the migration doc** + +Create `docs/migrations/2026-05-14-cloud-sdk-extraction.md` covering (per the design's Migration section, Phase A scope only): `iac.state` with `backend: azure_blob` now requires `wfctl plugin install workflow-plugin-azure` (≥ the Task 12 tag); the yaml `backend: azure_blob` value is unchanged; `memory`/`filesystem`/`postgres` are unaffected. Note that Phases B/C/D (AWS/GCP/DO) will follow the same pattern in subsequent releases. 
+ +**Step 2: Wire plugin-load → registry population** + +In the engine plugin-load path (from Task 8's spike), after each external plugin is loaded, read its advertised `IaCStateBackend` capabilities from `plugin.json` / its `ContractRegistry` and call `iacStateBackendRegistry.register(name, client)` for each — building the `pb.IaCStateBackendClient` from the loaded plugin's gRPC connection (mirror the `typedIaCAdapter` construction in `cmd/wfctl/iac_typed_adapter.go`, but for the engine context and the `IaCStateBackend` service name `workflow.plugin.external.iac.IaCStateBackend`). + +**Step 3: Write/extend the test** + +Add a test that loads a fake plugin advertising `azure_blob` and asserts the engine's registry has it resolvable after load. If a full plugin-load test is too heavy for a unit test, assert the *population function* in isolation: given a fake `ExternalPluginAdapter` advertising `azure_blob`, the population step calls `registry.register("azure_blob", <client>)`, where `<client>` is the `pb.IaCStateBackendClient` built from that adapter's connection. + +**Step 4: Build + test + launch validation** + +Run: `go build ./... && go test ./module/ -run 'IaCStateBackend|IaCModule' -v` +Expected: exit 0, PASS. +Then the end-to-end launch check from Task 13 Step 8 should now work *without manual registry seeding* — the engine auto-populates from the loaded plugin. Re-run that launch with the Task 11 plugin in `./data/plugins/` and confirm `azure_blob` resolves with zero manual wiring. Capture the transcript. + +**Step 5: Commit** + +```bash +git add docs/migrations/2026-05-14-cloud-sdk-extraction.md module/ engine.go +git commit -m "feat(engine): auto-populate iac.state backend registry from loaded plugins + +At plugin-load time the engine reads each plugin's advertised +IaCStateBackend capabilities and registers a gRPC client into the +iac.state backend registry, so iac.state backend: azure_blob resolves +with zero manual wiring. Adds the user-facing migration doc. 
+ +Rollback: revert this commit — iac.state plugin backends then require +manual registry seeding (the registry + dispatch from Task 8 remain); +core in-process backends (memory/filesystem/postgres) are unaffected." +``` + +Rollback: revert the commit; the registry + dispatch plumbing (Task 8) survive, only the auto-population is removed. Core backends unaffected. Relaunch with a `memory` backend config to confirm. + +--- + +## Notes for the executor + +- **TDD discipline:** every task above follows write-test → see-it-fail → implement → see-it-pass → commit. Do not skip the "see it fail" step — it proves the test exercises the new behavior. +- **Cross-repo PR 4:** create the branch + PR in `/Users/jon/workspace/workflow-plugin-azure` separately. It must merge + tag before PR 5. If the pipeline cannot operate cross-repo autonomously, surface PR 4 as a blocker for human action rather than skipping it. +- **PR ordering:** PR 1 → (PR 2, PR 3 in either order) → PR 4 → PR 5. PR 5 is the only breaking change and the only one that touches `go.mod`. +- **Benchmark gate (Task 6):** if the benchmark fails the 5 ms bar, STOP and redesign the proto for streaming before continuing — this is a design-mandated gate, not a formality. +- **Follow-on plan:** once PR 5 merges, author the Phase B/C/D plan. Phase B (AWS) reuses Task 7's converters + Task 8's registry + Task 11's plugin pattern; Phase C (GCP) additionally runs the `kubernetesBackend` interface-audit spike for the `gke` contract decision (design Architecture §2); Phase D (DigitalOcean `spaces`) rides Phase B's `iac_state_spaces.go` deletion. 
From 3499c8b14d755ba0368186b6c28aaca17880ccb2 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:19:52 -0400 Subject: [PATCH 16/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20p?= =?UTF-8?q?lan=20=E2=80=94=20address=20plan-phase=20adversarial=20review?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan-phase adversarial review FAIL (1 Critical + 4 Important + 4 Minor). All addressed: - C1 (Critical): Task 4's proto used google.protobuf.Struct, which iac.proto:6-10 explicitly bans. Rewrote IaCState to carry Outputs/Config as `bytes outputs_json`/`bytes config_json` (the established ResourceState pattern); Tasks 5/7/11 now convert via encoding/json, not structpb. Removed the bogus struct.proto import step. - I1: Task 4 `buf generate` now runs from worktree root (buf.yaml lives there), not `cd plugin/external/proto`. - I2: Task 6 acknowledges the existing benchmark.yml (-bench=. picks up the new benchmarks automatically) — no redundant harness; clarified the task is a one-time decision gate. - I3: Task 8's embedded research spike resolved at plan time — engine.go was read; integration is the design-sanctioned package-level module.iacStateBackendRegistry populated by StdEngine.loadPluginInternal. Tasks 8/13/14 now have concrete file sets. - I4: Scope Manifest now declares PR 4 a human-action gate (cross-repo, workflow-plugin-azure) with the PR4->PR5 dependency stated explicitly. - M1: Task 5's benchmark file is now genuinely self-contained (local benchStateToProto + benchStateBackendServer; no forward references). - M2: Task 3 names ci.yml directly, places the audit job beside the existing godo-banned/aws-sdk-banned grep-gate jobs. - M3: Task 6 pins benchstat (go install + bare invocation). - M4: Task 9 states the redaction gap is verified against step_output_redactor.go:7-19, not a live deduction. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/plans/2026-05-14-cloud-sdk-extraction.md | 178 +++++++++++------- 1 file changed, 112 insertions(+), 66 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md b/docs/plans/2026-05-14-cloud-sdk-extraction.md index 46e55e7f..a83f7f4e 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction.md @@ -35,9 +35,11 @@ | 1 | Phase 0: split platform_kubernetes_kind.go + wire audit script into CI | Task 1, Task 2, Task 3 | feat/cloud-sdk-extraction-p0 | | 2 | Phase A: IaCStateBackend proto + benchmark harness + proto lock | Task 4, Task 5, Task 6 | feat/cloud-sdk-extraction-pa-proto | | 3 | Phase A: host-side IaCStateBackend resolution + secret-redaction + gRPC-logging guard | Task 7, Task 8, Task 9, Task 10 | feat/cloud-sdk-extraction-pa-host | -| 4 | Phase A: workflow-plugin-azure implements azure_blob IaCStateBackend | Task 11, Task 12 | (cross-repo: workflow-plugin-azure `feat/azure-blob-state-backend`) | +| 4 | **[HUMAN-GATE — cross-repo]** Phase A: workflow-plugin-azure implements azure_blob IaCStateBackend | Task 11, Task 12 | (cross-repo: `workflow-plugin-azure` repo, branch `feat/azure-blob-state-backend`) | | 5 | Phase A: core deletes iac_state_azure.go + strips azure_blob case → drops azure-sdk from go.mod | Task 13, Task 14 | feat/cloud-sdk-extraction-pa-core | +**PR 4 is a declared human-action gate.** It lands in a *different git repository* (`/Users/jon/workspace/workflow-plugin-azure`), which the autonomous worktree-scoped execution pipeline cannot branch/PR/tag. When execution reaches PR 4, the pipeline must **pause and surface PR 4 to a human operator** (with Tasks 11–12 as the checklist) — this is the expected, designed handoff, not an execution failure. PR 5 is **blocked on PR 4's plugin tag** existing and being installable (Task 13 Step 8 + Task 14 Step 4 runtime-launch validation load the tagged plugin binary). 
PRs 1–3 are fully autonomous-executable in the worktree; PR 5 resumes autonomously once the operator confirms PR 4 merged + tagged. + **Status:** Draft --- @@ -196,15 +198,10 @@ Rollback: `git revert` — comment-only. **Files:** - Modify: `scripts/audit-cloud-symbols.sh` -- Modify: `.github/workflows/ci.yml` (or the repo's primary CI workflow — confirm the exact filename with `ls .github/workflows/`) +- Modify: `.github/workflows/ci.yml` (verified — the repo's primary build/test workflow; it already hosts grep-gate jobs `godo-banned` and `aws-sdk-banned`, the natural neighbours for this audit step) - Test: `scripts/audit-cloud-symbols.sh --check` (the script self-verifies) -**Step 1: Identify the CI workflow file** - -Run: `ls .github/workflows/` -Expected: a primary build/test workflow (e.g. `ci.yml`, `test.yml`, `go.yml`). Note its name for Step 4. - -**Step 2: Add the `init()`-partition assertion to the script** +**Step 1: Add the `init()`-partition assertion to the script** In `scripts/audit-cloud-symbols.sh`, extend the `--check` path so it fails if any post-Phase-0 file registers both a core-staying and a plugin-bound Kubernetes backend in one `init()`. Add, after the existing `platform_kubernetes_kind.go` advisory block (which becomes moot once the file is gone — guard it with a file-existence check): @@ -231,26 +228,31 @@ fi Also guard the existing `platform_kubernetes_kind.go` advisory block with `[[ -f module/platform_kubernetes_kind.go ]]` so it silently skips post-Phase-0 (the file is gone). -**Step 3: Run the script's check mode locally** +**Step 2: Run the script's check mode locally** Run: `bash scripts/audit-cloud-symbols.sh --check` Expected: prints the real-import map, the new "init() partition clean" line shows `OK`, final line `audit-cloud-symbols: OK`, exit 0. 
-**Step 4: Wire it into CI** +**Step 3: Wire it into CI** -Add a step to the CI workflow identified in Step 1, in the existing build/test job, after checkout + Go setup: +Add a new job to `.github/workflows/ci.yml` alongside the existing `godo-banned` / `aws-sdk-banned` grep-gate jobs (same shape — `runs-on: ubuntu-latest`, checkout, run the script): ```yaml - - name: Cloud-SDK inventory + partition audit + cloud-sdk-audit: + name: Cloud-SDK inventory + k8s-backend init() partition audit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Audit cloud-SDK imports + init() partition run: bash scripts/audit-cloud-symbols.sh --check ``` -**Step 5: Verify the workflow YAML is valid** +**Step 4: Verify the workflow YAML is valid** -Run: `bash -n scripts/audit-cloud-symbols.sh` (script syntax) and visually confirm the YAML indentation matches the surrounding steps in the workflow file. -Expected: script syntax OK; YAML step nested at the same level as sibling steps. +Run: `bash -n scripts/audit-cloud-symbols.sh` (script syntax) and visually confirm the new job's indentation matches the sibling `godo-banned` / `aws-sdk-banned` jobs in `ci.yml`. +Expected: script syntax OK; the `cloud-sdk-audit` job nested at the same level as `godo-banned`. -**Step 6: Commit** +**Step 5: Commit** ```bash git add scripts/audit-cloud-symbols.sh .github/workflows/ @@ -297,8 +299,10 @@ func TestIaCStateBackendGeneratedTypesExist(t *testing.T) { _ = &ListStatesRequest{Filter: map[string]string{"k": "v"}} _ = &LockRequest{ResourceId: "r"} _ = &UnlockRequest{ResourceId: "r"} - // IaCState mirrors module.IaCState's JSON-serialisable fields. - s := &IaCState{ResourceId: "r", ResourceType: "kubernetes", Provider: "azure", Status: "active"} + // IaCState mirrors module.IaCState; free-form Outputs/Config cross the wire + // as JSON bytes per the iac.proto hard invariant (NO google.protobuf.Struct). 
+ s := &IaCState{ResourceId: "r", ResourceType: "kubernetes", Provider: "azure", + Status: "active", OutputsJson: []byte(`{}`), ConfigJson: []byte(`{}`)} if s.GetResourceId() != "r" { t.Fatalf("IaCState.ResourceId accessor missing") } @@ -312,7 +316,7 @@ Expected: FAIL — build error, `IaCStateBackendServer` / `GetStateRequest` etc. **Step 3: Add the service + messages to `iac.proto`** -Append to `plugin/external/proto/iac.proto` (after the `ResourceDriver` service, before EOF). Mirror `module.IaCState` field-for-field (see `module/iac_state.go:4-18`): +Append to `plugin/external/proto/iac.proto` (after the `ResourceDriver` service, before EOF). Mirror `module.IaCState` field-for-field (see `module/iac_state.go:4-18`). **Hard invariant — `iac.proto:6-10`: NO `google.protobuf.Struct`, NO `google.protobuf.Any`.** Free-form `Outputs` / `Config` maps cross the wire as `bytes *_json`, JSON-encoded by host/plugin — exactly the established `ResourceState` pattern (`iac.proto:144`, fields `applied_config_json` / `outputs_json`). Do **not** add a `struct.proto` import; `iac.proto` imports only `timestamp.proto` and that must not change. ```proto // IaCStateBackend — strict contract for IaC state storage backends served by a @@ -329,8 +333,9 @@ service IaCStateBackend { rpc Unlock (UnlockRequest) returns (UnlockResponse); } -// IaCState mirrors module.IaCState (module/iac_state.go). Maps used there -// (Outputs, Config) carry arbitrary JSON — represented here as google.protobuf.Struct. +// IaCState mirrors module.IaCState (module/iac_state.go:4-18). The free-form +// Outputs / Config map[string]any fields cross the wire as JSON bytes per the +// iac.proto hard invariant — same pattern as ResourceState.outputs_json.
message IaCState { string resource_id = 1; string resource_type = 2; @@ -339,9 +344,9 @@ message IaCState { string provider_id = 5; string config_hash = 6; string status = 7; - google.protobuf.Struct outputs = 8; - google.protobuf.Struct config = 9; - repeated string dependencies = 10; + bytes outputs_json = 8; // JSON-encoded map[string]any (module.IaCState.Outputs) + bytes config_json = 9; // JSON-encoded map[string]any (module.IaCState.Config) + repeated string dependencies = 10; string created_at = 11; string updated_at = 12; string error = 13; @@ -361,12 +366,10 @@ message UnlockRequest { string resource_id = 1; } message UnlockResponse {} ``` -Confirm `iac.proto` already imports `google/protobuf/struct.proto` (it uses `Struct` elsewhere); if not, add `import "google/protobuf/struct.proto";` near the top with the other imports. - **Step 4: Regenerate the Go bindings** -Run: `cd plugin/external/proto && buf generate` (per `plugin/external/proto/README.md`) -Expected: `iac.pb.go` + `iac_grpc.pb.go` regenerated, now containing `IaCStateBackendServer`, `IaCStateBackendClient`, and the message types. `git diff --stat` shows only the two `*.pb.go` files changed plus `iac.proto`. +Run: `buf generate` **from the worktree root** (per `plugin/external/proto/README.md` — `buf.yaml` / `buf.gen.yaml` live at repo root; running from inside `plugin/external/proto/` will not find them). +Expected: `plugin/external/proto/iac.pb.go` + `iac_grpc.pb.go` regenerated, now containing `IaCStateBackendServer`, `IaCStateBackendClient`, and the message types. `git diff --stat` shows only the two `*.pb.go` files changed plus `iac.proto`. **Step 5: Run the test to verify it passes** @@ -402,14 +405,18 @@ Rollback: `git revert` — proto + generated code only, no runtime wiring yet; r **Step 1: Write the benchmark** -Create `module/benchmark_iac_state_backend_test.go`. 
It drives a synthetic ~1 MB `IaCState` through a full `Lock → GetState → SaveState → Unlock` cycle two ways: (a) directly against an in-process `IaCStateStore` (the `memory` backend — the baseline this design replaces), (b) against the same store wrapped behind a real in-memory gRPC `IaCStateBackend` server+client pair (the post-extraction path). It reports `b.ReportMetric` for added latency. +Create `module/benchmark_iac_state_backend_test.go`. It drives a synthetic ~1 MB `IaCState` through a full `Lock → GetState → SaveState → Unlock` cycle two ways: (a) directly against an in-process `IaCStateStore` (the `memory` backend — the baseline this design replaces), (b) against the same store wrapped behind a real in-memory gRPC `IaCStateBackend` server+client pair (the post-extraction path). + +This task's file is **fully self-contained** — it defines its own local `benchStateToProto` converter and `benchStateBackendServer`. Task 7 introduces the *production* converters + server type; once those exist, this file is simplified (Task 7 Step 3) to reuse them. Per the `iac.proto` hard invariant, the free-form `Outputs`/`Config` maps convert via `encoding/json`, **not** `structpb`. ```go package module import ( "context" + "encoding/json" "net" + "strconv" "strings" "testing" @@ -424,7 +431,7 @@ func oneMBState() *IaCState { big := strings.Repeat("x", 1024) outputs := make(map[string]any, 1024) for i := 0; i < 1024; i++ { - outputs["k"+strings.Repeat("0", 3)+itoa(i)] = big + outputs["k"+strconv.Itoa(i)] = big } return &IaCState{ ResourceID: "bench-resource", ResourceType: "kubernetes", Provider: "azure", @@ -433,16 +440,55 @@ func oneMBState() *IaCState { } } -func itoa(i int) string { // tiny local helper; avoid strconv import noise in bench file - if i == 0 { - return "0" +// benchStateToProto — local, self-contained IaCState -> pb.IaCState converter. +// Task 7 replaces this with the production iacStateToProto. 
+func benchStateToProto(s *IaCState) *pb.IaCState { + outJSON, _ := json.Marshal(s.Outputs) + cfgJSON, _ := json.Marshal(s.Config) + return &pb.IaCState{ + ResourceId: s.ResourceID, ResourceType: s.ResourceType, Provider: s.Provider, + Status: s.Status, OutputsJson: outJSON, ConfigJson: cfgJSON, + CreatedAt: s.CreatedAt, UpdatedAt: s.UpdatedAt, + } +} + +// benchStateBackendServer wraps an IaCStateStore behind pb.IaCStateBackendServer. +// Task 7 promotes this to the production iacStateBackendServer. +type benchStateBackendServer struct { + pb.UnimplementedIaCStateBackendServer + store IaCStateStore +} + +func (s *benchStateBackendServer) GetState(_ context.Context, r *pb.GetStateRequest) (*pb.GetStateResponse, error) { + st, err := s.store.GetState(r.ResourceId) + if err != nil { + return nil, err } - var b []byte - for i > 0 { - b = append([]byte{byte('0' + i%10)}, b...) - i /= 10 + if st == nil { + return &pb.GetStateResponse{Exists: false}, nil } - return string(b) + return &pb.GetStateResponse{Exists: true, State: benchStateToProto(st)}, nil +} +func (s *benchStateBackendServer) SaveState(_ context.Context, r *pb.SaveStateRequest) (*pb.SaveStateResponse, error) { + var outputs, config map[string]any + _ = json.Unmarshal(r.State.OutputsJson, &outputs) + _ = json.Unmarshal(r.State.ConfigJson, &config) + return &pb.SaveStateResponse{}, s.store.SaveState(&IaCState{ + ResourceID: r.State.ResourceId, ResourceType: r.State.ResourceType, + Provider: r.State.Provider, Status: r.State.Status, Outputs: outputs, Config: config, + }) +} +func (s *benchStateBackendServer) Lock(_ context.Context, r *pb.LockRequest) (*pb.LockResponse, error) { + return &pb.LockResponse{}, s.store.Lock(r.ResourceId) +} +func (s *benchStateBackendServer) Unlock(_ context.Context, r *pb.UnlockRequest) (*pb.UnlockResponse, error) { + return &pb.UnlockResponse{}, s.store.Unlock(r.ResourceId) +} +func (s *benchStateBackendServer) ListStates(_ context.Context, _ *pb.ListStatesRequest) 
(*pb.ListStatesResponse, error) { + return &pb.ListStatesResponse{}, nil +} +func (s *benchStateBackendServer) DeleteState(_ context.Context, r *pb.DeleteStateRequest) (*pb.DeleteStateResponse, error) { + return &pb.DeleteStateResponse{}, s.store.DeleteState(r.ResourceId) } // BenchmarkIaCStateBackend_InProcess is the baseline: direct IaCStateStore calls. @@ -471,7 +517,7 @@ func BenchmarkIaCStateBackend_InProcess(b *testing.B) { func BenchmarkIaCStateBackend_GRPC(b *testing.B) { lis := bufconn.Listen(4 << 20) // 4 MiB — gRPC default message cap srv := grpc.NewServer() - pb.RegisterIaCStateBackendServer(srv, newBenchStateBackendServer(NewMemoryIaCStateStore())) + pb.RegisterIaCStateBackendServer(srv, &benchStateBackendServer{store: NewMemoryIaCStateStore()}) go func() { _ = srv.Serve(lis) }() defer srv.Stop() @@ -484,7 +530,7 @@ func BenchmarkIaCStateBackend_GRPC(b *testing.B) { defer conn.Close() client := pb.NewIaCStateBackendClient(conn) st := oneMBState() - pbState := iacStateToProto(st) // helper introduced in Task 7; for the bench, inline a minimal conversion + pbState := benchStateToProto(st) ctx := context.Background() b.ResetTimer() for i := 0; i < b.N; i++ { @@ -504,10 +550,6 @@ func BenchmarkIaCStateBackend_GRPC(b *testing.B) { } ``` -> **Note for the executor:** `newBenchStateBackendServer` and `iacStateToProto` do not exist yet — Task 5 introduces a minimal local `newBenchStateBackendServer` in this same `_test.go` file (a thin adapter wrapping an `IaCStateStore` behind the generated `IaCStateBackendServer` interface, plus inline proto⇄struct conversion). Task 7 promotes the *production* converters into non-test code; the benchmark file may then be simplified to reuse them. Keep the bench file self-contained for this task. 
- -Add to the same file a minimal `newBenchStateBackendServer` implementing `pb.IaCStateBackendServer` by delegating to an `IaCStateStore`, with inline `IaCState`⇄`pb.IaCState` conversion (use `structpb.NewStruct` for the `Outputs`/`Config` maps). - **Step 2: Run the benchmark to verify it builds + runs** Run: `go test ./module/ -bench BenchmarkIaCStateBackend -benchmem -run '^$' -count=6 | tee /tmp/iac-state-bench.txt` @@ -535,6 +577,8 @@ Rollback: `git revert` — test-only file. - Create: `docs/plans/2026-05-14-iac-state-backend-benchmark.md` (the recorded result + decision) - Modify: `plugin/external/proto/iac.proto` (only if the benchmark forces a streaming redesign — expected: no change) +**Note on CI:** the repo already has `.github/workflows/benchmark.yml` running `make bench-baseline` / `make bench-compare` (which use `-bench=.`). The new `BenchmarkIaCStateBackend_*` functions are picked up by that workflow automatically — no new harness or workflow is needed. This task is a one-time *decision gate* (lock unary vs. streaming), not a recurring CI check; the recurring `benchmark.yml` regression-tracking is sufficient ongoing coverage. + **Step 1: Run the benchmark with statistical rigor** Run: `go test ./module/ -bench BenchmarkIaCStateBackend -benchmem -run '^$' -count=10 | tee /tmp/iac-state-bench.txt` @@ -542,7 +586,8 @@ Expected: 10 samples each for `_InProcess` and `_GRPC`. **Step 2: Compute the added latency** -Run: `go run golang.org/x/perf/cmd/benchstat /tmp/iac-state-bench.txt` (or `benchstat` if already on PATH per the Makefile's `bench-compare` target) +Install + run benchstat (the Makefile's `bench-compare` target assumes it on PATH): +Run: `go install golang.org/x/perf/cmd/benchstat@latest && benchstat /tmp/iac-state-bench.txt` Expected: a side-by-side of `_InProcess` vs `_GRPC` ns/op with variance. 
**Step 3: Evaluate against the acceptance bar** @@ -604,7 +649,7 @@ import ( func TestGRPCIaCStateStoreRoundTrip(t *testing.T) { lis := bufconn.Listen(4 << 20) srv := grpc.NewServer() - pb.RegisterIaCStateBackendServer(srv, newBenchStateBackendServer(NewMemoryIaCStateStore())) + pb.RegisterIaCStateBackendServer(srv, &iacStateBackendServer{store: NewMemoryIaCStateStore()}) go func() { _ = srv.Serve(lis) }() defer srv.Stop() @@ -650,9 +695,12 @@ Expected: FAIL — `newGRPCIaCStateStore` undefined. **Step 3: Implement the converters + adapter** -Create `module/iac_state_grpc_client.go` with: `iacStateToProto(*IaCState) (*pb.IaCState, error)` and `iacStateFromProto(*pb.IaCState) (*IaCState, error)` (using `structpb.NewStruct` / `.AsMap()` for the `Outputs`/`Config` maps), and `grpcIaCStateStore` — a struct holding a `pb.IaCStateBackendClient` that implements all six `IaCStateStore` methods by delegating over gRPC. `GetState` maps a `GetStateResponse{Exists:false}` to `(nil, nil)` per the interface contract ("Returns nil, nil when not found"). Constructor: `newGRPCIaCStateStore(c pb.IaCStateBackendClient) *grpcIaCStateStore`. Use `context.Background()` for now (a context-plumbing follow-up can thread a real ctx later — out of scope here). +Create `module/iac_state_grpc_client.go` with: +- `iacStateToProto(*IaCState) (*pb.IaCState, error)` and `iacStateFromProto(*pb.IaCState) (*IaCState, error)` — converting the free-form `Outputs`/`Config` maps via `encoding/json` `Marshal`/`Unmarshal` into/out of the `OutputsJson`/`ConfigJson` `[]byte` proto fields. **No `structpb`** — that violates the `iac.proto:6-10` hard invariant; the established pattern is JSON bytes (matches `ResourceState.outputs_json`). A `nil` map marshals to `[]byte("null")` — `iacStateFromProto` treats empty/`null`/`{}` bytes as a `nil` map. +- `grpcIaCStateStore` — a struct holding a `pb.IaCStateBackendClient` that implements all six `IaCStateStore` methods by delegating over gRPC. 
`GetState` maps a `GetStateResponse{Exists:false}` to `(nil, nil)` per the interface contract ("Returns nil, nil when not found"). Constructor: `newGRPCIaCStateStore(c pb.IaCStateBackendClient) *grpcIaCStateStore`. Use `context.Background()` for now (a context-plumbing follow-up can thread a real ctx later — out of scope here). +- `iacStateBackendServer` — the *production* server type: wraps an `IaCStateStore` behind `pb.IaCStateBackendServer`, delegating each RPC, using the same `iacStateToProto`/`iacStateFromProto` converters. Core does not yet *serve* this anywhere, but the Azure plugin's Task 11 needs the exact same delegation shape — keeping one canonical copy in core (which the plugin imports) avoids drift. -Also: move the `newBenchStateBackendServer` helper out of the Task 5 `_test.go` file into this file as `iacStateBackendServer` (a production type — it is the *server* half core needs nowhere yet, but the Azure plugin's Task 11 needs the exact same delegation shape; keeping one canonical copy avoids drift). Update `module/benchmark_iac_state_backend_test.go` to use the promoted `iacStateToProto` + `iacStateBackendServer` and delete its inline copies. +Then update `module/benchmark_iac_state_backend_test.go`: delete its local `benchStateToProto` + `benchStateBackendServer` and use the promoted `iacStateToProto` + `iacStateBackendServer` instead. **Step 4: Run the test to verify it passes** @@ -683,16 +731,14 @@ Rollback: `git revert` — new file + test; no engine wiring yet, core builds un ### Task 8: Engine-side plugin backend registry — resolve `iac.state` backends from loaded plugins +**Integration approach (resolved at plan time — `engine.go` was read; no open spike).** `engine.go` exposes no per-module handle to the external-plugin set reachable from `IaCModule.Init(app modular.Application)` — external plugins are loaded via `StdEngine.loadPluginInternal` (`engine.go:257`) through a `plugin.PluginLoader`, not a manager a module can query. 
Therefore the integration is the design's Architecture §1 fallback (which the design explicitly sanctions): **a package-level `module.iacStateBackendRegistry`**, populated by `engine.go`'s plugin-load path (Task 14 wires the population), consulted by `IaCModule.Init()` (this task). This task builds + tests the registry and the `IaCModule` dispatch; Task 14 wires `engine.go` → registry population. + **Files:** - Modify: `module/iac_module.go` - Create: `module/iac_state_plugin_registry.go` - Test: `module/iac_state_plugin_registry_test.go` -**Step 1: Spike — confirm the engine's external-plugin-manager handle (≤15 min, no code)** - -Read `engine.go` (`BuildFromConfig`) and `plugin/external/` to confirm how the engine loads external plugins at startup and where a module's `Init(app modular.Application)` can reach the set of loaded `ExternalPluginAdapter`s. The deploy path (`cmd/wfctl/deploy_providers.go`) scans `./data/plugins` directly; the engine likely already has an `ExternalPluginManager` in `BuildFromConfig`. Record the handle path in a one-paragraph comment at the top of `iac_state_plugin_registry.go`. **If the engine has no such handle reachable from module Init**, the fallback (design Architecture §1) is a package-level registry that the engine populates at plugin-load time — implement that instead. Pick whichever the spike confirms; both satisfy the design. - -**Step 2: Write the failing test** +**Step 1: Write the failing test** Create `module/iac_state_plugin_registry_test.go`. Test the registry: registering a backend name → `pb.IaCStateBackendClient` factory, and looking it up. Use a fake client. Assert: an unknown backend name returns `(nil, false)`; a registered name returns the client; registering a **reserved** name (`memory`/`filesystem`/`postgres`) returns an error (design Failure-modes "reserved-name collision"). 
@@ -724,32 +770,32 @@ func TestIaCStateBackendRegistry(t *testing.T) { (Define a minimal `fakeStateBackendClient` satisfying `pb.IaCStateBackendClient` in the test file.) -**Step 3: Run the test to verify it fails** +**Step 2: Run the test to verify it fails** Run: `go test ./module/ -run TestIaCStateBackendRegistry -v` Expected: FAIL — `newIaCStateBackendRegistry` undefined. -**Step 4: Implement the registry** +**Step 3: Implement the registry** Create `module/iac_state_plugin_registry.go`: an `iacStateBackendRegistry` struct wrapping a `map[string]pb.IaCStateBackendClient` + a mutex. `register(name, client)` rejects the reserved names `memory`/`filesystem`/`postgres` with a clear error (`"plugin registered reserved iac.state backend name %q"`). `resolve(name)` returns `(client, ok)`. Provide a package-level default registry instance the engine populates at plugin-load time, plus `newIaCStateBackendRegistry()` for tests. -**Step 5: Run the test to verify it passes** +**Step 4: Run the test to verify it passes** Run: `go test ./module/ -run TestIaCStateBackendRegistry -v` Expected: PASS. -**Step 6: Wire `IaCModule.Init()` to consult the registry** +**Step 5: Wire `IaCModule.Init()` to consult the registry** -Modify `module/iac_module.go` `Init()`: in the backend `switch`, for any backend name **not** in the core set (`memory`/`filesystem`/`postgres` — and, until later phases, still `spaces`/`gcs`/`azure_blob` keep their in-process cases for now), add a `default:` arm that consults the plugin registry: if `reg.resolve(m.backend)` succeeds, `m.store = newGRPCIaCStateStore(client)`; if not, return the existing `"unsupported backend"` error **extended** with `" (or load the plugin that provides it)"`. Crucially: the `default` arm must run *before* the final error return. The in-process `azure_blob` case stays untouched in this PR — PR 5 deletes it. The point of this task is the *plumbing* exists and is tested; PR 5 flips `azure_blob` to use it. 
+Modify `module/iac_module.go` `Init()`: in the backend `switch`, for any backend name **not** in the core set (`memory`/`filesystem`/`postgres` — and, until later phases, still `spaces`/`gcs`/`azure_blob` keep their in-process cases for now), add a `default:` arm that consults the package-level plugin registry: if `iacStateBackendRegistry.resolve(m.backend)` succeeds, `m.store = newGRPCIaCStateStore(client)`; if not, return the existing `"unsupported backend"` error **extended** with `" (or load the plugin that provides it)"`. Crucially: the `default` arm must run *before* the final error return. The in-process `azure_blob` case stays untouched in this PR — PR 5 deletes it. The point of this task is the *plumbing* exists and is tested; PR 5 flips `azure_blob` to use it. -Add a focused test in `iac_state_plugin_registry_test.go` constructing an `IaCModule` with `backend: "azure_blob_test_only"`, a registry pre-populated with a fake client for that name, and asserting `Init()` sets `m.store` to a `*grpcIaCStateStore`. +Add a focused test in `iac_state_plugin_registry_test.go` constructing an `IaCModule` with `backend: "azure_blob_test_only"`, the package-level registry pre-populated with a fake client for that name (clean it up with a `defer`), and asserting `Init()` sets `m.store` to a `*grpcIaCStateStore`. -**Step 7: Build + test** +**Step 6: Build + test** Run: `go build ./... && go test ./module/ -run 'IaCStateBackend|IaCModule' -v` Expected: exit 0, PASS. -**Step 8: Commit** +**Step 7: Commit** ```bash git add module/iac_module.go module/iac_state_plugin_registry.go module/iac_state_plugin_registry_test.go @@ -804,7 +850,7 @@ func TestRedactCredentialsBlock(t *testing.T) { **Step 2: Run the test to verify it fails** Run: `go test ./module/ -run TestRedactCredentialsBlock -v` -Expected: FAIL — `accessKey` already matches `access_key`? Check: `SensitiveFieldPatterns` has `access_key` (underscore) but the key here is `accessKey` (camelCase). 
The existing substring match is case-insensitive but `accessKey` does not contain `access_key`. So `accessKey`/`secretKey` are **not** currently redacted → test fails. +Expected: FAIL. Verified against `module/step_output_redactor.go:7-19`: `SensitiveFieldPatterns` contains `"access_key"` (underscore) but **not** `"accesskey"` / `"secretkey"`. Matching is case-insensitive substring, and `accessKey` (camelCase, no underscore) does not contain the substring `access_key` — so the `credentials:` block's `accessKey`/`secretKey` keys are not currently redacted, and the test fails as written. **Step 3: Implement** @@ -933,11 +979,11 @@ Expected: PASS — the store's logic is unchanged, only its home moved. **Step 4: Write the `IaCStateBackendServer` impl** -Create `internal/statebackend/server.go` implementing `proto.IaCStateBackendServer` (from `github.com/GoCodeAlone/workflow/plugin/external/proto`) by delegating each RPC to an `AzureBlobIaCStateStore`, with the same `IaCState`⇄`pb.IaCState` conversion shape promoted in workflow Task 7. Mirror that converter exactly (the plugin imports the same `proto` package, so the wire types are identical). +Create `internal/statebackend/server.go` implementing `proto.IaCStateBackendServer` (from `github.com/GoCodeAlone/workflow/plugin/external/proto`) by delegating each RPC to an `AzureBlobIaCStateStore`. Use **JSON `Marshal`/`Unmarshal`** for the `Outputs`/`Config` ⇄ `OutputsJson`/`ConfigJson` `[]byte` fields — mirror the workflow-core converters from Task 7 (`iacStateToProto`/`iacStateFromProto`) exactly; the plugin imports the same `proto` package so the wire types are identical. **No `structpb`** — the `iac.proto:6-10` hard invariant forbids it. 
**Step 5: Wire it into the plugin's serve path + manifest** -Register the `IaCStateBackend` service on the plugin's gRPC server alongside its existing `IaCProviderRequired` service, and add `azure_blob` to the plugin's advertised state-backend capabilities in `plugin.json` (mirror how the existing `iacProvider` capability is declared — the host's registry-population step in workflow Task 8 reads this). +Register the `IaCStateBackend` service on the plugin's gRPC server alongside its existing `IaCProviderRequired` service, and add `azure_blob` to the plugin's advertised state-backend capabilities in `plugin.json` (mirror how the existing `iacProvider` capability is declared — the engine's registry-population step in workflow Task 14 reads this). **Step 6: Build + load-test the plugin** @@ -1085,7 +1131,7 @@ Rollback: revert the commit + `go mod tidy` (restores `iac_state_azure.go`, the **Files:** - Create: `docs/migrations/2026-05-14-cloud-sdk-extraction.md` -- Modify: the engine plugin-load wiring confirmed in Task 8's spike (where plugins are loaded → populate `iacStateBackendRegistry`) +- Modify: `engine.go` — the `StdEngine.loadPluginInternal` path (per Task 8's resolved integration approach: external plugins load here; populate `module.iacStateBackendRegistry` after a successful external-plugin load) - Test: `module/iac_state_plugin_registry_test.go` (extend) + a launch check **Step 1: Write the migration doc** @@ -1094,7 +1140,7 @@ Create `docs/migrations/2026-05-14-cloud-sdk-extraction.md` covering (per the de **Step 2: Wire plugin-load → registry population** -In the engine plugin-load path (from Task 8's spike), after each external plugin is loaded, read its advertised `IaCStateBackend` capabilities from `plugin.json` / its `ContractRegistry` and call `iacStateBackendRegistry.register(name, client)` for each — building the `pb.IaCStateBackendClient` from the loaded plugin's gRPC connection (mirror the `typedIaCAdapter` construction in 
`cmd/wfctl/iac_typed_adapter.go`, but for the engine context and the `IaCStateBackend` service name `workflow.plugin.external.iac.IaCStateBackend`). +In `engine.go`'s `StdEngine.loadPluginInternal` (per Task 8's resolved integration approach), after a successful external-plugin load, read the plugin's advertised `IaCStateBackend` capabilities from `plugin.json` / its `ContractRegistry` and call `module.iacStateBackendRegistry.register(name, client)` for each — building the `pb.IaCStateBackendClient` from the loaded plugin's gRPC connection (mirror the `typedIaCAdapter` construction in `cmd/wfctl/iac_typed_adapter.go`, but for the engine context and the `IaCStateBackend` service name `workflow.plugin.external.iac.IaCStateBackend`). The registry must be exported from `module` for `engine.go` to populate it — add a `module.RegisterIaCStateBackend(name string, client pb.IaCStateBackendClient) error` wrapper if the registry itself stays unexported. **Step 3: Write/extend the test** From e34be7c4cd4fe9c6196636d2de163036b94621eb Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:26:46 -0400 Subject: [PATCH 17/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20p?= =?UTF-8?q?lan=20=E2=80=94=20plan-review=20cycle=202=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan-phase adversarial review cycle 2: all 9 cycle-1 findings confirmed resolved; 2 new Important + 2 Minor surfaced by the new-defect scan. All addressed: - I-A (Important): Task 9's redaction test was inconsistent with the actual redactMap behavior — a key named `credentials` matches the existing `credential` pattern and is wholesale-replaced with the placeholder STRING before any recursion, so the test's `.(map[string]any)` assertion panicked. 
Reworked Task 9: the `credentials:` block is ALREADY redacted wholesale (regression-tested); the real gap is `credentials_ref` being over-redacted (it's a module name, not a secret) — fix is a narrow `*_ref`-suffix exemption in isSensitiveField, not camelCase leaf patterns (which would be dead code given wholesale redaction happens first). - I-B (Important): Task 14's engine.go integration seam was under-specified and would fight loadPluginInternal's no-concrete-types precedent. Resolved at plan time (engine.go:305-327 read): Task 14 now defines an `IaCStateBackendProvider` optional interface and type-asserts it in loadPluginInternal exactly like the existing stepRegistrySetter/slogLoggerSetter pattern; ExternalPluginAdapter implements it. Concrete file set + code sketch added. - M-i: Task 6's benchmark.yml description corrected (runs `go test -bench=.` inline, not `make bench-baseline`). - M-ii: Task 4 notes the proto README's plugin.proto-specific wording is stale; trust root buf.yaml. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/plans/2026-05-14-cloud-sdk-extraction.md | 115 +++++++++++++----- 1 file changed, 84 insertions(+), 31 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md b/docs/plans/2026-05-14-cloud-sdk-extraction.md index a83f7f4e..39f228f7 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction.md @@ -368,7 +368,7 @@ message UnlockResponse {} **Step 4: Regenerate the Go bindings** -Run: `buf generate` **from the worktree root** (per `plugin/external/proto/README.md` — `buf.yaml` / `buf.gen.yaml` live at repo root; running from inside `plugin/external/proto/` will not find them). +Run: `buf generate` **from the worktree root** — `buf.yaml` / `buf.gen.yaml` live at repo root and `buf.yaml` globs the whole `plugin/external/proto` directory (so `iac.proto` is covered). 
Note: `plugin/external/proto/README.md`'s wording is stale (it references `plugin.proto` specifically) — trust the root `buf.yaml`, not the README prose. Running `buf` from inside `plugin/external/proto/` will not find the config files. Expected: `plugin/external/proto/iac.pb.go` + `iac_grpc.pb.go` regenerated, now containing `IaCStateBackendServer`, `IaCStateBackendClient`, and the message types. `git diff --stat` shows only the two `*.pb.go` files changed plus `iac.proto`. **Step 5: Run the test to verify it passes** @@ -577,7 +577,7 @@ Rollback: `git revert` — test-only file. - Create: `docs/plans/2026-05-14-iac-state-backend-benchmark.md` (the recorded result + decision) - Modify: `plugin/external/proto/iac.proto` (only if the benchmark forces a streaming redesign — expected: no change) -**Note on CI:** the repo already has `.github/workflows/benchmark.yml` running `make bench-baseline` / `make bench-compare` (which use `-bench=.`). The new `BenchmarkIaCStateBackend_*` functions are picked up by that workflow automatically — no new harness or workflow is needed. This task is a one-time *decision gate* (lock unary vs. streaming), not a recurring CI check; the recurring `benchmark.yml` regression-tracking is sufficient ongoing coverage. +**Note on CI:** the repo already has `.github/workflows/benchmark.yml`, which runs `go test -bench=. -benchmem -count=6 -run=^$` over `./...` inline. The new `BenchmarkIaCStateBackend_*` functions are picked up by that `-bench=.` automatically — no new harness or workflow is needed. This task is a one-time *decision gate* (lock unary vs. streaming), not a recurring CI check; the recurring `benchmark.yml` run is sufficient ongoing coverage. 
**Step 1: Run the benchmark with statistical rigor** @@ -812,11 +812,17 @@ Rollback: `git revert` — the registry is additive and the `azure_blob` in-proc --- -### Task 9: Extend secret redaction to recognise `credentials:` / `credentials_ref:` keys +### Task 9: Confirm `credentials:` redaction + exempt `credentials_ref:` from over-redaction + +**Verified redactor behavior (read `module/step_output_redactor.go` in full):** `redactMap` (lines 44-58) — if a key matches a `SensitiveFieldPatterns` substring (`isSensitiveField`, case-insensitive), the *whole value* is replaced with the `RedactionPlaceholder` **string** and the loop `continue`s — **it does not recurse into a sensitive-keyed map.** The pattern list already contains `"credential"` (line 11). Therefore: +- A key literally named `credentials` (matches substring `credential`) → its entire sub-tree is *already* replaced with `"[REDACTED]"`. **The `credentials:` block is already fully redacted** — the design's Security section explicitly allows "confirm the existing redaction already covers" as the resolution, and it does. +- A key named `credentials_ref` *also* matches `credential` → it is *also* redacted. But `credentials_ref` is a **module name, not a secret** — the design says it should be preserved (it's a reference for DRY). The existing behavior **over-redacts** it, costing trace debuggability. + +So Task 9 is **not** "add camelCase leaf patterns" (the `credentials:` block is caught wholesale already, before any recursion — leaf patterns would never be consulted). Task 9 is: lock in the `credentials:`-block redaction with a regression test, and add a narrow exemption so `credentials_ref` (a reference, not a secret) is preserved. 
**Files:** - Modify: `module/step_output_redactor.go` -- Test: `module/step_output_redactor_test.go` (existing — add cases) +- Test: `module/step_output_redactor_test.go` (existing — add a case) **Step 1: Write the failing test** @@ -833,13 +839,15 @@ func TestRedactCredentialsBlock(t *testing.T) { "bucket": "public-bucket-name", } out := RedactStepOutput(in) - creds := out["credentials"].(map[string]any) - if creds["accessKey"] != RedactionPlaceholder || creds["secretKey"] != RedactionPlaceholder { - t.Fatalf("credentials block not redacted: %+v", creds) + // The credentials: block is redacted WHOLESALE — the existing "credential" + // pattern replaces the whole sub-tree with the placeholder STRING (no + // recursion). That is safe and is the design-sanctioned "already covered". + if out["credentials"] != RedactionPlaceholder { + t.Fatalf("credentials block must be wholesale-redacted, got: %#v", out["credentials"]) } - // credentials_ref is a module NAME, not a secret — must NOT be redacted. + // credentials_ref is a module NAME, not a secret — must be PRESERVED. if out["credentials_ref"] != "aws-creds-module" { - t.Fatalf("credentials_ref should not be redacted (it is a module reference)") + t.Fatalf("credentials_ref must NOT be redacted (it is a module reference): %#v", out["credentials_ref"]) } if out["bucket"] != "public-bucket-name" { t.Fatalf("non-sensitive field wrongly redacted") @@ -850,28 +858,39 @@ func TestRedactCredentialsBlock(t *testing.T) { **Step 2: Run the test to verify it fails** Run: `go test ./module/ -run TestRedactCredentialsBlock -v` -Expected: FAIL. Verified against `module/step_output_redactor.go:7-19`: `SensitiveFieldPatterns` contains `"access_key"` (underscore) but **not** `"accesskey"` / `"secretkey"`. 
Matching is case-insensitive substring, and `accessKey` (camelCase, no underscore) does not contain the substring `access_key` — so the `credentials:` block's `accessKey`/`secretKey` keys are not currently redacted, and the test fails as written. +Expected: FAIL — `out["credentials_ref"]` is `"[REDACTED]"` (the key matches the existing `credential` substring pattern), not the preserved module name. (`out["credentials"]` already passes — it is correctly wholesale-redacted today.) + +**Step 3: Implement the `credentials_ref` exemption** + +In `module/step_output_redactor.go`, `isSensitiveField` already has an exemption mechanism (the `_display` suffix at line 64). Add a sibling suffix-based exemption for reference keys, so that `credentials_ref` (and, by the general principle, any `*_ref` key — it is a name, not a secret) is never redacted. Minimal form — extend `isSensitiveField`: -**Step 3: Implement** +```go +// Reference keys hold module/resource NAMES, not secrets — never redact them, +// even though "credentials_ref" contains the "credential" substring. +if strings.HasSuffix(lower, "_ref") { + return false +} +``` -In `module/step_output_redactor.go`, add to `SensitiveFieldPatterns` the camelCase / bare forms that a `credentials:` block uses: `"accesskey"`, `"secretkey"`, `"sessiontoken"`, `"account_key"`, `"accountkey"`, `"clientsecret"`, `"client_secret"`. Because matching is case-insensitive substring, `"accesskey"` matches `accessKey`. Also ensure a key literally named `credentials` whose value is a map gets its children recursively redacted — the existing `redactMap` recursion already covers nested maps, so adding the leaf patterns is sufficient. Do **not** add `credentials_ref` to the patterns — it is a module reference, not a secret (the test guards this). +Place this immediately after the existing `_display`-suffix early-return.
Do **not** add camelCase leaf patterns — they are dead code given the `credentials:` block is redacted wholesale before any recursion reaches the leaves. **Step 4: Run the test to verify it passes** Run: `go test ./module/ -run 'Redact' -v` -Expected: PASS (the new test + all existing redaction tests still green). +Expected: PASS — the new test + all existing redaction tests still green (the `_ref` exemption is narrow; no existing sensitive field name ends in `_ref`). **Step 5: Commit** ```bash git add module/step_output_redactor.go module/step_output_redactor_test.go -git commit -m "feat(module): redact inline credentials: block keys (accessKey/secretKey/etc.) +git commit -m "feat(module): exempt *_ref keys from redaction; lock in credentials: redaction Option-1 credentials move raw cloud secrets inline into plugin-native -module config. Extends SensitiveFieldPatterns with the camelCase forms a -credentials: block uses so the config-version store + execution tracing -redact them. credentials_ref: (a module reference, not a secret) is -deliberately left un-redacted." +module config under a credentials: key — already redacted wholesale by +the existing 'credential' pattern (regression test added). But that same +pattern over-redacts credentials_ref:, which holds a module NAME, not a +secret. Adds a narrow *_ref-suffix exemption to isSensitiveField so +reference keys are preserved for trace debuggability." ``` Rollback: `git revert` — the change only adds a `*_ref` exemption; reverting restores the prior over-redaction of `credentials_ref` (no functional break and no secret exposure, only reduced trace debuggability). 
@@ -1127,35 +1146,69 @@ Rollback: revert the commit + `go mod tidy` (restores `iac_state_azure.go`, the --- -### Task 14: Migration doc + pin the engine's plugin-registry population to advertise `azure_blob` +### Task 14: Migration doc + wire engine plugin-load → `iac.state` backend registry + +**Integration seam (resolved at plan time — `engine.go:305-327` was read).** `loadPluginInternal` deliberately never references concrete plugin types; it injects engine capabilities into plugins via **optional-interface type-asserts** — the `stepRegistrySetter` and `slogLoggerSetter` pattern at `engine.go:316-325` (`type X interface {...}; if v, ok := p.(X); ok { ... }`). Task 14 follows that exact precedent **in reverse** (reading *from* the plugin, not injecting *into* it): define an optional interface the external-plugin adapter satisfies, type-assert `p` against it, and populate the registry. This keeps `engine.go` free of a `plugin/external` import + concrete type-assert. **Files:** - Create: `docs/migrations/2026-05-14-cloud-sdk-extraction.md` -- Modify: `engine.go` — the `StdEngine.loadPluginInternal` path (per Task 8's resolved integration approach: external plugins load here; populate `module.iacStateBackendRegistry` after a successful external-plugin load) -- Test: `module/iac_state_plugin_registry_test.go` (extend) + a launch check +- Create: `plugin/iac_state_backend_provider.go` — the `IaCStateBackendProvider` optional interface (in the `plugin` package, which `engine.go` already imports) +- Modify: `engine.go` — add the optional-interface type-assert in `loadPluginInternal` (beside `stepRegistrySetter` / `slogLoggerSetter`, ~`engine.go:316`) +- Modify: `plugin/external/adapter.go` — `*ExternalPluginAdapter` implements `IaCStateBackendClients()` (it has the gRPC `ClientConn` + `ContractRegistry`; this is in-repo, not cross-repo) +- Modify: `module/iac_state_plugin_registry.go` — add an exported `module.RegisterIaCStateBackend(name string, client 
pb.IaCStateBackendClient) error` wrapper (the registry struct itself stays unexported) +- Test: `plugin/external/adapter_test.go` (extend) + `module/iac_state_plugin_registry_test.go` (extend) + a launch check **Step 1: Write the migration doc** -Create `docs/migrations/2026-05-14-cloud-sdk-extraction.md` covering (per the design's Migration section, Phase A scope only): `iac.state` with `backend: azure_blob` now requires `wfctl plugin install workflow-plugin-azure` (≥ the Task 12 tag); the yaml `backend: azure_blob` value is unchanged; `memory`/`filesystem`/`postgres` are unaffected. Note that Phases B/C/D (AWS/GCP/DO) will follow the same pattern in subsequent releases. +Create `docs/migrations/2026-05-14-cloud-sdk-extraction.md` covering (per the design's Migration section, Phase A scope only): `iac.state` with `backend: azure_blob` now requires `wfctl plugin install workflow-plugin-azure` (≥ the Task 12 tag); the yaml `backend: azure_blob` value is unchanged; `memory`/`filesystem`/`postgres` are unaffected. Note that Phases B/C/D (AWS/GCP/DO) follow the same pattern in subsequent releases. + +**Step 2: Define the optional interface + `ExternalPluginAdapter` impl** -**Step 2: Wire plugin-load → registry population** +In a shared location both `engine.go` and `plugin/external` can see the type (e.g. 
`plugin/iac_state_backend_provider.go` in the `plugin` package, which `engine.go` already imports — `engine.go:21`): -In `engine.go`'s `StdEngine.loadPluginInternal` (per Task 8's resolved integration approach), after a successful external-plugin load, read the plugin's advertised `IaCStateBackend` capabilities from `plugin.json` / its `ContractRegistry` and call `module.iacStateBackendRegistry.register(name, client)` for each — building the `pb.IaCStateBackendClient` from the loaded plugin's gRPC connection (mirror the `typedIaCAdapter` construction in `cmd/wfctl/iac_typed_adapter.go`, but for the engine context and the `IaCStateBackend` service name `workflow.plugin.external.iac.IaCStateBackend`). The registry must be exported from `module` for `engine.go` to populate it — add a `module.RegisterIaCStateBackend(name string, client pb.IaCStateBackendClient) error` wrapper if the registry itself stays unexported. +```go +// IaCStateBackendProvider is the optional interface an external plugin adapter +// implements when it serves one or more iac.state backends. The engine +// type-asserts loaded plugins against it (same pattern as stepRegistrySetter) +// and populates module's iac.state backend registry from the result. +type IaCStateBackendProvider interface { + IaCStateBackendClients() map[string]proto.IaCStateBackendClient +} +``` -**Step 3: Write/extend the test** +In `plugin/external/adapter.go`, make `*ExternalPluginAdapter` implement `IaCStateBackendClients()`: it reads its own `ContractRegistry` for services advertising `workflow.plugin.external.iac.IaCStateBackend`, builds a `proto.IaCStateBackendClient` per advertised backend name off the adapter's existing gRPC `ClientConn` (mirror `typedIaCAdapter` construction in `cmd/wfctl/iac_typed_adapter.go`), and returns `name → client`. If the plugin advertises no state backend, return `nil` — the type-assert still succeeds, the map is just empty. 
-Add a test that loads a fake plugin advertising `azure_blob` and asserts the engine's registry has it resolvable after load. If a full plugin-load test is too heavy for a unit test, assert the *population function* in isolation: given a fake `ExternalPluginAdapter` advertising `azure_blob`, the population step calls `registry.register("azure_blob", client)`. +**Step 3: Wire the type-assert into `loadPluginInternal`** -**Step 4: Build + test + launch validation** +In `engine.go` `loadPluginInternal`, beside the existing `stepRegistrySetter` / `slogLoggerSetter` asserts (~line 316), add: -Run: `go build ./... && go test ./module/ -run 'IaCStateBackend|IaCModule' -v` +```go +if provider, ok := p.(plugin.IaCStateBackendProvider); ok { + for name, client := range provider.IaCStateBackendClients() { + if err := module.RegisterIaCStateBackend(name, client); err != nil { + return fmt.Errorf("load plugin %q: %w", p.EngineManifest().Name, err) + } + } +} +``` + +`module.RegisterIaCStateBackend` (new exported wrapper, this task) delegates to the unexported `iacStateBackendRegistry.register` from Task 8 — which already rejects reserved names, so a plugin advertising `memory`/`filesystem`/`postgres` fails plugin-load with a clear error (design Failure-modes "reserved-name collision", now actually wired). + +**Step 4: Write/extend the tests** + +- `plugin/external/adapter_test.go`: a fake adapter with a `ContractRegistry` advertising `azure_blob` → `IaCStateBackendClients()` returns a one-entry map keyed `azure_blob`. +- `module/iac_state_plugin_registry_test.go`: `module.RegisterIaCStateBackend("azure_blob", fakeClient)` then `resolve("azure_blob")` succeeds; `module.RegisterIaCStateBackend("memory", fakeClient)` returns the reserved-name error. + +**Step 5: Build + test + launch validation** + +Run: `go build ./... && go test ./module/ ./plugin/external/ -run 'IaCStateBackend|IaCModule' -v` Expected: exit 0, PASS. 
-Then the end-to-end launch check from Task 13 Step 8 should now work *without manual registry seeding* — the engine auto-populates from the loaded plugin. Re-run that launch with the Task 11 plugin in `./data/plugins/` and confirm `azure_blob` resolves with zero manual wiring. Capture the transcript. +Then the end-to-end launch check from Task 13 Step 8 should now work *without manual registry seeding* — the engine auto-populates from the loaded plugin. Re-run that launch with the Task 11 plugin in `./data/plugins/` and confirm `azure_blob` resolves with zero manual wiring. Capture the transcript. **Rollback note (runtime-affecting — plugin loading path):** revert the commit; the registry + dispatch plumbing from Task 8 survive, only the engine auto-population is removed; relaunch with a `memory`-backend config to confirm core backends unaffected. -**Step 5: Commit** +**Step 6: Commit** ```bash -git add docs/migrations/2026-05-14-cloud-sdk-extraction.md module/ engine.go +git add docs/migrations/2026-05-14-cloud-sdk-extraction.md module/ engine.go plugin/ git commit -m "feat(engine): auto-populate iac.state backend registry from loaded plugins At plugin-load time the engine reads each plugin's advertised From bdf0de8e961143001414339a7aba45be434aa98e Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:31:12 -0400 Subject: [PATCH 18/39] =?UTF-8?q?docs(plans):=20cloud-SDK=20extraction=20p?= =?UTF-8?q?lan=20=E2=80=94=20plan-review=20cycle=203=20PASS=20+=20minor=20?= =?UTF-8?q?cleanups?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan-phase adversarial review cycle 3: PASS (zero Critical, zero Important). Two Minor doc-tightening fixes applied: - Task 9 Step 4 now names bearer_token_ref explicitly and explains why the *_ref exemption is safe for it (SecretRef is a reference struct, not a raw secret) rather than claiming no *_ref field exists. - engine.go line citations corrected to 311-326. 
Plan phase complete — proceeding to alignment-check. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/plans/2026-05-14-cloud-sdk-extraction.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md b/docs/plans/2026-05-14-cloud-sdk-extraction.md index 39f228f7..526b608e 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction.md @@ -877,7 +877,7 @@ Place this immediately after the existing `_display`-suffix early-return. Do **n **Step 4: Run the test to verify it passes** Run: `go test ./module/ -run 'Redact' -v` -Expected: PASS — the new test + all existing redaction tests still green (the `_ref` exemption is narrow; no existing sensitive field name ends in `_ref`). +Expected: PASS — the new test + all existing redaction tests still green. The `_ref` exemption is narrow: `SensitiveFieldPatterns` has no `_ref` entry, and the only `*_ref` config field in the repo, `bearer_token_ref` (`module/http_client.go`), is itself a `SecretRef` *reference* struct — a provider+key name pair, not a raw secret value — so exempting it is correct, not a leak. (`RedactStepOutput` is invoked on step *output* maps, not module config, narrowing the blast radius further.) **Step 5: Commit** @@ -1148,12 +1148,12 @@ Rollback: revert the commit + `go mod tidy` (restores `iac_state_azure.go`, the ### Task 14: Migration doc + wire engine plugin-load → `iac.state` backend registry -**Integration seam (resolved at plan time — `engine.go:305-327` was read).** `loadPluginInternal` deliberately never references concrete plugin types; it injects engine capabilities into plugins via **optional-interface type-asserts** — the `stepRegistrySetter` and `slogLoggerSetter` pattern at `engine.go:316-325` (`type X interface {...}; if v, ok := p.(X); ok { ... }`). 
Task 14 follows that exact precedent **in reverse** (reading *from* the plugin, not injecting *into* it): define an optional interface the external-plugin adapter satisfies, type-assert `p` against it, and populate the registry. This keeps `engine.go` free of a `plugin/external` import + concrete type-assert. +**Integration seam (resolved at plan time — `engine.go:311-326` was read).** `loadPluginInternal` deliberately never references concrete plugin types; it injects engine capabilities into plugins via **optional-interface type-asserts** — the `stepRegistrySetter` and `slogLoggerSetter` pattern at `engine.go:316-325` (`type X interface {...}; if v, ok := p.(X); ok { ... }`). Task 14 follows that exact precedent **in reverse** (reading *from* the plugin, not injecting *into* it): define an optional interface the external-plugin adapter satisfies, type-assert `p` against it, and populate the registry. This keeps `engine.go` free of a `plugin/external` import + concrete type-assert. **Files:** - Create: `docs/migrations/2026-05-14-cloud-sdk-extraction.md` - Create: `plugin/iac_state_backend_provider.go` — the `IaCStateBackendProvider` optional interface (in the `plugin` package, which `engine.go` already imports) -- Modify: `engine.go` — add the optional-interface type-assert in `loadPluginInternal` (beside `stepRegistrySetter` / `slogLoggerSetter`, ~`engine.go:316`) +- Modify: `engine.go` — add the optional-interface type-assert in `loadPluginInternal` (beside `stepRegistrySetter` / `slogLoggerSetter`, `engine.go:311-326`) - Modify: `plugin/external/adapter.go` — `*ExternalPluginAdapter` implements `IaCStateBackendClients()` (it has the gRPC `ClientConn` + `ContractRegistry`; this is in-repo, not cross-repo) - Modify: `module/iac_state_plugin_registry.go` — add an exported `module.RegisterIaCStateBackend(name string, client pb.IaCStateBackendClient) error` wrapper (the registry struct itself stays unexported) - Test: `plugin/external/adapter_test.go` (extend) + 
`module/iac_state_plugin_registry_test.go` (extend) + a launch check @@ -1180,7 +1180,7 @@ In `plugin/external/adapter.go`, make `*ExternalPluginAdapter` implement `IaCSta **Step 3: Wire the type-assert into `loadPluginInternal`** -In `engine.go` `loadPluginInternal`, beside the existing `stepRegistrySetter` / `slogLoggerSetter` asserts (~line 316), add: +In `engine.go` `loadPluginInternal`, beside the existing `stepRegistrySetter` / `slogLoggerSetter` asserts (`engine.go:311-326`), add: ```go if provider, ok := p.(plugin.IaCStateBackendProvider); ok { From e67757c6ef7654a1bcaabfa803373fc3ad9f1688 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:33:39 -0400 Subject: [PATCH 19/39] chore: lock scope for cloud-sdk-extraction (alignment passed) --- docs/plans/2026-05-14-cloud-sdk-extraction.md | 2 +- docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md b/docs/plans/2026-05-14-cloud-sdk-extraction.md index 526b608e..7cd1dd6f 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction.md @@ -40,7 +40,7 @@ **PR 4 is a declared human-action gate.** It lands in a *different git repository* (`/Users/jon/workspace/workflow-plugin-azure`), which the autonomous worktree-scoped execution pipeline cannot branch/PR/tag. When execution reaches PR 4, the pipeline must **pause and surface PR 4 to a human operator** (with Tasks 11–12 as the checklist) — this is the expected, designed handoff, not an execution failure. PR 5 is **blocked on PR 4's plugin tag** existing and being installable (Task 13 Step 8 + Task 14 Step 5 runtime-launch validation load the tagged plugin binary). PRs 1–3 are fully autonomous-executable in the worktree; PR 5 resumes autonomously once the operator confirms PR 4 merged + tagged. 
-**Status:** Draft +**Status:** Locked 2026-05-14T06:33:23Z --- diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock b/docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock new file mode 100644 index 00000000..c7c6d895 --- /dev/null +++ b/docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock @@ -0,0 +1 @@ +a66a505f3bcc075be3ad609c6979672b0cc7f9118e8ea4859da33092fdb96248 From cdc269432412891d52ee47f7164cb1e70f47c8e4 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:39:22 -0400 Subject: [PATCH 20/39] refactor(module): split platform_kubernetes_kind.go into _core + _gke Phase 0 precursor for cloud-SDK extraction. kindBackend/eksErrorBackend/ aksBackend (all SDK-free) move to platform_kubernetes_core.go with a core init(); gkeBackend (the only SDK-bearing k8s backend) moves to platform_kubernetes_gke.go with its own init(). Behavior-equivalent: same five backend names registered. Isolates the lone SDK-bearing platform file for a later clean deletion. --- ...es_kind.go => platform_kubernetes_core.go} | 217 ----------------- module/platform_kubernetes_gke.go | 227 ++++++++++++++++++ 2 files changed, 227 insertions(+), 217 deletions(-) rename module/{platform_kubernetes_kind.go => platform_kubernetes_core.go} (65%) create mode 100644 module/platform_kubernetes_gke.go diff --git a/module/platform_kubernetes_kind.go b/module/platform_kubernetes_core.go similarity index 65% rename from module/platform_kubernetes_kind.go rename to module/platform_kubernetes_core.go index 1e930b4d..75169774 100644 --- a/module/platform_kubernetes_kind.go +++ b/module/platform_kubernetes_core.go @@ -12,8 +12,6 @@ import ( "time" "github.com/GoCodeAlone/workflow/internal/legacyaws" - container "google.golang.org/api/container/v1" - "google.golang.org/api/option" ) // kindBackend implements kubernetesBackend using in-memory state. 
@@ -114,218 +112,6 @@ func (b *eksErrorBackend) destroy(k *PlatformKubernetes) error { return b.err(k) } -// ─── GKE backend ────────────────────────────────────────────────────────────── - -// gkeBackend manages Google Kubernetes Engine clusters via the GCP Container API. -type gkeBackend struct{} - -func (b *gkeBackend) gkeLocation(k *PlatformKubernetes) string { - if z, ok := k.config["zone"].(string); ok && z != "" { - return z - } - if r, ok := k.config["location"].(string); ok && r != "" { - return r - } - if k.provider != nil { - return k.provider.Region() - } - return "us-central1" -} - -func (b *gkeBackend) gkeProject(k *PlatformKubernetes) string { - if p, ok := k.config["project_id"].(string); ok && p != "" { - return p - } - if k.provider != nil { - if creds, err := k.provider.GetCredentials(context.Background()); err == nil && creds.ProjectID != "" { - return creds.ProjectID - } - } - return "" -} - -func (b *gkeBackend) plan(k *PlatformKubernetes) (*PlatformPlan, error) { - project := b.gkeProject(k) - if project == "" { - return nil, fmt.Errorf("gke plan: 'project_id' is required in module config or cloud account") - } - location := b.gkeLocation(k) - - plan := &PlatformPlan{ - Provider: "gke", - Resource: k.clusterName(), - } - - action := PlatformAction{Type: "create", Resource: k.clusterName(), Detail: fmt.Sprintf("create GKE cluster %q in %s", k.clusterName(), location)} - - if svc, svcErr := b.containerService(k); svcErr == nil { - name := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, k.clusterName()) - if cluster, getErr := svc.Projects.Locations.Clusters.Get(name).Context(context.Background()).Do(); getErr == nil { - action = PlatformAction{Type: "noop", Resource: k.clusterName(), Detail: fmt.Sprintf("GKE cluster %q exists (status: %s)", k.clusterName(), cluster.Status)} - } - } - - plan.Actions = []PlatformAction{action} - return plan, nil -} - -func (b *gkeBackend) apply(k *PlatformKubernetes) (*PlatformResult, 
error) { - project := b.gkeProject(k) - if project == "" { - return nil, fmt.Errorf("gke apply: 'project_id' is required in module config or cloud account") - } - location := b.gkeLocation(k) - - svc, err := b.containerService(k) - if err != nil { - return nil, fmt.Errorf("gke apply: GCP credentials: %w", err) - } - - version := k.state.Version - if version == "" { - version = "1.29" - } - - // Build node pools from nodeGroups config - var nodePools []*container.NodePool - for _, ng := range k.nodeGroups() { - machineType := ng.InstanceType - if machineType == "" { - machineType = "e2-medium" - } - nodePools = append(nodePools, &container.NodePool{ - Name: ng.Name, - InitialNodeCount: int64(ng.Min), - Config: &container.NodeConfig{ - MachineType: machineType, - }, - Autoscaling: &container.NodePoolAutoscaling{ - Enabled: true, - MinNodeCount: int64(ng.Min), - MaxNodeCount: int64(ng.Max), - }, - }) - } - if len(nodePools) == 0 { - nodePools = []*container.NodePool{{ - Name: "default-pool", - InitialNodeCount: 1, - Config: &container.NodeConfig{MachineType: "e2-medium"}, - }} - } - - parent := fmt.Sprintf("projects/%s/locations/%s", project, location) - req := &container.CreateClusterRequest{ - Cluster: &container.Cluster{ - Name: k.clusterName(), - InitialClusterVersion: version, - NodePools: nodePools, - }, - } - - _, err = svc.Projects.Locations.Clusters.Create(parent, req).Context(context.Background()).Do() - if err != nil { - if strings.Contains(err.Error(), "Already Exists") || strings.Contains(err.Error(), "ALREADY_EXISTS") { - return &PlatformResult{ - Success: true, - Message: fmt.Sprintf("GKE cluster %q already exists", k.clusterName()), - State: k.state, - }, nil - } - return nil, fmt.Errorf("gke apply: CreateCluster: %w", err) - } - - k.state.Status = "creating" - k.state.NodeGroups = k.nodeGroups() - k.state.CreatedAt = time.Now() - - return &PlatformResult{ - Success: true, - Message: fmt.Sprintf("GKE cluster %q creation initiated in %s", 
k.clusterName(), location), - State: k.state, - }, nil -} - -func (b *gkeBackend) status(k *PlatformKubernetes) (*KubernetesClusterState, error) { - project := b.gkeProject(k) - if project == "" { - k.state.Status = "unknown" - return k.state, nil - } - location := b.gkeLocation(k) - - if svc, svcErr := b.containerService(k); svcErr == nil { - name := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, k.clusterName()) - if cluster, getErr := svc.Projects.Locations.Clusters.Get(name).Context(context.Background()).Do(); getErr == nil { - k.state.Status = strings.ToLower(cluster.Status) - k.state.Endpoint = cluster.Endpoint - if cluster.CurrentMasterVersion != "" { - k.state.Version = cluster.CurrentMasterVersion - } - var groups []NodeGroupState - for _, np := range cluster.NodePools { - groups = append(groups, NodeGroupState{ - Name: np.Name, - Current: int(np.InitialNodeCount), - }) - } - k.state.NodeGroups = groups - } else { - k.state.Status = "not-found" - } - } - - return k.state, nil -} - -func (b *gkeBackend) destroy(k *PlatformKubernetes) error { - project := b.gkeProject(k) - if project == "" { - return fmt.Errorf("gke destroy: 'project_id' is required") - } - location := b.gkeLocation(k) - - svc, err := b.containerService(k) - if err != nil { - return fmt.Errorf("gke destroy: GCP credentials: %w", err) - } - - name := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, k.clusterName()) - _, err = svc.Projects.Locations.Clusters.Delete(name).Context(context.Background()).Do() - if err != nil { - if strings.Contains(err.Error(), "NOT_FOUND") || strings.Contains(err.Error(), "notFound") { - k.state.Status = "deleted" - return nil - } - return fmt.Errorf("gke destroy: DeleteCluster: %w", err) - } - - k.state.Status = "deleting" - return nil -} - -func (b *gkeBackend) containerService(k *PlatformKubernetes) (*container.Service, error) { - if k.provider == nil { - return nil, fmt.Errorf("no GCP cloud account configured") - 
} - - creds, err := k.provider.GetCredentials(context.Background()) - if err != nil { - return nil, fmt.Errorf("get GCP credentials: %w", err) - } - - var opts []option.ClientOption - if len(creds.ServiceAccountJSON) > 0 { - opts = append(opts, option.WithCredentialsJSON(creds.ServiceAccountJSON)) //nolint:staticcheck // SA1019: no alternative available without security advisory scope - } - - svc, err := container.NewService(context.Background(), opts...) - if err != nil { - return nil, fmt.Errorf("create container service: %w", err) - } - return svc, nil -} - // ─── AKS backend ────────────────────────────────────────────────────────────── // aksBackend manages Azure Kubernetes Service clusters. @@ -640,9 +426,6 @@ func init() { RegisterKubernetesBackend("eks", func(_ map[string]any) (kubernetesBackend, error) { return &eksErrorBackend{}, nil }) - RegisterKubernetesBackend("gke", func(_ map[string]any) (kubernetesBackend, error) { - return &gkeBackend{}, nil - }) RegisterKubernetesBackend("aks", func(_ map[string]any) (kubernetesBackend, error) { return &aksBackend{}, nil }) diff --git a/module/platform_kubernetes_gke.go b/module/platform_kubernetes_gke.go new file mode 100644 index 00000000..1f96cfc9 --- /dev/null +++ b/module/platform_kubernetes_gke.go @@ -0,0 +1,227 @@ +package module + +import ( + "context" + "fmt" + "strings" + "time" + + container "google.golang.org/api/container/v1" + "google.golang.org/api/option" +) + +// gkeBackend manages Google Kubernetes Engine clusters via the GCP Container API. 
+type gkeBackend struct{} + +func (b *gkeBackend) gkeLocation(k *PlatformKubernetes) string { + if z, ok := k.config["zone"].(string); ok && z != "" { + return z + } + if r, ok := k.config["location"].(string); ok && r != "" { + return r + } + if k.provider != nil { + return k.provider.Region() + } + return "us-central1" +} + +func (b *gkeBackend) gkeProject(k *PlatformKubernetes) string { + if p, ok := k.config["project_id"].(string); ok && p != "" { + return p + } + if k.provider != nil { + if creds, err := k.provider.GetCredentials(context.Background()); err == nil && creds.ProjectID != "" { + return creds.ProjectID + } + } + return "" +} + +func (b *gkeBackend) plan(k *PlatformKubernetes) (*PlatformPlan, error) { + project := b.gkeProject(k) + if project == "" { + return nil, fmt.Errorf("gke plan: 'project_id' is required in module config or cloud account") + } + location := b.gkeLocation(k) + + plan := &PlatformPlan{ + Provider: "gke", + Resource: k.clusterName(), + } + + action := PlatformAction{Type: "create", Resource: k.clusterName(), Detail: fmt.Sprintf("create GKE cluster %q in %s", k.clusterName(), location)} + + if svc, svcErr := b.containerService(k); svcErr == nil { + name := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, k.clusterName()) + if cluster, getErr := svc.Projects.Locations.Clusters.Get(name).Context(context.Background()).Do(); getErr == nil { + action = PlatformAction{Type: "noop", Resource: k.clusterName(), Detail: fmt.Sprintf("GKE cluster %q exists (status: %s)", k.clusterName(), cluster.Status)} + } + } + + plan.Actions = []PlatformAction{action} + return plan, nil +} + +func (b *gkeBackend) apply(k *PlatformKubernetes) (*PlatformResult, error) { + project := b.gkeProject(k) + if project == "" { + return nil, fmt.Errorf("gke apply: 'project_id' is required in module config or cloud account") + } + location := b.gkeLocation(k) + + svc, err := b.containerService(k) + if err != nil { + return nil, fmt.Errorf("gke 
apply: GCP credentials: %w", err) + } + + version := k.state.Version + if version == "" { + version = "1.29" + } + + // Build node pools from nodeGroups config + var nodePools []*container.NodePool + for _, ng := range k.nodeGroups() { + machineType := ng.InstanceType + if machineType == "" { + machineType = "e2-medium" + } + nodePools = append(nodePools, &container.NodePool{ + Name: ng.Name, + InitialNodeCount: int64(ng.Min), + Config: &container.NodeConfig{ + MachineType: machineType, + }, + Autoscaling: &container.NodePoolAutoscaling{ + Enabled: true, + MinNodeCount: int64(ng.Min), + MaxNodeCount: int64(ng.Max), + }, + }) + } + if len(nodePools) == 0 { + nodePools = []*container.NodePool{{ + Name: "default-pool", + InitialNodeCount: 1, + Config: &container.NodeConfig{MachineType: "e2-medium"}, + }} + } + + parent := fmt.Sprintf("projects/%s/locations/%s", project, location) + req := &container.CreateClusterRequest{ + Cluster: &container.Cluster{ + Name: k.clusterName(), + InitialClusterVersion: version, + NodePools: nodePools, + }, + } + + _, err = svc.Projects.Locations.Clusters.Create(parent, req).Context(context.Background()).Do() + if err != nil { + if strings.Contains(err.Error(), "Already Exists") || strings.Contains(err.Error(), "ALREADY_EXISTS") { + return &PlatformResult{ + Success: true, + Message: fmt.Sprintf("GKE cluster %q already exists", k.clusterName()), + State: k.state, + }, nil + } + return nil, fmt.Errorf("gke apply: CreateCluster: %w", err) + } + + k.state.Status = "creating" + k.state.NodeGroups = k.nodeGroups() + k.state.CreatedAt = time.Now() + + return &PlatformResult{ + Success: true, + Message: fmt.Sprintf("GKE cluster %q creation initiated in %s", k.clusterName(), location), + State: k.state, + }, nil +} + +func (b *gkeBackend) status(k *PlatformKubernetes) (*KubernetesClusterState, error) { + project := b.gkeProject(k) + if project == "" { + k.state.Status = "unknown" + return k.state, nil + } + location := b.gkeLocation(k) + + if 
svc, svcErr := b.containerService(k); svcErr == nil { + name := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, k.clusterName()) + if cluster, getErr := svc.Projects.Locations.Clusters.Get(name).Context(context.Background()).Do(); getErr == nil { + k.state.Status = strings.ToLower(cluster.Status) + k.state.Endpoint = cluster.Endpoint + if cluster.CurrentMasterVersion != "" { + k.state.Version = cluster.CurrentMasterVersion + } + var groups []NodeGroupState + for _, np := range cluster.NodePools { + groups = append(groups, NodeGroupState{ + Name: np.Name, + Current: int(np.InitialNodeCount), + }) + } + k.state.NodeGroups = groups + } else { + k.state.Status = "not-found" + } + } + + return k.state, nil +} + +func (b *gkeBackend) destroy(k *PlatformKubernetes) error { + project := b.gkeProject(k) + if project == "" { + return fmt.Errorf("gke destroy: 'project_id' is required") + } + location := b.gkeLocation(k) + + svc, err := b.containerService(k) + if err != nil { + return fmt.Errorf("gke destroy: GCP credentials: %w", err) + } + + name := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, k.clusterName()) + _, err = svc.Projects.Locations.Clusters.Delete(name).Context(context.Background()).Do() + if err != nil { + if strings.Contains(err.Error(), "NOT_FOUND") || strings.Contains(err.Error(), "notFound") { + k.state.Status = "deleted" + return nil + } + return fmt.Errorf("gke destroy: DeleteCluster: %w", err) + } + + k.state.Status = "deleting" + return nil +} + +func (b *gkeBackend) containerService(k *PlatformKubernetes) (*container.Service, error) { + if k.provider == nil { + return nil, fmt.Errorf("no GCP cloud account configured") + } + + creds, err := k.provider.GetCredentials(context.Background()) + if err != nil { + return nil, fmt.Errorf("get GCP credentials: %w", err) + } + + var opts []option.ClientOption + if len(creds.ServiceAccountJSON) > 0 { + opts = append(opts, 
option.WithCredentialsJSON(creds.ServiceAccountJSON)) //nolint:staticcheck // SA1019: no alternative available without security advisory scope + } + + svc, err := container.NewService(context.Background(), opts...) + if err != nil { + return nil, fmt.Errorf("create container service: %w", err) + } + return svc, nil +} + +func init() { + RegisterKubernetesBackend("gke", func(_ map[string]any) (kubernetesBackend, error) { + return &gkeBackend{}, nil + }) +} From c220bebeebd0f9367a3cdae57c84abcb95a8aca6 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:43:39 -0400 Subject: [PATCH 21/39] docs(module): add file-purpose headers to platform_kubernetes _core/_gke Code-review Minor: makes the Phase 0 SDK-free/SDK-bearing partition self-documenting for readers without the commit message. --- module/platform_kubernetes_core.go | 3 +++ module/platform_kubernetes_gke.go | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/module/platform_kubernetes_core.go b/module/platform_kubernetes_core.go index 75169774..17675813 100644 --- a/module/platform_kubernetes_core.go +++ b/module/platform_kubernetes_core.go @@ -1,3 +1,6 @@ +// platform_kubernetes_core.go holds the SDK-free Kubernetes backends +// (kind/k3s/eks/aks) and registers them. The lone SDK-bearing backend, gke, +// lives in platform_kubernetes_gke.go — see the cloud-SDK-extraction design. package module import ( diff --git a/module/platform_kubernetes_gke.go b/module/platform_kubernetes_gke.go index 1f96cfc9..4304ddf3 100644 --- a/module/platform_kubernetes_gke.go +++ b/module/platform_kubernetes_gke.go @@ -1,3 +1,7 @@ +// platform_kubernetes_gke.go holds the GKE Kubernetes backend — the only +// platform.kubernetes backend that imports a cloud SDK (google.golang.org/api). +// Isolated here so the cloud-SDK extraction can delete it cleanly; the SDK-free +// backends stay in platform_kubernetes_core.go. 
package module import ( From 443f49524cf1bdc53fecc1d90ac95ed94e27700a Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:44:10 -0400 Subject: [PATCH 22/39] docs(module): fix stale 'Requires the Azure SDK' comment on aksBackend aksBackend.azureToken is a net/http OAuth2 client, not an azure-sdk consumer. The stale comment is what fooled an earlier inventory pass into mis-counting platform_kubernetes_kind.go as an azure-sdk importer. --- module/platform_kubernetes_core.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/module/platform_kubernetes_core.go b/module/platform_kubernetes_core.go index 17675813..dbf33e39 100644 --- a/module/platform_kubernetes_core.go +++ b/module/platform_kubernetes_core.go @@ -117,8 +117,10 @@ func (b *eksErrorBackend) destroy(k *PlatformKubernetes) error { // ─── AKS backend ────────────────────────────────────────────────────────────── -// aksBackend manages Azure Kubernetes Service clusters. -// Requires the Azure SDK (github.com/Azure/azure-sdk-for-go) to be available. +// aksBackend manages Azure Kubernetes Service clusters via the Azure Resource +// Manager REST API. It authenticates with a net/http OAuth2 client-credentials +// flow against login.microsoftonline.com — it does NOT import +// github.com/Azure/azure-sdk-for-go, so it stays in workflow core. // When Azure credentials are not configured, returns clear errors. type aksBackend struct{} From bf203f4e7d913cfb0202cede28e9e7a38d747c30 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:45:44 -0400 Subject: [PATCH 23/39] ci(audit): enforce k8s-backend init() partition + run audit on every PR Extends audit-cloud-symbols.sh --check with an init()-partition assertion (platform_kubernetes_core.go registers only kind/k3s/eks/aks; _gke.go only gke) and adds a cloud-sdk-audit job to ci.yml beside godo-banned / aws-sdk-banned, so the cloud-SDK inventory becomes a build-enforced artifact rather than a prose claim. 
--- .github/workflows/ci.yml | 8 ++++++++ scripts/audit-cloud-symbols.sh | 21 ++++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ac32f14a..90187b3a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -394,6 +394,14 @@ jobs: run: | ! grep -qH "digitalocean/godo" go.mod example/go.mod + cloud-sdk-audit: + name: Cloud-SDK inventory + k8s-backend init() partition audit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Audit cloud-SDK imports + init() partition + run: bash scripts/audit-cloud-symbols.sh --check + aws-sdk-banned: name: Verify removed AWS SDK packages are not imported (issue #653) runs-on: ubuntu-latest diff --git a/scripts/audit-cloud-symbols.sh b/scripts/audit-cloud-symbols.sh index eade7e22..8833282a 100755 --- a/scripts/audit-cloud-symbols.sh +++ b/scripts/audit-cloud-symbols.sh @@ -85,7 +85,7 @@ fi echo echo "== Advisory: platform_kubernetes_kind.go backend split readiness ==" KIND=module/platform_kubernetes_kind.go -if [[ -f "$KIND" ]]; then +if [[ -f module/platform_kubernetes_kind.go ]]; then echo " backend types: $(grep -cE '^type .*[Bb]ackend struct' "$KIND") (expect kind/eksError/gke/aks pre-Phase-0)" echo " shared init(): $(grep -c '^func init()' "$KIND") (expect 1 pre-Phase-0; 0 here post-split — each _provider.go gets its own)" echo " real SDK imports here:" @@ -94,6 +94,25 @@ if [[ -f "$KIND" ]]; then done fi +echo +echo "== Invariant: no init() mixes core-staying + plugin-bound k8s backends ==" +# Post-Phase-0, platform_kubernetes_core.go must register ONLY kind/k3s/eks/aks +# and platform_kubernetes_gke.go must register ONLY gke. A file registering a +# name from the other set is a partition violation. 
+CORE_K8S=module/platform_kubernetes_core.go +GKE_K8S=module/platform_kubernetes_gke.go +if [[ -f "$CORE_K8S" && -f "$GKE_K8S" ]]; then + if grep -qE 'RegisterKubernetesBackend\("gke"' "$CORE_K8S"; then + echo " VIOLATION: $CORE_K8S registers the plugin-bound 'gke' backend"; FAIL=1 + fi + for n in kind k3s eks aks; do + if grep -qE "RegisterKubernetesBackend\\(\"$n\"" "$GKE_K8S"; then + echo " VIOLATION: $GKE_K8S registers the core-staying '$n' backend"; FAIL=1 + fi + done + [[ $FAIL -eq 0 ]] && echo " OK — init() partition clean" +fi + echo if [[ $CHECK -eq 1 ]]; then [[ $FAIL -eq 0 ]] && echo "audit-cloud-symbols: OK" || { echo "audit-cloud-symbols: FAIL"; exit 1; } From 67c8b0abbde16c870334787edbf4fbfe5be9a9f9 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:58:56 -0400 Subject: [PATCH 24/39] =?UTF-8?q?docs(plans):=20IaCStateBackend=20transpor?= =?UTF-8?q?t=20benchmark=20result=20=E2=80=94=20decision=20pending?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Task 6 measurement: gRPC cycle 6.511ms ±1% vs in-process 179ns, for a worst-case 1MB synthetic state. Exceeds the plan's <5ms acceptance bar. Root-cause analysis: the cost is json.Marshal/Unmarshal of the ~1MB map[string]any (inherent to the bytes outputs_json wire format the iac.proto invariant mandates) — NOT gRPC transport buffering or the 4MB message cap. The plan's contingency remedy (streaming redesign) addresses message-size-cap + memory-buffering, neither of which the benchmark hits; streaming would not move the number. Recommendation: retain unary (6.5ms is still negligible vs real cloud backend I/O — the design's own bar-rationale). Deviation from the literal 5ms estimate-bar is surfaced to the operator, not absorbed silently. Scope lock intact: Task 6 run + recorded, no task added/dropped. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-iac-state-backend-benchmark.md | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 docs/plans/2026-05-14-iac-state-backend-benchmark.md diff --git a/docs/plans/2026-05-14-iac-state-backend-benchmark.md b/docs/plans/2026-05-14-iac-state-backend-benchmark.md new file mode 100644 index 00000000..80b94999 --- /dev/null +++ b/docs/plans/2026-05-14-iac-state-backend-benchmark.md @@ -0,0 +1,65 @@ +# IaCStateBackend transport benchmark — result & decision + +**Date:** 2026-05-14 +**Task:** Plan Task 6 (PR 2) — "run the benchmark, lock the proto-transport decision" +**Status:** ⚠️ **DECISION PENDING OPERATOR REVIEW** — the measured result exceeds the plan's literal acceptance bar, but the plan's contingency remedy (streaming redesign) is demonstrably mis-targeted at the actual bottleneck. See "The conflict" and "Recommendation" below. + +## Measurement + +`go test ./module/ -bench BenchmarkIaCStateBackend -benchmem -run '^$' -count=10`, benchstat over 10 samples: + +| Benchmark | sec/op | B/op | allocs/op | +|---|---|---|---| +| `IaCStateBackend_InProcess` | **179.4 ns ± 1%** | 416 | 2 | +| `IaCStateBackend_GRPC` | **6.511 ms ± 1%** | 4.934 MiB ± 7% | 6.851 k | + +Each sample is one full `Lock → GetState → SaveState → Unlock` cycle against a ~1 MB synthetic `IaCState` (a 1024-entry `Outputs` map of 1 KiB string values). `_InProcess` calls the `memory` backend directly; `_GRPC` routes every call over a real in-memory `bufconn` gRPC boundary using the new `IaCStateBackend` service. + +**Added latency (gRPC over in-process): ≈ 6.51 ms p50 per cycle.** + +## The acceptance bar (plan Task 6, Step 3) + +> "unary transport is accepted if the gRPC path's p50 added latency for the full 4-call cycle is **< 5 ms** over the in-process baseline. … If the bar is NOT met: do NOT proceed. 
The proto needs a streaming redesign for `GetState`/`SaveState` — revise Task 4's proto, regenerate, re-run this task." + +**6.51 ms > 5 ms — the literal bar is not met.** + +## Root-cause analysis — what the 6.51 ms actually is + +Per cycle, for a 1 MB state: +- `Lock` / `Unlock`: trivial (a `resource_id` string each) — negligible. +- `GetState`: server-side `json.Marshal` of the ~1 MB `Outputs` map → proto-marshal the resulting `[]byte` → `bufconn` copy → client proto-unmarshal. +- `SaveState`: client sends a pre-built proto message (its JSON was marshalled **once**, outside the benchmark loop — so client-side JSON is amortised and *not* in the hot path) → `bufconn` copy → server proto-unmarshal → server-side `json.Unmarshal` of the ~1 MB `OutputsJson`. + +The 4.9 MiB/op + 6.8 k allocs/op profile is dominated by **one `json.Marshal` + one `json.Unmarshal` of a ~1 MB `map[string]any` per cycle**. That cost is **inherent to the `bytes outputs_json` wire format** the `iac.proto:6-10` hard invariant mandates (no `google.protobuf.Struct`) — it is a *serialization-CPU* cost, not a gRPC *transport* cost. + +**Why streaming (the plan's contingency remedy) does not fix this:** chunked-stream `GetState`/`SaveState` addresses two things — (a) the gRPC 4 MB default message-size cap, and (b) peak memory from buffering one large message. The benchmark hits *neither*: the 1 MB payload is well under the 4 MB cap, and 4.9 MiB/op peak is not memory-pressure. Streaming would still `json.Marshal`/`json.Unmarshal` the same 1 MB — just in pieces. **The streaming redesign would do significant work (new proto shape, new converters, new plugin-serve pattern, re-review of Task 4) and not move the measured number.** + +## The conflict + +- **Literal plan reading:** 6.51 ms > 5 ms → "do NOT proceed; streaming redesign."
+- **Technical reality:** (1) streaming does not reduce the JSON-serialization CPU that *is* the 6.51 ms; (2) the design's own stated rationale for the bar (Architecture §1) is *"sub-5 ms per cycle is negligible against real cloud-provider API latency (hundreds of ms)"* — and **6.51 ms is also negligible against hundreds of ms**. A real `azure_blob` backend's `GetState`/`SaveState` is an Azure Blob GET/PUT of the state object — tens to hundreds of ms of network I/O — which dwarfs 6.51 ms of local serialization. (3) 1 MB is a deliberately stressful synthetic size; typical IaC state is far smaller and the per-cycle cost scales with state size. + +The "5 ms" figure was a **pre-measurement estimate**; the post-measurement reality is "6.51 ms for a worst-case 1 MB state, dominated by a serialization cost streaming cannot remove." + +## Recommendation + +**Retain unary `IaCStateBackend`.** Do not do the streaming redesign — it is mis-targeted at the actual bottleneck and would not change the result. The unary proto from Task 4 stands. + +Rationale, in order: +1. Streaming does not address the measured cost (JSON CPU, not transport buffering / message cap). +2. 6.51 ms remains negligible against the real cloud-provider backend I/O latency that the design's own bar-rationale invokes — the same logic that justified "< 5 ms" justifies "< ~10 ms for a stress-test payload." +3. If the operator wants the literal bar honored, the *correct* remedy is a **serialization-format** change (e.g. a more compact binary state encoding instead of `json` inside the `bytes` field), **not** a transport-shape change — and that is a separate design question, not Task 4-redo work. + +## What this means for the locked plan + +- Task 6 is **not** being skipped or dropped — the benchmark was run, analysed, and recorded (this file). No task is added or removed; no PR is collapsed. The scope lock is intact. 
+- Task 6's Step 3 contingency branch ("streaming redesign") rested on the unstated assumption that *if the bar is exceeded, streaming is the fix*. The measurement falsifies that assumption. This is a **finding within Task 6**, recorded in Task 6's own artifact — the place adversarial-review designates for recorded overrides. +- **This deviation from the literal 5 ms threshold is surfaced to the operator, not silently absorbed.** If the operator confirms the recommendation, stamp this file `Status: Unary LOCKED (operator-confirmed deviation from the 5 ms estimate-bar; see Root-cause analysis)` and PR 2 / Phase A proceed unchanged. If the operator wants the bar honored literally, the follow-up is a serialization-format spike, not a streaming redesign. + +## Raw data + +``` +IaCStateBackend_InProcess-10 179.4n ± 1% 416.0 B ± 0% 2.000 allocs ± 0% +IaCStateBackend_GRPC-10 6.511m ± 1% 4.934Mi B ± 7% 6.851k allocs ± 0% +``` +(10 samples each; full per-sample output in the Task 6 run log.) From 20d7fbbac1e6a9bc9bd7f6ed8300cdf3344a3f4c Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 06:04:45 -0400 Subject: [PATCH 25/39] =?UTF-8?q?docs(plans):=20Task=206=20resolved=20?= =?UTF-8?q?=E2=80=94=20unary=20IaCStateBackend=20LOCKED=20(operator-confir?= =?UTF-8?q?med)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operator reviewed the 6.51ms benchmark + root-cause analysis and confirmed: "I'm not concerned about 6.51ms, that's acceptable." Task 6's gate resolves Unary LOCKED — the Task 4 proto stands, no streaming redesign, PR 2/3 proceed unchanged. Operator additionally raised a long-term architectural item: IaC state is persisted at-rest as JSON; a typed/compact binary format (pb/msgpack/CBOR) with JSON-export + content-detection-on-read would be better for processing/type-correctness/large-state scaling. 
Logged as a post-extraction follow-up in both the benchmark decision record and the design doc's Open items — distinct from the wire contract, cross-cutting across all IaCStateStore impls, needs its own brainstorming pass. Not actioned in this locked plan. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-14-cloud-sdk-extraction-design.md | 1 + .../2026-05-14-iac-state-backend-benchmark.md | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md index d9a142f9..4e7b1459 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction-design.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction-design.md @@ -285,3 +285,4 @@ This design changes **plugin loading paths** and **go.mod dependency trees** — - Exact wording of the secret-redaction extension + whether existing redaction already covers `credentials:` keys. - The `s3compat` shared-module cleanup (Alternatives Considered #3) — logged as a post-extraction follow-up candidate. - Per-plugin CHANGELOG entries + the consolidated migration doc wording. +- **IaC state at-rest format (operator-raised, 2026-05-14; post-extraction follow-up).** Every `IaCStateStore` backend currently persists `module.IaCState` as JSON. A typed/compact binary at-rest format (protobuf / msgpack / CBOR) would be better for processing, type-correctness, and large-state scaling — with JSON kept as an *export* option and content-detection on read so existing JSON state files keep working. Distinct from the `IaCStateBackend` *wire* contract (the proto carries opaque `bytes`; at-rest format is each backend's concern). Cross-cutting across all `IaCStateStore` impls (core + 4 plugins); needs its own brainstorming pass. See `docs/plans/2026-05-14-iac-state-backend-benchmark.md` §"Logged follow-up". Not in this design's scope. 
diff --git a/docs/plans/2026-05-14-iac-state-backend-benchmark.md b/docs/plans/2026-05-14-iac-state-backend-benchmark.md index 80b94999..5c8195a0 100644 --- a/docs/plans/2026-05-14-iac-state-backend-benchmark.md +++ b/docs/plans/2026-05-14-iac-state-backend-benchmark.md @@ -2,7 +2,7 @@ **Date:** 2026-05-14 **Task:** Plan Task 6 (PR 2) — "run the benchmark, lock the proto-transport decision" -**Status:** ⚠️ **DECISION PENDING OPERATOR REVIEW** — the measured result exceeds the plan's literal acceptance bar, but the plan's contingency remedy (streaming redesign) is demonstrably mis-targeted at the actual bottleneck. See "The conflict" and "Recommendation" below. +**Status:** ✅ **Unary LOCKED** (operator-confirmed 2026-05-14 — explicit deviation from the 5 ms estimate-bar; see "Operator decision" below). The measured 6.51 ms exceeded the plan's literal acceptance bar, but root-cause analysis showed the plan's contingency remedy (streaming redesign) was mis-targeted at the actual bottleneck. The operator reviewed the data and accepted unary. ## Measurement @@ -50,7 +50,20 @@ Rationale, in order: 2. 6.51 ms remains negligible against the real cloud-provider backend I/O latency that the design's own bar-rationale invokes — the same logic that justified "< 5 ms" justifies "< ~10 ms for a stress-test payload." 3. If the operator wants the literal bar honored, the *correct* remedy is a **serialization-format** change (e.g. a more compact binary state encoding instead of `json` inside the `bytes` field), **not** a transport-shape change — and that is a separate design question, not Task 4-redo work. -## What this means for the locked plan +## Operator decision (2026-05-14) + +The operator reviewed the measurement + root-cause analysis and **confirmed: retain unary.** Verbatim: *"I'm not concerned about 6.51ms, that's acceptable."* — Task 6's gate resolves **Unary LOCKED**; the Task 4 proto stands; PR 2 / PR 3 proceed unchanged. 
No streaming redesign, no serialization-format spike inside this locked plan. + +The operator additionally raised a **long-term architectural observation** (logged as a follow-up, NOT actioned in this locked plan): *"why are we storing the state file as JSON rather than binary, pb, etc? It makes sense to have a JSON export option, but otherwise we should keep a more optimal format … if we did change this, we'd need to detect file contents to determine whether to decode JSON or not, but when we store back to the file, we could then use the optimal format. … I'm just thinking about optimality long-term and for larger and larger state files, we should store what's best for processing, type correctness, etc."* + +### Logged follow-up — IaC state at-rest format (post-extraction) + +Distinct from the wire format settled here. The `IaCStateStore` backends (`memory`/`filesystem`/`postgres` in core; `s3`/`azure_blob`/`gcs`/`spaces` in plugins post-extraction) currently persist `module.IaCState` as **JSON** (the blob/file content). The operator's point: a typed/compact binary at-rest format (protobuf, msgpack, CBOR …) would be better for processing, type-correctness, and scaling to large state — with JSON retained as an *export* option, and **content-detection on read** (decode JSON vs. binary by inspecting the stored bytes) so existing JSON state files keep working while new writes use the optimal format. + +This is a **post-extraction follow-up**, not part of this design/plan: +- It does **not** affect the `IaCStateBackend` *wire* contract decided here — the proto carries opaque `bytes`; what a backend persists at-rest is the backend's own concern. +- It is a cross-cutting change to every `IaCStateStore` implementation (core + all four plugins) — its own brainstorming + design pass. 
+- Tracked in the cloud-SDK-extraction design doc's "Open items" section and to be picked up after the B/C/D follow-on plan, or sooner as an independent issue if state-file size becomes a pain point. - Task 6 is **not** being skipped or dropped — the benchmark was run, analysed, and recorded (this file). No task is added or removed; no PR is collapsed. The scope lock is intact. - Task 6's Step 3 contingency branch ("streaming redesign") rested on the unstated assumption that *if the bar is exceeded, streaming is the fix*. The measurement falsifies that assumption. This is a **finding within Task 6**, recorded in Task 6's own artifact — the place adversarial-review designates for recorded overrides. From 6062520205b992707a40b5045e3c9230d25aaf1a Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 06:31:25 -0400 Subject: [PATCH 26/39] Revert "chore: lock scope for cloud-sdk-extraction (alignment passed)" This reverts commit 6186e3d100e427807b9fd122e20df589d6bb6954. --- docs/plans/2026-05-14-cloud-sdk-extraction.md | 2 +- docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md b/docs/plans/2026-05-14-cloud-sdk-extraction.md index 7cd1dd6f..526b608e 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction.md @@ -40,7 +40,7 @@ **PR 4 is a declared human-action gate.** It lands in a *different git repository* (`/Users/jon/workspace/workflow-plugin-azure`), which the autonomous worktree-scoped execution pipeline cannot branch/PR/tag. When execution reaches PR 4, the pipeline must **pause and surface PR 4 to a human operator** (with Tasks 11–12 as the checklist) — this is the expected, designed handoff, not an execution failure. 
PR 5 is **blocked on PR 4's plugin tag** existing and being installable (Task 13 Step 8 + Task 14 Step 4 runtime-launch validation load the tagged plugin binary). PRs 1–3 are fully autonomous-executable in the worktree; PR 5 resumes autonomously once the operator confirms PR 4 merged + tagged. -**Status:** Locked 2026-05-14T06:33:23Z +**Status:** Draft --- diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock b/docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock deleted file mode 100644 index c7c6d895..00000000 --- a/docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock +++ /dev/null @@ -1 +0,0 @@ -a66a505f3bcc075be3ad609c6979672b0cc7f9118e8ea4859da33092fdb96248 From 2bbe60fff3f4b007bed2cd176b17b0d13f654176 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 06:34:49 -0400 Subject: [PATCH 27/39] =?UTF-8?q?docs(plans):=20amend=20cloud-sdk-extracti?= =?UTF-8?q?on=20plan=20=E2=80=94=20PR=206=20(ctx)=20+=20de-gate=20PR=204?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operator-approved scope amendment to the (reverted-to-Draft) plan: - ADR 0033: add ctx context.Context to module.IaCStateStore — new PR 6 / Task 15. Task 7 had to hardcode context.Background() in grpcIaCStateStore; the operator directed widening the interface now while we're at that boundary, so Phase B/C/D plugin backends inherit it ctx-ful. Bounded blast radius (~9 files, all in module/); interfaces.IaCStateStore already had ctx and is untouched. - ADR 0034: de-gate PR 4 from "HUMAN-GATE" to autonomous cross-repo. Operator: agents should operate in plugin repos directly; the real requirement is prompt clarity (absolute repo path stated up front), not a human hand-off. Plan's PR 4 row, Cross-repo note, and executor notes updated accordingly. - Manifest: 5 PRs/14 tasks -> 6 PRs/15 tasks. Execution order documented (PR 6 stacks on PR 3, runs before PR 4). Benchmark-gate executor note updated to RESOLVED (unary locked). 
Next: re-run alignment-check on the amended plan, then re-lock. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../0033-add-ctx-to-module-iac-state-store.md | 28 +++++ ...oss-repo-agent-operation-for-plugin-prs.md | 30 +++++ docs/plans/2026-05-14-cloud-sdk-extraction.md | 119 ++++++++++++++++-- 3 files changed, 165 insertions(+), 12 deletions(-) create mode 100644 decisions/0033-add-ctx-to-module-iac-state-store.md create mode 100644 decisions/0034-cross-repo-agent-operation-for-plugin-prs.md diff --git a/decisions/0033-add-ctx-to-module-iac-state-store.md b/decisions/0033-add-ctx-to-module-iac-state-store.md new file mode 100644 index 00000000..50ca2ed1 --- /dev/null +++ b/decisions/0033-add-ctx-to-module-iac-state-store.md @@ -0,0 +1,28 @@ +# 0033. Add context.Context to module.IaCStateStore mid-extraction + +**Status:** Accepted +**Date:** 2026-05-14 +**Decision-makers:** Jon (operator), autonomous pipeline +**Related:** docs/plans/2026-05-14-cloud-sdk-extraction.md (PR 6 / Task 15), docs/plans/2026-05-14-cloud-sdk-extraction-design.md, decisions/0031-strict-contracts-ergonomics.md + +## Context + +The cloud-SDK-extraction plan was scope-locked at 5 PRs / 14 tasks. During PR 3 / Task 7, the new host-side `grpcIaCStateStore` had to hardcode `context.Background()` on every RPC because `module.IaCStateStore`'s 6 methods (`module/iac_state.go:21`) take no `context.Context` — the call sites have no caller context to plumb. Task 7 shipped with a code comment flagging this as a known follow-up. The operator observed that, since the extraction is already rewriting that exact interface boundary, deferring the ctx change means a second cross-cutting PR later that touches the same files again. 
+ +Investigation established the blast radius is bounded and entirely within `module/`: the interface + its 7 implementations (`memory`/`fs`/`postgres`/`spaces`/`gcs`/`azure`/`grpc_client`) + the one caller file `module/pipeline_step_iac.go` (whose pipeline steps already hold a `PipelineContext`). The separate, unrelated `interfaces.IaCStateStore` (`interfaces/iac_state.go:14`) already takes `context.Context` on every method and is **not** touched. Adding scope to a locked plan is "intentional friction" per `skills/scope-lock/SKILL.md`; the operator gave explicit approval after reviewing the scoped blast-radius analysis. + +## Decision + +We will add `ctx context.Context` as the first parameter to all 6 `module.IaCStateStore` methods now, as a new dedicated PR (PR 6 / Task 15) appended to the locked manifest — not deferred, and not folded into PR 3's existing tasks. + +Alternatives rejected: +- **Fold into PR 3's Task 7/8.** Rejected — it stretches those tasks' definitions past their locked scope and erodes per-PR review/revert granularity; the change is cohesive enough to stand alone. +- **Keep deferred (the original plan's posture).** Rejected by the operator — doing it post-extraction is a second cross-cutting PR re-touching the same files, and the Phase B/C/D plugin-side backend implementations would otherwise be built against a ctx-less interface and need their own follow-up retrofit. + +## Consequences + +- **Easier:** `grpcIaCStateStore` plumbs the caller's real context; `iacStateBackendServer` forwards its gRPC-received context into the store instead of discarding it; cancellation/deadline propagation works through the new contract. Phase B/C/D plugin backends are written ctx-ful from the start. +- **Easier:** removes the `context.Background()` wart and its apologetic comment from Task 7's code. +- **Harder / cost:** the locked plan grows to 6 PRs / 15 tasks; the manifest is amended, re-aligned, and re-locked (a new lock hash). 
PR 6 must land before PR 3 is finalized so Task 7's `grpcIaCStateStore` is amended in place rather than shipped ctx-less then re-touched. +- **New constraint:** every future `module.IaCStateStore` implementation (the four cloud plugins in Phase B/C/D) must accept and honor `ctx`. This is the intended outcome but is now a hard contract, not a nicety. +- **Bounded undo cost:** reverting is a single-PR revert of a mechanical signature widening; no data-format or wire-contract change is involved (the `IaCStateBackend` proto already carries gRPC's context implicitly). diff --git a/decisions/0034-cross-repo-agent-operation-for-plugin-prs.md b/decisions/0034-cross-repo-agent-operation-for-plugin-prs.md new file mode 100644 index 00000000..9b0f5f01 --- /dev/null +++ b/decisions/0034-cross-repo-agent-operation-for-plugin-prs.md @@ -0,0 +1,30 @@ +# 0034. Plugin-repo PRs run as autonomous cross-repo agent work, not human gates + +**Status:** Accepted +**Date:** 2026-05-14 +**Decision-makers:** Jon (operator), autonomous pipeline +**Related:** docs/plans/2026-05-14-cloud-sdk-extraction.md (PR 4), docs/plans/2026-05-14-cloud-sdk-extraction-design.md, decisions/0033-add-ctx-to-module-iac-state-store.md + +## Context + +The cloud-SDK-extraction plan's PR 4 (`workflow-plugin-azure` implements the `azure_blob` IaCStateBackend) lands in a *different git repository* than the worktree the subagent-driven pipeline runs in. The plan originally marked PR 4 a "HUMAN-GATE": the pipeline would pause and hand Tasks 11–12 to a human operator, on the conservative assumption that worktree-scoped subagents should not autonomously branch/commit/push/PR/tag in a second repo. + +The operator rejected that framing. The whole extraction effort is inherently multi-repo — Phases B/C/D each touch `workflow-plugin-{aws,gcp,digitalocean}`, and the design already assumes "one PR per affected plugin." Treating every plugin PR as a human gate would make the autonomous pipeline barely autonomous. 
The operator's directive: agents should operate in those other repo contexts directly; the real requirement is not a human gate but **prompt clarity** — each cross-repo agent must be told unambiguously which repository it is working in. + +## Decision + +We will treat plugin-repo PRs (PR 4 here, and the analogous plugin PRs in the deferred B/C/D plan) as **normal autonomous cross-repo agent work**, not human gates. The plan's PR 4 row, its "human-action gate" paragraph, and the executor notes are updated accordingly. + +The replacement requirement: every agent dispatched to do cross-repo work MUST receive, explicitly in its prompt, (a) the absolute path of the repository it operates in, (b) a statement that it is a *different* repo than the worktree, and (c) which repo each file path belongs to. The push + PR-creation steps still follow normal review discipline (feature branch, PR for review — never direct-to-default-branch), and a published release tag is still a deliberate, called-out step — but none of that requires pausing for a human to *perform* the work. + +Alternatives rejected: +- **Keep the human gate.** Rejected by the operator — it defeats the autonomous pipeline for an inherently multi-repo effort. +- **A single mega-worktree spanning all repos.** Rejected — the repos are independently versioned and released; conflating them breaks per-repo PR/review/tag boundaries. + +## Consequences + +- **Easier:** PR 4 (and B/C/D plugin PRs) execute autonomously; no operator hand-off mid-pipeline. The pipeline is genuinely autonomous end-to-end. +- **Easier:** consistent pattern for every plugin repo across all phases — no per-PR "is this a gate?" judgment. +- **Harder / risk:** an agent operating in the wrong repo is now a live failure mode. Mitigated by the mandatory prompt-clarity requirement (absolute repo path + explicit "different repo" callout in every cross-repo dispatch) and by the orchestrator verifying `git -C <repo-path> log` after cross-repo commits. 
+- **New constraint:** cross-repo agent prompts have a fixed preamble obligation (repo path + scope). The orchestrator owns enforcing it. +- **Unchanged:** push/PR still go through review; a published plugin release tag is still an explicit, deliberate step (PR 5 depends on PR 4's tag) — autonomy here means the agent *performs* the steps, not that review/release discipline is skipped. diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md b/docs/plans/2026-05-14-cloud-sdk-extraction.md index 526b608e..7e14e5dc 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction.md @@ -16,9 +16,11 @@ ## Scope Manifest -**PR Count:** 5 -**Tasks:** 14 -**Estimated Lines of Change:** ~1800 (informational; not enforced) +**PR Count:** 6 +**Tasks:** 15 +**Estimated Lines of Change:** ~1950 (informational; not enforced) + +**Amendment (2026-05-14):** PR 6 / Task 15 added by operator-approved scope amendment — `ctx context.Context` on `module.IaCStateStore` — see `decisions/0033-add-ctx-to-module-iac-state-store.md`. PR 4 de-gated from "HUMAN-GATE" to autonomous cross-repo per `decisions/0034-cross-repo-agent-operation-for-plugin-prs.md`. Original lock: 5 PRs / 14 tasks; manifest re-aligned + re-locked after amendment. **Out of scope:** - **Phases B (AWS), C (GCP), D (DigitalOcean)** — deferred to a follow-on plan authored *after* Phase A merges. Their concrete tasks genuinely depend on Phase A's outputs: the benchmark-validated `IaCStateBackend` proto shape, the host-side gRPC-client resolution pattern, and the plugin-side state-backend serve path. Planning them now would be fiction. The design (`docs/plans/2026-05-14-cloud-sdk-extraction-design.md`) is the authoritative spec for B/C/D; this plan delivers Phase 0 + Phase A, which the design explicitly designates as the "validates the contract end-to-end" increment. 
@@ -35,10 +37,13 @@ | 1 | Phase 0: split platform_kubernetes_kind.go + wire audit script into CI | Task 1, Task 2, Task 3 | feat/cloud-sdk-extraction-p0 | | 2 | Phase A: IaCStateBackend proto + benchmark harness + proto lock | Task 4, Task 5, Task 6 | feat/cloud-sdk-extraction-pa-proto | | 3 | Phase A: host-side IaCStateBackend resolution + secret-redaction + gRPC-logging guard | Task 7, Task 8, Task 9, Task 10 | feat/cloud-sdk-extraction-pa-host | -| 4 | **[HUMAN-GATE — cross-repo]** Phase A: workflow-plugin-azure implements azure_blob IaCStateBackend | Task 11, Task 12 | (cross-repo: `workflow-plugin-azure` repo, branch `feat/azure-blob-state-backend`) | +| 4 | Phase A: workflow-plugin-azure implements azure_blob IaCStateBackend (cross-repo) | Task 11, Task 12 | cross-repo: `workflow-plugin-azure` repo, branch `feat/azure-blob-state-backend` | | 5 | Phase A: core deletes iac_state_azure.go + strips azure_blob case → drops azure-sdk from go.mod | Task 13, Task 14 | feat/cloud-sdk-extraction-pa-core | +| 6 | Amendment: add `ctx context.Context` to `module.IaCStateStore` | Task 15 | feat/cloud-sdk-extraction-iacstore-ctx | + +**Execution order:** PR 1 → PR 2 → PR 3 (Tasks 7–8) → **PR 6** → PR 3 (Tasks 9–10) → PR 4 → PR 5. PR 6 (the `ctx` amendment) executes right after PR 3's Task 7/8 land — it amends `grpcIaCStateStore` (Task 7's file) and `IaCModule` dispatch (Task 8's wiring) in place, so it must run before PR 3 is finalized. All work lands on the single `feat/cloud-sdk-extraction` branch; `finishing-a-development-branch` splits it into the 6 PR branches per this table (PR 6 stacks on PR 3). -**PR 4 is a declared human-action gate.** It lands in a *different git repository* (`/Users/jon/workspace/workflow-plugin-azure`), which the autonomous worktree-scoped execution pipeline cannot branch/PR/tag. 
When execution reaches PR 4, the pipeline must **pause and surface PR 4 to a human operator** (with Tasks 11–12 as the checklist) — this is the expected, designed handoff, not an execution failure. PR 5 is **blocked on PR 4's plugin tag** existing and being installable (Task 13 Step 8 + Task 14 Step 4 runtime-launch validation load the tagged plugin binary). PRs 1–3 are fully autonomous-executable in the worktree; PR 5 resumes autonomously once the operator confirms PR 4 merged + tagged. +**PR 4 is autonomous cross-repo work** (de-gated 2026-05-14, `decisions/0034-...md`). It lands in a *different git repository* — `/Users/jon/workspace/workflow-plugin-azure`. A dispatched agent operates in that repo directly; **every cross-repo agent dispatch MUST state, explicitly in its prompt, the absolute path of the repo it works in and that it is a *different* repo than the worktree** (see "Notes for the executor"). Push + PR-creation follow normal review discipline (feature branch, PR — never direct-to-default-branch). PR 5 is **blocked on PR 4's plugin release tag** existing and being installable (Task 13 Step 8 + Task 14 Step 4 runtime-launch validation load the tagged plugin binary); the release tag (Task 12) is an explicit, deliberate step but not a human gate. **Status:** Draft @@ -46,7 +51,7 @@ ## Cross-repo note -PR 4 lands in a **different repository** (`/Users/jon/workspace/workflow-plugin-azure`), not the `workflow` worktree. The executing pipeline must create a branch + PR there separately. PR 4's plugin release (a tagged version implementing the published proto) **must merge and tag before PR 5** — PR 5's core deletion makes `backend: azure_blob` fail to build unless the plugin version implementing `IaCStateBackend` is loadable. PRs 2 and 3 can land in either order relative to each other but both precede PR 4 (the plugin needs the published proto) and PR 5. 
+PR 4 lands in a **different repository** (`/Users/jon/workspace/workflow-plugin-azure`), not the `workflow` worktree. This is **autonomous cross-repo agent work, not a human gate** (`decisions/0034-cross-repo-agent-operation-for-plugin-prs.md`) — a dispatched agent branches/commits/pushes/PRs/tags in that repo directly. The hard requirement: **every cross-repo agent dispatch must state, explicitly and up front in its prompt, the absolute path of the repository it operates in and that it is a *different* repo than the `workflow` worktree** — an agent operating in the wrong repo is the live failure mode this requirement guards against. Push + PR-creation follow normal review discipline (feature branch, PR — never direct-to-default-branch). PR 4's plugin release (a tagged version implementing the published proto) **must merge and tag before PR 5** — PR 5's core deletion makes `backend: azure_blob` fail to build unless the plugin version implementing `IaCStateBackend` is loadable. The release tag (Task 12) is an explicit, deliberate step. PRs 2 and 3 precede PR 4 (the plugin needs the published proto); PR 6 (the `ctx` amendment) precedes PR 4 too, so the plugin's `IaCStateBackendServer` is written against the ctx-ful `module.IaCStateStore` from the start. --- @@ -972,7 +977,7 @@ Rollback: `git revert` — test-only. ## PR 4 — Phase A: `workflow-plugin-azure` implements `azure_blob` `IaCStateBackend` (cross-repo) -**Repository:** `/Users/jon/workspace/workflow-plugin-azure` (NOT the workflow worktree). Branch: `feat/azure-blob-state-backend`. This PR depends on PRs 2 (published proto) and is a prerequisite for PR 5. +**Repository:** `/Users/jon/workspace/workflow-plugin-azure` — a **different git repository** than the `workflow` worktree the rest of this plan runs in. Branch: `feat/azure-blob-state-backend`. Autonomous cross-repo work, not a human gate (`decisions/0034-...md`). 
**The agent dispatched for Tasks 11–12 MUST be told, explicitly and up front, that it operates in `/Users/jon/workspace/workflow-plugin-azure` — a different repo — and every file path in Tasks 11–12 is relative to that repo, not the worktree.** This PR depends on PR 2 (published proto) + PR 6 (ctx-ful `module.IaCStateStore`, so the plugin's `IaCStateBackendServer` is written ctx-ful); it is a prerequisite for PR 5. ### Task 11: Port `AzureBlobIaCStateStore` into workflow-plugin-azure + serve it as `IaCStateBackend` @@ -1225,10 +1230,100 @@ Rollback: revert the commit; the registry + dispatch plumbing (Task 8) survive, --- +## PR 6 — Amendment: add `ctx context.Context` to `module.IaCStateStore` + +Operator-approved scope amendment (`decisions/0033-add-ctx-to-module-iac-state-store.md`). Widens the `module.IaCStateStore` interface's 6 methods to take `ctx context.Context` as the first parameter, so the gRPC contract plumbs real caller context instead of `context.Background()`, and Phase B/C/D plugin backends inherit a ctx-ful interface. **Executes after PR 3's Task 7/8 (which created the files it amends) and before PR 3 is finalized / before PR 4.** Bounded blast radius — entirely within `module/`. The separate `interfaces.IaCStateStore` already has `ctx` and is **not** touched. 
+ +### Task 15: Widen `module.IaCStateStore` with `ctx context.Context` + +**Files:** +- Modify: `module/iac_state.go` — the `IaCStateStore` interface (6 method signatures) +- Modify: `module/iac_state_memory.go`, `module/iac_state_fs.go`, `module/iac_state_postgres.go`, `module/iac_state_spaces.go`, `module/iac_state_gcs.go`, `module/iac_state_azure.go` — the 6 in-process implementations +- Modify: `module/iac_state_grpc_client.go` — `grpcIaCStateStore` (the 6 methods gain `ctx`, pass it to `s.client.X(ctx, …)` instead of `context.Background()`; delete the "context.Background()" doc-comment paragraph added in Task 7) and `iacStateBackendServer` (its 6 RPC methods already receive `ctx` from gRPC — forward it: `s.store.X(rpcCtx, …)`) +- Modify: `module/pipeline_step_iac.go` — every `store.GetState(…)` / `store.SaveState(…)` / etc. call site gains the `ctx` the step already holds +- Modify: `module/iac_module.go` — only if it calls `m.store` methods (it has a type-assertion at ~`:147`; check whether `Start()`/`Stop()` invoke store methods and thread `ctx` if so) +- Modify (tests): `module/iac_state_grpc_client_test.go`, `module/benchmark_iac_state_backend_test.go`, `module/iac_state_plugin_registry_test.go`, and the `*_test.go` files of the 6 in-process impls — every store-method call site gains a `ctx` argument (`context.Background()` or `context.TODO()` is fine in tests) + +**Step 1: Widen the interface — this is the "failing test".** + +In `module/iac_state.go`, add `ctx context.Context` as the first parameter to all 6 `IaCStateStore` methods: +```go +type IaCStateStore interface { + GetState(ctx context.Context, resourceID string) (*IaCState, error) + SaveState(ctx context.Context, state *IaCState) error + ListStates(ctx context.Context, filter map[string]string) ([]*IaCState, error) + DeleteState(ctx context.Context, resourceID string) error + Lock(ctx context.Context, resourceID string) error + Unlock(ctx context.Context, resourceID string) error +} +``` +Add 
the `context` import if not present. (Keep the existing per-method doc comments.) + +**Step 2: Run the build to verify it fails everywhere.** + +Run: `GOWORK=off go build ./...` +Expected: FAIL — every `IaCStateStore` implementation no longer satisfies the interface, and every call site has the wrong arity. The compiler error list IS the worklist for Steps 3–5. + +**Step 3: Update the 6 in-process implementations + the gRPC adapter/server.** + +For each of `iac_state_memory.go`, `iac_state_fs.go`, `iac_state_postgres.go`, `iac_state_spaces.go`, `iac_state_gcs.go`, `iac_state_azure.go`: add `ctx context.Context` as the first parameter of each of the 6 methods. The `memory`/`fs` backends don't *use* ctx (they're synchronous in-memory/disk) — accept it, name it `ctx`, and it's fine for it to be unused at the leaf (Go permits an unused function parameter; do NOT add `_ = ctx`). `postgres`/`spaces`/`gcs`/`azure` backends that already build a `context.Background()` internally for their SDK/DB calls should use the passed `ctx` instead. + +In `module/iac_state_grpc_client.go`: +- `grpcIaCStateStore`'s 6 methods gain `ctx context.Context` and pass it straight through: `s.client.GetState(ctx, …)` etc. — replacing `context.Background()`. Delete the "All six methods call the backend with context.Background()…" doc-comment paragraph on the `grpcIaCStateStore` type (it is now false). +- `iacStateBackendServer`'s 6 RPC methods already receive a `ctx context.Context` from the gRPC framework — forward THAT ctx into the `s.store.X(ctx, …)` calls instead of dropping it. + +**Step 4: Update the caller in `module/pipeline_step_iac.go`.** + +Every `resolveIaCStore(...)` result is used to call store methods (`store.GetState(s.resourceID)` etc.). Each call site gains the step's context as the first arg. Read the file to find the `context.Context` the step already holds — IaC pipeline steps run with a `PipelineContext`; use its context (e.g. 
`pc.Ctx` / `pc.Context()` — use whatever the real field/method is). If a particular call site genuinely has no context in scope, `context.Background()` is an acceptable last resort, but prefer the real one. Check `module/iac_module.go` too — if `Start()`/`Stop()` call `m.store` methods, thread a context (`context.Background()` is acceptable for lifecycle hooks that have none). + +**Step 5: Update all test call sites.** + +`GOWORK=off go build ./...` will still fail on `*_test.go` files. Fix every store-method call in: `iac_state_grpc_client_test.go`, `benchmark_iac_state_backend_test.go`, `iac_state_plugin_registry_test.go`, and the `*_test.go` files for the 6 in-process backends. In tests, `context.Background()` for the new first arg is fine. (The `fakeStateBackendClient` in `iac_state_plugin_registry_test.go` implements `pb.IaCStateBackendClient` — a gRPC interface that is *already* ctx-ful — so it needs no change; only the `IaCStateStore`-method call sites change.) + +**Step 6: Build + vet + test — all green.** + +Run: `GOWORK=off go build ./... && GOWORK=off go vet ./module/... && GOWORK=off go test ./module/ -run 'IaCState|IaCModule|GRPCIaCStateStore' -count=1` +Expected: exit 0, all PASS. Also run `GOWORK=off go test ./module/ -bench BenchmarkIaCStateBackend -benchmem -run '^$' -count=1` — both benchmarks still run cleanly. + +**Step 7: gofmt.** + +Run: `GOWORK=off gofmt -l module/` — must print nothing for any file you touched. 
+ +**Step 8: Commit.** + +```bash +git add module/iac_state.go module/iac_state_memory.go module/iac_state_fs.go module/iac_state_postgres.go module/iac_state_spaces.go module/iac_state_gcs.go module/iac_state_azure.go module/iac_state_grpc_client.go module/pipeline_step_iac.go module/iac_module.go module/iac_state_grpc_client_test.go module/benchmark_iac_state_backend_test.go module/iac_state_plugin_registry_test.go +# plus any in-process-backend *_test.go files you touched +git commit -m "$(cat <<'EOF' +feat(module)!: add ctx context.Context to IaCStateStore (operator amendment) + +Widens module.IaCStateStore's 6 methods with a leading ctx parameter so +grpcIaCStateStore plumbs the caller's real context (was +context.Background()) and iacStateBackendServer forwards its gRPC ctx +into the store. The 6 in-process backends accept ctx; postgres/spaces/ +gcs/azure use it for their SDK/DB calls. pipeline_step_iac.go callers +pass the step context. + +Operator-approved scope amendment — see decisions/0033. The separate +interfaces.IaCStateStore already had ctx and is untouched. Phase B/C/D +plugin backends now inherit a ctx-ful interface. + +BREAKING (internal): module.IaCStateStore is an internal interface; the +IaCStateBackend gRPC wire contract is unchanged (gRPC was always ctx-ful). +Rollback: revert this commit — mechanical signature-only revert. +EOF +)" +``` + +Rollback: revert the commit — a mechanical signature-only widening, no data-format or wire-contract change. (Runtime-affecting? No — no go.mod / build-config / migration / plugin-loading-path change; this is an internal interface signature change verified by `go build` + `go test`.) + +--- + ## Notes for the executor -- **TDD discipline:** every task above follows write-test → see-it-fail → implement → see-it-pass → commit. Do not skip the "see it fail" step — it proves the test exercises the new behavior. 
-- **Cross-repo PR 4:** create the branch + PR in `/Users/jon/workspace/workflow-plugin-azure` separately. It must merge + tag before PR 5. If the pipeline cannot operate cross-repo autonomously, surface PR 4 as a blocker for human action rather than skipping it. -- **PR ordering:** PR 1 → (PR 2, PR 3 in either order) → PR 4 → PR 5. PR 5 is the only breaking change and the only one that touches `go.mod`. -- **Benchmark gate (Task 6):** if the benchmark fails the 5 ms bar, STOP and redesign the proto for streaming before continuing — this is a design-mandated gate, not a formality. -- **Follow-on plan:** once PR 5 merges, author the Phase B/C/D plan. Phase B (AWS) reuses Task 7's converters + Task 8's registry + Task 11's plugin pattern; Phase C (GCP) additionally runs the `kubernetesBackend` interface-audit spike for the `gke` contract decision (design Architecture §2); Phase D (DigitalOcean `spaces`) rides Phase B's `iac_state_spaces.go` deletion. +- **TDD discipline:** every task above follows write-test → see-it-fail → implement → see-it-pass → commit. Do not skip the "see it fail" step — it proves the test exercises the new behavior. (Task 15 is a mechanical interface widening — there the *compiler* is the failing test: Step 1 widens the interface, Step 2 confirms the build breaks everywhere, Steps 3–5 fix it.) +- **Cross-repo PR 4 (autonomous, NOT a human gate):** Tasks 11–12 run in `/Users/jon/workspace/workflow-plugin-azure` — a *different repo*. The dispatched agent operates there directly; its prompt MUST state the absolute repo path and that it is a different repo than the worktree (`decisions/0034-...md`). Push + PR follow normal review discipline (feature branch, never direct-to-default). PR 4 must merge + the release tag (Task 12) must exist before PR 5. 
+- **Every cross-repo agent dispatch** (PR 4 here, and all plugin PRs in the deferred B/C/D plan) carries a fixed prompt obligation: state the absolute path of the repo it works in + that it differs from the worktree + which repo each file path belongs to. The orchestrator verifies `git -C <repo-path> log` after cross-repo commits. +- **PR ordering:** PR 1 → PR 2 → PR 3 (Tasks 7–8) → PR 6 → PR 3 (Tasks 9–10) → PR 4 → PR 5. PR 5 is the only `go.mod`-touching breaking change. PR 6 stacks on PR 3; `finishing-a-development-branch` splits the single working branch into the 6 PR branches. +- **Benchmark gate (Task 6) — RESOLVED:** the benchmark measured 6.51 ms (1 MB state); root-cause analysis showed the cost is JSON serialization (inherent to the `bytes *_json` wire format), not gRPC transport, so the plan's streaming-redesign contingency was mis-targeted. Operator confirmed unary is acceptable. **Unary is LOCKED** — see `docs/plans/2026-05-14-iac-state-backend-benchmark.md`. No streaming redesign. +- **Follow-on plan:** once PR 5 merges, author the Phase B/C/D plan. Phase B (AWS) reuses Task 7's converters + Task 8's registry + Task 11's plugin pattern + the now-ctx-ful interface from PR 6; Phase C (GCP) additionally runs the `kubernetesBackend` interface-audit spike for the `gke` contract decision (design Architecture §2); Phase D (DigitalOcean `spaces`) rides Phase B's `iac_state_spaces.go` deletion. The IaC state at-rest format follow-up (`docs/plans/2026-05-14-iac-state-backend-benchmark.md` §"Logged follow-up") is a separate post-extraction item. 
From bfa06e657b59f8783be70d701993b81808917308 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 06:37:17 -0400 Subject: [PATCH 28/39] =?UTF-8?q?chore:=20re-lock=20scope=20for=20cloud-sd?= =?UTF-8?q?k-extraction=20(amended=20=E2=80=94=20alignment=20re-passed)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/plans/2026-05-14-cloud-sdk-extraction.md | 2 +- docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md b/docs/plans/2026-05-14-cloud-sdk-extraction.md index 7e14e5dc..dd9e7eb8 100644 --- a/docs/plans/2026-05-14-cloud-sdk-extraction.md +++ b/docs/plans/2026-05-14-cloud-sdk-extraction.md @@ -45,7 +45,7 @@ **PR 4 is autonomous cross-repo work** (de-gated 2026-05-14, `decisions/0034-...md`). It lands in a *different git repository* — `/Users/jon/workspace/workflow-plugin-azure`. A dispatched agent operates in that repo directly; **every cross-repo agent dispatch MUST state, explicitly in its prompt, the absolute path of the repo it works in and that it is a *different* repo than the worktree** (see "Notes for the executor"). Push + PR-creation follow normal review discipline (feature branch, PR — never direct-to-default-branch). PR 5 is **blocked on PR 4's plugin release tag** existing and being installable (Task 13 Step 8 + Task 14 Step 4 runtime-launch validation load the tagged plugin binary); the release tag (Task 12) is an explicit, deliberate step but not a human gate. 
-**Status:** Draft +**Status:** Locked 2026-05-14T10:37:04Z --- diff --git a/docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock b/docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock new file mode 100644 index 00000000..25192df5 --- /dev/null +++ b/docs/plans/2026-05-14-cloud-sdk-extraction.md.scope-lock @@ -0,0 +1 @@ +9e5a436fd1335fb1b3b530675b3b9b3dab6a271178910ab434655713e7569868 From a3759d2e47bb5a691d931c0b5c497a4f9d006a57 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:50:00 -0400 Subject: [PATCH 29/39] feat(proto): add IaCStateBackend service to iac.proto MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strict 6-method contract mirroring module.IaCStateStore 1:1, with an IaCState message mirroring module.IaCState. Free-form Outputs/Config maps cross the wire as bytes outputs_json/config_json per the iac.proto hard invariant (NO google.protobuf.Struct) — same pattern as ResourceState.outputs_json. Unary RPCs. No TTL field. Regenerated bindings via buf. --- plugin/external/proto/iac.pb.go | 1026 ++++++++++++++--- plugin/external/proto/iac.proto | 48 + plugin/external/proto/iac_grpc.pb.go | 308 +++++ .../external/proto/iac_statebackend_test.go | 23 + 4 files changed, 1259 insertions(+), 146 deletions(-) create mode 100644 plugin/external/proto/iac_statebackend_test.go diff --git a/plugin/external/proto/iac.pb.go b/plugin/external/proto/iac.pb.go index 50b8f4d8..25ddcc40 100644 --- a/plugin/external/proto/iac.pb.go +++ b/plugin/external/proto/iac.pb.go @@ -4544,6 +4544,653 @@ func (x *TroubleshootResponse) GetDiagnostics() []*Diagnostic { return nil } +// IaCState mirrors module.IaCState (module/iac_state.go:4-18). The free-form +// Outputs / Config map[string]any fields cross the wire as JSON bytes per the +// iac.proto hard invariant — same pattern as ResourceState.outputs_json. 
+type IaCState struct { + state protoimpl.MessageState `protogen:"open.v1"` + ResourceId string `protobuf:"bytes,1,opt,name=resource_id,json=resourceId,proto3" json:"resource_id,omitempty"` + ResourceType string `protobuf:"bytes,2,opt,name=resource_type,json=resourceType,proto3" json:"resource_type,omitempty"` + Provider string `protobuf:"bytes,3,opt,name=provider,proto3" json:"provider,omitempty"` + ProviderRef string `protobuf:"bytes,4,opt,name=provider_ref,json=providerRef,proto3" json:"provider_ref,omitempty"` + ProviderId string `protobuf:"bytes,5,opt,name=provider_id,json=providerId,proto3" json:"provider_id,omitempty"` + ConfigHash string `protobuf:"bytes,6,opt,name=config_hash,json=configHash,proto3" json:"config_hash,omitempty"` + Status string `protobuf:"bytes,7,opt,name=status,proto3" json:"status,omitempty"` + OutputsJson []byte `protobuf:"bytes,8,opt,name=outputs_json,json=outputsJson,proto3" json:"outputs_json,omitempty"` // JSON-encoded map[string]any (module.IaCState.Outputs) + ConfigJson []byte `protobuf:"bytes,9,opt,name=config_json,json=configJson,proto3" json:"config_json,omitempty"` // JSON-encoded map[string]any (module.IaCState.Config) + Dependencies []string `protobuf:"bytes,10,rep,name=dependencies,proto3" json:"dependencies,omitempty"` + CreatedAt string `protobuf:"bytes,11,opt,name=created_at,json=createdAt,proto3" json:"created_at,omitempty"` + UpdatedAt string `protobuf:"bytes,12,opt,name=updated_at,json=updatedAt,proto3" json:"updated_at,omitempty"` + Error string `protobuf:"bytes,13,opt,name=error,proto3" json:"error,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *IaCState) Reset() { + *x = IaCState{} + mi := &file_iac_proto_msgTypes[79] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *IaCState) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*IaCState) ProtoMessage() {} + +func (x *IaCState) ProtoReflect() 
protoreflect.Message { + mi := &file_iac_proto_msgTypes[79] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use IaCState.ProtoReflect.Descriptor instead. +func (*IaCState) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{79} +} + +func (x *IaCState) GetResourceId() string { + if x != nil { + return x.ResourceId + } + return "" +} + +func (x *IaCState) GetResourceType() string { + if x != nil { + return x.ResourceType + } + return "" +} + +func (x *IaCState) GetProvider() string { + if x != nil { + return x.Provider + } + return "" +} + +func (x *IaCState) GetProviderRef() string { + if x != nil { + return x.ProviderRef + } + return "" +} + +func (x *IaCState) GetProviderId() string { + if x != nil { + return x.ProviderId + } + return "" +} + +func (x *IaCState) GetConfigHash() string { + if x != nil { + return x.ConfigHash + } + return "" +} + +func (x *IaCState) GetStatus() string { + if x != nil { + return x.Status + } + return "" +} + +func (x *IaCState) GetOutputsJson() []byte { + if x != nil { + return x.OutputsJson + } + return nil +} + +func (x *IaCState) GetConfigJson() []byte { + if x != nil { + return x.ConfigJson + } + return nil +} + +func (x *IaCState) GetDependencies() []string { + if x != nil { + return x.Dependencies + } + return nil +} + +func (x *IaCState) GetCreatedAt() string { + if x != nil { + return x.CreatedAt + } + return "" +} + +func (x *IaCState) GetUpdatedAt() string { + if x != nil { + return x.UpdatedAt + } + return "" +} + +func (x *IaCState) GetError() string { + if x != nil { + return x.Error + } + return "" +} + +type GetStateRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + ResourceId string `protobuf:"bytes,1,opt,name=resource_id,json=resourceId,proto3" json:"resource_id,omitempty"` + unknownFields protoimpl.UnknownFields + 
sizeCache protoimpl.SizeCache +} + +func (x *GetStateRequest) Reset() { + *x = GetStateRequest{} + mi := &file_iac_proto_msgTypes[80] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetStateRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetStateRequest) ProtoMessage() {} + +func (x *GetStateRequest) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[80] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetStateRequest.ProtoReflect.Descriptor instead. +func (*GetStateRequest) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{80} +} + +func (x *GetStateRequest) GetResourceId() string { + if x != nil { + return x.ResourceId + } + return "" +} + +type GetStateResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + State *IaCState `protobuf:"bytes,1,opt,name=state,proto3" json:"state,omitempty"` + Exists bool `protobuf:"varint,2,opt,name=exists,proto3" json:"exists,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetStateResponse) Reset() { + *x = GetStateResponse{} + mi := &file_iac_proto_msgTypes[81] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetStateResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetStateResponse) ProtoMessage() {} + +func (x *GetStateResponse) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[81] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetStateResponse.ProtoReflect.Descriptor instead. 
+func (*GetStateResponse) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{81} +} + +func (x *GetStateResponse) GetState() *IaCState { + if x != nil { + return x.State + } + return nil +} + +func (x *GetStateResponse) GetExists() bool { + if x != nil { + return x.Exists + } + return false +} + +type SaveStateRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + State *IaCState `protobuf:"bytes,1,opt,name=state,proto3" json:"state,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SaveStateRequest) Reset() { + *x = SaveStateRequest{} + mi := &file_iac_proto_msgTypes[82] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SaveStateRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SaveStateRequest) ProtoMessage() {} + +func (x *SaveStateRequest) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[82] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SaveStateRequest.ProtoReflect.Descriptor instead. 
+func (*SaveStateRequest) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{82} +} + +func (x *SaveStateRequest) GetState() *IaCState { + if x != nil { + return x.State + } + return nil +} + +type SaveStateResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SaveStateResponse) Reset() { + *x = SaveStateResponse{} + mi := &file_iac_proto_msgTypes[83] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SaveStateResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SaveStateResponse) ProtoMessage() {} + +func (x *SaveStateResponse) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[83] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SaveStateResponse.ProtoReflect.Descriptor instead. 
+func (*SaveStateResponse) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{83} +} + +type ListStatesRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + Filter map[string]string `protobuf:"bytes,1,rep,name=filter,proto3" json:"filter,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ListStatesRequest) Reset() { + *x = ListStatesRequest{} + mi := &file_iac_proto_msgTypes[84] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ListStatesRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ListStatesRequest) ProtoMessage() {} + +func (x *ListStatesRequest) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[84] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ListStatesRequest.ProtoReflect.Descriptor instead. 
+func (*ListStatesRequest) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{84} +} + +func (x *ListStatesRequest) GetFilter() map[string]string { + if x != nil { + return x.Filter + } + return nil +} + +type ListStatesResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + States []*IaCState `protobuf:"bytes,1,rep,name=states,proto3" json:"states,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ListStatesResponse) Reset() { + *x = ListStatesResponse{} + mi := &file_iac_proto_msgTypes[85] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ListStatesResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ListStatesResponse) ProtoMessage() {} + +func (x *ListStatesResponse) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[85] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ListStatesResponse.ProtoReflect.Descriptor instead. 
+func (*ListStatesResponse) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{85} +} + +func (x *ListStatesResponse) GetStates() []*IaCState { + if x != nil { + return x.States + } + return nil +} + +type DeleteStateRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + ResourceId string `protobuf:"bytes,1,opt,name=resource_id,json=resourceId,proto3" json:"resource_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *DeleteStateRequest) Reset() { + *x = DeleteStateRequest{} + mi := &file_iac_proto_msgTypes[86] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *DeleteStateRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DeleteStateRequest) ProtoMessage() {} + +func (x *DeleteStateRequest) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[86] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DeleteStateRequest.ProtoReflect.Descriptor instead. 
+func (*DeleteStateRequest) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{86} +} + +func (x *DeleteStateRequest) GetResourceId() string { + if x != nil { + return x.ResourceId + } + return "" +} + +type DeleteStateResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *DeleteStateResponse) Reset() { + *x = DeleteStateResponse{} + mi := &file_iac_proto_msgTypes[87] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *DeleteStateResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DeleteStateResponse) ProtoMessage() {} + +func (x *DeleteStateResponse) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[87] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DeleteStateResponse.ProtoReflect.Descriptor instead. 
+func (*DeleteStateResponse) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{87} +} + +type LockRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + ResourceId string `protobuf:"bytes,1,opt,name=resource_id,json=resourceId,proto3" json:"resource_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *LockRequest) Reset() { + *x = LockRequest{} + mi := &file_iac_proto_msgTypes[88] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *LockRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*LockRequest) ProtoMessage() {} + +func (x *LockRequest) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[88] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use LockRequest.ProtoReflect.Descriptor instead. 
+func (*LockRequest) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{88} +} + +func (x *LockRequest) GetResourceId() string { + if x != nil { + return x.ResourceId + } + return "" +} + +type LockResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *LockResponse) Reset() { + *x = LockResponse{} + mi := &file_iac_proto_msgTypes[89] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *LockResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*LockResponse) ProtoMessage() {} + +func (x *LockResponse) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[89] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use LockResponse.ProtoReflect.Descriptor instead. 
+func (*LockResponse) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{89} +} + +type UnlockRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + ResourceId string `protobuf:"bytes,1,opt,name=resource_id,json=resourceId,proto3" json:"resource_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *UnlockRequest) Reset() { + *x = UnlockRequest{} + mi := &file_iac_proto_msgTypes[90] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *UnlockRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*UnlockRequest) ProtoMessage() {} + +func (x *UnlockRequest) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[90] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use UnlockRequest.ProtoReflect.Descriptor instead. 
+func (*UnlockRequest) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{90} +} + +func (x *UnlockRequest) GetResourceId() string { + if x != nil { + return x.ResourceId + } + return "" +} + +type UnlockResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *UnlockResponse) Reset() { + *x = UnlockResponse{} + mi := &file_iac_proto_msgTypes[91] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *UnlockResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*UnlockResponse) ProtoMessage() {} + +func (x *UnlockResponse) ProtoReflect() protoreflect.Message { + mi := &file_iac_proto_msgTypes[91] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use UnlockResponse.ProtoReflect.Descriptor instead. 
+func (*UnlockResponse) Descriptor() ([]byte, []int) { + return file_iac_proto_rawDescGZIP(), []int{91} +} + var File_iac_proto protoreflect.FileDescriptor const file_iac_proto_rawDesc = "" + @@ -4861,7 +5508,56 @@ const file_iac_proto_rawDesc = "" + "\vfailure_msg\x18\x03 \x01(\tR\n" + "failureMsg\"b\n" + "\x14TroubleshootResponse\x12J\n" + - "\vdiagnostics\x18\x01 \x03(\v2(.workflow.plugin.external.iac.DiagnosticR\vdiagnostics*m\n" + + "\vdiagnostics\x18\x01 \x03(\v2(.workflow.plugin.external.iac.DiagnosticR\vdiagnostics\"\xa5\x03\n" + + "\bIaCState\x12\x1f\n" + + "\vresource_id\x18\x01 \x01(\tR\n" + + "resourceId\x12#\n" + + "\rresource_type\x18\x02 \x01(\tR\fresourceType\x12\x1a\n" + + "\bprovider\x18\x03 \x01(\tR\bprovider\x12!\n" + + "\fprovider_ref\x18\x04 \x01(\tR\vproviderRef\x12\x1f\n" + + "\vprovider_id\x18\x05 \x01(\tR\n" + + "providerId\x12\x1f\n" + + "\vconfig_hash\x18\x06 \x01(\tR\n" + + "configHash\x12\x16\n" + + "\x06status\x18\a \x01(\tR\x06status\x12!\n" + + "\foutputs_json\x18\b \x01(\fR\voutputsJson\x12\x1f\n" + + "\vconfig_json\x18\t \x01(\fR\n" + + "configJson\x12\"\n" + + "\fdependencies\x18\n" + + " \x03(\tR\fdependencies\x12\x1d\n" + + "\n" + + "created_at\x18\v \x01(\tR\tcreatedAt\x12\x1d\n" + + "\n" + + "updated_at\x18\f \x01(\tR\tupdatedAt\x12\x14\n" + + "\x05error\x18\r \x01(\tR\x05error\"2\n" + + "\x0fGetStateRequest\x12\x1f\n" + + "\vresource_id\x18\x01 \x01(\tR\n" + + "resourceId\"h\n" + + "\x10GetStateResponse\x12<\n" + + "\x05state\x18\x01 \x01(\v2&.workflow.plugin.external.iac.IaCStateR\x05state\x12\x16\n" + + "\x06exists\x18\x02 \x01(\bR\x06exists\"P\n" + + "\x10SaveStateRequest\x12<\n" + + "\x05state\x18\x01 \x01(\v2&.workflow.plugin.external.iac.IaCStateR\x05state\"\x13\n" + + "\x11SaveStateResponse\"\xa3\x01\n" + + "\x11ListStatesRequest\x12S\n" + + "\x06filter\x18\x01 \x03(\v2;.workflow.plugin.external.iac.ListStatesRequest.FilterEntryR\x06filter\x1a9\n" + + "\vFilterEntry\x12\x10\n" + + "\x03key\x18\x01 
\x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"T\n" + + "\x12ListStatesResponse\x12>\n" + + "\x06states\x18\x01 \x03(\v2&.workflow.plugin.external.iac.IaCStateR\x06states\"5\n" + + "\x12DeleteStateRequest\x12\x1f\n" + + "\vresource_id\x18\x01 \x01(\tR\n" + + "resourceId\"\x15\n" + + "\x13DeleteStateResponse\".\n" + + "\vLockRequest\x12\x1f\n" + + "\vresource_id\x18\x01 \x01(\tR\n" + + "resourceId\"\x0e\n" + + "\fLockResponse\"0\n" + + "\rUnlockRequest\x12\x1f\n" + + "\vresource_id\x18\x01 \x01(\tR\n" + + "resourceId\"\x10\n" + + "\x0eUnlockResponse*m\n" + "\n" + "DriftClass\x12\x17\n" + "\x13DRIFT_CLASS_UNKNOWN\x10\x00\x12\x17\n" + @@ -4908,7 +5604,15 @@ const file_iac_proto_rawDesc = "" + "\x05Scale\x122.workflow.plugin.external.iac.ResourceScaleRequest\x1a3.workflow.plugin.external.iac.ResourceScaleResponse\x12\x82\x01\n" + "\vHealthCheck\x128.workflow.plugin.external.iac.ResourceHealthCheckRequest\x1a9.workflow.plugin.external.iac.ResourceHealthCheckResponse\x12x\n" + "\rSensitiveKeys\x122.workflow.plugin.external.iac.SensitiveKeysRequest\x1a3.workflow.plugin.external.iac.SensitiveKeysResponse\x12u\n" + - "\fTroubleshoot\x121.workflow.plugin.external.iac.TroubleshootRequest\x1a2.workflow.plugin.external.iac.TroubleshootResponseB=Z;github.com/GoCodeAlone/workflow/plugin/external/proto;protob\x06proto3" + "\fTroubleshoot\x121.workflow.plugin.external.iac.TroubleshootRequest\x1a2.workflow.plugin.external.iac.TroubleshootResponse2\x93\x05\n" + + "\x0fIaCStateBackend\x12i\n" + + "\bGetState\x12-.workflow.plugin.external.iac.GetStateRequest\x1a..workflow.plugin.external.iac.GetStateResponse\x12l\n" + + "\tSaveState\x12..workflow.plugin.external.iac.SaveStateRequest\x1a/.workflow.plugin.external.iac.SaveStateResponse\x12o\n" + + "\n" + + "ListStates\x12/.workflow.plugin.external.iac.ListStatesRequest\x1a0.workflow.plugin.external.iac.ListStatesResponse\x12r\n" + + 
"\vDeleteState\x120.workflow.plugin.external.iac.DeleteStateRequest\x1a1.workflow.plugin.external.iac.DeleteStateResponse\x12]\n" + + "\x04Lock\x12).workflow.plugin.external.iac.LockRequest\x1a*.workflow.plugin.external.iac.LockResponse\x12c\n" + + "\x06Unlock\x12+.workflow.plugin.external.iac.UnlockRequest\x1a,.workflow.plugin.external.iac.UnlockResponseB=Z;github.com/GoCodeAlone/workflow/plugin/external/proto;protob\x06proto3" var ( file_iac_proto_rawDescOnce sync.Once @@ -4923,7 +5627,7 @@ func file_iac_proto_rawDescGZIP() []byte { } var file_iac_proto_enumTypes = make([]protoimpl.EnumInfo, 2) -var file_iac_proto_msgTypes = make([]protoimpl.MessageInfo, 87) +var file_iac_proto_msgTypes = make([]protoimpl.MessageInfo, 101) var file_iac_proto_goTypes = []any{ (DriftClass)(0), // 0: workflow.plugin.external.iac.DriftClass (PlanDiagnosticSeverity)(0), // 1: workflow.plugin.external.iac.PlanDiagnosticSeverity @@ -5006,149 +5710,179 @@ var file_iac_proto_goTypes = []any{ (*SensitiveKeysResponse)(nil), // 78: workflow.plugin.external.iac.SensitiveKeysResponse (*TroubleshootRequest)(nil), // 79: workflow.plugin.external.iac.TroubleshootRequest (*TroubleshootResponse)(nil), // 80: workflow.plugin.external.iac.TroubleshootResponse - nil, // 81: workflow.plugin.external.iac.ResourceOutput.SensitiveEntry - nil, // 82: workflow.plugin.external.iac.IaCPlan.InputSnapshotEntry - nil, // 83: workflow.plugin.external.iac.ApplyResult.InitialInputSnapshotEntry - nil, // 84: workflow.plugin.external.iac.ApplyResult.ReplaceIdMapEntry - nil, // 85: workflow.plugin.external.iac.BootstrapResult.EnvVarsEntry - nil, // 86: workflow.plugin.external.iac.MigrationRepairRequest.EnvEntry - nil, // 87: workflow.plugin.external.iac.DetectDriftWithSpecsRequest.SpecsEntry - nil, // 88: workflow.plugin.external.iac.DetectDriftConfigRequest.SpecsEntry - (*timestamppb.Timestamp)(nil), // 89: google.protobuf.Timestamp + (*IaCState)(nil), // 81: workflow.plugin.external.iac.IaCState + 
(*GetStateRequest)(nil), // 82: workflow.plugin.external.iac.GetStateRequest + (*GetStateResponse)(nil), // 83: workflow.plugin.external.iac.GetStateResponse + (*SaveStateRequest)(nil), // 84: workflow.plugin.external.iac.SaveStateRequest + (*SaveStateResponse)(nil), // 85: workflow.plugin.external.iac.SaveStateResponse + (*ListStatesRequest)(nil), // 86: workflow.plugin.external.iac.ListStatesRequest + (*ListStatesResponse)(nil), // 87: workflow.plugin.external.iac.ListStatesResponse + (*DeleteStateRequest)(nil), // 88: workflow.plugin.external.iac.DeleteStateRequest + (*DeleteStateResponse)(nil), // 89: workflow.plugin.external.iac.DeleteStateResponse + (*LockRequest)(nil), // 90: workflow.plugin.external.iac.LockRequest + (*LockResponse)(nil), // 91: workflow.plugin.external.iac.LockResponse + (*UnlockRequest)(nil), // 92: workflow.plugin.external.iac.UnlockRequest + (*UnlockResponse)(nil), // 93: workflow.plugin.external.iac.UnlockResponse + nil, // 94: workflow.plugin.external.iac.ResourceOutput.SensitiveEntry + nil, // 95: workflow.plugin.external.iac.IaCPlan.InputSnapshotEntry + nil, // 96: workflow.plugin.external.iac.ApplyResult.InitialInputSnapshotEntry + nil, // 97: workflow.plugin.external.iac.ApplyResult.ReplaceIdMapEntry + nil, // 98: workflow.plugin.external.iac.BootstrapResult.EnvVarsEntry + nil, // 99: workflow.plugin.external.iac.MigrationRepairRequest.EnvEntry + nil, // 100: workflow.plugin.external.iac.DetectDriftWithSpecsRequest.SpecsEntry + nil, // 101: workflow.plugin.external.iac.DetectDriftConfigRequest.SpecsEntry + nil, // 102: workflow.plugin.external.iac.ListStatesRequest.FilterEntry + (*timestamppb.Timestamp)(nil), // 103: google.protobuf.Timestamp } var file_iac_proto_depIdxs = []int32{ - 4, // 0: workflow.plugin.external.iac.ResourceSpec.hints:type_name -> workflow.plugin.external.iac.ResourceHints - 89, // 1: workflow.plugin.external.iac.ResourceState.created_at:type_name -> google.protobuf.Timestamp - 89, // 2: 
workflow.plugin.external.iac.ResourceState.updated_at:type_name -> google.protobuf.Timestamp - 89, // 3: workflow.plugin.external.iac.ResourceState.last_drift_check:type_name -> google.protobuf.Timestamp - 81, // 4: workflow.plugin.external.iac.ResourceOutput.sensitive:type_name -> workflow.plugin.external.iac.ResourceOutput.SensitiveEntry - 10, // 5: workflow.plugin.external.iac.DiffResult.changes:type_name -> workflow.plugin.external.iac.FieldChange - 0, // 6: workflow.plugin.external.iac.DriftResult.class:type_name -> workflow.plugin.external.iac.DriftClass - 89, // 7: workflow.plugin.external.iac.Diagnostic.at:type_name -> google.protobuf.Timestamp - 1, // 8: workflow.plugin.external.iac.PlanDiagnostic.severity:type_name -> workflow.plugin.external.iac.PlanDiagnosticSeverity - 2, // 9: workflow.plugin.external.iac.PlanAction.resource:type_name -> workflow.plugin.external.iac.ResourceSpec - 7, // 10: workflow.plugin.external.iac.PlanAction.current:type_name -> workflow.plugin.external.iac.ResourceState - 10, // 11: workflow.plugin.external.iac.PlanAction.changes:type_name -> workflow.plugin.external.iac.FieldChange - 17, // 12: workflow.plugin.external.iac.IaCPlan.actions:type_name -> workflow.plugin.external.iac.PlanAction - 89, // 13: workflow.plugin.external.iac.IaCPlan.created_at:type_name -> google.protobuf.Timestamp - 82, // 14: workflow.plugin.external.iac.IaCPlan.input_snapshot:type_name -> workflow.plugin.external.iac.IaCPlan.InputSnapshotEntry - 8, // 15: workflow.plugin.external.iac.ApplyResult.resources:type_name -> workflow.plugin.external.iac.ResourceOutput - 19, // 16: workflow.plugin.external.iac.ApplyResult.errors:type_name -> workflow.plugin.external.iac.ActionError - 83, // 17: workflow.plugin.external.iac.ApplyResult.initial_input_snapshot:type_name -> workflow.plugin.external.iac.ApplyResult.InitialInputSnapshotEntry - 13, // 18: workflow.plugin.external.iac.ApplyResult.input_drift_report:type_name -> workflow.plugin.external.iac.DriftEntry 
- 84, // 19: workflow.plugin.external.iac.ApplyResult.replace_id_map:type_name -> workflow.plugin.external.iac.ApplyResult.ReplaceIdMapEntry - 19, // 20: workflow.plugin.external.iac.DestroyResult.errors:type_name -> workflow.plugin.external.iac.ActionError - 85, // 21: workflow.plugin.external.iac.BootstrapResult.env_vars:type_name -> workflow.plugin.external.iac.BootstrapResult.EnvVarsEntry - 86, // 22: workflow.plugin.external.iac.MigrationRepairRequest.env:type_name -> workflow.plugin.external.iac.MigrationRepairRequest.EnvEntry - 15, // 23: workflow.plugin.external.iac.MigrationRepairResult.diagnostics:type_name -> workflow.plugin.external.iac.Diagnostic - 6, // 24: workflow.plugin.external.iac.CapabilitiesResponse.capabilities:type_name -> workflow.plugin.external.iac.IaCCapabilityDeclaration - 2, // 25: workflow.plugin.external.iac.PlanRequest.desired:type_name -> workflow.plugin.external.iac.ResourceSpec - 7, // 26: workflow.plugin.external.iac.PlanRequest.current:type_name -> workflow.plugin.external.iac.ResourceState - 18, // 27: workflow.plugin.external.iac.PlanResponse.plan:type_name -> workflow.plugin.external.iac.IaCPlan - 18, // 28: workflow.plugin.external.iac.ApplyRequest.plan:type_name -> workflow.plugin.external.iac.IaCPlan - 20, // 29: workflow.plugin.external.iac.ApplyResponse.result:type_name -> workflow.plugin.external.iac.ApplyResult - 3, // 30: workflow.plugin.external.iac.DestroyRequest.refs:type_name -> workflow.plugin.external.iac.ResourceRef - 21, // 31: workflow.plugin.external.iac.DestroyResponse.result:type_name -> workflow.plugin.external.iac.DestroyResult - 3, // 32: workflow.plugin.external.iac.StatusRequest.refs:type_name -> workflow.plugin.external.iac.ResourceRef - 9, // 33: workflow.plugin.external.iac.StatusResponse.statuses:type_name -> workflow.plugin.external.iac.ResourceStatus - 7, // 34: workflow.plugin.external.iac.ImportResponse.state:type_name -> workflow.plugin.external.iac.ResourceState - 4, // 35: 
workflow.plugin.external.iac.ResolveSizingRequest.hints:type_name -> workflow.plugin.external.iac.ResourceHints - 5, // 36: workflow.plugin.external.iac.ResolveSizingResponse.sizing:type_name -> workflow.plugin.external.iac.ProviderSizing - 22, // 37: workflow.plugin.external.iac.BootstrapStateBackendResponse.result:type_name -> workflow.plugin.external.iac.BootstrapResult - 8, // 38: workflow.plugin.external.iac.EnumerateAllResponse.outputs:type_name -> workflow.plugin.external.iac.ResourceOutput - 3, // 39: workflow.plugin.external.iac.EnumerateByTagResponse.refs:type_name -> workflow.plugin.external.iac.ResourceRef - 3, // 40: workflow.plugin.external.iac.DetectDriftRequest.refs:type_name -> workflow.plugin.external.iac.ResourceRef - 12, // 41: workflow.plugin.external.iac.DetectDriftResponse.drifts:type_name -> workflow.plugin.external.iac.DriftResult - 3, // 42: workflow.plugin.external.iac.DetectDriftWithSpecsRequest.refs:type_name -> workflow.plugin.external.iac.ResourceRef - 87, // 43: workflow.plugin.external.iac.DetectDriftWithSpecsRequest.specs:type_name -> workflow.plugin.external.iac.DetectDriftWithSpecsRequest.SpecsEntry - 12, // 44: workflow.plugin.external.iac.DetectDriftWithSpecsResponse.drifts:type_name -> workflow.plugin.external.iac.DriftResult - 23, // 45: workflow.plugin.external.iac.RepairDirtyMigrationRequest.request:type_name -> workflow.plugin.external.iac.MigrationRepairRequest - 24, // 46: workflow.plugin.external.iac.RepairDirtyMigrationResponse.result:type_name -> workflow.plugin.external.iac.MigrationRepairResult - 18, // 47: workflow.plugin.external.iac.ValidatePlanRequest.plan:type_name -> workflow.plugin.external.iac.IaCPlan - 16, // 48: workflow.plugin.external.iac.ValidatePlanResponse.diagnostics:type_name -> workflow.plugin.external.iac.PlanDiagnostic - 3, // 49: workflow.plugin.external.iac.DetectDriftConfigRequest.refs:type_name -> workflow.plugin.external.iac.ResourceRef - 88, // 50: 
workflow.plugin.external.iac.DetectDriftConfigRequest.specs:type_name -> workflow.plugin.external.iac.DetectDriftConfigRequest.SpecsEntry - 12, // 51: workflow.plugin.external.iac.DetectDriftConfigResponse.drifts:type_name -> workflow.plugin.external.iac.DriftResult - 2, // 52: workflow.plugin.external.iac.ResourceCreateRequest.spec:type_name -> workflow.plugin.external.iac.ResourceSpec - 8, // 53: workflow.plugin.external.iac.ResourceCreateResponse.output:type_name -> workflow.plugin.external.iac.ResourceOutput - 3, // 54: workflow.plugin.external.iac.ResourceReadRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef - 8, // 55: workflow.plugin.external.iac.ResourceReadResponse.output:type_name -> workflow.plugin.external.iac.ResourceOutput - 3, // 56: workflow.plugin.external.iac.ResourceUpdateRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef - 2, // 57: workflow.plugin.external.iac.ResourceUpdateRequest.spec:type_name -> workflow.plugin.external.iac.ResourceSpec - 8, // 58: workflow.plugin.external.iac.ResourceUpdateResponse.output:type_name -> workflow.plugin.external.iac.ResourceOutput - 3, // 59: workflow.plugin.external.iac.ResourceDeleteRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef - 2, // 60: workflow.plugin.external.iac.ResourceDiffRequest.desired:type_name -> workflow.plugin.external.iac.ResourceSpec - 8, // 61: workflow.plugin.external.iac.ResourceDiffRequest.current:type_name -> workflow.plugin.external.iac.ResourceOutput - 11, // 62: workflow.plugin.external.iac.ResourceDiffResponse.result:type_name -> workflow.plugin.external.iac.DiffResult - 3, // 63: workflow.plugin.external.iac.ResourceScaleRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef - 8, // 64: workflow.plugin.external.iac.ResourceScaleResponse.output:type_name -> workflow.plugin.external.iac.ResourceOutput - 3, // 65: workflow.plugin.external.iac.ResourceHealthCheckRequest.ref:type_name -> 
workflow.plugin.external.iac.ResourceRef - 14, // 66: workflow.plugin.external.iac.ResourceHealthCheckResponse.result:type_name -> workflow.plugin.external.iac.HealthResult - 3, // 67: workflow.plugin.external.iac.TroubleshootRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef - 15, // 68: workflow.plugin.external.iac.TroubleshootResponse.diagnostics:type_name -> workflow.plugin.external.iac.Diagnostic - 2, // 69: workflow.plugin.external.iac.DetectDriftWithSpecsRequest.SpecsEntry.value:type_name -> workflow.plugin.external.iac.ResourceSpec - 2, // 70: workflow.plugin.external.iac.DetectDriftConfigRequest.SpecsEntry.value:type_name -> workflow.plugin.external.iac.ResourceSpec - 25, // 71: workflow.plugin.external.iac.IaCProviderRequired.Initialize:input_type -> workflow.plugin.external.iac.InitializeRequest - 27, // 72: workflow.plugin.external.iac.IaCProviderRequired.Name:input_type -> workflow.plugin.external.iac.NameRequest - 29, // 73: workflow.plugin.external.iac.IaCProviderRequired.Version:input_type -> workflow.plugin.external.iac.VersionRequest - 31, // 74: workflow.plugin.external.iac.IaCProviderRequired.Capabilities:input_type -> workflow.plugin.external.iac.CapabilitiesRequest - 33, // 75: workflow.plugin.external.iac.IaCProviderRequired.Plan:input_type -> workflow.plugin.external.iac.PlanRequest - 35, // 76: workflow.plugin.external.iac.IaCProviderRequired.Apply:input_type -> workflow.plugin.external.iac.ApplyRequest - 37, // 77: workflow.plugin.external.iac.IaCProviderRequired.Destroy:input_type -> workflow.plugin.external.iac.DestroyRequest - 39, // 78: workflow.plugin.external.iac.IaCProviderRequired.Status:input_type -> workflow.plugin.external.iac.StatusRequest - 41, // 79: workflow.plugin.external.iac.IaCProviderRequired.Import:input_type -> workflow.plugin.external.iac.ImportRequest - 43, // 80: workflow.plugin.external.iac.IaCProviderRequired.ResolveSizing:input_type -> workflow.plugin.external.iac.ResolveSizingRequest - 45, // 81: 
workflow.plugin.external.iac.IaCProviderRequired.BootstrapStateBackend:input_type -> workflow.plugin.external.iac.BootstrapStateBackendRequest - 47, // 82: workflow.plugin.external.iac.IaCProviderEnumerator.EnumerateAll:input_type -> workflow.plugin.external.iac.EnumerateAllRequest - 49, // 83: workflow.plugin.external.iac.IaCProviderEnumerator.EnumerateByTag:input_type -> workflow.plugin.external.iac.EnumerateByTagRequest - 51, // 84: workflow.plugin.external.iac.IaCProviderDriftDetector.DetectDrift:input_type -> workflow.plugin.external.iac.DetectDriftRequest - 53, // 85: workflow.plugin.external.iac.IaCProviderDriftDetector.DetectDriftWithSpecs:input_type -> workflow.plugin.external.iac.DetectDriftWithSpecsRequest - 55, // 86: workflow.plugin.external.iac.IaCProviderCredentialRevoker.RevokeProviderCredential:input_type -> workflow.plugin.external.iac.RevokeProviderCredentialRequest - 57, // 87: workflow.plugin.external.iac.IaCProviderMigrationRepairer.RepairDirtyMigration:input_type -> workflow.plugin.external.iac.RepairDirtyMigrationRequest - 59, // 88: workflow.plugin.external.iac.IaCProviderValidator.ValidatePlan:input_type -> workflow.plugin.external.iac.ValidatePlanRequest - 61, // 89: workflow.plugin.external.iac.IaCProviderDriftConfigDetector.DetectDriftConfig:input_type -> workflow.plugin.external.iac.DetectDriftConfigRequest - 63, // 90: workflow.plugin.external.iac.ResourceDriver.Create:input_type -> workflow.plugin.external.iac.ResourceCreateRequest - 65, // 91: workflow.plugin.external.iac.ResourceDriver.Read:input_type -> workflow.plugin.external.iac.ResourceReadRequest - 67, // 92: workflow.plugin.external.iac.ResourceDriver.Update:input_type -> workflow.plugin.external.iac.ResourceUpdateRequest - 69, // 93: workflow.plugin.external.iac.ResourceDriver.Delete:input_type -> workflow.plugin.external.iac.ResourceDeleteRequest - 71, // 94: workflow.plugin.external.iac.ResourceDriver.Diff:input_type -> workflow.plugin.external.iac.ResourceDiffRequest - 
73, // 95: workflow.plugin.external.iac.ResourceDriver.Scale:input_type -> workflow.plugin.external.iac.ResourceScaleRequest - 75, // 96: workflow.plugin.external.iac.ResourceDriver.HealthCheck:input_type -> workflow.plugin.external.iac.ResourceHealthCheckRequest - 77, // 97: workflow.plugin.external.iac.ResourceDriver.SensitiveKeys:input_type -> workflow.plugin.external.iac.SensitiveKeysRequest - 79, // 98: workflow.plugin.external.iac.ResourceDriver.Troubleshoot:input_type -> workflow.plugin.external.iac.TroubleshootRequest - 26, // 99: workflow.plugin.external.iac.IaCProviderRequired.Initialize:output_type -> workflow.plugin.external.iac.InitializeResponse - 28, // 100: workflow.plugin.external.iac.IaCProviderRequired.Name:output_type -> workflow.plugin.external.iac.NameResponse - 30, // 101: workflow.plugin.external.iac.IaCProviderRequired.Version:output_type -> workflow.plugin.external.iac.VersionResponse - 32, // 102: workflow.plugin.external.iac.IaCProviderRequired.Capabilities:output_type -> workflow.plugin.external.iac.CapabilitiesResponse - 34, // 103: workflow.plugin.external.iac.IaCProviderRequired.Plan:output_type -> workflow.plugin.external.iac.PlanResponse - 36, // 104: workflow.plugin.external.iac.IaCProviderRequired.Apply:output_type -> workflow.plugin.external.iac.ApplyResponse - 38, // 105: workflow.plugin.external.iac.IaCProviderRequired.Destroy:output_type -> workflow.plugin.external.iac.DestroyResponse - 40, // 106: workflow.plugin.external.iac.IaCProviderRequired.Status:output_type -> workflow.plugin.external.iac.StatusResponse - 42, // 107: workflow.plugin.external.iac.IaCProviderRequired.Import:output_type -> workflow.plugin.external.iac.ImportResponse - 44, // 108: workflow.plugin.external.iac.IaCProviderRequired.ResolveSizing:output_type -> workflow.plugin.external.iac.ResolveSizingResponse - 46, // 109: workflow.plugin.external.iac.IaCProviderRequired.BootstrapStateBackend:output_type -> 
workflow.plugin.external.iac.BootstrapStateBackendResponse - 48, // 110: workflow.plugin.external.iac.IaCProviderEnumerator.EnumerateAll:output_type -> workflow.plugin.external.iac.EnumerateAllResponse - 50, // 111: workflow.plugin.external.iac.IaCProviderEnumerator.EnumerateByTag:output_type -> workflow.plugin.external.iac.EnumerateByTagResponse - 52, // 112: workflow.plugin.external.iac.IaCProviderDriftDetector.DetectDrift:output_type -> workflow.plugin.external.iac.DetectDriftResponse - 54, // 113: workflow.plugin.external.iac.IaCProviderDriftDetector.DetectDriftWithSpecs:output_type -> workflow.plugin.external.iac.DetectDriftWithSpecsResponse - 56, // 114: workflow.plugin.external.iac.IaCProviderCredentialRevoker.RevokeProviderCredential:output_type -> workflow.plugin.external.iac.RevokeProviderCredentialResponse - 58, // 115: workflow.plugin.external.iac.IaCProviderMigrationRepairer.RepairDirtyMigration:output_type -> workflow.plugin.external.iac.RepairDirtyMigrationResponse - 60, // 116: workflow.plugin.external.iac.IaCProviderValidator.ValidatePlan:output_type -> workflow.plugin.external.iac.ValidatePlanResponse - 62, // 117: workflow.plugin.external.iac.IaCProviderDriftConfigDetector.DetectDriftConfig:output_type -> workflow.plugin.external.iac.DetectDriftConfigResponse - 64, // 118: workflow.plugin.external.iac.ResourceDriver.Create:output_type -> workflow.plugin.external.iac.ResourceCreateResponse - 66, // 119: workflow.plugin.external.iac.ResourceDriver.Read:output_type -> workflow.plugin.external.iac.ResourceReadResponse - 68, // 120: workflow.plugin.external.iac.ResourceDriver.Update:output_type -> workflow.plugin.external.iac.ResourceUpdateResponse - 70, // 121: workflow.plugin.external.iac.ResourceDriver.Delete:output_type -> workflow.plugin.external.iac.ResourceDeleteResponse - 72, // 122: workflow.plugin.external.iac.ResourceDriver.Diff:output_type -> workflow.plugin.external.iac.ResourceDiffResponse - 74, // 123: 
workflow.plugin.external.iac.ResourceDriver.Scale:output_type -> workflow.plugin.external.iac.ResourceScaleResponse - 76, // 124: workflow.plugin.external.iac.ResourceDriver.HealthCheck:output_type -> workflow.plugin.external.iac.ResourceHealthCheckResponse - 78, // 125: workflow.plugin.external.iac.ResourceDriver.SensitiveKeys:output_type -> workflow.plugin.external.iac.SensitiveKeysResponse - 80, // 126: workflow.plugin.external.iac.ResourceDriver.Troubleshoot:output_type -> workflow.plugin.external.iac.TroubleshootResponse - 99, // [99:127] is the sub-list for method output_type - 71, // [71:99] is the sub-list for method input_type - 71, // [71:71] is the sub-list for extension type_name - 71, // [71:71] is the sub-list for extension extendee - 0, // [0:71] is the sub-list for field type_name + 4, // 0: workflow.plugin.external.iac.ResourceSpec.hints:type_name -> workflow.plugin.external.iac.ResourceHints + 103, // 1: workflow.plugin.external.iac.ResourceState.created_at:type_name -> google.protobuf.Timestamp + 103, // 2: workflow.plugin.external.iac.ResourceState.updated_at:type_name -> google.protobuf.Timestamp + 103, // 3: workflow.plugin.external.iac.ResourceState.last_drift_check:type_name -> google.protobuf.Timestamp + 94, // 4: workflow.plugin.external.iac.ResourceOutput.sensitive:type_name -> workflow.plugin.external.iac.ResourceOutput.SensitiveEntry + 10, // 5: workflow.plugin.external.iac.DiffResult.changes:type_name -> workflow.plugin.external.iac.FieldChange + 0, // 6: workflow.plugin.external.iac.DriftResult.class:type_name -> workflow.plugin.external.iac.DriftClass + 103, // 7: workflow.plugin.external.iac.Diagnostic.at:type_name -> google.protobuf.Timestamp + 1, // 8: workflow.plugin.external.iac.PlanDiagnostic.severity:type_name -> workflow.plugin.external.iac.PlanDiagnosticSeverity + 2, // 9: workflow.plugin.external.iac.PlanAction.resource:type_name -> workflow.plugin.external.iac.ResourceSpec + 7, // 10: 
workflow.plugin.external.iac.PlanAction.current:type_name -> workflow.plugin.external.iac.ResourceState + 10, // 11: workflow.plugin.external.iac.PlanAction.changes:type_name -> workflow.plugin.external.iac.FieldChange + 17, // 12: workflow.plugin.external.iac.IaCPlan.actions:type_name -> workflow.plugin.external.iac.PlanAction + 103, // 13: workflow.plugin.external.iac.IaCPlan.created_at:type_name -> google.protobuf.Timestamp + 95, // 14: workflow.plugin.external.iac.IaCPlan.input_snapshot:type_name -> workflow.plugin.external.iac.IaCPlan.InputSnapshotEntry + 8, // 15: workflow.plugin.external.iac.ApplyResult.resources:type_name -> workflow.plugin.external.iac.ResourceOutput + 19, // 16: workflow.plugin.external.iac.ApplyResult.errors:type_name -> workflow.plugin.external.iac.ActionError + 96, // 17: workflow.plugin.external.iac.ApplyResult.initial_input_snapshot:type_name -> workflow.plugin.external.iac.ApplyResult.InitialInputSnapshotEntry + 13, // 18: workflow.plugin.external.iac.ApplyResult.input_drift_report:type_name -> workflow.plugin.external.iac.DriftEntry + 97, // 19: workflow.plugin.external.iac.ApplyResult.replace_id_map:type_name -> workflow.plugin.external.iac.ApplyResult.ReplaceIdMapEntry + 19, // 20: workflow.plugin.external.iac.DestroyResult.errors:type_name -> workflow.plugin.external.iac.ActionError + 98, // 21: workflow.plugin.external.iac.BootstrapResult.env_vars:type_name -> workflow.plugin.external.iac.BootstrapResult.EnvVarsEntry + 99, // 22: workflow.plugin.external.iac.MigrationRepairRequest.env:type_name -> workflow.plugin.external.iac.MigrationRepairRequest.EnvEntry + 15, // 23: workflow.plugin.external.iac.MigrationRepairResult.diagnostics:type_name -> workflow.plugin.external.iac.Diagnostic + 6, // 24: workflow.plugin.external.iac.CapabilitiesResponse.capabilities:type_name -> workflow.plugin.external.iac.IaCCapabilityDeclaration + 2, // 25: workflow.plugin.external.iac.PlanRequest.desired:type_name -> 
workflow.plugin.external.iac.ResourceSpec + 7, // 26: workflow.plugin.external.iac.PlanRequest.current:type_name -> workflow.plugin.external.iac.ResourceState + 18, // 27: workflow.plugin.external.iac.PlanResponse.plan:type_name -> workflow.plugin.external.iac.IaCPlan + 18, // 28: workflow.plugin.external.iac.ApplyRequest.plan:type_name -> workflow.plugin.external.iac.IaCPlan + 20, // 29: workflow.plugin.external.iac.ApplyResponse.result:type_name -> workflow.plugin.external.iac.ApplyResult + 3, // 30: workflow.plugin.external.iac.DestroyRequest.refs:type_name -> workflow.plugin.external.iac.ResourceRef + 21, // 31: workflow.plugin.external.iac.DestroyResponse.result:type_name -> workflow.plugin.external.iac.DestroyResult + 3, // 32: workflow.plugin.external.iac.StatusRequest.refs:type_name -> workflow.plugin.external.iac.ResourceRef + 9, // 33: workflow.plugin.external.iac.StatusResponse.statuses:type_name -> workflow.plugin.external.iac.ResourceStatus + 7, // 34: workflow.plugin.external.iac.ImportResponse.state:type_name -> workflow.plugin.external.iac.ResourceState + 4, // 35: workflow.plugin.external.iac.ResolveSizingRequest.hints:type_name -> workflow.plugin.external.iac.ResourceHints + 5, // 36: workflow.plugin.external.iac.ResolveSizingResponse.sizing:type_name -> workflow.plugin.external.iac.ProviderSizing + 22, // 37: workflow.plugin.external.iac.BootstrapStateBackendResponse.result:type_name -> workflow.plugin.external.iac.BootstrapResult + 8, // 38: workflow.plugin.external.iac.EnumerateAllResponse.outputs:type_name -> workflow.plugin.external.iac.ResourceOutput + 3, // 39: workflow.plugin.external.iac.EnumerateByTagResponse.refs:type_name -> workflow.plugin.external.iac.ResourceRef + 3, // 40: workflow.plugin.external.iac.DetectDriftRequest.refs:type_name -> workflow.plugin.external.iac.ResourceRef + 12, // 41: workflow.plugin.external.iac.DetectDriftResponse.drifts:type_name -> workflow.plugin.external.iac.DriftResult + 3, // 42: 
workflow.plugin.external.iac.DetectDriftWithSpecsRequest.refs:type_name -> workflow.plugin.external.iac.ResourceRef + 100, // 43: workflow.plugin.external.iac.DetectDriftWithSpecsRequest.specs:type_name -> workflow.plugin.external.iac.DetectDriftWithSpecsRequest.SpecsEntry + 12, // 44: workflow.plugin.external.iac.DetectDriftWithSpecsResponse.drifts:type_name -> workflow.plugin.external.iac.DriftResult + 23, // 45: workflow.plugin.external.iac.RepairDirtyMigrationRequest.request:type_name -> workflow.plugin.external.iac.MigrationRepairRequest + 24, // 46: workflow.plugin.external.iac.RepairDirtyMigrationResponse.result:type_name -> workflow.plugin.external.iac.MigrationRepairResult + 18, // 47: workflow.plugin.external.iac.ValidatePlanRequest.plan:type_name -> workflow.plugin.external.iac.IaCPlan + 16, // 48: workflow.plugin.external.iac.ValidatePlanResponse.diagnostics:type_name -> workflow.plugin.external.iac.PlanDiagnostic + 3, // 49: workflow.plugin.external.iac.DetectDriftConfigRequest.refs:type_name -> workflow.plugin.external.iac.ResourceRef + 101, // 50: workflow.plugin.external.iac.DetectDriftConfigRequest.specs:type_name -> workflow.plugin.external.iac.DetectDriftConfigRequest.SpecsEntry + 12, // 51: workflow.plugin.external.iac.DetectDriftConfigResponse.drifts:type_name -> workflow.plugin.external.iac.DriftResult + 2, // 52: workflow.plugin.external.iac.ResourceCreateRequest.spec:type_name -> workflow.plugin.external.iac.ResourceSpec + 8, // 53: workflow.plugin.external.iac.ResourceCreateResponse.output:type_name -> workflow.plugin.external.iac.ResourceOutput + 3, // 54: workflow.plugin.external.iac.ResourceReadRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef + 8, // 55: workflow.plugin.external.iac.ResourceReadResponse.output:type_name -> workflow.plugin.external.iac.ResourceOutput + 3, // 56: workflow.plugin.external.iac.ResourceUpdateRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef + 2, // 57: 
workflow.plugin.external.iac.ResourceUpdateRequest.spec:type_name -> workflow.plugin.external.iac.ResourceSpec + 8, // 58: workflow.plugin.external.iac.ResourceUpdateResponse.output:type_name -> workflow.plugin.external.iac.ResourceOutput + 3, // 59: workflow.plugin.external.iac.ResourceDeleteRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef + 2, // 60: workflow.plugin.external.iac.ResourceDiffRequest.desired:type_name -> workflow.plugin.external.iac.ResourceSpec + 8, // 61: workflow.plugin.external.iac.ResourceDiffRequest.current:type_name -> workflow.plugin.external.iac.ResourceOutput + 11, // 62: workflow.plugin.external.iac.ResourceDiffResponse.result:type_name -> workflow.plugin.external.iac.DiffResult + 3, // 63: workflow.plugin.external.iac.ResourceScaleRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef + 8, // 64: workflow.plugin.external.iac.ResourceScaleResponse.output:type_name -> workflow.plugin.external.iac.ResourceOutput + 3, // 65: workflow.plugin.external.iac.ResourceHealthCheckRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef + 14, // 66: workflow.plugin.external.iac.ResourceHealthCheckResponse.result:type_name -> workflow.plugin.external.iac.HealthResult + 3, // 67: workflow.plugin.external.iac.TroubleshootRequest.ref:type_name -> workflow.plugin.external.iac.ResourceRef + 15, // 68: workflow.plugin.external.iac.TroubleshootResponse.diagnostics:type_name -> workflow.plugin.external.iac.Diagnostic + 81, // 69: workflow.plugin.external.iac.GetStateResponse.state:type_name -> workflow.plugin.external.iac.IaCState + 81, // 70: workflow.plugin.external.iac.SaveStateRequest.state:type_name -> workflow.plugin.external.iac.IaCState + 102, // 71: workflow.plugin.external.iac.ListStatesRequest.filter:type_name -> workflow.plugin.external.iac.ListStatesRequest.FilterEntry + 81, // 72: workflow.plugin.external.iac.ListStatesResponse.states:type_name -> workflow.plugin.external.iac.IaCState + 2, // 73: 
workflow.plugin.external.iac.DetectDriftWithSpecsRequest.SpecsEntry.value:type_name -> workflow.plugin.external.iac.ResourceSpec + 2, // 74: workflow.plugin.external.iac.DetectDriftConfigRequest.SpecsEntry.value:type_name -> workflow.plugin.external.iac.ResourceSpec + 25, // 75: workflow.plugin.external.iac.IaCProviderRequired.Initialize:input_type -> workflow.plugin.external.iac.InitializeRequest + 27, // 76: workflow.plugin.external.iac.IaCProviderRequired.Name:input_type -> workflow.plugin.external.iac.NameRequest + 29, // 77: workflow.plugin.external.iac.IaCProviderRequired.Version:input_type -> workflow.plugin.external.iac.VersionRequest + 31, // 78: workflow.plugin.external.iac.IaCProviderRequired.Capabilities:input_type -> workflow.plugin.external.iac.CapabilitiesRequest + 33, // 79: workflow.plugin.external.iac.IaCProviderRequired.Plan:input_type -> workflow.plugin.external.iac.PlanRequest + 35, // 80: workflow.plugin.external.iac.IaCProviderRequired.Apply:input_type -> workflow.plugin.external.iac.ApplyRequest + 37, // 81: workflow.plugin.external.iac.IaCProviderRequired.Destroy:input_type -> workflow.plugin.external.iac.DestroyRequest + 39, // 82: workflow.plugin.external.iac.IaCProviderRequired.Status:input_type -> workflow.plugin.external.iac.StatusRequest + 41, // 83: workflow.plugin.external.iac.IaCProviderRequired.Import:input_type -> workflow.plugin.external.iac.ImportRequest + 43, // 84: workflow.plugin.external.iac.IaCProviderRequired.ResolveSizing:input_type -> workflow.plugin.external.iac.ResolveSizingRequest + 45, // 85: workflow.plugin.external.iac.IaCProviderRequired.BootstrapStateBackend:input_type -> workflow.plugin.external.iac.BootstrapStateBackendRequest + 47, // 86: workflow.plugin.external.iac.IaCProviderEnumerator.EnumerateAll:input_type -> workflow.plugin.external.iac.EnumerateAllRequest + 49, // 87: workflow.plugin.external.iac.IaCProviderEnumerator.EnumerateByTag:input_type -> workflow.plugin.external.iac.EnumerateByTagRequest + 
51, // 88: workflow.plugin.external.iac.IaCProviderDriftDetector.DetectDrift:input_type -> workflow.plugin.external.iac.DetectDriftRequest + 53, // 89: workflow.plugin.external.iac.IaCProviderDriftDetector.DetectDriftWithSpecs:input_type -> workflow.plugin.external.iac.DetectDriftWithSpecsRequest + 55, // 90: workflow.plugin.external.iac.IaCProviderCredentialRevoker.RevokeProviderCredential:input_type -> workflow.plugin.external.iac.RevokeProviderCredentialRequest + 57, // 91: workflow.plugin.external.iac.IaCProviderMigrationRepairer.RepairDirtyMigration:input_type -> workflow.plugin.external.iac.RepairDirtyMigrationRequest + 59, // 92: workflow.plugin.external.iac.IaCProviderValidator.ValidatePlan:input_type -> workflow.plugin.external.iac.ValidatePlanRequest + 61, // 93: workflow.plugin.external.iac.IaCProviderDriftConfigDetector.DetectDriftConfig:input_type -> workflow.plugin.external.iac.DetectDriftConfigRequest + 63, // 94: workflow.plugin.external.iac.ResourceDriver.Create:input_type -> workflow.plugin.external.iac.ResourceCreateRequest + 65, // 95: workflow.plugin.external.iac.ResourceDriver.Read:input_type -> workflow.plugin.external.iac.ResourceReadRequest + 67, // 96: workflow.plugin.external.iac.ResourceDriver.Update:input_type -> workflow.plugin.external.iac.ResourceUpdateRequest + 69, // 97: workflow.plugin.external.iac.ResourceDriver.Delete:input_type -> workflow.plugin.external.iac.ResourceDeleteRequest + 71, // 98: workflow.plugin.external.iac.ResourceDriver.Diff:input_type -> workflow.plugin.external.iac.ResourceDiffRequest + 73, // 99: workflow.plugin.external.iac.ResourceDriver.Scale:input_type -> workflow.plugin.external.iac.ResourceScaleRequest + 75, // 100: workflow.plugin.external.iac.ResourceDriver.HealthCheck:input_type -> workflow.plugin.external.iac.ResourceHealthCheckRequest + 77, // 101: workflow.plugin.external.iac.ResourceDriver.SensitiveKeys:input_type -> workflow.plugin.external.iac.SensitiveKeysRequest + 79, // 102: 
workflow.plugin.external.iac.ResourceDriver.Troubleshoot:input_type -> workflow.plugin.external.iac.TroubleshootRequest + 82, // 103: workflow.plugin.external.iac.IaCStateBackend.GetState:input_type -> workflow.plugin.external.iac.GetStateRequest + 84, // 104: workflow.plugin.external.iac.IaCStateBackend.SaveState:input_type -> workflow.plugin.external.iac.SaveStateRequest + 86, // 105: workflow.plugin.external.iac.IaCStateBackend.ListStates:input_type -> workflow.plugin.external.iac.ListStatesRequest + 88, // 106: workflow.plugin.external.iac.IaCStateBackend.DeleteState:input_type -> workflow.plugin.external.iac.DeleteStateRequest + 90, // 107: workflow.plugin.external.iac.IaCStateBackend.Lock:input_type -> workflow.plugin.external.iac.LockRequest + 92, // 108: workflow.plugin.external.iac.IaCStateBackend.Unlock:input_type -> workflow.plugin.external.iac.UnlockRequest + 26, // 109: workflow.plugin.external.iac.IaCProviderRequired.Initialize:output_type -> workflow.plugin.external.iac.InitializeResponse + 28, // 110: workflow.plugin.external.iac.IaCProviderRequired.Name:output_type -> workflow.plugin.external.iac.NameResponse + 30, // 111: workflow.plugin.external.iac.IaCProviderRequired.Version:output_type -> workflow.plugin.external.iac.VersionResponse + 32, // 112: workflow.plugin.external.iac.IaCProviderRequired.Capabilities:output_type -> workflow.plugin.external.iac.CapabilitiesResponse + 34, // 113: workflow.plugin.external.iac.IaCProviderRequired.Plan:output_type -> workflow.plugin.external.iac.PlanResponse + 36, // 114: workflow.plugin.external.iac.IaCProviderRequired.Apply:output_type -> workflow.plugin.external.iac.ApplyResponse + 38, // 115: workflow.plugin.external.iac.IaCProviderRequired.Destroy:output_type -> workflow.plugin.external.iac.DestroyResponse + 40, // 116: workflow.plugin.external.iac.IaCProviderRequired.Status:output_type -> workflow.plugin.external.iac.StatusResponse + 42, // 117: 
workflow.plugin.external.iac.IaCProviderRequired.Import:output_type -> workflow.plugin.external.iac.ImportResponse + 44, // 118: workflow.plugin.external.iac.IaCProviderRequired.ResolveSizing:output_type -> workflow.plugin.external.iac.ResolveSizingResponse + 46, // 119: workflow.plugin.external.iac.IaCProviderRequired.BootstrapStateBackend:output_type -> workflow.plugin.external.iac.BootstrapStateBackendResponse + 48, // 120: workflow.plugin.external.iac.IaCProviderEnumerator.EnumerateAll:output_type -> workflow.plugin.external.iac.EnumerateAllResponse + 50, // 121: workflow.plugin.external.iac.IaCProviderEnumerator.EnumerateByTag:output_type -> workflow.plugin.external.iac.EnumerateByTagResponse + 52, // 122: workflow.plugin.external.iac.IaCProviderDriftDetector.DetectDrift:output_type -> workflow.plugin.external.iac.DetectDriftResponse + 54, // 123: workflow.plugin.external.iac.IaCProviderDriftDetector.DetectDriftWithSpecs:output_type -> workflow.plugin.external.iac.DetectDriftWithSpecsResponse + 56, // 124: workflow.plugin.external.iac.IaCProviderCredentialRevoker.RevokeProviderCredential:output_type -> workflow.plugin.external.iac.RevokeProviderCredentialResponse + 58, // 125: workflow.plugin.external.iac.IaCProviderMigrationRepairer.RepairDirtyMigration:output_type -> workflow.plugin.external.iac.RepairDirtyMigrationResponse + 60, // 126: workflow.plugin.external.iac.IaCProviderValidator.ValidatePlan:output_type -> workflow.plugin.external.iac.ValidatePlanResponse + 62, // 127: workflow.plugin.external.iac.IaCProviderDriftConfigDetector.DetectDriftConfig:output_type -> workflow.plugin.external.iac.DetectDriftConfigResponse + 64, // 128: workflow.plugin.external.iac.ResourceDriver.Create:output_type -> workflow.plugin.external.iac.ResourceCreateResponse + 66, // 129: workflow.plugin.external.iac.ResourceDriver.Read:output_type -> workflow.plugin.external.iac.ResourceReadResponse + 68, // 130: workflow.plugin.external.iac.ResourceDriver.Update:output_type -> 
workflow.plugin.external.iac.ResourceUpdateResponse + 70, // 131: workflow.plugin.external.iac.ResourceDriver.Delete:output_type -> workflow.plugin.external.iac.ResourceDeleteResponse + 72, // 132: workflow.plugin.external.iac.ResourceDriver.Diff:output_type -> workflow.plugin.external.iac.ResourceDiffResponse + 74, // 133: workflow.plugin.external.iac.ResourceDriver.Scale:output_type -> workflow.plugin.external.iac.ResourceScaleResponse + 76, // 134: workflow.plugin.external.iac.ResourceDriver.HealthCheck:output_type -> workflow.plugin.external.iac.ResourceHealthCheckResponse + 78, // 135: workflow.plugin.external.iac.ResourceDriver.SensitiveKeys:output_type -> workflow.plugin.external.iac.SensitiveKeysResponse + 80, // 136: workflow.plugin.external.iac.ResourceDriver.Troubleshoot:output_type -> workflow.plugin.external.iac.TroubleshootResponse + 83, // 137: workflow.plugin.external.iac.IaCStateBackend.GetState:output_type -> workflow.plugin.external.iac.GetStateResponse + 85, // 138: workflow.plugin.external.iac.IaCStateBackend.SaveState:output_type -> workflow.plugin.external.iac.SaveStateResponse + 87, // 139: workflow.plugin.external.iac.IaCStateBackend.ListStates:output_type -> workflow.plugin.external.iac.ListStatesResponse + 89, // 140: workflow.plugin.external.iac.IaCStateBackend.DeleteState:output_type -> workflow.plugin.external.iac.DeleteStateResponse + 91, // 141: workflow.plugin.external.iac.IaCStateBackend.Lock:output_type -> workflow.plugin.external.iac.LockResponse + 93, // 142: workflow.plugin.external.iac.IaCStateBackend.Unlock:output_type -> workflow.plugin.external.iac.UnlockResponse + 109, // [109:143] is the sub-list for method output_type + 75, // [75:109] is the sub-list for method input_type + 75, // [75:75] is the sub-list for extension type_name + 75, // [75:75] is the sub-list for extension extendee + 0, // [0:75] is the sub-list for field type_name } func init() { file_iac_proto_init() } @@ -5162,9 +5896,9 @@ func file_iac_proto_init() 
{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_iac_proto_rawDesc), len(file_iac_proto_rawDesc)), NumEnums: 2, - NumMessages: 87, + NumMessages: 101, NumExtensions: 0, - NumServices: 8, + NumServices: 9, }, GoTypes: file_iac_proto_goTypes, DependencyIndexes: file_iac_proto_depIdxs, diff --git a/plugin/external/proto/iac.proto b/plugin/external/proto/iac.proto index a68c1f13..76fde9a2 100644 --- a/plugin/external/proto/iac.proto +++ b/plugin/external/proto/iac.proto @@ -596,3 +596,51 @@ message TroubleshootRequest { message TroubleshootResponse { repeated Diagnostic diagnostics = 1; } + +// ───────────────────────────────────────────────────────────────────────────── +// IaCStateBackend — strict contract for IaC state storage backends served by a +// plugin sidecar. Maps 1:1 onto module.IaCStateStore (6 methods). Unary RPCs: +// the PR 2 benchmark validated unary transport for 1 MB state blobs against the +// in-process baseline. No lock-lease/TTL field — added additively only once a +// plugin backend implements honored expiry with a conformance test. +// ───────────────────────────────────────────────────────────────────────────── +service IaCStateBackend { + rpc GetState (GetStateRequest) returns (GetStateResponse); + rpc SaveState (SaveStateRequest) returns (SaveStateResponse); + rpc ListStates (ListStatesRequest) returns (ListStatesResponse); + rpc DeleteState(DeleteStateRequest) returns (DeleteStateResponse); + rpc Lock (LockRequest) returns (LockResponse); + rpc Unlock (UnlockRequest) returns (UnlockResponse); +} + +// IaCState mirrors module.IaCState (module/iac_state.go:4-18). The free-form +// Outputs / Config map[string]any fields cross the wire as JSON bytes per the +// iac.proto hard invariant — same pattern as ResourceState.outputs_json. 
+message IaCState { + string resource_id = 1; + string resource_type = 2; + string provider = 3; + string provider_ref = 4; + string provider_id = 5; + string config_hash = 6; + string status = 7; + bytes outputs_json = 8; // JSON-encoded map[string]any (module.IaCState.Outputs) + bytes config_json = 9; // JSON-encoded map[string]any (module.IaCState.Config) + repeated string dependencies = 10; + string created_at = 11; + string updated_at = 12; + string error = 13; +} + +message GetStateRequest { string resource_id = 1; } +message GetStateResponse { IaCState state = 1; bool exists = 2; } +message SaveStateRequest { IaCState state = 1; } // idempotent: full-state replace, last-writer-wins +message SaveStateResponse {} +message ListStatesRequest { map<string, string> filter = 1; } +message ListStatesResponse { repeated IaCState states = 1; } +message DeleteStateRequest { string resource_id = 1; } +message DeleteStateResponse {} +message LockRequest { string resource_id = 1; } +message LockResponse {} +message UnlockRequest { string resource_id = 1; } +message UnlockResponse {} diff --git a/plugin/external/proto/iac_grpc.pb.go b/plugin/external/proto/iac_grpc.pb.go index aaa61c77..12e75a24 100644 --- a/plugin/external/proto/iac_grpc.pb.go +++ b/plugin/external/proto/iac_grpc.pb.go @@ -1647,3 +1647,311 @@ var ResourceDriver_ServiceDesc = grpc.ServiceDesc{ Streams: []grpc.StreamDesc{}, Metadata: "iac.proto", } + +const ( + IaCStateBackend_GetState_FullMethodName = "/workflow.plugin.external.iac.IaCStateBackend/GetState" + IaCStateBackend_SaveState_FullMethodName = "/workflow.plugin.external.iac.IaCStateBackend/SaveState" + IaCStateBackend_ListStates_FullMethodName = "/workflow.plugin.external.iac.IaCStateBackend/ListStates" + IaCStateBackend_DeleteState_FullMethodName = "/workflow.plugin.external.iac.IaCStateBackend/DeleteState" + IaCStateBackend_Lock_FullMethodName = "/workflow.plugin.external.iac.IaCStateBackend/Lock" + IaCStateBackend_Unlock_FullMethodName = 
"/workflow.plugin.external.iac.IaCStateBackend/Unlock" +) + +// IaCStateBackendClient is the client API for IaCStateBackend service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +// +// ───────────────────────────────────────────────────────────────────────────── +// IaCStateBackend — strict contract for IaC state storage backends served by a +// plugin sidecar. Maps 1:1 onto module.IaCStateStore (6 methods). Unary RPCs: +// the PR 2 benchmark validated unary transport for 1 MB state blobs against the +// in-process baseline. No lock-lease/TTL field — added additively only once a +// plugin backend implements honored expiry with a conformance test. +// ───────────────────────────────────────────────────────────────────────────── +type IaCStateBackendClient interface { + GetState(ctx context.Context, in *GetStateRequest, opts ...grpc.CallOption) (*GetStateResponse, error) + SaveState(ctx context.Context, in *SaveStateRequest, opts ...grpc.CallOption) (*SaveStateResponse, error) + ListStates(ctx context.Context, in *ListStatesRequest, opts ...grpc.CallOption) (*ListStatesResponse, error) + DeleteState(ctx context.Context, in *DeleteStateRequest, opts ...grpc.CallOption) (*DeleteStateResponse, error) + Lock(ctx context.Context, in *LockRequest, opts ...grpc.CallOption) (*LockResponse, error) + Unlock(ctx context.Context, in *UnlockRequest, opts ...grpc.CallOption) (*UnlockResponse, error) +} + +type iaCStateBackendClient struct { + cc grpc.ClientConnInterface +} + +func NewIaCStateBackendClient(cc grpc.ClientConnInterface) IaCStateBackendClient { + return &iaCStateBackendClient{cc} +} + +func (c *iaCStateBackendClient) GetState(ctx context.Context, in *GetStateRequest, opts ...grpc.CallOption) (*GetStateResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) 
+ out := new(GetStateResponse) + err := c.cc.Invoke(ctx, IaCStateBackend_GetState_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *iaCStateBackendClient) SaveState(ctx context.Context, in *SaveStateRequest, opts ...grpc.CallOption) (*SaveStateResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(SaveStateResponse) + err := c.cc.Invoke(ctx, IaCStateBackend_SaveState_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *iaCStateBackendClient) ListStates(ctx context.Context, in *ListStatesRequest, opts ...grpc.CallOption) (*ListStatesResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(ListStatesResponse) + err := c.cc.Invoke(ctx, IaCStateBackend_ListStates_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *iaCStateBackendClient) DeleteState(ctx context.Context, in *DeleteStateRequest, opts ...grpc.CallOption) (*DeleteStateResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(DeleteStateResponse) + err := c.cc.Invoke(ctx, IaCStateBackend_DeleteState_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *iaCStateBackendClient) Lock(ctx context.Context, in *LockRequest, opts ...grpc.CallOption) (*LockResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(LockResponse) + err := c.cc.Invoke(ctx, IaCStateBackend_Lock_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *iaCStateBackendClient) Unlock(ctx context.Context, in *UnlockRequest, opts ...grpc.CallOption) (*UnlockResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) 
+ out := new(UnlockResponse) + err := c.cc.Invoke(ctx, IaCStateBackend_Unlock_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +// IaCStateBackendServer is the server API for IaCStateBackend service. +// All implementations must embed UnimplementedIaCStateBackendServer +// for forward compatibility. +// +// ───────────────────────────────────────────────────────────────────────────── +// IaCStateBackend — strict contract for IaC state storage backends served by a +// plugin sidecar. Maps 1:1 onto module.IaCStateStore (6 methods). Unary RPCs: +// the PR 2 benchmark validated unary transport for 1 MB state blobs against the +// in-process baseline. No lock-lease/TTL field — added additively only once a +// plugin backend implements honored expiry with a conformance test. +// ───────────────────────────────────────────────────────────────────────────── +type IaCStateBackendServer interface { + GetState(context.Context, *GetStateRequest) (*GetStateResponse, error) + SaveState(context.Context, *SaveStateRequest) (*SaveStateResponse, error) + ListStates(context.Context, *ListStatesRequest) (*ListStatesResponse, error) + DeleteState(context.Context, *DeleteStateRequest) (*DeleteStateResponse, error) + Lock(context.Context, *LockRequest) (*LockResponse, error) + Unlock(context.Context, *UnlockRequest) (*UnlockResponse, error) + mustEmbedUnimplementedIaCStateBackendServer() +} + +// UnimplementedIaCStateBackendServer must be embedded to have +// forward compatible implementations. +// +// NOTE: this should be embedded by value instead of pointer to avoid a nil +// pointer dereference when methods are called. 
+type UnimplementedIaCStateBackendServer struct{} + +func (UnimplementedIaCStateBackendServer) GetState(context.Context, *GetStateRequest) (*GetStateResponse, error) { + return nil, status.Error(codes.Unimplemented, "method GetState not implemented") +} +func (UnimplementedIaCStateBackendServer) SaveState(context.Context, *SaveStateRequest) (*SaveStateResponse, error) { + return nil, status.Error(codes.Unimplemented, "method SaveState not implemented") +} +func (UnimplementedIaCStateBackendServer) ListStates(context.Context, *ListStatesRequest) (*ListStatesResponse, error) { + return nil, status.Error(codes.Unimplemented, "method ListStates not implemented") +} +func (UnimplementedIaCStateBackendServer) DeleteState(context.Context, *DeleteStateRequest) (*DeleteStateResponse, error) { + return nil, status.Error(codes.Unimplemented, "method DeleteState not implemented") +} +func (UnimplementedIaCStateBackendServer) Lock(context.Context, *LockRequest) (*LockResponse, error) { + return nil, status.Error(codes.Unimplemented, "method Lock not implemented") +} +func (UnimplementedIaCStateBackendServer) Unlock(context.Context, *UnlockRequest) (*UnlockResponse, error) { + return nil, status.Error(codes.Unimplemented, "method Unlock not implemented") +} +func (UnimplementedIaCStateBackendServer) mustEmbedUnimplementedIaCStateBackendServer() {} +func (UnimplementedIaCStateBackendServer) testEmbeddedByValue() {} + +// UnsafeIaCStateBackendServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to IaCStateBackendServer will +// result in compilation errors. +type UnsafeIaCStateBackendServer interface { + mustEmbedUnimplementedIaCStateBackendServer() +} + +func RegisterIaCStateBackendServer(s grpc.ServiceRegistrar, srv IaCStateBackendServer) { + // If the following call panics, it indicates UnimplementedIaCStateBackendServer was + // embedded by pointer and is nil. 
This will cause panics if an + // unimplemented method is ever invoked, so we test this at initialization + // time to prevent it from happening at runtime later due to I/O. + if t, ok := srv.(interface{ testEmbeddedByValue() }); ok { + t.testEmbeddedByValue() + } + s.RegisterService(&IaCStateBackend_ServiceDesc, srv) +} + +func _IaCStateBackend_GetState_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetStateRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(IaCStateBackendServer).GetState(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: IaCStateBackend_GetState_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(IaCStateBackendServer).GetState(ctx, req.(*GetStateRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _IaCStateBackend_SaveState_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(SaveStateRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(IaCStateBackendServer).SaveState(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: IaCStateBackend_SaveState_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(IaCStateBackendServer).SaveState(ctx, req.(*SaveStateRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _IaCStateBackend_ListStates_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(ListStatesRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(IaCStateBackendServer).ListStates(ctx, in) + } + info := 
&grpc.UnaryServerInfo{ + Server: srv, + FullMethod: IaCStateBackend_ListStates_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(IaCStateBackendServer).ListStates(ctx, req.(*ListStatesRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _IaCStateBackend_DeleteState_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(DeleteStateRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(IaCStateBackendServer).DeleteState(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: IaCStateBackend_DeleteState_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(IaCStateBackendServer).DeleteState(ctx, req.(*DeleteStateRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _IaCStateBackend_Lock_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(LockRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(IaCStateBackendServer).Lock(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: IaCStateBackend_Lock_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(IaCStateBackendServer).Lock(ctx, req.(*LockRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _IaCStateBackend_Unlock_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(UnlockRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(IaCStateBackendServer).Unlock(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, 
+ FullMethod: IaCStateBackend_Unlock_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(IaCStateBackendServer).Unlock(ctx, req.(*UnlockRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// IaCStateBackend_ServiceDesc is the grpc.ServiceDesc for IaCStateBackend service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var IaCStateBackend_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "workflow.plugin.external.iac.IaCStateBackend", + HandlerType: (*IaCStateBackendServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "GetState", + Handler: _IaCStateBackend_GetState_Handler, + }, + { + MethodName: "SaveState", + Handler: _IaCStateBackend_SaveState_Handler, + }, + { + MethodName: "ListStates", + Handler: _IaCStateBackend_ListStates_Handler, + }, + { + MethodName: "DeleteState", + Handler: _IaCStateBackend_DeleteState_Handler, + }, + { + MethodName: "Lock", + Handler: _IaCStateBackend_Lock_Handler, + }, + { + MethodName: "Unlock", + Handler: _IaCStateBackend_Unlock_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "iac.proto", +} diff --git a/plugin/external/proto/iac_statebackend_test.go b/plugin/external/proto/iac_statebackend_test.go new file mode 100644 index 00000000..fbb20a4d --- /dev/null +++ b/plugin/external/proto/iac_statebackend_test.go @@ -0,0 +1,23 @@ +package proto + +import "testing" + +// Compile-level guard: the IaCStateBackend service + its messages must exist +// in the generated package with the IaCStateStore-mirroring shape. 
+func TestIaCStateBackendGeneratedTypesExist(t *testing.T) { + var _ IaCStateBackendServer // service interface generated + var _ IaCStateBackendClient // client interface generated + _ = &GetStateRequest{ResourceId: "r"} + _ = &GetStateResponse{Exists: true, State: &IaCState{}} + _ = &SaveStateRequest{State: &IaCState{}} + _ = &ListStatesRequest{Filter: map[string]string{"k": "v"}} + _ = &LockRequest{ResourceId: "r"} + _ = &UnlockRequest{ResourceId: "r"} + // IaCState mirrors module.IaCState; free-form Outputs/Config cross the wire + // as JSON bytes per the iac.proto hard invariant (NO google.protobuf.Struct). + s := &IaCState{ResourceId: "r", ResourceType: "kubernetes", Provider: "azure", + Status: "active", OutputsJson: []byte(`{}`), ConfigJson: []byte(`{}`)} + if s.GetResourceId() != "r" { + t.Fatalf("IaCState.ResourceId accessor missing") + } +} From fc333a00e205062991c6ed5a8c72ce868097b985 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 02:55:05 -0400 Subject: [PATCH 30/39] test(module): add IaCStateBackend gRPC-vs-in-process benchmark harness Drives a ~1 MB synthetic IaCState through Lock/GetState/SaveState/Unlock both in-process (baseline) and over a real bufconn gRPC boundary (post-extraction path). Self-contained (local benchStateToProto + benchStateBackendServer; Task 7 promotes production versions). Feeds the unary-vs-streaming proto-transport decision in the next task. 
--- module/benchmark_iac_state_backend_test.go | 138 +++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 module/benchmark_iac_state_backend_test.go diff --git a/module/benchmark_iac_state_backend_test.go b/module/benchmark_iac_state_backend_test.go new file mode 100644 index 00000000..5502254e --- /dev/null +++ b/module/benchmark_iac_state_backend_test.go @@ -0,0 +1,138 @@ +package module + +import ( + "context" + "encoding/json" + "net" + "strconv" + "strings" + "testing" + + pb "github.com/GoCodeAlone/workflow/plugin/external/proto" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/test/bufconn" +) + +// oneMBState builds an IaCState whose JSON payload is ~1 MB (Outputs map padded). +func oneMBState() *IaCState { + big := strings.Repeat("x", 1024) + outputs := make(map[string]any, 1024) + for i := 0; i < 1024; i++ { + outputs["k"+strconv.Itoa(i)] = big + } + return &IaCState{ + ResourceID: "bench-resource", ResourceType: "kubernetes", Provider: "azure", + Status: "active", Outputs: outputs, Config: map[string]any{"size": "large"}, + CreatedAt: "2026-05-14T00:00:00Z", UpdatedAt: "2026-05-14T00:00:00Z", + } +} + +// benchStateToProto — local, self-contained IaCState -> pb.IaCState converter. +// Task 7 replaces this with the production iacStateToProto. +func benchStateToProto(s *IaCState) *pb.IaCState { + outJSON, _ := json.Marshal(s.Outputs) + cfgJSON, _ := json.Marshal(s.Config) + return &pb.IaCState{ + ResourceId: s.ResourceID, ResourceType: s.ResourceType, Provider: s.Provider, + Status: s.Status, OutputsJson: outJSON, ConfigJson: cfgJSON, + CreatedAt: s.CreatedAt, UpdatedAt: s.UpdatedAt, + } +} + +// benchStateBackendServer wraps an IaCStateStore behind pb.IaCStateBackendServer. +// Task 7 promotes this to the production iacStateBackendServer. 
+type benchStateBackendServer struct { + pb.UnimplementedIaCStateBackendServer + store IaCStateStore +} + +func (s *benchStateBackendServer) GetState(_ context.Context, r *pb.GetStateRequest) (*pb.GetStateResponse, error) { + st, err := s.store.GetState(r.ResourceId) + if err != nil { + return nil, err + } + if st == nil { + return &pb.GetStateResponse{Exists: false}, nil + } + return &pb.GetStateResponse{Exists: true, State: benchStateToProto(st)}, nil +} +func (s *benchStateBackendServer) SaveState(_ context.Context, r *pb.SaveStateRequest) (*pb.SaveStateResponse, error) { + var outputs, config map[string]any + _ = json.Unmarshal(r.State.OutputsJson, &outputs) + _ = json.Unmarshal(r.State.ConfigJson, &config) + return &pb.SaveStateResponse{}, s.store.SaveState(&IaCState{ + ResourceID: r.State.ResourceId, ResourceType: r.State.ResourceType, + Provider: r.State.Provider, Status: r.State.Status, Outputs: outputs, Config: config, + }) +} +func (s *benchStateBackendServer) Lock(_ context.Context, r *pb.LockRequest) (*pb.LockResponse, error) { + return &pb.LockResponse{}, s.store.Lock(r.ResourceId) +} +func (s *benchStateBackendServer) Unlock(_ context.Context, r *pb.UnlockRequest) (*pb.UnlockResponse, error) { + return &pb.UnlockResponse{}, s.store.Unlock(r.ResourceId) +} +func (s *benchStateBackendServer) ListStates(_ context.Context, _ *pb.ListStatesRequest) (*pb.ListStatesResponse, error) { + return &pb.ListStatesResponse{}, nil +} +func (s *benchStateBackendServer) DeleteState(_ context.Context, r *pb.DeleteStateRequest) (*pb.DeleteStateResponse, error) { + return &pb.DeleteStateResponse{}, s.store.DeleteState(r.ResourceId) +} + +// BenchmarkIaCStateBackend_InProcess is the baseline: direct IaCStateStore calls. 
+func BenchmarkIaCStateBackend_InProcess(b *testing.B) { + store := NewMemoryIaCStateStore() + st := oneMBState() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if err := store.Lock(st.ResourceID); err != nil { + b.Fatal(err) + } + if _, err := store.GetState(st.ResourceID); err != nil { + b.Fatal(err) + } + if err := store.SaveState(st); err != nil { + b.Fatal(err) + } + if err := store.Unlock(st.ResourceID); err != nil { + b.Fatal(err) + } + } +} + +// BenchmarkIaCStateBackend_GRPC is the post-extraction path: same store, same +// cycle, but every call crosses a real (in-memory bufconn) gRPC boundary. +func BenchmarkIaCStateBackend_GRPC(b *testing.B) { + lis := bufconn.Listen(4 << 20) // 4 MiB — gRPC default message cap + srv := grpc.NewServer() + pb.RegisterIaCStateBackendServer(srv, &benchStateBackendServer{store: NewMemoryIaCStateStore()}) + go func() { _ = srv.Serve(lis) }() + defer srv.Stop() + + conn, err := grpc.NewClient("passthrough:///bufnet", + grpc.WithContextDialer(func(ctx context.Context, _ string) (net.Conn, error) { return lis.DialContext(ctx) }), + grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + b.Fatal(err) + } + defer conn.Close() + client := pb.NewIaCStateBackendClient(conn) + st := oneMBState() + pbState := benchStateToProto(st) + ctx := context.Background() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := client.Lock(ctx, &pb.LockRequest{ResourceId: st.ResourceID}); err != nil { + b.Fatal(err) + } + if _, err := client.GetState(ctx, &pb.GetStateRequest{ResourceId: st.ResourceID}); err != nil { + b.Fatal(err) + } + if _, err := client.SaveState(ctx, &pb.SaveStateRequest{State: pbState}); err != nil { + b.Fatal(err) + } + if _, err := client.Unlock(ctx, &pb.UnlockRequest{ResourceId: st.ResourceID}); err != nil { + b.Fatal(err) + } + } +} From 7da06cf984a723681220777df5e22c761d009b66 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 09:24:20 -0400 Subject: [PATCH 31/39] test(wftest): 
add IaCStateBackend to iacServiceChecks coverage table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Task 4 added the IaCStateBackend service to iac.proto but missed the corresponding iacServiceChecks row in wftest/bdd/strict_iac.go. TestIaCServiceChecks_CoversEveryProtoService enforces parity between iac.proto's services and that table — it was failing on the missing entry. Belongs with PR 2 (the proto PR). --- wftest/bdd/strict_iac.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wftest/bdd/strict_iac.go b/wftest/bdd/strict_iac.go index b811e36c..3bbf3ad9 100644 --- a/wftest/bdd/strict_iac.go +++ b/wftest/bdd/strict_iac.go @@ -69,6 +69,10 @@ var iacServiceChecks = []iacServiceCheck{ _, ok := p.(pb.ResourceDriverServer) return ok }}, + {"workflow.plugin.external.iac.IaCStateBackend", func(p any) bool { + _, ok := p.(pb.IaCStateBackendServer) + return ok + }}, } // AssertProviderCapabilitiesMatchRegistration asserts that every typed From 4aab57b68f57901011dd8e978321577893c5bdd0 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 06:08:30 -0400 Subject: [PATCH 32/39] feat(module): IaCState proto converters + grpcIaCStateStore client adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit grpcIaCStateStore implements module.IaCStateStore over an IaCStateBackendClient — the host-side half of the new contract. iacStateToProto/iacStateFromProto convert the free-form Outputs/Config maps via encoding/json (no structpb — iac.proto hard invariant). iacStateBackendServer is the production server type. Promotes these out of the benchmark file so one canonical copy is shared. 
--- module/benchmark_iac_state_backend_test.go | 59 +---- module/iac_state_grpc_client.go | 248 +++++++++++++++++++++ module/iac_state_grpc_client_test.go | 53 +++++ 3 files changed, 306 insertions(+), 54 deletions(-) create mode 100644 module/iac_state_grpc_client.go create mode 100644 module/iac_state_grpc_client_test.go diff --git a/module/benchmark_iac_state_backend_test.go b/module/benchmark_iac_state_backend_test.go index 5502254e..6659694c 100644 --- a/module/benchmark_iac_state_backend_test.go +++ b/module/benchmark_iac_state_backend_test.go @@ -2,7 +2,6 @@ package module import ( "context" - "encoding/json" "net" "strconv" "strings" @@ -28,57 +27,6 @@ func oneMBState() *IaCState { } } -// benchStateToProto — local, self-contained IaCState -> pb.IaCState converter. -// Task 7 replaces this with the production iacStateToProto. -func benchStateToProto(s *IaCState) *pb.IaCState { - outJSON, _ := json.Marshal(s.Outputs) - cfgJSON, _ := json.Marshal(s.Config) - return &pb.IaCState{ - ResourceId: s.ResourceID, ResourceType: s.ResourceType, Provider: s.Provider, - Status: s.Status, OutputsJson: outJSON, ConfigJson: cfgJSON, - CreatedAt: s.CreatedAt, UpdatedAt: s.UpdatedAt, - } -} - -// benchStateBackendServer wraps an IaCStateStore behind pb.IaCStateBackendServer. -// Task 7 promotes this to the production iacStateBackendServer. 
-type benchStateBackendServer struct { - pb.UnimplementedIaCStateBackendServer - store IaCStateStore -} - -func (s *benchStateBackendServer) GetState(_ context.Context, r *pb.GetStateRequest) (*pb.GetStateResponse, error) { - st, err := s.store.GetState(r.ResourceId) - if err != nil { - return nil, err - } - if st == nil { - return &pb.GetStateResponse{Exists: false}, nil - } - return &pb.GetStateResponse{Exists: true, State: benchStateToProto(st)}, nil -} -func (s *benchStateBackendServer) SaveState(_ context.Context, r *pb.SaveStateRequest) (*pb.SaveStateResponse, error) { - var outputs, config map[string]any - _ = json.Unmarshal(r.State.OutputsJson, &outputs) - _ = json.Unmarshal(r.State.ConfigJson, &config) - return &pb.SaveStateResponse{}, s.store.SaveState(&IaCState{ - ResourceID: r.State.ResourceId, ResourceType: r.State.ResourceType, - Provider: r.State.Provider, Status: r.State.Status, Outputs: outputs, Config: config, - }) -} -func (s *benchStateBackendServer) Lock(_ context.Context, r *pb.LockRequest) (*pb.LockResponse, error) { - return &pb.LockResponse{}, s.store.Lock(r.ResourceId) -} -func (s *benchStateBackendServer) Unlock(_ context.Context, r *pb.UnlockRequest) (*pb.UnlockResponse, error) { - return &pb.UnlockResponse{}, s.store.Unlock(r.ResourceId) -} -func (s *benchStateBackendServer) ListStates(_ context.Context, _ *pb.ListStatesRequest) (*pb.ListStatesResponse, error) { - return &pb.ListStatesResponse{}, nil -} -func (s *benchStateBackendServer) DeleteState(_ context.Context, r *pb.DeleteStateRequest) (*pb.DeleteStateResponse, error) { - return &pb.DeleteStateResponse{}, s.store.DeleteState(r.ResourceId) -} - // BenchmarkIaCStateBackend_InProcess is the baseline: direct IaCStateStore calls. 
func BenchmarkIaCStateBackend_InProcess(b *testing.B) { store := NewMemoryIaCStateStore() @@ -105,7 +53,7 @@ func BenchmarkIaCStateBackend_InProcess(b *testing.B) { func BenchmarkIaCStateBackend_GRPC(b *testing.B) { lis := bufconn.Listen(4 << 20) // 4 MiB — gRPC default message cap srv := grpc.NewServer() - pb.RegisterIaCStateBackendServer(srv, &benchStateBackendServer{store: NewMemoryIaCStateStore()}) + pb.RegisterIaCStateBackendServer(srv, &iacStateBackendServer{store: NewMemoryIaCStateStore()}) go func() { _ = srv.Serve(lis) }() defer srv.Stop() @@ -118,7 +66,10 @@ func BenchmarkIaCStateBackend_GRPC(b *testing.B) { defer conn.Close() client := pb.NewIaCStateBackendClient(conn) st := oneMBState() - pbState := benchStateToProto(st) + pbState, err := iacStateToProto(st) + if err != nil { + b.Fatal(err) + } ctx := context.Background() b.ResetTimer() for i := 0; i < b.N; i++ { diff --git a/module/iac_state_grpc_client.go b/module/iac_state_grpc_client.go new file mode 100644 index 00000000..afe70199 --- /dev/null +++ b/module/iac_state_grpc_client.go @@ -0,0 +1,248 @@ +package module + +import ( + "context" + "encoding/json" + + pb "github.com/GoCodeAlone/workflow/plugin/external/proto" +) + +// ───────────────────────────────────────────────────────────────────────────── +// IaCState ⇄ pb.IaCState converters. +// +// The free-form Outputs / Config map[string]any fields cross the wire as JSON +// bytes — the iac.proto hard invariant (iac.proto:6-10) forbids +// google.protobuf.Struct. The plugin/host owns json.Marshal/Unmarshal directly. +// ───────────────────────────────────────────────────────────────────────────── + +// iacStateToProto converts a module IaCState into its proto wire form. 
+func iacStateToProto(s *IaCState) (*pb.IaCState, error) { + if s == nil { + return nil, nil + } + outputsJSON, err := json.Marshal(s.Outputs) + if err != nil { + return nil, err + } + configJSON, err := json.Marshal(s.Config) + if err != nil { + return nil, err + } + return &pb.IaCState{ + ResourceId: s.ResourceID, + ResourceType: s.ResourceType, + Provider: s.Provider, + ProviderRef: s.ProviderRef, + ProviderId: s.ProviderID, + ConfigHash: s.ConfigHash, + Status: s.Status, + OutputsJson: outputsJSON, + ConfigJson: configJSON, + Dependencies: s.Dependencies, + CreatedAt: s.CreatedAt, + UpdatedAt: s.UpdatedAt, + Error: s.Error, + }, nil +} + +// iacStateFromProto converts a proto IaCState back into a module IaCState. +// +// Empty / "null" / "{}" JSON byte payloads decode to a nil map (not an empty +// non-nil map) so round-trips through a nil Outputs/Config stay clean. +func iacStateFromProto(p *pb.IaCState) (*IaCState, error) { + if p == nil { + return nil, nil + } + outputs, err := jsonBytesToMap(p.OutputsJson) + if err != nil { + return nil, err + } + config, err := jsonBytesToMap(p.ConfigJson) + if err != nil { + return nil, err + } + return &IaCState{ + ResourceID: p.ResourceId, + ResourceType: p.ResourceType, + Provider: p.Provider, + ProviderRef: p.ProviderRef, + ProviderID: p.ProviderId, + ConfigHash: p.ConfigHash, + Status: p.Status, + Outputs: outputs, + Config: config, + Dependencies: p.Dependencies, + CreatedAt: p.CreatedAt, + UpdatedAt: p.UpdatedAt, + Error: p.Error, + }, nil +} + +// jsonBytesToMap decodes JSON bytes into a map[string]any. Empty, "null" and +// "{}" inputs yield a nil map. 
+func jsonBytesToMap(b []byte) (map[string]any, error) { + s := string(b) + if len(b) == 0 || s == "null" || s == "{}" { + return nil, nil + } + var m map[string]any + if err := json.Unmarshal(b, &m); err != nil { + return nil, err + } + return m, nil +} + +// ───────────────────────────────────────────────────────────────────────────── +// grpcIaCStateStore — host-side IaCStateStore implemented over an +// IaCStateBackendClient. The host half of the strict IaCStateBackend contract. +// ───────────────────────────────────────────────────────────────────────────── + +// grpcIaCStateStore adapts a pb.IaCStateBackendClient to module.IaCStateStore. +type grpcIaCStateStore struct { + client pb.IaCStateBackendClient +} + +// newGRPCIaCStateStore wraps an IaCStateBackendClient as an IaCStateStore. +func newGRPCIaCStateStore(c pb.IaCStateBackendClient) *grpcIaCStateStore { + return &grpcIaCStateStore{client: c} +} + +// GetState retrieves a state record by resource ID. Returns nil, nil when the +// backend reports the record does not exist. +func (s *grpcIaCStateStore) GetState(resourceID string) (*IaCState, error) { + resp, err := s.client.GetState(context.Background(), &pb.GetStateRequest{ResourceId: resourceID}) + if err != nil { + return nil, err + } + if !resp.Exists { + return nil, nil + } + return iacStateFromProto(resp.State) +} + +// SaveState inserts or replaces a state record. +func (s *grpcIaCStateStore) SaveState(state *IaCState) error { + pbState, err := iacStateToProto(state) + if err != nil { + return err + } + _, err = s.client.SaveState(context.Background(), &pb.SaveStateRequest{State: pbState}) + return err +} + +// ListStates returns all state records matching the provided key=value filter. 
+func (s *grpcIaCStateStore) ListStates(filter map[string]string) ([]*IaCState, error) { + resp, err := s.client.ListStates(context.Background(), &pb.ListStatesRequest{Filter: filter}) + if err != nil { + return nil, err + } + states := make([]*IaCState, 0, len(resp.States)) + for _, p := range resp.States { + st, convErr := iacStateFromProto(p) + if convErr != nil { + return nil, convErr + } + states = append(states, st) + } + return states, nil +} + +// DeleteState removes a state record by resource ID. +func (s *grpcIaCStateStore) DeleteState(resourceID string) error { + _, err := s.client.DeleteState(context.Background(), &pb.DeleteStateRequest{ResourceId: resourceID}) + return err +} + +// Lock acquires an exclusive lock for the given resource ID. +func (s *grpcIaCStateStore) Lock(resourceID string) error { + _, err := s.client.Lock(context.Background(), &pb.LockRequest{ResourceId: resourceID}) + return err +} + +// Unlock releases the lock for the given resource ID. +func (s *grpcIaCStateStore) Unlock(resourceID string) error { + _, err := s.client.Unlock(context.Background(), &pb.UnlockRequest{ResourceId: resourceID}) + return err +} + +// ───────────────────────────────────────────────────────────────────────────── +// iacStateBackendServer — production pb.IaCStateBackendServer that delegates to +// any module.IaCStateStore. The plugin-side half of the contract. +// ───────────────────────────────────────────────────────────────────────────── + +// iacStateBackendServer serves an IaCStateStore over the IaCStateBackend gRPC +// contract. +type iacStateBackendServer struct { + pb.UnimplementedIaCStateBackendServer + store IaCStateStore +} + +// GetState delegates to the backing store, mapping a not-found (nil) result to +// GetStateResponse{Exists: false}. 
+func (s *iacStateBackendServer) GetState(_ context.Context, r *pb.GetStateRequest) (*pb.GetStateResponse, error) { + st, err := s.store.GetState(r.ResourceId) + if err != nil { + return nil, err + } + if st == nil { + return &pb.GetStateResponse{Exists: false}, nil + } + pbState, err := iacStateToProto(st) + if err != nil { + return nil, err + } + return &pb.GetStateResponse{Exists: true, State: pbState}, nil +} + +// SaveState delegates a full-state replace to the backing store. +func (s *iacStateBackendServer) SaveState(_ context.Context, r *pb.SaveStateRequest) (*pb.SaveStateResponse, error) { + st, err := iacStateFromProto(r.State) + if err != nil { + return nil, err + } + if err := s.store.SaveState(st); err != nil { + return nil, err + } + return &pb.SaveStateResponse{}, nil +} + +// ListStates delegates a filtered listing to the backing store. +func (s *iacStateBackendServer) ListStates(_ context.Context, r *pb.ListStatesRequest) (*pb.ListStatesResponse, error) { + states, err := s.store.ListStates(r.Filter) + if err != nil { + return nil, err + } + pbStates := make([]*pb.IaCState, 0, len(states)) + for _, st := range states { + pbState, convErr := iacStateToProto(st) + if convErr != nil { + return nil, convErr + } + pbStates = append(pbStates, pbState) + } + return &pb.ListStatesResponse{States: pbStates}, nil +} + +// DeleteState delegates a delete-by-ID to the backing store. +func (s *iacStateBackendServer) DeleteState(_ context.Context, r *pb.DeleteStateRequest) (*pb.DeleteStateResponse, error) { + if err := s.store.DeleteState(r.ResourceId); err != nil { + return nil, err + } + return &pb.DeleteStateResponse{}, nil +} + +// Lock delegates lock acquisition to the backing store. +func (s *iacStateBackendServer) Lock(_ context.Context, r *pb.LockRequest) (*pb.LockResponse, error) { + if err := s.store.Lock(r.ResourceId); err != nil { + return nil, err + } + return &pb.LockResponse{}, nil +} + +// Unlock delegates lock release to the backing store. 
+func (s *iacStateBackendServer) Unlock(_ context.Context, r *pb.UnlockRequest) (*pb.UnlockResponse, error) { + if err := s.store.Unlock(r.ResourceId); err != nil { + return nil, err + } + return &pb.UnlockResponse{}, nil +} diff --git a/module/iac_state_grpc_client_test.go b/module/iac_state_grpc_client_test.go new file mode 100644 index 00000000..41fa3819 --- /dev/null +++ b/module/iac_state_grpc_client_test.go @@ -0,0 +1,53 @@ +package module + +import ( + "context" + "net" + "testing" + + pb "github.com/GoCodeAlone/workflow/plugin/external/proto" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/test/bufconn" +) + +func TestGRPCIaCStateStoreRoundTrip(t *testing.T) { + lis := bufconn.Listen(4 << 20) + srv := grpc.NewServer() + pb.RegisterIaCStateBackendServer(srv, &iacStateBackendServer{store: NewMemoryIaCStateStore()}) + go func() { _ = srv.Serve(lis) }() + defer srv.Stop() + + conn, err := grpc.NewClient("passthrough:///bufnet", + grpc.WithContextDialer(func(ctx context.Context, _ string) (net.Conn, error) { return lis.DialContext(ctx) }), + grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + t.Fatal(err) + } + defer conn.Close() + + var store IaCStateStore = newGRPCIaCStateStore(pb.NewIaCStateBackendClient(conn)) + + want := &IaCState{ResourceID: "r1", ResourceType: "kubernetes", Provider: "azure", Status: "active", + Outputs: map[string]any{"endpoint": "https://x"}, Config: map[string]any{"size": "L"}} + if err := store.SaveState(want); err != nil { + t.Fatalf("SaveState: %v", err) + } + got, err := store.GetState("r1") + if err != nil || got == nil { + t.Fatalf("GetState: %v (got=%v)", err, got) + } + if got.ResourceID != "r1" || got.Status != "active" || got.Outputs["endpoint"] != "https://x" { + t.Fatalf("round-trip mismatch: %+v", got) + } + if err := store.Lock("r1"); err != nil { + t.Fatalf("Lock: %v", err) + } + missing, err := store.GetState("nope") + if err != nil || 
missing != nil { + t.Fatalf("GetState(missing) should be nil,nil — got %v,%v", missing, err) + } + if err := store.Unlock("r1"); err != nil { + t.Fatalf("Unlock: %v", err) + } +} From abd526ff9b9ecfd37fba7439d17b8bd84284b255 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 06:11:44 -0400 Subject: [PATCH 33/39] docs(module): note context.Background() follow-up on grpcIaCStateStore Code-review Minor: the spec asked for the hardcoded context.Background() to be acknowledged as a known follow-up (IaCStateStore has no ctx param) rather than silently used. --- module/iac_state_grpc_client.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/module/iac_state_grpc_client.go b/module/iac_state_grpc_client.go index afe70199..999ac6b7 100644 --- a/module/iac_state_grpc_client.go +++ b/module/iac_state_grpc_client.go @@ -98,6 +98,11 @@ func jsonBytesToMap(b []byte) (map[string]any, error) { // ───────────────────────────────────────────────────────────────────────────── // grpcIaCStateStore adapts a pb.IaCStateBackendClient to module.IaCStateStore. +// +// All six methods call the backend with context.Background(): the +// module.IaCStateStore interface has no ctx parameter, so there is no caller +// context to plumb today. Threading a real context through IaCStateStore is a +// known follow-up, out of scope for this extraction. type grpcIaCStateStore struct { client pb.IaCStateBackendClient } From 774a3bd78cf943b75c486258e40a97e9b4bfd7a3 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 06:14:52 -0400 Subject: [PATCH 34/39] feat(module): engine-side iac.state plugin-backend registry + dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A package-level iacStateBackendRegistry maps a backend name to a pb.IaCStateBackendClient; the engine populates it at plugin-load time (Task 14). 
IaCModule.Init()'s switch gains a default arm that resolves non-core backend names from the registry, constructing a grpcIaCStateStore. Reserved core names (memory/filesystem/postgres) are rejected at registration. The existing in-process backend cases (incl. azure_blob) are untouched here — the plumbing exists and is tested; PR 5 flips azure_blob onto it. --- module/iac_module.go | 9 ++- module/iac_state_plugin_registry.go | 64 ++++++++++++++++++++ module/iac_state_plugin_registry_test.go | 76 ++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 module/iac_state_plugin_registry.go create mode 100644 module/iac_state_plugin_registry_test.go diff --git a/module/iac_module.go b/module/iac_module.go index b857c9a0..ebc8090f 100644 --- a/module/iac_module.go +++ b/module/iac_module.go @@ -115,7 +115,14 @@ func (m *IaCModule) Init(app modular.Application) error { } m.store = store default: - return fmt.Errorf("iac.state %q: unsupported backend %q (use 'memory', 'filesystem', 'spaces', 'gcs', 'azure_blob', or 'postgres')", m.name, m.backend) + // Not a core in-process backend — consult the plugin-backend registry. + // The engine populates iacStateBackendRegistryInstance at plugin-load + // time; a resolved backend is served over gRPC via grpcIaCStateStore. 
+ if client, ok := iacStateBackendRegistryInstance.resolve(m.backend); ok { + m.store = newGRPCIaCStateStore(client) + break + } + return fmt.Errorf("iac.state %q: unsupported backend %q (use 'memory', 'filesystem', 'spaces', 'gcs', 'azure_blob', or 'postgres', or load the plugin that provides it)", m.name, m.backend) } return app.RegisterService(m.name, m.store) diff --git a/module/iac_state_plugin_registry.go b/module/iac_state_plugin_registry.go new file mode 100644 index 00000000..d089df44 --- /dev/null +++ b/module/iac_state_plugin_registry.go @@ -0,0 +1,64 @@ +package module + +import ( + "fmt" + "sync" + + pb "github.com/GoCodeAlone/workflow/plugin/external/proto" +) + +// ───────────────────────────────────────────────────────────────────────────── +// iacStateBackendRegistry — engine-side registry mapping an iac.state backend +// name to a plugin-served pb.IaCStateBackendClient. +// +// The engine populates the package-level singleton at plugin-load time +// (Task 14); IaCModule.Init consults it for any backend name not handled by an +// in-process core case. Reserved core backend names (memory/filesystem/postgres) +// — the backends that have no cloud SDK and stay in core — cannot be claimed by +// a plugin. +// ───────────────────────────────────────────────────────────────────────────── + +// reservedIaCStateBackends are the core backend names a plugin may never claim. +var reservedIaCStateBackends = map[string]struct{}{ + "memory": {}, + "filesystem": {}, + "postgres": {}, +} + +// iacStateBackendRegistry maps a backend name to a plugin gRPC client. +type iacStateBackendRegistry struct { + mu sync.RWMutex + clients map[string]pb.IaCStateBackendClient +} + +// newIaCStateBackendRegistry constructs an empty registry. +func newIaCStateBackendRegistry() *iacStateBackendRegistry { + return &iacStateBackendRegistry{clients: make(map[string]pb.IaCStateBackendClient)} +} + +// register associates a backend name with a plugin client. 
Reserved core +// backend names are rejected. Re-registering a non-reserved name overwrites the +// previous client (last plugin loaded wins). +func (r *iacStateBackendRegistry) register(name string, client pb.IaCStateBackendClient) error { + if _, reserved := reservedIaCStateBackends[name]; reserved { + return fmt.Errorf("plugin registered reserved iac.state backend name %q", name) + } + r.mu.Lock() + defer r.mu.Unlock() + r.clients[name] = client + return nil +} + +// resolve returns the plugin client for a backend name, and whether one is +// registered. +func (r *iacStateBackendRegistry) resolve(name string) (pb.IaCStateBackendClient, bool) { + r.mu.RLock() + defer r.mu.RUnlock() + c, ok := r.clients[name] + return c, ok +} + +// iacStateBackendRegistryInstance is the package-level singleton the engine +// populates and IaCModule.Init consults. Task 14 adds an exported +// RegisterIaCStateBackend wrapper around it. +var iacStateBackendRegistryInstance = newIaCStateBackendRegistry() diff --git a/module/iac_state_plugin_registry_test.go b/module/iac_state_plugin_registry_test.go new file mode 100644 index 00000000..b35be00f --- /dev/null +++ b/module/iac_state_plugin_registry_test.go @@ -0,0 +1,76 @@ +package module + +import ( + "context" + "testing" + + pb "github.com/GoCodeAlone/workflow/plugin/external/proto" + "google.golang.org/grpc" +) + +// fakeStateBackendClient is a no-op pb.IaCStateBackendClient stub. The registry +// tests only need it to satisfy the interface; no method is ever called. 
+type fakeStateBackendClient struct{} + +func (*fakeStateBackendClient) GetState(context.Context, *pb.GetStateRequest, ...grpc.CallOption) (*pb.GetStateResponse, error) { + return nil, nil +} +func (*fakeStateBackendClient) SaveState(context.Context, *pb.SaveStateRequest, ...grpc.CallOption) (*pb.SaveStateResponse, error) { + return nil, nil +} +func (*fakeStateBackendClient) ListStates(context.Context, *pb.ListStatesRequest, ...grpc.CallOption) (*pb.ListStatesResponse, error) { + return nil, nil +} +func (*fakeStateBackendClient) DeleteState(context.Context, *pb.DeleteStateRequest, ...grpc.CallOption) (*pb.DeleteStateResponse, error) { + return nil, nil +} +func (*fakeStateBackendClient) Lock(context.Context, *pb.LockRequest, ...grpc.CallOption) (*pb.LockResponse, error) { + return nil, nil +} +func (*fakeStateBackendClient) Unlock(context.Context, *pb.UnlockRequest, ...grpc.CallOption) (*pb.UnlockResponse, error) { + return nil, nil +} + +func TestIaCStateBackendRegistry(t *testing.T) { + reg := newIaCStateBackendRegistry() + if _, ok := reg.resolve("azure_blob"); ok { + t.Fatal("empty registry should not resolve azure_blob") + } + fake := &fakeStateBackendClient{} + if err := reg.register("azure_blob", fake); err != nil { + t.Fatalf("register: %v", err) + } + got, ok := reg.resolve("azure_blob") + if !ok || got != fake { + t.Fatalf("resolve azure_blob: ok=%v got=%v", ok, got) + } + for _, reserved := range []string{"memory", "filesystem", "postgres"} { + if err := reg.register(reserved, fake); err == nil { + t.Fatalf("register(%q) must fail — reserved core backend name", reserved) + } + } +} + +// TestIaCModule_PluginBackendDispatch exercises the real IaCModule.Init() path: +// a backend name no in-process switch case matches is resolved from the +// package-level iacStateBackendRegistryInstance, yielding a *grpcIaCStateStore. 
+func TestIaCModule_PluginBackendDispatch(t *testing.T) { + const backend = "azure_blob_test_only" + fake := &fakeStateBackendClient{} + if err := iacStateBackendRegistryInstance.register(backend, fake); err != nil { + t.Fatalf("register: %v", err) + } + defer func() { + iacStateBackendRegistryInstance.mu.Lock() + delete(iacStateBackendRegistryInstance.clients, backend) + iacStateBackendRegistryInstance.mu.Unlock() + }() + + m := NewIaCModule("iac-plugin", map[string]any{"backend": backend}) + if err := m.Init(NewMockApplication()); err != nil { + t.Fatalf("Init: %v", err) + } + if _, ok := m.store.(*grpcIaCStateStore); !ok { + t.Fatalf("m.store is %T, want *grpcIaCStateStore", m.store) + } +} From 3589a9922b5777fb4f14a86316c8033e4de43c3e Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 07:00:52 -0400 Subject: [PATCH 35/39] feat(module): exempt *_ref keys from redaction; lock in credentials: redaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Option-1 credentials move raw cloud secrets inline into plugin-native module config under a credentials: key — already redacted wholesale by the existing 'credential' pattern (regression test added). But that same pattern over-redacts credentials_ref:, which holds a module NAME, not a secret. Adds a narrow *_ref-suffix exemption to isSensitiveField so reference keys are preserved for trace debuggability. 
--- module/step_output_redactor.go | 5 +++++ module/step_output_redactor_test.go | 25 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/module/step_output_redactor.go b/module/step_output_redactor.go index aded40b4..77c5852b 100644 --- a/module/step_output_redactor.go +++ b/module/step_output_redactor.go @@ -64,6 +64,11 @@ func isSensitiveField(name string, patterns []string) bool { if strings.HasSuffix(lower, safeFieldSuffix) { return false } + // Reference keys hold module/resource NAMES, not secrets — never redact them, + // even though "credentials_ref" contains the "credential" substring. + if strings.HasSuffix(lower, "_ref") { + return false + } for _, p := range patterns { if strings.Contains(lower, p) { return true diff --git a/module/step_output_redactor_test.go b/module/step_output_redactor_test.go index 1e2d3894..4019744c 100644 --- a/module/step_output_redactor_test.go +++ b/module/step_output_redactor_test.go @@ -168,3 +168,28 @@ func TestRedactStepOutput_EmptyMap(t *testing.T) { t.Errorf("empty map should return empty map, got %v", got) } } + +func TestRedactCredentialsBlock(t *testing.T) { + in := map[string]any{ + "credentials": map[string]any{ + "accessKey": "AKIAEXAMPLE", + "secretKey": "supersecret", + }, + "credentials_ref": "aws-creds-module", + "bucket": "public-bucket-name", + } + out := RedactStepOutput(in) + // The credentials: block is redacted WHOLESALE — the existing "credential" + // pattern replaces the whole sub-tree with the placeholder STRING (no + // recursion). That is safe and is the design-sanctioned "already covered". + if out["credentials"] != RedactionPlaceholder { + t.Fatalf("credentials block must be wholesale-redacted, got: %#v", out["credentials"]) + } + // credentials_ref is a module NAME, not a secret — must be PRESERVED. 
+ if out["credentials_ref"] != "aws-creds-module" { + t.Fatalf("credentials_ref must NOT be redacted (it is a module reference): %#v", out["credentials_ref"]) + } + if out["bucket"] != "public-bucket-name" { + t.Fatalf("non-sensitive field wrongly redacted") + } +} From f613432f0d18454514836ed66427e227ccf0c2c5 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 07:03:05 -0400 Subject: [PATCH 36/39] refactor(module): name the _ref redaction-exemption suffix as a const Code-review Minor: refFieldSuffix const for consistency with the existing safeFieldSuffix (_display) exemption. --- module/step_output_redactor.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/module/step_output_redactor.go b/module/step_output_redactor.go index 77c5852b..8c9030b4 100644 --- a/module/step_output_redactor.go +++ b/module/step_output_redactor.go @@ -24,6 +24,11 @@ const RedactionPlaceholder = "[REDACTED]" // safeFieldSuffix marks a field as explicitly safe and exempt from redaction. const safeFieldSuffix = "_display" +// refFieldSuffix marks a field as a reference (a module/resource name, not a +// secret value) — exempt from redaction even if its name contains a sensitive +// substring (e.g. "credentials_ref" contains "credential"). +const refFieldSuffix = "_ref" + // RedactStepOutput recursively scans output and replaces values of sensitive // fields with RedactionPlaceholder. Field names are matched case-insensitively // against SensitiveFieldPatterns. Fields ending with "_display" are never @@ -66,7 +71,7 @@ func isSensitiveField(name string, patterns []string) bool { } // Reference keys hold module/resource NAMES, not secrets — never redact them, // even though "credentials_ref" contains the "credential" substring. 
- if strings.HasSuffix(lower, "_ref") { + if strings.HasSuffix(lower, refFieldSuffix) { return false } for _, p := range patterns { From 35642c091e5f2cd7b75bf9e377b41ac68d352a39 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 07:04:09 -0400 Subject: [PATCH 37/39] test(plugin/external): guard against gRPC body-logging interceptors CreateModule requests carry inline credentials: blocks (Option-1 credentials model). This guard fails CI if any plugin/external/ file gains a gRPC interceptor option, forcing a reviewer to confirm it cannot log request bodies. Implements the cloud-sdk-extraction design's Security guard-test requirement. --- plugin/external/grpc_logging_guard_test.go | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 plugin/external/grpc_logging_guard_test.go diff --git a/plugin/external/grpc_logging_guard_test.go b/plugin/external/grpc_logging_guard_test.go new file mode 100644 index 00000000..77d70b6d --- /dev/null +++ b/plugin/external/grpc_logging_guard_test.go @@ -0,0 +1,37 @@ +package external + +import ( + "os" + "regexp" + "testing" +) + +// The plugin SDK must NOT install a gRPC interceptor that logs request bodies — +// CreateModule requests carry inline credentials: blocks. This test fails if +// grpc.NewServer / grpc.NewClient anywhere in plugin/external/ is constructed +// with a *UnaryInterceptor option, forcing a reviewer to look. See the +// cloud-sdk-extraction design, Security section. 
+func TestNoBodyLoggingInterceptor(t *testing.T) { + interceptorOpt := regexp.MustCompile(`(Chain)?Unary(Server|Client)?Interceptor`) + entries, err := os.ReadDir(".") + if err != nil { + t.Fatal(err) + } + for _, e := range entries { + name := e.Name() + if e.IsDir() || !match(name, ".go") || match(name, "_test.go") { + continue + } + b, err := os.ReadFile(name) + if err != nil { + t.Fatal(err) + } + if interceptorOpt.Match(b) { + t.Fatalf("%s references a gRPC interceptor option — if it logs request "+ + "bodies it can leak inline credentials: blocks. Audit it and, if safe, "+ + "add an explicit allowlist entry to this test.", name) + } + } +} + +func match(s, suffix string) bool { return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix } From cfa6b0be592a8d409687704c1b1b449658fe310b Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 07:04:45 -0400 Subject: [PATCH 38/39] test(plugin/external): broaden interceptor guard to Stream interceptors Code-review catch: the guard regex covered Unary only. CreateModule is unary today, but a future streaming RPC carrying credentials must not slip a stream interceptor past the guard. Now matches (Unary|Stream). --- plugin/external/grpc_logging_guard_test.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/plugin/external/grpc_logging_guard_test.go b/plugin/external/grpc_logging_guard_test.go index 77d70b6d..e1f0a383 100644 --- a/plugin/external/grpc_logging_guard_test.go +++ b/plugin/external/grpc_logging_guard_test.go @@ -9,10 +9,12 @@ import ( // The plugin SDK must NOT install a gRPC interceptor that logs request bodies — // CreateModule requests carry inline credentials: blocks. This test fails if // grpc.NewServer / grpc.NewClient anywhere in plugin/external/ is constructed -// with a *UnaryInterceptor option, forcing a reviewer to look. See the -// cloud-sdk-extraction design, Security section. +// with an *Interceptor option, forcing a reviewer to look. 
Covers Unary AND +// Stream interceptors — CreateModule is unary today, but a future streaming +// RPC carrying credentials must not slip a stream interceptor past this guard. +// See the cloud-sdk-extraction design, Security section. func TestNoBodyLoggingInterceptor(t *testing.T) { - interceptorOpt := regexp.MustCompile(`(Chain)?Unary(Server|Client)?Interceptor`) + interceptorOpt := regexp.MustCompile(`(Chain)?(Unary|Stream)(Server|Client)?Interceptor`) entries, err := os.ReadDir(".") if err != nil { t.Fatal(err) From 003dd5803101213e18f86d3089f639aec6f9f3dc Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Thu, 14 May 2026 10:58:11 -0400 Subject: [PATCH 39/39] fix(iac-host): narrow _ref redaction exemption, validate registry input, harden guard test Addresses Copilot review on PR #670: - step_output_redactor: the "_ref" suffix no longer blanket-bypasses redaction. It exempts only structural-reference words ("credential"), so credentials_ref is preserved but bearer_token_ref / api_key_ref / secret_ref still redact (token/api_key/secret are value-bearing). - iac_state_plugin_registry.register: rejects empty/whitespace names and nil clients; trims the name before use. - grpc_logging_guard_test: walks the whole plugin/external/ tree (catches subpackages like sdk/), skips generated *.pb.go / proto/ files to avoid false positives, and adds a real interceptorAllowlist mechanism the failure message now references. - iac_state_grpc_client_test + benchmark_iac_state_backend_test: close the bufconn listener via t.Cleanup/b.Cleanup; benchmark comment no longer implies bufconn size sets the gRPC message cap. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- module/benchmark_iac_state_backend_test.go | 5 +- module/iac_state_grpc_client_test.go | 1 + module/iac_state_plugin_registry.go | 11 +++- module/iac_state_plugin_registry_test.go | 17 +++++ module/step_output_redactor.go | 51 +++++++++++--- module/step_output_redactor_test.go | 22 +++++++ plugin/external/grpc_logging_guard_test.go | 77 +++++++++++++++------- 7 files changed, 150 insertions(+), 34 deletions(-) diff --git a/module/benchmark_iac_state_backend_test.go b/module/benchmark_iac_state_backend_test.go index 6659694c..68093414 100644 --- a/module/benchmark_iac_state_backend_test.go +++ b/module/benchmark_iac_state_backend_test.go @@ -51,7 +51,10 @@ func BenchmarkIaCStateBackend_InProcess(b *testing.B) { // BenchmarkIaCStateBackend_GRPC is the post-extraction path: same store, same // cycle, but every call crosses a real (in-memory bufconn) gRPC boundary. func BenchmarkIaCStateBackend_GRPC(b *testing.B) { - lis := bufconn.Listen(4 << 20) // 4 MiB — gRPC default message cap + // 4 MiB in-memory listener buffer. Note: this sizes the bufconn pipe only; + // gRPC's own max message size is configured separately via dial/server options. 
+ lis := bufconn.Listen(4 << 20) + b.Cleanup(func() { _ = lis.Close() }) srv := grpc.NewServer() pb.RegisterIaCStateBackendServer(srv, &iacStateBackendServer{store: NewMemoryIaCStateStore()}) go func() { _ = srv.Serve(lis) }() diff --git a/module/iac_state_grpc_client_test.go b/module/iac_state_grpc_client_test.go index 41fa3819..93af5230 100644 --- a/module/iac_state_grpc_client_test.go +++ b/module/iac_state_grpc_client_test.go @@ -13,6 +13,7 @@ import ( func TestGRPCIaCStateStoreRoundTrip(t *testing.T) { lis := bufconn.Listen(4 << 20) + t.Cleanup(func() { _ = lis.Close() }) srv := grpc.NewServer() pb.RegisterIaCStateBackendServer(srv, &iacStateBackendServer{store: NewMemoryIaCStateStore()}) go func() { _ = srv.Serve(lis) }() diff --git a/module/iac_state_plugin_registry.go b/module/iac_state_plugin_registry.go index d089df44..c1f18685 100644 --- a/module/iac_state_plugin_registry.go +++ b/module/iac_state_plugin_registry.go @@ -2,6 +2,7 @@ package module import ( "fmt" + "strings" "sync" pb "github.com/GoCodeAlone/workflow/plugin/external/proto" @@ -36,10 +37,18 @@ func newIaCStateBackendRegistry() *iacStateBackendRegistry { return &iacStateBackendRegistry{clients: make(map[string]pb.IaCStateBackendClient)} } -// register associates a backend name with a plugin client. Reserved core +// register associates a backend name with a plugin client. The name must be +// non-empty (after trimming) and the client must be non-nil. Reserved core // backend names are rejected. Re-registering a non-reserved name overwrites the // previous client (last plugin loaded wins). 
func (r *iacStateBackendRegistry) register(name string, client pb.IaCStateBackendClient) error { + name = strings.TrimSpace(name) + if name == "" { + return fmt.Errorf("iac.state backend registration: name must not be empty") + } + if client == nil { + return fmt.Errorf("iac.state backend registration %q: client must not be nil", name) + } if _, reserved := reservedIaCStateBackends[name]; reserved { return fmt.Errorf("plugin registered reserved iac.state backend name %q", name) } diff --git a/module/iac_state_plugin_registry_test.go b/module/iac_state_plugin_registry_test.go index b35be00f..38334f82 100644 --- a/module/iac_state_plugin_registry_test.go +++ b/module/iac_state_plugin_registry_test.go @@ -49,6 +49,23 @@ func TestIaCStateBackendRegistry(t *testing.T) { t.Fatalf("register(%q) must fail — reserved core backend name", reserved) } } + // Empty / whitespace-only name must be rejected. + for _, bad := range []string{"", " "} { + if err := reg.register(bad, fake); err == nil { + t.Fatalf("register(%q) must fail — empty backend name", bad) + } + } + // Nil client must be rejected. + if err := reg.register("nilclient_backend", nil); err == nil { + t.Fatal("register with nil client must fail") + } + // A name surrounded by whitespace is trimmed and registers under the trimmed key. 
+ if err := reg.register(" spaced_backend ", fake); err != nil { + t.Fatalf("register trimmed name: %v", err) + } + if _, ok := reg.resolve("spaced_backend"); !ok { + t.Fatal("trimmed name must resolve under its trimmed key") + } } // TestIaCModule_PluginBackendDispatch exercises the real IaCModule.Init() path: diff --git a/module/step_output_redactor.go b/module/step_output_redactor.go index 8c9030b4..7d8562f3 100644 --- a/module/step_output_redactor.go +++ b/module/step_output_redactor.go @@ -25,10 +25,18 @@ const RedactionPlaceholder = "[REDACTED]" const safeFieldSuffix = "_display" // refFieldSuffix marks a field as a reference (a module/resource name, not a -// secret value) — exempt from redaction even if its name contains a sensitive -// substring (e.g. "credentials_ref" contains "credential"). +// secret value). A "_ref" key is exempt from redaction ONLY when its sensitive +// match comes from a structural-reference word ("credential"). A key like +// "bearer_token_ref" still redacts, because "token" is a value-bearing secret +// pattern, not a structural reference — the "_ref" suffix must not be a blanket +// bypass for every sensitive pattern. const refFieldSuffix = "_ref" +// refExemptPatterns are the sensitive patterns that a "_ref" suffix is allowed +// to exempt: words that describe a *reference to* a credential-holding module +// (e.g. "credentials_ref"), not words that name a secret value itself. +var refExemptPatterns = []string{"credential"} + // RedactStepOutput recursively scans output and replaces values of sensitive // fields with RedactionPlaceholder. Field names are matched case-insensitively // against SensitiveFieldPatterns. Fields ending with "_display" are never @@ -63,21 +71,44 @@ func redactMap(m map[string]any, patterns []string) map[string]any { } // isSensitiveField returns true when the lowercased field name contains any of -// the patterns and does not have the safe suffix. 
+// the patterns and is not exempted by a safe/reference suffix. func isSensitiveField(name string, patterns []string) bool { lower := strings.ToLower(name) if strings.HasSuffix(lower, safeFieldSuffix) { return false } - // Reference keys hold module/resource NAMES, not secrets — never redact them, - // even though "credentials_ref" contains the "credential" substring. - if strings.HasSuffix(lower, refFieldSuffix) { - return false - } + var matched []string for _, p := range patterns { if strings.Contains(lower, p) { - return true + matched = append(matched, p) + } + } + if len(matched) == 0 { + return false + } + // A "_ref" key is exempt ONLY when every sensitive pattern it matched is a + // structural-reference word (e.g. "credentials_ref" → "credential"). A key + // like "bearer_token_ref" still redacts because "token" names a secret + // value, so "_ref" must not blanket-bypass it. + if strings.HasSuffix(lower, refFieldSuffix) && allRefExempt(matched) { + return false + } + return true +} + +// allRefExempt reports whether every matched pattern is in refExemptPatterns. +func allRefExempt(matched []string) bool { + for _, m := range matched { + exempt := false + for _, e := range refExemptPatterns { + if m == e { + exempt = true + break + } + } + if !exempt { + return false } } - return false + return true } diff --git a/module/step_output_redactor_test.go b/module/step_output_redactor_test.go index 4019744c..6c82bc23 100644 --- a/module/step_output_redactor_test.go +++ b/module/step_output_redactor_test.go @@ -193,3 +193,25 @@ func TestRedactCredentialsBlock(t *testing.T) { t.Fatalf("non-sensitive field wrongly redacted") } } + +// TestRedactRefSuffixDoesNotBypassValueSecrets locks in that the "_ref" suffix +// exempts ONLY structural-reference words (credentials_ref) — it must NOT be a +// blanket bypass for value-bearing secret patterns. A key like +// "bearer_token_ref" still matches "token" and must redact. 
+func TestRedactRefSuffixDoesNotBypassValueSecrets(t *testing.T) { + in := map[string]any{ + "credentials_ref": "aws-creds-module", // structural ref → preserved + "bearer_token_ref": "tok-abc123", // matches "token" → must redact + "api_key_ref": "ak-secret", // matches "api_key" → must redact + "secret_ref": "shhh", // matches "secret" → must redact + } + out := RedactStepOutput(in) + if out["credentials_ref"] != "aws-creds-module" { + t.Errorf("credentials_ref must be preserved, got %#v", out["credentials_ref"]) + } + for _, k := range []string{"bearer_token_ref", "api_key_ref", "secret_ref"} { + if out[k] != RedactionPlaceholder { + t.Errorf("%s matches a value-bearing secret pattern — _ref must not bypass redaction, got %#v", k, out[k]) + } + } +} diff --git a/plugin/external/grpc_logging_guard_test.go b/plugin/external/grpc_logging_guard_test.go index e1f0a383..bc548cde 100644 --- a/plugin/external/grpc_logging_guard_test.go +++ b/plugin/external/grpc_logging_guard_test.go @@ -1,39 +1,72 @@ package external import ( + "io/fs" "os" + "path/filepath" "regexp" + "strings" "testing" ) -// The plugin SDK must NOT install a gRPC interceptor that logs request bodies — -// CreateModule requests carry inline credentials: blocks. This test fails if -// grpc.NewServer / grpc.NewClient anywhere in plugin/external/ is constructed -// with an *Interceptor option, forcing a reviewer to look. Covers Unary AND -// Stream interceptors — CreateModule is unary today, but a future streaming -// RPC carrying credentials must not slip a stream interceptor past this guard. -// See the cloud-sdk-extraction design, Security section. +// interceptorAllowlist is the set of plugin/external/** Go files (path relative +// to plugin/external/) that are permitted to reference a gRPC interceptor +// option. A file lands here only after a reviewer has confirmed it does NOT log +// request bodies — CreateModule requests carry inline credentials: blocks. 
+// Empty by design: today nothing legitimately installs an interceptor. +var interceptorAllowlist = map[string]struct{}{} + +// isGeneratedProtoFile reports whether path is protoc-generated code. Generated +// *_grpc.pb.go files reference grpc.UnaryServerInterceptor / StreamServerInterceptor +// in their service-registration types — those are type references in generated +// code, not an interceptor being *installed*, so they are not a body-logging risk. +func isGeneratedProtoFile(path string) bool { + base := filepath.Base(path) + if strings.HasSuffix(base, ".pb.go") { + return true + } + // Anything under a proto/ directory is generated wire code. + return strings.Contains(filepath.ToSlash(path), "/proto/") || strings.HasPrefix(filepath.ToSlash(path), "proto/") +} + +// TestNoBodyLoggingInterceptor walks the WHOLE plugin/external/ tree (including +// subpackages like sdk/) and fails if any non-generated, non-test, non-allowlisted +// Go file constructs grpc.NewServer / grpc.NewClient with an *Interceptor option. +// A body-logging interceptor on a credential-carrying RPC leaks inline +// credentials: blocks. Covers Unary AND Stream, Server AND Client, plain AND +// Chain* variants. See the cloud-sdk-extraction design, Security section. 
func TestNoBodyLoggingInterceptor(t *testing.T) { interceptorOpt := regexp.MustCompile(`(Chain)?(Unary|Stream)(Server|Client)?Interceptor`) - entries, err := os.ReadDir(".") - if err != nil { - t.Fatal(err) - } - for _, e := range entries { - name := e.Name() - if e.IsDir() || !match(name, ".go") || match(name, "_test.go") { - continue - } - b, err := os.ReadFile(name) + + err := filepath.WalkDir(".", func(path string, d fs.DirEntry, err error) error { if err != nil { - t.Fatal(err) + return err + } + if d.IsDir() { + return nil + } + rel := filepath.ToSlash(path) + if !strings.HasSuffix(rel, ".go") || strings.HasSuffix(rel, "_test.go") { + return nil + } + if isGeneratedProtoFile(rel) { + return nil + } + if _, ok := interceptorAllowlist[rel]; ok { + return nil + } + b, readErr := os.ReadFile(path) + if readErr != nil { + return readErr } if interceptorOpt.Match(b) { - t.Fatalf("%s references a gRPC interceptor option — if it logs request "+ + t.Errorf("%s references a gRPC interceptor option — if it logs request "+ "bodies it can leak inline credentials: blocks. Audit it and, if safe, "+ - "add an explicit allowlist entry to this test.", name) + "add its plugin/external-relative path to interceptorAllowlist in this test.", rel) } + return nil + }) + if err != nil { + t.Fatal(err) } } - -func match(s, suffix string) bool { return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix }