From c01f23a98288f92e33ea65caca90b7458c0e312e Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Mon, 18 May 2026 09:35:57 -0400 Subject: [PATCH 1/2] fix(plugin): sanitize remote step payloads Remote steps must not send host-only HTTP metadata or engine internal config keys through strict protobuf plugin contracts. --- plugin/external/remote_step.go | 18 +++++++- plugin/external/remote_step_test.go | 68 +++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 2 deletions(-) diff --git a/plugin/external/remote_step.go b/plugin/external/remote_step.go index 0650e77d..609da460 100644 --- a/plugin/external/remote_step.go +++ b/plugin/external/remote_step.go @@ -111,7 +111,7 @@ func (s *RemoteStep) executeRequest(pc *module.PipelineContext, resolvedConfig m if err != nil { return nil, fmt.Errorf("remote step %q (handle %s) encode trigger_data as Struct: %w", s.name, s.handleID, err) } - metadata, err := mapToStruct(pc.Metadata) + metadata, err := mapToStruct(remotePluginMetadata(pc.Metadata)) if err != nil { return nil, fmt.Errorf("remote step %q (handle %s) encode metadata as Struct: %w", s.name, s.handleID, err) } @@ -149,7 +149,7 @@ func (s *RemoteStep) executeRequest(pc *module.PipelineContext, resolvedConfig m if s.contract.Mode == pb.ContractMode_CONTRACT_MODE_LEGACY_STRUCT { return req, nil } - typedConfig, err := mapToTypedAny(s.contract.ConfigMessage, resolvedConfig, s.types) + typedConfig, err := mapToTypedAny(s.contract.ConfigMessage, stripInternalKeys(resolvedConfig), s.types) if err != nil { if s.contract.Mode == pb.ContractMode_CONTRACT_MODE_STRICT_PROTO { return nil, fmt.Errorf("remote step %q STRICT_PROTO config message %q cannot use legacy Struct fallback: %w", s.name, s.contract.ConfigMessage, err) @@ -177,6 +177,20 @@ func (s *RemoteStep) executeRequest(pc *module.PipelineContext, resolvedConfig m return req, nil } +func remotePluginMetadata(metadata map[string]any) map[string]any { + if metadata == nil { + return nil + } + filtered := 
make(map[string]any, len(metadata)) + for key, value := range metadata { + if _, err := structpb.NewValue(value); err != nil { + continue + } + filtered[key] = value + } + return filtered +} + // Destroy releases the remote step resources. func (s *RemoteStep) Destroy() error { resp, err := s.client.DestroyStep(context.Background(), &pb.HandleRequest{ diff --git a/plugin/external/remote_step_test.go b/plugin/external/remote_step_test.go index f8b2fe32..418e2d51 100644 --- a/plugin/external/remote_step_test.go +++ b/plugin/external/remote_step_test.go @@ -314,6 +314,74 @@ func TestRemoteStep_Execute_StrictContractSkipsLegacyStructEncodeForCurrent(t *t } } +func TestRemoteStep_Execute_FiltersUnrepresentableMetadata(t *testing.T) { + stub := &stubPluginServiceClient{} + step := NewRemoteStep("test-step", "handle-metadata", stub, nil) + pc := module.NewPipelineContext(map[string]any{"name": "typed-input"}, map[string]any{ + "pipeline": "http-flow", + "_http_response_writer": make(chan int), + "_http_request": map[int]string{1: "request"}, + "explicit_trace": true, + }) + + if _, err := step.Execute(context.Background(), pc); err != nil { + t.Fatalf("Execute returned error: %v", err) + } + if stub.lastRequest == nil { + t.Fatal("expected ExecuteStep to be called") + } + got := stub.lastRequest.Metadata.AsMap() + if got["pipeline"] != "http-flow" { + t.Fatalf("expected serializable metadata to be preserved, got %#v", got) + } + if got["explicit_trace"] != true { + t.Fatalf("expected boolean metadata to be preserved, got %#v", got) + } + if _, ok := got["_http_response_writer"]; ok { + t.Fatalf("expected response writer metadata to be filtered, got %#v", got) + } + if _, ok := got["_http_request"]; ok { + t.Fatalf("expected request metadata to be filtered, got %#v", got) + } +} + +func TestRemoteStep_Execute_StrictContractStripsInternalConfigKeys(t *testing.T) { + stub := &stubPluginServiceClient{ + response: &pb.ExecuteStepResponse{TypedOutput: mustAnyFromMapForTest(t, 
"workflow.plugin.v1.Manifest", map[string]any{ + "name": "typed-output", + "version": "v1", + })}, + } + contract := &pb.ContractDescriptor{ + Kind: pb.ContractKind_CONTRACT_KIND_STEP, + StepType: "test.strict", + ConfigMessage: "workflow.plugin.v1.Manifest", + InputMessage: "workflow.plugin.v1.Manifest", + OutputMessage: "workflow.plugin.v1.Manifest", + Mode: pb.ContractMode_CONTRACT_MODE_STRICT_PROTO, + } + step := NewRemoteStep("test-step", "handle-strict", stub, map[string]any{ + "_config_dir": "/config", + "name": "typed-config", + "version": "v1", + }, contract) + pc := module.NewPipelineContext(map[string]any{ + "name": "typed-input", + "version": "v1", + }, nil) + + if _, err := step.Execute(context.Background(), pc); err != nil { + t.Fatalf("STRICT_PROTO execute should strip internal config keys before typed encode; got %v", err) + } + if stub.lastRequest == nil { + t.Fatal("expected ExecuteStep to be called") + } + if stub.lastRequest.Config != nil { + t.Fatalf("expected strict step to omit legacy Config, got %v", stub.lastRequest.Config) + } + assertAnyTypeForTest(t, stub.lastRequest.TypedConfig, "workflow.plugin.v1.Manifest") +} + func TestRemoteStep_Execute_StrictContractFiltersUnknownCurrentFields(t *testing.T) { stub := &stubPluginServiceClient{ response: &pb.ExecuteStepResponse{TypedOutput: mustAnyFromMapForTest(t, "workflow.plugin.v1.Manifest", map[string]any{ From e4be5734dfcf93a34acb10c622bd148161f42d11 Mon Sep 17 00:00:00 2001 From: Jon Langevin Date: Tue, 19 May 2026 14:47:04 -0400 Subject: [PATCH 2/2] docs(plan): design for live-deployment example validation CI matrix Files a design doc for the live-deploy CI matrix deferred from the 2026-05-19 multi-repo QoL sweep. 
Schema-level validation is insufficient to promote a plugin to 'verified'; this design adds a weekly OIDC-driven GitHub Actions matrix that exercises each IaC plugin's examples/minimal/config.yaml against staging cloud accounts, auto-promotes on green, demotes on 2 consecutive REDs. Execution is gated on operator provisioning staging accounts + GitHub OIDC trust per provider. Document this as the next concrete step. Companion to workflow#725 (marketplace-verify subcommand). Closes #723. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...026-05-19-live-deploy-validation-design.md | 127 ++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 _worktrees/live-deploy-design-1779216240/docs/plans/2026-05-19-live-deploy-validation-design.md diff --git a/_worktrees/live-deploy-design-1779216240/docs/plans/2026-05-19-live-deploy-validation-design.md b/_worktrees/live-deploy-design-1779216240/docs/plans/2026-05-19-live-deploy-validation-design.md new file mode 100644 index 00000000..4a281901 --- /dev/null +++ b/_worktrees/live-deploy-design-1779216240/docs/plans/2026-05-19-live-deploy-validation-design.md @@ -0,0 +1,127 @@ +# Live-Deployment Example Validation — Design + +**Date:** 2026-05-19 +**Trigger:** The 2026-05-19 multi-repo QoL sweep validated plugin examples at SCHEMA level (`wfctl validate --skip-unknown-types`) but never ran them end-to-end against real cloud accounts. Promotion from `experimental` to `verified` remains a manual decision tied to GoCodeAlone-internal usage. +**Mode:** Design only (operator must provision CI secrets before execution). + +## Problem + +`wfctl validate --skip-unknown-types` confirms a YAML config parses and references known module types, but does not exercise the providers. A plugin can ship with valid YAML that fails at runtime — wrong field types, deprecated APIs, broken auth flow, infra-side rate limits. Today nothing catches this until an operator pins the plugin in a real project. 
+ +Symptoms today: +- `aws#23`, `gcp#16`, `azure#20`, `tofu#11`, `ci-generator#9` shipped READMEs + examples that pass schema validation but have never been live-tested. +- `digitalocean` is the only IaC plugin with merged production usage (BMW + core-dump + workflow-compute). +- Promotion from `experimental` → `verified` requires a human to pin the plugin in a real wfctl.yaml. Slow + manual. + +## Goal + +Add a CI matrix that runs each P0/P1 plugin's `examples/minimal/config.yaml` against staging cloud accounts via OIDC. On green, the plugin auto-promotes to `verified` via a registry-manifest PR. On red, surfaces a failure annotation on the plugin's repo + opens a tracking issue. + +## Non-Goals + +- Replace the existing `wfctl validate --skip-unknown-types` schema check (still useful as a fast gate). +- Run examples for non-IaC, non-cloud plugins (eventbus, payments, twilio etc. need different validation surfaces — payments needs a Stripe test API; twilio needs a sandbox account; those are out of scope here). +- Validate against PRODUCTION accounts. Staging only. + +## Approach + +### Phase 1: per-provider staging accounts + OIDC + +For each IaC provider (AWS, GCP, Azure, DigitalOcean, OpenTofu-via-any-provider): + +1. Operator creates a dedicated staging account/project/subscription. +2. Configure GitHub OIDC trust: + - AWS: IAM role + `aws-actions/configure-aws-credentials@v4`. + - GCP: Workload Identity Federation + `google-github-actions/auth@v2`. + - Azure: federated credential + `azure/login@v2`. + - DigitalOcean: short-lived API token rotated via OIDC + Vault (or accept long-lived staging token). +3. 
Repo secret matrix populated: + ``` + STAGING_AWS_ROLE_ARN + STAGING_GCP_WORKLOAD_IDENTITY_PROVIDER + STAGING_GCP_SERVICE_ACCOUNT + STAGING_AZURE_TENANT_ID + STAGING_AZURE_CLIENT_ID + STAGING_AZURE_SUBSCRIPTION_ID + STAGING_DIGITALOCEAN_TOKEN + ``` + +### Phase 2: workflow main repo — `live-deploy.yml` workflow + +New workflow file `.github/workflows/live-deploy.yml`: + +```yaml +name: live-deploy +on: + workflow_dispatch: + schedule: + - cron: '0 6 * * 1' # weekly Monday 06:00 UTC +permissions: + id-token: write + contents: read + pull-requests: write +jobs: + live-deploy: + strategy: + fail-fast: false + matrix: + plugin: [aws, gcp, azure, digitalocean, tofu] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + repository: GoCodeAlone/workflow-plugin-${{ matrix.plugin }} + ref: main + - uses: GoCodeAlone/setup-wfctl@v1 + - name: Configure cloud auth (${{ matrix.plugin }}) + run: ./.github/scripts/cloud-auth.sh ${{ matrix.plugin }} + - name: wfctl deploy --dry-run + run: wfctl deploy --dry-run examples/minimal/config.yaml + timeout-minutes: 10 + - name: Report status + if: always() + run: ./.github/scripts/report-validation.sh ${{ matrix.plugin }} ${{ job.status }} +``` + +### Phase 3: registry promotion / demotion + +`report-validation.sh` consumes the job status and: + +- **GREEN:** if the plugin's registry manifest is currently `experimental`, opens a PR against `workflow-registry` flipping it to `verified` with a citation to the workflow run. +- **RED:** if the plugin's manifest is currently `verified`, the first RED opens a tracking issue on the plugin repo; a second consecutive RED also opens a PR demoting it to `experimental`. +- **NO CHANGE:** no PR; record the run in a structured artifact for audit. + +A `--explain` flag on `wfctl plugin marketplace-verify` (already shipped in `workflow#725`) can read the latest validation-run artifact to display the live-deploy history alongside the org-usage signal. + +## Assumptions + +- Operator can provision dedicated staging accounts. 
**Load-bearing.** Without this, the workflow is inert. +- `wfctl deploy --dry-run` exists for all 5 IaC providers. **Verify before execution** — `digitalocean` has it (used in BMW); `aws`/`gcp`/`azure`/`tofu` need verification. +- OIDC trust is achievable from GitHub Actions for AWS, GCP, and Azure — all three publish official auth actions. DigitalOcean instead uses the token approach from Phase 1 (rotation via OIDC + Vault, or a long-lived staging token). +- The cost of running the matrix weekly is bounded (no idle infra; each example deploys + tears down). Estimated <$5/week if examples are correctly written. +- Promotion/demotion PRs are admin-mergeable autonomously (per `feedback_admin_override_pr_merge`). + +## Top doubts + +1. **Cost runaway.** Examples that fail to tear down can leave cloud infra running. Mitigation: each example must include a teardown step + the workflow runs `wfctl destroy` after `deploy --dry-run` even on failure. Verify in alignment-check. +2. **Flaky staging.** Cloud-provider transient errors will cause false demotions. Mitigation: require 2 consecutive RED runs before opening a demotion PR. The first RED opens an investigation issue. +3. **`wfctl deploy --dry-run` semantics differ across providers.** If `--dry-run` is too permissive, the signal is meaningless. Verify each provider's dry-run actually validates IAM/API access, not just YAML. + +## Rollback + +The workflow is fire-and-forget reporting; rollback = disable the workflow file. No data is persisted in workflow main beyond the PR/issue creations, which are independently revertible. + +## Dependencies + +- `workflow#725` (`marketplace-verify` subcommand) is the human-readable counterpart to this automated promotion path. +- ADR-0041 (experimental-status marker) defines the manifest schema this PR exercises. + +## Success criteria + +- 5 IaC plugins (aws, gcp, azure, digitalocean, tofu) have weekly green live-deploy runs. +- Promotion PR opened automatically when a plugin earns its first GREEN run. 
+- Demotion PR + issue opened automatically when a `verified` plugin earns 2 consecutive REDs. +- Operator inspects monthly cost report and signs off on continued scheduled runs. + +## Out of scope + +- Live-deployment validation for non-IaC plugins (eventbus / payments / twilio / etc.) — file separate designs per provider category. +- Replacing the existing `wfctl validate --skip-unknown-types` schema check. +- Integration with workflow-cloud SaaS — this is workflow-engine OSS scope.