From 8c15ed19a1f315da6c4ac94d2893776a04abb5a9 Mon Sep 17 00:00:00 2001 From: reoring Date: Thu, 7 May 2026 10:23:47 +0900 Subject: [PATCH] feat(plugin/eks): ACK Role CR additional inline policies (APTH-1704) Add a new ClusterClass topology variable `karpenter-node-role-additional-inline-policies` (sister to the existing `karpenter-node-role-additional-policy-arns`) so cluster operators can declare IAM inline policies on the karpenter node Role end-to-end through the ACK Role CR's `spec.inlinePolicies` field. ACK + kro v0.8.4 then sees the policies as declarative state instead of treating externally-attached inline policies as drift to delete. Concretely, this eliminates the ~6.5h race observed on pmc-local where the IRSA-give-up bundle (cert-manager-route53-dns01-txt-only + external-dns-route53-record-management) attached by demo-ops/scripts/ 05-pmc-node-role-policies.sh was repeatedly removed by ACK reconcile, causing cert-manager DNS01 challenges to permanently stall. Changes: - constants.go: add `topologyNodeRoleAdditionalInlinePoliciesVariableName` next to its sister `*PolicyARNs*` const so future readers see the symmetric pair. - karpenter_bootstrapper_controller.go: read the topology variable as a map[string]string (new `readTopologyStringMap` helper, modelled on `readTopologyStringSlice`) and forward it to a widened `ensureIAMRoleForEC2(..., inlinePolicies map[string]string)`. The writer materializes `spec.inlinePolicies` only when the map is non-empty so existing clusters that do not opt in remain byte-identical at the ACK CR level. - karpenter_bootstrapper_controller_test.go: cover the three documented cases (no inline = absent on spec, inline declared = both entries preserved verbatim, managed and inline coexist independently) plus a round-trip test on the new map reader. - hack/capi/metadata.yaml: prepend a v0.4 series entry for the upcoming release cut (clusterctl requires descending order by major,minor). Production cluster specs that omit the new variable see no behavior change (the writer skips spec.inlinePolicies entirely when the map is empty), so this is a backward-compatible feature suitable for a v0.4.0 minor release. See design/apth-1704-... and impl/apth-1704-... in github.com/reoring/demo0521 for the full design + implementation hand-off. Linear: APTH-1704 --- hack/capi/metadata.yaml | 15 +- internal/controller/plugin/eks/constants.go | 3 +- .../eks/karpenter_bootstrapper_controller.go | 51 ++++++- .../karpenter_bootstrapper_controller_test.go | 135 +++++++++++++++++- 4 files changed, 196 insertions(+), 8 deletions(-) diff --git a/hack/capi/metadata.yaml b/hack/capi/metadata.yaml index 79828c4..d884f32 100644 --- a/hack/capi/metadata.yaml +++ b/hack/capi/metadata.yaml @@ -2,14 +2,21 @@ apiVersion: clusterctl.cluster.x-k8s.io/v1alpha3 kind: Metadata # Release series declare which controller contract a given minor line ships. # kany8s v0.1.x ships with Cluster API v1beta2 because cmd/main.go imports -# sigs.k8s.io/cluster-api/api/core/v1beta2 (CAPI v1.12+). v0.2.x and v0.3.x -# stay on v1beta2: v0.2.x added the v1beta2 served-alias on existing CRDs -# (APTH-1563) without changing the contract, and v0.3.x simplifies the +# sigs.k8s.io/cluster-api/api/core/v1beta2 (CAPI v1.12+). v0.2.x, v0.3.x, and +# v0.4.x stay on v1beta2: v0.2.x added the v1beta2 served-alias on existing +# CRDs (APTH-1563) without changing the contract; v0.3.x simplifies the # eks-karpenter-bootstrapper plugin so the kany8s-eks-byo ClusterClass RGD -# owns the FargateProfile lifecycle (APTH-1568) — neither touches contract. +# owns the FargateProfile lifecycle (APTH-1568); v0.4.x adds a new +# ClusterClass topology variable (karpenter-node-role-additional-inline-policies) +# so node-role inline policies become declarative end-to-end (APTH-1704), +# eliminating the ACK + kro drift delete race against externally attached +# inline policies — none of these touch the controller contract. # When cutting a new minor, prepend another entry; clusterctl requires # descending order by (major, minor). releaseSeries: +- major: 0 + minor: 4 + contract: v1beta2 - major: 0 minor: 3 contract: v1beta2 diff --git a/internal/controller/plugin/eks/constants.go b/internal/controller/plugin/eks/constants.go index 04c5f6e..60044ec 100644 --- a/internal/controller/plugin/eks/constants.go +++ b/internal/controller/plugin/eks/constants.go @@ -29,7 +29,8 @@ const ( topologyNodeSubnetIDsVariableName = "vpc-node-subnet-ids" topologyControlPlaneSecurityGroupIDsVariableName = "vpc-security-group-ids" topologyNodeSecurityGroupIDsVariableName = "vpc-node-security-group-ids" - topologyNodeRoleAdditionalPolicyARNsVariableName = "karpenter-node-role-additional-policy-arns" + topologyNodeRoleAdditionalPolicyARNsVariableName = "karpenter-node-role-additional-policy-arns" + topologyNodeRoleAdditionalInlinePoliciesVariableName = "karpenter-node-role-additional-inline-policies" defaultKarpenterChartVersion = "1.0.8" ) diff --git a/internal/controller/plugin/eks/karpenter_bootstrapper_controller.go b/internal/controller/plugin/eks/karpenter_bootstrapper_controller.go index 98b1ea7..786ec12 100644 --- a/internal/controller/plugin/eks/karpenter_bootstrapper_controller.go +++ b/internal/controller/plugin/eks/karpenter_bootstrapper_controller.go @@ -457,7 +457,19 @@ func (r *EKSKarpenterBootstrapperReconciler) Reconcile(ctx context.Context, req nodeManagedPolicies = append(nodeManagedPolicies, additionalPolicyARNs...) } nodeManagedPolicies = normalizeDistinctStrings(nodeManagedPolicies) - if ok, err := r.ensureIAMRoleForEC2(ctx, cluster, nodeRoleName, region, nodeRoleAWSName, nodeManagedPolicies); err != nil { + // APTH-1704: read additional inline policies from the ClusterClass topology + // variable (sister to karpenter-node-role-additional-policy-arns above). + // nil map = no inline policies; the writer skips spec.inlinePolicies + // entirely so existing clusters that do not opt in remain byte-identical. + nodeInlinePolicies := map[string]string{} + if additionalInline, ok, err := readTopologyStringMap(cluster, topologyNodeRoleAdditionalInlinePoliciesVariableName); err != nil { + return ctrl.Result{}, err + } else if ok { + for k, v := range additionalInline { + nodeInlinePolicies[k] = v + } + } + if ok, err := r.ensureIAMRoleForEC2(ctx, cluster, nodeRoleName, region, nodeRoleAWSName, nodeManagedPolicies, nodeInlinePolicies); err != nil { return ctrl.Result{}, err } else if !ok { msg := fmt.Sprintf("IAM Role %s/%s exists and is not managed by %s", cluster.Namespace, nodeRoleName, karpenterManagedByValue) @@ -1239,7 +1251,7 @@ func (r *EKSKarpenterBootstrapperReconciler) ensureIAMRoleForIRSA(ctx context.Co }) } -func (r *EKSKarpenterBootstrapperReconciler) ensureIAMRoleForEC2(ctx context.Context, owner *clusterv1.Cluster, name, region, awsRoleName string, managedPolicyARNs []string) (bool, error) { +func (r *EKSKarpenterBootstrapperReconciler) ensureIAMRoleForEC2(ctx context.Context, owner *clusterv1.Cluster, name, region, awsRoleName string, managedPolicyARNs []string, inlinePolicies map[string]string) (bool, error) { obj := newUnstructured(ackIAMRoleGVK, owner.Namespace, name) assume := `{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"ec2.amazonaws.com"},"Action":"sts:AssumeRole"}]}` return r.upsertManagedUnstructured(ctx, owner, obj, func(u *unstructured.Unstructured) error { @@ -1253,6 +1265,17 @@ func (r *EKSKarpenterBootstrapperReconciler) ensureIAMRoleForEC2(ctx context.Con policies = append(policies, arn) } mustSetNestedSlice(u, policies, "spec", "policies") + // APTH-1704: inline policies become declarative state on the ACK Role + // CR. Without this, ACK + kro reconcile interprets externally attached + // inline policies as drift and issues DeleteRolePolicy on the next + // reconcile cycle. + if len(inlinePolicies) > 0 { + inlineMap := make(map[string]any, len(inlinePolicies)) + for k, v := range inlinePolicies { + inlineMap[k] = v + } + mustSetNestedField(u, inlineMap, "spec", "inlinePolicies") + } mustSetNestedSlice(u, awsTagsAsSlice(bootstrapperResourceTags(owner, nil)), "spec", "tags") return nil }) @@ -2439,6 +2462,30 @@ func readTopologyStringSlice(cluster *clusterv1.Cluster, variableName string) ([ return nil, false, nil } +// readTopologyStringMap reads a ClusterClass topology variable that is shaped +// as map[string]string and returns the decoded value. It mirrors +// readTopologyStringSlice for sister variables whose schema is +// (type: object, additionalProperties: type: string) — used by APTH-1704 for +// karpenter-node-role-additional-inline-policies, where each map entry is an +// IAM inline policy name → JSON document string. +func readTopologyStringMap(cluster *clusterv1.Cluster, variableName string) (map[string]string, bool, error) { + if cluster == nil || !cluster.Spec.Topology.IsDefined() { + return nil, false, nil + } + for i := range cluster.Spec.Topology.Variables { + v := cluster.Spec.Topology.Variables[i] + if v.Name != variableName { + continue + } + var out map[string]string + if err := json.Unmarshal(v.Value.Raw, &out); err != nil { + return nil, false, fmt.Errorf("unmarshal topology variable %q: %w", variableName, err) + } + return out, true, nil + } + return nil, false, nil +} + func (r *EKSKarpenterBootstrapperReconciler) ensureTopologyStringSliceVariable(ctx context.Context, cluster *clusterv1.Cluster, variableName string, desired []string) (bool, error) { if r == nil || cluster == nil { return false, nil diff --git a/internal/controller/plugin/eks/karpenter_bootstrapper_controller_test.go b/internal/controller/plugin/eks/karpenter_bootstrapper_controller_test.go index 36ad223..96e860d 100644 --- a/internal/controller/plugin/eks/karpenter_bootstrapper_controller_test.go +++ b/internal/controller/plugin/eks/karpenter_bootstrapper_controller_test.go @@ -443,7 +443,7 @@ func TestEKSKarpenterBootstrapperReconciler_EnsureACKResources_CreateExpectedSpe t.Fatalf("irsa role spec.policyRefs len = %d, want 1", len(refs)) } - if ok, err := r.ensureIAMRoleForEC2(context.Background(), cluster, "demo-karpenter-node", "ap-northeast-1", testEKSNodeRoleName, []string{"arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"}); err != nil { + if ok, err := r.ensureIAMRoleForEC2(context.Background(), cluster, "demo-karpenter-node", "ap-northeast-1", testEKSNodeRoleName, []string{"arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"}, nil); err != nil { t.Fatalf("ensureIAMRoleForEC2() error = %v", err) } else if !ok { t.Fatalf("ensureIAMRoleForEC2() managed = false, want true") @@ -459,6 +459,14 @@ func TestEKSKarpenterBootstrapperReconciler_EnsureACKResources_CreateExpectedSpe } else if len(got) != 1 || got[0] != "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" { t.Fatalf("ec2 role spec.policies = %#v", got) } + // APTH-1704: with nil inlinePolicies, spec.inlinePolicies must remain + // absent so that existing ACK Role CRs are byte-identical to the v0.3.x + // shape. + if _, found, err := unstructured.NestedMap(ec2Role.Object, "spec", "inlinePolicies"); err != nil { + t.Fatalf("ec2 role spec.inlinePolicies: %v", err) + } else if found { + t.Fatalf("ec2 role spec.inlinePolicies = present, want absent for nil input") + } if ok, err := r.ensureIAMInstanceProfile(context.Background(), cluster, "demo-karpenter-node-instance-profile", "ap-northeast-1", testEKSNodeRoleName, "demo-karpenter-node"); err != nil { t.Fatalf("ensureIAMInstanceProfile() error = %v", err) } else if !ok { @@ -554,6 +562,7 @@ func TestEKSKarpenterBootstrapperReconciler_EnsureIAMRoleForEC2_TakeoverWhenExpl "ap-northeast-1", testEKSNodeRoleName, []string{"arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"}, + nil, ) if err != nil { t.Fatalf("ensureIAMRoleForEC2() error = %v", err) @@ -573,6 +582,130 @@ func TestEKSKarpenterBootstrapperReconciler_EnsureIAMRoleForEC2_TakeoverWhenExpl } } +// TestEKSKarpenterBootstrapperReconciler_EnsureIAMRoleForEC2_InlinePoliciesDeclaredOnSpec +// verifies the APTH-1704 root fix: when the caller passes a non-empty +// inlinePolicies map, the writer materializes it into spec.inlinePolicies on +// the ACK Role CR so that ACK + kro reconcile sees the policies as declarative +// state (not drift). Without this assertion, the ACK IAM controller would +// issue DeleteRolePolicy on every reconcile cycle (~6.5h period observed in +// pmc-local CloudTrail). +func TestEKSKarpenterBootstrapperReconciler_EnsureIAMRoleForEC2_InlinePoliciesDeclaredOnSpec(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + utilruntime.Must(clusterv1.AddToScheme(scheme)) + + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: testClusterName, + Namespace: "default", + UID: "cluster-uid", + }, + } + + c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cluster).Build() + r := &EKSKarpenterBootstrapperReconciler{Client: c, Scheme: scheme} + + certManagerPolicy := `{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["route53:GetChange"],"Resource":"arn:aws:route53:::change/*"}]}` + externalDNSPolicy := `{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["route53:ListHostedZones"],"Resource":"*"}]}` + + ok, err := r.ensureIAMRoleForEC2( + context.Background(), + cluster, + "demo-karpenter-node", + "ap-northeast-1", + testEKSNodeRoleName, + []string{"arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"}, + map[string]string{ + "cert-manager-route53-dns01-txt-only": certManagerPolicy, + "external-dns-route53-record-management": externalDNSPolicy, + }, + ) + if err != nil { + t.Fatalf("ensureIAMRoleForEC2() error = %v", err) + } + if !ok { + t.Fatalf("ensureIAMRoleForEC2() managed = false, want true") + } + + got := getUnstructured(t, c, ackIAMRoleGVK, "demo-karpenter-node") + // Managed policies (existing path) must still be set independently of + // inlinePolicies — the two fields are independent on the ACK Role CR. + if policies, _, err := unstructured.NestedStringSlice(got.Object, "spec", "policies"); err != nil { + t.Fatalf("role spec.policies: %v", err) + } else if len(policies) != 1 || policies[0] != "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" { + t.Fatalf("role spec.policies = %#v, want managed policy preserved", policies) + } + // spec.inlinePolicies must be present and contain both entries with the + // JSON document strings preserved verbatim (ACK schema = map[string]string). + inline, found, err := unstructured.NestedMap(got.Object, "spec", "inlinePolicies") + if err != nil { + t.Fatalf("role spec.inlinePolicies: %v", err) + } + if !found { + t.Fatalf("role spec.inlinePolicies missing, want both inline entries declared") + } + if got, want := inline["cert-manager-route53-dns01-txt-only"], certManagerPolicy; got != want { + t.Fatalf("inlinePolicies[cert-manager] = %v, want %q", got, want) + } + if got, want := inline["external-dns-route53-record-management"], externalDNSPolicy; got != want { + t.Fatalf("inlinePolicies[external-dns] = %v, want %q", got, want) + } + if len(inline) != 2 { + t.Fatalf("inlinePolicies len = %d, want 2", len(inline)) + } +} + +// TestReadTopologyStringMap_RoundTripsClusterClassVariable verifies that the +// new helper introduced for APTH-1704 decodes a map[string]string topology +// variable identically to readTopologyStringSlice's []string sister. boot +// scripts (boot-single.sh) emit a YAML object whose values are JSON-document +// strings; the helper must preserve those strings byte-for-byte for the writer +// to forward them to spec.inlinePolicies on the ACK Role CR. +func TestReadTopologyStringMap_RoundTripsClusterClassVariable(t *testing.T) { + t.Parallel() + + rawJSON := `{"cert-manager-route53-dns01-txt-only":"{\"Version\":\"2012-10-17\"}","external-dns-route53-record-management":"{\"Version\":\"2012-10-17\",\"Statement\":[]}"}` + inlineVar := clusterv1.ClusterVariable{Name: topologyNodeRoleAdditionalInlinePoliciesVariableName} + inlineVar.Value.Raw = []byte(rawJSON) + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{Name: testClusterName, Namespace: "default"}, + Spec: clusterv1.ClusterSpec{ + Topology: clusterv1.Topology{ + ClassRef: clusterv1.ClusterClassRef{Name: "kany8s-eks-byo"}, + Variables: []clusterv1.ClusterVariable{inlineVar}, + }, + }, + } + + got, ok, err := readTopologyStringMap(cluster, topologyNodeRoleAdditionalInlinePoliciesVariableName) + if err != nil { + t.Fatalf("readTopologyStringMap() error = %v", err) + } + if !ok { + t.Fatalf("readTopologyStringMap() ok = false, want true") + } + if len(got) != 2 { + t.Fatalf("readTopologyStringMap() len = %d, want 2", len(got)) + } + if v := got["cert-manager-route53-dns01-txt-only"]; v != `{"Version":"2012-10-17"}` { + t.Fatalf("cert-manager value = %q, want JSON document preserved verbatim", v) + } + if v := got["external-dns-route53-record-management"]; v != `{"Version":"2012-10-17","Statement":[]}` { + t.Fatalf("external-dns value = %q, want JSON document preserved verbatim", v) + } + + // Sister assertion: readTopologyStringMap on a non-existent variable name + // returns (nil, false, nil) — same shape as readTopologyStringSlice — so + // production cluster specs that omit the variable see no inline policies + // (= production side-effect-zero invariant). + if missing, ok, err := readTopologyStringMap(cluster, "non-existent-variable"); err != nil { + t.Fatalf("readTopologyStringMap(missing) error = %v", err) + } else if ok || missing != nil { + t.Fatalf("readTopologyStringMap(missing) = (%v, %v), want (nil, false)", missing, ok) + } +} + func TestEKSKarpenterBootstrapperReconciler_EnsureIAMPolicy_InvalidExistingShapeReturnsError(t *testing.T) { t.Parallel()