diff --git a/hack/capi/metadata.yaml b/hack/capi/metadata.yaml index 79828c4..d884f32 100644 --- a/hack/capi/metadata.yaml +++ b/hack/capi/metadata.yaml @@ -2,14 +2,21 @@ apiVersion: clusterctl.cluster.x-k8s.io/v1alpha3 kind: Metadata # Release series declare which controller contract a given minor line ships. # kany8s v0.1.x ships with Cluster API v1beta2 because cmd/main.go imports -# sigs.k8s.io/cluster-api/api/core/v1beta2 (CAPI v1.12+). v0.2.x and v0.3.x -# stay on v1beta2: v0.2.x added the v1beta2 served-alias on existing CRDs -# (APTH-1563) without changing the contract, and v0.3.x simplifies the +# sigs.k8s.io/cluster-api/api/core/v1beta2 (CAPI v1.12+). v0.2.x, v0.3.x, and +# v0.4.x stay on v1beta2: v0.2.x added the v1beta2 served-alias on existing +# CRDs (APTH-1563) without changing the contract; v0.3.x simplifies the # eks-karpenter-bootstrapper plugin so the kany8s-eks-byo ClusterClass RGD -# owns the FargateProfile lifecycle (APTH-1568) — neither touches contract. +# owns the FargateProfile lifecycle (APTH-1568); v0.4.x adds a new +# ClusterClass topology variable (karpenter-node-role-additional-inline-policies) +# so node-role inline policies become declarative end-to-end (APTH-1704), +# eliminating the ACK + kro drift delete race against externally attached +# inline policies — none of these touch the controller contract. # When cutting a new minor, prepend another entry; clusterctl requires # descending order by (major, minor). releaseSeries: +- major: 0 + minor: 4 + contract: v1beta2 - major: 0 minor: 3 contract: v1beta2 diff --git a/internal/controller/plugin/eks/constants.go b/internal/controller/plugin/eks/constants.go index 04c5f6e..60044ec 100644 --- a/internal/controller/plugin/eks/constants.go +++ b/internal/controller/plugin/eks/constants.go @@ -29,7 +29,8 @@ const ( topologyNodeSubnetIDsVariableName = "vpc-node-subnet-ids" topologyControlPlaneSecurityGroupIDsVariableName = "vpc-security-group-ids" topologyNodeSecurityGroupIDsVariableName = "vpc-node-security-group-ids" - topologyNodeRoleAdditionalPolicyARNsVariableName = "karpenter-node-role-additional-policy-arns" + topologyNodeRoleAdditionalPolicyARNsVariableName = "karpenter-node-role-additional-policy-arns" + topologyNodeRoleAdditionalInlinePoliciesVariableName = "karpenter-node-role-additional-inline-policies" defaultKarpenterChartVersion = "1.0.8" ) diff --git a/internal/controller/plugin/eks/karpenter_bootstrapper_controller.go b/internal/controller/plugin/eks/karpenter_bootstrapper_controller.go index 98b1ea7..786ec12 100644 --- a/internal/controller/plugin/eks/karpenter_bootstrapper_controller.go +++ b/internal/controller/plugin/eks/karpenter_bootstrapper_controller.go @@ -457,7 +457,19 @@ func (r *EKSKarpenterBootstrapperReconciler) Reconcile(ctx context.Context, req nodeManagedPolicies = append(nodeManagedPolicies, additionalPolicyARNs...) } nodeManagedPolicies = normalizeDistinctStrings(nodeManagedPolicies) - if ok, err := r.ensureIAMRoleForEC2(ctx, cluster, nodeRoleName, region, nodeRoleAWSName, nodeManagedPolicies); err != nil { + // APTH-1704: read additional inline policies from the ClusterClass topology + // variable (sister to karpenter-node-role-additional-policy-arns above). + // nil map = no inline policies; the writer skips spec.inlinePolicies + // entirely so existing clusters that do not opt in remain byte-identical. + nodeInlinePolicies := map[string]string{} + if additionalInline, ok, err := readTopologyStringMap(cluster, topologyNodeRoleAdditionalInlinePoliciesVariableName); err != nil { + return ctrl.Result{}, err + } else if ok { + for k, v := range additionalInline { + nodeInlinePolicies[k] = v + } + } + if ok, err := r.ensureIAMRoleForEC2(ctx, cluster, nodeRoleName, region, nodeRoleAWSName, nodeManagedPolicies, nodeInlinePolicies); err != nil { return ctrl.Result{}, err } else if !ok { msg := fmt.Sprintf("IAM Role %s/%s exists and is not managed by %s", cluster.Namespace, nodeRoleName, karpenterManagedByValue) @@ -1239,7 +1251,7 @@ func (r *EKSKarpenterBootstrapperReconciler) ensureIAMRoleForIRSA(ctx context.Co }) } -func (r *EKSKarpenterBootstrapperReconciler) ensureIAMRoleForEC2(ctx context.Context, owner *clusterv1.Cluster, name, region, awsRoleName string, managedPolicyARNs []string) (bool, error) { +func (r *EKSKarpenterBootstrapperReconciler) ensureIAMRoleForEC2(ctx context.Context, owner *clusterv1.Cluster, name, region, awsRoleName string, managedPolicyARNs []string, inlinePolicies map[string]string) (bool, error) { obj := newUnstructured(ackIAMRoleGVK, owner.Namespace, name) assume := `{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"ec2.amazonaws.com"},"Action":"sts:AssumeRole"}]}` return r.upsertManagedUnstructured(ctx, owner, obj, func(u *unstructured.Unstructured) error { @@ -1253,6 +1265,17 @@ func (r *EKSKarpenterBootstrapperReconciler) ensureIAMRoleForEC2(ctx context.Con policies = append(policies, arn) } mustSetNestedSlice(u, policies, "spec", "policies") + // APTH-1704: inline policies become declarative state on the ACK Role + // CR. Without this, ACK + kro reconcile interprets externally attached + // inline policies as drift and issues DeleteRolePolicy on the next + // reconcile cycle. + if len(inlinePolicies) > 0 { + inlineMap := make(map[string]any, len(inlinePolicies)) + for k, v := range inlinePolicies { + inlineMap[k] = v + } + mustSetNestedField(u, inlineMap, "spec", "inlinePolicies") + } mustSetNestedSlice(u, awsTagsAsSlice(bootstrapperResourceTags(owner, nil)), "spec", "tags") return nil }) @@ -2439,6 +2462,30 @@ func readTopologyStringSlice(cluster *clusterv1.Cluster, variableName string) ([ return nil, false, nil } +// readTopologyStringMap reads a ClusterClass topology variable that is shaped +// as map[string]string and returns the decoded value. It mirrors +// readTopologyStringSlice for sister variables whose schema is +// (type: object, additionalProperties: type: string) — used by APTH-1704 for +// karpenter-node-role-additional-inline-policies, where each map entry is an +// IAM inline policy name → JSON document string. +func readTopologyStringMap(cluster *clusterv1.Cluster, variableName string) (map[string]string, bool, error) { + if cluster == nil || !cluster.Spec.Topology.IsDefined() { + return nil, false, nil + } + for i := range cluster.Spec.Topology.Variables { + v := cluster.Spec.Topology.Variables[i] + if v.Name != variableName { + continue + } + var out map[string]string + if err := json.Unmarshal(v.Value.Raw, &out); err != nil { + return nil, false, fmt.Errorf("unmarshal topology variable %q: %w", variableName, err) + } + return out, true, nil + } + return nil, false, nil +} + func (r *EKSKarpenterBootstrapperReconciler) ensureTopologyStringSliceVariable(ctx context.Context, cluster *clusterv1.Cluster, variableName string, desired []string) (bool, error) { if r == nil || cluster == nil { return false, nil diff --git a/internal/controller/plugin/eks/karpenter_bootstrapper_controller_test.go b/internal/controller/plugin/eks/karpenter_bootstrapper_controller_test.go index 36ad223..96e860d 100644 --- a/internal/controller/plugin/eks/karpenter_bootstrapper_controller_test.go +++ b/internal/controller/plugin/eks/karpenter_bootstrapper_controller_test.go @@ -443,7 +443,7 @@ func TestEKSKarpenterBootstrapperReconciler_EnsureACKResources_CreateExpectedSpe t.Fatalf("irsa role spec.policyRefs len = %d, want 1", len(refs)) } - if ok, err := r.ensureIAMRoleForEC2(context.Background(), cluster, "demo-karpenter-node", "ap-northeast-1", testEKSNodeRoleName, []string{"arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"}); err != nil { + if ok, err := r.ensureIAMRoleForEC2(context.Background(), cluster, "demo-karpenter-node", "ap-northeast-1", testEKSNodeRoleName, []string{"arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"}, nil); err != nil { t.Fatalf("ensureIAMRoleForEC2() error = %v", err) } else if !ok { t.Fatalf("ensureIAMRoleForEC2() managed = false, want true") @@ -459,6 +459,14 @@ func TestEKSKarpenterBootstrapperReconciler_EnsureACKResources_CreateExpectedSpe } else if len(got) != 1 || got[0] != "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" { t.Fatalf("ec2 role spec.policies = %#v", got) } + // APTH-1704: with nil inlinePolicies, spec.inlinePolicies must remain + // absent so that existing ACK Role CRs are byte-identical to the v0.3.x + // shape. + if _, found, err := unstructured.NestedMap(ec2Role.Object, "spec", "inlinePolicies"); err != nil { + t.Fatalf("ec2 role spec.inlinePolicies: %v", err) + } else if found { + t.Fatalf("ec2 role spec.inlinePolicies = present, want absent for nil input") + } if ok, err := r.ensureIAMInstanceProfile(context.Background(), cluster, "demo-karpenter-node-instance-profile", "ap-northeast-1", testEKSNodeRoleName, "demo-karpenter-node"); err != nil { t.Fatalf("ensureIAMInstanceProfile() error = %v", err) } else if !ok { @@ -554,6 +562,7 @@ func TestEKSKarpenterBootstrapperReconciler_EnsureIAMRoleForEC2_TakeoverWhenExpl "ap-northeast-1", testEKSNodeRoleName, []string{"arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"}, + nil, ) if err != nil { t.Fatalf("ensureIAMRoleForEC2() error = %v", err) @@ -573,6 +582,130 @@ func TestEKSKarpenterBootstrapperReconciler_EnsureIAMRoleForEC2_TakeoverWhenExpl } } +// TestEKSKarpenterBootstrapperReconciler_EnsureIAMRoleForEC2_InlinePoliciesDeclaredOnSpec +// verifies the APTH-1704 root fix: when the caller passes a non-empty +// inlinePolicies map, the writer materializes it into spec.inlinePolicies on +// the ACK Role CR so that ACK + kro reconcile sees the policies as declarative +// state (not drift). Without this assertion, the ACK IAM controller would +// issue DeleteRolePolicy on every reconcile cycle (~6.5h period observed in +// pmc-local CloudTrail). +func TestEKSKarpenterBootstrapperReconciler_EnsureIAMRoleForEC2_InlinePoliciesDeclaredOnSpec(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + utilruntime.Must(clusterv1.AddToScheme(scheme)) + + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: testClusterName, + Namespace: "default", + UID: "cluster-uid", + }, + } + + c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cluster).Build() + r := &EKSKarpenterBootstrapperReconciler{Client: c, Scheme: scheme} + + certManagerPolicy := `{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["route53:GetChange"],"Resource":"arn:aws:route53:::change/*"}]}` + externalDNSPolicy := `{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["route53:ListHostedZones"],"Resource":"*"}]}` + + ok, err := r.ensureIAMRoleForEC2( + context.Background(), + cluster, + "demo-karpenter-node", + "ap-northeast-1", + testEKSNodeRoleName, + []string{"arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"}, + map[string]string{ + "cert-manager-route53-dns01-txt-only": certManagerPolicy, + "external-dns-route53-record-management": externalDNSPolicy, + }, + ) + if err != nil { + t.Fatalf("ensureIAMRoleForEC2() error = %v", err) + } + if !ok { + t.Fatalf("ensureIAMRoleForEC2() managed = false, want true") + } + + got := getUnstructured(t, c, ackIAMRoleGVK, "demo-karpenter-node") + // Managed policies (existing path) must still be set independently of + // inlinePolicies — the two fields are independent on the ACK Role CR. + if policies, _, err := unstructured.NestedStringSlice(got.Object, "spec", "policies"); err != nil { + t.Fatalf("role spec.policies: %v", err) + } else if len(policies) != 1 || policies[0] != "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" { + t.Fatalf("role spec.policies = %#v, want managed policy preserved", policies) + } + // spec.inlinePolicies must be present and contain both entries with the + // JSON document strings preserved verbatim (ACK schema = map[string]string). + inline, found, err := unstructured.NestedMap(got.Object, "spec", "inlinePolicies") + if err != nil { + t.Fatalf("role spec.inlinePolicies: %v", err) + } + if !found { + t.Fatalf("role spec.inlinePolicies missing, want both inline entries declared") + } + if got, want := inline["cert-manager-route53-dns01-txt-only"], certManagerPolicy; got != want { + t.Fatalf("inlinePolicies[cert-manager] = %v, want %q", got, want) + } + if got, want := inline["external-dns-route53-record-management"], externalDNSPolicy; got != want { + t.Fatalf("inlinePolicies[external-dns] = %v, want %q", got, want) + } + if len(inline) != 2 { + t.Fatalf("inlinePolicies len = %d, want 2", len(inline)) + } +} + +// TestReadTopologyStringMap_RoundTripsClusterClassVariable verifies that the +// new helper introduced for APTH-1704 decodes a map[string]string topology +// variable identically to readTopologyStringSlice's []string sister. boot +// scripts (boot-single.sh) emit a YAML object whose values are JSON-document +// strings; the helper must preserve those strings byte-for-byte for the writer +// to forward them to spec.inlinePolicies on the ACK Role CR. +func TestReadTopologyStringMap_RoundTripsClusterClassVariable(t *testing.T) { + t.Parallel() + + rawJSON := `{"cert-manager-route53-dns01-txt-only":"{\"Version\":\"2012-10-17\"}","external-dns-route53-record-management":"{\"Version\":\"2012-10-17\",\"Statement\":[]}"}` + inlineVar := clusterv1.ClusterVariable{Name: topologyNodeRoleAdditionalInlinePoliciesVariableName} + inlineVar.Value.Raw = []byte(rawJSON) + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{Name: testClusterName, Namespace: "default"}, + Spec: clusterv1.ClusterSpec{ + Topology: clusterv1.Topology{ + ClassRef: clusterv1.ClusterClassRef{Name: "kany8s-eks-byo"}, + Variables: []clusterv1.ClusterVariable{inlineVar}, + }, + }, + } + + got, ok, err := readTopologyStringMap(cluster, topologyNodeRoleAdditionalInlinePoliciesVariableName) + if err != nil { + t.Fatalf("readTopologyStringMap() error = %v", err) + } + if !ok { + t.Fatalf("readTopologyStringMap() ok = false, want true") + } + if len(got) != 2 { + t.Fatalf("readTopologyStringMap() len = %d, want 2", len(got)) + } + if v := got["cert-manager-route53-dns01-txt-only"]; v != `{"Version":"2012-10-17"}` { + t.Fatalf("cert-manager value = %q, want JSON document preserved verbatim", v) + } + if v := got["external-dns-route53-record-management"]; v != `{"Version":"2012-10-17","Statement":[]}` { + t.Fatalf("external-dns value = %q, want JSON document preserved verbatim", v) + } + + // Sister assertion: readTopologyStringMap on a non-existent variable name + // returns (nil, false, nil) — same shape as readTopologyStringSlice — so + // production cluster specs that omit the variable see no inline policies + // (= production side-effect-zero invariant). + if missing, ok, err := readTopologyStringMap(cluster, "non-existent-variable"); err != nil { + t.Fatalf("readTopologyStringMap(missing) error = %v", err) + } else if ok || missing != nil { + t.Fatalf("readTopologyStringMap(missing) = (%v, %v), want (nil, false)", missing, ok) + } +} + func TestEKSKarpenterBootstrapperReconciler_EnsureIAMPolicy_InvalidExistingShapeReturnsError(t *testing.T) { t.Parallel()