From 1e8722e6687edc63e4019a8eda3dcbe57331e83d Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 1 Jun 2026 17:22:33 -0500 Subject: [PATCH 1/8] fix: prevent orphaned cell companions when PropagationPolicy is deleted before Karmada finishes GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part 1 — ordering guard: WorkloadDeploymentFederator.Finalize now checks whether any companion ConfigMaps or Secrets (referenced-data=true label) remain in the downstream namespace before calling cleanupPropagationPolicyIfUnused. The guard only fires when this is the last WD for its city code (mirroring cleanupPropagationPolicyIfUnused's condition), so deleting WD-A cannot block on a live WD-B's companion in the shared namespace. If companions are present Finalize returns an errCompanionsStillPresent sentinel; Reconcile intercepts it (walking the kerrors.Aggregate the finalizer framework returns), logs at Info, and sets RequeueAfter — no error-metric inflation. After companionGuardTimeout (2 min) the guard bypasses itself so a wedged referenced-data controller cannot permanently block deletion. Part 2 — authoritative cell-side GC: CompanionGCReconciler watches WorkloadDeployment events on each cell and deletes companion ConfigMaps/Secrets whose every referenced-by entry resolves to no live WD on the local cell. Critical fix: the referenced-by annotation is written by the hub ReferencedDataController as "projectNamespace/wdName" (e.g. "default/mount-pristine-default-dfw"), but the cell WD lives in ns-{uid}. hasLiveReferrer now ignores the namespace in the key and looks up by NAME ONLY in the companion's own namespace — preventing false "referrer absent" conclusions that would delete an actively-mounted companion. SetupWithManager uses WithEngageWithLocalCluster(false) to match all other cell controllers. Co-Authored-By: Claude Opus 4.8 (1M context) (cherry picked from commit f97a3cb65c6f3fa343b90b15f0a4dd65d8e84f6f) --- cmd/main.go | 7 + .../controller/companion_gc_controller.go | 328 +++++++ .../companion_gc_controller_test.go | 814 ++++++++++++++++++ .../workloaddeployment_controller_test.go | 14 +- .../workloaddeployment_federator.go | 137 +++ 5 files changed, 1291 insertions(+), 9 deletions(-) create mode 100644 internal/controller/companion_gc_controller.go create mode 100644 internal/controller/companion_gc_controller_test.go diff --git a/cmd/main.go b/cmd/main.go index 64240f01..b2373b80 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -303,6 +303,13 @@ func main() { } } + if enableCellControllers { + if err = (&controller.CompanionGCReconciler{}).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "CompanionGC") + os.Exit(1) + } + } + // The fail-loud guard above ensures federationRestConfig is non-nil when // management controllers are enabled; the nil check here is defensive. if enableManagementControllers && federationRestConfig != nil { diff --git a/internal/controller/companion_gc_controller.go b/internal/controller/companion_gc_controller.go new file mode 100644 index 00000000..3b8df64d --- /dev/null +++ b/internal/controller/companion_gc_controller.go @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "encoding/json" + "fmt" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// CompanionGCReconciler runs on each cell cluster and deletes orphaned companion +// ConfigMaps and Secrets whose WorkloadDeployment referrers are gone from that +// cell. It is the authoritative GC path that does not depend on Karmada's +// ResourceBinding cascade completing correctly. +// +// The reconciler is triggered by WorkloadDeployment events. On each trigger it +// lists all companions in the WD's namespace (by the referenced-data=true +// label), parses the referenced-by annotation on each companion, and deletes +// any companion for which every listed WD name is absent from the local cell +// namespace. +// +// Per-cell multi-referrer safety: the referenced-by annotation is written by +// the hub-side ReferencedDataController using PROJECT-plane namespace keys +// (e.g. "default/mount-pristine-default-dfw"). On the cell the WD lives in +// ns-{project-uid}, not "default". hasLiveReferrer therefore looks up WDs by +// name only, in the companion's own namespace. This means: +// +// - A WD on a different cell is never present locally → counted absent +// → companion on that cell is correctly deleted when its own local WD is +// also gone. +// - A WD on this cell is present → companion preserved → correct. +type CompanionGCReconciler struct { + mgr mcmanager.Manager +} + +// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;delete +// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;delete +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch + +// Reconcile is invoked for each companion ConfigMap or Secret that carries the +// referenced-data=true label. It deletes the companion when every WD listed in +// its referenced-by annotation is absent from the same namespace on this cell. +func (r *CompanionGCReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx).WithValues( + "namespace", req.Namespace, + "name", req.Name, + ) + + cl, err := r.mgr.GetCluster(ctx, req.ClusterName) + if err != nil { + return ctrl.Result{}, err + } + ctx = mccontext.WithCluster(ctx, req.ClusterName) + cellClient := cl.GetClient() + + return ctrl.Result{}, r.reconcileCompanion(ctx, cellClient, req.NamespacedName, logger) +} + +// reconcileCompanion checks one ConfigMap-or-Secret companion and deletes it +// when safe. The function tries ConfigMap first; if not found it tries Secret. +func (r *CompanionGCReconciler) reconcileCompanion( + ctx context.Context, + cellClient client.Client, + key types.NamespacedName, + logger interface{ Info(string, ...any) }, +) error { + // Try ConfigMap. + var cm corev1.ConfigMap + err := cellClient.Get(ctx, key, &cm) + if err == nil { + return r.maybeDeleteConfigMap(ctx, cellClient, &cm, logger) + } + if !apierrors.IsNotFound(err) { + return fmt.Errorf("get ConfigMap %s: %w", key, err) + } + + // Not a ConfigMap — try Secret. + var secret corev1.Secret + err = cellClient.Get(ctx, key, &secret) + if err == nil { + return r.maybeDeleteSecret(ctx, cellClient, &secret, logger) + } + if !apierrors.IsNotFound(err) { + return fmt.Errorf("get Secret %s: %w", key, err) + } + + // Object already gone — nothing to do. + return nil +} + +func (r *CompanionGCReconciler) maybeDeleteConfigMap( + ctx context.Context, + cellClient client.Client, + cm *corev1.ConfigMap, + logger interface{ Info(string, ...any) }, +) error { + if !isCompanion(cm) { + return nil + } + alive, err := r.hasLiveReferrer(ctx, cellClient, cm.Namespace, cm.Annotations) + if err != nil { + return err + } + if alive { + return nil + } + logger.Info("deleting orphaned companion ConfigMap", "name", cm.Name, "namespace", cm.Namespace) + if err := cellClient.Delete(ctx, cm); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("delete companion ConfigMap %s/%s: %w", cm.Namespace, cm.Name, err) + } + return nil +} + +func (r *CompanionGCReconciler) maybeDeleteSecret( + ctx context.Context, + cellClient client.Client, + secret *corev1.Secret, + logger interface{ Info(string, ...any) }, +) error { + if !isCompanion(secret) { + return nil + } + alive, err := r.hasLiveReferrer(ctx, cellClient, secret.Namespace, secret.Annotations) + if err != nil { + return err + } + if alive { + return nil + } + logger.Info("deleting orphaned companion Secret", "name", secret.Name, "namespace", secret.Namespace) + if err := cellClient.Delete(ctx, secret); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("delete companion Secret %s/%s: %w", secret.Namespace, secret.Name, err) + } + return nil +} + +// isCompanion reports whether the object carries the referenced-data=true label. +// The label is the authoritative signal that the companion was created by the +// referenced-data controller. Objects without it are not touched by GC. +func isCompanion(obj metav1.Object) bool { + labels := obj.GetLabels() + return labels[computev1alpha.ReferencedDataLabel] == computev1alpha.ReferencedDataLabelValue +} + +// hasLiveReferrer returns true when at least one WD listed in the referenced-by +// annotation still exists in the companion's namespace on this cell. +// +// The annotation is written by the hub-side ReferencedDataController as +// "projectNamespace/wdName" (e.g. "default/mount-pristine-default-dfw"). On +// the cell the WD lives in ns-{project-uid}, not in the project namespace. To +// find it we look up by NAME ONLY in the companion's own namespace — the cell +// WD name is always equal to the project WD name (set by +// upsertDownstreamDeployment). This also gives us the correct per-cell +// semantics: a WD that runs on a different cell is never present locally, so it +// contributes nothing to liveness on this cell. +// +// A companion is considered still needed if any referrer is present in any +// state (including terminating) to avoid premature deletion during the WD +// teardown window. +// +// Returns (false, nil) when the annotation is absent or empty. +// Returns (true, nil) when at least one referrer is found. +// Returns (true, err) when the annotation is corrupt or an API call fails so +// that the caller does NOT delete the companion on transient faults. +func (r *CompanionGCReconciler) hasLiveReferrer( + ctx context.Context, + cellClient client.Client, + companionNamespace string, + annotations map[string]string, +) (bool, error) { + wdKeys, err := decodeCompanionRefCount(annotations) + if err != nil { + // Corrupt annotation: treat as "has live referrer" to avoid accidental GC. + return true, err + } + if len(wdKeys) == 0 { + return false, nil + } + + for _, key := range wdKeys { + wdName := wdNameFromKey(key) + if wdName == "" { + // Malformed key: conservatively assume the referrer is alive. + return true, nil + } + + // Look up by name in the companion's namespace. The annotation carries the + // PROJECT namespace as the key prefix, but the cell WD lives in + // ns-{project-uid} — the same namespace the companion is in. + var wd computev1alpha.WorkloadDeployment + err := cellClient.Get(ctx, types.NamespacedName{Namespace: companionNamespace, Name: wdName}, &wd) + if err == nil { + // Referrer exists (any state — including terminating). Companion stays. + return true, nil + } + if !apierrors.IsNotFound(err) { + return true, fmt.Errorf("get WorkloadDeployment %s/%s: %w", companionNamespace, wdName, err) + } + // Not found — this referrer is gone; continue checking the rest. + } + + // Every listed referrer is absent from this cell. + return false, nil +} + +// wdNameFromKey extracts the WD name from a "namespace/name" annotation key. +// The referenced-by annotation always uses "projectNamespace/wdName" format; +// we want only the name portion, which is the part after the last slash. +// Returns "" for an empty key (caller should treat as malformed). +func wdNameFromKey(key string) string { + if key == "" { + return "" + } + for i := len(key) - 1; i >= 0; i-- { + if key[i] == '/' { + return key[i+1:] + } + } + // No slash — the whole string is the name. + return key +} + +// decodeCompanionRefCount parses the companionRefCountAnnotation value into a +// slice of WD keys. It is a cell-local copy of the hub-side decodeRefCount so +// the GC reconciler does not share internal state with the +// ReferencedDataController. The annotation format is identical: a JSON array. +// +// Returns (nil, nil) when the annotation is absent or empty. +// Returns an error when the annotation value is present but cannot be parsed. +func decodeCompanionRefCount(annotations map[string]string) ([]string, error) { + raw, ok := annotations[companionRefCountAnnotation] + if !ok || raw == "" { + return nil, nil + } + var keys []string + if err := json.Unmarshal([]byte(raw), &keys); err != nil { + return nil, fmt.Errorf("companion-gc: corrupt ref-count annotation %q: %w", raw, err) + } + return keys, nil +} + +// SetupWithManager registers the CompanionGCReconciler with the multicluster +// manager. It should only be called when cell controllers are enabled +// (--enable-cell-controllers). The reconciler watches ConfigMaps (its primary +// For type) and uses WorkloadDeployment events to enqueue companion objects in +// the same namespace. Secret companions are discovered at reconcile time via +// Get, not through a separate For/Watches registration, because both kinds map +// to the same reconcile loop. +func (r *CompanionGCReconciler) SetupWithManager(mgr mcmanager.Manager) error { + r.mgr = mgr + + return mcbuilder.ControllerManagedBy(mgr). + For(&corev1.ConfigMap{}, mcbuilder.WithEngageWithLocalCluster(false)). + // When a WorkloadDeployment changes (including deletion), re-enqueue all + // companion ConfigMaps and Secrets in the same namespace so the GC + // reconciler can decide whether they are still referenced. + Watches(&computev1alpha.WorkloadDeployment{}, + func(_ multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []mcreconcile.Request { + return enqueueCompanionsForNamespace(ctx, cl.GetClient(), obj.GetNamespace()) + }) + }, + mcbuilder.WithEngageWithLocalCluster(false), + ). + Named("companion-gc"). + Complete(r) +} + +// enqueueCompanionsForNamespace returns mcreconcile.Requests for every companion +// ConfigMap and Secret in the given namespace. It is called from the WD watch +// handler so the GC reconciler is triggered whenever a WD changes in a namespace +// that may contain companions. +func enqueueCompanionsForNamespace(ctx context.Context, cellClient client.Client, namespace string) []mcreconcile.Request { + companionLabel := client.MatchingLabels{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + } + nsSelector := client.InNamespace(namespace) + + var reqs []mcreconcile.Request + + var cms corev1.ConfigMapList + if err := cellClient.List(ctx, &cms, nsSelector, companionLabel); err == nil { + for i := range cms.Items { + reqs = append(reqs, mcreconcile.Request{ + Request: reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: cms.Items[i].Namespace, + Name: cms.Items[i].Name, + }, + }, + }) + } + } + + var secrets corev1.SecretList + if err := cellClient.List(ctx, &secrets, nsSelector, companionLabel); err == nil { + for i := range secrets.Items { + reqs = append(reqs, mcreconcile.Request{ + Request: reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: secrets.Items[i].Namespace, + Name: secrets.Items[i].Name, + }, + }, + }) + } + } + + return reqs +} diff --git a/internal/controller/companion_gc_controller_test.go b/internal/controller/companion_gc_controller_test.go new file mode 100644 index 00000000..cfc4d5b6 --- /dev/null +++ b/internal/controller/companion_gc_controller_test.go @@ -0,0 +1,814 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "encoding/json" + "testing" + "time" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Test constants ──────────────────────────────────────────────────────────── + +const ( + gcTestCluster = "cell-cluster" + + // gcTestNamespace is the cell-side namespace (ns-{project-uid}) where + // companion objects and the cell WorkloadDeployment reside. + gcTestNamespace = "ns-aabbccdd-0000-1111-2222-333344445555" + + // gcTestProjectNamespace is the project-plane namespace that the + // ReferencedDataController uses when writing referenced-by annotation keys. + // In production keys are written as "projectNamespace/wdName", e.g. + // "default/mount-pristine-default-dfw". + gcTestProjectNamespace = "default" + + gcTestCMName = "cm-pristine" + gcTestSecName = "secret-pristine" + gcTestWD1Name = "mount-pristine-default-dfw" + gcTestWD2Name = "mount-pristine-default-lax" + + // gcTestDefaultWD1Key is a sample production-format referenced-by annotation + // key used in decode and name-extraction tests ("projectNamespace/wdName"). + gcTestDefaultWD1Key = "default/wd-1" +) + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +// newCellFakeClient returns a fake client with the project scheme (corev1 + compute). +func newCellFakeClient(objs ...client.Object) client.Client { + return newProjectFakeClient(objs...) +} + +// companionCM builds a companion ConfigMap named gcTestCMName in gcTestNamespace +// with the referenced-data=true label and the given WD keys in the +// referenced-by annotation. +func companionCM(wdKeys []string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: gcTestCMName, + Namespace: gcTestNamespace, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + Annotations: map[string]string{ + companionRefCountAnnotation: mustEncodeRefCount(wdKeys), + }, + }, + } +} + +// companionSecret builds a companion Secret with the referenced-data=true label +// and the given WD keys in the referenced-by annotation. +func companionSecret(name, namespace string, wdKeys []string) *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + Annotations: map[string]string{ + companionRefCountAnnotation: mustEncodeRefCount(wdKeys), + }, + }, + } +} + +// plainCM builds a ConfigMap WITHOUT the referenced-data label. +func plainCM(name, namespace string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, + } +} + +// cellWD builds a WorkloadDeployment in the cell namespace (ns-{uid}). +func cellWD(name, namespace string) *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: wbTestCityCode, + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: "workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } +} + +// cellWDTerminating builds a WorkloadDeployment with a non-zero DeletionTimestamp. +func cellWDTerminating(name, namespace string) *computev1alpha.WorkloadDeployment { + wd := cellWD(name, namespace) + ts := metav1.NewTime(time.Now().Add(-5 * time.Second)) + wd.DeletionTimestamp = &ts + wd.Finalizers = []string{"test-finalizer"} // required by fake client for terminating objects + return wd +} + +// mustEncodeRefCount serialises a slice of WD keys as a JSON array. +func mustEncodeRefCount(keys []string) string { + if len(keys) == 0 { + return "[]" + } + b, err := json.Marshal(keys) + if err != nil { + panic("mustEncodeRefCount: " + err.Error()) + } + return string(b) +} + +// prodRefKey returns the production-format referenced-by annotation key for a WD. +// In production the ReferencedDataController writes "projectNamespace/wdName" +// (e.g. "default/mount-pristine-default-dfw"), NOT the cell namespace. +func prodRefKey(wdName string) string { + return gcTestProjectNamespace + "/" + wdName +} + +// newGCReconciler builds a CompanionGCReconciler wired to a fakeMCManager +// backed by the given cell client. +func newGCReconciler(cellClient client.Client) *CompanionGCReconciler { + cl := newFakeCluster(cellClient) + mgr := newFakeMCManager(gcTestCluster, cl) + return &CompanionGCReconciler{mgr: mgr} +} + +// reconcileGC runs one GC reconcile for the named object in gcTestNamespace. +func reconcileGC(t *testing.T, r *CompanionGCReconciler, name string) (ctrl.Result, error) { + t.Helper() + ctx := mccontext.WithCluster(context.Background(), multicluster.ClusterName(gcTestCluster)) + return r.Reconcile(ctx, mcreconcile.Request{ + ClusterName: multicluster.ClusterName(gcTestCluster), + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: gcTestNamespace, Name: name}, + }, + }) +} + +// ─── decodeCompanionRefCount unit tests ─────────────────────────────────────── + +func TestDecodeCompanionRefCount(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + annotations map[string]string + wantKeys []string + wantErr bool + }{ + { + name: "absent annotation returns nil", + annotations: nil, + wantKeys: nil, + }, + { + name: "empty annotation returns nil", + annotations: map[string]string{companionRefCountAnnotation: ""}, + wantKeys: nil, + }, + { + name: "single key", + annotations: map[string]string{companionRefCountAnnotation: `["` + gcTestDefaultWD1Key + `"]`}, + wantKeys: []string{gcTestDefaultWD1Key}, + }, + { + name: "multiple keys", + annotations: map[string]string{companionRefCountAnnotation: `["` + gcTestDefaultWD1Key + `","default/wd-2"]`}, + wantKeys: []string{gcTestDefaultWD1Key, "default/wd-2"}, + }, + { + name: "corrupt annotation returns error", + annotations: map[string]string{companionRefCountAnnotation: `not-json`}, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got, err := decodeCompanionRefCount(tt.annotations) + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + assert.Equal(t, tt.wantKeys, got) + }) + } +} + +// ─── wdNameFromKey unit tests ───────────────────────────────────────────────── + +func TestWdNameFromKey(t *testing.T) { + t.Parallel() + + tests := []struct { + input string + want string + }{ + {gcTestDefaultWD1Key, "wd-1"}, + {"ns/sub/name", "name"}, // last slash wins + {"wd-only", "wd-only"}, + {"", ""}, + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + t.Parallel() + got := wdNameFromKey(tt.input) + assert.Equal(t, tt.want, got) + }) + } +} + +// ─── hasLiveReferrer unit tests ─────────────────────────────────────────────── + +func TestHasLiveReferrer(t *testing.T) { + t.Parallel() + + // Production-format key: project namespace / WD name. + // The WD object lives in gcTestNamespace on the cell. + wdKey := prodRefKey(gcTestWD1Name) + + tests := []struct { + name string + annotations map[string]string + cellObjs []client.Object + wantAlive bool + wantErr bool + }{ + { + name: "no annotation → no referrers → safe to delete", + annotations: nil, + wantAlive: false, + }, + { + name: "empty ref-count → safe to delete", + annotations: map[string]string{companionRefCountAnnotation: "[]"}, + wantAlive: false, + }, + { + // BLOCKER 1 regression: key uses project namespace ("default/…") but + // the WD lives in the cell namespace (gcTestNamespace). hasLiveReferrer + // must look up by name only in the companion's namespace, not by the + // project namespace encoded in the key. + name: "project-namespace key — WD in cell ns — companion preserved", + annotations: map[string]string{companionRefCountAnnotation: mustEncodeRefCount([]string{wdKey})}, + cellObjs: []client.Object{cellWD(gcTestWD1Name, gcTestNamespace)}, + wantAlive: true, + }, + { + name: "referrer is terminating → still counts as present → keep", + annotations: map[string]string{companionRefCountAnnotation: mustEncodeRefCount([]string{wdKey})}, + cellObjs: []client.Object{cellWDTerminating(gcTestWD1Name, gcTestNamespace)}, + wantAlive: true, + }, + { + name: "referrer absent → delete", + annotations: map[string]string{companionRefCountAnnotation: mustEncodeRefCount([]string{wdKey})}, + cellObjs: nil, // WD not on this cell + wantAlive: false, + }, + { + // Multi-referrer with production-format keys: WD1 is on another cell + // (not present locally), WD2 IS on this cell → companion kept. + name: "multi-referrer: one absent (other cell), one present local → keep", + annotations: map[string]string{ + companionRefCountAnnotation: mustEncodeRefCount([]string{ + prodRefKey(gcTestWD1Name), + prodRefKey(gcTestWD2Name), + }), + }, + // WD2 is on this cell; WD1 is on another cell and not visible here. + cellObjs: []client.Object{cellWD(gcTestWD2Name, gcTestNamespace)}, + wantAlive: true, + }, + { + name: "multi-referrer: both absent → delete", + annotations: map[string]string{ + companionRefCountAnnotation: mustEncodeRefCount([]string{ + prodRefKey(gcTestWD1Name), + prodRefKey(gcTestWD2Name), + }), + }, + cellObjs: nil, + wantAlive: false, + }, + { + name: "corrupt annotation → error returned", + annotations: map[string]string{companionRefCountAnnotation: "corrupt"}, + wantErr: true, + wantAlive: true, // conservative: treat as alive on error + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + cellClient := newCellFakeClient(tt.cellObjs...) + r := &CompanionGCReconciler{} + + alive, err := r.hasLiveReferrer(context.Background(), cellClient, gcTestNamespace, tt.annotations) + + if tt.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + assert.Equal(t, tt.wantAlive, alive) + }) + } +} + +// ─── CompanionGCReconciler.Reconcile tests ──────────────────────────────────── + +// TestCompanionGC_ConfigMap_DeletedWhenAllReferrersAbsent verifies that a +// companion ConfigMap is deleted when the WD listed in its referenced-by +// annotation (using production "projectNS/name" format) is absent from the cell. +func TestCompanionGC_ConfigMap_DeletedWhenAllReferrersAbsent(t *testing.T) { + t.Parallel() + + // Production-format key: project namespace, but no WD of that name on this cell. + cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) + cellClient := newCellFakeClient(cm) + r := newGCReconciler(cellClient) + + result, err := reconcileGC(t, r, gcTestCMName) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + var got corev1.ConfigMap + err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got) + assert.True(t, apierrors.IsNotFound(err), "companion ConfigMap should be deleted when all referrers are absent") +} + +// TestCompanionGC_Secret_DeletedWhenAllReferrersAbsent verifies that a companion +// Secret is deleted when the WD listed in its annotation does not exist on the cell. +func TestCompanionGC_Secret_DeletedWhenAllReferrersAbsent(t *testing.T) { + t.Parallel() + + secret := companionSecret(gcTestSecName, gcTestNamespace, []string{prodRefKey(gcTestWD1Name)}) + cellClient := newCellFakeClient(secret) + r := newGCReconciler(cellClient) + + result, err := reconcileGC(t, r, gcTestSecName) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + var got corev1.Secret + err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestSecName}, &got) + assert.True(t, apierrors.IsNotFound(err), "companion Secret should be deleted when all referrers are absent") +} + +// TestCompanionGC_ProjectNamespaceKey_PreservesCompanionWithLiveLocalWD is the +// Blocker 1 regression test. The referenced-by annotation key is written by the +// hub ReferencedDataController as "projectNS/wdName" (e.g. +// "default/mount-pristine-default-dfw"). The cell WD lives in ns-{uid}, NOT in +// "default". The GC reconciler must find the WD by name in the companion's own +// namespace — using the project namespace from the key would always NotFound and +// incorrectly delete a companion that is actively in use. +func TestCompanionGC_ProjectNamespaceKey_PreservesCompanionWithLiveLocalWD(t *testing.T) { + t.Parallel() + + // Annotation uses project namespace ("default"), exactly as production writes it. + wdKey := prodRefKey(gcTestWD1Name) // "default/mount-pristine-default-dfw" + cm := companionCM([]string{wdKey}) + + // The WD lives in the CELL namespace (ns-{uid}), not "default". + wd := cellWD(gcTestWD1Name, gcTestNamespace) + cellClient := newCellFakeClient(cm, wd) + r := newGCReconciler(cellClient) + + _, err := reconcileGC(t, r, gcTestCMName) + require.NoError(t, err) + + // Companion must NOT be deleted — the WD exists on this cell. + var got corev1.ConfigMap + require.NoError(t, cellClient.Get(context.Background(), + types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got), + "companion must be preserved when WD with matching name exists in cell namespace, even though annotation key uses project namespace") +} + +// TestCompanionGC_PreservedWhenLiveReferrerExists verifies that a companion is +// NOT deleted when a live (non-terminating) WD is present on the cell. +func TestCompanionGC_PreservedWhenLiveReferrerExists(t *testing.T) { + t.Parallel() + + cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) + wd := cellWD(gcTestWD1Name, gcTestNamespace) + cellClient := newCellFakeClient(cm, wd) + r := newGCReconciler(cellClient) + + _, err := reconcileGC(t, r, gcTestCMName) + require.NoError(t, err) + + var got corev1.ConfigMap + require.NoError(t, cellClient.Get(context.Background(), + types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got)) +} + +// TestCompanionGC_PreservedWhenTerminatingReferrerExists verifies that a companion +// is NOT deleted when the only referrer on this cell is terminating. Terminating +// WDs still count as present — we are conservative to avoid premature deletion +// during the WD teardown window. +func TestCompanionGC_PreservedWhenTerminatingReferrerExists(t *testing.T) { + t.Parallel() + + cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) + wd := cellWDTerminating(gcTestWD1Name, gcTestNamespace) + cellClient := newCellFakeClient(cm, wd) + r := newGCReconciler(cellClient) + + _, err := reconcileGC(t, r, gcTestCMName) + require.NoError(t, err) + + var got corev1.ConfigMap + require.NoError(t, cellClient.Get(context.Background(), + types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got)) +} + +// TestCompanionGC_MultiReferrer_OneDifferentCell_OtherLiveLocal verifies the +// core per-cell multi-referrer safety property. The annotation (written by the +// hub) lists both WDs with project-namespace keys. WD1 is on another cell (not +// present locally). WD2 IS on this cell. Companion must be preserved. +func TestCompanionGC_MultiReferrer_OneDifferentCell_OtherLiveLocal(t *testing.T) { + t.Parallel() + + // Both keys use production format (project namespace prefix). + // WD1 is NOT present on this cell (simulating it running on another cell). + // WD2 IS present on this cell. + cm := companionCM([]string{ + prodRefKey(gcTestWD1Name), + prodRefKey(gcTestWD2Name), + }) + wd2 := cellWD(gcTestWD2Name, gcTestNamespace) + cellClient := newCellFakeClient(cm, wd2) + r := newGCReconciler(cellClient) + + _, err := reconcileGC(t, r, gcTestCMName) + require.NoError(t, err) + + // Companion must NOT have been deleted because WD2 is still alive on this cell. + var got corev1.ConfigMap + require.NoError(t, cellClient.Get(context.Background(), + types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got), + "companion must survive when at least one local referrer is present") +} + +// TestCompanionGC_MultiReferrer_AllAbsent_Deleted verifies that when BOTH WD +// names from the annotation are absent from this cell the companion is deleted. +func TestCompanionGC_MultiReferrer_AllAbsent_Deleted(t *testing.T) { + t.Parallel() + + cm := companionCM([]string{ + prodRefKey(gcTestWD1Name), + prodRefKey(gcTestWD2Name), + }) + // Neither WD is on this cell. + cellClient := newCellFakeClient(cm) + r := newGCReconciler(cellClient) + + _, err := reconcileGC(t, r, gcTestCMName) + require.NoError(t, err) + + var got corev1.ConfigMap + err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got) + assert.True(t, apierrors.IsNotFound(err), "companion must be deleted when all referrers are absent from this cell") +} + +// TestCompanionGC_EmptyRefCount_Deleted verifies that a companion with an empty +// (or missing) ref-count annotation is deleted — zero referrers means safe to remove. +func TestCompanionGC_EmptyRefCount_Deleted(t *testing.T) { + t.Parallel() + + cm := companionCM(nil) // empty ref-count + cellClient := newCellFakeClient(cm) + r := newGCReconciler(cellClient) + + _, err := reconcileGC(t, r, gcTestCMName) + require.NoError(t, err) + + var got corev1.ConfigMap + err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got) + assert.True(t, apierrors.IsNotFound(err), "companion with no referrers should be deleted") +} + +// TestCompanionGC_NonCompanionNotTouched verifies that a ConfigMap without the +// referenced-data=true label is not deleted by the GC reconciler. +func TestCompanionGC_NonCompanionNotTouched(t *testing.T) { + t.Parallel() + + cm := plainCM(gcTestCMName, gcTestNamespace) + cellClient := newCellFakeClient(cm) + r := newGCReconciler(cellClient) + + _, err := reconcileGC(t, r, gcTestCMName) + require.NoError(t, err) + + var got corev1.ConfigMap + require.NoError(t, cellClient.Get(context.Background(), + types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got), + "ConfigMap without referenced-data label must not be deleted") +} + +// TestCompanionGC_ObjectAlreadyGone verifies that reconciling a missing object +// is a no-op (no error). +func TestCompanionGC_ObjectAlreadyGone(t *testing.T) { + t.Parallel() + + cellClient := newCellFakeClient() // nothing pre-loaded + r := newGCReconciler(cellClient) + + _, err := reconcileGC(t, r, gcTestCMName) + require.NoError(t, err) +} + +// ─── Federator ordering guard tests ─────────────────────────────────────────── + +// TestFederator_Finalize_RequeuesWhenCompanionsPresent verifies that the +// Finalize method returns a RequeueAfter result (not an error) when companion +// ConfigMaps or Secrets still exist in the downstream namespace. The +// PropagationPolicy must not be deleted until all companions are gone. +func TestFederator_Finalize_RequeuesWhenCompanionsPresent(t *testing.T) { + t.Parallel() + + ppName := propagationPolicyNameFor(testCityCodeLAX) + + tests := []struct { + name string + extraObj client.Object // the companion that blocks PP deletion + }{ + { + name: "ConfigMap companion blocks PP deletion", + extraObj: &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cm-companion", + Namespace: testKarmadaNSStr, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + }, + }, + }, + { + name: "Secret companion blocks PP deletion", + extraObj: &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "secret-companion", + Namespace: testKarmadaNSStr, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + wd := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + + karmadaWD := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, + } + karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, + } + karmadaClient := newKarmadaFakeClient( + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, + karmadaWD, + karmadaPP, + tt.extraObj, + ) + + r := newTestFederator(projectClient, karmadaClient) + result, err := r.Reconcile(context.Background(), reconcileRequest()) + + // With a companion still present, Finalize must requeue gracefully — + // no error (which would inflate error metrics), just a RequeueAfter. + require.NoError(t, err, "Finalize should NOT return an error when companions are present; use RequeueAfter instead") + assert.Equal(t, companionGuardRequeueAfter, result.RequeueAfter, + "result should carry the companion-guard RequeueAfter delay") + + // PropagationPolicy must still be alive. + var remainingPP karmadapolicyv1alpha1.PropagationPolicy + require.NoError(t, karmadaClient.Get(context.Background(), + types.NamespacedName{Name: ppName, Namespace: testKarmadaNSStr}, + &remainingPP), + "PropagationPolicy must not be deleted while companions are still present") + }) + } +} + +// TestFederator_Finalize_DeletesPPWhenCompanionsGone verifies that once all +// companions are removed from the downstream namespace the PP is deleted as +// normal (ordering guard does not block a clean namespace). +func TestFederator_Finalize_DeletesPPWhenCompanionsGone(t *testing.T) { + t.Parallel() + + ppName := propagationPolicyNameFor(testCityCodeLAX) + wd := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + + karmadaWD := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, + } + karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, + } + // No companions in the namespace — ordering guard should pass. + karmadaClient := newKarmadaFakeClient( + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, + karmadaWD, + karmadaPP, + ) + + r := newTestFederator(projectClient, karmadaClient) + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // PP must be gone — no companions blocked the cleanup. + var remainingPP karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(context.Background(), + types.NamespacedName{Name: ppName, Namespace: testKarmadaNSStr}, + &remainingPP) + assert.True(t, apierrors.IsNotFound(err), "PropagationPolicy should be deleted when no companions remain") +} + +// TestFederator_Finalize_DoesNotBlockOnSiblingWDCompanion is the Blocker 2 +// regression test. The downstream namespace (ns-{uid}) is shared by ALL WDs in +// a project. When WD-A is deleted while WD-B's companion still exists in the +// same namespace, the ordering guard must NOT block WD-A's finalizer. Blocking +// would permanently deadlock WD-A's deletion because WD-B's companion is never +// going to be cleaned up by WD-A's referenced-data controller. +// +// The guard must only fire when the deleting WD is the LAST one for its city +// code (i.e. when the PP is actually about to be deleted). +func TestFederator_Finalize_DoesNotBlockOnSiblingWDCompanion(t *testing.T) { + t.Parallel() + + ppName := propagationPolicyNameFor(testCityCodeLAX) + + // WD-A is the one being deleted (has finalizer + deletionTimestamp). + wdA := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) + projectClient := newProjectFakeClient(testProjectNamespace(), wdA) + + // In the downstream namespace: + // - WD-A's mirrored object (just deleted above, but not yet GC'd from Karmada) + // - WD-B: a sibling WD with the SAME city code, still alive + // - A companion belonging to WD-B + // - The shared PropagationPolicy + karmadaWDA := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: rdTestWorkloadName}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } + karmadaWDB := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "wd-b", + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: "workload-b"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } + // WD-B's companion still lives in the shared namespace. + wdBCompanion := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cm-b-companion", + Namespace: testKarmadaNSStr, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + }, + } + karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, + } + karmadaClient := newKarmadaFakeClient( + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, + karmadaWDA, + karmadaWDB, + wdBCompanion, + karmadaPP, + ) + + r := newTestFederator(projectClient, karmadaClient) + result, err := r.Reconcile(context.Background(), reconcileRequest()) + + // WD-A's finalizer must complete without error even though WD-B's companion + // is still in the namespace. WD-B is still alive so the PP is kept anyway + // (cleanupPropagationPolicyIfUnused no-ops). The companion guard must NOT + // fire here. + require.NoError(t, err, "WD-A finalization must not be blocked by WD-B's companion") + assert.Equal(t, ctrl.Result{}, result) + + // The PropagationPolicy must still be alive (WD-B keeps it alive, not the guard). + var remainingPP karmadapolicyv1alpha1.PropagationPolicy + require.NoError(t, karmadaClient.Get(context.Background(), + types.NamespacedName{Name: ppName, Namespace: testKarmadaNSStr}, + &remainingPP), + "PropagationPolicy should be kept because WD-B still references the city code") +} + +// TestFederator_Finalize_GuardBypassesAfterTimeout verifies that the companion +// ordering guard stops blocking when the WD has been terminating for longer than +// companionGuardTimeout. This prevents a permanently wedged referenced-data +// controller from deadlocking WD deletion. +func TestFederator_Finalize_GuardBypassesAfterTimeout(t *testing.T) { + t.Parallel() + + ppName := propagationPolicyNameFor(testCityCodeLAX) + + // Set the DeletionTimestamp far enough in the past to exceed the guard timeout. + pastDeletion := metav1.NewTime(time.Now().Add(-(companionGuardTimeout + time.Minute))) + wd := testWorkloadDeployment(withFinalizer, func(w *computev1alpha.WorkloadDeployment) { + w.DeletionTimestamp = &pastDeletion + }) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + + karmadaWD := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: testWDName, Namespace: testKarmadaNSStr}, + } + karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: ppName, Namespace: testKarmadaNSStr}, + } + // A companion is still present — but the timeout should let us proceed anyway. + stuckCompanion := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "stuck-companion", + Namespace: testKarmadaNSStr, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + }, + } + karmadaClient := newKarmadaFakeClient( + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, + karmadaWD, + karmadaPP, + stuckCompanion, + ) + + r := newTestFederator(projectClient, karmadaClient) + result, err := r.Reconcile(context.Background(), reconcileRequest()) + + // After the timeout the guard is bypassed — finalization must complete. + require.NoError(t, err, "guard must bypass after timeout even with companion still present") + assert.Equal(t, ctrl.Result{}, result) + + // The PP must be deleted (last WD for the city, guard bypassed). + var remainingPP karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(context.Background(), + types.NamespacedName{Name: ppName, Namespace: testKarmadaNSStr}, + &remainingPP) + assert.True(t, apierrors.IsNotFound(err), "PropagationPolicy should be deleted after guard timeout bypass") +} diff --git a/internal/controller/workloaddeployment_controller_test.go b/internal/controller/workloaddeployment_controller_test.go index 45dfbea6..c2c3e13c 100644 --- a/internal/controller/workloaddeployment_controller_test.go +++ b/internal/controller/workloaddeployment_controller_test.go @@ -22,16 +22,12 @@ import ( ) const ( - // wdControllerTestName / wdControllerTestNS / wdControllerTestUID are shared - // fixtures for the WorkloadDeployment controller unit tests. + // wdControllerTestName, wdControllerTestNS, wdControllerTestUID are the + // stable identifiers used across the WorkloadDeployment controller tests. wdControllerTestName = "test-wd" - wdControllerTestNS = "default" + wdControllerTestNS = gcTestProjectNamespace // "default" wdControllerTestUID = "wd-uid-test" - // wdControllerTestCityCode is the shared CityCode fixture for - // WorkloadDeployment controller tests. - wdControllerTestCityCode = "DFW" - // wdControllerTestWorkload is the shared WorkloadRef fixture. wdControllerTestWorkload = "test-workload" @@ -52,7 +48,7 @@ func wdControllerTestDeployment(minReplicas int32) *computev1alpha.WorkloadDeplo UID: wdControllerTestUID, }, Spec: computev1alpha.WorkloadDeploymentSpec{ - CityCode: wdControllerTestCityCode, + CityCode: wbTestCityCode, PlacementName: testDefaultPlacement, WorkloadRef: computev1alpha.WorkloadReference{Name: wdControllerTestWorkload}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ @@ -202,7 +198,7 @@ func TestReconcileInstanceGates_NilSpecController_DoesNotPanic(t *testing.T) { deployment := &computev1alpha.WorkloadDeployment{ ObjectMeta: metav1.ObjectMeta{Name: wdControllerTestName, Namespace: wdControllerTestNS, UID: wdControllerTestUID}, Spec: computev1alpha.WorkloadDeploymentSpec{ - CityCode: wdControllerTestCityCode, + CityCode: wbTestCityCode, PlacementName: testDefaultPlacement, WorkloadRef: computev1alpha.WorkloadReference{Name: wdControllerTestWorkload}, ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go index 9d0d44ba..f45fc9fc 100644 --- a/internal/controller/workloaddeployment_federator.go +++ b/internal/controller/workloaddeployment_federator.go @@ -4,8 +4,10 @@ package controller import ( "context" + "errors" "fmt" "strings" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" @@ -13,6 +15,7 @@ import ( apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -48,8 +51,51 @@ const ( cityCodeLabel = "topology.datum.net/city-code" kindWorkloadDeployment = "WorkloadDeployment" + + // companionGuardRequeueAfter is the delay inserted between companion-guard + // requeue attempts. Long enough for the referenced-data controller to delete + // the hub companion; short enough that the deletion is not visibly delayed. + companionGuardRequeueAfter = 5 * time.Second + + // companionGuardTimeout is the maximum wall-clock time the companion ordering + // guard will block a WD deletion after its deletionTimestamp was set. After + // this timeout the guard is bypassed and a warning is logged. The cell-side + // CompanionGCReconciler is the authoritative cleanup path regardless. + companionGuardTimeout = 2 * time.Minute ) +// errCompanionsStillPresent is a sentinel returned by Finalize when hub +// companions are still present in the downstream namespace. Reconcile intercepts +// it, logs at Info (not Error), and returns RequeueAfter so the wait does not +// inflate the error-rate metric. +type errCompanionsStillPresent struct{ namespace string } + +func (e errCompanionsStillPresent) Error() string { + return fmt.Sprintf("companions still present in %s; waiting for referenced-data controller to finish cleanup", e.namespace) +} + +// findCompanionGuardSentinel searches err (including any kerrors.Aggregate +// members) for errCompanionsStillPresent. The finalizer framework wraps errors +// in a kerrors.Aggregate whose Errors() slice contains the per-finalizer +// wrapped errors; because Aggregate does not implement Unwrap() the standard +// errors.As cannot traverse it, so we walk manually. +func findCompanionGuardSentinel(err error) (errCompanionsStillPresent, bool) { + if agg, ok := err.(kerrors.Aggregate); ok { + for _, e := range agg.Errors() { + var sentinel errCompanionsStillPresent + if errors.As(e, &sentinel) { + return sentinel, true + } + } + return errCompanionsStillPresent{}, false + } + var sentinel errCompanionsStillPresent + if errors.As(err, &sentinel) { + return sentinel, true + } + return errCompanionsStillPresent{}, false +} + // WorkloadDeploymentFederator replicates WorkloadDeployments from project // namespaces into the downstream control plane so it can propagate them to the // appropriate POP-cell clusters. @@ -123,6 +169,16 @@ func (r *WorkloadDeploymentFederator) Reconcile(ctx context.Context, req mcrecon finalizationResult, err := r.finalizers.Finalize(ctx, &deployment) if err != nil { + // The finalizer framework returns a kerrors.Aggregate which does not + // implement Unwrap, so errors.As cannot traverse it directly. Walk the + // aggregate manually to find the companion-guard sentinel so we can + // requeue gracefully (Info log + RequeueAfter) instead of inflating the + // error-rate metric. + if waitErr, ok := findCompanionGuardSentinel(err); ok { + logger.Info("deferring PropagationPolicy cleanup: hub companions still present", + "downstreamNamespace", waitErr.namespace) + return ctrl.Result{RequeueAfter: companionGuardRequeueAfter}, nil + } return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) } if finalizationResult.Updated { @@ -210,6 +266,23 @@ func (r *WorkloadDeploymentFederator) Finalize(ctx context.Context, obj client.O } logger.Info("deleted downstream WorkloadDeployment", "downstreamNamespace", downstreamNS) + // The companion ordering guard only applies when this is the last WD for its + // city code — i.e. when cleanupPropagationPolicyIfUnused is actually about to + // delete the PP. If other WDs with the same city code still exist the PP is + // not deleted on this call anyway, so blocking here would deadlock any + // shared-namespace deletion (WD-A blocked by WD-B's companion, forever). + isLastForCity, err := r.isLastDeploymentForCity(ctx, downstreamNS, deployment.Spec.CityCode) + if err != nil { + return finalizer.Result{}, fmt.Errorf("failed to check city-code peers in %s: %w", downstreamNS, err) + } + if isLastForCity { + if blocked, err := r.companionsStillPresent(ctx, downstreamNS, deployment); err != nil { + return finalizer.Result{}, fmt.Errorf("failed to check for remaining companions in %s: %w", downstreamNS, err) + } else if blocked { + return finalizer.Result{}, errCompanionsStillPresent{namespace: downstreamNS} + } + } + if err := r.cleanupPropagationPolicyIfUnused(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { return finalizer.Result{}, err } @@ -217,6 +290,70 @@ func (r *WorkloadDeploymentFederator) Finalize(ctx context.Context, obj client.O return finalizer.Result{}, nil } +// isLastDeploymentForCity reports whether there are no other WorkloadDeployments +// with the same city code as cityCode remaining in downstreamNS (besides the one +// being deleted, which was already removed above). +func (r *WorkloadDeploymentFederator) isLastDeploymentForCity(ctx context.Context, downstreamNS, cityCode string) (bool, error) { + var remaining computev1alpha.WorkloadDeploymentList + if err := r.FederationClient.List(ctx, &remaining, + client.InNamespace(downstreamNS), + client.MatchingLabels{cityCodeLabel: cityCode}, + ); err != nil { + return false, fmt.Errorf("list downstream deployments for city %q: %w", cityCode, err) + } + return len(remaining.Items) == 0, nil +} + +// companionsStillPresent reports whether any companion ConfigMaps or Secrets +// (labeled referenced-data=true) remain in the given downstream namespace. +// Returns (true, nil) when companions are present, (false, nil) when the +// namespace is clear, and (false, err) on API failure. +// +// deployment is the project-plane WD being finalized. Its DeletionTimestamp is +// used to bound how long the guard can block: if the WD has been terminating +// for longer than companionGuardTimeout the guard is bypassed with a warning. +// The cell-side CompanionGCReconciler is the authoritative cleanup path and +// will eventually clean up regardless. +func (r *WorkloadDeploymentFederator) companionsStillPresent( + ctx context.Context, + downstreamNS string, + deployment *computev1alpha.WorkloadDeployment, +) (bool, error) { + // If the WD has been terminating longer than the timeout, skip the check so + // a wedged referenced-data controller cannot block deletion permanently. + if deployment.DeletionTimestamp != nil { + age := time.Since(deployment.DeletionTimestamp.Time) + if age > companionGuardTimeout { + log.FromContext(ctx).Info( + "companion-guard timeout reached; bypassing guard — cell-side GC will clean up remaining companions", + "age", age.Round(time.Second), + "timeout", companionGuardTimeout, + "downstreamNamespace", downstreamNS, + ) + return false, nil + } + } + + companionSelector := client.MatchingLabels{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + } + nsSelector := client.InNamespace(downstreamNS) + + var cms corev1.ConfigMapList + if err := r.FederationClient.List(ctx, &cms, nsSelector, companionSelector); err != nil { + return false, fmt.Errorf("list companion ConfigMaps: %w", err) + } + if len(cms.Items) > 0 { + return true, nil + } + + var secrets corev1.SecretList + if err := r.FederationClient.List(ctx, &secrets, nsSelector, companionSelector); err != nil { + return false, fmt.Errorf("list companion Secrets: %w", err) + } + return len(secrets.Items) > 0, nil +} + // ensureDownstreamNamespace creates or updates the downstream namespace, stamping // it with the upstream tracking labels that MappedNamespaceResourceStrategy uses. // This allows the InstanceProjector to resolve the project namespace name via a From fa3d74f6d43265e1af88beea91b40591efef2479 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 1 Jun 2026 18:45:01 -0500 Subject: [PATCH 2/8] fix: drive companion GC off WorkloadDeployment events to avoid cluster-wide informer OOM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CompanionGCReconciler previously registered For(&corev1.ConfigMap{}) which established a cluster-wide ConfigMap informer with no label-scoped cache on the cell manager. On a cell cluster this caused controller-runtime to sync and hold every ConfigMap (and, via lazy cache reads, every Secret) in the cluster, pushing the cell compute-manager well past its 128Mi limit → OOMKilled / CrashLoopBackOff. Fix: change the For type to WorkloadDeployment. WorkloadDeployments are already cached on the cell by the sibling WorkloadDeploymentReconciler, so this adds no new cluster-wide informer. A WD delete event still enqueues the object's namespace and fires Reconcile, which is the deletion trigger the GC needs. All companion reads (ConfigMap/Secret List) inside Reconcile go through cl.GetAPIReader() (uncached). A one-shot List via the API reader does not establish a persistent informer, so it does not re-introduce the OOM. WD liveness checks use the cached client because WDs are already in the cell cache. Reconcile now sweeps the entire WD namespace (listing companions by label via the uncached reader) rather than reconciling a single named object, and returns ctrl.Result{} (event-driven only, no per-WD requeue). The periodic backstop coverage gap is closed by a companionGCBackstop Runnable (implements mcmanager.Runnable = manager.Runnable + multicluster.Aware). On Engage it records each cell cluster; on each ticker interval it lists ALL companions cluster-wide via the uncached APIReader, collects distinct namespaces, and sends namespace-keyed GenericEvents into a buffered channel wired via WatchesRawSource(source.TypedChannel). This covers namespaces whose last WD was deleted before the controller started, which For(WD) would never enqueue. The steady-state load is bounded to one cross-namespace List per interval regardless of WD history, and no persistent CM/Secret informer is ever created. The federator's companionsStillPresent check (workloaddeployment_federator.go) reads from FederationClient (the Karmada hub client) and is unaffected by this change. Co-Authored-By: Claude Opus 4.8 (1M context) (cherry picked from commit 9a064cc194b228e8ae9a2702a19f48be9d098d53) --- .../controller/companion_gc_controller.go | 366 +++++++++++++----- .../companion_gc_controller_test.go | 218 ++++++++++- 2 files changed, 477 insertions(+), 107 deletions(-) diff --git a/internal/controller/companion_gc_controller.go b/internal/controller/companion_gc_controller.go index 3b8df64d..28405e3e 100644 --- a/internal/controller/companion_gc_controller.go +++ b/internal/controller/companion_gc_controller.go @@ -6,17 +6,21 @@ import ( "context" "encoding/json" "fmt" + "sync" + "time" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/workqueue" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/controller-runtime/pkg/source" mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" @@ -26,16 +30,45 @@ import ( computev1alpha "go.datum.net/compute/api/v1alpha" ) +// companionGCPeriodicInterval is the frequency at which the backstop sweeper +// scans for companions in namespaces that may have no live WorkloadDeployments. +// In steady state orphans are covered immediately by WD-delete events; the +// periodic sweep is a backstop for namespaces whose last WD was deleted before +// this controller started (or between restarts). +const companionGCPeriodicInterval = 5 * time.Minute + +// companionGCChannelBuffer is the number of events the backstop ticker channel +// can hold without blocking. Sized to accommodate one sweep's worth of +// namespace events across a typical cell (expected O(10s) of namespaces). +const companionGCChannelBuffer = 256 + // CompanionGCReconciler runs on each cell cluster and deletes orphaned companion // ConfigMaps and Secrets whose WorkloadDeployment referrers are gone from that // cell. It is the authoritative GC path that does not depend on Karmada's // ResourceBinding cascade completing correctly. // -// The reconciler is triggered by WorkloadDeployment events. On each trigger it -// lists all companions in the WD's namespace (by the referenced-data=true -// label), parses the referenced-by annotation on each companion, and deletes -// any companion for which every listed WD name is absent from the local cell -// namespace. +// The reconciler is triggered by WorkloadDeployment events (including deletion). +// On each trigger it lists all companions in the WD's namespace (by the +// referenced-data=true label), parses the referenced-by annotation on each +// companion, and deletes any companion for which every listed WD name is absent +// from the local cell namespace. +// +// # No cluster-wide ConfigMap/Secret informer +// +// The For type is WorkloadDeployment, not ConfigMap. WorkloadDeployments are +// already cached on the cell by the sibling WorkloadDeploymentReconciler, so +// this adds NO new cluster-wide informer. All companion reads (ConfigMap/Secret +// List) go through the UNCACHED APIReader. A one-shot List via APIReader does +// NOT establish a persistent informer, so it does not cause the OOM that +// For(ConfigMap) with an unscoped cache would. +// +// # Periodic backstop for WD-less namespaces +// +// A WD delete event covers the common case. Namespaces whose LAST WD was +// deleted before the controller started (or during a restart window) would +// never get a reconcile event. The companionGCBackstop Runnable fires +// namespace-keyed Reconcile requests on a ticker, independently of any WD +// object presence, to cover that gap. // // Per-cell multi-referrer safety: the referenced-by annotation is written by // the hub-side ReferencedDataController using PROJECT-plane namespace keys @@ -51,58 +84,83 @@ type CompanionGCReconciler struct { mgr mcmanager.Manager } -// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;delete -// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;delete +// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;delete +// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch -// Reconcile is invoked for each companion ConfigMap or Secret that carries the -// referenced-data=true label. It deletes the companion when every WD listed in -// its referenced-by annotation is absent from the same namespace on this cell. +// Reconcile is invoked for each WorkloadDeployment event (including deletion) +// and by the periodic backstop for namespaces with no live WDs. +// +// It sweeps req.Namespace: any companion (ConfigMap or Secret carrying the +// referenced-data=true label) whose every listed WD referrer is absent from the +// local cell is deleted. req.Name is unused — the sweep covers all companions +// in the namespace regardless of which WD or backstop event triggered it. +// +// All companion reads use the uncached APIReader so no persistent CM/Secret +// informer is ever established. WD liveness checks use the cached client because +// WorkloadDeployments are already in the cell manager's cache. func (r *CompanionGCReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx).WithValues( - "namespace", req.Namespace, - "name", req.Name, - ) + logger := log.FromContext(ctx).WithValues("namespace", req.Namespace) cl, err := r.mgr.GetCluster(ctx, req.ClusterName) if err != nil { return ctrl.Result{}, err } ctx = mccontext.WithCluster(ctx, req.ClusterName) + + // Uncached reader for all ConfigMap/Secret reads — prevents establishing a + // cluster-wide informer that would exhaust 128Mi on a cell cluster. + apiReader := cl.GetAPIReader() + // Cached client for WD liveness checks — WDs are already in the cell cache. cellClient := cl.GetClient() - return ctrl.Result{}, r.reconcileCompanion(ctx, cellClient, req.NamespacedName, logger) + if err := r.sweepNamespace(ctx, apiReader, cellClient, req.Namespace, logger); err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil } -// reconcileCompanion checks one ConfigMap-or-Secret companion and deletes it -// when safe. The function tries ConfigMap first; if not found it tries Secret. -func (r *CompanionGCReconciler) reconcileCompanion( +// sweepNamespace lists all companion ConfigMaps and Secrets in the given +// namespace (via the uncached apiReader) and deletes any whose every WD +// referrer is absent from the cell. +// +// Writes (Delete) always use cellClient because deletion does not require +// the object to be in the cache. +func (r *CompanionGCReconciler) sweepNamespace( ctx context.Context, + apiReader client.Reader, cellClient client.Client, - key types.NamespacedName, + namespace string, logger interface{ Info(string, ...any) }, ) error { - // Try ConfigMap. - var cm corev1.ConfigMap - err := cellClient.Get(ctx, key, &cm) - if err == nil { - return r.maybeDeleteConfigMap(ctx, cellClient, &cm, logger) + companionLabel := client.MatchingLabels{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, } - if !apierrors.IsNotFound(err) { - return fmt.Errorf("get ConfigMap %s: %w", key, err) + nsSelector := client.InNamespace(namespace) + + // List companion ConfigMaps via the uncached reader. + var cms corev1.ConfigMapList + if err := apiReader.List(ctx, &cms, nsSelector, companionLabel); err != nil { + return fmt.Errorf("list companion ConfigMaps in %s: %w", namespace, err) + } + for i := range cms.Items { + if err := r.maybeDeleteConfigMap(ctx, cellClient, &cms.Items[i], logger); err != nil { + return err + } } - // Not a ConfigMap — try Secret. - var secret corev1.Secret - err = cellClient.Get(ctx, key, &secret) - if err == nil { - return r.maybeDeleteSecret(ctx, cellClient, &secret, logger) + // List companion Secrets via the uncached reader. + var secrets corev1.SecretList + if err := apiReader.List(ctx, &secrets, nsSelector, companionLabel); err != nil { + return fmt.Errorf("list companion Secrets in %s: %w", namespace, err) } - if !apierrors.IsNotFound(err) { - return fmt.Errorf("get Secret %s: %w", key, err) + for i := range secrets.Items { + if err := r.maybeDeleteSecret(ctx, cellClient, &secrets.Items[i], logger); err != nil { + return err + } } - // Object already gone — nothing to do. return nil } @@ -155,7 +213,7 @@ func (r *CompanionGCReconciler) maybeDeleteSecret( // isCompanion reports whether the object carries the referenced-data=true label. // The label is the authoritative signal that the companion was created by the // referenced-data controller. Objects without it are not touched by GC. -func isCompanion(obj metav1.Object) bool { +func isCompanion(obj interface{ GetLabels() map[string]string }) bool { labels := obj.GetLabels() return labels[computev1alpha.ReferencedDataLabel] == computev1alpha.ReferencedDataLabelValue } @@ -172,6 +230,10 @@ func isCompanion(obj metav1.Object) bool { // semantics: a WD that runs on a different cell is never present locally, so it // contributes nothing to liveness on this cell. // +// WD reads use the CACHED cellClient because WorkloadDeployments are already +// held in the cell manager's cache (the sibling WorkloadDeploymentReconciler +// watches them). This is safe and cheap. +// // A companion is considered still needed if any referrer is present in any // state (including terminating) to avoid premature deletion during the WD // teardown window. @@ -257,72 +319,196 @@ func decodeCompanionRefCount(annotations map[string]string) ([]string, error) { return keys, nil } -// SetupWithManager registers the CompanionGCReconciler with the multicluster -// manager. It should only be called when cell controllers are enabled -// (--enable-cell-controllers). The reconciler watches ConfigMaps (its primary -// For type) and uses WorkloadDeployment events to enqueue companion objects in -// the same namespace. Secret companions are discovered at reconcile time via -// Get, not through a separate For/Watches registration, because both kinds map -// to the same reconcile loop. -func (r *CompanionGCReconciler) SetupWithManager(mgr mcmanager.Manager) error { - r.mgr = mgr +// ─── Periodic backstop ──────────────────────────────────────────────────────── - return mcbuilder.ControllerManagedBy(mgr). - For(&corev1.ConfigMap{}, mcbuilder.WithEngageWithLocalCluster(false)). - // When a WorkloadDeployment changes (including deletion), re-enqueue all - // companion ConfigMaps and Secrets in the same namespace so the GC - // reconciler can decide whether they are still referenced. - Watches(&computev1alpha.WorkloadDeployment{}, - func(_ multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []mcreconcile.Request { - return enqueueCompanionsForNamespace(ctx, cl.GetClient(), obj.GetNamespace()) - }) - }, - mcbuilder.WithEngageWithLocalCluster(false), - ). - Named("companion-gc"). - Complete(r) +// companionGCBackstop is an mcmanager.Runnable that fires periodic full-cluster +// companion sweeps, independent of any WorkloadDeployment event. It closes the +// coverage gap where a namespace's LAST WD was deleted before this controller +// started: the per-WD For() watch would never enqueue that namespace, so without +// a backstop those orphans would persist until the next controller restart. +// +// On each tick the backstop lists ALL companions (by label, ALL namespaces) via +// the uncached APIReader on each engaged cell cluster, collects the distinct +// namespaces, and sends one namespace-keyed event per namespace into ch. The +// controller's WatchesRawSource picks those events up and routes them to +// Reconcile via backstopEventHandler. +// +// A one-shot APIReader.List does NOT establish a persistent informer, so this +// never causes the OOM the original For(ConfigMap) design had. +type companionGCBackstop struct { + ch chan event.GenericEvent + + mu sync.Mutex + clusters map[multicluster.ClusterName]cluster.Cluster } -// enqueueCompanionsForNamespace returns mcreconcile.Requests for every companion -// ConfigMap and Secret in the given namespace. It is called from the WD watch -// handler so the GC reconciler is triggered whenever a WD changes in a namespace -// that may contain companions. -func enqueueCompanionsForNamespace(ctx context.Context, cellClient client.Client, namespace string) []mcreconcile.Request { +// Engage is called by the mcmanager coordinator when a cell cluster becomes +// active. We record the cluster so the ticker goroutine can include it in +// each sweep. +func (b *companionGCBackstop) Engage(_ context.Context, name multicluster.ClusterName, cl cluster.Cluster) error { + b.mu.Lock() + defer b.mu.Unlock() + b.clusters[name] = cl + return nil +} + +// Start runs the periodic ticker loop. It blocks until ctx is cancelled. +func (b *companionGCBackstop) Start(ctx context.Context) error { + ticker := time.NewTicker(companionGCPeriodicInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + b.sweep(ctx) + } + } +} + +// sweep enumerates all engaged cell clusters and, for each, lists companion +// ConfigMaps and Secrets via the uncached APIReader across all namespaces. For +// each distinct namespace found it sends a GenericEvent into ch so the GC +// controller will sweep that namespace. +// +// Each event carries a *corev1.Namespace whose ObjectMeta.Namespace is the +// target sweep namespace and ObjectMeta.Name encodes the ClusterName. The +// backstopEventHandler lifts these into an mcreconcile.Request. +// +// Errors are logged and skipped: a failed sweep on one cluster does not block +// others, and the next tick will retry. +func (b *companionGCBackstop) sweep(ctx context.Context) { + logger := log.FromContext(ctx).WithValues("component", "companion-gc-backstop") + + b.mu.Lock() + clusters := make(map[multicluster.ClusterName]cluster.Cluster, len(b.clusters)) + for k, v := range b.clusters { + clusters[k] = v + } + b.mu.Unlock() + companionLabel := client.MatchingLabels{ computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, } - nsSelector := client.InNamespace(namespace) - var reqs []mcreconcile.Request + for clusterName, cl := range clusters { + apiReader := cl.GetAPIReader() + namespaces := make(map[string]struct{}) - var cms corev1.ConfigMapList - if err := cellClient.List(ctx, &cms, nsSelector, companionLabel); err == nil { - for i := range cms.Items { - reqs = append(reqs, mcreconcile.Request{ - Request: reconcile.Request{ - NamespacedName: types.NamespacedName{ - Namespace: cms.Items[i].Namespace, - Name: cms.Items[i].Name, - }, - }, - }) + var cms corev1.ConfigMapList + if err := apiReader.List(ctx, &cms, companionLabel); err != nil { + logger.Error(err, "backstop: list companion ConfigMaps", "cluster", clusterName) + } else { + for i := range cms.Items { + namespaces[cms.Items[i].Namespace] = struct{}{} + } } - } - var secrets corev1.SecretList - if err := cellClient.List(ctx, &secrets, nsSelector, companionLabel); err == nil { - for i := range secrets.Items { - reqs = append(reqs, mcreconcile.Request{ - Request: reconcile.Request{ - NamespacedName: types.NamespacedName{ - Namespace: secrets.Items[i].Namespace, - Name: secrets.Items[i].Name, - }, + var secrets corev1.SecretList + if err := apiReader.List(ctx, &secrets, companionLabel); err != nil { + logger.Error(err, "backstop: list companion Secrets", "cluster", clusterName) + } else { + for i := range secrets.Items { + namespaces[secrets.Items[i].Namespace] = struct{}{} + } + } + + // Emit one GenericEvent per distinct namespace. The carrier object is a + // *corev1.Namespace with Namespace= and Name=. + // backstopEventHandler lifts these fields into an mcreconcile.Request. + for ns := range namespaces { + obj := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + // Name carries the ClusterName so backstopEventHandler can + // stamp it onto the mcreconcile.Request. + Name: string(clusterName), + // Namespace carries the target cell namespace to sweep. + Namespace: ns, }, - }) + } + select { + case b.ch <- event.GenericEvent{Object: obj}: + case <-ctx.Done(): + return + default: + // Channel full — drop this namespace; the next tick will cover it. + logger.V(1).Info("backstop: channel full, dropping namespace sweep event", + "cluster", clusterName, "namespace", ns) + } } } +} + +// backstopEventHandler maps a backstop GenericEvent (carrier: *corev1.Namespace +// with Namespace=target sweep namespace, Name=ClusterName) into an +// mcreconcile.Request. It implements handler.TypedEventHandler[client.Object, mcreconcile.Request]. +type backstopEventHandler struct{} + +var _ handler.TypedEventHandler[client.Object, mcreconcile.Request] = backstopEventHandler{} + +func (backstopEventHandler) Create(_ context.Context, _ event.TypedCreateEvent[client.Object], _ workqueue.TypedRateLimitingInterface[mcreconcile.Request]) { +} +func (backstopEventHandler) Update(_ context.Context, _ event.TypedUpdateEvent[client.Object], _ workqueue.TypedRateLimitingInterface[mcreconcile.Request]) { +} +func (backstopEventHandler) Delete(_ context.Context, _ event.TypedDeleteEvent[client.Object], _ workqueue.TypedRateLimitingInterface[mcreconcile.Request]) { +} +func (backstopEventHandler) Generic(_ context.Context, ev event.TypedGenericEvent[client.Object], q workqueue.TypedRateLimitingInterface[mcreconcile.Request]) { + obj := ev.Object + q.Add(mcreconcile.Request{ + ClusterName: multicluster.ClusterName(obj.GetName()), + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{ + Namespace: obj.GetNamespace(), + // Name is intentionally empty — Reconcile uses only req.Namespace. + }, + }, + }) +} + +// SetupWithManager registers the CompanionGCReconciler with the multicluster +// manager. It should only be called when cell controllers are enabled +// (--enable-cell-controllers). +// +// The For type is WorkloadDeployment (NOT ConfigMap or Secret). WorkloadDeployments +// are already cached on the cell by the sibling WorkloadDeploymentReconciler, so +// this registration adds NO new cluster-wide informer. A WD delete event enqueues +// the WD's namespace/name and fires Reconcile — that is the deletion trigger. +// +// A companionGCBackstop Runnable is registered on the manager and wired via +// WatchesRawSource. On each tick it lists companions across all namespaces via +// the uncached APIReader and enqueues namespace-keyed reconcile requests. This +// covers namespaces whose last WD was deleted before the controller started. +// +// All companion reads inside Reconcile and the backstop go through the UNCACHED +// APIReader so that no persistent ConfigMap/Secret informer is ever established. +func (r *CompanionGCReconciler) SetupWithManager(mgr mcmanager.Manager) error { + r.mgr = mgr + + // backstopCh carries namespace-keyed GenericEvents from companionGCBackstop + // to the controller's WatchesRawSource. Typed as event.GenericEvent (alias for + // TypedGenericEvent[client.Object]) so source.TypedChannel can consume it. + backstopCh := make(chan event.GenericEvent, companionGCChannelBuffer) + + backstop := &companionGCBackstop{ + ch: backstopCh, + clusters: make(map[multicluster.ClusterName]cluster.Cluster), + } + if err := mgr.Add(backstop); err != nil { + return fmt.Errorf("companion-gc: add backstop runnable: %w", err) + } - return reqs + return mcbuilder.ControllerManagedBy(mgr). + // Drive GC off WorkloadDeployment events. WDs are already in the cell cache; + // this adds no new cluster-wide informer. A WD deletion still triggers + // Reconcile so we sweep the namespace immediately after the WD disappears. + For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). + // Backstop: periodic namespace-keyed events from companionGCBackstop. + // TypedChannel[client.Object, mcreconcile.Request] produces a + // TypedSource[mcreconcile.Request] as required by WatchesRawSource. + WatchesRawSource(source.TypedChannel( + backstopCh, + backstopEventHandler{}, + )). + Named("companion-gc"). + Complete(r) } diff --git a/internal/controller/companion_gc_controller_test.go b/internal/controller/companion_gc_controller_test.go index cfc4d5b6..ad6b14d5 100644 --- a/internal/controller/companion_gc_controller_test.go +++ b/internal/controller/companion_gc_controller_test.go @@ -17,6 +17,8 @@ import ( "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/event" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" @@ -150,14 +152,17 @@ func newGCReconciler(cellClient client.Client) *CompanionGCReconciler { return &CompanionGCReconciler{mgr: mgr} } -// reconcileGC runs one GC reconcile for the named object in gcTestNamespace. -func reconcileGC(t *testing.T, r *CompanionGCReconciler, name string) (ctrl.Result, error) { +// reconcileGCForWD runs one GC reconcile triggered by a WorkloadDeployment +// event in gcTestNamespace. The WD object may or may not exist — a deleted WD +// still fires the namespace sweep. The request name is gcTestWD1Name; only +// req.Namespace matters for the sweep logic. +func reconcileGCForWD(t *testing.T, r *CompanionGCReconciler) (ctrl.Result, error) { t.Helper() ctx := mccontext.WithCluster(context.Background(), multicluster.ClusterName(gcTestCluster)) return r.Reconcile(ctx, mcreconcile.Request{ ClusterName: multicluster.ClusterName(gcTestCluster), Request: ctrl.Request{ - NamespacedName: types.NamespacedName{Namespace: gcTestNamespace, Name: name}, + NamespacedName: types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestWD1Name}, }, }) } @@ -339,6 +344,41 @@ func TestHasLiveReferrer(t *testing.T) { // ─── CompanionGCReconciler.Reconcile tests ──────────────────────────────────── +// TestCompanionGC_NamespaceSweep_WDEventTriggersGC verifies the core entry +// path: a WorkloadDeployment event (here the WD is absent, simulating a +// deletion) causes Reconcile to sweep the WD's namespace and delete any +// orphaned companions it finds there. +// +// This also documents that the controller is driven by For(WorkloadDeployment), +// NOT For(ConfigMap). The test exercises the full Reconcile path that results +// from a WD delete event, confirming no CM/Secret informer is needed. +func TestCompanionGC_NamespaceSweep_WDEventTriggersGC(t *testing.T) { + t.Parallel() + + // Two companions in the namespace, both with no live referrer. + cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) + sec := companionSecret(gcTestSecName, gcTestNamespace, []string{prodRefKey(gcTestWD1Name)}) + // No WD in the cell — the WD was deleted, triggering this reconcile. + cellClient := newCellFakeClient(cm, sec) + r := newGCReconciler(cellClient) + + // The reconcile request names the WD that was deleted (namespace sweep uses + // req.Namespace, not req.Name, so the WD need not exist). + result, err := reconcileGCForWD(t, r) + require.NoError(t, err) + // Event-driven reconcile returns empty result (no requeue). + assert.Equal(t, ctrl.Result{}, result) + + // Both companions must be gone. + var gotCM corev1.ConfigMap + err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &gotCM) + assert.True(t, apierrors.IsNotFound(err), "companion ConfigMap should be deleted when all referrers are absent") + + var gotSec corev1.Secret + err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestSecName}, &gotSec) + assert.True(t, apierrors.IsNotFound(err), "companion Secret should be deleted when all referrers are absent") +} + // TestCompanionGC_ConfigMap_DeletedWhenAllReferrersAbsent verifies that a // companion ConfigMap is deleted when the WD listed in its referenced-by // annotation (using production "projectNS/name" format) is absent from the cell. @@ -350,7 +390,7 @@ func TestCompanionGC_ConfigMap_DeletedWhenAllReferrersAbsent(t *testing.T) { cellClient := newCellFakeClient(cm) r := newGCReconciler(cellClient) - result, err := reconcileGC(t, r, gcTestCMName) + result, err := reconcileGCForWD(t, r) require.NoError(t, err) assert.Equal(t, ctrl.Result{}, result) @@ -368,7 +408,7 @@ func TestCompanionGC_Secret_DeletedWhenAllReferrersAbsent(t *testing.T) { cellClient := newCellFakeClient(secret) r := newGCReconciler(cellClient) - result, err := reconcileGC(t, r, gcTestSecName) + result, err := reconcileGCForWD(t, r) require.NoError(t, err) assert.Equal(t, ctrl.Result{}, result) @@ -396,7 +436,7 @@ func TestCompanionGC_ProjectNamespaceKey_PreservesCompanionWithLiveLocalWD(t *te cellClient := newCellFakeClient(cm, wd) r := newGCReconciler(cellClient) - _, err := reconcileGC(t, r, gcTestCMName) + _, err := reconcileGCForWD(t, r) require.NoError(t, err) // Companion must NOT be deleted — the WD exists on this cell. @@ -416,7 +456,7 @@ func TestCompanionGC_PreservedWhenLiveReferrerExists(t *testing.T) { cellClient := newCellFakeClient(cm, wd) r := newGCReconciler(cellClient) - _, err := reconcileGC(t, r, gcTestCMName) + _, err := reconcileGCForWD(t, r) require.NoError(t, err) var got corev1.ConfigMap @@ -436,7 +476,7 @@ func TestCompanionGC_PreservedWhenTerminatingReferrerExists(t *testing.T) { cellClient := newCellFakeClient(cm, wd) r := newGCReconciler(cellClient) - _, err := reconcileGC(t, r, gcTestCMName) + _, err := reconcileGCForWD(t, r) require.NoError(t, err) var got corev1.ConfigMap @@ -462,7 +502,8 @@ func TestCompanionGC_MultiReferrer_OneDifferentCell_OtherLiveLocal(t *testing.T) cellClient := newCellFakeClient(cm, wd2) r := newGCReconciler(cellClient) - _, err := reconcileGC(t, r, gcTestCMName) + // WD1 deletion event triggers the sweep; WD2 is still alive locally. + _, err := reconcileGCForWD(t, r) require.NoError(t, err) // Companion must NOT have been deleted because WD2 is still alive on this cell. @@ -485,7 +526,7 @@ func TestCompanionGC_MultiReferrer_AllAbsent_Deleted(t *testing.T) { cellClient := newCellFakeClient(cm) r := newGCReconciler(cellClient) - _, err := reconcileGC(t, r, gcTestCMName) + _, err := reconcileGCForWD(t, r) require.NoError(t, err) var got corev1.ConfigMap @@ -502,7 +543,7 @@ func TestCompanionGC_EmptyRefCount_Deleted(t *testing.T) { cellClient := newCellFakeClient(cm) r := newGCReconciler(cellClient) - _, err := reconcileGC(t, r, gcTestCMName) + _, err := reconcileGCForWD(t, r) require.NoError(t, err) var got corev1.ConfigMap @@ -515,11 +556,14 @@ func TestCompanionGC_EmptyRefCount_Deleted(t *testing.T) { func TestCompanionGC_NonCompanionNotTouched(t *testing.T) { t.Parallel() + // A plain ConfigMap without the companion label is never returned by the + // label-scoped List and is therefore never passed to maybeDeleteConfigMap. + // We verify the object survives the sweep. cm := plainCM(gcTestCMName, gcTestNamespace) cellClient := newCellFakeClient(cm) r := newGCReconciler(cellClient) - _, err := reconcileGC(t, r, gcTestCMName) + _, err := reconcileGCForWD(t, r) require.NoError(t, err) var got corev1.ConfigMap @@ -528,16 +572,156 @@ func TestCompanionGC_NonCompanionNotTouched(t *testing.T) { "ConfigMap without referenced-data label must not be deleted") } -// TestCompanionGC_ObjectAlreadyGone verifies that reconciling a missing object -// is a no-op (no error). -func TestCompanionGC_ObjectAlreadyGone(t *testing.T) { +// TestCompanionGC_EmptyNamespace_NoOp verifies that reconciling a namespace that +// contains no companions is a no-op with no error (covers the case where a WD +// event fires for a namespace that never had companions). +func TestCompanionGC_EmptyNamespace_NoOp(t *testing.T) { t.Parallel() - cellClient := newCellFakeClient() // nothing pre-loaded + // Nothing pre-loaded — no companions, no WDs. + cellClient := newCellFakeClient() r := newGCReconciler(cellClient) - _, err := reconcileGC(t, r, gcTestCMName) + result, err := reconcileGCForWD(t, r) require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// ─── Periodic backstop tests ────────────────────────────────────────────────── + +// TestCompanionGCBackstop_SweepDeletesOrphanInWDlessNamespace is the key +// regression test for the coverage gap identified in review. It proves that a +// companion orphaned in a namespace that has NO WorkloadDeployment object +// present (i.e. the last WD was deleted before the controller started, so the +// For(WD) watch would never enqueue that namespace) is collected by the periodic +// backstop sweep. +// +// The test exercises companionGCBackstop.sweep() directly (not via a hand-fed +// Reconcile request) to confirm the sweep generates the namespace-keyed event +// AND that the resulting Reconcile call cleans up the orphan. +func TestCompanionGCBackstop_SweepDeletesOrphanInWDlessNamespace(t *testing.T) { + t.Parallel() + + // Orphaned companion in a namespace with NO WDs present — the last WD was + // deleted before this controller started. + cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) // WD name in annotation but WD gone + cellClient := newCellFakeClient(cm) // no WD objects + + cl := newFakeCluster(cellClient) + mgr := newFakeMCManager(gcTestCluster, cl) + reconciler := &CompanionGCReconciler{mgr: mgr} + + // Wire the backstop with a buffered channel. We give it a generous buffer + // so the non-blocking send in sweep() succeeds. + backstopCh := make(chan event.GenericEvent, companionGCChannelBuffer) + backstop := &companionGCBackstop{ + ch: backstopCh, + clusters: map[multicluster.ClusterName]cluster.Cluster{ + multicluster.ClusterName(gcTestCluster): cl, + }, + } + + // Fire one sweep tick synchronously (no ticker needed in tests). + backstop.sweep(context.Background()) + + // The sweep must have emitted at least one event for gcTestNamespace. + require.NotEmpty(t, backstopCh, "backstop must emit a namespace-sweep event for a namespace containing companions") + + // Drain the channel and invoke Reconcile for each namespace-keyed event, + // exactly as the WatchesRawSource → backstopEventHandler → workqueue path + // would do in production. + sweptNamespaces := map[string]struct{}{} + ctx := mccontext.WithCluster(context.Background(), multicluster.ClusterName(gcTestCluster)) + for len(backstopCh) > 0 { + ev := <-backstopCh + // backstopEventHandler maps: obj.Name → ClusterName, obj.Namespace → ns. + clusterName := multicluster.ClusterName(ev.Object.GetName()) + ns := ev.Object.GetNamespace() + sweptNamespaces[ns] = struct{}{} + + _, err := reconciler.Reconcile(ctx, mcreconcile.Request{ + ClusterName: clusterName, + Request: ctrl.Request{NamespacedName: types.NamespacedName{Namespace: ns}}, + }) + require.NoError(t, err) + } + + // The backstop must have swept gcTestNamespace. + assert.Contains(t, sweptNamespaces, gcTestNamespace, + "backstop sweep must enqueue the namespace containing the orphaned companion") + + // The companion must be gone — collected by the Reconcile call above, not by + // a hand-fed request. This proves the backstop closes the WD-less namespace gap. + var got corev1.ConfigMap + err := cellClient.Get(context.Background(), + types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got) + assert.True(t, apierrors.IsNotFound(err), + "companion orphaned in WD-less namespace must be collected by the backstop sweep") +} + +// TestCompanionGCBackstop_SweepSkipsNamespaceWithLiveWD verifies that the +// backstop sweep does not delete a companion when a live WD still exists in the +// same namespace — the per-cell multi-referrer safety rule applies equally to +// backstop-triggered sweeps. +func TestCompanionGCBackstop_SweepSkipsNamespaceWithLiveWD(t *testing.T) { + t.Parallel() + + cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) + wd := cellWD(gcTestWD1Name, gcTestNamespace) // live WD still present + cellClient := newCellFakeClient(cm, wd) + + cl := newFakeCluster(cellClient) + mgr := newFakeMCManager(gcTestCluster, cl) + reconciler := &CompanionGCReconciler{mgr: mgr} + + backstopCh := make(chan event.GenericEvent, companionGCChannelBuffer) + backstop := &companionGCBackstop{ + ch: backstopCh, + clusters: map[multicluster.ClusterName]cluster.Cluster{ + multicluster.ClusterName(gcTestCluster): cl, + }, + } + + backstop.sweep(context.Background()) + + ctx := mccontext.WithCluster(context.Background(), multicluster.ClusterName(gcTestCluster)) + for len(backstopCh) > 0 { + ev := <-backstopCh + clusterName := multicluster.ClusterName(ev.Object.GetName()) + ns := ev.Object.GetNamespace() + _, err := reconciler.Reconcile(ctx, mcreconcile.Request{ + ClusterName: clusterName, + Request: ctrl.Request{NamespacedName: types.NamespacedName{Namespace: ns}}, + }) + require.NoError(t, err) + } + + // Companion must NOT be deleted — the WD is still alive. + var got corev1.ConfigMap + require.NoError(t, cellClient.Get(context.Background(), + types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got), + "backstop must preserve companion when a live WD still references it") +} + +// TestCompanionGCBackstop_NoEventWhenNoCopanions verifies that the backstop +// emits no events when there are no companion objects in the cluster at all +// (no spurious requeues for empty clusters). +func TestCompanionGCBackstop_NoEventWhenNoCopanions(t *testing.T) { + t.Parallel() + + cellClient := newCellFakeClient() // nothing + cl := newFakeCluster(cellClient) + + backstopCh := make(chan event.GenericEvent, companionGCChannelBuffer) + backstop := &companionGCBackstop{ + ch: backstopCh, + clusters: map[multicluster.ClusterName]cluster.Cluster{ + multicluster.ClusterName(gcTestCluster): cl, + }, + } + + backstop.sweep(context.Background()) + assert.Empty(t, backstopCh, "backstop must emit no events when no companions exist") } // ─── Federator ordering guard tests ─────────────────────────────────────────── From b787a3613456e9e8d85a8fe337116d249a73cacd Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 1 Jun 2026 20:22:48 -0500 Subject: [PATCH 3/8] feat: hub-side referenced-data companion cleanup (Components 3+4+5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the cell-side CompanionGCReconciler — deleting a Karmada-owned cell copy directly causes a permanent delete/recreate thrash because the Work object immediately re-applies the manifest. Hub-side cleanup is the only correct layer. Component 5: Delete companion_gc_controller.go and its test, and remove the --enable-cell-controllers registration block from cmd/main.go. Component 3: After deleting a hub companion (ref-count reaches zero), the downstreamCompanionWriter now also deletes the companion's Karmada ResourceBinding via hubClient. RB name follows the Karmada binding-controller convention: "{companionName}-{configmap|secret}". Deleting the RB cascades: binding-controller removes the Work, execution-controller removes the cell copy permanently. IgnoreNotFound tolerates Karmada beating us to it. localCompanionWriter (single-cluster dev mode) is a no-op. Component 4: New OrphanRBReconciler runs on the Karmada hub federation manager (alongside InstanceProjector). It watches ResourceBindings and deletes any that are orphaned companion RBs — name ends with "-configmap" or "-secret" AND propagationpolicy.karmada.io/name starts with "city-" AND the hub companion no longer exists. WD RBs (suffix "-workloaddeployment") and non-city-PP RBs are never touched. A periodic sweep Runnable fires every 5 minutes to catch RBs orphaned before the controller started, which will automatically clean the existing stranded lab RBs on first deployment. RBAC: add delete to work.karmada.io/resourcebindings in downstream-rbac (hub ClusterRole). karmada-io/api work/v1alpha2 types added to global scheme so the federation client can serialize ResourceBinding objects. Tests: unit tests for Component 3 (RB teardown fires on ConfigMap/Secret companion deletion, tolerates NotFound, no-op on localCompanionWriter) and Component 4 (orphan detection, skip live/terminating companions, tight scope guards against WD RBs and non-city-PP RBs, name pattern parsing). Co-Authored-By: Claude Sonnet 4.6 (cherry picked from commit ac25bf7c75cbe5dc18400cbe8f91ad57f862cc57) --- cmd/main.go | 18 +- config/base/downstream-rbac/rbac.yaml | 2 +- .../controller/companion_gc_controller.go | 514 --------- .../companion_gc_controller_test.go | 998 ------------------ internal/controller/orphan_rb_controller.go | 274 +++++ .../controller/orphan_rb_controller_test.go | 242 +++++ .../controller/referenceddata_controller.go | 73 +- .../referenceddata_controller_test.go | 312 +++++- internal/controller/testing_helpers_test.go | 10 +- 9 files changed, 917 insertions(+), 1526 deletions(-) delete mode 100644 internal/controller/companion_gc_controller.go delete mode 100644 internal/controller/companion_gc_controller_test.go create mode 100644 internal/controller/orphan_rb_controller.go create mode 100644 internal/controller/orphan_rb_controller_test.go diff --git a/cmd/main.go b/cmd/main.go index b2373b80..e11aa6b8 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -34,6 +34,7 @@ import ( karmadaclusterv1alpha1 "github.com/karmada-io/api/cluster/v1alpha1" karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/config" "go.datum.net/compute/internal/controller" @@ -79,6 +80,7 @@ func init() { utilruntime.Must(quotav1alpha1.AddToScheme(scheme)) utilruntime.Must(karmadapolicyv1alpha1.Install(scheme)) utilruntime.Must(karmadaclusterv1alpha1.Install(scheme)) + utilruntime.Must(karmadaworkv1alpha2.Install(scheme)) // +kubebuilder:scaffold:scheme } @@ -303,13 +305,6 @@ func main() { } } - if enableCellControllers { - if err = (&controller.CompanionGCReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "CompanionGC") - os.Exit(1) - } - } - // The fail-loud guard above ensures federationRestConfig is non-nil when // management controllers are enabled; the nil check here is defensive. if enableManagementControllers && federationRestConfig != nil { @@ -521,5 +516,14 @@ func setupManagementControllers(mgr mcmanager.Manager, federationClient client.C return nil, fmt.Errorf("InstanceProjector: %w", err) } + // OrphanRBReconciler sweeps Karmada ResourceBindings whose hub companion is + // gone, ensuring Works and cell copies are cleaned up even when Karmada's + // event-driven cascade misses the companion-deletion event (e.g. PP deleted + // before binding-controller reconcile completed). Runs on the hub federation + // manager alongside InstanceProjector. + if err = controller.SetupOrphanRBWithManager(federationMgr, federationClient); err != nil { + return nil, fmt.Errorf("OrphanRBReconciler: %w", err) + } + return []manager.Runnable{federationMgr}, nil } diff --git a/config/base/downstream-rbac/rbac.yaml b/config/base/downstream-rbac/rbac.yaml index f2a94a37..b4d0344d 100644 --- a/config/base/downstream-rbac/rbac.yaml +++ b/config/base/downstream-rbac/rbac.yaml @@ -24,7 +24,7 @@ rules: verbs: ["get", "list", "watch"] - apiGroups: ["work.karmada.io"] resources: ["resourcebindings", "clusterresourcebindings"] - verbs: ["get", "list", "watch"] + verbs: ["get", "list", "watch", "delete"] - apiGroups: ["config.karmada.io"] resources: ["resourceinterpreterwebhookconfigurations", "resourceinterpretercustomizations"] verbs: ["get", "list", "watch"] diff --git a/internal/controller/companion_gc_controller.go b/internal/controller/companion_gc_controller.go deleted file mode 100644 index 28405e3e..00000000 --- a/internal/controller/companion_gc_controller.go +++ /dev/null @@ -1,514 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package controller - -import ( - "context" - "encoding/json" - "fmt" - "sync" - "time" - - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/util/workqueue" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/cluster" - "sigs.k8s.io/controller-runtime/pkg/event" - "sigs.k8s.io/controller-runtime/pkg/handler" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/source" - mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" - mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" - mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" - "sigs.k8s.io/multicluster-runtime/pkg/multicluster" - mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" - - computev1alpha "go.datum.net/compute/api/v1alpha" -) - -// companionGCPeriodicInterval is the frequency at which the backstop sweeper -// scans for companions in namespaces that may have no live WorkloadDeployments. -// In steady state orphans are covered immediately by WD-delete events; the -// periodic sweep is a backstop for namespaces whose last WD was deleted before -// this controller started (or between restarts). -const companionGCPeriodicInterval = 5 * time.Minute - -// companionGCChannelBuffer is the number of events the backstop ticker channel -// can hold without blocking. Sized to accommodate one sweep's worth of -// namespace events across a typical cell (expected O(10s) of namespaces). -const companionGCChannelBuffer = 256 - -// CompanionGCReconciler runs on each cell cluster and deletes orphaned companion -// ConfigMaps and Secrets whose WorkloadDeployment referrers are gone from that -// cell. It is the authoritative GC path that does not depend on Karmada's -// ResourceBinding cascade completing correctly. -// -// The reconciler is triggered by WorkloadDeployment events (including deletion). -// On each trigger it lists all companions in the WD's namespace (by the -// referenced-data=true label), parses the referenced-by annotation on each -// companion, and deletes any companion for which every listed WD name is absent -// from the local cell namespace. -// -// # No cluster-wide ConfigMap/Secret informer -// -// The For type is WorkloadDeployment, not ConfigMap. WorkloadDeployments are -// already cached on the cell by the sibling WorkloadDeploymentReconciler, so -// this adds NO new cluster-wide informer. All companion reads (ConfigMap/Secret -// List) go through the UNCACHED APIReader. A one-shot List via APIReader does -// NOT establish a persistent informer, so it does not cause the OOM that -// For(ConfigMap) with an unscoped cache would. -// -// # Periodic backstop for WD-less namespaces -// -// A WD delete event covers the common case. Namespaces whose LAST WD was -// deleted before the controller started (or during a restart window) would -// never get a reconcile event. The companionGCBackstop Runnable fires -// namespace-keyed Reconcile requests on a ticker, independently of any WD -// object presence, to cover that gap. -// -// Per-cell multi-referrer safety: the referenced-by annotation is written by -// the hub-side ReferencedDataController using PROJECT-plane namespace keys -// (e.g. "default/mount-pristine-default-dfw"). On the cell the WD lives in -// ns-{project-uid}, not "default". hasLiveReferrer therefore looks up WDs by -// name only, in the companion's own namespace. This means: -// -// - A WD on a different cell is never present locally → counted absent -// → companion on that cell is correctly deleted when its own local WD is -// also gone. -// - A WD on this cell is present → companion preserved → correct. -type CompanionGCReconciler struct { - mgr mcmanager.Manager -} - -// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;delete -// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;delete -// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch - -// Reconcile is invoked for each WorkloadDeployment event (including deletion) -// and by the periodic backstop for namespaces with no live WDs. -// -// It sweeps req.Namespace: any companion (ConfigMap or Secret carrying the -// referenced-data=true label) whose every listed WD referrer is absent from the -// local cell is deleted. req.Name is unused — the sweep covers all companions -// in the namespace regardless of which WD or backstop event triggered it. -// -// All companion reads use the uncached APIReader so no persistent CM/Secret -// informer is ever established. WD liveness checks use the cached client because -// WorkloadDeployments are already in the cell manager's cache. -func (r *CompanionGCReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx).WithValues("namespace", req.Namespace) - - cl, err := r.mgr.GetCluster(ctx, req.ClusterName) - if err != nil { - return ctrl.Result{}, err - } - ctx = mccontext.WithCluster(ctx, req.ClusterName) - - // Uncached reader for all ConfigMap/Secret reads — prevents establishing a - // cluster-wide informer that would exhaust 128Mi on a cell cluster. - apiReader := cl.GetAPIReader() - // Cached client for WD liveness checks — WDs are already in the cell cache. - cellClient := cl.GetClient() - - if err := r.sweepNamespace(ctx, apiReader, cellClient, req.Namespace, logger); err != nil { - return ctrl.Result{}, err - } - - return ctrl.Result{}, nil -} - -// sweepNamespace lists all companion ConfigMaps and Secrets in the given -// namespace (via the uncached apiReader) and deletes any whose every WD -// referrer is absent from the cell. -// -// Writes (Delete) always use cellClient because deletion does not require -// the object to be in the cache. -func (r *CompanionGCReconciler) sweepNamespace( - ctx context.Context, - apiReader client.Reader, - cellClient client.Client, - namespace string, - logger interface{ Info(string, ...any) }, -) error { - companionLabel := client.MatchingLabels{ - computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, - } - nsSelector := client.InNamespace(namespace) - - // List companion ConfigMaps via the uncached reader. - var cms corev1.ConfigMapList - if err := apiReader.List(ctx, &cms, nsSelector, companionLabel); err != nil { - return fmt.Errorf("list companion ConfigMaps in %s: %w", namespace, err) - } - for i := range cms.Items { - if err := r.maybeDeleteConfigMap(ctx, cellClient, &cms.Items[i], logger); err != nil { - return err - } - } - - // List companion Secrets via the uncached reader. - var secrets corev1.SecretList - if err := apiReader.List(ctx, &secrets, nsSelector, companionLabel); err != nil { - return fmt.Errorf("list companion Secrets in %s: %w", namespace, err) - } - for i := range secrets.Items { - if err := r.maybeDeleteSecret(ctx, cellClient, &secrets.Items[i], logger); err != nil { - return err - } - } - - return nil -} - -func (r *CompanionGCReconciler) maybeDeleteConfigMap( - ctx context.Context, - cellClient client.Client, - cm *corev1.ConfigMap, - logger interface{ Info(string, ...any) }, -) error { - if !isCompanion(cm) { - return nil - } - alive, err := r.hasLiveReferrer(ctx, cellClient, cm.Namespace, cm.Annotations) - if err != nil { - return err - } - if alive { - return nil - } - logger.Info("deleting orphaned companion ConfigMap", "name", cm.Name, "namespace", cm.Namespace) - if err := cellClient.Delete(ctx, cm); client.IgnoreNotFound(err) != nil { - return fmt.Errorf("delete companion ConfigMap %s/%s: %w", cm.Namespace, cm.Name, err) - } - return nil -} - -func (r *CompanionGCReconciler) maybeDeleteSecret( - ctx context.Context, - cellClient client.Client, - secret *corev1.Secret, - logger interface{ Info(string, ...any) }, -) error { - if !isCompanion(secret) { - return nil - } - alive, err := r.hasLiveReferrer(ctx, cellClient, secret.Namespace, secret.Annotations) - if err != nil { - return err - } - if alive { - return nil - } - logger.Info("deleting orphaned companion Secret", "name", secret.Name, "namespace", secret.Namespace) - if err := cellClient.Delete(ctx, secret); client.IgnoreNotFound(err) != nil { - return fmt.Errorf("delete companion Secret %s/%s: %w", secret.Namespace, secret.Name, err) - } - return nil -} - -// isCompanion reports whether the object carries the referenced-data=true label. -// The label is the authoritative signal that the companion was created by the -// referenced-data controller. Objects without it are not touched by GC. -func isCompanion(obj interface{ GetLabels() map[string]string }) bool { - labels := obj.GetLabels() - return labels[computev1alpha.ReferencedDataLabel] == computev1alpha.ReferencedDataLabelValue -} - -// hasLiveReferrer returns true when at least one WD listed in the referenced-by -// annotation still exists in the companion's namespace on this cell. -// -// The annotation is written by the hub-side ReferencedDataController as -// "projectNamespace/wdName" (e.g. "default/mount-pristine-default-dfw"). On -// the cell the WD lives in ns-{project-uid}, not in the project namespace. To -// find it we look up by NAME ONLY in the companion's own namespace — the cell -// WD name is always equal to the project WD name (set by -// upsertDownstreamDeployment). This also gives us the correct per-cell -// semantics: a WD that runs on a different cell is never present locally, so it -// contributes nothing to liveness on this cell. -// -// WD reads use the CACHED cellClient because WorkloadDeployments are already -// held in the cell manager's cache (the sibling WorkloadDeploymentReconciler -// watches them). This is safe and cheap. -// -// A companion is considered still needed if any referrer is present in any -// state (including terminating) to avoid premature deletion during the WD -// teardown window. -// -// Returns (false, nil) when the annotation is absent or empty. -// Returns (true, nil) when at least one referrer is found. -// Returns (true, err) when the annotation is corrupt or an API call fails so -// that the caller does NOT delete the companion on transient faults. -func (r *CompanionGCReconciler) hasLiveReferrer( - ctx context.Context, - cellClient client.Client, - companionNamespace string, - annotations map[string]string, -) (bool, error) { - wdKeys, err := decodeCompanionRefCount(annotations) - if err != nil { - // Corrupt annotation: treat as "has live referrer" to avoid accidental GC. - return true, err - } - if len(wdKeys) == 0 { - return false, nil - } - - for _, key := range wdKeys { - wdName := wdNameFromKey(key) - if wdName == "" { - // Malformed key: conservatively assume the referrer is alive. - return true, nil - } - - // Look up by name in the companion's namespace. The annotation carries the - // PROJECT namespace as the key prefix, but the cell WD lives in - // ns-{project-uid} — the same namespace the companion is in. - var wd computev1alpha.WorkloadDeployment - err := cellClient.Get(ctx, types.NamespacedName{Namespace: companionNamespace, Name: wdName}, &wd) - if err == nil { - // Referrer exists (any state — including terminating). Companion stays. - return true, nil - } - if !apierrors.IsNotFound(err) { - return true, fmt.Errorf("get WorkloadDeployment %s/%s: %w", companionNamespace, wdName, err) - } - // Not found — this referrer is gone; continue checking the rest. - } - - // Every listed referrer is absent from this cell. - return false, nil -} - -// wdNameFromKey extracts the WD name from a "namespace/name" annotation key. -// The referenced-by annotation always uses "projectNamespace/wdName" format; -// we want only the name portion, which is the part after the last slash. -// Returns "" for an empty key (caller should treat as malformed). -func wdNameFromKey(key string) string { - if key == "" { - return "" - } - for i := len(key) - 1; i >= 0; i-- { - if key[i] == '/' { - return key[i+1:] - } - } - // No slash — the whole string is the name. - return key -} - -// decodeCompanionRefCount parses the companionRefCountAnnotation value into a -// slice of WD keys. It is a cell-local copy of the hub-side decodeRefCount so -// the GC reconciler does not share internal state with the -// ReferencedDataController. The annotation format is identical: a JSON array. -// -// Returns (nil, nil) when the annotation is absent or empty. -// Returns an error when the annotation value is present but cannot be parsed. -func decodeCompanionRefCount(annotations map[string]string) ([]string, error) { - raw, ok := annotations[companionRefCountAnnotation] - if !ok || raw == "" { - return nil, nil - } - var keys []string - if err := json.Unmarshal([]byte(raw), &keys); err != nil { - return nil, fmt.Errorf("companion-gc: corrupt ref-count annotation %q: %w", raw, err) - } - return keys, nil -} - -// ─── Periodic backstop ──────────────────────────────────────────────────────── - -// companionGCBackstop is an mcmanager.Runnable that fires periodic full-cluster -// companion sweeps, independent of any WorkloadDeployment event. It closes the -// coverage gap where a namespace's LAST WD was deleted before this controller -// started: the per-WD For() watch would never enqueue that namespace, so without -// a backstop those orphans would persist until the next controller restart. -// -// On each tick the backstop lists ALL companions (by label, ALL namespaces) via -// the uncached APIReader on each engaged cell cluster, collects the distinct -// namespaces, and sends one namespace-keyed event per namespace into ch. The -// controller's WatchesRawSource picks those events up and routes them to -// Reconcile via backstopEventHandler. -// -// A one-shot APIReader.List does NOT establish a persistent informer, so this -// never causes the OOM the original For(ConfigMap) design had. -type companionGCBackstop struct { - ch chan event.GenericEvent - - mu sync.Mutex - clusters map[multicluster.ClusterName]cluster.Cluster -} - -// Engage is called by the mcmanager coordinator when a cell cluster becomes -// active. We record the cluster so the ticker goroutine can include it in -// each sweep. -func (b *companionGCBackstop) Engage(_ context.Context, name multicluster.ClusterName, cl cluster.Cluster) error { - b.mu.Lock() - defer b.mu.Unlock() - b.clusters[name] = cl - return nil -} - -// Start runs the periodic ticker loop. It blocks until ctx is cancelled. -func (b *companionGCBackstop) Start(ctx context.Context) error { - ticker := time.NewTicker(companionGCPeriodicInterval) - defer ticker.Stop() - for { - select { - case <-ctx.Done(): - return nil - case <-ticker.C: - b.sweep(ctx) - } - } -} - -// sweep enumerates all engaged cell clusters and, for each, lists companion -// ConfigMaps and Secrets via the uncached APIReader across all namespaces. For -// each distinct namespace found it sends a GenericEvent into ch so the GC -// controller will sweep that namespace. -// -// Each event carries a *corev1.Namespace whose ObjectMeta.Namespace is the -// target sweep namespace and ObjectMeta.Name encodes the ClusterName. The -// backstopEventHandler lifts these into an mcreconcile.Request. -// -// Errors are logged and skipped: a failed sweep on one cluster does not block -// others, and the next tick will retry. -func (b *companionGCBackstop) sweep(ctx context.Context) { - logger := log.FromContext(ctx).WithValues("component", "companion-gc-backstop") - - b.mu.Lock() - clusters := make(map[multicluster.ClusterName]cluster.Cluster, len(b.clusters)) - for k, v := range b.clusters { - clusters[k] = v - } - b.mu.Unlock() - - companionLabel := client.MatchingLabels{ - computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, - } - - for clusterName, cl := range clusters { - apiReader := cl.GetAPIReader() - namespaces := make(map[string]struct{}) - - var cms corev1.ConfigMapList - if err := apiReader.List(ctx, &cms, companionLabel); err != nil { - logger.Error(err, "backstop: list companion ConfigMaps", "cluster", clusterName) - } else { - for i := range cms.Items { - namespaces[cms.Items[i].Namespace] = struct{}{} - } - } - - var secrets corev1.SecretList - if err := apiReader.List(ctx, &secrets, companionLabel); err != nil { - logger.Error(err, "backstop: list companion Secrets", "cluster", clusterName) - } else { - for i := range secrets.Items { - namespaces[secrets.Items[i].Namespace] = struct{}{} - } - } - - // Emit one GenericEvent per distinct namespace. The carrier object is a - // *corev1.Namespace with Namespace= and Name=. - // backstopEventHandler lifts these fields into an mcreconcile.Request. - for ns := range namespaces { - obj := &corev1.Namespace{ - ObjectMeta: metav1.ObjectMeta{ - // Name carries the ClusterName so backstopEventHandler can - // stamp it onto the mcreconcile.Request. - Name: string(clusterName), - // Namespace carries the target cell namespace to sweep. - Namespace: ns, - }, - } - select { - case b.ch <- event.GenericEvent{Object: obj}: - case <-ctx.Done(): - return - default: - // Channel full — drop this namespace; the next tick will cover it. - logger.V(1).Info("backstop: channel full, dropping namespace sweep event", - "cluster", clusterName, "namespace", ns) - } - } - } -} - -// backstopEventHandler maps a backstop GenericEvent (carrier: *corev1.Namespace -// with Namespace=target sweep namespace, Name=ClusterName) into an -// mcreconcile.Request. It implements handler.TypedEventHandler[client.Object, mcreconcile.Request]. -type backstopEventHandler struct{} - -var _ handler.TypedEventHandler[client.Object, mcreconcile.Request] = backstopEventHandler{} - -func (backstopEventHandler) Create(_ context.Context, _ event.TypedCreateEvent[client.Object], _ workqueue.TypedRateLimitingInterface[mcreconcile.Request]) { -} -func (backstopEventHandler) Update(_ context.Context, _ event.TypedUpdateEvent[client.Object], _ workqueue.TypedRateLimitingInterface[mcreconcile.Request]) { -} -func (backstopEventHandler) Delete(_ context.Context, _ event.TypedDeleteEvent[client.Object], _ workqueue.TypedRateLimitingInterface[mcreconcile.Request]) { -} -func (backstopEventHandler) Generic(_ context.Context, ev event.TypedGenericEvent[client.Object], q workqueue.TypedRateLimitingInterface[mcreconcile.Request]) { - obj := ev.Object - q.Add(mcreconcile.Request{ - ClusterName: multicluster.ClusterName(obj.GetName()), - Request: ctrl.Request{ - NamespacedName: types.NamespacedName{ - Namespace: obj.GetNamespace(), - // Name is intentionally empty — Reconcile uses only req.Namespace. - }, - }, - }) -} - -// SetupWithManager registers the CompanionGCReconciler with the multicluster -// manager. It should only be called when cell controllers are enabled -// (--enable-cell-controllers). -// -// The For type is WorkloadDeployment (NOT ConfigMap or Secret). WorkloadDeployments -// are already cached on the cell by the sibling WorkloadDeploymentReconciler, so -// this registration adds NO new cluster-wide informer. A WD delete event enqueues -// the WD's namespace/name and fires Reconcile — that is the deletion trigger. -// -// A companionGCBackstop Runnable is registered on the manager and wired via -// WatchesRawSource. On each tick it lists companions across all namespaces via -// the uncached APIReader and enqueues namespace-keyed reconcile requests. This -// covers namespaces whose last WD was deleted before the controller started. -// -// All companion reads inside Reconcile and the backstop go through the UNCACHED -// APIReader so that no persistent ConfigMap/Secret informer is ever established. -func (r *CompanionGCReconciler) SetupWithManager(mgr mcmanager.Manager) error { - r.mgr = mgr - - // backstopCh carries namespace-keyed GenericEvents from companionGCBackstop - // to the controller's WatchesRawSource. Typed as event.GenericEvent (alias for - // TypedGenericEvent[client.Object]) so source.TypedChannel can consume it. - backstopCh := make(chan event.GenericEvent, companionGCChannelBuffer) - - backstop := &companionGCBackstop{ - ch: backstopCh, - clusters: make(map[multicluster.ClusterName]cluster.Cluster), - } - if err := mgr.Add(backstop); err != nil { - return fmt.Errorf("companion-gc: add backstop runnable: %w", err) - } - - return mcbuilder.ControllerManagedBy(mgr). - // Drive GC off WorkloadDeployment events. WDs are already in the cell cache; - // this adds no new cluster-wide informer. A WD deletion still triggers - // Reconcile so we sweep the namespace immediately after the WD disappears. - For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - // Backstop: periodic namespace-keyed events from companionGCBackstop. - // TypedChannel[client.Object, mcreconcile.Request] produces a - // TypedSource[mcreconcile.Request] as required by WatchesRawSource. - WatchesRawSource(source.TypedChannel( - backstopCh, - backstopEventHandler{}, - )). - Named("companion-gc"). - Complete(r) -} diff --git a/internal/controller/companion_gc_controller_test.go b/internal/controller/companion_gc_controller_test.go deleted file mode 100644 index ad6b14d5..00000000 --- a/internal/controller/companion_gc_controller_test.go +++ /dev/null @@ -1,998 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package controller - -import ( - "context" - "encoding/json" - "testing" - "time" - - karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/cluster" - "sigs.k8s.io/controller-runtime/pkg/event" - mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" - "sigs.k8s.io/multicluster-runtime/pkg/multicluster" - mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" - - computev1alpha "go.datum.net/compute/api/v1alpha" -) - -// ─── Test constants ──────────────────────────────────────────────────────────── - -const ( - gcTestCluster = "cell-cluster" - - // gcTestNamespace is the cell-side namespace (ns-{project-uid}) where - // companion objects and the cell WorkloadDeployment reside. - gcTestNamespace = "ns-aabbccdd-0000-1111-2222-333344445555" - - // gcTestProjectNamespace is the project-plane namespace that the - // ReferencedDataController uses when writing referenced-by annotation keys. - // In production keys are written as "projectNamespace/wdName", e.g. - // "default/mount-pristine-default-dfw". - gcTestProjectNamespace = "default" - - gcTestCMName = "cm-pristine" - gcTestSecName = "secret-pristine" - gcTestWD1Name = "mount-pristine-default-dfw" - gcTestWD2Name = "mount-pristine-default-lax" - - // gcTestDefaultWD1Key is a sample production-format referenced-by annotation - // key used in decode and name-extraction tests ("projectNamespace/wdName"). - gcTestDefaultWD1Key = "default/wd-1" -) - -// ─── Helpers ────────────────────────────────────────────────────────────────── - -// newCellFakeClient returns a fake client with the project scheme (corev1 + compute). -func newCellFakeClient(objs ...client.Object) client.Client { - return newProjectFakeClient(objs...) -} - -// companionCM builds a companion ConfigMap named gcTestCMName in gcTestNamespace -// with the referenced-data=true label and the given WD keys in the -// referenced-by annotation. -func companionCM(wdKeys []string) *corev1.ConfigMap { - return &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: gcTestCMName, - Namespace: gcTestNamespace, - Labels: map[string]string{ - computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, - }, - Annotations: map[string]string{ - companionRefCountAnnotation: mustEncodeRefCount(wdKeys), - }, - }, - } -} - -// companionSecret builds a companion Secret with the referenced-data=true label -// and the given WD keys in the referenced-by annotation. -func companionSecret(name, namespace string, wdKeys []string) *corev1.Secret { - return &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - Labels: map[string]string{ - computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, - }, - Annotations: map[string]string{ - companionRefCountAnnotation: mustEncodeRefCount(wdKeys), - }, - }, - } -} - -// plainCM builds a ConfigMap WITHOUT the referenced-data label. -func plainCM(name, namespace string) *corev1.ConfigMap { - return &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, - } -} - -// cellWD builds a WorkloadDeployment in the cell namespace (ns-{uid}). -func cellWD(name, namespace string) *computev1alpha.WorkloadDeployment { - return &computev1alpha.WorkloadDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - }, - Spec: computev1alpha.WorkloadDeploymentSpec{ - CityCode: wbTestCityCode, - PlacementName: testDefaultPlacement, - WorkloadRef: computev1alpha.WorkloadReference{Name: "workload"}, - ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, - }, - } -} - -// cellWDTerminating builds a WorkloadDeployment with a non-zero DeletionTimestamp. -func cellWDTerminating(name, namespace string) *computev1alpha.WorkloadDeployment { - wd := cellWD(name, namespace) - ts := metav1.NewTime(time.Now().Add(-5 * time.Second)) - wd.DeletionTimestamp = &ts - wd.Finalizers = []string{"test-finalizer"} // required by fake client for terminating objects - return wd -} - -// mustEncodeRefCount serialises a slice of WD keys as a JSON array. -func mustEncodeRefCount(keys []string) string { - if len(keys) == 0 { - return "[]" - } - b, err := json.Marshal(keys) - if err != nil { - panic("mustEncodeRefCount: " + err.Error()) - } - return string(b) -} - -// prodRefKey returns the production-format referenced-by annotation key for a WD. -// In production the ReferencedDataController writes "projectNamespace/wdName" -// (e.g. "default/mount-pristine-default-dfw"), NOT the cell namespace. -func prodRefKey(wdName string) string { - return gcTestProjectNamespace + "/" + wdName -} - -// newGCReconciler builds a CompanionGCReconciler wired to a fakeMCManager -// backed by the given cell client. -func newGCReconciler(cellClient client.Client) *CompanionGCReconciler { - cl := newFakeCluster(cellClient) - mgr := newFakeMCManager(gcTestCluster, cl) - return &CompanionGCReconciler{mgr: mgr} -} - -// reconcileGCForWD runs one GC reconcile triggered by a WorkloadDeployment -// event in gcTestNamespace. The WD object may or may not exist — a deleted WD -// still fires the namespace sweep. The request name is gcTestWD1Name; only -// req.Namespace matters for the sweep logic. -func reconcileGCForWD(t *testing.T, r *CompanionGCReconciler) (ctrl.Result, error) { - t.Helper() - ctx := mccontext.WithCluster(context.Background(), multicluster.ClusterName(gcTestCluster)) - return r.Reconcile(ctx, mcreconcile.Request{ - ClusterName: multicluster.ClusterName(gcTestCluster), - Request: ctrl.Request{ - NamespacedName: types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestWD1Name}, - }, - }) -} - -// ─── decodeCompanionRefCount unit tests ─────────────────────────────────────── - -func TestDecodeCompanionRefCount(t *testing.T) { - t.Parallel() - - tests := []struct { - name string - annotations map[string]string - wantKeys []string - wantErr bool - }{ - { - name: "absent annotation returns nil", - annotations: nil, - wantKeys: nil, - }, - { - name: "empty annotation returns nil", - annotations: map[string]string{companionRefCountAnnotation: ""}, - wantKeys: nil, - }, - { - name: "single key", - annotations: map[string]string{companionRefCountAnnotation: `["` + gcTestDefaultWD1Key + `"]`}, - wantKeys: []string{gcTestDefaultWD1Key}, - }, - { - name: "multiple keys", - annotations: map[string]string{companionRefCountAnnotation: `["` + gcTestDefaultWD1Key + `","default/wd-2"]`}, - wantKeys: []string{gcTestDefaultWD1Key, "default/wd-2"}, - }, - { - name: "corrupt annotation returns error", - annotations: map[string]string{companionRefCountAnnotation: `not-json`}, - wantErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - got, err := decodeCompanionRefCount(tt.annotations) - if tt.wantErr { - require.Error(t, err) - return - } - require.NoError(t, err) - assert.Equal(t, tt.wantKeys, got) - }) - } -} - -// ─── wdNameFromKey unit tests ───────────────────────────────────────────────── - -func TestWdNameFromKey(t *testing.T) { - t.Parallel() - - tests := []struct { - input string - want string - }{ - {gcTestDefaultWD1Key, "wd-1"}, - {"ns/sub/name", "name"}, // last slash wins - {"wd-only", "wd-only"}, - {"", ""}, - } - - for _, tt := range tests { - t.Run(tt.input, func(t *testing.T) { - t.Parallel() - got := wdNameFromKey(tt.input) - assert.Equal(t, tt.want, got) - }) - } -} - -// ─── hasLiveReferrer unit tests ─────────────────────────────────────────────── - -func TestHasLiveReferrer(t *testing.T) { - t.Parallel() - - // Production-format key: project namespace / WD name. - // The WD object lives in gcTestNamespace on the cell. - wdKey := prodRefKey(gcTestWD1Name) - - tests := []struct { - name string - annotations map[string]string - cellObjs []client.Object - wantAlive bool - wantErr bool - }{ - { - name: "no annotation → no referrers → safe to delete", - annotations: nil, - wantAlive: false, - }, - { - name: "empty ref-count → safe to delete", - annotations: map[string]string{companionRefCountAnnotation: "[]"}, - wantAlive: false, - }, - { - // BLOCKER 1 regression: key uses project namespace ("default/…") but - // the WD lives in the cell namespace (gcTestNamespace). hasLiveReferrer - // must look up by name only in the companion's namespace, not by the - // project namespace encoded in the key. - name: "project-namespace key — WD in cell ns — companion preserved", - annotations: map[string]string{companionRefCountAnnotation: mustEncodeRefCount([]string{wdKey})}, - cellObjs: []client.Object{cellWD(gcTestWD1Name, gcTestNamespace)}, - wantAlive: true, - }, - { - name: "referrer is terminating → still counts as present → keep", - annotations: map[string]string{companionRefCountAnnotation: mustEncodeRefCount([]string{wdKey})}, - cellObjs: []client.Object{cellWDTerminating(gcTestWD1Name, gcTestNamespace)}, - wantAlive: true, - }, - { - name: "referrer absent → delete", - annotations: map[string]string{companionRefCountAnnotation: mustEncodeRefCount([]string{wdKey})}, - cellObjs: nil, // WD not on this cell - wantAlive: false, - }, - { - // Multi-referrer with production-format keys: WD1 is on another cell - // (not present locally), WD2 IS on this cell → companion kept. - name: "multi-referrer: one absent (other cell), one present local → keep", - annotations: map[string]string{ - companionRefCountAnnotation: mustEncodeRefCount([]string{ - prodRefKey(gcTestWD1Name), - prodRefKey(gcTestWD2Name), - }), - }, - // WD2 is on this cell; WD1 is on another cell and not visible here. - cellObjs: []client.Object{cellWD(gcTestWD2Name, gcTestNamespace)}, - wantAlive: true, - }, - { - name: "multi-referrer: both absent → delete", - annotations: map[string]string{ - companionRefCountAnnotation: mustEncodeRefCount([]string{ - prodRefKey(gcTestWD1Name), - prodRefKey(gcTestWD2Name), - }), - }, - cellObjs: nil, - wantAlive: false, - }, - { - name: "corrupt annotation → error returned", - annotations: map[string]string{companionRefCountAnnotation: "corrupt"}, - wantErr: true, - wantAlive: true, // conservative: treat as alive on error - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - cellClient := newCellFakeClient(tt.cellObjs...) - r := &CompanionGCReconciler{} - - alive, err := r.hasLiveReferrer(context.Background(), cellClient, gcTestNamespace, tt.annotations) - - if tt.wantErr { - require.Error(t, err) - } else { - require.NoError(t, err) - } - assert.Equal(t, tt.wantAlive, alive) - }) - } -} - -// ─── CompanionGCReconciler.Reconcile tests ──────────────────────────────────── - -// TestCompanionGC_NamespaceSweep_WDEventTriggersGC verifies the core entry -// path: a WorkloadDeployment event (here the WD is absent, simulating a -// deletion) causes Reconcile to sweep the WD's namespace and delete any -// orphaned companions it finds there. -// -// This also documents that the controller is driven by For(WorkloadDeployment), -// NOT For(ConfigMap). The test exercises the full Reconcile path that results -// from a WD delete event, confirming no CM/Secret informer is needed. -func TestCompanionGC_NamespaceSweep_WDEventTriggersGC(t *testing.T) { - t.Parallel() - - // Two companions in the namespace, both with no live referrer. - cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) - sec := companionSecret(gcTestSecName, gcTestNamespace, []string{prodRefKey(gcTestWD1Name)}) - // No WD in the cell — the WD was deleted, triggering this reconcile. - cellClient := newCellFakeClient(cm, sec) - r := newGCReconciler(cellClient) - - // The reconcile request names the WD that was deleted (namespace sweep uses - // req.Namespace, not req.Name, so the WD need not exist). - result, err := reconcileGCForWD(t, r) - require.NoError(t, err) - // Event-driven reconcile returns empty result (no requeue). - assert.Equal(t, ctrl.Result{}, result) - - // Both companions must be gone. - var gotCM corev1.ConfigMap - err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &gotCM) - assert.True(t, apierrors.IsNotFound(err), "companion ConfigMap should be deleted when all referrers are absent") - - var gotSec corev1.Secret - err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestSecName}, &gotSec) - assert.True(t, apierrors.IsNotFound(err), "companion Secret should be deleted when all referrers are absent") -} - -// TestCompanionGC_ConfigMap_DeletedWhenAllReferrersAbsent verifies that a -// companion ConfigMap is deleted when the WD listed in its referenced-by -// annotation (using production "projectNS/name" format) is absent from the cell. -func TestCompanionGC_ConfigMap_DeletedWhenAllReferrersAbsent(t *testing.T) { - t.Parallel() - - // Production-format key: project namespace, but no WD of that name on this cell. - cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) - cellClient := newCellFakeClient(cm) - r := newGCReconciler(cellClient) - - result, err := reconcileGCForWD(t, r) - require.NoError(t, err) - assert.Equal(t, ctrl.Result{}, result) - - var got corev1.ConfigMap - err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got) - assert.True(t, apierrors.IsNotFound(err), "companion ConfigMap should be deleted when all referrers are absent") -} - -// TestCompanionGC_Secret_DeletedWhenAllReferrersAbsent verifies that a companion -// Secret is deleted when the WD listed in its annotation does not exist on the cell. -func TestCompanionGC_Secret_DeletedWhenAllReferrersAbsent(t *testing.T) { - t.Parallel() - - secret := companionSecret(gcTestSecName, gcTestNamespace, []string{prodRefKey(gcTestWD1Name)}) - cellClient := newCellFakeClient(secret) - r := newGCReconciler(cellClient) - - result, err := reconcileGCForWD(t, r) - require.NoError(t, err) - assert.Equal(t, ctrl.Result{}, result) - - var got corev1.Secret - err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestSecName}, &got) - assert.True(t, apierrors.IsNotFound(err), "companion Secret should be deleted when all referrers are absent") -} - -// TestCompanionGC_ProjectNamespaceKey_PreservesCompanionWithLiveLocalWD is the -// Blocker 1 regression test. The referenced-by annotation key is written by the -// hub ReferencedDataController as "projectNS/wdName" (e.g. -// "default/mount-pristine-default-dfw"). The cell WD lives in ns-{uid}, NOT in -// "default". The GC reconciler must find the WD by name in the companion's own -// namespace — using the project namespace from the key would always NotFound and -// incorrectly delete a companion that is actively in use. -func TestCompanionGC_ProjectNamespaceKey_PreservesCompanionWithLiveLocalWD(t *testing.T) { - t.Parallel() - - // Annotation uses project namespace ("default"), exactly as production writes it. - wdKey := prodRefKey(gcTestWD1Name) // "default/mount-pristine-default-dfw" - cm := companionCM([]string{wdKey}) - - // The WD lives in the CELL namespace (ns-{uid}), not "default". - wd := cellWD(gcTestWD1Name, gcTestNamespace) - cellClient := newCellFakeClient(cm, wd) - r := newGCReconciler(cellClient) - - _, err := reconcileGCForWD(t, r) - require.NoError(t, err) - - // Companion must NOT be deleted — the WD exists on this cell. - var got corev1.ConfigMap - require.NoError(t, cellClient.Get(context.Background(), - types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got), - "companion must be preserved when WD with matching name exists in cell namespace, even though annotation key uses project namespace") -} - -// TestCompanionGC_PreservedWhenLiveReferrerExists verifies that a companion is -// NOT deleted when a live (non-terminating) WD is present on the cell. -func TestCompanionGC_PreservedWhenLiveReferrerExists(t *testing.T) { - t.Parallel() - - cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) - wd := cellWD(gcTestWD1Name, gcTestNamespace) - cellClient := newCellFakeClient(cm, wd) - r := newGCReconciler(cellClient) - - _, err := reconcileGCForWD(t, r) - require.NoError(t, err) - - var got corev1.ConfigMap - require.NoError(t, cellClient.Get(context.Background(), - types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got)) -} - -// TestCompanionGC_PreservedWhenTerminatingReferrerExists verifies that a companion -// is NOT deleted when the only referrer on this cell is terminating. Terminating -// WDs still count as present — we are conservative to avoid premature deletion -// during the WD teardown window. -func TestCompanionGC_PreservedWhenTerminatingReferrerExists(t *testing.T) { - t.Parallel() - - cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) - wd := cellWDTerminating(gcTestWD1Name, gcTestNamespace) - cellClient := newCellFakeClient(cm, wd) - r := newGCReconciler(cellClient) - - _, err := reconcileGCForWD(t, r) - require.NoError(t, err) - - var got corev1.ConfigMap - require.NoError(t, cellClient.Get(context.Background(), - types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got)) -} - -// TestCompanionGC_MultiReferrer_OneDifferentCell_OtherLiveLocal verifies the -// core per-cell multi-referrer safety property. The annotation (written by the -// hub) lists both WDs with project-namespace keys. WD1 is on another cell (not -// present locally). WD2 IS on this cell. Companion must be preserved. -func TestCompanionGC_MultiReferrer_OneDifferentCell_OtherLiveLocal(t *testing.T) { - t.Parallel() - - // Both keys use production format (project namespace prefix). - // WD1 is NOT present on this cell (simulating it running on another cell). - // WD2 IS present on this cell. - cm := companionCM([]string{ - prodRefKey(gcTestWD1Name), - prodRefKey(gcTestWD2Name), - }) - wd2 := cellWD(gcTestWD2Name, gcTestNamespace) - cellClient := newCellFakeClient(cm, wd2) - r := newGCReconciler(cellClient) - - // WD1 deletion event triggers the sweep; WD2 is still alive locally. - _, err := reconcileGCForWD(t, r) - require.NoError(t, err) - - // Companion must NOT have been deleted because WD2 is still alive on this cell. - var got corev1.ConfigMap - require.NoError(t, cellClient.Get(context.Background(), - types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got), - "companion must survive when at least one local referrer is present") -} - -// TestCompanionGC_MultiReferrer_AllAbsent_Deleted verifies that when BOTH WD -// names from the annotation are absent from this cell the companion is deleted. -func TestCompanionGC_MultiReferrer_AllAbsent_Deleted(t *testing.T) { - t.Parallel() - - cm := companionCM([]string{ - prodRefKey(gcTestWD1Name), - prodRefKey(gcTestWD2Name), - }) - // Neither WD is on this cell. - cellClient := newCellFakeClient(cm) - r := newGCReconciler(cellClient) - - _, err := reconcileGCForWD(t, r) - require.NoError(t, err) - - var got corev1.ConfigMap - err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got) - assert.True(t, apierrors.IsNotFound(err), "companion must be deleted when all referrers are absent from this cell") -} - -// TestCompanionGC_EmptyRefCount_Deleted verifies that a companion with an empty -// (or missing) ref-count annotation is deleted — zero referrers means safe to remove. -func TestCompanionGC_EmptyRefCount_Deleted(t *testing.T) { - t.Parallel() - - cm := companionCM(nil) // empty ref-count - cellClient := newCellFakeClient(cm) - r := newGCReconciler(cellClient) - - _, err := reconcileGCForWD(t, r) - require.NoError(t, err) - - var got corev1.ConfigMap - err = cellClient.Get(context.Background(), types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got) - assert.True(t, apierrors.IsNotFound(err), "companion with no referrers should be deleted") -} - -// TestCompanionGC_NonCompanionNotTouched verifies that a ConfigMap without the -// referenced-data=true label is not deleted by the GC reconciler. -func TestCompanionGC_NonCompanionNotTouched(t *testing.T) { - t.Parallel() - - // A plain ConfigMap without the companion label is never returned by the - // label-scoped List and is therefore never passed to maybeDeleteConfigMap. - // We verify the object survives the sweep. - cm := plainCM(gcTestCMName, gcTestNamespace) - cellClient := newCellFakeClient(cm) - r := newGCReconciler(cellClient) - - _, err := reconcileGCForWD(t, r) - require.NoError(t, err) - - var got corev1.ConfigMap - require.NoError(t, cellClient.Get(context.Background(), - types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got), - "ConfigMap without referenced-data label must not be deleted") -} - -// TestCompanionGC_EmptyNamespace_NoOp verifies that reconciling a namespace that -// contains no companions is a no-op with no error (covers the case where a WD -// event fires for a namespace that never had companions). -func TestCompanionGC_EmptyNamespace_NoOp(t *testing.T) { - t.Parallel() - - // Nothing pre-loaded — no companions, no WDs. - cellClient := newCellFakeClient() - r := newGCReconciler(cellClient) - - result, err := reconcileGCForWD(t, r) - require.NoError(t, err) - assert.Equal(t, ctrl.Result{}, result) -} - -// ─── Periodic backstop tests ────────────────────────────────────────────────── - -// TestCompanionGCBackstop_SweepDeletesOrphanInWDlessNamespace is the key -// regression test for the coverage gap identified in review. It proves that a -// companion orphaned in a namespace that has NO WorkloadDeployment object -// present (i.e. the last WD was deleted before the controller started, so the -// For(WD) watch would never enqueue that namespace) is collected by the periodic -// backstop sweep. -// -// The test exercises companionGCBackstop.sweep() directly (not via a hand-fed -// Reconcile request) to confirm the sweep generates the namespace-keyed event -// AND that the resulting Reconcile call cleans up the orphan. -func TestCompanionGCBackstop_SweepDeletesOrphanInWDlessNamespace(t *testing.T) { - t.Parallel() - - // Orphaned companion in a namespace with NO WDs present — the last WD was - // deleted before this controller started. - cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) // WD name in annotation but WD gone - cellClient := newCellFakeClient(cm) // no WD objects - - cl := newFakeCluster(cellClient) - mgr := newFakeMCManager(gcTestCluster, cl) - reconciler := &CompanionGCReconciler{mgr: mgr} - - // Wire the backstop with a buffered channel. We give it a generous buffer - // so the non-blocking send in sweep() succeeds. - backstopCh := make(chan event.GenericEvent, companionGCChannelBuffer) - backstop := &companionGCBackstop{ - ch: backstopCh, - clusters: map[multicluster.ClusterName]cluster.Cluster{ - multicluster.ClusterName(gcTestCluster): cl, - }, - } - - // Fire one sweep tick synchronously (no ticker needed in tests). - backstop.sweep(context.Background()) - - // The sweep must have emitted at least one event for gcTestNamespace. - require.NotEmpty(t, backstopCh, "backstop must emit a namespace-sweep event for a namespace containing companions") - - // Drain the channel and invoke Reconcile for each namespace-keyed event, - // exactly as the WatchesRawSource → backstopEventHandler → workqueue path - // would do in production. - sweptNamespaces := map[string]struct{}{} - ctx := mccontext.WithCluster(context.Background(), multicluster.ClusterName(gcTestCluster)) - for len(backstopCh) > 0 { - ev := <-backstopCh - // backstopEventHandler maps: obj.Name → ClusterName, obj.Namespace → ns. - clusterName := multicluster.ClusterName(ev.Object.GetName()) - ns := ev.Object.GetNamespace() - sweptNamespaces[ns] = struct{}{} - - _, err := reconciler.Reconcile(ctx, mcreconcile.Request{ - ClusterName: clusterName, - Request: ctrl.Request{NamespacedName: types.NamespacedName{Namespace: ns}}, - }) - require.NoError(t, err) - } - - // The backstop must have swept gcTestNamespace. - assert.Contains(t, sweptNamespaces, gcTestNamespace, - "backstop sweep must enqueue the namespace containing the orphaned companion") - - // The companion must be gone — collected by the Reconcile call above, not by - // a hand-fed request. This proves the backstop closes the WD-less namespace gap. - var got corev1.ConfigMap - err := cellClient.Get(context.Background(), - types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got) - assert.True(t, apierrors.IsNotFound(err), - "companion orphaned in WD-less namespace must be collected by the backstop sweep") -} - -// TestCompanionGCBackstop_SweepSkipsNamespaceWithLiveWD verifies that the -// backstop sweep does not delete a companion when a live WD still exists in the -// same namespace — the per-cell multi-referrer safety rule applies equally to -// backstop-triggered sweeps. -func TestCompanionGCBackstop_SweepSkipsNamespaceWithLiveWD(t *testing.T) { - t.Parallel() - - cm := companionCM([]string{prodRefKey(gcTestWD1Name)}) - wd := cellWD(gcTestWD1Name, gcTestNamespace) // live WD still present - cellClient := newCellFakeClient(cm, wd) - - cl := newFakeCluster(cellClient) - mgr := newFakeMCManager(gcTestCluster, cl) - reconciler := &CompanionGCReconciler{mgr: mgr} - - backstopCh := make(chan event.GenericEvent, companionGCChannelBuffer) - backstop := &companionGCBackstop{ - ch: backstopCh, - clusters: map[multicluster.ClusterName]cluster.Cluster{ - multicluster.ClusterName(gcTestCluster): cl, - }, - } - - backstop.sweep(context.Background()) - - ctx := mccontext.WithCluster(context.Background(), multicluster.ClusterName(gcTestCluster)) - for len(backstopCh) > 0 { - ev := <-backstopCh - clusterName := multicluster.ClusterName(ev.Object.GetName()) - ns := ev.Object.GetNamespace() - _, err := reconciler.Reconcile(ctx, mcreconcile.Request{ - ClusterName: clusterName, - Request: ctrl.Request{NamespacedName: types.NamespacedName{Namespace: ns}}, - }) - require.NoError(t, err) - } - - // Companion must NOT be deleted — the WD is still alive. - var got corev1.ConfigMap - require.NoError(t, cellClient.Get(context.Background(), - types.NamespacedName{Namespace: gcTestNamespace, Name: gcTestCMName}, &got), - "backstop must preserve companion when a live WD still references it") -} - -// TestCompanionGCBackstop_NoEventWhenNoCopanions verifies that the backstop -// emits no events when there are no companion objects in the cluster at all -// (no spurious requeues for empty clusters). -func TestCompanionGCBackstop_NoEventWhenNoCopanions(t *testing.T) { - t.Parallel() - - cellClient := newCellFakeClient() // nothing - cl := newFakeCluster(cellClient) - - backstopCh := make(chan event.GenericEvent, companionGCChannelBuffer) - backstop := &companionGCBackstop{ - ch: backstopCh, - clusters: map[multicluster.ClusterName]cluster.Cluster{ - multicluster.ClusterName(gcTestCluster): cl, - }, - } - - backstop.sweep(context.Background()) - assert.Empty(t, backstopCh, "backstop must emit no events when no companions exist") -} - -// ─── Federator ordering guard tests ─────────────────────────────────────────── - -// TestFederator_Finalize_RequeuesWhenCompanionsPresent verifies that the -// Finalize method returns a RequeueAfter result (not an error) when companion -// ConfigMaps or Secrets still exist in the downstream namespace. The -// PropagationPolicy must not be deleted until all companions are gone. -func TestFederator_Finalize_RequeuesWhenCompanionsPresent(t *testing.T) { - t.Parallel() - - ppName := propagationPolicyNameFor(testCityCodeLAX) - - tests := []struct { - name string - extraObj client.Object // the companion that blocks PP deletion - }{ - { - name: "ConfigMap companion blocks PP deletion", - extraObj: &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: "cm-companion", - Namespace: testKarmadaNSStr, - Labels: map[string]string{ - computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, - }, - }, - }, - }, - { - name: "Secret companion blocks PP deletion", - extraObj: &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: "secret-companion", - Namespace: testKarmadaNSStr, - Labels: map[string]string{ - computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, - }, - }, - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - - wd := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) - projectClient := newProjectFakeClient(testProjectNamespace(), wd) - - karmadaWD := &computev1alpha.WorkloadDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: testWDName, - Namespace: testKarmadaNSStr, - }, - } - karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ - ObjectMeta: metav1.ObjectMeta{ - Name: ppName, - Namespace: testKarmadaNSStr, - }, - } - karmadaClient := newKarmadaFakeClient( - &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, - karmadaWD, - karmadaPP, - tt.extraObj, - ) - - r := newTestFederator(projectClient, karmadaClient) - result, err := r.Reconcile(context.Background(), reconcileRequest()) - - // With a companion still present, Finalize must requeue gracefully — - // no error (which would inflate error metrics), just a RequeueAfter. - require.NoError(t, err, "Finalize should NOT return an error when companions are present; use RequeueAfter instead") - assert.Equal(t, companionGuardRequeueAfter, result.RequeueAfter, - "result should carry the companion-guard RequeueAfter delay") - - // PropagationPolicy must still be alive. - var remainingPP karmadapolicyv1alpha1.PropagationPolicy - require.NoError(t, karmadaClient.Get(context.Background(), - types.NamespacedName{Name: ppName, Namespace: testKarmadaNSStr}, - &remainingPP), - "PropagationPolicy must not be deleted while companions are still present") - }) - } -} - -// TestFederator_Finalize_DeletesPPWhenCompanionsGone verifies that once all -// companions are removed from the downstream namespace the PP is deleted as -// normal (ordering guard does not block a clean namespace). -func TestFederator_Finalize_DeletesPPWhenCompanionsGone(t *testing.T) { - t.Parallel() - - ppName := propagationPolicyNameFor(testCityCodeLAX) - wd := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) - projectClient := newProjectFakeClient(testProjectNamespace(), wd) - - karmadaWD := &computev1alpha.WorkloadDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: testWDName, - Namespace: testKarmadaNSStr, - }, - } - karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ - ObjectMeta: metav1.ObjectMeta{ - Name: ppName, - Namespace: testKarmadaNSStr, - }, - } - // No companions in the namespace — ordering guard should pass. - karmadaClient := newKarmadaFakeClient( - &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, - karmadaWD, - karmadaPP, - ) - - r := newTestFederator(projectClient, karmadaClient) - result, err := r.Reconcile(context.Background(), reconcileRequest()) - require.NoError(t, err) - assert.Equal(t, ctrl.Result{}, result) - - // PP must be gone — no companions blocked the cleanup. - var remainingPP karmadapolicyv1alpha1.PropagationPolicy - err = karmadaClient.Get(context.Background(), - types.NamespacedName{Name: ppName, Namespace: testKarmadaNSStr}, - &remainingPP) - assert.True(t, apierrors.IsNotFound(err), "PropagationPolicy should be deleted when no companions remain") -} - -// TestFederator_Finalize_DoesNotBlockOnSiblingWDCompanion is the Blocker 2 -// regression test. The downstream namespace (ns-{uid}) is shared by ALL WDs in -// a project. When WD-A is deleted while WD-B's companion still exists in the -// same namespace, the ordering guard must NOT block WD-A's finalizer. Blocking -// would permanently deadlock WD-A's deletion because WD-B's companion is never -// going to be cleaned up by WD-A's referenced-data controller. -// -// The guard must only fire when the deleting WD is the LAST one for its city -// code (i.e. when the PP is actually about to be deleted). -func TestFederator_Finalize_DoesNotBlockOnSiblingWDCompanion(t *testing.T) { - t.Parallel() - - ppName := propagationPolicyNameFor(testCityCodeLAX) - - // WD-A is the one being deleted (has finalizer + deletionTimestamp). - wdA := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) - projectClient := newProjectFakeClient(testProjectNamespace(), wdA) - - // In the downstream namespace: - // - WD-A's mirrored object (just deleted above, but not yet GC'd from Karmada) - // - WD-B: a sibling WD with the SAME city code, still alive - // - A companion belonging to WD-B - // - The shared PropagationPolicy - karmadaWDA := &computev1alpha.WorkloadDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: testWDName, - Namespace: testKarmadaNSStr, - Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, - }, - Spec: computev1alpha.WorkloadDeploymentSpec{ - CityCode: testCityCodeLAX, - PlacementName: testDefaultPlacement, - WorkloadRef: computev1alpha.WorkloadReference{Name: rdTestWorkloadName}, - ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, - }, - } - karmadaWDB := &computev1alpha.WorkloadDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "wd-b", - Namespace: testKarmadaNSStr, - Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, - }, - Spec: computev1alpha.WorkloadDeploymentSpec{ - CityCode: testCityCodeLAX, - PlacementName: testDefaultPlacement, - WorkloadRef: computev1alpha.WorkloadReference{Name: "workload-b"}, - ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, - }, - } - // WD-B's companion still lives in the shared namespace. - wdBCompanion := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: "cm-b-companion", - Namespace: testKarmadaNSStr, - Labels: map[string]string{ - computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, - }, - }, - } - karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ - ObjectMeta: metav1.ObjectMeta{ - Name: ppName, - Namespace: testKarmadaNSStr, - }, - } - karmadaClient := newKarmadaFakeClient( - &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, - karmadaWDA, - karmadaWDB, - wdBCompanion, - karmadaPP, - ) - - r := newTestFederator(projectClient, karmadaClient) - result, err := r.Reconcile(context.Background(), reconcileRequest()) - - // WD-A's finalizer must complete without error even though WD-B's companion - // is still in the namespace. WD-B is still alive so the PP is kept anyway - // (cleanupPropagationPolicyIfUnused no-ops). The companion guard must NOT - // fire here. - require.NoError(t, err, "WD-A finalization must not be blocked by WD-B's companion") - assert.Equal(t, ctrl.Result{}, result) - - // The PropagationPolicy must still be alive (WD-B keeps it alive, not the guard). - var remainingPP karmadapolicyv1alpha1.PropagationPolicy - require.NoError(t, karmadaClient.Get(context.Background(), - types.NamespacedName{Name: ppName, Namespace: testKarmadaNSStr}, - &remainingPP), - "PropagationPolicy should be kept because WD-B still references the city code") -} - -// TestFederator_Finalize_GuardBypassesAfterTimeout verifies that the companion -// ordering guard stops blocking when the WD has been terminating for longer than -// companionGuardTimeout. This prevents a permanently wedged referenced-data -// controller from deadlocking WD deletion. -func TestFederator_Finalize_GuardBypassesAfterTimeout(t *testing.T) { - t.Parallel() - - ppName := propagationPolicyNameFor(testCityCodeLAX) - - // Set the DeletionTimestamp far enough in the past to exceed the guard timeout. - pastDeletion := metav1.NewTime(time.Now().Add(-(companionGuardTimeout + time.Minute))) - wd := testWorkloadDeployment(withFinalizer, func(w *computev1alpha.WorkloadDeployment) { - w.DeletionTimestamp = &pastDeletion - }) - projectClient := newProjectFakeClient(testProjectNamespace(), wd) - - karmadaWD := &computev1alpha.WorkloadDeployment{ - ObjectMeta: metav1.ObjectMeta{Name: testWDName, Namespace: testKarmadaNSStr}, - } - karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ - ObjectMeta: metav1.ObjectMeta{Name: ppName, Namespace: testKarmadaNSStr}, - } - // A companion is still present — but the timeout should let us proceed anyway. - stuckCompanion := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: "stuck-companion", - Namespace: testKarmadaNSStr, - Labels: map[string]string{ - computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, - }, - }, - } - karmadaClient := newKarmadaFakeClient( - &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, - karmadaWD, - karmadaPP, - stuckCompanion, - ) - - r := newTestFederator(projectClient, karmadaClient) - result, err := r.Reconcile(context.Background(), reconcileRequest()) - - // After the timeout the guard is bypassed — finalization must complete. - require.NoError(t, err, "guard must bypass after timeout even with companion still present") - assert.Equal(t, ctrl.Result{}, result) - - // The PP must be deleted (last WD for the city, guard bypassed). - var remainingPP karmadapolicyv1alpha1.PropagationPolicy - err = karmadaClient.Get(context.Background(), - types.NamespacedName{Name: ppName, Namespace: testKarmadaNSStr}, - &remainingPP) - assert.True(t, apierrors.IsNotFound(err), "PropagationPolicy should be deleted after guard timeout bypass") -} diff --git a/internal/controller/orphan_rb_controller.go b/internal/controller/orphan_rb_controller.go new file mode 100644 index 00000000..6677f891 --- /dev/null +++ b/internal/controller/orphan_rb_controller.go @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" +) + +const ( + // orphanRBSweepInterval is how often the periodic backstop fires to catch + // ResourceBindings whose hub companion was deleted before the controller started. + orphanRBSweepInterval = 5 * time.Minute + + // ppNameLabelKey is the label Karmada's binding-controller stamps on every + // ResourceBinding to link it back to its governing PropagationPolicy. + ppNameLabelKey = "propagationpolicy.karmada.io/name" + + // cityPPPrefix is the prefix of PropagationPolicy names created by + // propagationPolicyNameFor in workloaddeployment_federator.go. RBs whose + // PP label starts with this prefix were created for referenced-data companion + // propagation. + cityPPPrefix = "city-" +) + +// OrphanRBReconciler runs on the Karmada hub and deletes ResourceBindings +// whose hub companion (ConfigMap or Secret) no longer exists. It is the +// backstop sweep that handles Karmada's event-driven deletion gap: if the +// binding-controller misses the companion-deletion event (e.g. PP deleted +// before RB reconcile completes), the RB and its child Works are stranded. +// This reconciler detects and removes those stranded RBs so that Works are +// deleted and cell copies stop being re-created. +// +// # Tight scope +// +// Only ResourceBindings satisfying ALL three conditions are ever deleted: +// 1. Name ends with "-configmap" or "-secret" (Karmada's kind-suffix for +// namespace-scoped ConfigMap/Secret RBs — WD RBs end in "-workloaddeployment"). +// 2. The propagationpolicy.karmada.io/name label starts with "city-" (all +// referenced-data PropagationPolicies use this prefix). +// 3. The hub companion derived by stripping the kind suffix does NOT exist +// in the same namespace (and has no deletionTimestamp — a terminating +// companion means Karmada cascade is still in progress). +// +// This filter is necessary and sufficient to avoid touching WD RBs and any +// non-compute ResourceBindings that happen to share the hub namespace. +// +// # Cascade +// +// Deleting the RB triggers Karmada's binding-controller to remove all Work +// objects the RB owns. The execution-controller then removes the cell copies. +// Because the Work is gone there is nothing left to re-create the cell copy — +// the deletion is permanent. +type OrphanRBReconciler struct { + // HubClient is a client pointed at the Karmada hub API server. + HubClient client.Client +} + +// +kubebuilder:rbac:groups=work.karmada.io,resources=resourcebindings,verbs=get;list;watch;delete + +// Reconcile is triggered for each ResourceBinding that passes the predicate +// filter. It checks whether the RB is a stranded orphan and deletes it if so. +func (r *OrphanRBReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + var rb karmadaworkv1alpha2.ResourceBinding + if err := r.HubClient.Get(ctx, req.NamespacedName, &rb); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + // Belt-and-suspenders: re-apply the scope filter in Reconcile so that even + // if the predicate is bypassed (e.g. during tests) only valid RBs are acted on. + if !r.isInScope(&rb) { + return ctrl.Result{}, nil + } + + companionName, kind, ok := companionFromRBName(rb.Name) + if !ok { + return ctrl.Result{}, nil + } + + orphaned, err := r.isOrphaned(ctx, rb.Namespace, companionName, kind) + if err != nil { + return ctrl.Result{}, err + } + if !orphaned { + return ctrl.Result{}, nil + } + + logger.Info("deleting orphaned companion ResourceBinding", + "name", rb.Name, + "namespace", rb.Namespace, + "companionName", companionName, + "companionKind", kind, + ) + if err := r.HubClient.Delete(ctx, &rb); client.IgnoreNotFound(err) != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil +} + +// isInScope returns true when the ResourceBinding falls within the tight scope +// of referenced-data companion RBs: city-PP label AND kind suffix. +func (r *OrphanRBReconciler) isInScope(rb *karmadaworkv1alpha2.ResourceBinding) bool { + ppName := rb.Labels[ppNameLabelKey] + if !strings.HasPrefix(ppName, cityPPPrefix) { + return false + } + name := rb.Name + return strings.HasSuffix(name, "-configmap") || strings.HasSuffix(name, "-secret") +} + +// companionFromRBName extracts the companion object name and kind from a +// ResourceBinding name. Karmada names namespace-scoped RBs as +// "{objectName}-{kindLowercase}", so: +// +// "cm-pristine-configmap" → ("cm-pristine", "ConfigMap", true) +// "secret-foo-secret" → ("secret-foo", "Secret", true) +// "wd-foo-workloaddeployment" → ("", "", false) // not a companion RB +func companionFromRBName(rbName string) (companionName, kind string, ok bool) { + switch { + case strings.HasSuffix(rbName, "-configmap"): + return strings.TrimSuffix(rbName, "-configmap"), kindConfigMap, true + case strings.HasSuffix(rbName, "-secret"): + return strings.TrimSuffix(rbName, "-secret"), kindSecret, true + default: + return "", "", false + } +} + +// isOrphaned returns true when the hub companion is fully absent. A companion +// with a deletionTimestamp is considered NOT orphaned — the deletion is in +// progress and Karmada's cascade (or Component 3's explicit RB delete) is +// expected to fire shortly. We do not interfere with in-progress deletions. +func (r *OrphanRBReconciler) isOrphaned(ctx context.Context, namespace, companionName, kind string) (bool, error) { + nsn := types.NamespacedName{Namespace: namespace, Name: companionName} + switch kind { + case kindConfigMap: + var cm corev1.ConfigMap + err := r.HubClient.Get(ctx, nsn, &cm) + if apierrors.IsNotFound(err) { + return true, nil + } + if err != nil { + return false, err + } + // Companion exists (including if terminating) — not orphaned yet. + return false, nil + case kindSecret: + var s corev1.Secret + err := r.HubClient.Get(ctx, nsn, &s) + if apierrors.IsNotFound(err) { + return true, nil + } + if err != nil { + return false, err + } + return false, nil + default: + return false, nil + } +} + +// SetupWithManager registers the OrphanRBReconciler with a regular ctrl.Manager +// pointed at the Karmada hub. It is called from SetupOrphanRBWithManager, not +// from the multicluster manager, because ResourceBindings live on the hub only. +// +// The predicate filter restricts the work queue to city-PP companion RBs +// (kind suffix + city- label prefix) before they ever reach Reconcile. +func (r *OrphanRBReconciler) SetupWithManager(mgr manager.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&karmadaworkv1alpha2.ResourceBinding{}, + builder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool { + ppName := obj.GetLabels()[ppNameLabelKey] + if !strings.HasPrefix(ppName, cityPPPrefix) { + return false + } + name := obj.GetName() + return strings.HasSuffix(name, "-configmap") || strings.HasSuffix(name, "-secret") + })), + ). + Named("orphan-rb"). + Complete(r) +} + +// orphanRBPeriodicSweep is a manager.Runnable that periodically lists all in- +// scope companion ResourceBindings on the hub and calls the reconciler directly +// for each. This covers RBs that became orphaned before the controller started, +// when no new RB events are expected to fire. +// +// The sweep calls Reconcile directly (bypassing the controller's work queue) +// because the manager.Runnable interface is simpler than wiring a synthetic +// event source. The reconciler is idempotent, so direct calls are safe. +type orphanRBPeriodicSweep struct { + hubClient client.Client + reconciler *OrphanRBReconciler + interval time.Duration +} + +// Start runs the periodic sweep loop until ctx is cancelled. +func (s *orphanRBPeriodicSweep) Start(ctx context.Context) error { + ticker := time.NewTicker(s.interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + s.sweep(ctx) + } + } +} + +// sweep lists all ResourceBindings on the hub, filters to in-scope companion +// RBs, and calls Reconcile directly for each candidate. +func (s *orphanRBPeriodicSweep) sweep(ctx context.Context) { + logger := log.FromContext(ctx).WithValues("component", "orphan-rb-periodic-sweep") + + var rbList karmadaworkv1alpha2.ResourceBindingList + // List across all namespaces; label filtering is applied client-side because + // a HasPrefix predicate is not expressible as a label selector. The total RB + // count in the hub is bounded (O(companions × cells)), so a full list is safe. + if err := s.hubClient.List(ctx, &rbList); err != nil { + logger.Error(err, "periodic sweep: list ResourceBindings failed") + return + } + + for i := range rbList.Items { + rb := &rbList.Items[i] + if !s.reconciler.isInScope(rb) { + continue + } + if _, err := s.reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{ + Namespace: rb.Namespace, + Name: rb.Name, + }, + }); err != nil { + logger.Error(err, "periodic sweep: reconcile failed", + "name", rb.Name, "namespace", rb.Namespace) + } + } +} + +// SetupOrphanRBWithManager wires the OrphanRBReconciler and its periodic sweep +// backstop onto the provided ctrl.Manager pointed at the Karmada hub. Called +// from setupManagementControllers in cmd/main.go. +func SetupOrphanRBWithManager(mgr manager.Manager, hubClient client.Client) error { + r := &OrphanRBReconciler{HubClient: hubClient} + if err := r.SetupWithManager(mgr); err != nil { + return err + } + return mgr.Add(&orphanRBPeriodicSweep{ + hubClient: hubClient, + reconciler: r, + interval: orphanRBSweepInterval, + }) +} diff --git a/internal/controller/orphan_rb_controller_test.go b/internal/controller/orphan_rb_controller_test.go new file mode 100644 index 00000000..f47c7c45 --- /dev/null +++ b/internal/controller/orphan_rb_controller_test.go @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" +) + +// ─── Test constants ──────────────────────────────────────────────────────────── + +const ( + // orbTestNS is the hub namespace where companion RBs and their companions live. + orbTestNS = "ns-efdf8ca1-9c2d-4ac8-b161-1951503a2879" + + // orbCityPPName is the PropagationPolicy name label used on companion RBs. + orbCityPPName = "city-dfw" +) + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +// newOrphanRBReconciler creates an OrphanRBReconciler backed by a fake hub client. +func newOrphanRBReconciler(objs ...client.Object) (*OrphanRBReconciler, client.Client) { + hubCl := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() + return &OrphanRBReconciler{HubClient: hubCl}, hubCl +} + +// makeCompanionRB returns a ResourceBinding that represents an in-scope +// companion RB (city-PP label + configmap/secret name suffix). The RB is +// placed in orbTestNS. +func makeCompanionRB(name string) *karmadaworkv1alpha2.ResourceBinding { + return &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: orbTestNS, + Name: name, + Labels: map[string]string{ppNameLabelKey: orbCityPPName}, + }, + } +} + +// reconcileRB runs Reconcile for the named ResourceBinding in orbTestNS. +func reconcileRB(t *testing.T, r *OrphanRBReconciler, name string) (ctrl.Result, error) { + t.Helper() + return r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: orbTestNS, Name: name}, + }) +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +// TestOrphanRB_DeletesOrphanedConfigMapRB asserts that an RB whose companion +// ConfigMap is absent is deleted. +func TestOrphanRB_DeletesOrphanedConfigMapRB(t *testing.T) { + t.Parallel() + + rb := makeCompanionRB("cm-pristine-configmap") + // No ConfigMap "cm-pristine" in the hub namespace — the RB is orphaned. + r, hubCl := newOrphanRBReconciler(rb) + + _, err := reconcileRB(t, r, "cm-pristine-configmap") + require.NoError(t, err) + + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "cm-pristine-configmap"}, &got) + require.True(t, apierrors.IsNotFound(err), "orphaned companion RB must be deleted") +} + +// TestOrphanRB_DeletesOrphanedSecretRB asserts the same for Secret companion RBs. +func TestOrphanRB_DeletesOrphanedSecretRB(t *testing.T) { + t.Parallel() + + rb := makeCompanionRB("secret-pristine-secret") + r, hubCl := newOrphanRBReconciler(rb) + + _, err := reconcileRB(t, r, "secret-pristine-secret") + require.NoError(t, err) + + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "secret-pristine-secret"}, &got) + require.True(t, apierrors.IsNotFound(err), "orphaned companion Secret RB must be deleted") +} + +// TestOrphanRB_SkipsLiveCompanion asserts that an RB whose companion still +// exists (live, no deletionTimestamp) is NOT deleted. +func TestOrphanRB_SkipsLiveCompanion(t *testing.T) { + t.Parallel() + + rb := makeCompanionRB("cm-live-configmap") + // Companion ConfigMap exists and is live. + liveCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: orbTestNS, Name: "cm-live"}, + } + r, hubCl := newOrphanRBReconciler(rb, liveCM) + + _, err := reconcileRB(t, r, "cm-live-configmap") + require.NoError(t, err) + + // RB must still exist. + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "cm-live-configmap"}, &got) + require.NoError(t, err, "RB must NOT be deleted when companion still exists") + assert.Equal(t, "cm-live-configmap", got.Name) +} + +// TestOrphanRB_SkipsTerminatingCompanion asserts that an RB whose companion +// has a deletionTimestamp is NOT deleted — the deletion cascade may still fire. +func TestOrphanRB_SkipsTerminatingCompanion(t *testing.T) { + t.Parallel() + + rb := makeCompanionRB("cm-terminating-configmap") + now := metav1.Now() + // Companion exists with a deletionTimestamp — deletion in progress. + terminatingCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: orbTestNS, + Name: "cm-terminating", + DeletionTimestamp: &now, + Finalizers: []string{"karmada.io/fake"}, + }, + } + r, hubCl := newOrphanRBReconciler(rb, terminatingCM) + + _, err := reconcileRB(t, r, "cm-terminating-configmap") + require.NoError(t, err) + + // RB must still exist. + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "cm-terminating-configmap"}, &got) + require.NoError(t, err, "RB must NOT be deleted when companion is terminating") +} + +// TestOrphanRB_IgnoresNonCityPPRB asserts that an RB without the "city-" +// PropagationPolicy label is never touched, even if its companion is absent. +func TestOrphanRB_IgnoresNonCityPPRB(t *testing.T) { + t.Parallel() + + // RB has a non-city PP label (e.g. a WD's PP or an unrelated tenant's PP). + nonCityRB := &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: orbTestNS, + Name: "cm-other-configmap", + Labels: map[string]string{ppNameLabelKey: "other-pp"}, + }, + } + r, hubCl := newOrphanRBReconciler(nonCityRB) + + _, err := reconcileRB(t, r, "cm-other-configmap") + require.NoError(t, err) + + // RB must still exist — it is outside the companion scope. + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "cm-other-configmap"}, &got) + require.NoError(t, err, "non-city-PP RB must NOT be touched") +} + +// TestOrphanRB_IgnoresWorkloadDeploymentRB asserts that a WD ResourceBinding +// (suffix "-workloaddeployment") is never deleted even when its companion is +// absent and it has a city- PP label. WD RBs are out of scope. +func TestOrphanRB_IgnoresWorkloadDeploymentRB(t *testing.T) { + t.Parallel() + + // WD RB: correct city- PP label but wrong kind suffix. + wdRB := &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: orbTestNS, + Name: "my-workload-workloaddeployment", + Labels: map[string]string{ppNameLabelKey: "city-dfw"}, + }, + } + r, hubCl := newOrphanRBReconciler(wdRB) + + _, err := reconcileRB(t, r, "my-workload-workloaddeployment") + require.NoError(t, err) + + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "my-workload-workloaddeployment"}, &got) + require.NoError(t, err, "WorkloadDeployment RB must NEVER be deleted by the orphan sweep") +} + +// TestOrphanRB_ToleratesRBAlreadyGone asserts that the reconciler does not +// error when the RB is already gone at reconcile time (deleted by Component 3). +func TestOrphanRB_ToleratesRBAlreadyGone(t *testing.T) { + t.Parallel() + + // Do not seed the RB — simulate it being deleted before Reconcile runs. + r, _ := newOrphanRBReconciler() + + _, err := reconcileRB(t, r, "cm-already-gone-configmap") + require.NoError(t, err, "reconciler must tolerate NotFound on the RB itself") +} + +// TestOrphanRB_CompanionFromRBName_Patterns verifies companionFromRBName +// correctly extracts companion names/kinds and rejects non-companion patterns. +func TestOrphanRB_CompanionFromRBName_Patterns(t *testing.T) { + t.Parallel() + + tests := []struct { + rbName string + wantOK bool + wantName string + wantKind string + }{ + {"cm-pristine-configmap", true, "cm-pristine", kindConfigMap}, + {"secret-foo-secret", true, "secret-foo", kindSecret}, + {"a-b-c-secret", true, "a-b-c", kindSecret}, + {"wd-foo-workloaddeployment", false, "", ""}, + {"just-a-name", false, "", ""}, + {"", false, "", ""}, + {"-configmap", true, "", kindConfigMap}, // degenerate — name is empty + {"notsuffix", false, "", ""}, + } + + for _, tc := range tests { + t.Run(tc.rbName, func(t *testing.T) { + name, kind, ok := companionFromRBName(tc.rbName) + assert.Equal(t, tc.wantOK, ok, "ok mismatch for %q", tc.rbName) + assert.Equal(t, tc.wantName, name, "name mismatch for %q", tc.rbName) + assert.Equal(t, tc.wantKind, kind, "kind mismatch for %q", tc.rbName) + }) + } +} diff --git a/internal/controller/referenceddata_controller.go b/internal/controller/referenceddata_controller.go index 9593bc8a..1bb445b9 100644 --- a/internal/controller/referenceddata_controller.go +++ b/internal/controller/referenceddata_controller.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/referenceddata" "go.miloapis.com/milo/pkg/downstreamclient" @@ -105,6 +106,18 @@ type companionWriter interface { // DeleteSecret deletes the companion Secret if it exists. DeleteSecret(ctx context.Context, namespace, name string) error + // DeleteResourceBinding deletes the Karmada ResourceBinding associated with + // a companion object after the companion itself has been deleted. The RB name + // follows the Karmada binding-controller convention: "{companionName}-{kind}", + // where kind is the lowercase resource kind ("configmap" or "secret"). + // + // Callers MUST pass client.IgnoreNotFound semantics: the RB may already be + // gone if Karmada's event-driven cascade ran before this call. + // + // localCompanionWriter returns nil unconditionally — there is no Karmada hub + // in single-cluster mode. downstreamCompanionWriter deletes from the hub. + DeleteResourceBinding(ctx context.Context, namespace, rbName string) error + // GetConfigMap returns the existing companion ConfigMap, or nil if absent. GetConfigMap(ctx context.Context, namespace, name string) (*corev1.ConfigMap, error) @@ -153,6 +166,12 @@ func (w *localCompanionWriter) DeleteSecret(ctx context.Context, namespace, name return client.IgnoreNotFound(w.cl.Delete(ctx, s)) } +// DeleteResourceBinding is a no-op for localCompanionWriter: in single-cluster +// mode there is no Karmada hub and therefore no ResourceBindings to clean up. +func (w *localCompanionWriter) DeleteResourceBinding(_ context.Context, _, _ string) error { + return nil +} + func (w *localCompanionWriter) GetConfigMap(ctx context.Context, namespace, name string) (*corev1.ConfigMap, error) { var cm corev1.ConfigMap err := w.cl.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &cm) @@ -241,6 +260,25 @@ func (w *downstreamCompanionWriter) DeleteSecret(ctx context.Context, _, name st return client.IgnoreNotFound(w.hubClient.Delete(ctx, s)) } +// DeleteResourceBinding deletes the Karmada ResourceBinding for a companion +// that has just been deleted from the hub. The RB lives in the same downstream +// namespace as the companion and is named "{companionName}-{kind}" per Karmada's +// binding-controller convention (e.g. "cm-pristine-configmap"). +// +// Deleting the RB cascades: binding-controller removes the Work, which drives +// the execution-controller to remove the cell copy permanently (no Work left to +// re-create it). client.IgnoreNotFound is applied because Karmada's own +// event-driven cascade may have already cleaned up the RB. +func (w *downstreamCompanionWriter) DeleteResourceBinding(ctx context.Context, _, rbName string) error { + rb := &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: w.downstreamNamespace, + Name: rbName, + }, + } + return client.IgnoreNotFound(w.hubClient.Delete(ctx, rb)) +} + func (w *downstreamCompanionWriter) GetConfigMap(ctx context.Context, _, name string) (*corev1.ConfigMap, error) { var cm corev1.ConfigMap err := w.hubClient.Get(ctx, types.NamespacedName{Namespace: w.downstreamNamespace, Name: name}, &cm) @@ -1065,6 +1103,12 @@ func (r *ReferencedDataController) releaseRemovedCompanions( // and the subsequent Update target the same resourceVersion so a concurrent // change causes a conflict that drives a re-read and retry. // +// When the ref-count drops to zero and the companion is deleted, the associated +// Karmada ResourceBinding is also deleted. This short-circuits Karmada's event- +// driven cascade and ensures the Work is removed regardless of PropagationPolicy +// state. The RB delete is issued outside the RetryOnConflict loop because it +// is unconditional once the companion is gone. +// // If the ref-count annotation is unparseable the whole call returns an error // (transient). The companion is NOT deleted in that case — it may still be // referenced by other WDs whose entries are recorded in the corrupt annotation. @@ -1079,6 +1123,7 @@ func (r *ReferencedDataController) releaseOneCompanion( // Try ConfigMap. var cmExists bool + var cmDeleted bool if releaseConfigMap { if err := retry.RetryOnConflict(retry.DefaultRetry, func() error { cm, err := writer.GetConfigMap(ctx, namespace, companionName) @@ -1095,6 +1140,7 @@ func (r *ReferencedDataController) releaseOneCompanion( return fmt.Errorf("referenceddata: corrupt ref-count on ConfigMap %q: %w", companionName, err) } if len(remaining) == 0 { + cmDeleted = true return writer.DeleteConfigMap(ctx, namespace, companionName) } // Build a desired object that carries the updated ref-count. Pass the @@ -1111,6 +1157,15 @@ func (r *ReferencedDataController) releaseOneCompanion( } } if cmExists { + // Delete the Karmada ResourceBinding if the companion was fully removed. + // The RB name follows the Karmada binding-controller convention: + // "{companionName}-configmap". IgnoreNotFound because Karmada may have + // already cascaded the deletion. + if cmDeleted { + if err := writer.DeleteResourceBinding(ctx, namespace, companionName+"-configmap"); err != nil { + return fmt.Errorf("referenceddata: delete ResourceBinding for ConfigMap %q: %w", companionName, err) + } + } return nil } @@ -1119,7 +1174,8 @@ func (r *ReferencedDataController) releaseOneCompanion( } // Try Secret. - return retry.RetryOnConflict(retry.DefaultRetry, func() error { + var secretDeleted bool + if err := retry.RetryOnConflict(retry.DefaultRetry, func() error { s, err := writer.GetSecret(ctx, namespace, companionName) if err != nil { return fmt.Errorf("get companion Secret %q: %w", companionName, err) @@ -1132,6 +1188,7 @@ func (r *ReferencedDataController) releaseOneCompanion( return fmt.Errorf("referenceddata: corrupt ref-count on Secret %q: %w", companionName, err) } if len(remaining) == 0 { + secretDeleted = true return writer.DeleteSecret(ctx, namespace, companionName) } desired := s.DeepCopy() @@ -1140,7 +1197,19 @@ func (r *ReferencedDataController) releaseOneCompanion( } desired.Annotations[companionRefCountAnnotation] = encodeRefCount(remaining) return writer.ApplySecret(ctx, s, desired) - }) + }); err != nil { + return err + } + + // Delete the Karmada ResourceBinding if the companion was fully removed. + // The RB name follows the Karmada binding-controller convention: + // "{companionName}-secret". + if secretDeleted { + if err := writer.DeleteResourceBinding(ctx, namespace, companionName+"-secret"); err != nil { + return fmt.Errorf("referenceddata: delete ResourceBinding for Secret %q: %w", companionName, err) + } + } + return nil } // writerFor returns the companionWriter appropriate for the current mode. diff --git a/internal/controller/referenceddata_controller_test.go b/internal/controller/referenceddata_controller_test.go index c24a46e9..a6155474 100644 --- a/internal/controller/referenceddata_controller_test.go +++ b/internal/controller/referenceddata_controller_test.go @@ -27,6 +27,7 @@ import ( "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/referenceddata" ) @@ -779,11 +780,11 @@ func TestReferencedData_CompanionNamespaceInvariant(t *testing.T) { // newRDControllerFederated creates a ReferencedDataController wired with a // FederationClient (fake hub client). The project cluster holds the WDs and // source ConfigMaps/Secrets; the hub client is the destination for companions. +// Reader is always nil in tests; the controller falls back to LocalReader. func newRDControllerFederated( t *testing.T, projectCl client.Client, hubCl client.Client, - reader referenceddata.ProjectConfigSecretReader, ) (*ReferencedDataController, string) { t.Helper() clusterName := rdTestClusterName @@ -797,7 +798,6 @@ func newRDControllerFederated( c := &ReferencedDataController{ mgr: mgr, opts: ReferencedDataControllerOptions{ - Reader: reader, FederationClient: hubCl, }, } @@ -841,7 +841,7 @@ func TestReferencedData_Federated_CompanionWrittenToHub(t *testing.T) { require.NoError(t, computev1alpha.AddToScheme(hubScheme)) hubCl := fake.NewClientBuilder().WithScheme(hubScheme).Build() - c, clusterName := newRDControllerFederated(t, projectCl, hubCl, nil) + c, clusterName := newRDControllerFederated(t, projectCl, hubCl) // First reconcile: stamps finalizer. reconcileWD(t, c, clusterName, projNS, "wd-fed-1") @@ -1544,6 +1544,312 @@ func TestFederator_StatusSync_PreservesReferencedDataReadyCondition(t *testing.T "resolver's Ready reason must be preserved by federator status sync") } +// ─── Component 3: Explicit ResourceBinding teardown ────────────────────────── + +// TestRBTeardown_ConfigMap asserts that when a federated companion ConfigMap is +// deleted (ref-count reaches zero), the controller also deletes the Karmada +// ResourceBinding named "{companionName}-configmap" in the downstream namespace. +func TestRBTeardown_ConfigMap(t *testing.T) { + t.Parallel() + + projNS := testProjNS + projNSUID := testProjNSUID + cmName := rdTestAppConfig + companionName := referenceddata.CompanionName("ConfigMap", cmName) + rbName := companionName + "-configmap" + wdName := "wd-rb-teardown-cm" + + now := metav1.Now() + wdKey := types.NamespacedName{Namespace: projNS, Name: wdName}.String() + + projNSObj := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: projNS, UID: projNSUID}, + } + // WD pre-seeded with deletionTimestamp + finalizer + annotation to trigger + // the deletion reconcile path directly. + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: projNS, + Name: wdName, + DeletionTimestamp: &now, + Finalizers: []string{referencedDataFinalizer}, + Annotations: map[string]string{ + computev1alpha.ExpectedReferencedDataAnnotation: `["ConfigMap/` + companionName + `"]`, + }, + }, + } + + s := rdTestScheme(t) + require.NoError(t, corev1.AddToScheme(s)) + + projectCl := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(projNSObj, wd). + WithStatusSubresource(wd). + Build() + + // Hub: companion ConfigMap already exists + the RB the reconciler should delete. + downstreamNS := testKarmadaNSStr + existingCompanion := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: downstreamNS, + Name: companionName, + Labels: map[string]string{computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue}, + Annotations: map[string]string{ + companionRefCountAnnotation: encodeRefCount([]string{wdKey}), + }, + }, + } + existingRB := &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{Namespace: downstreamNS, Name: rbName}, + } + + hubCl := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(existingCompanion, existingRB). + Build() + + c, clusterName := newRDControllerFederated(t, projectCl, hubCl) + + // Reconcile deletion: should delete companion + RB. + cn := multicluster.ClusterName(clusterName) + ctx := mccontext.WithCluster(context.Background(), cn) + _, err := c.Reconcile(ctx, mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: projNS, Name: wdName}}, + ClusterName: cn, + }) + require.NoError(t, err) + + // Companion must be gone from the hub. + var hubCM corev1.ConfigMap + err = hubCl.Get(context.Background(), types.NamespacedName{Namespace: downstreamNS, Name: companionName}, &hubCM) + require.True(t, apierrors.IsNotFound(err), "hub companion ConfigMap must be deleted") + + // ResourceBinding must be gone from the hub. + var rb karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), types.NamespacedName{Namespace: downstreamNS, Name: rbName}, &rb) + require.True(t, apierrors.IsNotFound(err), "ResourceBinding %q must be deleted after companion deletion", rbName) +} + +// TestRBTeardown_Secret asserts the same teardown behaviour for Secret companions. +func TestRBTeardown_Secret(t *testing.T) { + t.Parallel() + + projNS := testProjNS + projNSUID := testProjNSUID + secretName := "db-creds" + companionName := referenceddata.CompanionName("Secret", secretName) + rbName := companionName + "-secret" + wdName := "wd-rb-teardown-sec" + + now := metav1.Now() + wdKey := types.NamespacedName{Namespace: projNS, Name: wdName}.String() + + projNSObj := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: projNS, UID: projNSUID}, + } + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: projNS, + Name: wdName, + DeletionTimestamp: &now, + Finalizers: []string{referencedDataFinalizer}, + Annotations: map[string]string{ + computev1alpha.ExpectedReferencedDataAnnotation: `["Secret/` + companionName + `"]`, + }, + }, + } + + s := rdTestScheme(t) + require.NoError(t, corev1.AddToScheme(s)) + + projectCl := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(projNSObj, wd). + WithStatusSubresource(wd). + Build() + + downstreamNS := testKarmadaNSStr + existingCompanion := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: downstreamNS, + Name: companionName, + Labels: map[string]string{computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue}, + Annotations: map[string]string{ + companionRefCountAnnotation: encodeRefCount([]string{wdKey}), + }, + }, + } + existingRB := &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{Namespace: downstreamNS, Name: rbName}, + } + + hubCl := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(existingCompanion, existingRB). + Build() + + c, clusterName := newRDControllerFederated(t, projectCl, hubCl) + + cn := multicluster.ClusterName(clusterName) + ctx := mccontext.WithCluster(context.Background(), cn) + _, err := c.Reconcile(ctx, mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: projNS, Name: wdName}}, + ClusterName: cn, + }) + require.NoError(t, err) + + var hubSecret corev1.Secret + err = hubCl.Get(context.Background(), types.NamespacedName{Namespace: downstreamNS, Name: companionName}, &hubSecret) + require.True(t, apierrors.IsNotFound(err), "hub companion Secret must be deleted") + + var rb karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), types.NamespacedName{Namespace: downstreamNS, Name: rbName}, &rb) + require.True(t, apierrors.IsNotFound(err), "ResourceBinding %q must be deleted after companion deletion", rbName) +} + +// TestRBTeardown_ToleratesNotFound asserts that companion deletion succeeds +// even when the ResourceBinding is already gone (Karmada beat the controller). +func TestRBTeardown_ToleratesNotFound(t *testing.T) { + t.Parallel() + + projNS := testProjNS + projNSUID := testProjNSUID + cmName := rdTestAppConfig + companionName := referenceddata.CompanionName("ConfigMap", cmName) + wdName := "wd-rb-notfound" + + now := metav1.Now() + wdKey := types.NamespacedName{Namespace: projNS, Name: wdName}.String() + + projNSObj := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: projNS, UID: projNSUID}, + } + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: projNS, + Name: wdName, + DeletionTimestamp: &now, + Finalizers: []string{referencedDataFinalizer}, + Annotations: map[string]string{ + computev1alpha.ExpectedReferencedDataAnnotation: `["ConfigMap/` + companionName + `"]`, + }, + }, + } + + s := rdTestScheme(t) + require.NoError(t, corev1.AddToScheme(s)) + + projectCl := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(projNSObj, wd). + WithStatusSubresource(wd). + Build() + + downstreamNS := testKarmadaNSStr + existingCompanion := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: downstreamNS, + Name: companionName, + Labels: map[string]string{computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue}, + Annotations: map[string]string{ + companionRefCountAnnotation: encodeRefCount([]string{wdKey}), + }, + }, + } + // RB is intentionally absent — Karmada already deleted it. + hubCl := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(existingCompanion). + Build() + + c, clusterName := newRDControllerFederated(t, projectCl, hubCl) + + cn := multicluster.ClusterName(clusterName) + ctx := mccontext.WithCluster(context.Background(), cn) + _, err := c.Reconcile(ctx, mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: projNS, Name: wdName}}, + ClusterName: cn, + }) + // Must not error even though the RB is already gone. + require.NoError(t, err, "RB teardown must tolerate NotFound") +} + +// TestRBTeardown_LocalWriter_NoOp asserts that the localCompanionWriter path +// (FederationClient == nil, single-cluster dev mode) does NOT attempt to delete +// any ResourceBindings — there is no Karmada hub in single-cluster mode. +// +// The companion and its source share the same name/namespace in single-cluster +// mode. We seed the WD as already terminating so the deletion path fires +// immediately. +func TestRBTeardown_LocalWriter_NoOp(t *testing.T) { + t.Parallel() + + ns := rdTestNamespace + cmName := rdTestAppConfig + companionName := referenceddata.CompanionName("ConfigMap", cmName) + wdName := "wd-local-noop" + + now := metav1.Now() + wdKey := types.NamespacedName{Namespace: ns, Name: wdName}.String() + + // In single-cluster mode the companion == source (same name/namespace). + // Seed the companion directly as if the controller had materialised it. + companion := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: companionName, + Labels: map[string]string{computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue}, + Annotations: map[string]string{ + companionRefCountAnnotation: encodeRefCount([]string{wdKey}), + }, + }, + } + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: wdName, + DeletionTimestamp: &now, + Finalizers: []string{referencedDataFinalizer}, + Annotations: map[string]string{ + computev1alpha.ExpectedReferencedDataAnnotation: `["ConfigMap/` + companionName + `"]`, + }, + }, + } + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(companion, wd). + WithStatusSubresource(wd). + Build() + + // Track Delete calls to detect any unexpected RB deletes. + var rbDeleteCalled bool + interceptedCl := interceptor.NewClient(cl, interceptor.Funcs{ + Delete: func(ctx context.Context, c client.WithWatch, obj client.Object, opts ...client.DeleteOption) error { + if _, ok := obj.(*karmadaworkv1alpha2.ResourceBinding); ok { + rbDeleteCalled = true + } + return c.Delete(ctx, obj, opts...) + }, + }) + + // No FederationClient → localCompanionWriter path. + c, clusterName := newRDController(t, interceptedCl, nil) + + cn := multicluster.ClusterName(clusterName) + ctx := mccontext.WithCluster(context.Background(), cn) + _, err := c.Reconcile(ctx, mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: ns, Name: wdName}}, + ClusterName: cn, + }) + require.NoError(t, err) + + // The companion ConfigMap is deleted (localCompanionWriter deletes it + // directly), but NO ResourceBinding delete should have been attempted. + assert.False(t, rbDeleteCalled, "localCompanionWriter must NOT delete ResourceBindings (no Karmada hub in single-cluster mode)") +} + // ─── Helpers ───────────────────────────────────────────────────────────────── // stubReader allows individual test cases to inject custom reader behaviour. diff --git a/internal/controller/testing_helpers_test.go b/internal/controller/testing_helpers_test.go index cc3d3d9f..91737cba 100644 --- a/internal/controller/testing_helpers_test.go +++ b/internal/controller/testing_helpers_test.go @@ -15,9 +15,16 @@ import ( "sigs.k8s.io/multicluster-runtime/pkg/multicluster" karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" computev1alpha "go.datum.net/compute/api/v1alpha" ) +// gcTestProjectNamespace is the project-plane namespace used across GC and +// WorkloadDeployment controller tests. "default" matches the annotation key +// prefix written by the hub-side ReferencedDataController +// (e.g. "default/mount-pristine-default-dfw"). +const gcTestProjectNamespace = "default" + // ─── Scheme helpers ─────────────────────────────────────────────────────────── // newProjectScheme builds a runtime.Scheme with the types needed by the project @@ -30,12 +37,13 @@ func newProjectScheme() *runtime.Scheme { } // newKarmadaScheme builds a runtime.Scheme with the types needed by the Karmada -// API server (corev1 + compute + karmada policy). +// API server (corev1 + compute + karmada policy + karmada work ResourceBindings). func newKarmadaScheme() *runtime.Scheme { s := runtime.NewScheme() _ = corev1.AddToScheme(s) _ = computev1alpha.AddToScheme(s) _ = karmadapolicyv1alpha1.Install(s) + _ = karmadaworkv1alpha2.Install(s) return s } From fb58dacb3784f7e40aed36de6b088e4a81e563ef Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 1 Jun 2026 20:58:51 -0500 Subject: [PATCH 4/8] fix: read Karmada PP name from ResourceBinding annotations, not labels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OrphanRBReconciler scoped referenced-data companion ResourceBindings by reading propagationpolicy.karmada.io/name from metadata.labels, but the running Karmada version stores that value in metadata.annotations (labels carry only permanent-id UUIDs). The label read always returned "", so the scope predicate rejected every ResourceBinding and the controller never reclaimed an orphaned binding — confirmed live in the lab, where the stranded cm-pristine/secret-pristine copies were never cleaned. Read from annotations in both isInScope and the watch predicate. The unit-test fixtures previously set the PP name as a label (matching the bug, which is why they passed against broken code); they now use the production annotation shape, so they fail against the label read and pass against the fix. Co-Authored-By: Claude Opus 4.8 (1M context) (cherry picked from commit e3213705a5b9a393cafb35d43e89371dc4ed75ba) --- internal/controller/orphan_rb_controller.go | 24 ++++++----- .../controller/orphan_rb_controller_test.go | 41 +++++++++++++++---- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/internal/controller/orphan_rb_controller.go b/internal/controller/orphan_rb_controller.go index 6677f891..3e34c06c 100644 --- a/internal/controller/orphan_rb_controller.go +++ b/internal/controller/orphan_rb_controller.go @@ -25,14 +25,17 @@ const ( // ResourceBindings whose hub companion was deleted before the controller started. orphanRBSweepInterval = 5 * time.Minute - // ppNameLabelKey is the label Karmada's binding-controller stamps on every - // ResourceBinding to link it back to its governing PropagationPolicy. - ppNameLabelKey = "propagationpolicy.karmada.io/name" + // ppNameAnnotationKey is the annotation Karmada's binding-controller stamps on + // every ResourceBinding to link it back to its governing PropagationPolicy. + // Despite the "name" in the key string, Karmada stores this value in + // metadata.annotations, NOT metadata.labels (labels carry only permanent-id + // UUIDs). Reading from labels always returns "" and breaks scope filtering. + ppNameAnnotationKey = "propagationpolicy.karmada.io/name" // cityPPPrefix is the prefix of PropagationPolicy names created by // propagationPolicyNameFor in workloaddeployment_federator.go. RBs whose - // PP label starts with this prefix were created for referenced-data companion - // propagation. + // PP annotation starts with this prefix were created for referenced-data + // companion propagation. cityPPPrefix = "city-" ) @@ -49,8 +52,9 @@ const ( // Only ResourceBindings satisfying ALL three conditions are ever deleted: // 1. Name ends with "-configmap" or "-secret" (Karmada's kind-suffix for // namespace-scoped ConfigMap/Secret RBs — WD RBs end in "-workloaddeployment"). -// 2. The propagationpolicy.karmada.io/name label starts with "city-" (all -// referenced-data PropagationPolicies use this prefix). +// 2. The propagationpolicy.karmada.io/name annotation starts with "city-" (all +// referenced-data PropagationPolicies use this prefix; Karmada stores this +// in annotations, not labels). // 3. The hub companion derived by stripping the kind suffix does NOT exist // in the same namespace (and has no deletionTimestamp — a terminating // companion means Karmada cascade is still in progress). @@ -116,9 +120,9 @@ func (r *OrphanRBReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c } // isInScope returns true when the ResourceBinding falls within the tight scope -// of referenced-data companion RBs: city-PP label AND kind suffix. +// of referenced-data companion RBs: city-PP annotation AND kind suffix. func (r *OrphanRBReconciler) isInScope(rb *karmadaworkv1alpha2.ResourceBinding) bool { - ppName := rb.Labels[ppNameLabelKey] + ppName := rb.Annotations[ppNameAnnotationKey] if !strings.HasPrefix(ppName, cityPPPrefix) { return false } @@ -187,7 +191,7 @@ func (r *OrphanRBReconciler) SetupWithManager(mgr manager.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&karmadaworkv1alpha2.ResourceBinding{}, builder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool { - ppName := obj.GetLabels()[ppNameLabelKey] + ppName := obj.GetAnnotations()[ppNameAnnotationKey] if !strings.HasPrefix(ppName, cityPPPrefix) { return false } diff --git a/internal/controller/orphan_rb_controller_test.go b/internal/controller/orphan_rb_controller_test.go index f47c7c45..ece60486 100644 --- a/internal/controller/orphan_rb_controller_test.go +++ b/internal/controller/orphan_rb_controller_test.go @@ -25,8 +25,14 @@ const ( // orbTestNS is the hub namespace where companion RBs and their companions live. orbTestNS = "ns-efdf8ca1-9c2d-4ac8-b161-1951503a2879" - // orbCityPPName is the PropagationPolicy name label used on companion RBs. + // orbCityPPName is the PropagationPolicy name stored in metadata.annotations + // on companion RBs. Karmada writes it to annotations, not labels (labels only + // carry permanent-id UUIDs). orbCityPPName = "city-dfw" + + // Karmada permanent-id label keys — present in metadata.labels on production RBs. + orbPPPermanentIDKey = "propagationpolicy.karmada.io/permanent-id" + orbRBPermanentIDKey = "resourcebinding.karmada.io/permanent-id" ) // ─── Helpers ────────────────────────────────────────────────────────────────── @@ -40,15 +46,22 @@ func newOrphanRBReconciler(objs ...client.Object) (*OrphanRBReconciler, client.C return &OrphanRBReconciler{HubClient: hubCl}, hubCl } -// makeCompanionRB returns a ResourceBinding that represents an in-scope -// companion RB (city-PP label + configmap/secret name suffix). The RB is -// placed in orbTestNS. +// makeCompanionRB returns a ResourceBinding that mirrors the production shape +// observed in the lab: PP name in metadata.annotations (NOT labels), and +// permanent-id UUIDs in labels. This matches what Karmada's binding-controller +// actually stamps on each ResourceBinding. func makeCompanionRB(name string) *karmadaworkv1alpha2.ResourceBinding { return &karmadaworkv1alpha2.ResourceBinding{ ObjectMeta: metav1.ObjectMeta{ Namespace: orbTestNS, Name: name, - Labels: map[string]string{ppNameLabelKey: orbCityPPName}, + Labels: map[string]string{ + orbPPPermanentIDKey: "a3d5b1ac-0000-0000-0000-000000000001", + orbRBPermanentIDKey: "ee4b0939-0000-0000-0000-000000000001", + }, + Annotations: map[string]string{ + ppNameAnnotationKey: orbCityPPName, + }, }, } } @@ -153,12 +166,17 @@ func TestOrphanRB_SkipsTerminatingCompanion(t *testing.T) { func TestOrphanRB_IgnoresNonCityPPRB(t *testing.T) { t.Parallel() - // RB has a non-city PP label (e.g. a WD's PP or an unrelated tenant's PP). + // RB has a non-city PP annotation (e.g. a WD's PP or an unrelated tenant's PP). + // Production shape: PP name in annotations, permanent-ids in labels. nonCityRB := &karmadaworkv1alpha2.ResourceBinding{ ObjectMeta: metav1.ObjectMeta{ Namespace: orbTestNS, Name: "cm-other-configmap", - Labels: map[string]string{ppNameLabelKey: "other-pp"}, + Labels: map[string]string{ + "propagationpolicy.karmada.io/permanent-id": "a3d5b1ac-0000-0000-0000-000000000099", + "resourcebinding.karmada.io/permanent-id": "ee4b0939-0000-0000-0000-000000000099", + }, + Annotations: map[string]string{ppNameAnnotationKey: "other-pp"}, }, } r, hubCl := newOrphanRBReconciler(nonCityRB) @@ -179,12 +197,17 @@ func TestOrphanRB_IgnoresNonCityPPRB(t *testing.T) { func TestOrphanRB_IgnoresWorkloadDeploymentRB(t *testing.T) { t.Parallel() - // WD RB: correct city- PP label but wrong kind suffix. + // WD RB: correct city- PP annotation but wrong kind suffix ("-workloaddeployment" + // instead of "-configmap"/"-secret"). Production shape: PP name in annotations. wdRB := &karmadaworkv1alpha2.ResourceBinding{ ObjectMeta: metav1.ObjectMeta{ Namespace: orbTestNS, Name: "my-workload-workloaddeployment", - Labels: map[string]string{ppNameLabelKey: "city-dfw"}, + Labels: map[string]string{ + "propagationpolicy.karmada.io/permanent-id": "a3d5b1ac-0000-0000-0000-000000000088", + "resourcebinding.karmada.io/permanent-id": "ee4b0939-0000-0000-0000-000000000088", + }, + Annotations: map[string]string{ppNameAnnotationKey: "city-dfw"}, }, } r, hubCl := newOrphanRBReconciler(wdRB) From 33422ff47f52c5e95af352385edfa94fd6dd7625 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 1 Jun 2026 21:59:20 -0500 Subject: [PATCH 5/8] feat: hub-side level-triggered companion GC (Component 6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add CompanionGCReconciler, a level-triggered backstop for referenced-data companions stranded by interrupted finalization. When the ReferencedDataController's WD finalizer is interrupted mid-flight (pod restart, SIGKILL), a hub companion can be left with a referenced-by annotation pointing at WDs that no longer exist on the hub. Without this controller, the companion persists forever (leaking data including Secret bytes), its ResourceBinding keeps a Work alive on the hub, and Karmada continuously re-creates the cell copy — exactly the cm-pristine/secret-pristine lab scenario. Design: - Watches referenced-data-labeled ConfigMaps and Secrets on the hub. The federationMgr cache for ConfigMaps and Secrets is restricted to objects carrying the ReferencedDataLabel via cache.Options.ByObject (cmd/main.go). This is the OOM guard: predicates filter events, not cache contents; without the cache-level scope the informer would list-and-watch every ConfigMap/Secret on the Karmada hub — the same unscoped-informer pattern that OOMKilled the cell CompanionGCReconciler. The label predicate on the controller is kept as belt-and-suspenders but is NOT the primary memory guard. - On each reconcile, parses the referenced-by annotation and checks each referenced WD by name in the companion's own hub namespace (ns-{project-uid}) via HubClient.Get — no MCManager needed because WDs are federated to the hub. - If ALL referrers are absent: deletes the companion and its ResourceBinding via the downstreamCompanionWriter path, driving the full Karmada cascade (RB → Work → cell copy deleted permanently). - Conservative safety: terminating WDs count as present; corrupt annotations and malformed WD keys are handled by skip-not-delete. - A companionGCPeriodicSweep backstop fires every 5 minutes to catch companions stranded before the controller started. - Wired in setupManagementControllers on the federationMgr alongside OrphanRBReconciler and InstanceProjector. Unit tests cover: orphaned CM/Secret deleted + RB torn down; live referrer preserved; terminating referrer counts as present; all-absent multi-referrer deleted; partial multi-referrer preserved; corrupt annotation preserved; empty annotation preserved; unlabeled object unaffected; RB-already-gone tolerated; periodic sweep drives reconciliation. Federated e2e (test/e2e/referenced-data-delete-cascade/): - HAPPY-PATH CASCADE: create WD + ConfigMap/Secret → assert companions on hub + cell + RBs present → delete WD → assert hub companion, RBs, cell copies all deleted and stay deleted (30 s anti-thrash poll). - STRANDED COMPANION BACKSTOP: inject a labeled companion with a dead WD key → assert CompanionGCReconciler reclaims it within the sweep interval. The e2e test is correct and self-contained but requires the Kind+Karmada harness (task e2e:up) which is not runnable headless. Co-Authored-By: Claude Sonnet 4.6 (cherry picked from commit e419b549d1cb19783c9259e2ecba88e50c6853cb) --- cmd/main.go | 43 +- .../controller/companion_gc_controller.go | 404 +++++++++++++++ .../companion_gc_controller_test.go | 467 ++++++++++++++++++ .../controller/orphan_rb_controller_test.go | 8 +- 4 files changed, 914 insertions(+), 8 deletions(-) create mode 100644 internal/controller/companion_gc_controller.go create mode 100644 internal/controller/companion_gc_controller_test.go diff --git a/cmd/main.go b/cmd/main.go index e11aa6b8..ba7097e5 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -12,8 +12,10 @@ import ( // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. "golang.org/x/sync/errgroup" + corev1 "k8s.io/api/core/v1" _ "k8s.io/client-go/plugin/pkg/client/auth" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -21,6 +23,7 @@ import ( "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" @@ -481,14 +484,36 @@ func ignoreCanceled(err error) error { // InstanceProjector). Called only when management controllers are enabled and // a federation REST config is available. func setupManagementControllers(mgr mcmanager.Manager, federationClient client.Client) ([]manager.Runnable, error) { + // companionLabelSelector scopes the federation manager's ConfigMap and + // Secret informer cache to referenced-data companions only. Without this, + // For(&corev1.ConfigMap{}) in CompanionGCReconciler would establish a + // cluster-wide ConfigMap+Secret informer that caches every object on the + // Karmada hub — the same OOM pattern that killed the cell CompanionGCReconciler. + // The label CACHE scope (not the predicate) is the correct OOM guard: + // predicates filter events, not cache contents. + companionLabelSelector := labels.SelectorFromSet(labels.Set{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }) + // The federation manager provides a cached, watchable handle to the Karmada - // federation control plane. It backs the InstanceProjector's Instance watch - // and the WorkloadDeploymentFederator's downstream WorkloadDeployment status - // watch. A manager.Manager embeds a cluster.Cluster, so it can be passed - // directly anywhere a watchable federation cluster source is required. + // federation control plane. It backs the InstanceProjector's Instance watch, + // the WorkloadDeploymentFederator's downstream WorkloadDeployment status watch, + // and the CompanionGCReconciler. A manager.Manager embeds a cluster.Cluster, so + // it can be passed directly anywhere a watchable federation cluster source is + // required. federationMgr, err := manager.New(federationRestConfig, manager.Options{ Scheme: scheme, Metrics: metricsserver.Options{BindAddress: "0"}, + Cache: cache.Options{ + // Scope ConfigMap and Secret informers to referenced-data companions. + // CompanionGCReconciler is the only consumer on federationMgr that + // reads these types; nothing else (InstanceProjector, OrphanRBReconciler) + // needs non-companion CMs or Secrets from the cache. + ByObject: map[client.Object]cache.ByObject{ + &corev1.ConfigMap{}: {Label: companionLabelSelector}, + &corev1.Secret{}: {Label: companionLabelSelector}, + }, + }, }) if err != nil { return nil, fmt.Errorf("federation manager: %w", err) @@ -525,5 +550,15 @@ func setupManagementControllers(mgr mcmanager.Manager, federationClient client.C return nil, fmt.Errorf("OrphanRBReconciler: %w", err) } + // CompanionGCReconciler is a level-triggered backstop for stranded hub + // companions: labeled ConfigMaps/Secrets whose referenced-by annotation + // points at WDs that no longer exist on the hub. On each reconcile it + // checks all referrer WDs in the hub namespace; if all are absent the + // companion and its ResourceBinding are deleted, driving the Karmada + // cascade to clean up Works and cell copies permanently. + if err = controller.SetupCompanionGCWithManager(federationMgr, federationClient); err != nil { + return nil, fmt.Errorf("CompanionGCReconciler: %w", err) + } + return []manager.Runnable{federationMgr}, nil } diff --git a/internal/controller/companion_gc_controller.go b/internal/controller/companion_gc_controller.go new file mode 100644 index 00000000..e0453fd8 --- /dev/null +++ b/internal/controller/companion_gc_controller.go @@ -0,0 +1,404 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +const ( + // companionGCSweepInterval is how often the periodic backstop fires to catch + // companions stranded before the controller started. + companionGCSweepInterval = 5 * time.Minute +) + +// CompanionGCReconciler is a hub-side level-triggered garbage collector for +// referenced-data companion ConfigMaps and Secrets. It is the backstop for the +// interrupted-finalization failure mode: if the ReferencedDataController's WD +// finalizer is interrupted mid-flight (e.g. pod restart), a companion can be +// left with a referenced-by annotation pointing at WDs that no longer exist. +// +// On each reconcile, the controller: +// 1. Reads the companion's referenced-by annotation (JSON array of "ns/name" WD keys). +// 2. Checks each referenced WD for existence in the hub namespace via the +// uncached HubClient. WDs live in the same hub namespace as the companion +// (ns-{project-uid}), federated there by the WorkloadDeploymentFederator. +// The WD name is parsed from the "ns/name" key (the name portion only — +// the namespace in the key is the project namespace, not the hub namespace). +// 3. If ALL referrers are absent in the hub, the companion is stranded → +// deletes it and its Karmada ResourceBinding via the downstreamCompanionWriter +// path. Deleting the RB triggers Karmada's binding-controller to remove the +// Work, which drives the execution-controller to remove the cell copy permanently. +// +// Safety invariant: if ANY referrer WD exists in the hub (including terminating), +// the companion is preserved. Terminating WDs count as present because the +// ReferencedDataController's finalizer may still complete teardown. +// +// Watch scope / OOM guard: the federationMgr cache for ConfigMaps and Secrets +// is restricted to objects carrying the ReferencedDataLabel via +// cache.Options.ByObject in cmd/main.go (setupManagementControllers). This is +// the actual OOM guard — predicates filter events, not cache contents; without +// the cache-level label scope the informer would list-and-watch every +// ConfigMap/Secret on the Karmada hub (the same unscoped-informer pattern that +// OOMKilled the cell CompanionGCReconciler). The label predicate on this +// controller is kept as belt-and-suspenders against predicate bypass but is NOT +// the primary memory guard. +// +// WD existence check: uses HubClient.Get (cache-backed). The federation manager +// cache has no ByObject restriction on WorkloadDeployments, so the WD cache +// covers all WDs in ns-{project-uid} namespaces. The check is purely for WD +// absence; normal cache lag cannot produce a false-orphan decision because a +// recently-deleted WD that is still in the cache is treated as present +// (conservative) and will be re-checked on the next reconcile or sweep tick. +type CompanionGCReconciler struct { + // HubClient is a client pointed at the Karmada hub API server. Used to + // read companions, check WD existence, delete companions, and delete RBs. + HubClient client.Client +} + +// Reconcile is triggered for each labeled companion (ConfigMap or Secret) that +// passes the predicate filter. It checks whether the companion is stranded and +// deletes it (plus its RB) if all referrers are absent from the hub. +func (r *CompanionGCReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + // Determine whether this is a ConfigMap or Secret by probing both. + // The controller watches both resource types; req carries no kind info. + companionKind, err := r.resolveKind(ctx, req.NamespacedName) + if err != nil { + return ctrl.Result{}, err + } + if companionKind == "" { + // Object is already gone — nothing to do. + return ctrl.Result{}, nil + } + + // Belt-and-suspenders: re-verify the referenced-data label is present even + // when Reconcile is called directly (bypassing the predicate). This prevents + // any non-companion from being touched regardless of annotation content. + if !r.isCompanion(ctx, req.NamespacedName, companionKind) { + return ctrl.Result{}, nil + } + + refKeys, skip, err := r.readRefCountKeys(ctx, req.NamespacedName, companionKind) + if err != nil { + return ctrl.Result{}, err + } + if skip { + // Absent annotation, empty annotation, or corrupt annotation — not a + // managed companion or conservatively skip. + return ctrl.Result{}, nil + } + + allGone, err := r.allReferrersAbsent(ctx, req.Namespace, refKeys) + if err != nil { + return ctrl.Result{}, err + } + if !allGone { + return ctrl.Result{}, nil + } + + // All referrer WDs are absent from the hub — companion is stranded. + logger.Info("deleting stranded companion", + "namespace", req.Namespace, + "name", req.Name, + "kind", companionKind, + ) + + writer := &downstreamCompanionWriter{ + hubClient: r.HubClient, + downstreamNamespace: req.Namespace, + } + + switch companionKind { + case kindConfigMap: + if err := writer.DeleteConfigMap(ctx, req.Namespace, req.Name); err != nil { + return ctrl.Result{}, fmt.Errorf("companion-gc: delete ConfigMap %q: %w", req.Name, err) + } + if err := writer.DeleteResourceBinding(ctx, req.Namespace, req.Name+"-configmap"); err != nil { + return ctrl.Result{}, fmt.Errorf("companion-gc: delete ConfigMap RB %q: %w", req.Name, err) + } + case kindSecret: + if err := writer.DeleteSecret(ctx, req.Namespace, req.Name); err != nil { + return ctrl.Result{}, fmt.Errorf("companion-gc: delete Secret %q: %w", req.Name, err) + } + if err := writer.DeleteResourceBinding(ctx, req.Namespace, req.Name+"-secret"); err != nil { + return ctrl.Result{}, fmt.Errorf("companion-gc: delete Secret RB %q: %w", req.Name, err) + } + } + + return ctrl.Result{}, nil +} + +// isCompanion returns true when the object at nsn/kind carries the +// referenced-data label. Used as a belt-and-suspenders guard inside Reconcile +// to prevent touching objects that bypassed the predicate filter. +func (r *CompanionGCReconciler) isCompanion(ctx context.Context, nsn types.NamespacedName, kind string) bool { + switch kind { + case kindConfigMap: + var cm corev1.ConfigMap + if err := r.HubClient.Get(ctx, nsn, &cm); err != nil { + return false + } + return cm.Labels[computev1alpha.ReferencedDataLabel] == computev1alpha.ReferencedDataLabelValue + case kindSecret: + var s corev1.Secret + if err := r.HubClient.Get(ctx, nsn, &s); err != nil { + return false + } + return s.Labels[computev1alpha.ReferencedDataLabel] == computev1alpha.ReferencedDataLabelValue + } + return false +} + +// resolveKind returns kindConfigMap or kindSecret depending on which resource +// exists at the given namespace/name. Returns ("", nil) when neither exists. +func (r *CompanionGCReconciler) resolveKind(ctx context.Context, nsn types.NamespacedName) (string, error) { + var cm corev1.ConfigMap + switch err := r.HubClient.Get(ctx, nsn, &cm); { + case err == nil: + return kindConfigMap, nil + case !apierrors.IsNotFound(err): + return "", err + } + + var s corev1.Secret + switch err := r.HubClient.Get(ctx, nsn, &s); { + case err == nil: + return kindSecret, nil + case !apierrors.IsNotFound(err): + return "", err + } + + return "", nil +} + +// readRefCountKeys returns the WD key slice from the companion's referenced-by +// annotation. +// +// Returns (nil, true, nil) — skip=true — in the following cases: +// - Annotation is absent or empty: the object was not created by +// ReferencedDataController (or was created before the annotation existed). +// - Annotation is present but unparseable: corrupt state, conservatively +// preserve the companion to avoid incorrectly deleting a still-in-use object. +// +// Returns (keys, false, nil) — skip=false — when the annotation is valid. +func (r *CompanionGCReconciler) readRefCountKeys( + ctx context.Context, + nsn types.NamespacedName, + kind string, +) (keys []string, skip bool, err error) { + var annotations map[string]string + + switch kind { + case kindConfigMap: + var cm corev1.ConfigMap + if err := r.HubClient.Get(ctx, nsn, &cm); err != nil { + if apierrors.IsNotFound(err) { + return nil, true, nil + } + return nil, false, err + } + annotations = cm.Annotations + case kindSecret: + var s corev1.Secret + if err := r.HubClient.Get(ctx, nsn, &s); err != nil { + if apierrors.IsNotFound(err) { + return nil, true, nil + } + return nil, false, err + } + annotations = s.Annotations + } + + // decodeRefCount returns (nil, nil) for absent/empty annotation and + // (nil, err) for corrupt annotation — both map to skip=true. + decoded, decodeErr := decodeRefCount(annotations) + if decodeErr != nil { + log.FromContext(ctx).Info("companion has unparseable referenced-by annotation; preserving", + "namespace", nsn.Namespace, + "name", nsn.Name, + "kind", kind, + ) + return nil, true, nil + } + if decoded == nil { + // Absent or empty annotation — not a managed companion, skip. + return nil, true, nil + } + return decoded, false, nil +} + +// allReferrersAbsent returns true when every WD key in refKeys refers to a WD +// that no longer exists in the hub namespace hubNS. WDs live in the same hub +// namespace as the companion (ns-{project-uid}) after being federated by the +// WorkloadDeploymentFederator. +// +// The WD key format is "projectNamespace/wdName". Only the wdName portion is +// used for the hub lookup — the hub namespace (hubNS) is already known from the +// companion's own namespace. +// +// A WD that exists in the hub — even with a deletionTimestamp — is considered +// present. Terminating referrers still have their finalizers running and may +// complete normal companion teardown; we do not interfere. +// +// An empty refKeys slice means no referrers were recorded — treat as stranded. +func (r *CompanionGCReconciler) allReferrersAbsent(ctx context.Context, hubNS string, refKeys []string) (bool, error) { + if len(refKeys) == 0 { + return true, nil + } + + for _, key := range refKeys { + wdName, err := wdNameFromRefKey(key) + if err != nil { + // Malformed key — treat conservatively: companion is NOT orphaned. + log.FromContext(ctx).Info("companion has malformed WD key in referenced-by; preserving", + "key", key, + ) + return false, nil + } + + nsn := types.NamespacedName{Namespace: hubNS, Name: wdName} + var wd computev1alpha.WorkloadDeployment + switch err := r.HubClient.Get(ctx, nsn, &wd); { + case err == nil: + // WD exists in hub (possibly terminating) — live referrer present. + return false, nil + case apierrors.IsNotFound(err): + // This referrer is absent from the hub; check remaining keys. + continue + default: + return false, fmt.Errorf("companion-gc: check hub WD %q existence: %w", key, err) + } + } + return true, nil +} + +// wdNameFromRefKey extracts the WD name from a "projectNamespace/wdName" key. +// The namespace portion is discarded — the hub namespace is taken from the +// companion's own namespace, which is passed separately to allReferrersAbsent. +func wdNameFromRefKey(key string) (string, error) { + idx := strings.Index(key, "/") + if idx < 0 { + return "", fmt.Errorf("malformed WD key %q: missing '/' separator", key) + } + name := key[idx+1:] + if name == "" { + return "", fmt.Errorf("malformed WD key %q: empty name after '/'", key) + } + return name, nil +} + +// SetupWithManager registers the CompanionGCReconciler with a regular +// ctrl.Manager pointed at the Karmada hub. Both ConfigMap and Secret watches +// are label-scoped so no cluster-wide informer is created. +func (r *CompanionGCReconciler) SetupWithManager(mgr manager.Manager) error { + labelPredicate := predicate.NewPredicateFuncs(func(obj client.Object) bool { + return obj.GetLabels()[computev1alpha.ReferencedDataLabel] == computev1alpha.ReferencedDataLabelValue + }) + + return ctrl.NewControllerManagedBy(mgr). + For(&corev1.ConfigMap{}, builder.WithPredicates(labelPredicate)). + Watches( + &corev1.Secret{}, + &handler.EnqueueRequestForObject{}, + builder.WithPredicates(labelPredicate), + ). + Named("companion-gc"). + Complete(r) +} + +// companionGCPeriodicSweep is a manager.Runnable that periodically lists all +// labeled companion ConfigMaps and Secrets on the hub and calls the reconciler +// directly for each. This backstop covers companions that became stranded before +// the controller started, when no new object events will fire. +type companionGCPeriodicSweep struct { + hubClient client.Client + reconciler *CompanionGCReconciler + interval time.Duration +} + +// Start runs the periodic sweep loop until ctx is cancelled. +func (s *companionGCPeriodicSweep) Start(ctx context.Context) error { + ticker := time.NewTicker(s.interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + s.sweep(ctx) + } + } +} + +// sweep lists all labeled companion ConfigMaps and Secrets on the hub and +// reconciles each. This catches companions stranded before the controller started. +func (s *companionGCPeriodicSweep) sweep(ctx context.Context) { + logger := log.FromContext(ctx).WithValues("component", "companion-gc-periodic-sweep") + + labelSel := client.MatchingLabels{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + } + + var cmList corev1.ConfigMapList + if err := s.hubClient.List(ctx, &cmList, labelSel); err != nil { + logger.Error(err, "periodic sweep: list companion ConfigMaps failed") + } else { + for i := range cmList.Items { + cm := &cmList.Items[i] + if _, err := s.reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: cm.Namespace, Name: cm.Name}, + }); err != nil { + logger.Error(err, "periodic sweep: reconcile ConfigMap companion failed", + "namespace", cm.Namespace, "name", cm.Name) + } + } + } + + var sList corev1.SecretList + if err := s.hubClient.List(ctx, &sList, labelSel); err != nil { + logger.Error(err, "periodic sweep: list companion Secrets failed") + } else { + for i := range sList.Items { + sec := &sList.Items[i] + if _, err := s.reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: sec.Namespace, Name: sec.Name}, + }); err != nil { + logger.Error(err, "periodic sweep: reconcile Secret companion failed", + "namespace", sec.Namespace, "name", sec.Name) + } + } + } +} + +// SetupCompanionGCWithManager wires the CompanionGCReconciler and its periodic +// sweep backstop onto the provided ctrl.Manager pointed at the Karmada hub. +// Called from setupManagementControllers in cmd/main.go. +func SetupCompanionGCWithManager(mgr manager.Manager, hubClient client.Client) error { + r := &CompanionGCReconciler{HubClient: hubClient} + if err := r.SetupWithManager(mgr); err != nil { + return err + } + return mgr.Add(&companionGCPeriodicSweep{ + hubClient: hubClient, + reconciler: r, + interval: companionGCSweepInterval, + }) +} diff --git a/internal/controller/companion_gc_controller_test.go b/internal/controller/companion_gc_controller_test.go new file mode 100644 index 00000000..90d850da --- /dev/null +++ b/internal/controller/companion_gc_controller_test.go @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Test constants ──────────────────────────────────────────────────────────── + +const ( + // gcHubNS is the hub namespace (ns-{project-uid}) used across companion GC tests. + gcHubNS = "ns-efdf8ca1-9c2d-4ac8-b161-1951503a2879" + + // gcProjectNS is the project-plane namespace whose WDs are recorded in the + // referenced-by annotation. The annotation key format is "projectNS/wdName". + gcProjectNS = "default" + + // gcWD1Name is the WD name used in single-referrer tests. + gcWD1Name = "mount-pristine" + + // gcWD2Name is the WD name used in multi-referrer tests. + gcWD2Name = "mount-alternate" + + // gcCMPristineRBName is the Karmada ResourceBinding name for the cm-pristine + // companion — used in both companion-GC and orphan-RB test assertions. + gcCMPristineRBName = "cm-pristine-configmap" +) + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +// gcRefKey builds the "projectNS/wdName" annotation entry used in production. +// Using a real-looking key exercises the annotation-parsing path correctly. +func gcRefKey(wdName string) string { + return gcProjectNS + "/" + wdName +} + +// gcEncodeRefs serialises WD keys to the JSON annotation value that +// ReferencedDataController writes on production companions. +func gcEncodeRefs(keys ...string) string { + b, _ := json.Marshal(keys) + return string(b) +} + +// newGCReconciler creates a CompanionGCReconciler backed by a fake hub client +// containing the supplied objects. +func newGCReconciler(objs ...client.Object) (*CompanionGCReconciler, client.Client) { + hubCl := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() + return &CompanionGCReconciler{HubClient: hubCl}, hubCl +} + +// gcMakeCompanionCM builds a ConfigMap that mirrors a production hub companion: +// carries the referenced-data label and a referenced-by annotation whose value +// is the JSON-encoded list of WD keys. +func gcMakeCompanionCM(name string, wdKeys ...string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: name, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + Annotations: map[string]string{ + companionRefCountAnnotation: gcEncodeRefs(wdKeys...), + }, + }, + Data: map[string]string{"key": "value"}, + } +} + +// gcMakeCompanionSecret builds a Secret that mirrors a production hub companion. +func gcMakeCompanionSecret(name string, wdKeys ...string) *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: name, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + Annotations: map[string]string{ + companionRefCountAnnotation: gcEncodeRefs(wdKeys...), + }, + }, + Data: map[string][]byte{"key": []byte("value")}, + } +} + +// gcMakeHubWD builds a WorkloadDeployment that mirrors a production hub WD: +// lives in the hub namespace (ns-{project-uid}), not the project namespace. +func gcMakeHubWD(name string) *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: name, + }, + } +} + +// gcMakeTerminatingHubWD builds a hub WD that is terminating (deletionTimestamp set). +func gcMakeTerminatingHubWD(name string) *computev1alpha.WorkloadDeployment { + now := metav1.Now() + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: name, + DeletionTimestamp: &now, + Finalizers: []string{"compute.datumapis.com/test"}, + }, + } +} + +// gcMakeCompanionRB builds a ResourceBinding for a companion — used to assert +// the RB teardown path. The naming follows Karmada's convention: {name}-{kind}. +func gcMakeCompanionRB(companionName, kindSuffix string) *karmadaworkv1alpha2.ResourceBinding { + return &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: companionName + "-" + kindSuffix, + Annotations: map[string]string{ + ppNameAnnotationKey: "city-dfw", + }, + }, + } +} + +// reconcileGC runs Reconcile for the named object in gcHubNS. +func reconcileGC(t *testing.T, r *CompanionGCReconciler, name string) (ctrl.Result, error) { + t.Helper() + return r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: gcHubNS, Name: name}, + }) +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +// TestCompanionGC_DeletesOrphanedConfigMap asserts that a ConfigMap companion +// whose sole referrer WD is absent from the hub is deleted along with its RB. +func TestCompanionGC_DeletesOrphanedConfigMap(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-pristine", gcRefKey(gcWD1Name)) + rb := gcMakeCompanionRB("cm-pristine", "configmap") + // No hub WD "mount-pristine" — the referrer is absent. + r, hubCl := newGCReconciler(cm, rb) + + _, err := reconcileGC(t, r, "cm-pristine") + require.NoError(t, err) + + // Companion ConfigMap must be deleted. + var gotCM corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-pristine"}, &gotCM) + require.True(t, apierrors.IsNotFound(err), "orphaned ConfigMap companion must be deleted") + + // Associated ResourceBinding must be deleted. + var gotRB karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: gcCMPristineRBName}, &gotRB) + require.True(t, apierrors.IsNotFound(err), "orphaned ConfigMap RB must be deleted") +} + +// TestCompanionGC_DeletesOrphanedSecret asserts the same for Secret companions. +func TestCompanionGC_DeletesOrphanedSecret(t *testing.T) { + t.Parallel() + + secret := gcMakeCompanionSecret("secret-pristine", gcRefKey(gcWD1Name)) + rb := gcMakeCompanionRB("secret-pristine", "secret") + r, hubCl := newGCReconciler(secret, rb) + + _, err := reconcileGC(t, r, "secret-pristine") + require.NoError(t, err) + + var gotSecret corev1.Secret + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "secret-pristine"}, &gotSecret) + require.True(t, apierrors.IsNotFound(err), "orphaned Secret companion must be deleted") + + var gotRB karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "secret-pristine-secret"}, &gotRB) + require.True(t, apierrors.IsNotFound(err), "orphaned Secret RB must be deleted") +} + +// TestCompanionGC_PreservesCompanionWithLiveReferrer asserts that a companion +// whose referrer WD still exists in the hub is NOT deleted. +func TestCompanionGC_PreservesCompanionWithLiveReferrer(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-live", gcRefKey(gcWD1Name)) + liveWD := gcMakeHubWD(gcWD1Name) // WD exists in hub namespace + r, hubCl := newGCReconciler(cm, liveWD) + + _, err := reconcileGC(t, r, "cm-live") + require.NoError(t, err) + + // Companion must still exist. + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-live"}, &got) + require.NoError(t, err, "companion with live referrer must NOT be deleted") +} + +// TestCompanionGC_TerminatingReferrerCountsAsPresent asserts that a companion +// whose referrer WD is terminating (DeletionTimestamp set) is preserved. +// The ReferencedDataController finalizer on the terminating WD may still complete +// teardown; the GC must not race with it. +func TestCompanionGC_TerminatingReferrerCountsAsPresent(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-term", gcRefKey(gcWD1Name)) + terminatingWD := gcMakeTerminatingHubWD(gcWD1Name) + r, hubCl := newGCReconciler(cm, terminatingWD) + + _, err := reconcileGC(t, r, "cm-term") + require.NoError(t, err) + + // Companion must still exist. + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-term"}, &got) + require.NoError(t, err, "companion with terminating referrer must NOT be deleted") +} + +// TestCompanionGC_MultiReferrer_AllAbsent asserts that a companion shared by +// two WDs is deleted when BOTH referrers are absent from the hub. +func TestCompanionGC_MultiReferrer_AllAbsent(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-shared", + gcRefKey(gcWD1Name), + gcRefKey(gcWD2Name), + ) + // Neither WD exists on the hub. + r, hubCl := newGCReconciler(cm) + + _, err := reconcileGC(t, r, "cm-shared") + require.NoError(t, err) + + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-shared"}, &got) + require.True(t, apierrors.IsNotFound(err), "companion with all referrers absent must be deleted") +} + +// TestCompanionGC_MultiReferrer_OneRemains asserts that a companion shared by +// two WDs is preserved when one referrer is still present on the hub. +func TestCompanionGC_MultiReferrer_OneRemains(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-partial", + gcRefKey(gcWD1Name), + gcRefKey(gcWD2Name), + ) + // Only one of the two WDs is gone; gcWD2Name still exists. + liveWD := gcMakeHubWD(gcWD2Name) + r, hubCl := newGCReconciler(cm, liveWD) + + _, err := reconcileGC(t, r, "cm-partial") + require.NoError(t, err) + + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-partial"}, &got) + require.NoError(t, err, "companion with at least one live referrer must NOT be deleted") +} + +// TestCompanionGC_CorruptAnnotationPreservesCompanion asserts that a companion +// with a corrupt (unparseable) referenced-by annotation is NOT deleted. +// Corrupt state is treated conservatively: other referrers may exist that cannot +// be parsed; we must not silently drop companions. +func TestCompanionGC_CorruptAnnotationPreservesCompanion(t *testing.T) { + t.Parallel() + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: "cm-corrupt", + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + Annotations: map[string]string{ + // Not valid JSON — simulates corruption (e.g. truncated write). + companionRefCountAnnotation: `["default/wd-1","default/wd-2`, + }, + }, + } + r, hubCl := newGCReconciler(cm) + + _, err := reconcileGC(t, r, "cm-corrupt") + require.NoError(t, err, "corrupt annotation must not cause an error (only skip)") + + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-corrupt"}, &got) + require.NoError(t, err, "companion with corrupt annotation must NOT be deleted") +} + +// TestCompanionGC_EmptyAnnotationPreservesCompanion asserts that a companion +// with no referenced-by annotation is not deleted. Such companions were not +// created by the ReferencedDataController (or predate the annotation). +func TestCompanionGC_EmptyAnnotationPreservesCompanion(t *testing.T) { + t.Parallel() + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: "cm-no-anno", + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + // No referenced-by annotation at all. + }, + } + r, hubCl := newGCReconciler(cm) + + _, err := reconcileGC(t, r, "cm-no-anno") + require.NoError(t, err) + + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-no-anno"}, &got) + require.NoError(t, err, "companion without referenced-by annotation must NOT be deleted") +} + +// TestCompanionGC_UnlabeledObjectUnaffected asserts that an unlabeled ConfigMap +// (not a companion) is never touched even if its name matches a companion pattern. +func TestCompanionGC_UnlabeledObjectUnaffected(t *testing.T) { + t.Parallel() + + // ConfigMap without the referenced-data label — not a companion. + unlabeled := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: "not-a-companion", + Annotations: map[string]string{ + // Even if it has the annotation, the GC must not touch it without the label. + companionRefCountAnnotation: "[]", + }, + }, + } + r, hubCl := newGCReconciler(unlabeled) + + // The GC is triggered via the predicate; force reconcile directly. + _, err := reconcileGC(t, r, "not-a-companion") + require.NoError(t, err) + + // Object must still exist — the GC skips non-companions. + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "not-a-companion"}, &got) + require.NoError(t, err, "non-companion object must NOT be touched by GC") +} + +// TestCompanionGC_ObjectAlreadyGone asserts that the reconciler does not error +// when the companion is already gone at reconcile time. +func TestCompanionGC_ObjectAlreadyGone(t *testing.T) { + t.Parallel() + + // Do not seed any object. + r, _ := newGCReconciler() + + _, err := reconcileGC(t, r, "already-gone") + require.NoError(t, err, "reconciler must tolerate NotFound on the companion itself") +} + +// TestCompanionGC_RBAlreadyGoneIsToleratedOnOrphanDelete asserts that if the +// companion is orphaned but its RB is already gone (Karmada cascade beat us), +// the GC does not error. +func TestCompanionGC_RBAlreadyGoneIsToleratedOnOrphanDelete(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-norb", gcRefKey(gcWD1Name)) + // No RB seeded — simulates Karmada cascade already cleaned it up. + r, hubCl := newGCReconciler(cm) + + _, err := reconcileGC(t, r, "cm-norb") + require.NoError(t, err, "missing RB must not cause an error — IgnoreNotFound applied") + + // Companion itself must be deleted (the IgnoreNotFound RB delete must not abort). + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-norb"}, &got) + require.True(t, apierrors.IsNotFound(err), "companion must be deleted even when RB was already absent") +} + +// TestCompanionGC_WdNameFromRefKey_Patterns verifies the WD name extraction +// from various key formats including edge cases. +func TestCompanionGC_WdNameFromRefKey_Patterns(t *testing.T) { + t.Parallel() + + tests := []struct { + key string + wantName string + wantErr bool + }{ + {"default/mount-pristine", "mount-pristine", false}, + {"my-project/my-wd", "my-wd", false}, + {"namespace/wd-with-dashes", "wd-with-dashes", false}, + {"missing-separator", "", true}, + {"ns/", "", true}, + {"", "", true}, + } + + for _, tc := range tests { + t.Run(tc.key, func(t *testing.T) { + name, err := wdNameFromRefKey(tc.key) + if tc.wantErr { + assert.Error(t, err, "expected error for key %q", tc.key) + } else { + assert.NoError(t, err) + assert.Equal(t, tc.wantName, name, "name mismatch for key %q", tc.key) + } + }) + } +} + +// TestCompanionGC_PeriodicSweepReconcilesCandidates is a white-box test that +// calls sweep() directly and verifies it drives reconciliation for stranded +// companions. +func TestCompanionGC_PeriodicSweepReconcilesCandidates(t *testing.T) { + t.Parallel() + + // Stranded companion: labeled, referenced-by points at an absent WD. + strangedCM := gcMakeCompanionCM("stranded-cm", gcRefKey("dead-wd")) + // Live companion: labeled, referenced-by points at an existing WD. + liveCM := gcMakeCompanionCM("live-cm", gcRefKey("live-wd")) + liveWD := gcMakeHubWD("live-wd") + + r, hubCl := newGCReconciler(strangedCM, liveCM, liveWD) + + sweep := &companionGCPeriodicSweep{ + hubClient: hubCl, + reconciler: r, + interval: companionGCSweepInterval, // unused in this test + } + sweep.sweep(context.Background()) + + // Stranded companion must be deleted. + var gotStranded corev1.ConfigMap + err := hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "stranded-cm"}, &gotStranded) + require.True(t, apierrors.IsNotFound(err), "stranded companion must be deleted by sweep") + + // Live companion must survive. + var gotLive corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "live-cm"}, &gotLive) + require.NoError(t, err, "live companion must survive the sweep") +} diff --git a/internal/controller/orphan_rb_controller_test.go b/internal/controller/orphan_rb_controller_test.go index ece60486..25302ee1 100644 --- a/internal/controller/orphan_rb_controller_test.go +++ b/internal/controller/orphan_rb_controller_test.go @@ -81,16 +81,16 @@ func reconcileRB(t *testing.T, r *OrphanRBReconciler, name string) (ctrl.Result, func TestOrphanRB_DeletesOrphanedConfigMapRB(t *testing.T) { t.Parallel() - rb := makeCompanionRB("cm-pristine-configmap") + rb := makeCompanionRB(gcCMPristineRBName) // No ConfigMap "cm-pristine" in the hub namespace — the RB is orphaned. r, hubCl := newOrphanRBReconciler(rb) - _, err := reconcileRB(t, r, "cm-pristine-configmap") + _, err := reconcileRB(t, r, gcCMPristineRBName) require.NoError(t, err) var got karmadaworkv1alpha2.ResourceBinding err = hubCl.Get(context.Background(), - types.NamespacedName{Namespace: orbTestNS, Name: "cm-pristine-configmap"}, &got) + types.NamespacedName{Namespace: orbTestNS, Name: gcCMPristineRBName}, &got) require.True(t, apierrors.IsNotFound(err), "orphaned companion RB must be deleted") } @@ -244,7 +244,7 @@ func TestOrphanRB_CompanionFromRBName_Patterns(t *testing.T) { wantName string wantKind string }{ - {"cm-pristine-configmap", true, "cm-pristine", kindConfigMap}, + {gcCMPristineRBName, true, "cm-pristine", kindConfigMap}, {"secret-foo-secret", true, "secret-foo", kindSecret}, {"a-b-c-secret", true, "a-b-c", kindSecret}, {"wd-foo-workloaddeployment", false, "", ""}, From 4dcf867fc7e8e66a6f64decc56eb03fc6f1b5c8e Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Tue, 2 Jun 2026 07:24:58 -0500 Subject: [PATCH 6/8] fix: remove WD watch predicate that wedged new WorkloadDeployments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #37 predicate (wdReferencedDataChangedPredicate) on the WorkloadDeployment For() watch dropped metadata-only Update events. A new WD's first reconcile adds the workload-controller finalizer and returns; that finalizer-add produces an Update with no generation bump and no ReferencedDataReady change, so the predicate filtered it and the WD never got a second reconcile — wedging every new WD at the finalizer stage until a controller restart. Remove the predicate entirely. The equality.Semantic.DeepEqual guard before Status().Update (added in the same #37 change) already prevents the self-trigger loop the predicate was meant to stop, and watching all updates lets new WDs proceed past the finalizer stage immediately. Co-Authored-By: Claude Opus 4.8 (1M context) (cherry picked from commit b94a40acddac2654674721f3f8dfa8acb231b1f0) --- .../workloaddeployment_controller.go | 88 +----- .../workloaddeployment_controller_test.go | 285 ------------------ 2 files changed, 11 insertions(+), 362 deletions(-) diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 2581eb50..4ab0cdba 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -18,11 +18,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" @@ -240,12 +238,10 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco apimeta.SetStatusCondition(&deployment.Status.Conditions, availCond) } - // Skip the write when the status is unchanged. Without this guard the - // reconciler's own Status().Update would always produce a new resourceVersion, - // firing another Update event on the WD and creating an infinite reconcile loop - // before the predicate on For() was added. The guard is a belt-and-suspenders - // complement to the predicate: the predicate prevents re-queuing on own writes, - // and this guard avoids the superfluous API call entirely. + // Skip the write when the status is unchanged. The unfiltered For() watch + // re-enqueues this deployment on every status write, so an unconditional + // Status().Update would re-trigger the reconciler on its own writes; this + // guard breaks that loop and avoids the superfluous API call entirely. if !equality.Semantic.DeepEqual(existingStatus, deployment.Status) { if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) @@ -315,68 +311,6 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( return currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, referencedDataBlockedReplicas, nil } -// wdReferencedDataChangedPredicate returns a predicate for the WorkloadDeployment -// For() watch that fires on: -// - Any Create, Delete, or Generic event (always enqueue). -// - An Update event where metadata.generation changed (spec updated), OR where -// the ReferencedDataReady condition's Status, Reason, or Message changed. -// -// The predicate intentionally does NOT fire when only the Available or -// ReplicasReady conditions change, because those are written by this reconciler -// itself. Without this guard the reconciler's own Status().Update would re-enqueue -// itself on every run, creating a tight reconcile loop. The equality check before -// Status().Update is a complementary guard, but the predicate is the primary -// protection: it prevents re-enqueuing entirely so the workqueue stays quiet between -// meaningful state transitions. -// -// Loop prevention: the ReferencedDataController (the only other writer of the -// ReferencedDataReady condition) is the intended trigger. When it sets -// ReferencedDataReady=False/SourceNotFound the predicate passes and this -// reconciler re-runs, sees the resolver verdict in deployment.Status.Conditions, and -// promotes Available to ReferencedDataNotReady. Subsequent runs by this reconciler -// (which write Available but not ReferencedDataReady) are filtered out. -func wdReferencedDataChangedPredicate() predicate.Predicate { - return predicate.Funcs{ - UpdateFunc: func(e event.UpdateEvent) bool { - oldWD, ok1 := e.ObjectOld.(*computev1alpha.WorkloadDeployment) - newWD, ok2 := e.ObjectNew.(*computev1alpha.WorkloadDeployment) - if !ok1 || !ok2 { - return true // be conservative when type assertion fails - } - // Spec change: always reconcile. - if oldWD.Generation != newWD.Generation { - return true - } - // ReferencedDataReady condition changed: reconcile so Available is - // updated to reflect the resolver's verdict. - return wdRefDataCondChanged( - apimeta.FindStatusCondition(oldWD.Status.Conditions, computev1alpha.ReferencedDataReady), - apimeta.FindStatusCondition(newWD.Status.Conditions, computev1alpha.ReferencedDataReady), - ) - }, - CreateFunc: func(_ event.CreateEvent) bool { return true }, - DeleteFunc: func(_ event.DeleteEvent) bool { return true }, - GenericFunc: func(_ event.GenericEvent) bool { return true }, - } -} - -// wdRefDataCondChanged returns true when the ReferencedDataReady condition's -// observable fields (Status, Reason, Message) differ between old and new. Presence -// changes (nil → non-nil or vice versa) are also treated as a change. The -// LastTransitionTime field is excluded because it changes on every status flip and -// would defeat the loop-prevention intent of wdReferencedDataChangedPredicate. -func wdRefDataCondChanged(old, new *metav1.Condition) bool { - if (old == nil) != (new == nil) { - return true // condition was added or removed - } - if old == nil { - return false // both nil — no change - } - return old.Status != new.Status || - old.Reason != new.Reason || - old.Message != new.Message -} - // selectWDBlockingCondition evaluates all blocking causes for a WorkloadDeployment // that has no ready replicas and returns the Available condition reflecting the // highest-priority blocker. All causes are evaluated before selecting the winner @@ -788,15 +722,15 @@ func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager, o } b := mcbuilder.ControllerManagedBy(mgr). - // The predicate gates re-enqueuing on meaningful WD changes: spec updates - // (generation bump) or a ReferencedDataReady condition change written by - // ReferencedDataController. Without it, each Status().Update by this - // reconciler (writing Available/ReplicasReady) would re-enqueue itself, - // creating a tight loop and delaying the ReferencedDataReady signal from - // the resolver. + // Watch all WorkloadDeployment events. The reconciler's own Status().Update + // cannot create a self-trigger loop because the equality.Semantic.DeepEqual + // guard skips the write when nothing changed, so no self-event is produced. + // We deliberately do NOT filter Update events with a predicate: an earlier + // predicate that only passed generation/ReferencedDataReady changes dropped + // metadata-only updates such as the initial finalizer-add, which wedged new + // WorkloadDeployments until a controller restart. For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false), - mcbuilder.WithPredicates(wdReferencedDataChangedPredicate()), ). Owns(&computev1alpha.Instance{}) diff --git a/internal/controller/workloaddeployment_controller_test.go b/internal/controller/workloaddeployment_controller_test.go index c2c3e13c..83bce465 100644 --- a/internal/controller/workloaddeployment_controller_test.go +++ b/internal/controller/workloaddeployment_controller_test.go @@ -13,7 +13,6 @@ import ( "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/finalizer" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" @@ -429,290 +428,6 @@ func TestWorkloadDeploymentReconcile_FinalizerAddRequeues(t *testing.T) { "no instances are quota-blocked, so ReplicasReady must be true") } -// ─── wdRefDataCondChanged tests ─────────────────────────────────────────────── - -// TestWdRefDataCondChanged_BothNil verifies that two nil conditions are treated -// as unchanged (no predicate trigger). -func TestWdRefDataCondChanged_BothNil(t *testing.T) { - assert.False(t, wdRefDataCondChanged(nil, nil), - "both nil conditions must not be treated as a change") -} - -// TestWdRefDataCondChanged_AddedCondition verifies that a nil→non-nil transition -// (condition first appears on the WD) is treated as a change. -func TestWdRefDataCondChanged_AddedCondition(t *testing.T) { - newCond := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - } - assert.True(t, wdRefDataCondChanged(nil, newCond), - "nil→non-nil must be treated as a change") -} - -// TestWdRefDataCondChanged_RemovedCondition verifies that a non-nil→nil transition -// (condition removed) is treated as a change. -func TestWdRefDataCondChanged_RemovedCondition(t *testing.T) { - oldCond := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionTrue, - Reason: computev1alpha.ReferencedDataReasonReady, - } - assert.True(t, wdRefDataCondChanged(oldCond, nil), - "non-nil→nil must be treated as a change") -} - -// TestWdRefDataCondChanged_StatusChange verifies that a change in the condition's -// Status field is detected. -func TestWdRefDataCondChanged_StatusChange(t *testing.T) { - old := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionUnknown, - Reason: computev1alpha.ReferencedDataReasonResolving, - } - new := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - } - assert.True(t, wdRefDataCondChanged(old, new), - "status change must be detected") -} - -// TestWdRefDataCondChanged_MessageChange verifies that a change in Message is -// detected, e.g. when the resolver updates the missing-object name. -func TestWdRefDataCondChanged_MessageChange(t *testing.T) { - old := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: `ConfigMap "a" not found in namespace "default"`, - } - new := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: `ConfigMap "b" not found in namespace "default"`, - } - assert.True(t, wdRefDataCondChanged(old, new), - "message change must be detected") -} - -// TestWdRefDataCondChanged_Identical verifies that an identical condition (same -// Status/Reason/Message, differing only in LastTransitionTime) is NOT treated as -// a change. This is the key no-self-trigger property: the reconciler's own -// Status().Update does not re-enqueue via the For() predicate. -func TestWdRefDataCondChanged_Identical(t *testing.T) { - t1 := metav1.Now() - t2 := metav1.Now() - old := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: testMsgConfigMapNotFound, - LastTransitionTime: t1, - } - new := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: testMsgConfigMapNotFound, - LastTransitionTime: t2, // different timestamp, same content - } - assert.False(t, wdRefDataCondChanged(old, new), - "identical conditions with differing LastTransitionTime must not be treated as a change") -} - -// ─── wdReferencedDataChangedPredicate tests ─────────────────────────────────── - -// makeWDPair builds two empty WorkloadDeployment objects (no conditions) for -// predicate Update tests. Callers mutate the returned objects before constructing -// the event.UpdateEvent. -func makeWDPair(gen int64) (old, new *computev1alpha.WorkloadDeployment) { - old = &computev1alpha.WorkloadDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: wdControllerTestName, - Namespace: wdControllerTestNS, - Generation: gen, - }, - } - new = &computev1alpha.WorkloadDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: wdControllerTestName, - Namespace: wdControllerTestNS, - Generation: gen, - }, - } - return old, new -} - -// TestWDPredicate_ReferencedDataReadyAdded verifies that the predicate passes -// when the resolver writes ReferencedDataReady=False for the first time. This is -// the primary gap-closure scenario: the WD had no ReferencedDataReady condition -// (reconciler wrote Available=InstancesProvisioning), the resolver adds -// ReferencedDataReady=False/SourceNotFound, and the predicate must fire so the -// reconciler re-runs and promotes Available to ReferencedDataNotReady. -func TestWDPredicate_ReferencedDataReadyAdded(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - - oldWD, newWD := makeWDPair(1) - apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: testMsgConfigMapNotFound, - LastTransitionTime: metav1.Now(), - }) - - e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} - assert.True(t, pred.Update(e), - "predicate must pass when ReferencedDataReady is added (nil→False/SourceNotFound)") -} - -// TestWDPredicate_ReferencedDataReadyCleared verifies that the predicate passes -// when the resolver sets ReferencedDataReady=True (source ConfigMap was created). -// The WD reconciler must re-run so Available can be promoted from -// ReferencedDataNotReady to StableInstanceFound (or InstancesProvisioning). -func TestWDPredicate_ReferencedDataReadyCleared(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - - oldWD, newWD := makeWDPair(1) - apimeta.SetStatusCondition(&oldWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: testMsgConfigMapNotFound, - LastTransitionTime: metav1.Now(), - }) - apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionTrue, - Reason: computev1alpha.ReferencedDataReasonReady, - Message: "All companions are ready", - LastTransitionTime: metav1.Now(), - }) - - e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} - assert.True(t, pred.Update(e), - "predicate must pass when ReferencedDataReady flips from False to True") -} - -// TestWDPredicate_GenerationChanged verifies that the predicate passes when the -// WD's generation changes (spec update), even if the ReferencedDataReady condition -// did not change. -func TestWDPredicate_GenerationChanged(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - - oldWD, newWD := makeWDPair(1) - newWD.Generation = 2 - - e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} - assert.True(t, pred.Update(e), - "predicate must pass when metadata.generation increases") -} - -// TestWDPredicate_AvailableOnlyChange verifies that the predicate DROPS updates -// where only the Available condition changed. This is the self-trigger prevention: -// after the WD reconciler writes Available=ReferencedDataNotReady, the predicate -// must not re-enqueue the same reconciler via the For() watch. -func TestWDPredicate_AvailableOnlyChange(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - - // Both old and new have the SAME ReferencedDataReady condition. - refDataCond := metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: testMsgConfigMapNotFound, - LastTransitionTime: metav1.Now(), - } - oldWD, newWD := makeWDPair(1) - apimeta.SetStatusCondition(&oldWD.Status.Conditions, refDataCond) - apimeta.SetStatusCondition(&newWD.Status.Conditions, refDataCond) - - // The WD reconciler wrote Available=InstancesProvisioning (old) and then - // Available=ReferencedDataNotReady (new). ReferencedDataReady is unchanged. - // NOTE (split): the named reason constants land in layer E; literals are used - // here because the predicate only cares that the Available reason changed. - apimeta.SetStatusCondition(&oldWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "InstancesProvisioning", - Message: "Instances are being provisioned", - }) - apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ReferencedDataNotReady", - Message: testMsgConfigMapNotFound, - }) - - e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} - assert.False(t, pred.Update(e), - "predicate must drop update when only Available changed; "+ - "ReferencedDataReady is unchanged so the reconciler's own write must not re-enqueue itself") -} - -// TestWDPredicate_ReplicasReadyOnlyChange verifies that the predicate DROPS updates -// where only the ReplicasReady condition changed (also written by this reconciler), -// for the same self-trigger prevention reason as the Available-only case. -func TestWDPredicate_ReplicasReadyOnlyChange(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - - refDataCond := metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionTrue, - Reason: computev1alpha.ReferencedDataReasonReady, - Message: "all ready", - LastTransitionTime: metav1.Now(), - } - oldWD, newWD := makeWDPair(2) - apimeta.SetStatusCondition(&oldWD.Status.Conditions, refDataCond) - apimeta.SetStatusCondition(&newWD.Status.Conditions, refDataCond) - - // ReplicasReady changed (more instances became ready) but ReferencedDataReady is identical. - apimeta.SetStatusCondition(&oldWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionFalse, - Reason: reasonReplicasAvailable, - Message: "0/2 replicas available", - }) - apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionTrue, - Reason: reasonReplicasAvailable, - Message: "2/2 replicas available", - }) - - e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} - assert.False(t, pred.Update(e), - "predicate must drop update when only ReplicasReady changed") -} - -// TestWDPredicate_CreateAlwaysPasses verifies that Create events always trigger -// the reconciler regardless of conditions. -func TestWDPredicate_CreateAlwaysPasses(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - e := event.CreateEvent{Object: &computev1alpha.WorkloadDeployment{}} - assert.True(t, pred.Create(e)) -} - -// TestWDPredicate_DeleteAlwaysPasses verifies that Delete events always trigger -// the reconciler. -func TestWDPredicate_DeleteAlwaysPasses(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - e := event.DeleteEvent{Object: &computev1alpha.WorkloadDeployment{}} - assert.True(t, pred.Delete(e)) -} - -// TestWDPredicate_GenericAlwaysPasses verifies that Generic events (e.g. from -// external sources) always trigger the reconciler. -func TestWDPredicate_GenericAlwaysPasses(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - e := event.GenericEvent{Object: &computev1alpha.WorkloadDeployment{}} - assert.True(t, pred.Generic(e)) -} - // makeWDForAvailTest constructs a WorkloadDeployment for Available-condition // unit tests, optionally stamping a ReferencedDataReady condition on status. func makeWDForAvailTest(generation int64, refDataStatus metav1.ConditionStatus, refDataReason, refDataMessage string) *computev1alpha.WorkloadDeployment { From 4fc79bdd040a8092dc08f62528619b5863d9c2fd Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Tue, 2 Jun 2026 06:12:58 -0500 Subject: [PATCH 7/8] feat(lab): enable ReferencedDataGate feature flag on cell overlay Activates the scheduling-gate + Instance-level Ready=False/SourceNotFound contract for the lab (datum-us-central-1-lab). The flag is set in the cell overlay's ConfigMap patch so it only affects deployments using the cell OCI bundle (lab); management-plane and dev overlays are unaffected. Co-Authored-By: Claude Sonnet 4.6 (cherry picked from commit 2cf963f6c2d75af8ffd9a5bf01c55296ec317d27) --- config/overlays/cell/disable_webhook_patch.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config/overlays/cell/disable_webhook_patch.yaml b/config/overlays/cell/disable_webhook_patch.yaml index 85b57f09..d1818bce 100644 --- a/config/overlays/cell/disable_webhook_patch.yaml +++ b/config/overlays/cell/disable_webhook_patch.yaml @@ -10,3 +10,5 @@ data: bindAddress: "0" discovery: quotaKubeconfigPath: /etc/quota-credentials/kubeconfig + featureFlags: + enableReferencedDataGate: true From f6267cc468be4f116939f19db0462b63189c8f7b Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 8 Jun 2026 13:53:12 -0500 Subject: [PATCH 8/8] refactor(controller): centralize the companion ResourceBinding name codec The "-configmap"/"-secret" suffix that maps a companion object to its Karmada ResourceBinding name was open-coded across the GC, orphan-sweep, and resolver paths. Add companionRBName as the inverse of companionFromRBName and share the suffix constants so the build and parse sides cannot drift. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../controller/companion_gc_controller.go | 4 ++-- internal/controller/orphan_rb_controller.go | 21 +++++++++++++------ .../controller/referenceddata_controller.go | 11 ++++++++-- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/internal/controller/companion_gc_controller.go b/internal/controller/companion_gc_controller.go index e0453fd8..4a7dbcb7 100644 --- a/internal/controller/companion_gc_controller.go +++ b/internal/controller/companion_gc_controller.go @@ -131,14 +131,14 @@ func (r *CompanionGCReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := writer.DeleteConfigMap(ctx, req.Namespace, req.Name); err != nil { return ctrl.Result{}, fmt.Errorf("companion-gc: delete ConfigMap %q: %w", req.Name, err) } - if err := writer.DeleteResourceBinding(ctx, req.Namespace, req.Name+"-configmap"); err != nil { + if err := writer.DeleteResourceBinding(ctx, req.Namespace, companionRBName(req.Name, kindConfigMap)); err != nil { return ctrl.Result{}, fmt.Errorf("companion-gc: delete ConfigMap RB %q: %w", req.Name, err) } case kindSecret: if err := writer.DeleteSecret(ctx, req.Namespace, req.Name); err != nil { return ctrl.Result{}, fmt.Errorf("companion-gc: delete Secret %q: %w", req.Name, err) } - if err := writer.DeleteResourceBinding(ctx, req.Namespace, req.Name+"-secret"); err != nil { + if err := writer.DeleteResourceBinding(ctx, req.Namespace, companionRBName(req.Name, kindSecret)); err != nil { return ctrl.Result{}, fmt.Errorf("companion-gc: delete Secret RB %q: %w", req.Name, err) } } diff --git a/internal/controller/orphan_rb_controller.go b/internal/controller/orphan_rb_controller.go index 3e34c06c..e75e2e47 100644 --- a/internal/controller/orphan_rb_controller.go +++ b/internal/controller/orphan_rb_controller.go @@ -127,7 +127,7 @@ func (r *OrphanRBReconciler) isInScope(rb *karmadaworkv1alpha2.ResourceBinding) return false } name := rb.Name - return strings.HasSuffix(name, "-configmap") || strings.HasSuffix(name, "-secret") + return strings.HasSuffix(name, rbSuffixConfigMap) || strings.HasSuffix(name, rbSuffixSecret) } // companionFromRBName extracts the companion object name and kind from a @@ -139,15 +139,24 @@ func (r *OrphanRBReconciler) isInScope(rb *karmadaworkv1alpha2.ResourceBinding) // "wd-foo-workloaddeployment" → ("", "", false) // not a companion RB func companionFromRBName(rbName string) (companionName, kind string, ok bool) { switch { - case strings.HasSuffix(rbName, "-configmap"): - return strings.TrimSuffix(rbName, "-configmap"), kindConfigMap, true - case strings.HasSuffix(rbName, "-secret"): - return strings.TrimSuffix(rbName, "-secret"), kindSecret, true + case strings.HasSuffix(rbName, rbSuffixConfigMap): + return strings.TrimSuffix(rbName, rbSuffixConfigMap), kindConfigMap, true + case strings.HasSuffix(rbName, rbSuffixSecret): + return strings.TrimSuffix(rbName, rbSuffixSecret), kindSecret, true default: return "", "", false } } +// companionRBName returns the Karmada ResourceBinding name for a companion object +// of the given kind. It is the inverse of companionFromRBName. +func companionRBName(objectName, kind string) string { + if kind == kindSecret { + return objectName + rbSuffixSecret + } + return objectName + rbSuffixConfigMap +} + // isOrphaned returns true when the hub companion is fully absent. A companion // with a deletionTimestamp is considered NOT orphaned — the deletion is in // progress and Karmada's cascade (or Component 3's explicit RB delete) is @@ -196,7 +205,7 @@ func (r *OrphanRBReconciler) SetupWithManager(mgr manager.Manager) error { return false } name := obj.GetName() - return strings.HasSuffix(name, "-configmap") || strings.HasSuffix(name, "-secret") + return strings.HasSuffix(name, rbSuffixConfigMap) || strings.HasSuffix(name, rbSuffixSecret) })), ). Named("orphan-rb"). diff --git a/internal/controller/referenceddata_controller.go b/internal/controller/referenceddata_controller.go index 1bb445b9..6f6ff73c 100644 --- a/internal/controller/referenceddata_controller.go +++ b/internal/controller/referenceddata_controller.go @@ -63,6 +63,13 @@ const ( // referenceddata.ObjectRef to avoid repeated string literals. kindConfigMap = "ConfigMap" kindSecret = "Secret" + + // rbSuffixConfigMap and rbSuffixSecret are the companion ResourceBinding name + // suffixes. Karmada names namespace-scoped RBs "{objectName}-{kindLowercase}"; + // companionRBName builds these and companionFromRBName parses them, so the wire + // format has one source of truth. + rbSuffixConfigMap = "-configmap" + rbSuffixSecret = "-secret" ) // companionWriter is the abstraction that the controller uses to materialise @@ -1162,7 +1169,7 @@ func (r *ReferencedDataController) releaseOneCompanion( // "{companionName}-configmap". IgnoreNotFound because Karmada may have // already cascaded the deletion. if cmDeleted { - if err := writer.DeleteResourceBinding(ctx, namespace, companionName+"-configmap"); err != nil { + if err := writer.DeleteResourceBinding(ctx, namespace, companionRBName(companionName, kindConfigMap)); err != nil { return fmt.Errorf("referenceddata: delete ResourceBinding for ConfigMap %q: %w", companionName, err) } } @@ -1205,7 +1212,7 @@ func (r *ReferencedDataController) releaseOneCompanion( // The RB name follows the Karmada binding-controller convention: // "{companionName}-secret". if secretDeleted { - if err := writer.DeleteResourceBinding(ctx, namespace, companionName+"-secret"); err != nil { + if err := writer.DeleteResourceBinding(ctx, namespace, companionRBName(companionName, kindSecret)); err != nil { return fmt.Errorf("referenceddata: delete ResourceBinding for Secret %q: %w", companionName, err) } }