diff --git a/cmd/main.go b/cmd/main.go index 64240f01..ba7097e5 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -12,8 +12,10 @@ import ( // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. "golang.org/x/sync/errgroup" + corev1 "k8s.io/api/core/v1" _ "k8s.io/client-go/plugin/pkg/client/auth" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -21,6 +23,7 @@ import ( "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" @@ -34,6 +37,7 @@ import ( karmadaclusterv1alpha1 "github.com/karmada-io/api/cluster/v1alpha1" karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/config" "go.datum.net/compute/internal/controller" @@ -79,6 +83,7 @@ func init() { utilruntime.Must(quotav1alpha1.AddToScheme(scheme)) utilruntime.Must(karmadapolicyv1alpha1.Install(scheme)) utilruntime.Must(karmadaclusterv1alpha1.Install(scheme)) + utilruntime.Must(karmadaworkv1alpha2.Install(scheme)) // +kubebuilder:scaffold:scheme } @@ -479,14 +484,36 @@ func ignoreCanceled(err error) error { // InstanceProjector). Called only when management controllers are enabled and // a federation REST config is available. func setupManagementControllers(mgr mcmanager.Manager, federationClient client.Client) ([]manager.Runnable, error) { + // companionLabelSelector scopes the federation manager's ConfigMap and + // Secret informer cache to referenced-data companions only. Without this, + // For(&corev1.ConfigMap{}) in CompanionGCReconciler would establish a + // cluster-wide ConfigMap+Secret informer that caches every object on the + // Karmada hub — the same OOM pattern that killed the cell CompanionGCReconciler. + // The label CACHE scope (not the predicate) is the correct OOM guard: + // predicates filter events, not cache contents. + companionLabelSelector := labels.SelectorFromSet(labels.Set{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }) + // The federation manager provides a cached, watchable handle to the Karmada - // federation control plane. It backs the InstanceProjector's Instance watch - // and the WorkloadDeploymentFederator's downstream WorkloadDeployment status - // watch. A manager.Manager embeds a cluster.Cluster, so it can be passed - // directly anywhere a watchable federation cluster source is required. + // federation control plane. It backs the InstanceProjector's Instance watch, + // the WorkloadDeploymentFederator's downstream WorkloadDeployment status watch, + // and the CompanionGCReconciler. A manager.Manager embeds a cluster.Cluster, so + // it can be passed directly anywhere a watchable federation cluster source is + // required. federationMgr, err := manager.New(federationRestConfig, manager.Options{ Scheme: scheme, Metrics: metricsserver.Options{BindAddress: "0"}, + Cache: cache.Options{ + // Scope ConfigMap and Secret informers to referenced-data companions. + // CompanionGCReconciler is the only consumer on federationMgr that + // reads these types; nothing else (InstanceProjector, OrphanRBReconciler) + // needs non-companion CMs or Secrets from the cache. + ByObject: map[client.Object]cache.ByObject{ + &corev1.ConfigMap{}: {Label: companionLabelSelector}, + &corev1.Secret{}: {Label: companionLabelSelector}, + }, + }, }) if err != nil { return nil, fmt.Errorf("federation manager: %w", err) @@ -514,5 +541,24 @@ func setupManagementControllers(mgr mcmanager.Manager, federationClient client.C return nil, fmt.Errorf("InstanceProjector: %w", err) } + // OrphanRBReconciler sweeps Karmada ResourceBindings whose hub companion is + // gone, ensuring Works and cell copies are cleaned up even when Karmada's + // event-driven cascade misses the companion-deletion event (e.g. PP deleted + // before binding-controller reconcile completed). Runs on the hub federation + // manager alongside InstanceProjector. + if err = controller.SetupOrphanRBWithManager(federationMgr, federationClient); err != nil { + return nil, fmt.Errorf("OrphanRBReconciler: %w", err) + } + + // CompanionGCReconciler is a level-triggered backstop for stranded hub + // companions: labeled ConfigMaps/Secrets whose referenced-by annotation + // points at WDs that no longer exist on the hub. On each reconcile it + // checks all referrer WDs in the hub namespace; if all are absent the + // companion and its ResourceBinding are deleted, driving the Karmada + // cascade to clean up Works and cell copies permanently. + if err = controller.SetupCompanionGCWithManager(federationMgr, federationClient); err != nil { + return nil, fmt.Errorf("CompanionGCReconciler: %w", err) + } + return []manager.Runnable{federationMgr}, nil } diff --git a/config/base/downstream-rbac/rbac.yaml b/config/base/downstream-rbac/rbac.yaml index f2a94a37..b4d0344d 100644 --- a/config/base/downstream-rbac/rbac.yaml +++ b/config/base/downstream-rbac/rbac.yaml @@ -24,7 +24,7 @@ rules: verbs: ["get", "list", "watch"] - apiGroups: ["work.karmada.io"] resources: ["resourcebindings", "clusterresourcebindings"] - verbs: ["get", "list", "watch"] + verbs: ["get", "list", "watch", "delete"] - apiGroups: ["config.karmada.io"] resources: ["resourceinterpreterwebhookconfigurations", "resourceinterpretercustomizations"] verbs: ["get", "list", "watch"] diff --git a/config/overlays/cell/disable_webhook_patch.yaml b/config/overlays/cell/disable_webhook_patch.yaml index 85b57f09..d1818bce 100644 --- a/config/overlays/cell/disable_webhook_patch.yaml +++ b/config/overlays/cell/disable_webhook_patch.yaml @@ -10,3 +10,5 @@ data: bindAddress: "0" discovery: quotaKubeconfigPath: /etc/quota-credentials/kubeconfig + featureFlags: + enableReferencedDataGate: true diff --git a/internal/controller/companion_gc_controller.go b/internal/controller/companion_gc_controller.go new file mode 100644 index 00000000..4a7dbcb7 --- /dev/null +++ b/internal/controller/companion_gc_controller.go @@ -0,0 +1,404 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +const ( + // companionGCSweepInterval is how often the periodic backstop fires to catch + // companions stranded before the controller started. + companionGCSweepInterval = 5 * time.Minute +) + +// CompanionGCReconciler is a hub-side level-triggered garbage collector for +// referenced-data companion ConfigMaps and Secrets. It is the backstop for the +// interrupted-finalization failure mode: if the ReferencedDataController's WD +// finalizer is interrupted mid-flight (e.g. pod restart), a companion can be +// left with a referenced-by annotation pointing at WDs that no longer exist. +// +// On each reconcile, the controller: +// 1. Reads the companion's referenced-by annotation (JSON array of "ns/name" WD keys). +// 2. Checks each referenced WD for existence in the hub namespace via the +// uncached HubClient. WDs live in the same hub namespace as the companion +// (ns-{project-uid}), federated there by the WorkloadDeploymentFederator. +// The WD name is parsed from the "ns/name" key (the name portion only — +// the namespace in the key is the project namespace, not the hub namespace). +// 3. If ALL referrers are absent in the hub, the companion is stranded → +// deletes it and its Karmada ResourceBinding via the downstreamCompanionWriter +// path. Deleting the RB triggers Karmada's binding-controller to remove the +// Work, which drives the execution-controller to remove the cell copy permanently. +// +// Safety invariant: if ANY referrer WD exists in the hub (including terminating), +// the companion is preserved. Terminating WDs count as present because the +// ReferencedDataController's finalizer may still complete teardown. +// +// Watch scope / OOM guard: the federationMgr cache for ConfigMaps and Secrets +// is restricted to objects carrying the ReferencedDataLabel via +// cache.Options.ByObject in cmd/main.go (setupManagementControllers). This is +// the actual OOM guard — predicates filter events, not cache contents; without +// the cache-level label scope the informer would list-and-watch every +// ConfigMap/Secret on the Karmada hub (the same unscoped-informer pattern that +// OOMKilled the cell CompanionGCReconciler). The label predicate on this +// controller is kept as belt-and-suspenders against predicate bypass but is NOT +// the primary memory guard. +// +// WD existence check: uses HubClient.Get (cache-backed). The federation manager +// cache has no ByObject restriction on WorkloadDeployments, so the WD cache +// covers all WDs in ns-{project-uid} namespaces. The check is purely for WD +// absence; normal cache lag cannot produce a false-orphan decision because a +// recently-deleted WD that is still in the cache is treated as present +// (conservative) and will be re-checked on the next reconcile or sweep tick. +type CompanionGCReconciler struct { + // HubClient is a client pointed at the Karmada hub API server. Used to + // read companions, check WD existence, delete companions, and delete RBs. + HubClient client.Client +} + +// Reconcile is triggered for each labeled companion (ConfigMap or Secret) that +// passes the predicate filter. It checks whether the companion is stranded and +// deletes it (plus its RB) if all referrers are absent from the hub. +func (r *CompanionGCReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + // Determine whether this is a ConfigMap or Secret by probing both. + // The controller watches both resource types; req carries no kind info. + companionKind, err := r.resolveKind(ctx, req.NamespacedName) + if err != nil { + return ctrl.Result{}, err + } + if companionKind == "" { + // Object is already gone — nothing to do. + return ctrl.Result{}, nil + } + + // Belt-and-suspenders: re-verify the referenced-data label is present even + // when Reconcile is called directly (bypassing the predicate). This prevents + // any non-companion from being touched regardless of annotation content. + if !r.isCompanion(ctx, req.NamespacedName, companionKind) { + return ctrl.Result{}, nil + } + + refKeys, skip, err := r.readRefCountKeys(ctx, req.NamespacedName, companionKind) + if err != nil { + return ctrl.Result{}, err + } + if skip { + // Absent annotation, empty annotation, or corrupt annotation — not a + // managed companion or conservatively skip. + return ctrl.Result{}, nil + } + + allGone, err := r.allReferrersAbsent(ctx, req.Namespace, refKeys) + if err != nil { + return ctrl.Result{}, err + } + if !allGone { + return ctrl.Result{}, nil + } + + // All referrer WDs are absent from the hub — companion is stranded. + logger.Info("deleting stranded companion", + "namespace", req.Namespace, + "name", req.Name, + "kind", companionKind, + ) + + writer := &downstreamCompanionWriter{ + hubClient: r.HubClient, + downstreamNamespace: req.Namespace, + } + + switch companionKind { + case kindConfigMap: + if err := writer.DeleteConfigMap(ctx, req.Namespace, req.Name); err != nil { + return ctrl.Result{}, fmt.Errorf("companion-gc: delete ConfigMap %q: %w", req.Name, err) + } + if err := writer.DeleteResourceBinding(ctx, req.Namespace, companionRBName(req.Name, kindConfigMap)); err != nil { + return ctrl.Result{}, fmt.Errorf("companion-gc: delete ConfigMap RB %q: %w", req.Name, err) + } + case kindSecret: + if err := writer.DeleteSecret(ctx, req.Namespace, req.Name); err != nil { + return ctrl.Result{}, fmt.Errorf("companion-gc: delete Secret %q: %w", req.Name, err) + } + if err := writer.DeleteResourceBinding(ctx, req.Namespace, companionRBName(req.Name, kindSecret)); err != nil { + return ctrl.Result{}, fmt.Errorf("companion-gc: delete Secret RB %q: %w", req.Name, err) + } + } + + return ctrl.Result{}, nil +} + +// isCompanion returns true when the object at nsn/kind carries the +// referenced-data label. Used as a belt-and-suspenders guard inside Reconcile +// to prevent touching objects that bypassed the predicate filter. +func (r *CompanionGCReconciler) isCompanion(ctx context.Context, nsn types.NamespacedName, kind string) bool { + switch kind { + case kindConfigMap: + var cm corev1.ConfigMap + if err := r.HubClient.Get(ctx, nsn, &cm); err != nil { + return false + } + return cm.Labels[computev1alpha.ReferencedDataLabel] == computev1alpha.ReferencedDataLabelValue + case kindSecret: + var s corev1.Secret + if err := r.HubClient.Get(ctx, nsn, &s); err != nil { + return false + } + return s.Labels[computev1alpha.ReferencedDataLabel] == computev1alpha.ReferencedDataLabelValue + } + return false +} + +// resolveKind returns kindConfigMap or kindSecret depending on which resource +// exists at the given namespace/name. Returns ("", nil) when neither exists. +func (r *CompanionGCReconciler) resolveKind(ctx context.Context, nsn types.NamespacedName) (string, error) { + var cm corev1.ConfigMap + switch err := r.HubClient.Get(ctx, nsn, &cm); { + case err == nil: + return kindConfigMap, nil + case !apierrors.IsNotFound(err): + return "", err + } + + var s corev1.Secret + switch err := r.HubClient.Get(ctx, nsn, &s); { + case err == nil: + return kindSecret, nil + case !apierrors.IsNotFound(err): + return "", err + } + + return "", nil +} + +// readRefCountKeys returns the WD key slice from the companion's referenced-by +// annotation. +// +// Returns (nil, true, nil) — skip=true — in the following cases: +// - Annotation is absent or empty: the object was not created by +// ReferencedDataController (or was created before the annotation existed). +// - Annotation is present but unparseable: corrupt state, conservatively +// preserve the companion to avoid incorrectly deleting a still-in-use object. +// +// Returns (keys, false, nil) — skip=false — when the annotation is valid. +func (r *CompanionGCReconciler) readRefCountKeys( + ctx context.Context, + nsn types.NamespacedName, + kind string, +) (keys []string, skip bool, err error) { + var annotations map[string]string + + switch kind { + case kindConfigMap: + var cm corev1.ConfigMap + if err := r.HubClient.Get(ctx, nsn, &cm); err != nil { + if apierrors.IsNotFound(err) { + return nil, true, nil + } + return nil, false, err + } + annotations = cm.Annotations + case kindSecret: + var s corev1.Secret + if err := r.HubClient.Get(ctx, nsn, &s); err != nil { + if apierrors.IsNotFound(err) { + return nil, true, nil + } + return nil, false, err + } + annotations = s.Annotations + } + + // decodeRefCount returns (nil, nil) for absent/empty annotation and + // (nil, err) for corrupt annotation — both map to skip=true. + decoded, decodeErr := decodeRefCount(annotations) + if decodeErr != nil { + log.FromContext(ctx).Info("companion has unparseable referenced-by annotation; preserving", + "namespace", nsn.Namespace, + "name", nsn.Name, + "kind", kind, + ) + return nil, true, nil + } + if decoded == nil { + // Absent or empty annotation — not a managed companion, skip. + return nil, true, nil + } + return decoded, false, nil +} + +// allReferrersAbsent returns true when every WD key in refKeys refers to a WD +// that no longer exists in the hub namespace hubNS. WDs live in the same hub +// namespace as the companion (ns-{project-uid}) after being federated by the +// WorkloadDeploymentFederator. +// +// The WD key format is "projectNamespace/wdName". Only the wdName portion is +// used for the hub lookup — the hub namespace (hubNS) is already known from the +// companion's own namespace. +// +// A WD that exists in the hub — even with a deletionTimestamp — is considered +// present. Terminating referrers still have their finalizers running and may +// complete normal companion teardown; we do not interfere. +// +// An empty refKeys slice means no referrers were recorded — treat as stranded. +func (r *CompanionGCReconciler) allReferrersAbsent(ctx context.Context, hubNS string, refKeys []string) (bool, error) { + if len(refKeys) == 0 { + return true, nil + } + + for _, key := range refKeys { + wdName, err := wdNameFromRefKey(key) + if err != nil { + // Malformed key — treat conservatively: companion is NOT orphaned. + log.FromContext(ctx).Info("companion has malformed WD key in referenced-by; preserving", + "key", key, + ) + return false, nil + } + + nsn := types.NamespacedName{Namespace: hubNS, Name: wdName} + var wd computev1alpha.WorkloadDeployment + switch err := r.HubClient.Get(ctx, nsn, &wd); { + case err == nil: + // WD exists in hub (possibly terminating) — live referrer present. + return false, nil + case apierrors.IsNotFound(err): + // This referrer is absent from the hub; check remaining keys. + continue + default: + return false, fmt.Errorf("companion-gc: check hub WD %q existence: %w", key, err) + } + } + return true, nil +} + +// wdNameFromRefKey extracts the WD name from a "projectNamespace/wdName" key. +// The namespace portion is discarded — the hub namespace is taken from the +// companion's own namespace, which is passed separately to allReferrersAbsent. +func wdNameFromRefKey(key string) (string, error) { + idx := strings.Index(key, "/") + if idx < 0 { + return "", fmt.Errorf("malformed WD key %q: missing '/' separator", key) + } + name := key[idx+1:] + if name == "" { + return "", fmt.Errorf("malformed WD key %q: empty name after '/'", key) + } + return name, nil +} + +// SetupWithManager registers the CompanionGCReconciler with a regular +// ctrl.Manager pointed at the Karmada hub. Both ConfigMap and Secret watches +// are label-scoped so no cluster-wide informer is created. +func (r *CompanionGCReconciler) SetupWithManager(mgr manager.Manager) error { + labelPredicate := predicate.NewPredicateFuncs(func(obj client.Object) bool { + return obj.GetLabels()[computev1alpha.ReferencedDataLabel] == computev1alpha.ReferencedDataLabelValue + }) + + return ctrl.NewControllerManagedBy(mgr). + For(&corev1.ConfigMap{}, builder.WithPredicates(labelPredicate)). + Watches( + &corev1.Secret{}, + &handler.EnqueueRequestForObject{}, + builder.WithPredicates(labelPredicate), + ). + Named("companion-gc"). + Complete(r) +} + +// companionGCPeriodicSweep is a manager.Runnable that periodically lists all +// labeled companion ConfigMaps and Secrets on the hub and calls the reconciler +// directly for each. This backstop covers companions that became stranded before +// the controller started, when no new object events will fire. +type companionGCPeriodicSweep struct { + hubClient client.Client + reconciler *CompanionGCReconciler + interval time.Duration +} + +// Start runs the periodic sweep loop until ctx is cancelled. +func (s *companionGCPeriodicSweep) Start(ctx context.Context) error { + ticker := time.NewTicker(s.interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + s.sweep(ctx) + } + } +} + +// sweep lists all labeled companion ConfigMaps and Secrets on the hub and +// reconciles each. This catches companions stranded before the controller started. +func (s *companionGCPeriodicSweep) sweep(ctx context.Context) { + logger := log.FromContext(ctx).WithValues("component", "companion-gc-periodic-sweep") + + labelSel := client.MatchingLabels{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + } + + var cmList corev1.ConfigMapList + if err := s.hubClient.List(ctx, &cmList, labelSel); err != nil { + logger.Error(err, "periodic sweep: list companion ConfigMaps failed") + } else { + for i := range cmList.Items { + cm := &cmList.Items[i] + if _, err := s.reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: cm.Namespace, Name: cm.Name}, + }); err != nil { + logger.Error(err, "periodic sweep: reconcile ConfigMap companion failed", + "namespace", cm.Namespace, "name", cm.Name) + } + } + } + + var sList corev1.SecretList + if err := s.hubClient.List(ctx, &sList, labelSel); err != nil { + logger.Error(err, "periodic sweep: list companion Secrets failed") + } else { + for i := range sList.Items { + sec := &sList.Items[i] + if _, err := s.reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: sec.Namespace, Name: sec.Name}, + }); err != nil { + logger.Error(err, "periodic sweep: reconcile Secret companion failed", + "namespace", sec.Namespace, "name", sec.Name) + } + } + } +} + +// SetupCompanionGCWithManager wires the CompanionGCReconciler and its periodic +// sweep backstop onto the provided ctrl.Manager pointed at the Karmada hub. +// Called from setupManagementControllers in cmd/main.go. +func SetupCompanionGCWithManager(mgr manager.Manager, hubClient client.Client) error { + r := &CompanionGCReconciler{HubClient: hubClient} + if err := r.SetupWithManager(mgr); err != nil { + return err + } + return mgr.Add(&companionGCPeriodicSweep{ + hubClient: hubClient, + reconciler: r, + interval: companionGCSweepInterval, + }) +} diff --git a/internal/controller/companion_gc_controller_test.go b/internal/controller/companion_gc_controller_test.go new file mode 100644 index 00000000..90d850da --- /dev/null +++ b/internal/controller/companion_gc_controller_test.go @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Test constants ──────────────────────────────────────────────────────────── + +const ( + // gcHubNS is the hub namespace (ns-{project-uid}) used across companion GC tests. + gcHubNS = "ns-efdf8ca1-9c2d-4ac8-b161-1951503a2879" + + // gcProjectNS is the project-plane namespace whose WDs are recorded in the + // referenced-by annotation. The annotation key format is "projectNS/wdName". + gcProjectNS = "default" + + // gcWD1Name is the WD name used in single-referrer tests. + gcWD1Name = "mount-pristine" + + // gcWD2Name is the WD name used in multi-referrer tests. + gcWD2Name = "mount-alternate" + + // gcCMPristineRBName is the Karmada ResourceBinding name for the cm-pristine + // companion — used in both companion-GC and orphan-RB test assertions. + gcCMPristineRBName = "cm-pristine-configmap" +) + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +// gcRefKey builds the "projectNS/wdName" annotation entry used in production. +// Using a real-looking key exercises the annotation-parsing path correctly. +func gcRefKey(wdName string) string { + return gcProjectNS + "/" + wdName +} + +// gcEncodeRefs serialises WD keys to the JSON annotation value that +// ReferencedDataController writes on production companions. +func gcEncodeRefs(keys ...string) string { + b, _ := json.Marshal(keys) + return string(b) +} + +// newGCReconciler creates a CompanionGCReconciler backed by a fake hub client +// containing the supplied objects. +func newGCReconciler(objs ...client.Object) (*CompanionGCReconciler, client.Client) { + hubCl := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() + return &CompanionGCReconciler{HubClient: hubCl}, hubCl +} + +// gcMakeCompanionCM builds a ConfigMap that mirrors a production hub companion: +// carries the referenced-data label and a referenced-by annotation whose value +// is the JSON-encoded list of WD keys. +func gcMakeCompanionCM(name string, wdKeys ...string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: name, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + Annotations: map[string]string{ + companionRefCountAnnotation: gcEncodeRefs(wdKeys...), + }, + }, + Data: map[string]string{"key": "value"}, + } +} + +// gcMakeCompanionSecret builds a Secret that mirrors a production hub companion. +func gcMakeCompanionSecret(name string, wdKeys ...string) *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: name, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + Annotations: map[string]string{ + companionRefCountAnnotation: gcEncodeRefs(wdKeys...), + }, + }, + Data: map[string][]byte{"key": []byte("value")}, + } +} + +// gcMakeHubWD builds a WorkloadDeployment that mirrors a production hub WD: +// lives in the hub namespace (ns-{project-uid}), not the project namespace. +func gcMakeHubWD(name string) *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: name, + }, + } +} + +// gcMakeTerminatingHubWD builds a hub WD that is terminating (deletionTimestamp set). +func gcMakeTerminatingHubWD(name string) *computev1alpha.WorkloadDeployment { + now := metav1.Now() + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: name, + DeletionTimestamp: &now, + Finalizers: []string{"compute.datumapis.com/test"}, + }, + } +} + +// gcMakeCompanionRB builds a ResourceBinding for a companion — used to assert +// the RB teardown path. The naming follows Karmada's convention: {name}-{kind}. +func gcMakeCompanionRB(companionName, kindSuffix string) *karmadaworkv1alpha2.ResourceBinding { + return &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: companionName + "-" + kindSuffix, + Annotations: map[string]string{ + ppNameAnnotationKey: "city-dfw", + }, + }, + } +} + +// reconcileGC runs Reconcile for the named object in gcHubNS. +func reconcileGC(t *testing.T, r *CompanionGCReconciler, name string) (ctrl.Result, error) { + t.Helper() + return r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: gcHubNS, Name: name}, + }) +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +// TestCompanionGC_DeletesOrphanedConfigMap asserts that a ConfigMap companion +// whose sole referrer WD is absent from the hub is deleted along with its RB. +func TestCompanionGC_DeletesOrphanedConfigMap(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-pristine", gcRefKey(gcWD1Name)) + rb := gcMakeCompanionRB("cm-pristine", "configmap") + // No hub WD "mount-pristine" — the referrer is absent. + r, hubCl := newGCReconciler(cm, rb) + + _, err := reconcileGC(t, r, "cm-pristine") + require.NoError(t, err) + + // Companion ConfigMap must be deleted. + var gotCM corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-pristine"}, &gotCM) + require.True(t, apierrors.IsNotFound(err), "orphaned ConfigMap companion must be deleted") + + // Associated ResourceBinding must be deleted. + var gotRB karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: gcCMPristineRBName}, &gotRB) + require.True(t, apierrors.IsNotFound(err), "orphaned ConfigMap RB must be deleted") +} + +// TestCompanionGC_DeletesOrphanedSecret asserts the same for Secret companions. +func TestCompanionGC_DeletesOrphanedSecret(t *testing.T) { + t.Parallel() + + secret := gcMakeCompanionSecret("secret-pristine", gcRefKey(gcWD1Name)) + rb := gcMakeCompanionRB("secret-pristine", "secret") + r, hubCl := newGCReconciler(secret, rb) + + _, err := reconcileGC(t, r, "secret-pristine") + require.NoError(t, err) + + var gotSecret corev1.Secret + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "secret-pristine"}, &gotSecret) + require.True(t, apierrors.IsNotFound(err), "orphaned Secret companion must be deleted") + + var gotRB karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "secret-pristine-secret"}, &gotRB) + require.True(t, apierrors.IsNotFound(err), "orphaned Secret RB must be deleted") +} + +// TestCompanionGC_PreservesCompanionWithLiveReferrer asserts that a companion +// whose referrer WD still exists in the hub is NOT deleted. +func TestCompanionGC_PreservesCompanionWithLiveReferrer(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-live", gcRefKey(gcWD1Name)) + liveWD := gcMakeHubWD(gcWD1Name) // WD exists in hub namespace + r, hubCl := newGCReconciler(cm, liveWD) + + _, err := reconcileGC(t, r, "cm-live") + require.NoError(t, err) + + // Companion must still exist. + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-live"}, &got) + require.NoError(t, err, "companion with live referrer must NOT be deleted") +} + +// TestCompanionGC_TerminatingReferrerCountsAsPresent asserts that a companion +// whose referrer WD is terminating (DeletionTimestamp set) is preserved. +// The ReferencedDataController finalizer on the terminating WD may still complete +// teardown; the GC must not race with it. +func TestCompanionGC_TerminatingReferrerCountsAsPresent(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-term", gcRefKey(gcWD1Name)) + terminatingWD := gcMakeTerminatingHubWD(gcWD1Name) + r, hubCl := newGCReconciler(cm, terminatingWD) + + _, err := reconcileGC(t, r, "cm-term") + require.NoError(t, err) + + // Companion must still exist. + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-term"}, &got) + require.NoError(t, err, "companion with terminating referrer must NOT be deleted") +} + +// TestCompanionGC_MultiReferrer_AllAbsent asserts that a companion shared by +// two WDs is deleted when BOTH referrers are absent from the hub. +func TestCompanionGC_MultiReferrer_AllAbsent(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-shared", + gcRefKey(gcWD1Name), + gcRefKey(gcWD2Name), + ) + // Neither WD exists on the hub. + r, hubCl := newGCReconciler(cm) + + _, err := reconcileGC(t, r, "cm-shared") + require.NoError(t, err) + + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-shared"}, &got) + require.True(t, apierrors.IsNotFound(err), "companion with all referrers absent must be deleted") +} + +// TestCompanionGC_MultiReferrer_OneRemains asserts that a companion shared by +// two WDs is preserved when one referrer is still present on the hub. +func TestCompanionGC_MultiReferrer_OneRemains(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-partial", + gcRefKey(gcWD1Name), + gcRefKey(gcWD2Name), + ) + // Only one of the two WDs is gone; gcWD2Name still exists. + liveWD := gcMakeHubWD(gcWD2Name) + r, hubCl := newGCReconciler(cm, liveWD) + + _, err := reconcileGC(t, r, "cm-partial") + require.NoError(t, err) + + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-partial"}, &got) + require.NoError(t, err, "companion with at least one live referrer must NOT be deleted") +} + +// TestCompanionGC_CorruptAnnotationPreservesCompanion asserts that a companion +// with a corrupt (unparseable) referenced-by annotation is NOT deleted. +// Corrupt state is treated conservatively: other referrers may exist that cannot +// be parsed; we must not silently drop companions. +func TestCompanionGC_CorruptAnnotationPreservesCompanion(t *testing.T) { + t.Parallel() + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: "cm-corrupt", + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + Annotations: map[string]string{ + // Not valid JSON — simulates corruption (e.g. truncated write). + companionRefCountAnnotation: `["default/wd-1","default/wd-2`, + }, + }, + } + r, hubCl := newGCReconciler(cm) + + _, err := reconcileGC(t, r, "cm-corrupt") + require.NoError(t, err, "corrupt annotation must not cause an error (only skip)") + + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-corrupt"}, &got) + require.NoError(t, err, "companion with corrupt annotation must NOT be deleted") +} + +// TestCompanionGC_EmptyAnnotationPreservesCompanion asserts that a companion +// with no referenced-by annotation is not deleted. Such companions were not +// created by the ReferencedDataController (or predate the annotation). +func TestCompanionGC_EmptyAnnotationPreservesCompanion(t *testing.T) { + t.Parallel() + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: "cm-no-anno", + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + // No referenced-by annotation at all. + }, + } + r, hubCl := newGCReconciler(cm) + + _, err := reconcileGC(t, r, "cm-no-anno") + require.NoError(t, err) + + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-no-anno"}, &got) + require.NoError(t, err, "companion without referenced-by annotation must NOT be deleted") +} + +// TestCompanionGC_UnlabeledObjectUnaffected asserts that an unlabeled ConfigMap +// (not a companion) is never touched even if its name matches a companion pattern. +func TestCompanionGC_UnlabeledObjectUnaffected(t *testing.T) { + t.Parallel() + + // ConfigMap without the referenced-data label — not a companion. + unlabeled := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: gcHubNS, + Name: "not-a-companion", + Annotations: map[string]string{ + // Even if it has the annotation, the GC must not touch it without the label. + companionRefCountAnnotation: "[]", + }, + }, + } + r, hubCl := newGCReconciler(unlabeled) + + // The GC is triggered via the predicate; force reconcile directly. + _, err := reconcileGC(t, r, "not-a-companion") + require.NoError(t, err) + + // Object must still exist — the GC skips non-companions. + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "not-a-companion"}, &got) + require.NoError(t, err, "non-companion object must NOT be touched by GC") +} + +// TestCompanionGC_ObjectAlreadyGone asserts that the reconciler does not error +// when the companion is already gone at reconcile time. +func TestCompanionGC_ObjectAlreadyGone(t *testing.T) { + t.Parallel() + + // Do not seed any object. + r, _ := newGCReconciler() + + _, err := reconcileGC(t, r, "already-gone") + require.NoError(t, err, "reconciler must tolerate NotFound on the companion itself") +} + +// TestCompanionGC_RBAlreadyGoneIsToleratedOnOrphanDelete asserts that if the +// companion is orphaned but its RB is already gone (Karmada cascade beat us), +// the GC does not error. +func TestCompanionGC_RBAlreadyGoneIsToleratedOnOrphanDelete(t *testing.T) { + t.Parallel() + + cm := gcMakeCompanionCM("cm-norb", gcRefKey(gcWD1Name)) + // No RB seeded — simulates Karmada cascade already cleaned it up. + r, hubCl := newGCReconciler(cm) + + _, err := reconcileGC(t, r, "cm-norb") + require.NoError(t, err, "missing RB must not cause an error — IgnoreNotFound applied") + + // Companion itself must be deleted (the IgnoreNotFound RB delete must not abort). + var got corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "cm-norb"}, &got) + require.True(t, apierrors.IsNotFound(err), "companion must be deleted even when RB was already absent") +} + +// TestCompanionGC_WdNameFromRefKey_Patterns verifies the WD name extraction +// from various key formats including edge cases. +func TestCompanionGC_WdNameFromRefKey_Patterns(t *testing.T) { + t.Parallel() + + tests := []struct { + key string + wantName string + wantErr bool + }{ + {"default/mount-pristine", "mount-pristine", false}, + {"my-project/my-wd", "my-wd", false}, + {"namespace/wd-with-dashes", "wd-with-dashes", false}, + {"missing-separator", "", true}, + {"ns/", "", true}, + {"", "", true}, + } + + for _, tc := range tests { + t.Run(tc.key, func(t *testing.T) { + name, err := wdNameFromRefKey(tc.key) + if tc.wantErr { + assert.Error(t, err, "expected error for key %q", tc.key) + } else { + assert.NoError(t, err) + assert.Equal(t, tc.wantName, name, "name mismatch for key %q", tc.key) + } + }) + } +} + +// TestCompanionGC_PeriodicSweepReconcilesCandidates is a white-box test that +// calls sweep() directly and verifies it drives reconciliation for stranded +// companions. +func TestCompanionGC_PeriodicSweepReconcilesCandidates(t *testing.T) { + t.Parallel() + + // Stranded companion: labeled, referenced-by points at an absent WD. + strangedCM := gcMakeCompanionCM("stranded-cm", gcRefKey("dead-wd")) + // Live companion: labeled, referenced-by points at an existing WD. + liveCM := gcMakeCompanionCM("live-cm", gcRefKey("live-wd")) + liveWD := gcMakeHubWD("live-wd") + + r, hubCl := newGCReconciler(strangedCM, liveCM, liveWD) + + sweep := &companionGCPeriodicSweep{ + hubClient: hubCl, + reconciler: r, + interval: companionGCSweepInterval, // unused in this test + } + sweep.sweep(context.Background()) + + // Stranded companion must be deleted. + var gotStranded corev1.ConfigMap + err := hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "stranded-cm"}, &gotStranded) + require.True(t, apierrors.IsNotFound(err), "stranded companion must be deleted by sweep") + + // Live companion must survive. + var gotLive corev1.ConfigMap + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: gcHubNS, Name: "live-cm"}, &gotLive) + require.NoError(t, err, "live companion must survive the sweep") +} diff --git a/internal/controller/orphan_rb_controller.go b/internal/controller/orphan_rb_controller.go new file mode 100644 index 00000000..e75e2e47 --- /dev/null +++ b/internal/controller/orphan_rb_controller.go @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" +) + +const ( + // orphanRBSweepInterval is how often the periodic backstop fires to catch + // ResourceBindings whose hub companion was deleted before the controller started. + orphanRBSweepInterval = 5 * time.Minute + + // ppNameAnnotationKey is the annotation Karmada's binding-controller stamps on + // every ResourceBinding to link it back to its governing PropagationPolicy. + // Despite the "name" in the key string, Karmada stores this value in + // metadata.annotations, NOT metadata.labels (labels carry only permanent-id + // UUIDs). Reading from labels always returns "" and breaks scope filtering. + ppNameAnnotationKey = "propagationpolicy.karmada.io/name" + + // cityPPPrefix is the prefix of PropagationPolicy names created by + // propagationPolicyNameFor in workloaddeployment_federator.go. RBs whose + // PP annotation starts with this prefix were created for referenced-data + // companion propagation. + cityPPPrefix = "city-" +) + +// OrphanRBReconciler runs on the Karmada hub and deletes ResourceBindings +// whose hub companion (ConfigMap or Secret) no longer exists. It is the +// backstop sweep that handles Karmada's event-driven deletion gap: if the +// binding-controller misses the companion-deletion event (e.g. PP deleted +// before RB reconcile completes), the RB and its child Works are stranded. +// This reconciler detects and removes those stranded RBs so that Works are +// deleted and cell copies stop being re-created. +// +// # Tight scope +// +// Only ResourceBindings satisfying ALL three conditions are ever deleted: +// 1. Name ends with "-configmap" or "-secret" (Karmada's kind-suffix for +// namespace-scoped ConfigMap/Secret RBs — WD RBs end in "-workloaddeployment"). +// 2. The propagationpolicy.karmada.io/name annotation starts with "city-" (all +// referenced-data PropagationPolicies use this prefix; Karmada stores this +// in annotations, not labels). +// 3. The hub companion derived by stripping the kind suffix does NOT exist +// in the same namespace (and has no deletionTimestamp — a terminating +// companion means Karmada cascade is still in progress). +// +// This filter is necessary and sufficient to avoid touching WD RBs and any +// non-compute ResourceBindings that happen to share the hub namespace. +// +// # Cascade +// +// Deleting the RB triggers Karmada's binding-controller to remove all Work +// objects the RB owns. The execution-controller then removes the cell copies. +// Because the Work is gone there is nothing left to re-create the cell copy — +// the deletion is permanent. +type OrphanRBReconciler struct { + // HubClient is a client pointed at the Karmada hub API server. + HubClient client.Client +} + +// +kubebuilder:rbac:groups=work.karmada.io,resources=resourcebindings,verbs=get;list;watch;delete + +// Reconcile is triggered for each ResourceBinding that passes the predicate +// filter. It checks whether the RB is a stranded orphan and deletes it if so. +func (r *OrphanRBReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + var rb karmadaworkv1alpha2.ResourceBinding + if err := r.HubClient.Get(ctx, req.NamespacedName, &rb); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + // Belt-and-suspenders: re-apply the scope filter in Reconcile so that even + // if the predicate is bypassed (e.g. during tests) only valid RBs are acted on. + if !r.isInScope(&rb) { + return ctrl.Result{}, nil + } + + companionName, kind, ok := companionFromRBName(rb.Name) + if !ok { + return ctrl.Result{}, nil + } + + orphaned, err := r.isOrphaned(ctx, rb.Namespace, companionName, kind) + if err != nil { + return ctrl.Result{}, err + } + if !orphaned { + return ctrl.Result{}, nil + } + + logger.Info("deleting orphaned companion ResourceBinding", + "name", rb.Name, + "namespace", rb.Namespace, + "companionName", companionName, + "companionKind", kind, + ) + if err := r.HubClient.Delete(ctx, &rb); client.IgnoreNotFound(err) != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil +} + +// isInScope returns true when the ResourceBinding falls within the tight scope +// of referenced-data companion RBs: city-PP annotation AND kind suffix. +func (r *OrphanRBReconciler) isInScope(rb *karmadaworkv1alpha2.ResourceBinding) bool { + ppName := rb.Annotations[ppNameAnnotationKey] + if !strings.HasPrefix(ppName, cityPPPrefix) { + return false + } + name := rb.Name + return strings.HasSuffix(name, rbSuffixConfigMap) || strings.HasSuffix(name, rbSuffixSecret) +} + +// companionFromRBName extracts the companion object name and kind from a +// ResourceBinding name. Karmada names namespace-scoped RBs as +// "{objectName}-{kindLowercase}", so: +// +// "cm-pristine-configmap" → ("cm-pristine", "ConfigMap", true) +// "secret-foo-secret" → ("secret-foo", "Secret", true) +// "wd-foo-workloaddeployment" → ("", "", false) // not a companion RB +func companionFromRBName(rbName string) (companionName, kind string, ok bool) { + switch { + case strings.HasSuffix(rbName, rbSuffixConfigMap): + return strings.TrimSuffix(rbName, rbSuffixConfigMap), kindConfigMap, true + case strings.HasSuffix(rbName, rbSuffixSecret): + return strings.TrimSuffix(rbName, rbSuffixSecret), kindSecret, true + default: + return "", "", false + } +} + +// companionRBName returns the Karmada ResourceBinding name for a companion object +// of the given kind. It is the inverse of companionFromRBName. +func companionRBName(objectName, kind string) string { + if kind == kindSecret { + return objectName + rbSuffixSecret + } + return objectName + rbSuffixConfigMap +} + +// isOrphaned returns true when the hub companion is fully absent. A companion +// with a deletionTimestamp is considered NOT orphaned — the deletion is in +// progress and Karmada's cascade (or Component 3's explicit RB delete) is +// expected to fire shortly. We do not interfere with in-progress deletions. +func (r *OrphanRBReconciler) isOrphaned(ctx context.Context, namespace, companionName, kind string) (bool, error) { + nsn := types.NamespacedName{Namespace: namespace, Name: companionName} + switch kind { + case kindConfigMap: + var cm corev1.ConfigMap + err := r.HubClient.Get(ctx, nsn, &cm) + if apierrors.IsNotFound(err) { + return true, nil + } + if err != nil { + return false, err + } + // Companion exists (including if terminating) — not orphaned yet. + return false, nil + case kindSecret: + var s corev1.Secret + err := r.HubClient.Get(ctx, nsn, &s) + if apierrors.IsNotFound(err) { + return true, nil + } + if err != nil { + return false, err + } + return false, nil + default: + return false, nil + } +} + +// SetupWithManager registers the OrphanRBReconciler with a regular ctrl.Manager +// pointed at the Karmada hub. It is called from SetupOrphanRBWithManager, not +// from the multicluster manager, because ResourceBindings live on the hub only. +// +// The predicate filter restricts the work queue to city-PP companion RBs +// (kind suffix + city- label prefix) before they ever reach Reconcile. +func (r *OrphanRBReconciler) SetupWithManager(mgr manager.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&karmadaworkv1alpha2.ResourceBinding{}, + builder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool { + ppName := obj.GetAnnotations()[ppNameAnnotationKey] + if !strings.HasPrefix(ppName, cityPPPrefix) { + return false + } + name := obj.GetName() + return strings.HasSuffix(name, rbSuffixConfigMap) || strings.HasSuffix(name, rbSuffixSecret) + })), + ). + Named("orphan-rb"). + Complete(r) +} + +// orphanRBPeriodicSweep is a manager.Runnable that periodically lists all in- +// scope companion ResourceBindings on the hub and calls the reconciler directly +// for each. This covers RBs that became orphaned before the controller started, +// when no new RB events are expected to fire. +// +// The sweep calls Reconcile directly (bypassing the controller's work queue) +// because the manager.Runnable interface is simpler than wiring a synthetic +// event source. The reconciler is idempotent, so direct calls are safe. +type orphanRBPeriodicSweep struct { + hubClient client.Client + reconciler *OrphanRBReconciler + interval time.Duration +} + +// Start runs the periodic sweep loop until ctx is cancelled. +func (s *orphanRBPeriodicSweep) Start(ctx context.Context) error { + ticker := time.NewTicker(s.interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + s.sweep(ctx) + } + } +} + +// sweep lists all ResourceBindings on the hub, filters to in-scope companion +// RBs, and calls Reconcile directly for each candidate. +func (s *orphanRBPeriodicSweep) sweep(ctx context.Context) { + logger := log.FromContext(ctx).WithValues("component", "orphan-rb-periodic-sweep") + + var rbList karmadaworkv1alpha2.ResourceBindingList + // List across all namespaces; label filtering is applied client-side because + // a HasPrefix predicate is not expressible as a label selector. The total RB + // count in the hub is bounded (O(companions × cells)), so a full list is safe. + if err := s.hubClient.List(ctx, &rbList); err != nil { + logger.Error(err, "periodic sweep: list ResourceBindings failed") + return + } + + for i := range rbList.Items { + rb := &rbList.Items[i] + if !s.reconciler.isInScope(rb) { + continue + } + if _, err := s.reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{ + Namespace: rb.Namespace, + Name: rb.Name, + }, + }); err != nil { + logger.Error(err, "periodic sweep: reconcile failed", + "name", rb.Name, "namespace", rb.Namespace) + } + } +} + +// SetupOrphanRBWithManager wires the OrphanRBReconciler and its periodic sweep +// backstop onto the provided ctrl.Manager pointed at the Karmada hub. Called +// from setupManagementControllers in cmd/main.go. +func SetupOrphanRBWithManager(mgr manager.Manager, hubClient client.Client) error { + r := &OrphanRBReconciler{HubClient: hubClient} + if err := r.SetupWithManager(mgr); err != nil { + return err + } + return mgr.Add(&orphanRBPeriodicSweep{ + hubClient: hubClient, + reconciler: r, + interval: orphanRBSweepInterval, + }) +} diff --git a/internal/controller/orphan_rb_controller_test.go b/internal/controller/orphan_rb_controller_test.go new file mode 100644 index 00000000..25302ee1 --- /dev/null +++ b/internal/controller/orphan_rb_controller_test.go @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" +) + +// ─── Test constants ──────────────────────────────────────────────────────────── + +const ( + // orbTestNS is the hub namespace where companion RBs and their companions live. + orbTestNS = "ns-efdf8ca1-9c2d-4ac8-b161-1951503a2879" + + // orbCityPPName is the PropagationPolicy name stored in metadata.annotations + // on companion RBs. Karmada writes it to annotations, not labels (labels only + // carry permanent-id UUIDs). + orbCityPPName = "city-dfw" + + // Karmada permanent-id label keys — present in metadata.labels on production RBs. + orbPPPermanentIDKey = "propagationpolicy.karmada.io/permanent-id" + orbRBPermanentIDKey = "resourcebinding.karmada.io/permanent-id" +) + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +// newOrphanRBReconciler creates an OrphanRBReconciler backed by a fake hub client. +func newOrphanRBReconciler(objs ...client.Object) (*OrphanRBReconciler, client.Client) { + hubCl := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() + return &OrphanRBReconciler{HubClient: hubCl}, hubCl +} + +// makeCompanionRB returns a ResourceBinding that mirrors the production shape +// observed in the lab: PP name in metadata.annotations (NOT labels), and +// permanent-id UUIDs in labels. This matches what Karmada's binding-controller +// actually stamps on each ResourceBinding. +func makeCompanionRB(name string) *karmadaworkv1alpha2.ResourceBinding { + return &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: orbTestNS, + Name: name, + Labels: map[string]string{ + orbPPPermanentIDKey: "a3d5b1ac-0000-0000-0000-000000000001", + orbRBPermanentIDKey: "ee4b0939-0000-0000-0000-000000000001", + }, + Annotations: map[string]string{ + ppNameAnnotationKey: orbCityPPName, + }, + }, + } +} + +// reconcileRB runs Reconcile for the named ResourceBinding in orbTestNS. +func reconcileRB(t *testing.T, r *OrphanRBReconciler, name string) (ctrl.Result, error) { + t.Helper() + return r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: orbTestNS, Name: name}, + }) +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +// TestOrphanRB_DeletesOrphanedConfigMapRB asserts that an RB whose companion +// ConfigMap is absent is deleted. +func TestOrphanRB_DeletesOrphanedConfigMapRB(t *testing.T) { + t.Parallel() + + rb := makeCompanionRB(gcCMPristineRBName) + // No ConfigMap "cm-pristine" in the hub namespace — the RB is orphaned. + r, hubCl := newOrphanRBReconciler(rb) + + _, err := reconcileRB(t, r, gcCMPristineRBName) + require.NoError(t, err) + + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: gcCMPristineRBName}, &got) + require.True(t, apierrors.IsNotFound(err), "orphaned companion RB must be deleted") +} + +// TestOrphanRB_DeletesOrphanedSecretRB asserts the same for Secret companion RBs. +func TestOrphanRB_DeletesOrphanedSecretRB(t *testing.T) { + t.Parallel() + + rb := makeCompanionRB("secret-pristine-secret") + r, hubCl := newOrphanRBReconciler(rb) + + _, err := reconcileRB(t, r, "secret-pristine-secret") + require.NoError(t, err) + + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "secret-pristine-secret"}, &got) + require.True(t, apierrors.IsNotFound(err), "orphaned companion Secret RB must be deleted") +} + +// TestOrphanRB_SkipsLiveCompanion asserts that an RB whose companion still +// exists (live, no deletionTimestamp) is NOT deleted. +func TestOrphanRB_SkipsLiveCompanion(t *testing.T) { + t.Parallel() + + rb := makeCompanionRB("cm-live-configmap") + // Companion ConfigMap exists and is live. + liveCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: orbTestNS, Name: "cm-live"}, + } + r, hubCl := newOrphanRBReconciler(rb, liveCM) + + _, err := reconcileRB(t, r, "cm-live-configmap") + require.NoError(t, err) + + // RB must still exist. + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "cm-live-configmap"}, &got) + require.NoError(t, err, "RB must NOT be deleted when companion still exists") + assert.Equal(t, "cm-live-configmap", got.Name) +} + +// TestOrphanRB_SkipsTerminatingCompanion asserts that an RB whose companion +// has a deletionTimestamp is NOT deleted — the deletion cascade may still fire. +func TestOrphanRB_SkipsTerminatingCompanion(t *testing.T) { + t.Parallel() + + rb := makeCompanionRB("cm-terminating-configmap") + now := metav1.Now() + // Companion exists with a deletionTimestamp — deletion in progress. + terminatingCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: orbTestNS, + Name: "cm-terminating", + DeletionTimestamp: &now, + Finalizers: []string{"karmada.io/fake"}, + }, + } + r, hubCl := newOrphanRBReconciler(rb, terminatingCM) + + _, err := reconcileRB(t, r, "cm-terminating-configmap") + require.NoError(t, err) + + // RB must still exist. + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "cm-terminating-configmap"}, &got) + require.NoError(t, err, "RB must NOT be deleted when companion is terminating") +} + +// TestOrphanRB_IgnoresNonCityPPRB asserts that an RB without the "city-" +// PropagationPolicy label is never touched, even if its companion is absent. +func TestOrphanRB_IgnoresNonCityPPRB(t *testing.T) { + t.Parallel() + + // RB has a non-city PP annotation (e.g. a WD's PP or an unrelated tenant's PP). + // Production shape: PP name in annotations, permanent-ids in labels. + nonCityRB := &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: orbTestNS, + Name: "cm-other-configmap", + Labels: map[string]string{ + "propagationpolicy.karmada.io/permanent-id": "a3d5b1ac-0000-0000-0000-000000000099", + "resourcebinding.karmada.io/permanent-id": "ee4b0939-0000-0000-0000-000000000099", + }, + Annotations: map[string]string{ppNameAnnotationKey: "other-pp"}, + }, + } + r, hubCl := newOrphanRBReconciler(nonCityRB) + + _, err := reconcileRB(t, r, "cm-other-configmap") + require.NoError(t, err) + + // RB must still exist — it is outside the companion scope. + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "cm-other-configmap"}, &got) + require.NoError(t, err, "non-city-PP RB must NOT be touched") +} + +// TestOrphanRB_IgnoresWorkloadDeploymentRB asserts that a WD ResourceBinding +// (suffix "-workloaddeployment") is never deleted even when its companion is +// absent and it has a city- PP label. WD RBs are out of scope. +func TestOrphanRB_IgnoresWorkloadDeploymentRB(t *testing.T) { + t.Parallel() + + // WD RB: correct city- PP annotation but wrong kind suffix ("-workloaddeployment" + // instead of "-configmap"/"-secret"). Production shape: PP name in annotations. + wdRB := &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: orbTestNS, + Name: "my-workload-workloaddeployment", + Labels: map[string]string{ + "propagationpolicy.karmada.io/permanent-id": "a3d5b1ac-0000-0000-0000-000000000088", + "resourcebinding.karmada.io/permanent-id": "ee4b0939-0000-0000-0000-000000000088", + }, + Annotations: map[string]string{ppNameAnnotationKey: "city-dfw"}, + }, + } + r, hubCl := newOrphanRBReconciler(wdRB) + + _, err := reconcileRB(t, r, "my-workload-workloaddeployment") + require.NoError(t, err) + + var got karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), + types.NamespacedName{Namespace: orbTestNS, Name: "my-workload-workloaddeployment"}, &got) + require.NoError(t, err, "WorkloadDeployment RB must NEVER be deleted by the orphan sweep") +} + +// TestOrphanRB_ToleratesRBAlreadyGone asserts that the reconciler does not +// error when the RB is already gone at reconcile time (deleted by Component 3). +func TestOrphanRB_ToleratesRBAlreadyGone(t *testing.T) { + t.Parallel() + + // Do not seed the RB — simulate it being deleted before Reconcile runs. + r, _ := newOrphanRBReconciler() + + _, err := reconcileRB(t, r, "cm-already-gone-configmap") + require.NoError(t, err, "reconciler must tolerate NotFound on the RB itself") +} + +// TestOrphanRB_CompanionFromRBName_Patterns verifies companionFromRBName +// correctly extracts companion names/kinds and rejects non-companion patterns. +func TestOrphanRB_CompanionFromRBName_Patterns(t *testing.T) { + t.Parallel() + + tests := []struct { + rbName string + wantOK bool + wantName string + wantKind string + }{ + {gcCMPristineRBName, true, "cm-pristine", kindConfigMap}, + {"secret-foo-secret", true, "secret-foo", kindSecret}, + {"a-b-c-secret", true, "a-b-c", kindSecret}, + {"wd-foo-workloaddeployment", false, "", ""}, + {"just-a-name", false, "", ""}, + {"", false, "", ""}, + {"-configmap", true, "", kindConfigMap}, // degenerate — name is empty + {"notsuffix", false, "", ""}, + } + + for _, tc := range tests { + t.Run(tc.rbName, func(t *testing.T) { + name, kind, ok := companionFromRBName(tc.rbName) + assert.Equal(t, tc.wantOK, ok, "ok mismatch for %q", tc.rbName) + assert.Equal(t, tc.wantName, name, "name mismatch for %q", tc.rbName) + assert.Equal(t, tc.wantKind, kind, "kind mismatch for %q", tc.rbName) + }) + } +} diff --git a/internal/controller/referenceddata_controller.go b/internal/controller/referenceddata_controller.go index 9593bc8a..6f6ff73c 100644 --- a/internal/controller/referenceddata_controller.go +++ b/internal/controller/referenceddata_controller.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/referenceddata" "go.miloapis.com/milo/pkg/downstreamclient" @@ -62,6 +63,13 @@ const ( // referenceddata.ObjectRef to avoid repeated string literals. kindConfigMap = "ConfigMap" kindSecret = "Secret" + + // rbSuffixConfigMap and rbSuffixSecret are the companion ResourceBinding name + // suffixes. Karmada names namespace-scoped RBs "{objectName}-{kindLowercase}"; + // companionRBName builds these and companionFromRBName parses them, so the wire + // format has one source of truth. + rbSuffixConfigMap = "-configmap" + rbSuffixSecret = "-secret" ) // companionWriter is the abstraction that the controller uses to materialise @@ -105,6 +113,18 @@ type companionWriter interface { // DeleteSecret deletes the companion Secret if it exists. DeleteSecret(ctx context.Context, namespace, name string) error + // DeleteResourceBinding deletes the Karmada ResourceBinding associated with + // a companion object after the companion itself has been deleted. The RB name + // follows the Karmada binding-controller convention: "{companionName}-{kind}", + // where kind is the lowercase resource kind ("configmap" or "secret"). + // + // Callers MUST pass client.IgnoreNotFound semantics: the RB may already be + // gone if Karmada's event-driven cascade ran before this call. + // + // localCompanionWriter returns nil unconditionally — there is no Karmada hub + // in single-cluster mode. downstreamCompanionWriter deletes from the hub. + DeleteResourceBinding(ctx context.Context, namespace, rbName string) error + // GetConfigMap returns the existing companion ConfigMap, or nil if absent. GetConfigMap(ctx context.Context, namespace, name string) (*corev1.ConfigMap, error) @@ -153,6 +173,12 @@ func (w *localCompanionWriter) DeleteSecret(ctx context.Context, namespace, name return client.IgnoreNotFound(w.cl.Delete(ctx, s)) } +// DeleteResourceBinding is a no-op for localCompanionWriter: in single-cluster +// mode there is no Karmada hub and therefore no ResourceBindings to clean up. +func (w *localCompanionWriter) DeleteResourceBinding(_ context.Context, _, _ string) error { + return nil +} + func (w *localCompanionWriter) GetConfigMap(ctx context.Context, namespace, name string) (*corev1.ConfigMap, error) { var cm corev1.ConfigMap err := w.cl.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &cm) @@ -241,6 +267,25 @@ func (w *downstreamCompanionWriter) DeleteSecret(ctx context.Context, _, name st return client.IgnoreNotFound(w.hubClient.Delete(ctx, s)) } +// DeleteResourceBinding deletes the Karmada ResourceBinding for a companion +// that has just been deleted from the hub. The RB lives in the same downstream +// namespace as the companion and is named "{companionName}-{kind}" per Karmada's +// binding-controller convention (e.g. "cm-pristine-configmap"). +// +// Deleting the RB cascades: binding-controller removes the Work, which drives +// the execution-controller to remove the cell copy permanently (no Work left to +// re-create it). client.IgnoreNotFound is applied because Karmada's own +// event-driven cascade may have already cleaned up the RB. +func (w *downstreamCompanionWriter) DeleteResourceBinding(ctx context.Context, _, rbName string) error { + rb := &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: w.downstreamNamespace, + Name: rbName, + }, + } + return client.IgnoreNotFound(w.hubClient.Delete(ctx, rb)) +} + func (w *downstreamCompanionWriter) GetConfigMap(ctx context.Context, _, name string) (*corev1.ConfigMap, error) { var cm corev1.ConfigMap err := w.hubClient.Get(ctx, types.NamespacedName{Namespace: w.downstreamNamespace, Name: name}, &cm) @@ -1065,6 +1110,12 @@ func (r *ReferencedDataController) releaseRemovedCompanions( // and the subsequent Update target the same resourceVersion so a concurrent // change causes a conflict that drives a re-read and retry. // +// When the ref-count drops to zero and the companion is deleted, the associated +// Karmada ResourceBinding is also deleted. This short-circuits Karmada's event- +// driven cascade and ensures the Work is removed regardless of PropagationPolicy +// state. The RB delete is issued outside the RetryOnConflict loop because it +// is unconditional once the companion is gone. +// // If the ref-count annotation is unparseable the whole call returns an error // (transient). The companion is NOT deleted in that case — it may still be // referenced by other WDs whose entries are recorded in the corrupt annotation. @@ -1079,6 +1130,7 @@ func (r *ReferencedDataController) releaseOneCompanion( // Try ConfigMap. var cmExists bool + var cmDeleted bool if releaseConfigMap { if err := retry.RetryOnConflict(retry.DefaultRetry, func() error { cm, err := writer.GetConfigMap(ctx, namespace, companionName) @@ -1095,6 +1147,7 @@ func (r *ReferencedDataController) releaseOneCompanion( return fmt.Errorf("referenceddata: corrupt ref-count on ConfigMap %q: %w", companionName, err) } if len(remaining) == 0 { + cmDeleted = true return writer.DeleteConfigMap(ctx, namespace, companionName) } // Build a desired object that carries the updated ref-count. Pass the @@ -1111,6 +1164,15 @@ func (r *ReferencedDataController) releaseOneCompanion( } } if cmExists { + // Delete the Karmada ResourceBinding if the companion was fully removed. + // The RB name follows the Karmada binding-controller convention: + // "{companionName}-configmap". IgnoreNotFound because Karmada may have + // already cascaded the deletion. + if cmDeleted { + if err := writer.DeleteResourceBinding(ctx, namespace, companionRBName(companionName, kindConfigMap)); err != nil { + return fmt.Errorf("referenceddata: delete ResourceBinding for ConfigMap %q: %w", companionName, err) + } + } return nil } @@ -1119,7 +1181,8 @@ func (r *ReferencedDataController) releaseOneCompanion( } // Try Secret. - return retry.RetryOnConflict(retry.DefaultRetry, func() error { + var secretDeleted bool + if err := retry.RetryOnConflict(retry.DefaultRetry, func() error { s, err := writer.GetSecret(ctx, namespace, companionName) if err != nil { return fmt.Errorf("get companion Secret %q: %w", companionName, err) @@ -1132,6 +1195,7 @@ func (r *ReferencedDataController) releaseOneCompanion( return fmt.Errorf("referenceddata: corrupt ref-count on Secret %q: %w", companionName, err) } if len(remaining) == 0 { + secretDeleted = true return writer.DeleteSecret(ctx, namespace, companionName) } desired := s.DeepCopy() @@ -1140,7 +1204,19 @@ func (r *ReferencedDataController) releaseOneCompanion( } desired.Annotations[companionRefCountAnnotation] = encodeRefCount(remaining) return writer.ApplySecret(ctx, s, desired) - }) + }); err != nil { + return err + } + + // Delete the Karmada ResourceBinding if the companion was fully removed. + // The RB name follows the Karmada binding-controller convention: + // "{companionName}-secret". + if secretDeleted { + if err := writer.DeleteResourceBinding(ctx, namespace, companionRBName(companionName, kindSecret)); err != nil { + return fmt.Errorf("referenceddata: delete ResourceBinding for Secret %q: %w", companionName, err) + } + } + return nil } // writerFor returns the companionWriter appropriate for the current mode. diff --git a/internal/controller/referenceddata_controller_test.go b/internal/controller/referenceddata_controller_test.go index c24a46e9..a6155474 100644 --- a/internal/controller/referenceddata_controller_test.go +++ b/internal/controller/referenceddata_controller_test.go @@ -27,6 +27,7 @@ import ( "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/referenceddata" ) @@ -779,11 +780,11 @@ func TestReferencedData_CompanionNamespaceInvariant(t *testing.T) { // newRDControllerFederated creates a ReferencedDataController wired with a // FederationClient (fake hub client). The project cluster holds the WDs and // source ConfigMaps/Secrets; the hub client is the destination for companions. +// Reader is always nil in tests; the controller falls back to LocalReader. func newRDControllerFederated( t *testing.T, projectCl client.Client, hubCl client.Client, - reader referenceddata.ProjectConfigSecretReader, ) (*ReferencedDataController, string) { t.Helper() clusterName := rdTestClusterName @@ -797,7 +798,6 @@ func newRDControllerFederated( c := &ReferencedDataController{ mgr: mgr, opts: ReferencedDataControllerOptions{ - Reader: reader, FederationClient: hubCl, }, } @@ -841,7 +841,7 @@ func TestReferencedData_Federated_CompanionWrittenToHub(t *testing.T) { require.NoError(t, computev1alpha.AddToScheme(hubScheme)) hubCl := fake.NewClientBuilder().WithScheme(hubScheme).Build() - c, clusterName := newRDControllerFederated(t, projectCl, hubCl, nil) + c, clusterName := newRDControllerFederated(t, projectCl, hubCl) // First reconcile: stamps finalizer. reconcileWD(t, c, clusterName, projNS, "wd-fed-1") @@ -1544,6 +1544,312 @@ func TestFederator_StatusSync_PreservesReferencedDataReadyCondition(t *testing.T "resolver's Ready reason must be preserved by federator status sync") } +// ─── Component 3: Explicit ResourceBinding teardown ────────────────────────── + +// TestRBTeardown_ConfigMap asserts that when a federated companion ConfigMap is +// deleted (ref-count reaches zero), the controller also deletes the Karmada +// ResourceBinding named "{companionName}-configmap" in the downstream namespace. +func TestRBTeardown_ConfigMap(t *testing.T) { + t.Parallel() + + projNS := testProjNS + projNSUID := testProjNSUID + cmName := rdTestAppConfig + companionName := referenceddata.CompanionName("ConfigMap", cmName) + rbName := companionName + "-configmap" + wdName := "wd-rb-teardown-cm" + + now := metav1.Now() + wdKey := types.NamespacedName{Namespace: projNS, Name: wdName}.String() + + projNSObj := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: projNS, UID: projNSUID}, + } + // WD pre-seeded with deletionTimestamp + finalizer + annotation to trigger + // the deletion reconcile path directly. + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: projNS, + Name: wdName, + DeletionTimestamp: &now, + Finalizers: []string{referencedDataFinalizer}, + Annotations: map[string]string{ + computev1alpha.ExpectedReferencedDataAnnotation: `["ConfigMap/` + companionName + `"]`, + }, + }, + } + + s := rdTestScheme(t) + require.NoError(t, corev1.AddToScheme(s)) + + projectCl := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(projNSObj, wd). + WithStatusSubresource(wd). + Build() + + // Hub: companion ConfigMap already exists + the RB the reconciler should delete. + downstreamNS := testKarmadaNSStr + existingCompanion := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: downstreamNS, + Name: companionName, + Labels: map[string]string{computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue}, + Annotations: map[string]string{ + companionRefCountAnnotation: encodeRefCount([]string{wdKey}), + }, + }, + } + existingRB := &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{Namespace: downstreamNS, Name: rbName}, + } + + hubCl := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(existingCompanion, existingRB). + Build() + + c, clusterName := newRDControllerFederated(t, projectCl, hubCl) + + // Reconcile deletion: should delete companion + RB. + cn := multicluster.ClusterName(clusterName) + ctx := mccontext.WithCluster(context.Background(), cn) + _, err := c.Reconcile(ctx, mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: projNS, Name: wdName}}, + ClusterName: cn, + }) + require.NoError(t, err) + + // Companion must be gone from the hub. + var hubCM corev1.ConfigMap + err = hubCl.Get(context.Background(), types.NamespacedName{Namespace: downstreamNS, Name: companionName}, &hubCM) + require.True(t, apierrors.IsNotFound(err), "hub companion ConfigMap must be deleted") + + // ResourceBinding must be gone from the hub. + var rb karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), types.NamespacedName{Namespace: downstreamNS, Name: rbName}, &rb) + require.True(t, apierrors.IsNotFound(err), "ResourceBinding %q must be deleted after companion deletion", rbName) +} + +// TestRBTeardown_Secret asserts the same teardown behaviour for Secret companions. +func TestRBTeardown_Secret(t *testing.T) { + t.Parallel() + + projNS := testProjNS + projNSUID := testProjNSUID + secretName := "db-creds" + companionName := referenceddata.CompanionName("Secret", secretName) + rbName := companionName + "-secret" + wdName := "wd-rb-teardown-sec" + + now := metav1.Now() + wdKey := types.NamespacedName{Namespace: projNS, Name: wdName}.String() + + projNSObj := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: projNS, UID: projNSUID}, + } + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: projNS, + Name: wdName, + DeletionTimestamp: &now, + Finalizers: []string{referencedDataFinalizer}, + Annotations: map[string]string{ + computev1alpha.ExpectedReferencedDataAnnotation: `["Secret/` + companionName + `"]`, + }, + }, + } + + s := rdTestScheme(t) + require.NoError(t, corev1.AddToScheme(s)) + + projectCl := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(projNSObj, wd). + WithStatusSubresource(wd). + Build() + + downstreamNS := testKarmadaNSStr + existingCompanion := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: downstreamNS, + Name: companionName, + Labels: map[string]string{computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue}, + Annotations: map[string]string{ + companionRefCountAnnotation: encodeRefCount([]string{wdKey}), + }, + }, + } + existingRB := &karmadaworkv1alpha2.ResourceBinding{ + ObjectMeta: metav1.ObjectMeta{Namespace: downstreamNS, Name: rbName}, + } + + hubCl := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(existingCompanion, existingRB). + Build() + + c, clusterName := newRDControllerFederated(t, projectCl, hubCl) + + cn := multicluster.ClusterName(clusterName) + ctx := mccontext.WithCluster(context.Background(), cn) + _, err := c.Reconcile(ctx, mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: projNS, Name: wdName}}, + ClusterName: cn, + }) + require.NoError(t, err) + + var hubSecret corev1.Secret + err = hubCl.Get(context.Background(), types.NamespacedName{Namespace: downstreamNS, Name: companionName}, &hubSecret) + require.True(t, apierrors.IsNotFound(err), "hub companion Secret must be deleted") + + var rb karmadaworkv1alpha2.ResourceBinding + err = hubCl.Get(context.Background(), types.NamespacedName{Namespace: downstreamNS, Name: rbName}, &rb) + require.True(t, apierrors.IsNotFound(err), "ResourceBinding %q must be deleted after companion deletion", rbName) +} + +// TestRBTeardown_ToleratesNotFound asserts that companion deletion succeeds +// even when the ResourceBinding is already gone (Karmada beat the controller). +func TestRBTeardown_ToleratesNotFound(t *testing.T) { + t.Parallel() + + projNS := testProjNS + projNSUID := testProjNSUID + cmName := rdTestAppConfig + companionName := referenceddata.CompanionName("ConfigMap", cmName) + wdName := "wd-rb-notfound" + + now := metav1.Now() + wdKey := types.NamespacedName{Namespace: projNS, Name: wdName}.String() + + projNSObj := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: projNS, UID: projNSUID}, + } + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: projNS, + Name: wdName, + DeletionTimestamp: &now, + Finalizers: []string{referencedDataFinalizer}, + Annotations: map[string]string{ + computev1alpha.ExpectedReferencedDataAnnotation: `["ConfigMap/` + companionName + `"]`, + }, + }, + } + + s := rdTestScheme(t) + require.NoError(t, corev1.AddToScheme(s)) + + projectCl := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(projNSObj, wd). + WithStatusSubresource(wd). + Build() + + downstreamNS := testKarmadaNSStr + existingCompanion := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: downstreamNS, + Name: companionName, + Labels: map[string]string{computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue}, + Annotations: map[string]string{ + companionRefCountAnnotation: encodeRefCount([]string{wdKey}), + }, + }, + } + // RB is intentionally absent — Karmada already deleted it. + hubCl := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(existingCompanion). + Build() + + c, clusterName := newRDControllerFederated(t, projectCl, hubCl) + + cn := multicluster.ClusterName(clusterName) + ctx := mccontext.WithCluster(context.Background(), cn) + _, err := c.Reconcile(ctx, mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: projNS, Name: wdName}}, + ClusterName: cn, + }) + // Must not error even though the RB is already gone. + require.NoError(t, err, "RB teardown must tolerate NotFound") +} + +// TestRBTeardown_LocalWriter_NoOp asserts that the localCompanionWriter path +// (FederationClient == nil, single-cluster dev mode) does NOT attempt to delete +// any ResourceBindings — there is no Karmada hub in single-cluster mode. +// +// The companion and its source share the same name/namespace in single-cluster +// mode. We seed the WD as already terminating so the deletion path fires +// immediately. +func TestRBTeardown_LocalWriter_NoOp(t *testing.T) { + t.Parallel() + + ns := rdTestNamespace + cmName := rdTestAppConfig + companionName := referenceddata.CompanionName("ConfigMap", cmName) + wdName := "wd-local-noop" + + now := metav1.Now() + wdKey := types.NamespacedName{Namespace: ns, Name: wdName}.String() + + // In single-cluster mode the companion == source (same name/namespace). + // Seed the companion directly as if the controller had materialised it. + companion := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: companionName, + Labels: map[string]string{computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue}, + Annotations: map[string]string{ + companionRefCountAnnotation: encodeRefCount([]string{wdKey}), + }, + }, + } + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: wdName, + DeletionTimestamp: &now, + Finalizers: []string{referencedDataFinalizer}, + Annotations: map[string]string{ + computev1alpha.ExpectedReferencedDataAnnotation: `["ConfigMap/` + companionName + `"]`, + }, + }, + } + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(companion, wd). + WithStatusSubresource(wd). + Build() + + // Track Delete calls to detect any unexpected RB deletes. + var rbDeleteCalled bool + interceptedCl := interceptor.NewClient(cl, interceptor.Funcs{ + Delete: func(ctx context.Context, c client.WithWatch, obj client.Object, opts ...client.DeleteOption) error { + if _, ok := obj.(*karmadaworkv1alpha2.ResourceBinding); ok { + rbDeleteCalled = true + } + return c.Delete(ctx, obj, opts...) + }, + }) + + // No FederationClient → localCompanionWriter path. + c, clusterName := newRDController(t, interceptedCl, nil) + + cn := multicluster.ClusterName(clusterName) + ctx := mccontext.WithCluster(context.Background(), cn) + _, err := c.Reconcile(ctx, mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: ns, Name: wdName}}, + ClusterName: cn, + }) + require.NoError(t, err) + + // The companion ConfigMap is deleted (localCompanionWriter deletes it + // directly), but NO ResourceBinding delete should have been attempted. + assert.False(t, rbDeleteCalled, "localCompanionWriter must NOT delete ResourceBindings (no Karmada hub in single-cluster mode)") +} + // ─── Helpers ───────────────────────────────────────────────────────────────── // stubReader allows individual test cases to inject custom reader behaviour. diff --git a/internal/controller/testing_helpers_test.go b/internal/controller/testing_helpers_test.go index cc3d3d9f..91737cba 100644 --- a/internal/controller/testing_helpers_test.go +++ b/internal/controller/testing_helpers_test.go @@ -15,9 +15,16 @@ import ( "sigs.k8s.io/multicluster-runtime/pkg/multicluster" karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + karmadaworkv1alpha2 "github.com/karmada-io/api/work/v1alpha2" computev1alpha "go.datum.net/compute/api/v1alpha" ) +// gcTestProjectNamespace is the project-plane namespace used across GC and +// WorkloadDeployment controller tests. "default" matches the annotation key +// prefix written by the hub-side ReferencedDataController +// (e.g. "default/mount-pristine-default-dfw"). +const gcTestProjectNamespace = "default" + // ─── Scheme helpers ─────────────────────────────────────────────────────────── // newProjectScheme builds a runtime.Scheme with the types needed by the project @@ -30,12 +37,13 @@ func newProjectScheme() *runtime.Scheme { } // newKarmadaScheme builds a runtime.Scheme with the types needed by the Karmada -// API server (corev1 + compute + karmada policy). +// API server (corev1 + compute + karmada policy + karmada work ResourceBindings). func newKarmadaScheme() *runtime.Scheme { s := runtime.NewScheme() _ = corev1.AddToScheme(s) _ = computev1alpha.AddToScheme(s) _ = karmadapolicyv1alpha1.Install(s) + _ = karmadaworkv1alpha2.Install(s) return s } diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 2581eb50..4ab0cdba 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -18,11 +18,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" @@ -240,12 +238,10 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco apimeta.SetStatusCondition(&deployment.Status.Conditions, availCond) } - // Skip the write when the status is unchanged. Without this guard the - // reconciler's own Status().Update would always produce a new resourceVersion, - // firing another Update event on the WD and creating an infinite reconcile loop - // before the predicate on For() was added. The guard is a belt-and-suspenders - // complement to the predicate: the predicate prevents re-queuing on own writes, - // and this guard avoids the superfluous API call entirely. + // Skip the write when the status is unchanged. The unfiltered For() watch + // re-enqueues this deployment on every status write, so an unconditional + // Status().Update would re-trigger the reconciler on its own writes; this + // guard breaks that loop and avoids the superfluous API call entirely. if !equality.Semantic.DeepEqual(existingStatus, deployment.Status) { if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) @@ -315,68 +311,6 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( return currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, referencedDataBlockedReplicas, nil } -// wdReferencedDataChangedPredicate returns a predicate for the WorkloadDeployment -// For() watch that fires on: -// - Any Create, Delete, or Generic event (always enqueue). -// - An Update event where metadata.generation changed (spec updated), OR where -// the ReferencedDataReady condition's Status, Reason, or Message changed. -// -// The predicate intentionally does NOT fire when only the Available or -// ReplicasReady conditions change, because those are written by this reconciler -// itself. Without this guard the reconciler's own Status().Update would re-enqueue -// itself on every run, creating a tight reconcile loop. The equality check before -// Status().Update is a complementary guard, but the predicate is the primary -// protection: it prevents re-enqueuing entirely so the workqueue stays quiet between -// meaningful state transitions. -// -// Loop prevention: the ReferencedDataController (the only other writer of the -// ReferencedDataReady condition) is the intended trigger. When it sets -// ReferencedDataReady=False/SourceNotFound the predicate passes and this -// reconciler re-runs, sees the resolver verdict in deployment.Status.Conditions, and -// promotes Available to ReferencedDataNotReady. Subsequent runs by this reconciler -// (which write Available but not ReferencedDataReady) are filtered out. -func wdReferencedDataChangedPredicate() predicate.Predicate { - return predicate.Funcs{ - UpdateFunc: func(e event.UpdateEvent) bool { - oldWD, ok1 := e.ObjectOld.(*computev1alpha.WorkloadDeployment) - newWD, ok2 := e.ObjectNew.(*computev1alpha.WorkloadDeployment) - if !ok1 || !ok2 { - return true // be conservative when type assertion fails - } - // Spec change: always reconcile. - if oldWD.Generation != newWD.Generation { - return true - } - // ReferencedDataReady condition changed: reconcile so Available is - // updated to reflect the resolver's verdict. - return wdRefDataCondChanged( - apimeta.FindStatusCondition(oldWD.Status.Conditions, computev1alpha.ReferencedDataReady), - apimeta.FindStatusCondition(newWD.Status.Conditions, computev1alpha.ReferencedDataReady), - ) - }, - CreateFunc: func(_ event.CreateEvent) bool { return true }, - DeleteFunc: func(_ event.DeleteEvent) bool { return true }, - GenericFunc: func(_ event.GenericEvent) bool { return true }, - } -} - -// wdRefDataCondChanged returns true when the ReferencedDataReady condition's -// observable fields (Status, Reason, Message) differ between old and new. Presence -// changes (nil → non-nil or vice versa) are also treated as a change. The -// LastTransitionTime field is excluded because it changes on every status flip and -// would defeat the loop-prevention intent of wdReferencedDataChangedPredicate. -func wdRefDataCondChanged(old, new *metav1.Condition) bool { - if (old == nil) != (new == nil) { - return true // condition was added or removed - } - if old == nil { - return false // both nil — no change - } - return old.Status != new.Status || - old.Reason != new.Reason || - old.Message != new.Message -} - // selectWDBlockingCondition evaluates all blocking causes for a WorkloadDeployment // that has no ready replicas and returns the Available condition reflecting the // highest-priority blocker. All causes are evaluated before selecting the winner @@ -788,15 +722,15 @@ func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager, o } b := mcbuilder.ControllerManagedBy(mgr). - // The predicate gates re-enqueuing on meaningful WD changes: spec updates - // (generation bump) or a ReferencedDataReady condition change written by - // ReferencedDataController. Without it, each Status().Update by this - // reconciler (writing Available/ReplicasReady) would re-enqueue itself, - // creating a tight loop and delaying the ReferencedDataReady signal from - // the resolver. + // Watch all WorkloadDeployment events. The reconciler's own Status().Update + // cannot create a self-trigger loop because the equality.Semantic.DeepEqual + // guard skips the write when nothing changed, so no self-event is produced. + // We deliberately do NOT filter Update events with a predicate: an earlier + // predicate that only passed generation/ReferencedDataReady changes dropped + // metadata-only updates such as the initial finalizer-add, which wedged new + // WorkloadDeployments until a controller restart. For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false), - mcbuilder.WithPredicates(wdReferencedDataChangedPredicate()), ). Owns(&computev1alpha.Instance{}) diff --git a/internal/controller/workloaddeployment_controller_test.go b/internal/controller/workloaddeployment_controller_test.go index 45dfbea6..83bce465 100644 --- a/internal/controller/workloaddeployment_controller_test.go +++ b/internal/controller/workloaddeployment_controller_test.go @@ -13,7 +13,6 @@ import ( "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/finalizer" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" @@ -22,16 +21,12 @@ import ( ) const ( - // wdControllerTestName / wdControllerTestNS / wdControllerTestUID are shared - // fixtures for the WorkloadDeployment controller unit tests. + // wdControllerTestName, wdControllerTestNS, wdControllerTestUID are the + // stable identifiers used across the WorkloadDeployment controller tests. wdControllerTestName = "test-wd" - wdControllerTestNS = "default" + wdControllerTestNS = gcTestProjectNamespace // "default" wdControllerTestUID = "wd-uid-test" - // wdControllerTestCityCode is the shared CityCode fixture for - // WorkloadDeployment controller tests. - wdControllerTestCityCode = "DFW" - // wdControllerTestWorkload is the shared WorkloadRef fixture. wdControllerTestWorkload = "test-workload" @@ -52,7 +47,7 @@ func wdControllerTestDeployment(minReplicas int32) *computev1alpha.WorkloadDeplo UID: wdControllerTestUID, }, Spec: computev1alpha.WorkloadDeploymentSpec{ - CityCode: wdControllerTestCityCode, + CityCode: wbTestCityCode, PlacementName: testDefaultPlacement, WorkloadRef: computev1alpha.WorkloadReference{Name: wdControllerTestWorkload}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ @@ -202,7 +197,7 @@ func TestReconcileInstanceGates_NilSpecController_DoesNotPanic(t *testing.T) { deployment := &computev1alpha.WorkloadDeployment{ ObjectMeta: metav1.ObjectMeta{Name: wdControllerTestName, Namespace: wdControllerTestNS, UID: wdControllerTestUID}, Spec: computev1alpha.WorkloadDeploymentSpec{ - CityCode: wdControllerTestCityCode, + CityCode: wbTestCityCode, PlacementName: testDefaultPlacement, WorkloadRef: computev1alpha.WorkloadReference{Name: wdControllerTestWorkload}, ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, @@ -433,290 +428,6 @@ func TestWorkloadDeploymentReconcile_FinalizerAddRequeues(t *testing.T) { "no instances are quota-blocked, so ReplicasReady must be true") } -// ─── wdRefDataCondChanged tests ─────────────────────────────────────────────── - -// TestWdRefDataCondChanged_BothNil verifies that two nil conditions are treated -// as unchanged (no predicate trigger). -func TestWdRefDataCondChanged_BothNil(t *testing.T) { - assert.False(t, wdRefDataCondChanged(nil, nil), - "both nil conditions must not be treated as a change") -} - -// TestWdRefDataCondChanged_AddedCondition verifies that a nil→non-nil transition -// (condition first appears on the WD) is treated as a change. -func TestWdRefDataCondChanged_AddedCondition(t *testing.T) { - newCond := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - } - assert.True(t, wdRefDataCondChanged(nil, newCond), - "nil→non-nil must be treated as a change") -} - -// TestWdRefDataCondChanged_RemovedCondition verifies that a non-nil→nil transition -// (condition removed) is treated as a change. -func TestWdRefDataCondChanged_RemovedCondition(t *testing.T) { - oldCond := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionTrue, - Reason: computev1alpha.ReferencedDataReasonReady, - } - assert.True(t, wdRefDataCondChanged(oldCond, nil), - "non-nil→nil must be treated as a change") -} - -// TestWdRefDataCondChanged_StatusChange verifies that a change in the condition's -// Status field is detected. -func TestWdRefDataCondChanged_StatusChange(t *testing.T) { - old := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionUnknown, - Reason: computev1alpha.ReferencedDataReasonResolving, - } - new := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - } - assert.True(t, wdRefDataCondChanged(old, new), - "status change must be detected") -} - -// TestWdRefDataCondChanged_MessageChange verifies that a change in Message is -// detected, e.g. when the resolver updates the missing-object name. -func TestWdRefDataCondChanged_MessageChange(t *testing.T) { - old := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: `ConfigMap "a" not found in namespace "default"`, - } - new := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: `ConfigMap "b" not found in namespace "default"`, - } - assert.True(t, wdRefDataCondChanged(old, new), - "message change must be detected") -} - -// TestWdRefDataCondChanged_Identical verifies that an identical condition (same -// Status/Reason/Message, differing only in LastTransitionTime) is NOT treated as -// a change. This is the key no-self-trigger property: the reconciler's own -// Status().Update does not re-enqueue via the For() predicate. -func TestWdRefDataCondChanged_Identical(t *testing.T) { - t1 := metav1.Now() - t2 := metav1.Now() - old := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: testMsgConfigMapNotFound, - LastTransitionTime: t1, - } - new := &metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: testMsgConfigMapNotFound, - LastTransitionTime: t2, // different timestamp, same content - } - assert.False(t, wdRefDataCondChanged(old, new), - "identical conditions with differing LastTransitionTime must not be treated as a change") -} - -// ─── wdReferencedDataChangedPredicate tests ─────────────────────────────────── - -// makeWDPair builds two empty WorkloadDeployment objects (no conditions) for -// predicate Update tests. Callers mutate the returned objects before constructing -// the event.UpdateEvent. -func makeWDPair(gen int64) (old, new *computev1alpha.WorkloadDeployment) { - old = &computev1alpha.WorkloadDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: wdControllerTestName, - Namespace: wdControllerTestNS, - Generation: gen, - }, - } - new = &computev1alpha.WorkloadDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: wdControllerTestName, - Namespace: wdControllerTestNS, - Generation: gen, - }, - } - return old, new -} - -// TestWDPredicate_ReferencedDataReadyAdded verifies that the predicate passes -// when the resolver writes ReferencedDataReady=False for the first time. This is -// the primary gap-closure scenario: the WD had no ReferencedDataReady condition -// (reconciler wrote Available=InstancesProvisioning), the resolver adds -// ReferencedDataReady=False/SourceNotFound, and the predicate must fire so the -// reconciler re-runs and promotes Available to ReferencedDataNotReady. -func TestWDPredicate_ReferencedDataReadyAdded(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - - oldWD, newWD := makeWDPair(1) - apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: testMsgConfigMapNotFound, - LastTransitionTime: metav1.Now(), - }) - - e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} - assert.True(t, pred.Update(e), - "predicate must pass when ReferencedDataReady is added (nil→False/SourceNotFound)") -} - -// TestWDPredicate_ReferencedDataReadyCleared verifies that the predicate passes -// when the resolver sets ReferencedDataReady=True (source ConfigMap was created). -// The WD reconciler must re-run so Available can be promoted from -// ReferencedDataNotReady to StableInstanceFound (or InstancesProvisioning). -func TestWDPredicate_ReferencedDataReadyCleared(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - - oldWD, newWD := makeWDPair(1) - apimeta.SetStatusCondition(&oldWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: testMsgConfigMapNotFound, - LastTransitionTime: metav1.Now(), - }) - apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionTrue, - Reason: computev1alpha.ReferencedDataReasonReady, - Message: "All companions are ready", - LastTransitionTime: metav1.Now(), - }) - - e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} - assert.True(t, pred.Update(e), - "predicate must pass when ReferencedDataReady flips from False to True") -} - -// TestWDPredicate_GenerationChanged verifies that the predicate passes when the -// WD's generation changes (spec update), even if the ReferencedDataReady condition -// did not change. -func TestWDPredicate_GenerationChanged(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - - oldWD, newWD := makeWDPair(1) - newWD.Generation = 2 - - e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} - assert.True(t, pred.Update(e), - "predicate must pass when metadata.generation increases") -} - -// TestWDPredicate_AvailableOnlyChange verifies that the predicate DROPS updates -// where only the Available condition changed. This is the self-trigger prevention: -// after the WD reconciler writes Available=ReferencedDataNotReady, the predicate -// must not re-enqueue the same reconciler via the For() watch. -func TestWDPredicate_AvailableOnlyChange(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - - // Both old and new have the SAME ReferencedDataReady condition. - refDataCond := metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.ReferencedDataReasonSourceNotFound, - Message: testMsgConfigMapNotFound, - LastTransitionTime: metav1.Now(), - } - oldWD, newWD := makeWDPair(1) - apimeta.SetStatusCondition(&oldWD.Status.Conditions, refDataCond) - apimeta.SetStatusCondition(&newWD.Status.Conditions, refDataCond) - - // The WD reconciler wrote Available=InstancesProvisioning (old) and then - // Available=ReferencedDataNotReady (new). ReferencedDataReady is unchanged. - // NOTE (split): the named reason constants land in layer E; literals are used - // here because the predicate only cares that the Available reason changed. - apimeta.SetStatusCondition(&oldWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "InstancesProvisioning", - Message: "Instances are being provisioned", - }) - apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ReferencedDataNotReady", - Message: testMsgConfigMapNotFound, - }) - - e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} - assert.False(t, pred.Update(e), - "predicate must drop update when only Available changed; "+ - "ReferencedDataReady is unchanged so the reconciler's own write must not re-enqueue itself") -} - -// TestWDPredicate_ReplicasReadyOnlyChange verifies that the predicate DROPS updates -// where only the ReplicasReady condition changed (also written by this reconciler), -// for the same self-trigger prevention reason as the Available-only case. -func TestWDPredicate_ReplicasReadyOnlyChange(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - - refDataCond := metav1.Condition{ - Type: computev1alpha.ReferencedDataReady, - Status: metav1.ConditionTrue, - Reason: computev1alpha.ReferencedDataReasonReady, - Message: "all ready", - LastTransitionTime: metav1.Now(), - } - oldWD, newWD := makeWDPair(2) - apimeta.SetStatusCondition(&oldWD.Status.Conditions, refDataCond) - apimeta.SetStatusCondition(&newWD.Status.Conditions, refDataCond) - - // ReplicasReady changed (more instances became ready) but ReferencedDataReady is identical. - apimeta.SetStatusCondition(&oldWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionFalse, - Reason: reasonReplicasAvailable, - Message: "0/2 replicas available", - }) - apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionTrue, - Reason: reasonReplicasAvailable, - Message: "2/2 replicas available", - }) - - e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} - assert.False(t, pred.Update(e), - "predicate must drop update when only ReplicasReady changed") -} - -// TestWDPredicate_CreateAlwaysPasses verifies that Create events always trigger -// the reconciler regardless of conditions. -func TestWDPredicate_CreateAlwaysPasses(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - e := event.CreateEvent{Object: &computev1alpha.WorkloadDeployment{}} - assert.True(t, pred.Create(e)) -} - -// TestWDPredicate_DeleteAlwaysPasses verifies that Delete events always trigger -// the reconciler. -func TestWDPredicate_DeleteAlwaysPasses(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - e := event.DeleteEvent{Object: &computev1alpha.WorkloadDeployment{}} - assert.True(t, pred.Delete(e)) -} - -// TestWDPredicate_GenericAlwaysPasses verifies that Generic events (e.g. from -// external sources) always trigger the reconciler. -func TestWDPredicate_GenericAlwaysPasses(t *testing.T) { - pred := wdReferencedDataChangedPredicate() - e := event.GenericEvent{Object: &computev1alpha.WorkloadDeployment{}} - assert.True(t, pred.Generic(e)) -} - // makeWDForAvailTest constructs a WorkloadDeployment for Available-condition // unit tests, optionally stamping a ReferencedDataReady condition on status. func makeWDForAvailTest(generation int64, refDataStatus metav1.ConditionStatus, refDataReason, refDataMessage string) *computev1alpha.WorkloadDeployment { diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go index 9d0d44ba..f45fc9fc 100644 --- a/internal/controller/workloaddeployment_federator.go +++ b/internal/controller/workloaddeployment_federator.go @@ -4,8 +4,10 @@ package controller import ( "context" + "errors" "fmt" "strings" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" @@ -13,6 +15,7 @@ import ( apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -48,8 +51,51 @@ const ( cityCodeLabel = "topology.datum.net/city-code" kindWorkloadDeployment = "WorkloadDeployment" + + // companionGuardRequeueAfter is the delay inserted between companion-guard + // requeue attempts. Long enough for the referenced-data controller to delete + // the hub companion; short enough that the deletion is not visibly delayed. + companionGuardRequeueAfter = 5 * time.Second + + // companionGuardTimeout is the maximum wall-clock time the companion ordering + // guard will block a WD deletion after its deletionTimestamp was set. After + // this timeout the guard is bypassed and a warning is logged. The cell-side + // CompanionGCReconciler is the authoritative cleanup path regardless. + companionGuardTimeout = 2 * time.Minute ) +// errCompanionsStillPresent is a sentinel returned by Finalize when hub +// companions are still present in the downstream namespace. Reconcile intercepts +// it, logs at Info (not Error), and returns RequeueAfter so the wait does not +// inflate the error-rate metric. +type errCompanionsStillPresent struct{ namespace string } + +func (e errCompanionsStillPresent) Error() string { + return fmt.Sprintf("companions still present in %s; waiting for referenced-data controller to finish cleanup", e.namespace) +} + +// findCompanionGuardSentinel searches err (including any kerrors.Aggregate +// members) for errCompanionsStillPresent. The finalizer framework wraps errors +// in a kerrors.Aggregate whose Errors() slice contains the per-finalizer +// wrapped errors; because Aggregate does not implement Unwrap() the standard +// errors.As cannot traverse it, so we walk manually. +func findCompanionGuardSentinel(err error) (errCompanionsStillPresent, bool) { + if agg, ok := err.(kerrors.Aggregate); ok { + for _, e := range agg.Errors() { + var sentinel errCompanionsStillPresent + if errors.As(e, &sentinel) { + return sentinel, true + } + } + return errCompanionsStillPresent{}, false + } + var sentinel errCompanionsStillPresent + if errors.As(err, &sentinel) { + return sentinel, true + } + return errCompanionsStillPresent{}, false +} + // WorkloadDeploymentFederator replicates WorkloadDeployments from project // namespaces into the downstream control plane so it can propagate them to the // appropriate POP-cell clusters. @@ -123,6 +169,16 @@ func (r *WorkloadDeploymentFederator) Reconcile(ctx context.Context, req mcrecon finalizationResult, err := r.finalizers.Finalize(ctx, &deployment) if err != nil { + // The finalizer framework returns a kerrors.Aggregate which does not + // implement Unwrap, so errors.As cannot traverse it directly. Walk the + // aggregate manually to find the companion-guard sentinel so we can + // requeue gracefully (Info log + RequeueAfter) instead of inflating the + // error-rate metric. + if waitErr, ok := findCompanionGuardSentinel(err); ok { + logger.Info("deferring PropagationPolicy cleanup: hub companions still present", + "downstreamNamespace", waitErr.namespace) + return ctrl.Result{RequeueAfter: companionGuardRequeueAfter}, nil + } return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) } if finalizationResult.Updated { @@ -210,6 +266,23 @@ func (r *WorkloadDeploymentFederator) Finalize(ctx context.Context, obj client.O } logger.Info("deleted downstream WorkloadDeployment", "downstreamNamespace", downstreamNS) + // The companion ordering guard only applies when this is the last WD for its + // city code — i.e. when cleanupPropagationPolicyIfUnused is actually about to + // delete the PP. If other WDs with the same city code still exist the PP is + // not deleted on this call anyway, so blocking here would deadlock any + // shared-namespace deletion (WD-A blocked by WD-B's companion, forever). + isLastForCity, err := r.isLastDeploymentForCity(ctx, downstreamNS, deployment.Spec.CityCode) + if err != nil { + return finalizer.Result{}, fmt.Errorf("failed to check city-code peers in %s: %w", downstreamNS, err) + } + if isLastForCity { + if blocked, err := r.companionsStillPresent(ctx, downstreamNS, deployment); err != nil { + return finalizer.Result{}, fmt.Errorf("failed to check for remaining companions in %s: %w", downstreamNS, err) + } else if blocked { + return finalizer.Result{}, errCompanionsStillPresent{namespace: downstreamNS} + } + } + if err := r.cleanupPropagationPolicyIfUnused(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { return finalizer.Result{}, err } @@ -217,6 +290,70 @@ func (r *WorkloadDeploymentFederator) Finalize(ctx context.Context, obj client.O return finalizer.Result{}, nil } +// isLastDeploymentForCity reports whether there are no other WorkloadDeployments +// with the same city code as cityCode remaining in downstreamNS (besides the one +// being deleted, which was already removed above). +func (r *WorkloadDeploymentFederator) isLastDeploymentForCity(ctx context.Context, downstreamNS, cityCode string) (bool, error) { + var remaining computev1alpha.WorkloadDeploymentList + if err := r.FederationClient.List(ctx, &remaining, + client.InNamespace(downstreamNS), + client.MatchingLabels{cityCodeLabel: cityCode}, + ); err != nil { + return false, fmt.Errorf("list downstream deployments for city %q: %w", cityCode, err) + } + return len(remaining.Items) == 0, nil +} + +// companionsStillPresent reports whether any companion ConfigMaps or Secrets +// (labeled referenced-data=true) remain in the given downstream namespace. +// Returns (true, nil) when companions are present, (false, nil) when the +// namespace is clear, and (false, err) on API failure. +// +// deployment is the project-plane WD being finalized. Its DeletionTimestamp is +// used to bound how long the guard can block: if the WD has been terminating +// for longer than companionGuardTimeout the guard is bypassed with a warning. +// The cell-side CompanionGCReconciler is the authoritative cleanup path and +// will eventually clean up regardless. +func (r *WorkloadDeploymentFederator) companionsStillPresent( + ctx context.Context, + downstreamNS string, + deployment *computev1alpha.WorkloadDeployment, +) (bool, error) { + // If the WD has been terminating longer than the timeout, skip the check so + // a wedged referenced-data controller cannot block deletion permanently. + if deployment.DeletionTimestamp != nil { + age := time.Since(deployment.DeletionTimestamp.Time) + if age > companionGuardTimeout { + log.FromContext(ctx).Info( + "companion-guard timeout reached; bypassing guard — cell-side GC will clean up remaining companions", + "age", age.Round(time.Second), + "timeout", companionGuardTimeout, + "downstreamNamespace", downstreamNS, + ) + return false, nil + } + } + + companionSelector := client.MatchingLabels{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + } + nsSelector := client.InNamespace(downstreamNS) + + var cms corev1.ConfigMapList + if err := r.FederationClient.List(ctx, &cms, nsSelector, companionSelector); err != nil { + return false, fmt.Errorf("list companion ConfigMaps: %w", err) + } + if len(cms.Items) > 0 { + return true, nil + } + + var secrets corev1.SecretList + if err := r.FederationClient.List(ctx, &secrets, nsSelector, companionSelector); err != nil { + return false, fmt.Errorf("list companion Secrets: %w", err) + } + return len(secrets.Items) > 0, nil +} + // ensureDownstreamNamespace creates or updates the downstream namespace, stamping // it with the upstream tracking labels that MappedNamespaceResourceStrategy uses. // This allows the InstanceProjector to resolve the project namespace name via a