diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ddfaa170..caa00ff3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -15,9 +15,9 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Run linter uses: golangci/golangci-lint-action@v8 with: - version: v2.1.5 + version: v2.12.2 diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 8949c76b..5dcc90bb 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -18,6 +18,7 @@ jobs: secrets: inherit publish-kustomize-bundles: + needs: publish-container-image permissions: id-token: write contents: read @@ -26,4 +27,6 @@ jobs: with: bundle-name: ghcr.io/datum-cloud/compute-kustomize bundle-path: config + image-name: ghcr.io/datum-cloud/compute + image-overlays: config/base/manager secrets: inherit diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml index 8429bf2d..9bede775 100644 --- a/.github/workflows/test-e2e.yml +++ b/.github/workflows/test-e2e.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Install the latest version of kind run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 834d33a0..462cbf3d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Running Tests run: | diff --git a/.gitignore b/.gitignore index 2b0c6e44..d5cc564d 100644 --- a/.gitignore +++ b/.gitignore @@ -14,8 +14,8 @@ # Output of the go coverage tool, specifically when used with LiteIDE *.out -# Dependency directories (remove the comment below to include it) -# vendor/ +# Dependency directories +vendor/ # Go workspace file go.work @@ -25,3 +25,6 @@ go.work.sum .env bin/ + +# Local e2e environment 
artefacts (Kind kubeconfigs, etc.) +tmp/ diff --git a/.golangci.yml b/.golangci.yml index a7246fbb..e0342bda 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -35,6 +35,16 @@ linters: - dupl - lll path: internal/* + # field.ErrorList{} is the idiomatic Kubernetes validation init pattern; + # preallocating requires knowing the error count in advance which is not + # possible in recursive validation helpers. + - linters: + - prealloc + path: internal/validation/ + # Append-built slices in this package are clearer without prealloc. + - linters: + - prealloc + path: internal/controller/instancecontrol/ paths: - third_party$ - builtin$ diff --git a/Makefile b/Makefile index 61744a36..3d6a3e2e 100644 --- a/Makefile +++ b/Makefile @@ -177,7 +177,7 @@ KUSTOMIZE_VERSION ?= v5.5.0 CONTROLLER_TOOLS_VERSION ?= v0.16.4 DEFAULTER_GEN_VERSION ?= v0.32.3 ENVTEST_VERSION ?= release-0.19 -GOLANGCI_LINT_VERSION ?= v2.1.5 +GOLANGCI_LINT_VERSION ?= v2.12.2 # renovate: datasource=go depName=fybrik.io/crdoc CRDOC_VERSION ?= v0.6.4 diff --git a/cmd/main.go b/cmd/main.go index 3bb44bc9..4358a087 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -18,20 +18,27 @@ import ( "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcsingle "sigs.k8s.io/multicluster-runtime/providers/single" + karmadaclusterv1alpha1 "github.com/karmada-io/api/cluster/v1alpha1" + 
karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/config" "go.datum.net/compute/internal/controller" + "go.datum.net/compute/internal/features" + quotametrics "go.datum.net/compute/internal/quota" computewebhook "go.datum.net/compute/internal/webhook" computev1alphawebhooks "go.datum.net/compute/internal/webhook/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" @@ -41,6 +48,10 @@ import ( // +kubebuilder:scaffold:imports ) +// singleClusterName is the fixed cluster name that mcsingle.New registers. +// All single-mode wiring that references this cluster must use this constant. +const singleClusterName = "single" + var ( scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") @@ -51,6 +62,11 @@ var ( gitCommit = "unknown" gitTreeState = "unknown" buildDate = "unknown" + + // federationRestConfig holds the REST config for the Karmada federation control + // plane. It is populated from --federation-kubeconfig when set, and is nil + // when the flag is omitted. 
+ federationRestConfig *rest.Config ) func init() { @@ -61,22 +77,45 @@ func init() { utilruntime.Must(computev1alpha.AddToScheme(scheme)) utilruntime.Must(networkingv1alpha.AddToScheme(scheme)) utilruntime.Must(quotav1alpha1.AddToScheme(scheme)) + utilruntime.Must(karmadapolicyv1alpha1.Install(scheme)) + utilruntime.Must(karmadaclusterv1alpha1.Install(scheme)) // +kubebuilder:scaffold:scheme } +//nolint:gocyclo // main wires all controller paths; complexity is inherent to startup sequencing func main() { var enableLeaderElection bool var leaderElectionNamespace string var probeAddr string var serverConfigFile string + var federationKubeconfig string + var federationContext string + var enableManagementControllers bool + var enableCellControllers bool flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. "+ "Enabling this will ensure there is only one active controller manager.") flag.StringVar(&leaderElectionNamespace, "leader-elect-namespace", "", "The namespace to use for leader election.") + flag.StringVar(&federationKubeconfig, "federation-kubeconfig", "", + "Path to the kubeconfig file for the Karmada federation control plane. "+ + "Required when --enable-management-controllers is set. "+ + "When omitted, federation features are disabled.") + flag.StringVar(&federationContext, "federation-context", "", + "Context to use from the federation kubeconfig. 
When omitted, the current context is used.") + flag.BoolVar(&enableManagementControllers, "enable-management-controllers", false, + "Enable management-plane controllers (WorkloadDeploymentFederator, InstanceProjector).") + flag.BoolVar(&enableCellControllers, "enable-cell-controllers", false, + "Enable cell controllers (WorkloadDeploymentReconciler, InstanceReconciler).") + + var featureGatesFlag string + flag.StringVar(&featureGatesFlag, "feature-gates", "", + "A set of key=value pairs that describe feature gates for the compute operator. "+ + "Example: --feature-gates=NetworkingIntegration=false. "+ + "Available features: NetworkingIntegration (default=true).") opts := zap.Options{ Development: true, @@ -87,8 +126,40 @@ func main() { opts.BindFlags(flag.CommandLine) flag.Parse() ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + if featureGatesFlag != "" { + if err := features.MutableFeatureGate.Set(featureGatesFlag); err != nil { + setupLog.Error(err, "unable to parse feature gates", "feature-gates", featureGatesFlag) + os.Exit(1) + } + } + setupLog.Info("feature gates", "NetworkingIntegration", features.FeatureGate.Enabled(features.NetworkingIntegration)) + if federationKubeconfig != "" { + loader := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( + &clientcmd.ClientConfigLoadingRules{ExplicitPath: federationKubeconfig}, + &clientcmd.ConfigOverrides{CurrentContext: federationContext}, + ) + var err error + federationRestConfig, err = loader.ClientConfig() + if err != nil { + setupLog.Error(err, "unable to load federation kubeconfig", "path", federationKubeconfig) + os.Exit(1) + } + setupLog.Info("federation kubeconfig loaded", "path", federationKubeconfig) + } + + // Fail loud: management controllers require a federation kubeconfig. Silently + // skipping them when --enable-management-controllers is set would leave + // federation and instance projection broken with no visible signal.
+ if enableManagementControllers && federationRestConfig == nil { + setupLog.Error(nil, + "management controllers enabled but no federation kubeconfig configured", + "hint", "set --federation-kubeconfig") + os.Exit(1) + } + setupLog.Info("starting compute", "version", version, "gitCommit", gitCommit, @@ -96,24 +167,28 @@ func main() { "buildDate", buildDate, ) - var serverConfig config.WorkloadOperator - var configData []byte - if len(serverConfigFile) > 0 { - var err error - configData, err = os.ReadFile(serverConfigFile) - if err != nil { - setupLog.Error(fmt.Errorf("unable to read server config from %q", serverConfigFile), "") - os.Exit(1) - } - } - - if err := runtime.DecodeInto(codecs.UniversalDecoder(), configData, &serverConfig); err != nil { - setupLog.Error(err, "unable to decode server config") + serverConfig, err := loadServerConfig(serverConfigFile) + if err != nil { + setupLog.Error(err, "unable to load server config") os.Exit(1) } setupLog.Info("server config", "config", serverConfig) + quotaRestConfig, err := serverConfig.Discovery.QuotaRestConfig() + if err != nil { + setupLog.Error(err, "unable to load quota REST config") + os.Exit(1) + } + if quotaRestConfig != nil { + setupLog.Info("quota REST config loaded", "path", serverConfig.Discovery.QuotaKubeconfigPath) + quotametrics.EnforcementEnabled.Set(1) + } else { + setupLog.Error(nil, "quota enforcement is DISABLED — workloads will schedule without quota accounting; "+ + "set quotaKubeconfigPath in server config to enable enforcement") + quotametrics.EnforcementEnabled.Set(0) + } + cfg := ctrl.GetConfigOrDie() deploymentCluster, err := cluster.New(cfg, func(o *cluster.Options) { @@ -124,7 +199,9 @@ func main() { os.Exit(1) } - runnables, provider, err := initializeClusterDiscovery(serverConfig, deploymentCluster, scheme) + runnables, provider, edgeClusterName, err := initializeClusterDiscovery( + serverConfig, deploymentCluster, scheme, + ) if err != nil { setupLog.Error(err, "unable to initialize 
cluster discovery") os.Exit(1) @@ -176,21 +253,62 @@ func main() { os.Exit(1) } - if err = (&controller.WorkloadReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Workload") - os.Exit(1) + if enableManagementControllers { + if err = (&controller.WorkloadReconciler{}).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Workload") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") - os.Exit(1) + + // Build a single federation client shared across all controllers that need to + // read or write to the Karmada federation control plane. This is the hub that + // the management controllers federate through and that edge cells write back to. + var federationClient client.Client + if federationRestConfig != nil { + federationClient, err = client.New(federationRestConfig, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "unable to create federation client") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentScheduler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentScheduler") - os.Exit(1) + + if enableCellControllers { + if err = (&controller.WorkloadDeploymentReconciler{ + NetworkingEnabled: features.FeatureGate.Enabled(features.NetworkingIntegration), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") + os.Exit(1) + } } - if err = (&controller.InstanceReconciler{}).SetupWithManager(mgr, deploymentCluster); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Instance") - os.Exit(1) + + if enableCellControllers { + clusterNameForProject := func(_ string) multicluster.ClusterName { + return 
multicluster.ClusterName(singleClusterName) + } + instanceReconciler := &controller.InstanceReconciler{FederationClient: federationClient} + err = instanceReconciler.SetupWithManager( + mgr, + quotaRestConfig, + controller.NewSingleModeProjectID(mgr), + controller.NewSingleModeProjectNamespace(mgr), + edgeClusterName, + clusterNameForProject, + ) + if err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Instance") + os.Exit(1) + } + } + + // The fail-loud guard above ensures federationRestConfig is non-nil when + // management controllers are enabled; the nil check here is defensive. + if enableManagementControllers && federationRestConfig != nil { + extra, err := setupManagementControllers(mgr, federationClient) + if err != nil { + setupLog.Error(err, "unable to set up management controllers") + os.Exit(1) + } + runnables = append(runnables, extra...) } if serverConfig.WebhookServer != nil { @@ -223,11 +341,6 @@ func main() { }) } - setupLog.Info("starting cluster discovery provider") - g.Go(func() error { - return ignoreCanceled(provider.Run(ctx, mgr)) - }) - setupLog.Info("starting multicluster manager") g.Go(func() error { return ignoreCanceled(mgr.Start(ctx)) @@ -239,51 +352,33 @@ func main() { } } -type runnableProvider interface { - multicluster.Provider - Run(context.Context, mcmanager.Manager) error -} - -// Needed until we contribute the patch in the following PR again (need to sign CLA): -// -// See: https://github.com/kubernetes-sigs/multicluster-runtime/pull/18 -type wrappedSingleClusterProvider struct { - multicluster.Provider - cluster cluster.Cluster -} - -func (p *wrappedSingleClusterProvider) Run(ctx context.Context, mgr mcmanager.Manager) error { - if err := mgr.Engage(ctx, "single", p.cluster); err != nil { - return err - } - return p.Provider.(runnableProvider).Run(ctx, mgr) -} - func initializeClusterDiscovery( serverConfig config.WorkloadOperator, deploymentCluster cluster.Cluster, scheme *runtime.Scheme, -) 
(runnables []manager.Runnable, provider runnableProvider, err error) { +) (runnables []manager.Runnable, provider multicluster.Provider, edgeClusterName string, err error) { runnables = append(runnables, deploymentCluster) switch serverConfig.Discovery.Mode { case multiclusterproviders.ProviderSingle: - provider = &wrappedSingleClusterProvider{ - Provider: mcsingle.New("single", deploymentCluster), - cluster: deploymentCluster, + provider = mcsingle.New(multicluster.ClusterName(singleClusterName), deploymentCluster) + edgeClusterName = serverConfig.Discovery.ClusterName + if edgeClusterName == "" { + edgeClusterName = singleClusterName } case multiclusterproviders.ProviderMilo: discoveryRestConfig, err := serverConfig.Discovery.DiscoveryRestConfig() if err != nil { - return nil, nil, fmt.Errorf("unable to get discovery rest config: %w", err) + return nil, nil, "", fmt.Errorf("unable to get discovery rest config: %w", err) } projectRestConfig, err := serverConfig.Discovery.ProjectRestConfig() if err != nil { - return nil, nil, fmt.Errorf("unable to get project rest config: %w", err) + return nil, nil, "", fmt.Errorf("unable to get project rest config: %w", err) } discoveryManager, err := manager.New(discoveryRestConfig, manager.Options{ + Metrics: metricsserver.Options{BindAddress: "0"}, Client: client.Options{ Cache: &client.CacheOptions{ Unstructured: true, @@ -291,7 +386,7 @@ func initializeClusterDiscovery( }, }) if err != nil { - return nil, nil, fmt.Errorf("unable to set up overall controller manager: %w", err) + return nil, nil, "", fmt.Errorf("unable to set up overall controller manager: %w", err) } provider, err = milomulticluster.New(discoveryManager, milomulticluster.Options{ @@ -304,10 +399,11 @@ func initializeClusterDiscovery( ProjectRestConfig: projectRestConfig, }) if err != nil { - return nil, nil, fmt.Errorf("unable to create datum project provider: %w", err) + return nil, nil, "", fmt.Errorf("unable to create datum project provider: %w", err) } 
runnables = append(runnables, discoveryManager) + edgeClusterName = serverConfig.Discovery.ClusterName // case providers.ProviderKind: // provider = mckind.New(mckind.Options{ @@ -319,13 +415,29 @@ func initializeClusterDiscovery( // }) default: - return nil, nil, fmt.Errorf( + return nil, nil, "", fmt.Errorf( "unsupported cluster discovery mode %s", serverConfig.Discovery.Mode, ) } - return runnables, provider, nil + return runnables, provider, edgeClusterName, nil +} + +func loadServerConfig(path string) (config.WorkloadOperator, error) { + var serverConfig config.WorkloadOperator + var configData []byte + if len(path) > 0 { + var err error + configData, err = os.ReadFile(path) + if err != nil { + return serverConfig, fmt.Errorf("unable to read server config from %q: %w", path, err) + } + } + if err := runtime.DecodeInto(codecs.UniversalDecoder(), configData, &serverConfig); err != nil { + return serverConfig, fmt.Errorf("unable to decode server config: %w", err) + } + return serverConfig, nil } func ignoreCanceled(err error) error { @@ -334,3 +446,47 @@ func ignoreCanceled(err error) error { } return err } + +// setupManagementControllers wires the WorkloadDeploymentFederator and +// InstanceProjector onto mgr. It returns any additional Runnable objects that +// must be started alongside the main manager (the federation manager used by +// InstanceProjector). Called only when management controllers are enabled and +// a federation REST config is available. +func setupManagementControllers(mgr mcmanager.Manager, federationClient client.Client) ([]manager.Runnable, error) { + // The federation manager provides a cached, watchable handle to the Karmada + // federation control plane. It backs the InstanceProjector's Instance watch + // and the WorkloadDeploymentFederator's downstream WorkloadDeployment status + // watch. A manager.Manager embeds a cluster.Cluster, so it can be passed + // directly anywhere a watchable federation cluster source is required. 
+ federationMgr, err := manager.New(federationRestConfig, manager.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{BindAddress: "0"}, + }) + if err != nil { + return nil, fmt.Errorf("federation manager: %w", err) + } + + // The federator watches both the project WD (via the multicluster manager) + // and the downstream Karmada WD (via the federation cluster) so that status + // aggregated downstream by Karmada is mirrored back to the project WD + // immediately instead of on the next informer resync. + federator := &controller.WorkloadDeploymentFederator{ + FederationClient: federationClient, + FederationCluster: federationMgr, + } + if err := federator.SetupWithManager(mgr); err != nil { + return nil, fmt.Errorf("WorkloadDeploymentFederator: %w", err) + } + + // InstanceProjector runs in the management plane, watches Instances written + // back by POP-cell operators to the Karmada federation control plane, and + // projects them into the corresponding project namespaces via the multicluster manager. 
+ if err = (&controller.InstanceProjector{ + FederationClient: federationClient, + MCManager: mgr, + }).SetupWithManager(federationMgr); err != nil { + return nil, fmt.Errorf("InstanceProjector: %w", err) + } + + return []manager.Runnable{federationMgr}, nil +} diff --git a/config/base/downstream-rbac/kustomization.yaml b/config/base/downstream-rbac/kustomization.yaml new file mode 100644 index 00000000..4c4dbe44 --- /dev/null +++ b/config/base/downstream-rbac/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - rbac.yaml diff --git a/config/base/downstream-rbac/rbac.yaml b/config/base/downstream-rbac/rbac.yaml new file mode 100644 index 00000000..1937ef02 --- /dev/null +++ b/config/base/downstream-rbac/rbac.yaml @@ -0,0 +1,35 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: compute-manager +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list", "watch", "create", "update", "patch"] + - apiGroups: ["compute.datumapis.com"] + resources: ["workloaddeployments", "workloaddeployments/status", "instances", "instances/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["policy.karmada.io"] + resources: ["propagationpolicies", "clusterpropagationpolicies"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["cluster.karmada.io"] + resources: ["clusters"] + verbs: ["get", "list", "watch"] + - apiGroups: ["work.karmada.io"] + resources: ["resourcebindings", "clusterresourcebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["config.karmada.io"] + resources: ["resourceinterpreterwebhookconfigurations", "resourceinterpretercustomizations"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: 
compute-manager +subjects: + - kind: User + name: system:serviceaccount:compute-system:compute-manager diff --git a/config/base/federation/kustomization.yaml b/config/base/federation/kustomization.yaml new file mode 100644 index 00000000..1261dac6 --- /dev/null +++ b/config/base/federation/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../crd/bases/compute.datumapis.com_instances.yaml + - ../crd/bases/compute.datumapis.com_workloaddeployments.yaml + - ../crd/bases/compute.datumapis.com_workloads.yaml + +components: + - ../../components/federation diff --git a/config/base/manager/manager.yaml b/config/base/manager/manager.yaml index e2c06e97..8ef18135 100644 --- a/config/base/manager/manager.yaml +++ b/config/base/manager/manager.yaml @@ -26,14 +26,33 @@ spec: seccompProfile: type: RuntimeDefault containers: - - command: + - name: manager + command: - /manager args: - - --leader-elect - - --health-probe-bind-address=:8081 - - --server-config=/config/config.yaml + - --leader-elect=$(LEADER_ELECT) + - --health-probe-bind-address=$(HEALTH_PROBE_BIND_ADDRESS) + - --server-config=$(SERVER_CONFIG) + - --federation-kubeconfig=$(FEDERATION_KUBECONFIG) + - --enable-management-controllers=$(ENABLE_MANAGEMENT_CONTROLLERS) + - --enable-cell-controllers=$(ENABLE_CELL_CONTROLLERS) + - --feature-gates=$(FEATURE_GATES) + env: + - name: LEADER_ELECT + value: "true" + - name: HEALTH_PROBE_BIND_ADDRESS + value: ":8081" + - name: SERVER_CONFIG + value: /config/config.yaml + - name: FEDERATION_KUBECONFIG + value: "" + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "false" + - name: ENABLE_CELL_CONTROLLERS + value: "false" + - name: FEATURE_GATES + value: "" image: ghcr.io/datum-cloud/compute:latest - name: manager ports: - containerPort: 9443 name: webhook-server @@ -66,7 +85,7 @@ spec: volumeMounts: - name: config mountPath: /config - serviceAccountName: compute + serviceAccountName: compute-manager 
terminationGracePeriodSeconds: 10 volumes: - name: config diff --git a/config/base/manager/service_account.yaml b/config/base/manager/service_account.yaml index f8711deb..cc6bd6cc 100644 --- a/config/base/manager/service_account.yaml +++ b/config/base/manager/service_account.yaml @@ -4,4 +4,4 @@ metadata: labels: app.kubernetes.io/name: compute app.kubernetes.io/managed-by: kustomize - name: compute + name: compute-manager diff --git a/config/components/cell-controllers/kustomization.yaml b/config/components/cell-controllers/kustomization.yaml new file mode 100644 index 00000000..3f32da3b --- /dev/null +++ b/config/components/cell-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_CELL_CONTROLLERS + value: "true" diff --git a/config/components/controller_rbac/metrics_auth_role_binding.yaml b/config/components/controller_rbac/metrics_auth_role_binding.yaml index 1ea3d974..ada1a1de 100644 --- a/config/components/controller_rbac/metrics_auth_role_binding.yaml +++ b/config/components/controller_rbac/metrics_auth_role_binding.yaml @@ -8,4 +8,4 @@ roleRef: name: compute-metrics-auth-role subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/controller_rbac/role.yaml b/config/components/controller_rbac/role.yaml index 5d803d2c..a634f512 100644 --- a/config/components/controller_rbac/role.yaml +++ b/config/components/controller_rbac/role.yaml @@ -4,6 +4,20 @@ kind: ClusterRole metadata: name: compute rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list - apiGroups: - compute.datumapis.com resources: @@ -36,3 +50,36 @@ rules: - 
get - patch - update +- apiGroups: + - networking.datumapis.com + resources: + - locations + - networkcontexts + - subnets + verbs: + - get + - list + - watch +- apiGroups: + - networking.datumapis.com + resources: + - networkbindings + - subnetclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - quota.miloapis.com + resources: + - resourceclaims + verbs: + - create + - delete + - get + - list + - watch diff --git a/config/components/controller_rbac/role_binding.yaml b/config/components/controller_rbac/role_binding.yaml index 6256bf3f..2f3e2676 100644 --- a/config/components/controller_rbac/role_binding.yaml +++ b/config/components/controller_rbac/role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/federation/kustomization.yaml b/config/components/federation/kustomization.yaml new file mode 100644 index 00000000..3ba207ff --- /dev/null +++ b/config/components/federation/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +resources: + - workloaddeployment-interpreter.yaml diff --git a/config/components/federation/workloaddeployment-interpreter.yaml b/config/components/federation/workloaddeployment-interpreter.yaml new file mode 100644 index 00000000..2743a63b --- /dev/null +++ b/config/components/federation/workloaddeployment-interpreter.yaml @@ -0,0 +1,28 @@ +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: workloaddeployment +spec: + target: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + customizations: + statusReflection: + luaScript: | + function ReflectStatus(observedObj) + if observedObj.status == nil then + return nil + end + return observedObj.status + end + statusAggregation: + luaScript: | + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or 
#statusItems == 0 then + return desiredObj + end + if statusItems[1].status ~= nil then + desiredObj.status = statusItems[1].status + end + return desiredObj + end diff --git a/config/components/iam/roles/compute-admin.yaml b/config/components/iam/roles/compute-admin.yaml index 4405bbd2..6bfaafa2 100644 --- a/config/components/iam/roles/compute-admin.yaml +++ b/config/components/iam/roles/compute-admin.yaml @@ -12,4 +12,5 @@ spec: includedPermissions: - compute.datumapis.com/workloads.create - compute.datumapis.com/workloads.update + - compute.datumapis.com/workloads.patch - compute.datumapis.com/workloads.delete diff --git a/config/components/leader_election/leader_election_role_binding.yaml b/config/components/leader_election/leader_election_role_binding.yaml index a5fe9996..d6783c07 100644 --- a/config/components/leader_election/leader_election_role_binding.yaml +++ b/config/components/leader_election/leader_election_role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute-leader-election subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/management-controllers/kustomization.yaml b/config/components/management-controllers/kustomization.yaml new file mode 100644 index 00000000..d1e29e7f --- /dev/null +++ b/config/components/management-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "true" diff --git a/config/components/quota-credentials/kustomization.yaml b/config/components/quota-credentials/kustomization.yaml new file mode 100644 index 00000000..ffc9a6d8 --- /dev/null +++ b/config/components/quota-credentials/kustomization.yaml @@ -0,0 +1,26 @@ +apiVersion: 
kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + volumeMounts: + - name: quota-credentials + mountPath: /etc/quota-credentials + readOnly: true + volumes: + - name: quota-credentials + secret: + secretName: compute-quota-credentials + optional: true diff --git a/config/components/service-catalog/service-configuration.yaml b/config/components/service-catalog/service-configuration.yaml index 202ac8af..8c29a50e 100644 --- a/config/components/service-catalog/service-configuration.yaml +++ b/config/components/service-catalog/service-configuration.yaml @@ -6,6 +6,9 @@ spec: serviceRef: name: compute phase: Published + locations: + supportedClasses: + - datum-managed monitoredResourceTypes: - type: compute.datumapis.com/Instance displayName: Compute Instance @@ -44,6 +47,26 @@ spec: description: Seconds the instance has been in a running state. kind: Cumulative unit: s + - name: compute.datumapis.com/workloads + displayName: Compute Workloads + description: Number of compute workloads. + kind: Gauge + unit: '{workload}' + - name: compute.datumapis.com/instances + displayName: Compute Instances + description: Number of compute instances. + kind: Gauge + unit: '{instance}' + - name: compute.datumapis.com/vcpus + displayName: Compute vCPUs + description: Number of vCPUs allocated across all instances. + kind: Gauge + unit: '{millicore}' + - name: compute.datumapis.com/memory + displayName: Compute Memory + description: Memory allocated across all instances. 
+ kind: Gauge + unit: MiB billing: consumerDestinations: - monitoredResourceType: compute.datumapis.com/Instance @@ -53,13 +76,13 @@ spec: - compute.datumapis.com/instance/cpu-allocated - compute.datumapis.com/instance/memory-allocated - compute.datumapis.com/instance/uptime-seconds + quota: metricRules: - selector: apiGroup: compute.datumapis.com kind: Workload metricCosts: compute.datumapis.com/workloads: 1 - quota: limits: - name: compute-workloads metric: compute.datumapis.com/workloads diff --git a/config/overlays/cell/disable_webhook_patch.yaml b/config/overlays/cell/disable_webhook_patch.yaml new file mode 100644 index 00000000..85b57f09 --- /dev/null +++ b/config/overlays/cell/disable_webhook_patch.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: compute-config +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: WorkloadOperator + metricsServer: + bindAddress: "0" + discovery: + quotaKubeconfigPath: /etc/quota-credentials/kubeconfig diff --git a/config/overlays/cell/kustomization.yaml b/config/overlays/cell/kustomization.yaml new file mode 100644 index 00000000..80925ee2 --- /dev/null +++ b/config/overlays/cell/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# All namespaced resources land here. Override via Flux's targetNamespace +# (or by editing this overlay) to install into a different namespace. 
+namespace: compute-system + +resources: + - ../../base/manager +components: + - ../../components/leader_election + - ../../components/controller_rbac + - ../../components/cell-controllers + - ../../components/quota-credentials + +patches: +- path: disable_webhook_patch.yaml diff --git a/config/overlays/management-plane/discovery_mode_patch.yaml b/config/overlays/management-plane/discovery_mode_patch.yaml new file mode 100644 index 00000000..97bf762c --- /dev/null +++ b/config/overlays/management-plane/discovery_mode_patch.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: compute-config +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: WorkloadOperator + metricsServer: + bindAddress: "0" + webhookServer: {} + discovery: + mode: milo diff --git a/config/overlays/management-plane/downstream_kubeconfig_patch.yaml b/config/overlays/management-plane/downstream_kubeconfig_patch.yaml new file mode 100644 index 00000000..7b3b764b --- /dev/null +++ b/config/overlays/management-plane/downstream_kubeconfig_patch.yaml @@ -0,0 +1,29 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: compute-manager +spec: + template: + spec: + containers: + - name: manager + env: + - name: FEDERATION_KUBECONFIG + value: /etc/kubernetes/downstream/auth/downstream-kubeconfig.yaml + volumeMounts: + - name: downstream-kubeconfig + mountPath: /etc/kubernetes/downstream/auth + readOnly: true + - name: karmada-token + mountPath: /etc/kubernetes/karmada-token + readOnly: true + volumes: + - name: downstream-kubeconfig + configMap: + name: compute-downstream-kubeconfig + - name: karmada-token + projected: + sources: + - serviceAccountToken: + audience: https://karmada-apiserver.karmada-system.svc.cluster.local:5443 + path: token diff --git a/config/overlays/management-plane/kustomization.yaml b/config/overlays/management-plane/kustomization.yaml new file mode 100644 index 00000000..dae13c58 --- /dev/null +++ 
b/config/overlays/management-plane/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# All namespaced resources land here. Override via Flux's targetNamespace +# (or by editing this overlay) to install into a different namespace. +namespace: compute-system + +resources: + - ../../base/manager + - ../../base/webhook +components: + - ../../components/leader_election + - ../../components/controller_rbac + - ../../components/resource-metrics + - ../../components/high-availability + - ../../components/management-controllers + - ../../components/csi-webhook-cert + +patches: +- path: downstream_kubeconfig_patch.yaml +- path: discovery_mode_patch.yaml diff --git a/config/overlays/single-cluster/kustomization.yaml b/config/overlays/single-cluster/kustomization.yaml index 7a2d0320..4d72934e 100644 --- a/config/overlays/single-cluster/kustomization.yaml +++ b/config/overlays/single-cluster/kustomization.yaml @@ -15,3 +15,5 @@ components: - ../../components/resource-metrics - ../../components/high-availability - ../../components/csi-webhook-cert + - ../../components/management-controllers + - ../../components/cell-controllers diff --git a/go.mod b/go.mod index 19fc0103..48bab65b 100644 --- a/go.mod +++ b/go.mod @@ -1,31 +1,34 @@ module go.datum.net/compute -go 1.24.0 - -toolchain go1.24.2 +go 1.25.0 require ( + github.com/go-logr/logr v1.4.3 github.com/google/go-cmp v0.7.0 - github.com/onsi/ginkgo/v2 v2.23.4 - github.com/onsi/gomega v1.37.0 + github.com/karmada-io/api v1.15.0 + github.com/onsi/ginkgo/v2 v2.27.2 + github.com/onsi/gomega v1.38.2 + github.com/prometheus/client_golang v1.23.2 github.com/stretchr/testify v1.11.1 - go.datum.net/network-services-operator v0.1.0 - go.miloapis.com/milo v0.24.11 - golang.org/x/crypto v0.39.0 - golang.org/x/sync v0.16.0 + go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359 + go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42 + golang.org/x/crypto 
v0.45.0 + golang.org/x/sync v0.18.0 google.golang.org/protobuf v1.36.11 - k8s.io/api v0.33.1 - k8s.io/apimachinery v0.33.2 - k8s.io/client-go v0.33.1 - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 - sigs.k8s.io/controller-runtime v0.21.0 - sigs.k8s.io/gateway-api v1.2.1 - sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8 + k8s.io/api v0.35.0 + k8s.io/apimachinery v0.35.0 + k8s.io/client-go v0.35.0 + k8s.io/component-base v0.35.0 + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 + sigs.k8s.io/controller-runtime v0.23.3 + sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c + sigs.k8s.io/multicluster-runtime v0.23.3 ) require ( - cel.dev/expr v0.19.1 // indirect - github.com/antlr4-go/antlr/v4 v4.13.0 // indirect + cel.dev/expr v0.24.0 // indirect + github.com/Masterminds/semver/v3 v3.4.0 // indirect + github.com/antlr4-go/antlr/v4 v4.13.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect @@ -35,74 +38,70 @@ require ( github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect - github.com/fxamacker/cbor/v2 v2.8.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.1 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.1 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect - github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/cel-go v0.23.2 // indirect - github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/cel-go v0.26.0 // indirect + github.com/google/gnostic-models v0.7.0 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect 
github.com/google/uuid v1.6.0 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.64.0 // indirect - github.com/prometheus/procfs v0.16.1 // indirect - github.com/spf13/cobra v1.9.1 // indirect - github.com/spf13/pflag v1.0.7 // indirect - github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.17.0 // indirect + github.com/spf13/cobra v1.10.0 // indirect + github.com/spf13/pflag v1.0.9 // indirect + github.com/stoewer/go-strcase v1.3.1 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect - go.opentelemetry.io/otel/metric v1.35.0 // indirect - go.opentelemetry.io/otel/sdk v1.34.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect - go.opentelemetry.io/proto/otlp v1.4.0 // indirect - go.uber.org/automaxprocs v1.6.0 // 
indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect + go.opentelemetry.io/otel v1.37.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0 // indirect + go.opentelemetry.io/otel/metric v1.37.0 // indirect + go.opentelemetry.io/otel/sdk v1.37.0 // indirect + go.opentelemetry.io/otel/trace v1.37.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect - golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/net v0.41.0 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 // indirect + golang.org/x/mod v0.29.0 // indirect + golang.org/x/net v0.47.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.26.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // indirect + golang.org/x/text v0.31.0 // indirect golang.org/x/time v0.12.0 // indirect - golang.org/x/tools v0.33.0 // indirect + golang.org/x/tools v0.38.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/grpc v1.71.1 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0 // indirect + google.golang.org/grpc v1.74.2 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect 
gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.33.1 // indirect - k8s.io/apiserver v0.33.1 // indirect - k8s.io/component-base v0.33.1 // indirect + k8s.io/apiextensions-apiserver v0.35.0 // indirect + k8s.io/apiserver v0.35.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect - sigs.k8s.io/yaml v1.5.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index c472bd8b..42a98554 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,9 @@ -cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= -cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= -github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= -github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= +cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= +github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod 
h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -17,16 +19,22 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= -github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch v5.9.11+incompatible h1:ixHHqfcGvxhWkniF1tWxBHA0yb4Z+d1UQi45df52xW8= +github.com/evanphx/json-patch v5.9.11+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= -github.com/fxamacker/cbor/v2 v2.8.0 h1:fFtUGXUzXPHTIUdne5+zzMPTfffl3RD5qYnkY40vtxU= -github.com/fxamacker/cbor/v2 v2.8.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod 
h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -42,17 +50,16 @@ github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZ github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= +github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/cel-go v0.23.2 h1:UdEe3CvQh3Nv+E/j9r1Y//WO0K0cSyD7/y0bzyLIMI4= -github.com/google/cel-go v0.23.2/go.mod h1:52Pb6QsDbC5kvgxvZhiL9QX1oZEkcUF/ZqaPx1J5Wwo= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= 
-github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/cel-go v0.26.0 h1:DPGjXackMpJWH680oGY4lZhYjIameYmR+/6RBdDGmaI= +github.com/google/cel-go v0.26.0/go.mod h1:A9O8OU9rdvrK5MQyrqfIxo1a0u4g3sF8KB6PUIaryMM= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -62,18 +69,18 @@ github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+uBhcekkmy4IkffJww= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1/go.mod h1:Zanoh4+gvIgluNqcfMVTJueD4wSS5hT7zTt4Mrutd90= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/karmada-io/api v1.15.0 h1:6Dx+Q36LaoPqKM4gduUuhSBQ3eKjKusjkvmggLpt9xs= +github.com/karmada-io/api v1.15.0/go.mod h1:wNbBEmXYkrRLSC2VgmXizIG12FW+/sAUF7UIz5WlYAU= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= @@ -84,42 +91,43 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent 
v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= -github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= -github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= -github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= -github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= -github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= -github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4= -github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= -github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 
-github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= -github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= -github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= -github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= +github.com/spf13/cobra v1.10.0 h1:a5/WeUlSDCvV5a45ljW2ZFtV0bTDpkfSAj3uqB6Sc+0= +github.com/spf13/cobra v1.10.0/go.mod h1:9dhySC7dnTtEiqzmqfkLj47BslqLCUPMXjG2lj/NgoE= +github.com/spf13/pflag v1.0.8/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.3.1 h1:iS0MdW+kVTxgMoE1LAZyMiYJFKlOzLooE4MxjirtkAs= +github.com/stoewer/go-strcase v1.3.1/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -129,160 +137,125 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod 
h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= -github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.datum.net/network-services-operator v0.1.0 h1:PAXOZ5DdJFgRoeVBPIXhqkCm6DxbP4tVOPcr3Y7h/So= -go.datum.net/network-services-operator v0.1.0/go.mod h1:uloVfxqE+8DgSiMB651X8UC9yECpXbwp/NBstofCceE= -go.miloapis.com/milo v0.1.0 h1:AYFVz1lfta/NbWSFSSKPtnkCA2rN+iegxlfQrDgEvYY= -go.miloapis.com/milo v0.1.0/go.mod h1:X+DpWOchv/Vm63mwHnboW00KRGsODY2bUTS/bBbK1+E= -go.miloapis.com/milo v0.24.11 h1:rByXDKbP4ZEN0I/z1C2RyUCyQi0NWrITLqoQILSAn2E= -go.miloapis.com/milo v0.24.11/go.mod h1:xOFYvUsvSZV3z6eow5YdB5C/qRQf2s/5/arcfJs5XPg= +go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359 h1:P3dePA6cCXKimZzE6d7Xxpj2rz54BxOHI8K8ic7VQ+c= +go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359/go.mod 
h1:Nr0PsCodkTW31vWVxR9dhAP9w0y+WHUYeyrcRnchcIE= +go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42 h1:LSHyqLt/jus6iEMvo8pc731L+PyrTHP2bqfMMtHPSWc= +go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42/go.mod h1:p9O2kk194mvoL8rhqjwb+LWB+GIyY4vQqiTowwibVWo= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= -go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= -go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= -go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= -go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= 
-go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= -go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= -go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= -go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0/go.mod h1:NfchwuyNoMcZ5MLHwPrODwUF1HWCXWrL31s8gSAdIKY= +go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= +go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 h1:Ahq7pZmv87yiyn3jeFz/LekZmPLLdKejuO3NcK9MssM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0/go.mod h1:MJTqhM0im3mRLw1i8uGHnCvUEeS7VwRyxlLC78PA18M= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0 h1:m639+BofXTvcY1q8CGs4ItwQarYtJPOWmVobfM1HpVI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0/go.mod h1:LjReUci/F4BUyv+y4dwnq3h/26iNOeC3wAIqgvTIZVo= +go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= +go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= 
+go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= -go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= -golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod 
h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= +golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= -golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/time v0.12.0 
h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= -golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod 
h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= -google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0 h1:0UOBWO4dC+e51ui0NFKSPbkHHiQ4TmrEfEZMLDyRmY8= +google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0/go.mod h1:8ytArBbtOy2xfht+y2fqKd5DRDJRUQhqbyEnQ4bDChs= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0 h1:MAKi5q709QWfnkkpNQ0M12hYJ1+e8qYVDyowc4U1XZM= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +google.golang.org/grpc v1.74.2 h1:WoosgB65DlWVC9FqI82dGsZhWFNBSLjQ84bjROOpMu4= +google.golang.org/grpc v1.74.2/go.mod h1:CtQ+BGjaAIXHs/5YS3i473GqwBBa1zGQNevxdeBEXrM= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw= -k8s.io/api v0.33.1/go.mod h1:87esjTn9DRSRTD4fWMXamiXxJhpOIREjWOSjsW1kEHw= -k8s.io/apiextensions-apiserver v0.33.1 h1:N7ccbSlRN6I2QBcXevB73PixX2dQNIW0ZRuguEE91zI= -k8s.io/apiextensions-apiserver v0.33.1/go.mod h1:uNQ52z1A1Gu75QSa+pFK5bcXc4hq7lpOXbweZgi4dqA= -k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= -k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.33.1 h1:yLgLUPDVC6tHbNcw5uE9mo1T6ELhJj7B0geifra3Qdo= -k8s.io/apiserver v0.33.1/go.mod h1:VMbE4ArWYLO01omz+k8hFjAdYfc3GVAYPrhP2tTKccs= -k8s.io/client-go v0.33.1 h1:ZZV/Ks2g92cyxWkRRnfUDsnhNn28eFpt26aGc8KbXF4= -k8s.io/client-go v0.33.1/go.mod h1:JAsUrl1ArO7uRVFWfcj6kOomSlCv+JpvIsp6usAGefA= -k8s.io/component-base v0.33.1 h1:EoJ0xA+wr77T+G8p6T3l4efT2oNwbqBVKR71E0tBIaI= -k8s.io/component-base v0.33.1/go.mod h1:guT/w/6piyPfTgq7gfvgetyXMIh10zuXA6cRRm3rDuY= 
+k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= +k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= +k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= +k8s.io/apiextensions-apiserver v0.35.0/go.mod h1:E1Ahk9SADaLQ4qtzYFkwUqusXTcaV2uw3l14aqpL2LU= +k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= +k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/apiserver v0.35.0 h1:CUGo5o+7hW9GcAEF3x3usT3fX4f9r8xmgQeCBDaOgX4= +k8s.io/apiserver v0.35.0/go.mod h1:QUy1U4+PrzbJaM3XGu2tQ7U9A4udRRo5cyxkFX0GEds= +k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= +k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= +k8s.io/component-base v0.35.0 h1:+yBrOhzri2S1BVqyVSvcM3PtPyx5GUxCK2tinZz1G94= +k8s.io/component-base v0.35.0/go.mod h1:85SCX4UCa6SCFt6p3IKAPej7jSnF3L8EbfSyMZayJR0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a h1:ZV3Zr+/7s7aVbjNGICQt+ppKWsF1tehxggNfbM7XnG8= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= 
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= -sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= -sigs.k8s.io/gateway-api v1.2.1 h1:fZZ/+RyRb+Y5tGkwxFKuYuSRQHu9dZtbjenblleOLHM= -sigs.k8s.io/gateway-api v1.2.1/go.mod h1:EpNfEXNjiYfUJypf0eZ0P5iXA9ekSGWaS1WgPaM42X0= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= -sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8 h1:Pq69tTKfN8ADw8m8A3wUtP8wJ9SPQbbOsgapm3BZEPw= -sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8/go.mod h1:CpBzLMLQKdm+UCchd2FiGPiDdCxM5dgCCPKuaQ6Fsv0= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= +sigs.k8s.io/controller-runtime v0.23.3/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= +sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c h1:GS4VnGRV90GEUjrgQ2GT5ii6yzWj3KtgUg+sVMdhs5c= +sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/multicluster-runtime v0.23.3 h1:vrzlXRzHTDsjspUAfoW2rCtr0agoI4q20p9x4Fz4png= +sigs.k8s.io/multicluster-runtime v0.23.3/go.mod h1:r/UA4GHgFoXCcR4tcvlZz7SiLx3l1kJKDuBAhILNIHs= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= 
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= -sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= -sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/internal/config/config.go b/internal/config/config.go index dddb7926..df4419b6 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -229,6 +229,23 @@ type DiscoveryConfig struct { // template when connecting to project control planes. When not provided, // the operator will use the in-cluster config. ProjectKubeconfigPath string `json:"projectKubeconfigPath"` + + // ClusterName is the stable, unique name for this edge cluster. It is + // stamped onto ResourceClaim objects so that each edge controller can + // distinguish its own claims from those created by other edge controllers + // in the same project control planes. + // + // Required when Mode is "milo". Optional in single mode; defaults to "single". + ClusterName string `json:"clusterName"` + + // QuotaKubeconfigPath is the path to the kubeconfig file used when creating + // ResourceClaim objects against Milo project control planes. When set it + // takes precedence over ProjectKubeconfigPath for quota calls. When both are + // unset, quota accounting is disabled. 
+ // + // Use this field in deployments (mode: single or mode: milo) that need to + // talk to api.datum.net for quota enforcement. + QuotaKubeconfigPath string `json:"quotaKubeconfigPath"` } func SetDefaults_DiscoveryConfig(obj *DiscoveryConfig) { @@ -253,6 +270,32 @@ func (c *DiscoveryConfig) ProjectRestConfig() (*rest.Config, error) { return clientcmd.BuildConfigFromFlags("", c.ProjectKubeconfigPath) } +// QuotaRestConfig returns the REST config for quota ResourceClaim management +// against Milo project control planes. QuotaKubeconfigPath is preferred; if +// unset, ProjectKubeconfigPath is used as a fallback. +// +// Returns (nil, nil) when no credential path is configured at all — this is +// the intentional opt-out case and the caller should disable quota enforcement. +// +// Returns (nil, error) when a credential path IS configured but the file does +// not exist on disk. This is a misconfiguration (Secret not mounted, wrong +// path) that must not silently disable enforcement; callers should treat this +// as a fatal startup error. 
+func (c *DiscoveryConfig) QuotaRestConfig() (*rest.Config, error) { + path := c.QuotaKubeconfigPath + if path == "" { + path = c.ProjectKubeconfigPath + } + if path == "" { + return nil, nil + } + if _, err := os.Stat(path); os.IsNotExist(err) { + return nil, fmt.Errorf("quota kubeconfig path %q is configured but file does not exist: "+ + "ensure the quota credential Secret is mounted correctly", path) + } + return clientcmd.BuildConfigFromFlags("", path) +} + func init() { SchemeBuilder.Register(&WorkloadOperator{}) } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 5f586932..5a7a3cee 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -1,6 +1,8 @@ package config import ( + "os" + "path/filepath" "testing" "k8s.io/apimachinery/pkg/runtime" @@ -56,3 +58,67 @@ webhookServer: t.Error("TLS.CertDir was not defaulted") } } + +// TestQuotaRestConfig_NilWhenNoPath verifies that omitting quotaKubeconfigPath +// returns (nil, nil) — the intentional opt-out / enforcement-disabled case. +func TestQuotaRestConfig_NilWhenNoPath(t *testing.T) { + cfg := &DiscoveryConfig{} + restCfg, err := cfg.QuotaRestConfig() + if err != nil { + t.Fatalf("QuotaRestConfig() error = %v, want nil", err) + } + if restCfg != nil { + t.Errorf("QuotaRestConfig() = non-nil, want nil (no path configured)") + } +} + +// TestQuotaRestConfig_ErrorWhenPathMissing verifies that explicitly setting a +// kubeconfig path that does not exist on disk returns a non-nil error (fail-loud). 
+func TestQuotaRestConfig_ErrorWhenPathMissing(t *testing.T) { + cfg := &DiscoveryConfig{ + QuotaKubeconfigPath: "/nonexistent/path/quota.kubeconfig", + } + restCfg, err := cfg.QuotaRestConfig() + if err == nil { + t.Fatal("QuotaRestConfig() error = nil, want non-nil error when path is configured but file absent") + } + if restCfg != nil { + t.Errorf("QuotaRestConfig() returned non-nil config alongside error") + } +} + +// TestQuotaRestConfig_SuccessWhenFileExists verifies that a configured path +// pointing to an existing (though minimal) kubeconfig file succeeds. +func TestQuotaRestConfig_SuccessWhenFileExists(t *testing.T) { + // Write a minimal kubeconfig that clientcmd can parse. + dir := t.TempDir() + kubeconfigPath := filepath.Join(dir, "quota.kubeconfig") + minimalKubeconfig := []byte(`apiVersion: v1 +kind: Config +clusters: +- cluster: + server: https://localhost:1234 + name: test +contexts: +- context: + cluster: test + user: test + name: test +current-context: test +users: +- name: test + user: {} +`) + if err := os.WriteFile(kubeconfigPath, minimalKubeconfig, 0600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + cfg := &DiscoveryConfig{QuotaKubeconfigPath: kubeconfigPath} + restCfg, err := cfg.QuotaRestConfig() + if err != nil { + t.Fatalf("QuotaRestConfig() error = %v, want nil", err) + } + if restCfg == nil { + t.Error("QuotaRestConfig() = nil, want non-nil when file exists") + } +} diff --git a/internal/controller/clustername.go b/internal/controller/clustername.go new file mode 100644 index 00000000..e726cf81 --- /dev/null +++ b/internal/controller/clustername.go @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import "strings" + +// The cross-plane cluster/project identity travels as a single Kubernetes label +// value: NSO's MappedNamespaceResourceStrategy encodes the name as "cluster-" +// with "/" replaced by "_", so a full project path ("org/project") survives as a +// legal label value 
("cluster-org_project"). + +// EncodeClusterName renders a project/cluster name into the label wire form +// "cluster-" with "/" replaced by "_". +func EncodeClusterName(name string) string { + return "cluster-" + strings.ReplaceAll(name, "/", "_") +} + +// DecodeClusterName reverses EncodeClusterName, returning the full path. +func DecodeClusterName(encoded string) string { + return strings.ReplaceAll(strings.TrimPrefix(encoded, "cluster-"), "_", "/") +} diff --git a/internal/controller/clustername_test.go b/internal/controller/clustername_test.go new file mode 100644 index 00000000..269e9fc7 --- /dev/null +++ b/internal/controller/clustername_test.go @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "testing" +) + +func TestEncodeDecodeClusterName_RoundTrip(t *testing.T) { + t.Parallel() + cases := []struct { + name string + input string + }{ + {name: "simple name", input: "datum-cloud"}, + {name: "org/project path", input: "org/project"}, + {name: "three-segment path", input: "a/b/c"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + got := DecodeClusterName(EncodeClusterName(tc.input)) + if got != tc.input { + t.Errorf("round-trip(%q): got %q, want %q", tc.input, got, tc.input) + } + }) + } +} diff --git a/internal/controller/indexers.go b/internal/controller/indexers.go index fb0ebe88..311337e0 100644 --- a/internal/controller/indexers.go +++ b/internal/controller/indexers.go @@ -15,7 +15,10 @@ import ( const ( deploymentWorkloadUIDIndex = "deploymentWorkloadUIDIndex" workloadNetworksIndex = "workloadNetworksIndex" - deploymentLocationIndex = "deploymentLocationIndex" + // deploymentCityCodeIndex indexes WorkloadDeployments by their Spec.CityCode + // so that SubnetClaim/Subnet watches can efficiently find the deployments + // that target the same city as a changed networking resource. 
+ deploymentCityCodeIndex = "deploymentCityCodeIndex" ) func AddIndexers(ctx context.Context, mgr mcmanager.Manager) error { @@ -30,32 +33,30 @@ func addWorkloadDeploymentIndexers(ctx context.Context, mgr mcmanager.Manager) e return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentWorkloadUIDIndex, err) } - // Index workload deployments by location - if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentLocationIndex, deploymentLocationIndexFunc); err != nil { - return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentLocationIndex, err) + if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentCityCodeIndex, deploymentCityCodeIndexFunc); err != nil { + return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentCityCodeIndex, err) } return nil } func deploymentWorkloadUIDIndexFunc(o client.Object) []string { - return []string{ - string(o.(*computev1alpha.WorkloadDeployment).Spec.WorkloadRef.UID), + // Skip deployments without a workload UID: indexing them under the empty + // key would make them matchable by a GC query built from a corrupt (empty) + // UID, mirroring deploymentCityCodeIndexFunc. 
+ uid := string(o.(*computev1alpha.WorkloadDeployment).Spec.WorkloadRef.UID) + if uid == "" { + return nil } + return []string{uid} } -func deploymentLocationIndexFunc(o client.Object) []string { +func deploymentCityCodeIndexFunc(o client.Object) []string { deployment := o.(*computev1alpha.WorkloadDeployment) - if deployment.Status.Location == nil { + if deployment.Spec.CityCode == "" { return nil } - - return []string{ - types.NamespacedName{ - Namespace: deployment.Status.Location.Namespace, - Name: deployment.Status.Location.Name, - }.String(), - } + return []string{deployment.Spec.CityCode} } func addWorkloadIndexers(ctx context.Context, mgr mcmanager.Manager) error { diff --git a/internal/controller/indexers_test.go b/internal/controller/indexers_test.go new file mode 100644 index 00000000..a5afd919 --- /dev/null +++ b/internal/controller/indexers_test.go @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/types" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// TestDeploymentWorkloadUIDIndexFunc verifies that deployments without a +// workload UID are excluded from the index: indexing them under the empty key +// would make them matchable by a GC query built from a corrupt (empty) UID. 
+func TestDeploymentWorkloadUIDIndexFunc(t *testing.T) { + t.Parallel() + + withUID := &computev1alpha.WorkloadDeployment{ + Spec: computev1alpha.WorkloadDeploymentSpec{ + WorkloadRef: computev1alpha.WorkloadReference{UID: types.UID("wl-uid-1")}, + }, + } + assert.Equal(t, []string{"wl-uid-1"}, deploymentWorkloadUIDIndexFunc(withUID)) + + withoutUID := &computev1alpha.WorkloadDeployment{} + assert.Nil(t, deploymentWorkloadUIDIndexFunc(withoutUID), + "a deployment without a workload UID must not be indexed under the empty key") +} diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index 820609c1..2ea23780 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -4,53 +4,225 @@ package controller import ( "context" + "errors" "fmt" + "maps" "strings" + "time" corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" - ctrlsource "sigs.k8s.io/controller-runtime/pkg/source" mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile 
"sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" "go.datum.net/compute/internal/controller/instancecontrol" + quotametrics "go.datum.net/compute/internal/quota" ) -const instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" +const ( + // instanceQuotaFinalizer ensures the quota ResourceClaim is deleted when + // an Instance is removed. + instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" + + // instanceControllerFinalizer is registered with the finalizer framework and + // triggers downstream write-back cleanup on deletion. + instanceControllerFinalizer = "compute.datumapis.com/instance-controller" + + // instanceQuotaClaimSourceLabel is stamped on ResourceClaim objects with the + // name of the edge cluster that created them. The claim watch predicate uses + // this label to filter out claims written by other edge controllers targeting + // the same project control planes. + instanceQuotaClaimSourceLabel = "compute.datumapis.com/source-cluster" + + // instanceQuotaClaimNamespaceLabel records the source Instance's namespace on + // the ResourceClaim. The claim lives in the project's quota namespace (not the + // Instance's namespace), so the claim watch reads this label to map a grant + // back to the owning Instance. + instanceQuotaClaimNamespaceLabel = "compute.datumapis.com/instance-namespace" + + // instanceQuotaClaimNamePrefix namespaces an Instance's ResourceClaim name by + // resource type. Claims for different resource kinds share the project quota + // namespace, so the Instance name alone (unique among Instances, but not + // across kinds) could collide with another kind's claim — the prefix prevents + // that. The claim watch strips it to recover the Instance name. 
+ instanceQuotaClaimNamePrefix = "instance-" + + quotaResourceTypeInstances = "compute.datumapis.com/instances" + + miloProjectAPIGroup = "resourcemanager.miloapis.com" + + miloProjectKind = "Project" + + msgNotProgrammed = "Instance has not been programmed" + + msgInstanceReady = "Instance is ready" + + msgInstanceProgrammed = "Instance has been programmed" + + msgInstanceAvailable = "Instance is available" + + // reasonNetworkFailedToCreate is the reason code for network creation failure. + reasonNetworkFailedToCreate = "NetworkFailedToCreate" +) + +// instanceTypeD1Standard2 is the platform instance type name for the +// 1 vCPU / 2 GiB size used as the catalog baseline for quota accounting. +const instanceTypeD1Standard2 = "datumcloud/d1-standard-2" + +// instanceTypeResources holds the vCPU and memory for a named instance type. +type instanceTypeResources struct { + // CPUMillicores is the number of CPU millicores (1000 = 1 vCPU). + CPUMillicores int64 + // MemoryMiB is the amount of RAM in mebibytes. + MemoryMiB int64 +} + +// instanceTypeCatalog maps platform instance type names to their resource +// dimensions used for quota accounting when the instance spec carries only an +// instanceType and no explicit container Limits or instance-level Requests. +// +// These are the platform-declared quota sizes for the instance type, not a +// derivation of any infra provider's machine type. (infra-provider-gcp separately +// maps datumcloud/d1-standard-2 to the GCP n2-standard-2 machine type for VM +// provisioning; that mapping does not define the quota size here.) When new +// instance types are added, add them here with their vCPU/memory values. +var instanceTypeCatalog = map[string]instanceTypeResources{ + instanceTypeD1Standard2: { + CPUMillicores: 1000, // 1 vCPU + MemoryMiB: 2048, // 2 GiB + }, +} + +// Quota-pending requeue backoff. 
The instance controller is normally re-queued by +// the ResourceClaim watch when a claim is granted, but that grant event lives on +// the project control plane and can be missed (informer engagement races, watch +// relist gaps), wedging the instance at QuotaGranted!=True indefinitely. While +// quota is pending we requeue on a backing-off schedule as a safety net so a +// missed grant self-heals. The interval lengthens the longer the instance waits: +// +// elapsed < 60s : every 1s (catch a grant landing almost immediately) +// 60s – 5m : every 15s +// 5m – 10m : every 60s +// >= 10m : every 300s +const ( + quotaPendingRequeueFast = 1 * time.Second + quotaPendingRequeueMedium = 15 * time.Second + quotaPendingRequeueSlow = 60 * time.Second + quotaPendingRequeueIdle = 300 * time.Second + + quotaPendingFastWindow = 60 * time.Second + quotaPendingMediumWindow = 5 * time.Minute + quotaPendingSlowWindow = 10 * time.Minute +) // clusterGetter is the subset of mcmanager.Manager used by InstanceReconciler. // Keeping it narrow allows unit tests to substitute a minimal fake. type clusterGetter interface { - GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) + GetCluster(ctx context.Context, clusterName multicluster.ClusterName) (cluster.Cluster, error) } +// errProjectIdentityUnresolvable is the sentinel wrapped by project-identity +// resolvers when the edge namespace is missing one of the identity labels +// stamped by NSO's MappedNamespaceResourceStrategy. Both labels are written +// atomically at namespace creation, before any Instance can exist in the +// namespace, so absence is misconfiguration — not a propagation race — and +// retrying cannot fix it. Callers use errors.Is to distinguish this from +// transient resolution failures. +var errProjectIdentityUnresolvable = errors.New("project identity unresolvable") + +// InstanceProjectIDFunc derives the Milo project ID for a given Instance. 
+// In Milo mode the project ID equals the multicluster ClusterName. In +// single-cell mode it is decoded from the upstream-cluster-name namespace label. +// Returns an error wrapping errProjectIdentityUnresolvable when the identity +// label is missing (misconfiguration); transient failures return ordinary +// errors that should trigger a requeue. +type InstanceProjectIDFunc func( + ctx context.Context, + clusterName multicluster.ClusterName, + instance *computev1alpha.Instance, +) (string, error) + +// InstanceProjectNamespaceFunc derives the in-project namespace where +// ResourceClaims for a given Instance should be created. In Milo mode this +// equals instance.Namespace. In single-cell mode it comes from the +// upstream-namespace namespace label. +// Returns an error wrapping errProjectIdentityUnresolvable when the identity +// label is missing (misconfiguration); transient failures return ordinary +// errors that should trigger a requeue. +type InstanceProjectNamespaceFunc func( + ctx context.Context, + clusterName multicluster.ClusterName, + instance *computev1alpha.Instance, +) (string, error) + // InstanceReconciler reconciles an Instance object type InstanceReconciler struct { - mgr clusterGetter - managementCluster cluster.Cluster + mgr clusterGetter + scheme *runtime.Scheme + quotaClientManager *quotametrics.ProjectQuotaClientManager + edgeClusterName string + // recorder emits Kubernetes events on the Instance object for quota failure + // modes so operators can diagnose issues via `kubectl describe`. + recorder record.EventRecorder + // projectIDForInstance derives the Milo project ID used for quota + // ResourceClaim management. In Milo mode it returns string(clusterName); in + // single-cell mode it reads the upstream-cluster-name label from the edge + // namespace and decodes "cluster-" → "". + projectIDForInstance InstanceProjectIDFunc + // projectNamespaceForInstance derives the in-project namespace where + // ResourceClaims must be created. 
In Milo mode the ResourceClaim lives in + // instance.Namespace (the project-level namespace); in single-cell mode the + // edge namespace is ns-{uid} which does not exist in the project control + // plane — the real namespace is the upstream-namespace label value (e.g. + // "default"). When nil, falls back to instance.Namespace. + projectNamespaceForInstance InstanceProjectNamespaceFunc + // clusterNameForProject maps a Milo project ID back to the multicluster + // ClusterName that owns that project's workloads. In Milo mode the + // ClusterName equals the project ID. In single-cell mode the only registered + // cluster is "single" regardless of project ID. When nil, falls back to + // multicluster.ClusterName(projectID), which is correct for Milo mode. + clusterNameForProject func(projectID string) multicluster.ClusterName + // FederationClient is an optional client pointing at the upstream + // Karmada/federation control plane (configured via --federation-kubeconfig). + // When non-nil, the reconciler writes a copy of each Instance back to the + // federation control plane so that the InstanceProjector (running in the + // management cluster) can aggregate status across all POP cells. Set to nil to + // disable federation write-back (e.g. in non-federation deployments). 
+ FederationClient client.Client + finalizers finalizer.Finalizers } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/finalizers,verbs=update // +kubebuilder:rbac:groups=quota.miloapis.com,resources=resourceclaims,verbs=get;list;watch;create;delete +// +kubebuilder:rbac:groups="",resources=namespaces,verbs=get +// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (_ ctrl.Result, err error) { logger := log.FromContext(ctx) @@ -69,29 +241,24 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, err } + // Run the finalizer framework first. This handles downstream write-back cleanup + // via the Finalize method registered below. 
+ finalizationResult, err := r.finalizers.Finalize(ctx, &instance) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &instance); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + logger.Info("reconciling instance") defer logger.Info("reconcile complete") if !instance.DeletionTimestamp.IsZero() { - if controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { - claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) - var claim quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: instance.Namespace, Name: claimName}, &claim); err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, fmt.Errorf("failed getting resource claim for deletion: %w", err) - } - } else { - if err := r.managementCluster.GetClient().Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { - return ctrl.Result{}, fmt.Errorf("failed deleting resource claim: %w", err) - } - } - - controllerutil.RemoveFinalizer(&instance, instanceQuotaFinalizer) - if err := cl.GetClient().Update(ctx, &instance); err != nil { - return ctrl.Result{}, fmt.Errorf("failed removing quota finalizer: %w", err) - } - } - return ctrl.Result{}, nil + return ctrl.Result{}, r.reconcileDeletion(ctx, cl.GetClient(), req.ClusterName, &instance) } if !controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { @@ -102,94 +269,530 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, nil } - grantedCondition, err := r.reconcileQuotaClaim(ctx, req.ClusterName, &instance) + statusChanged, quotaErr := r.reconcileQuotaCondition(ctx, req.ClusterName, &instance) + + // Safety-net requeue while quota is not yet granted, computed up front so + // every return path below honors it. 
A conflict during the pending window + // must not drop the instance onto controller-runtime's exponential + // error-backoff (which can stretch to minutes), which would defeat recovery + // from a missed ResourceClaim grant event. Logged so the requeue is + // observable: a re-firing requeue prints this every pass while pending. + quotaReq := quotaPendingRequeueAfter(&instance, time.Now()) + if quotaReq > 0 { + logger.Info("quota pending; scheduling safety-net requeue", + "after", quotaReq.String(), "cluster", req.ClusterName.String(), "instance", instance.Name) + } + + // Transient errors from the quota and Ready-condition reconciles are + // returned only after any condition change has been persisted, so the + // failure reason is visible on the Instance while controller-runtime + // requeues with backoff. + readyChanged, readyErr := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) + + if statusChanged || readyChanged { + if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { + if quotaReq > 0 && apierrors.IsConflict(err) { + logger.Info("status update conflicted while quota pending; requeuing instead of error-backoff", + "after", quotaReq.String(), "instance", instance.Name) + return ctrl.Result{RequeueAfter: quotaReq}, nil + } + return ctrl.Result{}, err + } + if readyErr != nil { + return ctrl.Result{}, readyErr + } + // Return with the quota error (nil or transient) so controller-runtime + // requeues with backoff on failures. On the success path (quotaErr==nil) + // we fall through to removeQuotaSchedulingGate below instead of returning + // early, so the gate is cleared in the same reconcile pass rather than + // waiting for a requeue that may never come (ResourceClaim is immutable + // and local Instances are not watched). 
+ if quotaErr != nil { + if err := r.writeBackToUpstream(ctx, &instance); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, quotaErr + } + } else if readyErr != nil { + return ctrl.Result{}, readyErr + } else if quotaErr != nil { + // No status change but quota evaluation failed — return error to requeue. + return ctrl.Result{}, quotaErr + } + + if err := r.removeQuotaSchedulingGate(ctx, cl.GetClient(), &instance); err != nil { + return ctrl.Result{}, err + } + + if err := r.writeBackToUpstream(ctx, &instance); err != nil { + if quotaReq > 0 && apierrors.IsConflict(err) { + logger.Info("upstream writeback conflicted while quota pending; requeuing instead of error-backoff", + "after", quotaReq.String(), "instance", instance.Name) + return ctrl.Result{RequeueAfter: quotaReq}, nil + } + return ctrl.Result{}, err + } + + if quotaReq > 0 { + logger.Info("requeuing instance", "after", quotaReq.String(), + "cluster", req.ClusterName.String(), "instance", instance.Name) + } + + return ctrl.Result{RequeueAfter: quotaReq}, nil +} + +// reconcileDeletion handles quota-claim cleanup when an Instance is being +// deleted. It removes the quota finalizer once the ResourceClaim is gone. +func (r *InstanceReconciler) reconcileDeletion(ctx context.Context, cl client.Client, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) error { + if !controllerutil.ContainsFinalizer(instance, instanceQuotaFinalizer) { + return nil + } + + if r.quotaClientManager != nil { + if err := r.cleanupQuotaClaim(ctx, clusterName, instance); err != nil { + if !errors.Is(err, errProjectIdentityUnresolvable) { + // Transient failure (API unreachable, quota client errors) — + // retry with backoff rather than risking an orphaned claim. + return err + } + // Unresolvable project identity must not wedge deletion: the identity + // labels are stamped at namespace creation, so absence is + // misconfiguration that no retry fixes. 
Log at ERROR and emit an event + // so the operator is aware, then fall through to finalizer removal so + // the Instance is not permanently stuck in Terminating. The orphaned + // claim will count against project budget until Milo's TTL/GC removes it. + log.FromContext(ctx).Error(err, "project identity unresolvable during deletion; ResourceClaim may be orphaned — budget leak possible", + "instance", instance.Name, "namespace", instance.Namespace) + r.recorder.Event(instance, corev1.EventTypeWarning, + "QuotaClaimOrphaned", + "Skipping ResourceClaim cleanup: project identity could not be resolved; claim may be orphaned in Milo project control plane") + quotametrics.ClaimOrphanedTotal.Inc() + } + } + + controllerutil.RemoveFinalizer(instance, instanceQuotaFinalizer) + if err := cl.Update(ctx, instance); err != nil { + return fmt.Errorf("failed removing quota finalizer: %w", err) + } + return nil +} + +// cleanupQuotaClaim deletes the ResourceClaim backing an Instance from the +// project control plane. Errors wrapping errProjectIdentityUnresolvable mean +// the claim cannot even be located; the caller decides whether deletion +// proceeds without cleanup. 
+func (r *InstanceReconciler) cleanupQuotaClaim(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) error { + projectID, err := r.resolveProjectID(ctx, clusterName, instance) + if err != nil { + return fmt.Errorf("resolving project ID during deletion: %w", err) + } + + projectClient, err := r.quotaClientManager.ClientForProject(ctx, projectID, r.scheme) + if err != nil { + return fmt.Errorf("failed getting quota client for deletion: %w", err) + } + + claimNamespace, err := r.resolveProjectNamespace(ctx, clusterName, instance) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling quota claim: %w", err) + return fmt.Errorf("resolving project namespace during deletion: %w", err) + } + claimName := quotaClaimName(instance) + var claim quotav1alpha1.ResourceClaim + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: claimNamespace, Name: claimName}, &claim); err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed getting resource claim for deletion: %w", err) + } + return nil + } + if err := projectClient.Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed deleting resource claim: %w", err) + } + return nil +} + +// quotaClaimName returns the name of the ResourceClaim backing an Instance's +// quota: the Instance name (unique among Instances within the project control +// plane) prefixed by instanceQuotaClaimNamePrefix to avoid colliding with other +// resource kinds' claims in the shared quota namespace. The owning Instance's +// namespace is preserved on the claim via instanceQuotaClaimNamespaceLabel so +// the claim watch can map a grant back to the Instance. 
+func quotaClaimName(instance *computev1alpha.Instance) string { + return instanceQuotaClaimNamePrefix + instance.Name +} + +// quotaPendingRequeueAfter returns a safety-net requeue interval while the +// instance's quota is not yet granted, backing off the longer it has waited (see +// the quotaPendingRequeue* constants). It returns 0 when quota is already granted +// (QuotaGranted=True) or the condition is absent, so a granted/normal instance is +// not needlessly requeued. +// +// Elapsed time is anchored on the instance's creation timestamp, NOT the +// QuotaGranted condition's LastTransitionTime: while quota is pending the +// condition stays Unknown (PendingEvaluation and NoBudget are both Unknown), so +// SetStatusCondition never bumps LastTransitionTime off its 1970-01-01 CRD +// default — which would peg every pending instance to the slowest tier. +func quotaPendingRequeueAfter(instance *computev1alpha.Instance, now time.Time) time.Duration { + cond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) + if cond == nil || cond.Status == metav1.ConditionTrue { + return 0 } + elapsed := now.Sub(instance.CreationTimestamp.Time) + switch { + case elapsed < quotaPendingFastWindow: + return quotaPendingRequeueFast + case elapsed < quotaPendingMediumWindow: + return quotaPendingRequeueMedium + case elapsed < quotaPendingSlowWindow: + return quotaPendingRequeueSlow + default: + return quotaPendingRequeueIdle + } +} - statusChanged := false +// reconcileQuotaCondition reconciles the ResourceClaim and updates the +// InstanceQuotaGranted status condition. It returns (changed, err) where +// changed=true means a status update is required, and err non-nil means the +// reconciler should requeue (with backoff) in addition to writing the condition. 
+func (r *InstanceReconciler) reconcileQuotaCondition(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (bool, error) { + grantedCondition, claimErr := r.reconcileQuotaClaim(ctx, clusterName, instance) + // reconcileQuotaClaim returns (condition, err). A non-nil error signals a + // transient infrastructure failure; a non-nil condition carries the reason to + // write. Both can be non-nil: write the condition AND requeue with backoff. switch { - case grantedCondition == nil || (grantedCondition.Status == metav1.ConditionFalse && grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason): - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + case grantedCondition == nil && claimErr == nil: + // No grant decision yet: the claim was just created or carries no + // Granted condition. Stay PendingEvaluation until the claim watch or + // the safety-net requeue observes the decision. + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionUnknown, Reason: computev1alpha.InstanceQuotaGrantedReasonPendingEvaluation, Message: "Waiting for quota evaluation", ObservedGeneration: instance.Generation, + }), nil + + case grantedCondition != nil && grantedCondition.Status == metav1.ConditionFalse && + grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason: + // Claim exists but pending — no AllowanceBucket. Distinct from "evaluating". 
+ changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceQuotaGrantedReasonNoBudget, + Message: "ResourceClaim is pending: no AllowanceBucket configured for this project", + ObservedGeneration: instance.Generation, + }) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonNoBudget, + "ResourceClaim pending: no AllowanceBucket configured for this project") + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonNoBudget).Inc() + return changed, claimErr + + case grantedCondition != nil && grantedCondition.Type == computev1alpha.InstanceQuotaGranted: + // reconcileQuotaClaim populated a structured failure condition. + changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: grantedCondition.Status, + Reason: grantedCondition.Reason, + Message: grantedCondition.Message, + ObservedGeneration: instance.Generation, }) + return changed, claimErr - case grantedCondition.Status == metav1.ConditionTrue: - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + case grantedCondition != nil && grantedCondition.Status == metav1.ConditionTrue: + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), claimErr - case grantedCondition.Status == metav1.ConditionFalse: + case grantedCondition != nil: // False, non-pending reason from ResourceClaim reason := computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded if grantedCondition.Reason == quotav1alpha1.ResourceClaimValidationFailedReason { reason = 
computev1alpha.InstanceQuotaGrantedReasonValidationFailed } - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionFalse, Reason: reason, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), claimErr + + default: // grantedCondition == nil && claimErr != nil — should not reach here + return false, claimErr + } +} + +// removeQuotaSchedulingGate removes the quota scheduling gate from the +// Instance spec once QuotaGranted=True has been persisted to status. +// It guards on ObservedGeneration to prevent a stale True condition from +// generation N unblocking a generation N+1 instance before quota for the +// new spec has been evaluated. +func (r *InstanceReconciler) removeQuotaSchedulingGate(ctx context.Context, cl client.Client, instance *computev1alpha.Instance) error { + quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) + if quotaGrantedCond == nil || quotaGrantedCond.Status != metav1.ConditionTrue { + return nil + } + // Stale condition guard: only remove the gate if the condition reflects the + // current spec generation. A condition from an older generation means quota + // has not yet been evaluated for the current spec. 
+ if quotaGrantedCond.ObservedGeneration != instance.Generation { + return nil + } + if instance.Spec.Controller == nil { + return nil } - readyChanged, err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) + newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) + gateRemoved := false + for _, gate := range instance.Spec.Controller.SchedulingGates { + if gate.Name == instancecontrol.QuotaSchedulingGate.String() { + gateRemoved = true + continue + } + newGates = append(newGates, gate) + } + if !gateRemoved { + return nil + } + + patch := client.MergeFrom(instance.DeepCopy()) + instance.Spec.Controller.SchedulingGates = newGates + if err := cl.Patch(ctx, instance, patch); err != nil { + return fmt.Errorf("failed patching quota scheduling gate: %w", err) + } + return nil +} + +// Finalize removes the downstream write-back Instance when the local Instance is +// deleted. It is a no-op when downstream federation is disabled. 
+func (r *InstanceReconciler) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.FederationClient == nil { + return finalizer.Result{}, nil + } + + instance := obj.(*computev1alpha.Instance) + + downstreamInstance := &computev1alpha.Instance{} + err := r.FederationClient.Get(ctx, client.ObjectKeyFromObject(instance), downstreamInstance) + if apierrors.IsNotFound(err) { + return finalizer.Result{}, nil + } if err != nil { - return ctrl.Result{}, err + return finalizer.Result{}, fmt.Errorf("failed getting downstream instance for deletion: %w", err) } - if statusChanged || readyChanged { - if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { - return ctrl.Result{}, err + if err := r.FederationClient.Delete(ctx, downstreamInstance); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed deleting downstream write-back instance: %w", err) + } + + return finalizer.Result{}, nil +} + +// writeBackToUpstream copies the Instance spec and status to the upstream +// Karmada/federation control plane so that the InstanceProjector can aggregate +// state from all POP cells. It is a no-op when FederationClient is nil (federation disabled). +func (r *InstanceReconciler) writeBackToUpstream(ctx context.Context, instance *computev1alpha.Instance) error { + if r.FederationClient == nil { + return nil + } + + // Read the upstream project namespace name and encoded cluster name from + // the federation-plane namespace. The federator stamps both labels + // atomically when it creates the namespace, before any cell Instance can + // exist in it, so they are the sole source of write-back identity: a + // failed read or a missing label is corruption, never a propagation race. + // Deriving substitute values here would write WRONG identity upstream, + // where the InstanceProjector could mislink the projection — erroring + // retries with backoff instead. 
+ var downstreamNS corev1.Namespace + if err := r.FederationClient.Get(ctx, client.ObjectKey{Name: instance.Namespace}, &downstreamNS); err != nil { + return fmt.Errorf("failed getting federation namespace %q for write-back identity: %w", instance.Namespace, err) + } + upstreamNamespace := downstreamNS.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if upstreamNamespace == "" { + return fmt.Errorf("federation namespace %q is missing the %s label required for write-back identity", + instance.Namespace, downstreamclient.UpstreamOwnerNamespaceLabel) + } + encodedClusterName := downstreamNS.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if encodedClusterName == "" { + return fmt.Errorf("federation namespace %q is missing the %s label required for write-back identity", + instance.Namespace, downstreamclient.UpstreamOwnerClusterNameLabel) + } + + // The write-back copy must carry every label the stateful control strategy + // stamps atomically at Instance creation (with a backfill pass converging + // live instances), so absence of any of them is transient. Erroring retries + // with backoff until the labels land, instead of propagating incomplete + // identity upstream where the projection could never be linked back to its + // owners or routed by city/placement. + var missingLabels []string + for _, key := range []string{ + computev1alpha.WorkloadUIDLabel, + computev1alpha.WorkloadDeploymentUIDLabel, + computev1alpha.InstanceIndexLabel, + computev1alpha.WorkloadDeploymentNameLabel, + computev1alpha.CityCodeLabel, + computev1alpha.WorkloadNameLabel, + computev1alpha.PlacementNameLabel, + } { + if instance.Labels[key] == "" { + missingLabels = append(missingLabels, key) } - // Return after the status update so that the next reconcile sees the - // updated QuotaGranted condition before attempting spec changes. 
- return ctrl.Result{}, nil + } + if len(missingLabels) > 0 { + return fmt.Errorf("instance %s/%s is missing linking labels required for write-back: %s", + instance.Namespace, instance.Name, strings.Join(missingLabels, ", ")) } - // Remove the quota scheduling gate once QuotaGranted=True is persisted. - quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) - if quotaGrantedCond != nil && quotaGrantedCond.Status == metav1.ConditionTrue { - if instance.Spec.Controller != nil { - newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) - gateRemoved := false - for _, gate := range instance.Spec.Controller.SchedulingGates { - if gate.Name == instancecontrol.QuotaSchedulingGate.String() { - gateRemoved = true - continue - } - newGates = append(newGates, gate) - } - if gateRemoved { - patch := client.MergeFrom(instance.DeepCopy()) - instance.Spec.Controller.SchedulingGates = newGates - if err := cl.GetClient().Patch(ctx, &instance, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("failed patching quota scheduling gate: %w", err) - } - } + writeBack := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name, + Namespace: instance.Namespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedClusterName, + downstreamclient.UpstreamOwnerNamespaceLabel: upstreamNamespace, + computev1alpha.WorkloadUIDLabel: instance.Labels[computev1alpha.WorkloadUIDLabel], + computev1alpha.WorkloadDeploymentUIDLabel: instance.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + computev1alpha.InstanceIndexLabel: instance.Labels[computev1alpha.InstanceIndexLabel], + computev1alpha.WorkloadDeploymentNameLabel: instance.Labels[computev1alpha.WorkloadDeploymentNameLabel], + computev1alpha.CityCodeLabel: instance.Labels[computev1alpha.CityCodeLabel], + computev1alpha.WorkloadNameLabel: 
instance.Labels[computev1alpha.WorkloadNameLabel], + computev1alpha.PlacementNameLabel: instance.Labels[computev1alpha.PlacementNameLabel], + }, + }, + Spec: instance.Spec, + } + + existing := &computev1alpha.Instance{} + err := r.FederationClient.Get(ctx, client.ObjectKeyFromObject(writeBack), existing) + if apierrors.IsNotFound(err) { + // The federation namespace already exists: the identity Get above read + // it, and the federator guarantees a labeled namespace before any cell + // Instance can exist. If it disappears between the Get and this Create, + // the Create fails NotFound and retries via backoff until the federator + // restores it — creating an unlabeled namespace here would manufacture + // the very corruption the identity checks reject. + if err := r.FederationClient.Create(ctx, writeBack); err != nil { + return fmt.Errorf("failed creating downstream write-back instance: %w", err) + } + writeBack.Status = instance.Status + if err := r.FederationClient.Status().Update(ctx, writeBack); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status after create: %w", err) + } + return nil + } + if err != nil { + return fmt.Errorf("failed getting downstream instance: %w", err) + } + + // Build a comparable map containing only the keys this function owns so that + // Karmada-managed labels on the existing object do not cause spurious updates. + ownedLabels := make(map[string]string, len(writeBack.Labels)) + for k := range writeBack.Labels { + ownedLabels[k] = existing.Labels[k] + } + + if !apiequality.Semantic.DeepEqual(existing.Spec, instance.Spec) || + !apiequality.Semantic.DeepEqual(ownedLabels, writeBack.Labels) { + existing.Spec = instance.Spec + // Merge writeBack.Labels into existing.Labels. Only keys owned by + // writeBackToUpstream are written; any labels Karmada or other actors + // have placed on the downstream object are preserved. 
+ if existing.Labels == nil { + existing.Labels = make(map[string]string) + } + maps.Copy(existing.Labels, writeBack.Labels) + if err := r.FederationClient.Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance: %w", err) } } - return ctrl.Result{}, nil + if !apiequality.Semantic.DeepEqual(existing.Status, instance.Status) { + existing.Status = instance.Status + if err := r.FederationClient.Status().Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status: %w", err) + } + } + + return nil } -func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName string, instance *computev1alpha.Instance) (*metav1.Condition, error) { +// reconcileQuotaClaim attempts to create or observe a ResourceClaim for the +// given instance. It returns: +// - (nil, nil) — no grant decision yet: the claim was just created or +// carries no Granted condition; caller sets PendingEvaluation +// - (condition, nil) — terminal condition (True/False/Unknown from claim or failure) +// - (condition, err) — condition to write + transient error to requeue with backoff +// +// The condition's Type field is always InstanceQuotaGranted when set by this function +// to distinguish it from ResourceClaim conditions returned directly. 
+func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (*metav1.Condition, error) { + if r.quotaClientManager == nil { + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaDisabled, + Message: "Quota enforcement disabled: no credential configured", + }, nil + } + logger := log.FromContext(ctx) - claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) + projectID, err := r.resolveProjectID(ctx, clusterName, instance) + if err != nil { + // Transient (namespace API unreachable) or permanent (identity labels + // missing — misconfiguration). Either way the failure is surfaced: + // structured condition + warning event + error return, instead of + // silently parking the instance at PendingEvaluation. + msg := fmt.Sprintf("Could not resolve project ID: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonProjectIDUnresolvable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, + Message: msg, + }, fmt.Errorf("resolving project ID for instance %s/%s: %w", instance.Namespace, instance.Name, err) + } + + projectClient, err := r.quotaClientManager.ClientForProject(ctx, projectID, r.scheme) + if err != nil { + msg := fmt.Sprintf("Failed to build quota client for project %q: %v", projectID, err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonBackendUnavailable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: 
metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, + Message: msg, + }, fmt.Errorf("failed getting quota client for project %q: %w", projectID, err) + } + + claimNamespace, err := r.resolveProjectNamespace(ctx, clusterName, instance) + if err != nil { + msg := fmt.Sprintf("Could not resolve project namespace: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonProjectIDUnresolvable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, + Message: msg, + }, fmt.Errorf("resolving project namespace for instance %s/%s: %w", instance.Namespace, instance.Name, err) + } + + claimName := quotaClaimName(instance) requests := []quotav1alpha1.ResourceRequest{ { - ResourceType: "compute.datumapis.com/instances", + ResourceType: quotaResourceTypeInstances, Amount: 1, }, } @@ -213,41 +816,117 @@ func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterNam desired := &quotav1alpha1.ResourceClaim{ ObjectMeta: metav1.ObjectMeta{ Name: claimName, - Namespace: instance.Namespace, + Namespace: claimNamespace, + Labels: map[string]string{ + instanceQuotaClaimSourceLabel: r.edgeClusterName, + instanceQuotaClaimNamespaceLabel: instance.Namespace, + }, }, Spec: quotav1alpha1.ResourceClaimSpec{ ConsumerRef: quotav1alpha1.ConsumerRef{ - APIGroup: "resourcemanager.miloapis.com", - Kind: "Project", - Name: clusterName, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, }, ResourceRef: quotav1alpha1.UnversionedObjectReference{ - APIGroup: "compute.datumapis.com", - Kind: "Instance", - Name: instance.Name, - Namespace: instance.Namespace, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, }, Requests: 
requests, }, } var existing quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: desired.Namespace, Name: desired.Name}, &existing); err != nil { - if !apierrors.IsNotFound(err) { - return nil, fmt.Errorf("failed getting resource claim: %w", err) - } - if err := r.managementCluster.GetClient().Create(ctx, desired); err != nil { - return nil, fmt.Errorf("failed creating resource claim: %w", err) + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: desired.Namespace, Name: desired.Name}, &existing); err != nil { + if apierrors.IsNotFound(err) { + // Claim doesn't exist yet — attempt to create it. + createErr := projectClient.Create(ctx, desired) + if createErr == nil { + return nil, nil + } + return r.classifyCreateError(instance, projectID, claimNamespace, createErr) } - return nil, nil + // GET itself failed — treat as backend unavailable. + msg := fmt.Sprintf("Quota backend unreachable getting ResourceClaim: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonBackendUnavailable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, + Message: msg, + }, fmt.Errorf("failed getting resource claim: %w", err) } grantedCondition := apimeta.FindStatusCondition(existing.Status.Conditions, quotav1alpha1.ResourceClaimGranted) return grantedCondition, nil } +// classifyCreateError maps a ResourceClaim creation error to a structured +// QuotaGranted condition with a specific reason, emits a Kubernetes event, and +// increments the appropriate metric counter. 
+func (r *InstanceReconciler) classifyCreateError( + instance *computev1alpha.Instance, + projectID, claimNamespace string, + err error, +) (*metav1.Condition, error) { + var reason, metricLabel, msg string + + switch { + case apierrors.IsNotFound(err): + // 404 on Create: either the project control plane path doesn't exist + // (project deleted) or the namespace doesn't exist yet. + if claimNamespace != "" { + reason = computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound + metricLabel = quotametrics.ReasonNamespaceNotFound + msg = fmt.Sprintf("Quota claim namespace %q not found on project %q control plane", claimNamespace, projectID) + } else { + reason = computev1alpha.InstanceQuotaGrantedReasonProjectNotFound + metricLabel = quotametrics.ReasonProjectNotFound + msg = fmt.Sprintf("Milo project %q not found", projectID) + } + case apierrors.IsForbidden(err) || apierrors.IsInvalid(err): + // 403/422: quota admission plugin rejected the claim. + reason = computev1alpha.InstanceQuotaGrantedReasonMisconfigured + metricLabel = quotametrics.ReasonMisconfigured + msg = fmt.Sprintf("Quota admission rejected ResourceClaim for project %q: %v", projectID, err) + default: + // Connectivity or server error — treat as backend unavailable. + reason = computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable + metricLabel = quotametrics.ReasonBackendUnavailable + msg = fmt.Sprintf("Quota backend unreachable creating ResourceClaim: %v", err) + } + + r.recorder.Event(instance, corev1.EventTypeWarning, reason, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(metricLabel).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: reason, + Message: msg, + }, fmt.Errorf("failed creating resource claim: %w", err) +} + +// resolveInstanceResources determines the vCPU and memory amounts to claim +// for an instance. 
Explicit sizing always takes precedence over the instance +// type catalog, so a workload that overrides container limits is accounted at +// its actual resource footprint rather than the catalog baseline. +// +// Precedence order: +// 1. Sandbox container Limits (sum across all containers) — all containers +// must have both cpu and memory Limits for this path to succeed. +// 2. Instance-level Resources.Requests — both cpu and memory must be present. +// 3. instanceTypeCatalog lookup by instanceType — used for the common case +// where a workload is sized only by instanceType with no explicit limits. +// +// Returns (0, 0, false) when none of the above yield a complete sizing, so +// the caller falls back to claiming only the instance count. func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores int64, memMiB int64, resolved bool) { rt := instance.Spec.Runtime + + // Path 1: explicit per-container Limits — most specific, wins if fully set. if rt.Sandbox != nil { var totalCPU resource.Quantity var totalMem resource.Quantity @@ -266,18 +945,60 @@ func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores totalCPU.Add(cpu) totalMem.Add(mem) } - if !allSet || len(rt.Sandbox.Containers) == 0 { - return 0, 0, false + if allSet && len(rt.Sandbox.Containers) > 0 { + return totalCPU.MilliValue(), totalMem.Value() / (1024 * 1024), true } - return totalCPU.MilliValue(), totalMem.Value() / (1024 * 1024), true + // Containers exist but limits are incomplete — fall through so the + // instance-level Requests and instanceType catalog paths can still + // yield a sizing. } + // Path 2: instance-level resource requests. 
cpu, hasCPU := rt.Resources.Requests[corev1.ResourceCPU] mem, hasMem := rt.Resources.Requests[corev1.ResourceMemory] - if !hasCPU || !hasMem { - return 0, 0, false + if hasCPU && hasMem { + return cpu.MilliValue(), mem.Value() / (1024 * 1024), true + } + + // Path 3: instanceType catalog — handles the typical production case where + // instanceType is the only sizing signal and no explicit limits are set. + if rt.Resources.InstanceType != "" { + if spec, ok := instanceTypeCatalog[rt.Resources.InstanceType]; ok { + return spec.CPUMillicores, spec.MemoryMiB, true + } + } + + return 0, 0, false +} + +// instanceBlockingReasonPriority ranks Instance blocking reasons so the most +// specific, user-actionable cause wins when several conditions are unsatisfied. +// Higher numbers are more specific. Reasons absent from the table rank 0. +// +// 0 - unknown/default +// 1 - Provisioning (transient runtime startup) +// 3 - PendingQuota (operator action may be needed) +// 5 - ImageUnavailable / InstanceCrashing / ConfigurationError +// (hard runtime error, user-actionable) +// 7 - NetworkFailedToCreate (hard infra error) +func instanceBlockingReasonPriority(reason string) int { + switch reason { + case computev1alpha.InstanceReadyReasonProvisioning: + return 1 + case computev1alpha.InstanceProgrammedReasonPendingQuota: + return 3 + case computev1alpha.InstanceReadyReasonImageUnavailable, + computev1alpha.InstanceReadyReasonInstanceCrashing, + computev1alpha.InstanceReadyReasonConfigurationError: + // Hard runtime errors are user-actionable (wrong image, crashing app, bad + // config) and rank highest among non-infra reasons so they are not buried + // under transient startup/quota reasons. 
+ return 5 + case reasonNetworkFailedToCreate: + return 7 + default: + return 0 } - return cpu.MilliValue(), mem.Value() / (1024 * 1024), true } // networkFailureChecker is a function that checks if a network creation failure @@ -327,7 +1048,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, ObservedGeneration: instance.Generation, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, } } else { readyCondition = readyCondition.DeepCopy() @@ -344,8 +1065,9 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( return false, fmt.Errorf("failed checking for network creation failure: %w", err) } + readyCondition.Status = metav1.ConditionFalse if networkCreationFailure { - readyCondition.Reason = "NetworkFailedToCreate" + readyCondition.Reason = reasonNetworkFailedToCreate readyCondition.Message = networkCreationFailureMessage } else { readyCondition.Reason = computev1alpha.InstanceReadyReasonSchedulingGatesPresent @@ -360,41 +1082,120 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if programmedCondition == nil || programmedCondition.Status != metav1.ConditionTrue { logger.Info("instance is not programmed", "instance", instance.Name) - readyCondition.Reason = computev1alpha.InstanceProgrammedReasonPendingProgramming - if programmedCondition != nil && programmedCondition.Reason != pendingReason { - readyCondition.Reason = programmedCondition.Reason + // Surface the most specific provider sub-condition rather than a generic + // "Instance has not been programmed". A provider reason like + // ImageUnavailable (set on the Available condition while Programmed is + // still Unknown) must surface on Ready with its actionable message. + // + // Two tiers are tracked: + // - bestKnown: the best candidate from the priority table (ranked 1-7). 
+ // - fallback: the Programmed condition's own reason/message when it has + // one but it is not in the priority table (e.g. a provider + // writes a custom Programmed reason otherwise unknown to + // this controller). Preserves Programmed.Reason → Ready.Reason + // pass-through behavior. + type candidate struct { + status metav1.ConditionStatus + reason string + message string + priority int } - readyCondition.Message = "Instance has not been programmed" - if programmedCondition != nil && programmedCondition.Status != metav1.ConditionUnknown { - readyCondition.Message = programmedCondition.Message + // Generic default — used only when nothing better is found. + fallbackCandidate := candidate{ + status: metav1.ConditionFalse, + reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, + message: msgNotProgrammed, + priority: -1, + } + // Promote the Programmed condition's own reason as a fallback when it is + // more specific than PendingProgramming/Pending but not in the priority + // table. Preserves pass-through for provider-written Programmed reasons. + if programmedCondition != nil && programmedCondition.Reason != pendingReason && + programmedCondition.Reason != computev1alpha.InstanceProgrammedReasonPendingProgramming { + fallbackCandidate = candidate{ + status: programmedCondition.Status, + reason: programmedCondition.Reason, + message: programmedCondition.Message, + priority: 0, + } } + best := fallbackCandidate + consider := func(status metav1.ConditionStatus, reason, message string) { + // A generic "Pending" reason carries no actionable signal; skip it so + // it cannot displace an already-set specific reason from the provider. + if reason == pendingReason { + return + } + p := instanceBlockingReasonPriority(reason) + if p > best.priority { + best = candidate{status: status, reason: reason, message: message, priority: p} + } + } + + // Sub-conditions set by the provider (e.g. 
Available=Unknown/ImageUnavailable) + // may be more specific than the Programmed condition. Consult each one so + // the highest-priority reason wins, regardless of which condition carries it. + for _, cond := range instance.Status.Conditions { + if cond.Status == metav1.ConditionTrue { + // Satisfied conditions are not blocking; skip them. + continue + } + switch cond.Type { + case computev1alpha.InstanceProgrammed, + computev1alpha.InstanceReady, + computev1alpha.InstanceQuotaGranted: + // InstanceProgrammed is handled below; InstanceReady is being set + // now. InstanceQuotaGranted is a gate-level signal evaluated before + // this branch is reached — including it here would let a transient + // PendingEvaluation reason displace the generic not-programmed + // fallback when no provider sub-condition is set yet. + continue + } + consider(cond.Status, cond.Reason, cond.Message) + } + // Also let the Programmed condition itself compete through the priority table + // in case it carries a known reason (e.g. PendingQuota). 
+ if programmedCondition != nil { + consider(programmedCondition.Status, programmedCondition.Reason, programmedCondition.Message) + } + + readyCondition.Status = best.status + readyCondition.Reason = best.reason + readyCondition.Message = best.message + return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } logger.Info("instance is programmed", "instance", instance.Name) - runningCondition := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceAvailable) - if runningCondition == nil || runningCondition.Status != metav1.ConditionTrue { - logger.Info("instance is not running", "instance", instance.Name) + availableCondition := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceAvailable) + if availableCondition == nil || availableCondition.Status != metav1.ConditionTrue { + logger.Info("instance is not available", "instance", instance.Name) - readyCondition.Reason = pendingReason - if runningCondition != nil && runningCondition.Reason != pendingReason { - readyCondition.Reason = runningCondition.Reason + // Propagate the Available condition's reason and message directly — + // including when the status is Unknown — so provider-set reasons like + // ImageUnavailable surface on Ready rather than a generic message. 
+ readyStatus := metav1.ConditionFalse + readyReason := pendingReason + readyMessage := "Instance is not available" + if availableCondition != nil && availableCondition.Reason != pendingReason { + readyStatus = availableCondition.Status + readyReason = availableCondition.Reason + readyMessage = availableCondition.Message } - readyCondition.Message = "Instance is not running" - if runningCondition != nil && runningCondition.Status != metav1.ConditionUnknown { - readyCondition.Message = runningCondition.Message - } + readyCondition.Status = readyStatus + readyCondition.Reason = readyReason + readyCondition.Message = readyMessage return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } readyCondition.Status = metav1.ConditionTrue readyCondition.Reason = computev1alpha.InstanceReadyReasonAvailable - readyCondition.Message = "Instance is ready" + readyCondition.Message = msgInstanceReady return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } @@ -436,38 +1237,111 @@ func (r *InstanceReconciler) checkForNetworkCreationFailure(ctx context.Context, return false, "", nil } +// resolveProjectID delegates to projectIDForInstance; when nil it falls back +// to string(clusterName) (Milo mode). +func (r *InstanceReconciler) resolveProjectID(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (string, error) { + if r.projectIDForInstance != nil { + return r.projectIDForInstance(ctx, clusterName, instance) + } + return string(clusterName), nil +} + +// resolveProjectNamespace delegates to projectNamespaceForInstance; when nil +// it falls back to instance.Namespace (Milo mode). 
+func (r *InstanceReconciler) resolveProjectNamespace(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (string, error) { + if r.projectNamespaceForInstance != nil { + return r.projectNamespaceForInstance(ctx, clusterName, instance) + } + return instance.Namespace, nil +} + +// resolveClusterNameForProject delegates to clusterNameForProject; when nil it +// falls back to multicluster.ClusterName(projectID) (Milo mode). +func (r *InstanceReconciler) resolveClusterNameForProject(projectID string) multicluster.ClusterName { + if r.clusterNameForProject != nil { + return r.clusterNameForProject(projectID) + } + return multicluster.ClusterName(projectID) +} + // SetupWithManager sets up the controller with the Manager. -func (r *InstanceReconciler) SetupWithManager(mgr mcmanager.Manager, managementCluster cluster.Cluster) error { +// +// quotaRestConfig is the REST config used to reach Milo project control planes +// for ResourceClaim management. Pass nil to disable quota accounting. +// +// projectIDForInstance derives the Milo project ID for each reconcile request. +// In Milo mode pass nil (falls back to using ClusterName). In single-cell mode +// pass a function that decodes the project ID from the edge namespace's +// upstream-cluster-name label. +// +// clusterNameForProject maps a project ID back to the multicluster ClusterName. +// In Milo mode pass nil (falls back to ClusterName(projectID)). In single-cell +// mode pass a function that always returns "single". 
+func (r *InstanceReconciler) SetupWithManager( + mgr mcmanager.Manager, + quotaRestConfig *rest.Config, + projectIDForInstance InstanceProjectIDFunc, + projectNamespaceForInstance InstanceProjectNamespaceFunc, + edgeClusterName string, + clusterNameForProject func(projectID string) multicluster.ClusterName, +) error { r.mgr = mgr - r.managementCluster = managementCluster - - // Watch ResourceClaim objects on the management cluster directly, bypassing - // the multicluster clusterInjectingQueue which would overwrite ClusterName. - // Using ctrlsource.TypedKind lets the handler produce mcreconcile.Request - // values with the correct ClusterName taken from claim.Spec.ConsumerRef.Name. - claimSource := ctrlsource.TypedKind( - managementCluster.GetCache(), - &quotav1alpha1.ResourceClaim{}, - handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, claim *quotav1alpha1.ResourceClaim) []mcreconcile.Request { - if claim.Spec.ResourceRef.Kind != "Instance" || claim.Spec.ResourceRef.APIGroup != "compute.datumapis.com" { - return nil - } - return []mcreconcile.Request{ - { - Request: reconcile.Request{ - NamespacedName: types.NamespacedName{ - Name: claim.Spec.ResourceRef.Name, - Namespace: claim.Spec.ResourceRef.Namespace, - }, - }, - ClusterName: claim.Spec.ConsumerRef.Name, - }, - } - }), - ) + r.scheme = mgr.GetLocalManager().GetScheme() + //nolint:staticcheck // GetEventRecorder (new events API) has an incompatible Eventf + // signature (requires related object + action args) that would require migrating + // all emit sites. GetEventRecorderFor remains correct; migration is deferred. 
+ r.recorder = mgr.GetLocalManager().GetEventRecorderFor("instance-controller") + r.edgeClusterName = edgeClusterName + r.projectIDForInstance = projectIDForInstance + r.projectNamespaceForInstance = projectNamespaceForInstance + r.clusterNameForProject = clusterNameForProject + if quotaRestConfig != nil { + if edgeClusterName == "" { + return fmt.Errorf("edgeClusterName must be set when quota enforcement is enabled; set discovery.clusterName in the server config") + } + r.quotaClientManager = quotametrics.New(quotaRestConfig) + } + + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(instanceControllerFinalizer, r); err != nil { + return fmt.Errorf("failed to register finalizer: %w", err) + } + + edgeClusterNameVal := r.edgeClusterName return mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.Instance{}, mcbuilder.WithEngageWithLocalCluster(false)). - WatchesRawSource(claimSource). + Watches( + &quotav1alpha1.ResourceClaim{}, + func(_ multicluster.ClusterName, _ cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc( + func(ctx context.Context, obj client.Object) []mcreconcile.Request { + claim := obj.(*quotav1alpha1.ResourceClaim) + // Map the claim back to its owning Instance. The Instance + // namespace is carried on a label (the claim itself lives in + // the project's quota namespace) and the Instance name is the + // claim name with the resource-kind prefix stripped. 
+ instanceNamespace := claim.GetLabels()[instanceQuotaClaimNamespaceLabel] + if instanceNamespace == "" { + return nil + } + return []mcreconcile.Request{ + { + Request: reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: instanceNamespace, + Name: strings.TrimPrefix(claim.Name, instanceQuotaClaimNamePrefix), + }, + }, + ClusterName: r.resolveClusterNameForProject(claim.Spec.ConsumerRef.Name), + }, + } + }, + ) + }, + mcbuilder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool { + return obj.GetLabels()[instanceQuotaClaimSourceLabel] == edgeClusterNameVal + })), + ). Complete(r) } diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index b356d433..1445ff96 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -3,59 +3,51 @@ package controller import ( "context" "fmt" - "net/http" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/controller/instancecontrol" + 
"go.datum.net/compute/internal/quota" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" ) -// fakeCluster implements cluster.Cluster for testing using a fake client. -type fakeCluster struct { - client client.Client - scheme *runtime.Scheme -} - -func (f *fakeCluster) GetHTTPClient() *http.Client { return nil } -func (f *fakeCluster) GetConfig() *rest.Config { return nil } -func (f *fakeCluster) GetCache() cache.Cache { return nil } -func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.scheme } -func (f *fakeCluster) GetClient() client.Client { return f.client } -func (f *fakeCluster) GetFieldIndexer() client.FieldIndexer { return nil } -func (f *fakeCluster) GetEventRecorderFor(string) record.EventRecorder { return nil } -func (f *fakeCluster) GetRESTMapper() apimeta.RESTMapper { return nil } -func (f *fakeCluster) GetAPIReader() client.Reader { return f.client } -func (f *fakeCluster) Start(context.Context) error { return nil } - -// fakeMCManager is a minimal multicluster manager that returns a single cluster. -type fakeMCManager struct { - clusters map[string]cluster.Cluster -} - -func (m *fakeMCManager) GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) { - cl, ok := m.clusters[clusterName] - if !ok { - return nil, fmt.Errorf("cluster %q not found", clusterName) - } - return cl, nil -} +// Test constants for repeated string literals across controller package tests. 
+const ( + testInstanceName = "test-instance" + testReasonString = "TestReason" + testMessageString = "Test message" + testUIDString = "test-uid" + testInstanceType = "d1-standard-2" + testDefaultPlacement = "default" + testDefaultNamespace = "default" + testEdgeClusterName = "test-edge" + testComputeAPIVersion = "compute.datumapis.com/v1alpha" + testQuotaAPIGroup = "quota.miloapis.com" + testQuotaResource = "resourceclaims" + kindWorkloadDeploymentTest = "WorkloadDeployment" // mirrors kindWorkloadDeployment +) // newTestScheme builds a runtime.Scheme with the types needed for instance reconcile tests. func newTestScheme(t *testing.T) *runtime.Scheme { @@ -79,8 +71,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance without ready condition should create default", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, }, @@ -89,7 +81,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, ObservedGeneration: 1, }, }, @@ -97,8 +89,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance with scheduling gates should set scheduling gates present", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Spec: computev1alpha.InstanceSpec{ @@ -114,7 +106,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, 
ObservedGeneration: 1, LastTransitionTime: metav1.Now(), }, @@ -134,8 +126,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance with scheduling gates and network failure should set network failed", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Spec: computev1alpha.InstanceSpec{ @@ -153,7 +145,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "NetworkFailedToCreate", + Reason: reasonNetworkFailedToCreate, Message: "Network creation failed: timeout", ObservedGeneration: 1, }, @@ -162,8 +154,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance not programmed should set pending programming", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -171,8 +163,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, }, }, }, @@ -181,17 +173,17 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, ObservedGeneration: 1, }, }, { - name: "instance programmed but not running should wait for running", + name: "instance programmed but not available should wait for available", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: 
"default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -200,13 +192,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, }, }, }, @@ -215,8 +207,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, ObservedGeneration: 1, }, }, @@ -224,8 +216,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance fully ready should set ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -234,13 +226,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceAvailableReasonAvailable, - Message: "Instance is running", + Message: msgInstanceAvailable, }, }, }, @@ -250,7 +242,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonAvailable, - Message: 
"Instance is ready", + Message: msgInstanceReady, ObservedGeneration: 1, }, }, @@ -258,8 +250,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "no change when condition already matches", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -268,7 +260,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonAvailable, - Message: "Instance is ready", + Message: msgInstanceReady, ObservedGeneration: 1, LastTransitionTime: metav1.Now(), }, @@ -276,13 +268,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceAvailableReasonAvailable, - Message: "Instance is running", + Message: msgInstanceAvailable, }, }, }, @@ -292,7 +284,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonAvailable, - Message: "Instance is ready", + Message: msgInstanceReady, ObservedGeneration: 1, }, }, @@ -343,8 +335,8 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { name: "quota denied blocks ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -360,14 +352,14 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: 
computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, LastTransitionTime: metav1.Now(), }, { Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceAvailableReasonAvailable, - Message: "Instance is running", + Message: msgInstanceAvailable, LastTransitionTime: metav1.Now(), }, }, @@ -385,8 +377,8 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { name: "quota available does not block ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -402,14 +394,14 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, LastTransitionTime: metav1.Now(), }, { Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceAvailableReasonAvailable, - Message: "Instance is running", + Message: msgInstanceAvailable, LastTransitionTime: metav1.Now(), }, }, @@ -420,15 +412,15 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonAvailable, - Message: "Instance is ready", + Message: msgInstanceReady, }, }, { name: "quota pending unknown does not block ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -448,7 
+440,7 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, }, }, } @@ -491,7 +483,7 @@ func TestReconcileQuota(t *testing.T) { instanceName = "my-instance" ) - claimName := namespace + "--" + instanceName + claimName := instanceQuotaClaimNamePrefix + instanceName const deploymentName = "my-deployment" @@ -501,25 +493,28 @@ func TestReconcileQuota(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: deploymentName, Namespace: namespace, - UID: "test-uid", + UID: testUIDString, }, } } // makeInstance creates a test Instance with an owner reference to the // deployment so that checkForNetworkCreationFailure can look it up. + // Both finalizers are pre-populated so that the finalizer framework does + // not need to add instanceControllerFinalizer on the first reconcile, + // which would cause an early return before quota logic runs. 
makeInstance := func(_ *runtime.Scheme, gates ...computev1alpha.SchedulingGate) *computev1alpha.Instance { return &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ Name: instanceName, - Namespace: namespace, - Finalizers: []string{instanceQuotaFinalizer}, + Namespace: testDefaultNamespace, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, OwnerReferences: []metav1.OwnerReference{ { - APIVersion: "compute.datumapis.com/v1alpha", - Kind: "WorkloadDeployment", + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, Name: deploymentName, - UID: "test-uid", + UID: testUIDString, Controller: func() *bool { b := true; return &b }(), }, }, @@ -529,7 +524,7 @@ func TestReconcileQuota(t *testing.T) { SchedulingGates: gates, }, Runtime: computev1alpha.InstanceRuntimeSpec{ - Resources: computev1alpha.InstanceRuntimeResources{InstanceType: "d1-standard-2"}, + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, }, NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, }, @@ -544,18 +539,21 @@ func TestReconcileQuota(t *testing.T) { }, Spec: quotav1alpha1.ResourceClaimSpec{ ConsumerRef: quotav1alpha1.ConsumerRef{ - APIGroup: "resourcemanager.miloapis.com", - Kind: "Project", + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, Name: clusterName, }, + // ResourceRef points at the Project resource (cluster-scoped), not the + // Instance. The quota admission plugin validates against the + // ResourceRegistration's claimingResources, which only allows + // resourcemanager.miloapis.com/Project. 
ResourceRef: quotav1alpha1.UnversionedObjectReference{ - APIGroup: "compute.datumapis.com", - Kind: "Instance", - Name: instanceName, - Namespace: namespace, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: clusterName, }, Requests: []quotav1alpha1.ResourceRequest{ - {ResourceType: "compute.datumapis.com/instances", Amount: 1}, + {ResourceType: quotaResourceTypeInstances, Amount: 1}, }, }, Status: quotav1alpha1.ResourceClaimStatus{ @@ -572,7 +570,7 @@ func TestReconcileQuota(t *testing.T) { } } - newReconciler := func(t *testing.T, projectObjs []client.Object, mgmtObjs []client.Object) (*InstanceReconciler, client.Client, client.Client) { + newReconciler := func(t *testing.T, projectObjs []client.Object, quotaObjs []client.Object) (*InstanceReconciler, client.Client, client.Client) { t.Helper() s := newTestScheme(t) @@ -582,26 +580,44 @@ func TestReconcileQuota(t *testing.T) { WithStatusSubresource(&computev1alpha.Instance{}). Build() - mgmtClient := fake.NewClientBuilder(). + quotaClient := fake.NewClientBuilder(). WithScheme(s). - WithObjects(mgmtObjs...). + WithObjects(quotaObjs...). WithStatusSubresource("av1alpha1.ResourceClaim{}). Build() mgr := &fakeMCManager{ clusters: map[string]cluster.Cluster{ - clusterName: &fakeCluster{client: projectClient, scheme: s}, + clusterName: newFakeCluster(projectClient), }, } + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + r := &InstanceReconciler{ - mgr: mgr, - managementCluster: &fakeCluster{client: mgmtClient, scheme: s}, + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + // Milo mode: project ID == ClusterName; claim namespace == instance.Namespace. + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + // nil → falls back to instance.Namespace, which is correct for Milo mode. 
+ projectNamespaceForInstance: nil, } - return r, projectClient, mgmtClient + + // Initialize the finalizer registry so that r.finalizers.Finalize is not + // a nil-pointer dereference. SetupWithManager does this in production; in + // tests we replicate the same steps manually. + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + return r, projectClient, quotaClient } - t.Run("quota granted flow: claim granted removes gate and sets QuotaGranted=True", func(t *testing.T) { + t.Run("quota granted flow: claim granted removes gate and sets QuotaGranted=True in single reconcile", func(t *testing.T) { s := newTestScheme(t) instance := makeInstance(s, computev1alpha.SchedulingGate{Name: instancecontrol.NetworkSchedulingGate.String()}, @@ -611,7 +627,10 @@ func TestReconcileQuota(t *testing.T) { r, projectClient, _ := newReconciler(t, []client.Object{instance, makeDeployment()}, []client.Object{claim}) - // First reconcile: sets QuotaGranted=True in status, returns early. + // Single reconcile: sets QuotaGranted=True in status AND removes the + // Quota scheduling gate in the same pass. The early-return-before-gate- + // removal bug required a second reconcile that never arrived because + // ResourceClaims are immutable and local Instances are not watched. _, err := r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) require.NoError(t, err) @@ -623,22 +642,41 @@ func TestReconcileQuota(t *testing.T) { assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) - // Second reconcile: status is already set, so removes the scheduling gate. 
- _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) - require.NoError(t, err) - - require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) - hasQuotaGate := false for _, g := range updated.Spec.Controller.SchedulingGates { if g.Name == instancecontrol.QuotaSchedulingGate.String() { hasQuotaGate = true } } - assert.False(t, hasQuotaGate, "QuotaSchedulingGate should have been removed") + assert.False(t, hasQuotaGate, "QuotaSchedulingGate must be removed in the same reconcile pass as the status update") + }) + + t.Run("ready-condition reconcile error: quota condition persisted before the error returns", func(t *testing.T) { + s := newTestScheme(t) + // A scheduling gate keeps the Ready-condition reconcile on the network + // failure checker path, and the missing owner reference makes that + // checker fail. 
+ instance := makeInstance(s, + computev1alpha.SchedulingGate{Name: instancecontrol.QuotaSchedulingGate.String()}, + ) + instance.OwnerReferences = nil + claim := makeClaim(s, metav1.ConditionTrue, quotav1alpha1.ResourceClaimGrantedReason) + + r, projectClient, _ := newReconciler(t, []client.Object{instance, makeDeployment()}, []client.Object{claim}) + + _, err := r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) + + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, + "QuotaGranted condition must be persisted even when the Ready-condition reconcile fails") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) }) - t.Run("quota exceeded flow: conditions cascade to block Programmed/Running/Ready", func(t *testing.T) { + t.Run("quota exceeded flow: conditions cascade to block Programmed/Available/Ready", func(t *testing.T) { s := newTestScheme(t) instance := makeInstance(s, computev1alpha.SchedulingGate{Name: instancecontrol.NetworkSchedulingGate.String()}, @@ -664,10 +702,10 @@ func TestReconcileQuota(t *testing.T) { assert.Equal(t, metav1.ConditionFalse, programmedCond.Status) assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, programmedCond.Reason) - runningCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceAvailable) - require.NotNil(t, runningCond) - assert.Equal(t, metav1.ConditionFalse, runningCond.Status) - assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, runningCond.Reason) + availableCond := apimeta.FindStatusCondition(updated.Status.Conditions, 
computev1alpha.InstanceAvailable) + require.NotNil(t, availableCond) + assert.Equal(t, metav1.ConditionFalse, availableCond.Status) + assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, availableCond.Reason) readyCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceReady) require.NotNil(t, readyCond) @@ -709,7 +747,9 @@ func TestReconcileQuota(t *testing.T) { } require.NoError(t, mgmtClient.Status().Update(context.Background(), &existingClaim)) - // Second reconcile should see granted claim and update status. + // Second reconcile should see the granted claim, update status to + // QuotaGranted=True, AND remove the gate in the same pass (no third + // reconcile required). _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) require.NoError(t, err) @@ -719,28 +759,41 @@ func TestReconcileQuota(t *testing.T) { require.NotNil(t, quotaCond) assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) - // Third reconcile removes the gate (status is already true, no more status write needed). 
- _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) - require.NoError(t, err) - - require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &recovered)) hasQuotaGate := false for _, g := range recovered.Spec.Controller.SchedulingGates { if g.Name == instancecontrol.QuotaSchedulingGate.String() { hasQuotaGate = true } } - assert.False(t, hasQuotaGate, "QuotaSchedulingGate should have been removed after quota granted") + assert.False(t, hasQuotaGate, "QuotaSchedulingGate should be removed in the same reconcile pass that sets QuotaGranted=True") }) t.Run("deleted before grant: finalizer deletes claim and is removed", func(t *testing.T) { s := newTestScheme(t) now := metav1.Now() - instance := makeInstance(s, - computev1alpha.SchedulingGate{Name: instancecontrol.QuotaSchedulingGate.String()}, - ) - instance.DeletionTimestamp = &now + // Build the instance directly without instanceControllerFinalizer to + // represent the state after the Karmada finalizer has already been + // cleaned up; only the quota finalizer remains to be processed. 
+ instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + DeletionTimestamp: &now, + Finalizers: []string{instanceQuotaFinalizer}, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } claim := makeClaim(s, metav1.ConditionFalse, quotav1alpha1.ResourceClaimPendingReason) @@ -766,3 +819,1628 @@ func TestReconcileQuota(t *testing.T) { } }) } + +// TestQuotaGateRemovedInSingleReconcile is a regression test for the bug where +// the Quota scheduling gate was never removed from an Instance after quota was +// granted. The root cause was an early return in the Reconcile function: when +// reconcileQuotaCondition set QuotaGranted=True (statusChanged=true), the code +// wrote the status update and returned before reaching removeQuotaSchedulingGate. +// Because ResourceClaims are immutable (no further transitions) and local +// Instances are not watched (WithEngageWithLocalCluster(false)), no requeue ever +// arrived — leaving the Quota gate stranded in spec.controller.schedulingGates +// and the projected Instance stuck "Pending (SchedulingGatesPresent)". 
+func TestQuotaGateRemovedInSingleReconcile(t *testing.T) { + const ( + clusterName = "test-project" + namespace = "default" + instanceName = "my-instance" + deploymentName = "my-deployment" + ) + + claimName := instanceQuotaClaimNamePrefix + instanceName + + tests := []struct { + name string + initialGates []computev1alpha.SchedulingGate + expectGateGone bool + }{ + { + name: "Quota gate only: removed in single reconcile when claim is granted", + initialGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + expectGateGone: true, + }, + { + name: "Quota gate plus Network gate: Quota removed, Network preserved", + initialGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.NetworkSchedulingGate.String()}, + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + expectGateGone: true, + }, + { + name: "No gates: no-op, reconcile completes cleanly", + initialGates: []computev1alpha.SchedulingGate{}, + expectGateGone: false, // no gate to begin with + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := newTestScheme(t) + + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + Generation: 1, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: deploymentName, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: tt.initialGates, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: 
metav1.ObjectMeta{Name: deploymentName, Namespace: namespace, UID: testUIDString}, + } + + // ResourceClaim already in QuotaAvailable state — simulates the state + // that triggered the bug: claim already granted but gate still present. + claim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "quota available", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(claim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + clusterName: newFakeCluster(projectClient), + }, + } + + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + // Exactly one reconcile — must be sufficient to both set QuotaGranted=True + // and remove the Quota gate. No second reconcile should be required. + _, err := r.Reconcile(context.Background(), mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, + ClusterName: clusterName, + }) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) + + // QuotaGranted condition must be set to True. + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, "QuotaGranted condition must be present") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) + + // Quota gate must be gone after the single reconcile. 
+ hasQuotaGate := false + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.QuotaSchedulingGate.String() { + hasQuotaGate = true + } + } + if tt.expectGateGone { + assert.False(t, hasQuotaGate, + "Quota gate must be removed in the same reconcile pass as the QuotaGranted=True status write; "+ + "a stranded gate leaves the projected Instance stuck Pending (SchedulingGatesPresent)") + } + + // Network gate (if present) must be preserved — only the Quota gate is + // cleared by InstanceReconciler; NetworkSchedulingGate is owned by + // WorkloadDeploymentReconciler. + for _, g := range updated.Spec.Controller.SchedulingGates { + assert.NotEqual(t, instancecontrol.QuotaSchedulingGate.String(), g.Name, + "Quota gate must not remain after granted claim") + } + }) + } +} + +// TestReconcileQuotaSingleMode verifies that in single-cell mode: +// - the project ID is decoded from the upstream-cluster-name label on the edge +// namespace (not taken from the always-"single" ClusterName) +// - the ResourceClaim is created in the in-project namespace (upstream-namespace +// label, e.g. "default"), not in the edge namespace (ns-abc123) +// - the ResourceRef points at resourcemanager.miloapis.com/Project, not Instance +func TestReconcileQuotaSingleMode(t *testing.T) { + const ( + instanceName = "my-instance" + edgeNS = "ns-abc123" // edge namespace (ns-{uid}) — does NOT exist in project CP + projectID = "datum-cloud" // decoded from "cluster-datum-cloud" + projectNS = "default" // upstream-namespace label value — where claims live + deploymentName = "my-deployment" + ) + + // Claim name is the instance-prefixed Instance name; the claim object itself + // lives in projectNS (the instance's edge namespace is carried on a label). 
+ claimName := instanceQuotaClaimNamePrefix + instanceName + + s := newTestScheme(t) + + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: edgeNS, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: deploymentName, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: deploymentName, Namespace: edgeNS, UID: "test-uid"}, + } + + // ResourceClaim lives in projectNS ("default"), not edgeNS ("ns-abc123"). + // ResourceRef points at the Project resource, matching the ResourceRegistration's + // claimingResources (resourcemanager.miloapis.com/Project only). 
+ claim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: projectNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "quota granted", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + // The quota client is keyed by projectID ("datum-cloud"), matching what + // projectIDForInstance returns after decoding "cluster-datum-cloud". + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(claim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). + Build() + + qm := quota.New(nil) + qm.StoreClient(projectID, quotaClient) + + const singleCluster = "single" + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + singleCluster: newFakeCluster(projectClient), + }, + } + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: singleCluster, + // Single-cell mode: project ID decoded from upstream-cluster-name label. + // Simulates what cmd/main.go does for "cluster-datum-cloud" → "datum-cloud". + projectIDForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return projectID, nil + }, + // Single-cell mode: claim namespace comes from upstream-namespace label. 
+ // Simulates what cmd/main.go does by reading the edge namespace labels. + projectNamespaceForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return projectNS, nil + }, + // Single-cell mode: watch map func must always return "single". + clusterNameForProject: func(_ string) multicluster.ClusterName { + return singleCluster + }, + } + + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + req := mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: edgeNS, Name: instanceName}}, + ClusterName: singleCluster, + } + + _, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: edgeNS, Name: instanceName}, &updated)) + + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, "QuotaGranted condition must be set") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status, "quota should be granted in single mode") + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) + + // Verify clusterNameForProject always returns "single" so the watch map func + // never enqueues an unknown cluster name. + assert.Equal(t, multicluster.ClusterName(singleCluster), r.resolveClusterNameForProject(projectID)) + assert.Equal(t, multicluster.ClusterName(singleCluster), r.resolveClusterNameForProject("any-other-project")) + + // Verify resolveProjectNamespace returns the in-project namespace, not the edge namespace. 
+ resolvedNS, resolveErr := r.resolveProjectNamespace(context.Background(), singleCluster, instance) + require.NoError(t, resolveErr) + assert.Equal(t, projectNS, resolvedNS, "claim namespace must be the in-project namespace, not the edge namespace") +} + +// TestReconcileQuotaFailureModes verifies that infrastructure failures in the +// quota path set specific QuotaGranted=False conditions (fail-closed) rather +// than silently allowing workloads to schedule. +func TestReconcileQuotaFailureModes(t *testing.T) { + const ( + testProject = "test-project" + testNS = "default" + testInstance = "my-instance" + testDeployment = "my-deployment" + ) + + makeInstance := func() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstance, + Namespace: testNS, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: testDeployment, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + } + + makeDeployment := func() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: testDeployment, Namespace: testNS, UID: testUIDString}, + } + } + + newReconcilerWithInterceptor := func( + t *testing.T, + funcs interceptor.Funcs, + fakeRecorder *record.FakeRecorder, + ) (*InstanceReconciler, client.Client) { + t.Helper() + s := newTestScheme(t) + + projectClient := 
fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithInterceptorFuncs(funcs). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + return r, projectClient + } + + reconcileReq := func() mcreconcile.Request { + return mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: testNS, Name: testInstance}}, + ClusterName: testProject, + } + } + + t.Run("FM-2: backend unreachable sets QuotaBackendUnavailable", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return fmt.Errorf("connection refused") + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + // Reconcile returns error for transient failures. 
+ require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, cond.Reason) + + // Event should have been emitted. + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable) + default: + t.Error("expected a Warning event for backend unavailable, got none") + } + }) + + // FM-4/FM-5: 404 on Create maps to NamespaceNotFound when the claim namespace + // is known (the more common case for project-exists-but-namespace-absent), and + // to ProjectNotFound when the namespace itself is empty (project CP path missing). + t.Run("FM-5: 404 on Create with known namespace sets QuotaNamespaceNotFound", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + notFoundErr := apierrors.NewNotFound( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim") + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return notFoundErr + }, + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return notFoundErr + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, 
computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + // claimNamespace == testNS (non-empty) → NamespaceNotFound, not ProjectNotFound. + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound, cond.Reason, + "404 on Create with known namespace should map to NamespaceNotFound") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound) + default: + t.Error("expected a Warning event for namespace not found, got none") + } + }) + + t.Run("FM-6: 403 on Create sets QuotaMisconfigured", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + forbiddenErr := apierrors.NewForbidden( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim", + fmt.Errorf("ResourceRegistration not found")) + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return apierrors.NewNotFound( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim") + }, + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return forbiddenErr + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonMisconfigured, cond.Reason, + "403 on Create should map to Misconfigured") + + select { + case event := <-fakeRecorder.Events: + 
assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonMisconfigured) + default: + t.Error("expected a Warning event for misconfigured quota, got none") + } + }) + + t.Run("FM-7: claim pending with no budget sets QuotaNoBudget", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + claimName := instanceQuotaClaimNamePrefix + testInstance + pendingClaim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: testNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: testProject, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: testProject, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionFalse, + Reason: quotav1alpha1.ResourceClaimPendingReason, + Message: "No AllowanceBucket configured", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(pendingClaim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err, "pending-no-budget is not a transient error — no requeue needed") + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionUnknown, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonNoBudget, cond.Reason, + "pending claim with no budget should use NoBudget reason, not PendingEvaluation") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonNoBudget) + default: + t.Error("expected a Warning event for no budget, got none") + } + }) + + t.Run("quota disabled: quotaClientManager nil sets QuotaDisabled (not QuotaAvailable)", func(t *testing.T) { + s := newTestScheme(t) + instance := makeInstance() + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: nil, // explicitly disabled + edgeClusterName: testEdgeClusterName, + recorder: record.NewFakeRecorder(10), + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaDisabled, cond.Reason, + "intentionally disabled quota should use QuotaDisabled reason") + }) + + t.Run("observedGeneration guard: stale True condition does not remove gate for new generation", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + // Instance at generation 2 with a stale QuotaGranted=True from generation 1. + instance := makeInstance() + instance.Generation = 2 + instance.Status.Conditions = []metav1.Condition{ + { + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, + Message: "quota granted (generation 1)", + ObservedGeneration: 1, // stale — does not match instance.Generation=2 + LastTransitionTime: metav1.Now(), + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, makeDeployment()). 
+ WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + claimName := instanceQuotaClaimNamePrefix + testInstance + grantedClaim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: testNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: testProject}, + ResourceRef: quotav1alpha1.UnversionedObjectReference{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: testProject}, + Requests: []quotav1alpha1.ResourceRequest{{ResourceType: quotaResourceTypeInstances, Amount: 1}}, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "granted", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(grantedClaim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + // Single reconcile: reconcileQuotaCondition writes QuotaGranted=True with + // ObservedGeneration=2 into the in-memory instance, status is persisted, + // then removeQuotaSchedulingGate reads the in-memory condition (gen=2 == + // instance.Generation=2) and removes the gate — all in one pass. 
+ _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + hasGate := false + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.QuotaSchedulingGate.String() { + hasGate = true + } + } + assert.False(t, hasGate, "gate should be removed in the same reconcile that refreshes the condition to current generation") + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, int64(2), cond.ObservedGeneration, "condition must reflect current generation") + }) + + t.Run("FM-1: missing identity label sets ProjectIDUnresolvable and errors", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, fake.NewClientBuilder().WithScheme(s).Build()) + + // Mirrors the single-mode resolver contract: the edge namespace exists + // but was never stamped with the cluster-name identity label. 
+ identityErr := fmt.Errorf("edge namespace %q is missing label %q: %w", + testNS, downstreamclient.UpstreamOwnerClusterNameLabel, errProjectIdentityUnresolvable) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return "", identityErr + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err, "unresolvable identity must surface as an error, not a silent PendingEvaluation park") + require.ErrorIs(t, err, errProjectIdentityUnresolvable) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, cond.Reason) + assert.Contains(t, cond.Message, downstreamclient.UpstreamOwnerClusterNameLabel, + "condition message must name the missing label") + assert.Contains(t, cond.Message, testNS, + "condition message must name the edge namespace") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable) + default: + t.Error("expected a Warning event for unresolvable project identity, got none") + } + }) +} + +// TestReconcileDeletionProjectIdentity verifies the deletion-path tradeoff for +// project-identity resolution: unresolvable identity (missing namespace labels, +// a misconfiguration no retry fixes) must not wedge deletion — claim cleanup is 
+// skipped and the claim may leak until Milo GC — while transient resolution +// failures retry rather than risking an orphaned claim. +func TestReconcileDeletionProjectIdentity(t *testing.T) { + const ( + clusterName = "test-project" + namespace = "default" + instanceName = "my-instance" + ) + claimName := instanceQuotaClaimNamePrefix + instanceName + + makeDeletingInstance := func() *computev1alpha.Instance { + now := metav1.Now() + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + DeletionTimestamp: &now, + Finalizers: []string{instanceQuotaFinalizer}, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + } + + makeClaim := func() *quotav1alpha1.ResourceClaim { + return "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName}, + ResourceRef: quotav1alpha1.UnversionedObjectReference{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName}, + Requests: []quotav1alpha1.ResourceRequest{{ResourceType: quotaResourceTypeInstances, Amount: 1}}, + }, + } + } + + newReconciler := func(t *testing.T, projectIDFn InstanceProjectIDFunc, rec record.EventRecorder) (*InstanceReconciler, client.Client, client.Client) { + t.Helper() + s := newTestScheme(t) + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeDeletingInstance()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeClaim()). + WithStatusSubresource("av1alpha1.ResourceClaim{}). 
+ Build() + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + clusterName: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: rec, + projectIDForInstance: projectIDFn, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + return r, projectClient, quotaClient + } + + req := mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, + ClusterName: clusterName, + } + + t.Run("unresolvable identity: deletion proceeds, claim cleanup skipped", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + identityErr := fmt.Errorf("edge namespace %q is missing label %q: %w", + namespace, downstreamclient.UpstreamOwnerClusterNameLabel, errProjectIdentityUnresolvable) + r, projectClient, quotaClient := newReconciler(t, + func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return "", identityErr + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), req) + require.NoError(t, err, "unresolvable identity must not wedge deletion") + + // Finalizer removed; the fake client garbage collects the object once the + // last finalizer clears, so accept either a clean object or NotFound. + var updated computev1alpha.Instance + getErr := projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated) + if getErr != nil { + assert.True(t, apierrors.IsNotFound(getErr), "unexpected error getting instance after finalizer removal") + } else { + assert.NotContains(t, updated.Finalizers, instanceQuotaFinalizer) + } + + // Claim cleanup skipped — the claim leaks until Milo GC removes it. 
+ var claim quotav1alpha1.ResourceClaim + require.NoError(t, quotaClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: claimName}, &claim), + "claim must be left in place when identity is unresolvable") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, "QuotaClaimOrphaned") + default: + t.Error("expected a QuotaClaimOrphaned event, got none") + } + }) + + t.Run("transient resolution failure: reconcile errors and retries", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + r, projectClient, quotaClient := newReconciler(t, + func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return "", fmt.Errorf("connection refused") + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), req) + require.Error(t, err, "transient failures must retry rather than orphan the claim") + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) + assert.Contains(t, updated.Finalizers, instanceQuotaFinalizer, + "finalizer must stay until claim cleanup succeeds") + + var claim quotav1alpha1.ResourceClaim + require.NoError(t, quotaClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: claimName}, &claim)) + + select { + case event := <-fakeRecorder.Events: + t.Errorf("no orphan event expected for a transient failure, got %q", event) + default: + } + }) +} + +// TestQuotaPendingRequeueAfter verifies the backing-off safety-net requeue used +// while an instance's quota claim is still pending: 1s for the first minute, then +// 15s, then 60s after 5m, then 300s after 10m; and no requeue once granted. 
+func TestQuotaPendingRequeueAfter(t *testing.T) { + base := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC) + + // created is the instance creation time; quota elapsed is measured from it + // (NOT the condition's LastTransitionTime, which stays at the 1970 default + // while quota is pending). The condition LastTransitionTime here is + // deliberately left at the 1970 zero value to mirror that production reality. + withQuota := func(s metav1.ConditionStatus, created time.Time) *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + CreationTimestamp: metav1.NewTime(created), + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{{ + Type: computev1alpha.InstanceQuotaGranted, + Status: s, + Reason: "PendingEvaluation", + }}, + }, + } + } + + tests := []struct { + name string + inst *computev1alpha.Instance + now time.Time + want time.Duration + }{ + {"granted -> no requeue", withQuota(metav1.ConditionTrue, base), base.Add(time.Hour), 0}, + {"no quota condition -> no requeue", &computev1alpha.Instance{}, base, 0}, + {"just pending -> 1s", withQuota(metav1.ConditionUnknown, base), base.Add(5 * time.Second), quotaPendingRequeueFast}, + {"59s -> 1s", withQuota(metav1.ConditionUnknown, base), base.Add(59 * time.Second), quotaPendingRequeueFast}, + {"60s boundary -> 15s", withQuota(metav1.ConditionUnknown, base), base.Add(60 * time.Second), quotaPendingRequeueMedium}, + {"3m -> 15s", withQuota(metav1.ConditionUnknown, base), base.Add(3 * time.Minute), quotaPendingRequeueMedium}, + {"5m boundary -> 60s", withQuota(metav1.ConditionUnknown, base), base.Add(5 * time.Minute), quotaPendingRequeueSlow}, + {"8m -> 60s", withQuota(metav1.ConditionUnknown, base), base.Add(8 * time.Minute), quotaPendingRequeueSlow}, + {"10m boundary -> 300s", withQuota(metav1.ConditionUnknown, base), base.Add(10 * time.Minute), quotaPendingRequeueIdle}, + {"1h -> 300s", withQuota(metav1.ConditionUnknown, base), base.Add(time.Hour), 
quotaPendingRequeueIdle}, + {"denied(False) still polls", withQuota(metav1.ConditionFalse, base), base.Add(2 * time.Minute), quotaPendingRequeueMedium}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, quotaPendingRequeueAfter(tc.inst, tc.now)) + }) + } +} + +// Shared literals for the instance-sizing / blocking-reason tests below. +const ( + testContainerName = "app" + testContainerImage = "test/image:latest" +) + +// TestReconcileInstanceReadyCondition_ProviderSubConditionSurfacing verifies +// that provider-set sub-condition reasons (e.g. ImageUnavailable written by the +// unikraft provider onto the Available condition) surface on Ready with both the +// reason AND the message preserved — even when the sub-condition status is +// Unknown (the normal state for a retriable image-pull failure). +// +// This guards against Ready carrying a generic message that discards the +// actionable provider reason. +func TestReconcileInstanceReadyCondition_ProviderSubConditionSurfacing(t *testing.T) { + // These messages mirror the exact strings that translateWaitingReason in the + // unikraft provider writes. Both the reason AND the message must reach Ready. 
+ const ( + msgImageUnavailable = "The instance image could not be pulled" + msgInstanceCrashing = "The instance is repeatedly failing to start" + msgConfigError = "The instance could not be started due to a configuration error" + msgProvisioning = "Instance is provisioning" + msgProgrammingInProgress = "Instance is being programmed" + ) + + noGates := func(inst *computev1alpha.Instance) *computev1alpha.Instance { return inst } + withQuotaGranted := func(inst *computev1alpha.Instance) *computev1alpha.Instance { + inst.Status.Conditions = append(inst.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, + Message: "Quota allocated", + }) + return inst + } + + tests := []struct { + name string + instance *computev1alpha.Instance + wantStatus metav1.ConditionStatus + wantReason string + wantMessage string + }{ + { + // The key scenario from the design: provider writes Available=Unknown/ + // ImageUnavailable while Programmed is still Unknown/ProgrammingInProgress. + // Ready must carry ImageUnavailable + the actionable message, NOT the + // generic "Instance has not been programmed". + name: "image_pull_failure_surfaces_on_ready", + instance: withQuotaGranted(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + Message: msgProgrammingInProgress, + }, + { + // Provider sets Available=Unknown/ImageUnavailable when the + // container enters an image-pull waiting state. 
+ Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonImageUnavailable, + Message: msgImageUnavailable, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonImageUnavailable, + wantMessage: msgImageUnavailable, + }, + { + // Even while Programmed is Unknown, Ready must surface the provider + // sub-condition's reason and message; the generic PendingProgramming/ + // msgNotProgrammed pair is reserved for instances with no more + // specific signal. + name: "provider_reason_wins_over_generic_message_while_programmed_unknown", + instance: withQuotaGranted(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + Message: msgProgrammingInProgress, + }, + { + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonImageUnavailable, + Message: msgImageUnavailable, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonImageUnavailable, + wantMessage: msgImageUnavailable, + }, + { + // When both a transient Provisioning and ImageUnavailable are present, + // ImageUnavailable (priority 5) must win over Provisioning (priority 1). 
+ name: "image_unavailable_beats_transient_provisioning", + instance: noGates(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonProvisioning, + Message: msgProvisioning, + }, + { + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonImageUnavailable, + Message: msgImageUnavailable, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonImageUnavailable, + wantMessage: msgImageUnavailable, + }, + { + // When no specific provider sub-condition exists but Programmed carries + // a specific reason (ProgrammingInProgress), that reason should + // pass-through to Ready. The generic msgNotProgrammed fallback is only + // used when Programmed is absent or carries only a generic "Pending" reason. + name: "programmed_in_progress_passes_through_when_no_provider_sub_condition", + instance: noGates(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + Message: msgProgrammingInProgress, + }, + }, + }, + }), + // ProgrammingInProgress is more specific than PendingProgramming and + // passes through from Programmed → Ready. + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + wantMessage: msgProgrammingInProgress, + }, + { + // True generic fallback: no Programmed condition at all. 
The default + // PendingProgramming/msgNotProgrammed must be emitted. + name: "generic_fallback_when_programmed_condition_absent", + instance: noGates(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + }, + }), + wantStatus: metav1.ConditionFalse, + wantReason: computev1alpha.InstanceProgrammedReasonPendingProgramming, + wantMessage: msgNotProgrammed, + }, + { + // InstanceCrashing: terminal-ish (not retried indefinitely by the user, + // they must fix the app). Status=Unknown from provider → Ready=Unknown. + name: "instance_crashing_surfaces_on_ready", + instance: withQuotaGranted(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + Message: msgProgrammingInProgress, + }, + { + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonInstanceCrashing, + Message: msgInstanceCrashing, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonInstanceCrashing, + wantMessage: msgInstanceCrashing, + }, + { + // ConfigurationError: provider could not start the container due to a + // spec/config issue. User must correct the workload. 
+ name: "configuration_error_surfaces_on_ready", + instance: withQuotaGranted(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + Message: msgProgrammingInProgress, + }, + { + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonConfigurationError, + Message: msgConfigError, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonConfigurationError, + wantMessage: msgConfigError, + }, + { + // When Programmed=True but Available=Unknown/ImageUnavailable, the + // available-not-true branch must also propagate the provider reason+message. + name: "image_unavailable_on_available_condition_programmed_true", + instance: withQuotaGranted(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceProgrammedReasonProgrammed, + Message: msgInstanceProgrammed, + }, + { + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonImageUnavailable, + Message: msgImageUnavailable, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonImageUnavailable, + wantMessage: msgImageUnavailable, + }, + } + + noNetworkFailure := func(_ context.Context, _ client.Client, _ *computev1alpha.Instance) (bool, string, error) { + return false, "", nil + } + + for _, tt := range tests { + 
t.Run(tt.name, func(t *testing.T) { + r := &InstanceReconciler{} + _, err := r.reconcileInstanceReadyCondition(context.Background(), nil, tt.instance, noNetworkFailure) + require.NoError(t, err) + + ready := apimeta.FindStatusCondition(tt.instance.Status.Conditions, computev1alpha.InstanceReady) + require.NotNil(t, ready, "Ready condition must be set") + assert.Equal(t, tt.wantStatus, ready.Status, "Ready.Status mismatch") + assert.Equal(t, tt.wantReason, ready.Reason, "Ready.Reason mismatch") + assert.Equal(t, tt.wantMessage, ready.Message, "Ready.Message mismatch") + }) + } +} + +// TestResolveInstanceResources verifies the three-tier sizing precedence: +// explicit container Limits > instance-level Requests > instanceType catalog. +func TestResolveInstanceResources(t *testing.T) { + // d1CPUMillicores and d1MemMiB are the canonical catalog values for datumcloud/d1-standard-2 + // (1 vCPU = 1000 millicores, 2 GiB = 2048 MiB) — the platform-declared quota + // size for the instance type. + const ( + d1CPUMillicores = int64(1000) + d1MemMiB = int64(2048) + ) + + cpu500m := resource.MustParse("500m") + cpu1 := resource.MustParse("1") + mem256Mi := resource.MustParse("256Mi") + mem512Mi := resource.MustParse("512Mi") + + makeContainerResources := func(cpu, mem resource.Quantity) *computev1alpha.ContainerResourceRequirements { + return &computev1alpha.ContainerResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: cpu, + corev1.ResourceMemory: mem, + }, + } + } + + tests := []struct { + name string + instance *computev1alpha.Instance + wantCPU int64 + wantMem int64 + wantResolved bool + }{ + { + // Common production case: instanceType only, no explicit limits. + // resolveInstanceResources must consult the catalog and return the + // d1-standard-2 values so vcpus + memory are included in the claim.
+ name: "instanceType only: d1-standard-2 resolves from catalog", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + InstanceType: instanceTypeD1Standard2, + }, + }, + }, + }, + wantCPU: d1CPUMillicores, + wantMem: d1MemMiB, + wantResolved: true, + }, + { + // Explicit container Limits take precedence over the catalog so that + // a workload with custom sizing is accounted at its actual footprint. + name: "explicit container limits override catalog", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + InstanceType: instanceTypeD1Standard2, + }, + Sandbox: &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: testContainerName, + Image: testContainerImage, + Resources: makeContainerResources(cpu500m, mem256Mi), + }, + { + Name: "sidecar", + Image: "test/sidecar:latest", + Resources: makeContainerResources(cpu500m, mem256Mi), + }, + }, + }, + }, + }, + }, + // Two containers each contributing 500m CPU + 256 MiB → 1000m + 512 MiB. + wantCPU: 1000, + wantMem: 512, + wantResolved: true, + }, + { + // A single container with full cpu+memory Limits; no instanceType needed. + name: "single container limits, no instanceType", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Sandbox: &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: testContainerName, + Image: testContainerImage, + Resources: makeContainerResources(cpu1, mem512Mi), + }, + }, + }, + }, + }, + }, + wantCPU: 1000, + wantMem: 512, + wantResolved: true, + }, + { + // Instance-level Requests (no sandbox, no instanceType) use path 2.
+ name: "instance-level resources.requests resolve correctly", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: cpu1, + corev1.ResourceMemory: mem512Mi, + }, + }, + }, + }, + }, + wantCPU: 1000, + wantMem: 512, + wantResolved: true, + }, + { + // An unknown instanceType with no explicit sizing must not fabricate + // values; the caller falls back to claiming instance count only. + name: "unknown instanceType, no explicit limits: unresolved", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + InstanceType: "datumcloud/unknown-type-99", + }, + }, + }, + }, + wantCPU: 0, + wantMem: 0, + wantResolved: false, + }, + { + // Empty instanceType and no explicit sizing: unresolved. + name: "empty instanceType, nothing explicit: unresolved", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{}, + }, + }, + }, + wantCPU: 0, + wantMem: 0, + wantResolved: false, + }, + { + // Sandbox containers without any Limits fall through to the catalog + // when an instanceType is set — partial container specs must not block + // catalog resolution. + name: "sandbox containers without limits fall through to catalog", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + InstanceType: instanceTypeD1Standard2, + }, + Sandbox: &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: testContainerName, + Image: testContainerImage, + // No Resources.Limits set — common for UKC workloads.
+ }, + }, + }, + }, + }, + }, + wantCPU: d1CPUMillicores, + wantMem: d1MemMiB, + wantResolved: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cpu, mem, resolved := resolveInstanceResources(tt.instance) + assert.Equal(t, tt.wantResolved, resolved, "resolved mismatch") + assert.Equal(t, tt.wantCPU, cpu, "cpuMillicores mismatch") + assert.Equal(t, tt.wantMem, mem, "memMiB mismatch") + }) + } +} + +// TestReconcileQuotaClaim_RequestsIncludeVCPUsAndMemory confirms that when an +// instance is sized by instanceType alone (the typical production shape), the +// ResourceClaim created by reconcileQuotaClaim includes vcpus and memory +// requests in addition to the instance count, so the AllowanceBuckets are fed. +func TestReconcileQuotaClaim_RequestsIncludeVCPUsAndMemory(t *testing.T) { + const ( + clusterName = "test-project" + namespace = "default" + instanceName = "claim-resources-test" + ) + + claimName := instanceQuotaClaimNamePrefix + instanceName + + s := newTestScheme(t) + + // Instance sized by instanceType only — no container limits, no explicit + // instance-level requests. This is the common production workload shape. + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: "owner-deployment", + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + // No Requests, no container Limits — catalog must supply the values.
+ InstanceType: instanceTypeD1Standard2, + }, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "owner-deployment", + Namespace: namespace, + UID: testUIDString, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}). + Build() + + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + + r := &InstanceReconciler{ + mgr: &fakeMCManager{clusters: map[string]cluster.Cluster{clusterName: newFakeCluster(projectClient)}}, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + recorder: &record.FakeRecorder{}, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, + ClusterName: clusterName, + }) + require.NoError(t, err) + + // Verify the created ResourceClaim carries vcpus and memory requests.
+ var createdClaim quotav1alpha1.ResourceClaim + require.NoError(t, quotaClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: claimName}, &createdClaim)) + + byType := make(map[string]int64, len(createdClaim.Spec.Requests)) + for _, req := range createdClaim.Spec.Requests { + byType[req.ResourceType] = req.Amount + } + + assert.Equal(t, int64(1), byType[quotaResourceTypeInstances], "instance count must be 1") + assert.Equal(t, int64(1000), byType["compute.datumapis.com/vcpus"], + "d1-standard-2 must claim 1000 millicores (1 vCPU)") + assert.Equal(t, int64(2048), byType["compute.datumapis.com/memory"], + "d1-standard-2 must claim 2048 MiB (2 GiB)") +} diff --git a/internal/controller/instance_projector.go b/internal/controller/instance_projector.go new file mode 100644 index 00000000..4ac3e508 --- /dev/null +++ b/internal/controller/instance_projector.go @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// InstanceProjector watches Instance objects written back to the upstream +// Karmada/management control plane by POP-cell InstanceReconcilers and creates +// read-only projections in the corresponding project namespace within each +// project cluster. +// +// Namespace resolution: an upstream Instance lives in namespace +// `ns-<project-namespace-UID>`.
The UID portion is matched against the UID of +// namespaces in the project cluster to find the target namespace. +// NOTE(review): Reconcile resolves the target namespace from +// UpstreamOwnerNamespaceLabel and never parses this name — confirm and update. +// +// Ownership: each projected Instance is owned by the project WorkloadDeployment +// so that it is garbage-collected via cascading deletion when the deployment is +// removed from the project cluster. +// +// The controller is registered with a standard manager.Manager pointed at the +// upstream Karmada control plane — NOT the multicluster-runtime manager — so +// informer watches are scoped to the upstream control plane. +type InstanceProjector struct { + // FederationClient reads Instance objects from the Karmada federation control + // plane (configured via --federation-kubeconfig). Must be set before + // SetupWithManager is called. + FederationClient client.Client + + // MCManager provides access to project cluster clients via GetCluster. + MCManager mcmanager.Manager +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch + +// Reconcile projects one federation-plane Instance into its project cluster. It +// resolves the project cluster, target namespace and owning WorkloadDeployment +// from the Instance's labels, upserts the projection with an owner reference to +// that WorkloadDeployment, then copies the Instance status onto the projection. +// A missing label or a missing WorkloadDeployment is returned as an error. +func (r *InstanceProjector) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx).WithValues("instance", req.NamespacedName) + + var downstreamInstance computev1alpha.Instance + if err := r.FederationClient.Get(ctx, req.NamespacedName, &downstreamInstance); err != nil { + if apierrors.IsNotFound(err) { + // Instance was deleted from the upstream control plane. Projections + // are owned by the project WorkloadDeployment, so cascading deletion + // handles cleanup. + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("failed getting upstream instance: %w", err) + } + + // Federation-plane Instances exist exclusively as write-back copies, and + // the InstanceReconciler stamps both upstream-owner labels atomically when + // it writes the copy — "not ours" cannot occur.
A missing cluster label is + // a stamping-invariant violation that never self-heals, so surface it as an + // error for backoff and visibility rather than silently dropping the + // projection. + encodedClusterName := downstreamInstance.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if encodedClusterName == "" { + return ctrl.Result{}, fmt.Errorf("downstream instance %s/%s is missing the %s label; cannot resolve the project cluster", + downstreamInstance.Namespace, downstreamInstance.Name, + downstreamclient.UpstreamOwnerClusterNameLabel) + } + + // The encoded form is "cluster-<cluster-name>" with "/" replaced by "_". + clusterName := DecodeClusterName(encodedClusterName) + + projectCluster, err := r.MCManager.GetCluster(ctx, multicluster.ClusterName(clusterName)) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed getting project cluster %q: %w", clusterName, err) + } + projectClient := projectCluster.GetClient() + + // The InstanceReconciler stamps UpstreamOwnerNamespaceLabel with the project + // namespace name (read from the upstream Karmada namespace label set by the federator), + // so we can resolve the target namespace directly without scanning. Both + // upstream-owner labels are stamped together with non-empty values, so a + // cluster label without a namespace label is an invariant violation that + // never self-heals — surface it as an error for backoff and visibility + // rather than requeueing at a flat rate. + targetNamespace := downstreamInstance.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if targetNamespace == "" { + return ctrl.Result{}, fmt.Errorf("downstream instance %s/%s carries %s but is missing the %s label; cannot resolve the project namespace", + downstreamInstance.Namespace, downstreamInstance.Name, + downstreamclient.UpstreamOwnerClusterNameLabel, downstreamclient.UpstreamOwnerNamespaceLabel) + } + + // Resolve the owning WorkloadDeployment by NAME in the project cluster.
+ // Core invariant: the ownerReference MUST be built from a project-cluster + // object obtained via projectClient.Get — never from any edge/Karmada + // identity. The WD name is stable across all planes (project cluster, + // Karmada, edge) and is the correct cross-plane identifier, carried by + // WorkloadDeploymentNameLabel (stamped by the edge stateful control + // strategy). + wdName := downstreamInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel] + if wdName == "" { + // A write-back copy that cannot identify its WorkloadDeployment violates + // the same stamping invariant as the labels above — surface it as an + // error for backoff and visibility instead of silently dropping the + // projection. + return ctrl.Result{}, fmt.Errorf("downstream instance %s/%s is missing the %s label; cannot resolve its WorkloadDeployment", + downstreamInstance.Namespace, downstreamInstance.Name, computev1alpha.WorkloadDeploymentNameLabel) + } + + // Fetch the project-cluster WD directly by name. The returned object carries + // the project-cluster metadata.uid — the only UID that GC in the project + // cluster can act on. + var ownerWD computev1alpha.WorkloadDeployment + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: targetNamespace, Name: wdName}, &ownerWD); err != nil { + if apierrors.IsNotFound(err) { + // Never create an ownerless projection. The controller only watches + // Instances, so no event fires when the WD appears — returning an + // error retries with backoff and surfaces the wait in error metrics. + // A transient ordering race (Instance projected before + // WorkloadReconciler created the project WD) resolves on retry; a + // deleted WD ends the retries once its write-back copies are gone.
+ return ctrl.Result{}, fmt.Errorf("workload deployment %q not found in project cluster %q for instance %s/%s", + wdName, clusterName, downstreamInstance.Namespace, downstreamInstance.Name) + } + return ctrl.Result{}, fmt.Errorf("failed getting WorkloadDeployment %s/%s in project cluster %s: %w", + targetNamespace, wdName, clusterName, err) + } + + projection := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: downstreamInstance.Name, + Namespace: targetNamespace, + }, + } + + operationResult, err := controllerutil.CreateOrUpdate(ctx, projectClient, projection, func() error { + // Propagate upstream tracking labels so consumers can filter by origin. + if projection.Labels == nil { + projection.Labels = make(map[string]string) + } + for k, v := range downstreamInstance.Labels { + projection.Labels[k] = v + } + + projection.Spec = downstreamInstance.Spec + + // Attach an owner reference using the live project-cluster WD object. + // controllerutil.SetOwnerReference reads UID and GVK from ownerWD, which + // was fetched from projectClient — satisfying the core invariant. + return controllerutil.SetOwnerReference(&ownerWD, projection, projectCluster.GetScheme()) + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed upserting Instance projection in %s/%s: %w", clusterName, targetNamespace, err) + } + + logger.Info("reconciled Instance projection", "operation", operationResult, "namespace", targetNamespace, "cluster", clusterName) + + // Sync status with a dedicated call — status is a separate subresource.
projection.Status = downstreamInstance.Status + if err := projectClient.Status().Update(ctx, projection); err != nil && !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed updating Instance projection status: %w", err) + } + + return ctrl.Result{}, nil +} + +// SetupWithManager registers the InstanceProjector with upstreamMgr, a standard +// manager.Manager configured against the upstream Karmada/federation control plane +// REST config. FederationClient and MCManager must be set before calling this method. +func (r *InstanceProjector) SetupWithManager(upstreamMgr manager.Manager) error { + return ctrl.NewControllerManagedBy(upstreamMgr). + For(&computev1alpha.Instance{}). + Named("instance-projector"). + Complete(r) +} diff --git a/internal/controller/instance_projector_test.go b/internal/controller/instance_projector_test.go new file mode 100644 index 00000000..ad9c374f --- /dev/null +++ b/internal/controller/instance_projector_test.go @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "maps" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── Test constants ─────────────────────────────────────────────────────────── + +const ( + // projTestCluster is the project cluster name used in projector tests. + projTestCluster = "project-cluster" + + // projTestProjNS is the project namespace name. + projTestProjNS = "proj-namespace" + + // projTestProjNSUID is the project namespace UID embedded in the Karmada + // namespace name below.
+ projTestProjNSUID = types.UID("deadbeef-1111-2222-3333-444455556666") + + // projTestKarmadaNS is the Karmada namespace derived from the UID above + // via the ns-<UID> convention. + projTestKarmadaNS = "ns-deadbeef-1111-2222-3333-444455556666" + + // projTestInstanceName is the name of the Karmada (and projected) Instance. + // Follows the "<WorkloadDeployment name>-<index>" convention: "my-wd-0". + projTestInstanceName = "my-wd-0" + + // projTestWDUID is the UID of the owning WorkloadDeployment as it exists in + // the PROJECT cluster. This is the UID that owner references must use, since + // Kubernetes GC in the project cluster only knows this UID. + projTestWDUID = types.UID("project-wd-uid-9999-aaaa-bbbb-cccc") + + // projTestEdgeWDUID is the UID of the WorkloadDeployment as it exists on the + // EDGE/Karmada plane. Each plane mints its own UID, so this is intentionally + // distinct from projTestWDUID. The WorkloadDeploymentUIDLabel on downstream + // Instances carries this edge UID — NOT the project UID. + projTestEdgeWDUID = types.UID("edge-uid-0000-1111-2222-3333") + + // projTestWDName is the name of the owning WorkloadDeployment. The name is + // the same across all planes (project cluster, Karmada, edge) and is the + // correct cross-plane stable identifier. + projTestWDName = "my-wd" + + // projTestWorkloadUID is the UID of the owning Workload (carried via WorkloadUIDLabel). + projTestWorkloadUID = "wl-uid-1111-2222-3333-4444" + + // projTestInstanceIndex is the ordinal index of the instance (carried via InstanceIndexLabel). + projTestInstanceIndex = "0" +) + +// encodedCluster returns the value of the UpstreamOwnerClusterNameLabel for +// projTestCluster ("cluster-" followed by the cluster name). +func encodedCluster() string { + return "cluster-" + projTestCluster +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +// projTestProjectNS builds the project cluster Namespace with the stable test UID.
+func projTestProjectNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestProjNS, + UID: projTestProjNSUID, + }, + } +} + +// projTestWorkloadDeployment builds the project WorkloadDeployment that owns +// projected Instances. +func projTestWorkloadDeployment() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestWDName, + Namespace: projTestProjNS, + UID: projTestWDUID, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: "LAX", + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: "my-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } +} + +// projTestKarmadaInstance builds a Karmada Instance with the default labels +// needed for the InstanceProjector to act on it. Optional label overrides are +// applied last. +func projTestKarmadaInstance(labelOverrides map[string]string) *computev1alpha.Instance { + labels := map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster(), + downstreamclient.UpstreamOwnerNamespaceLabel: projTestProjNS, + // WorkloadDeploymentUIDLabel carries the EDGE UID — intentionally distinct + // from projTestWDUID (the project-cluster WD UID). Owner references must + // never be built from this value. + computev1alpha.WorkloadDeploymentUIDLabel: string(projTestEdgeWDUID), + computev1alpha.WorkloadDeploymentNameLabel: projTestWDName, + computev1alpha.WorkloadUIDLabel: projTestWorkloadUID, + computev1alpha.InstanceIndexLabel: projTestInstanceIndex, + } + maps.Copy(labels, labelOverrides) + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + Labels: labels, + }, + Spec: computev1alpha.InstanceSpec{ + // Minimal valid spec — actual content is copied to the projection.
+ }, + } +} + +// newTestProjector wires an InstanceProjector with the given downstream client and +// a project cluster that serves the supplied project client. +func newTestProjector(karmadaClient client.Client, projectClient client.Client) *InstanceProjector { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(projTestCluster, projectCluster) + return &InstanceProjector{ + FederationClient: karmadaClient, + MCManager: mgr, + } +} + +// projectorRequest builds a ctrl.Request for the test Instance in Karmada. +func projectorRequest() ctrl.Request { + return ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + }, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestInstanceProjector_Reconcile is the primary table-driven test. +func TestInstanceProjector_Reconcile(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + + // karmadaInstance is what exists in the Karmada API server. + // A nil value means the Instance does not exist (not-found path). + karmadaInstance *computev1alpha.Instance + + // projectObjs are pre-populated in the project cluster fake client. + projectObjs []client.Object + + // wantProjection controls whether a projected Instance should appear. + wantProjection bool + + // wantOwnerRef controls whether the projected Instance should have an + // owner reference pointing to the project WorkloadDeployment. + wantOwnerRef bool + + // wantErr controls whether the reconcile should return an error.
+ wantErr bool + }{ + { + name: "happy path — instance projected with owner reference", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // Cross-plane UID regression test: the Karmada Instance carries the EDGE + // WD UID in WorkloadDeploymentUIDLabel (projTestEdgeWDUID), which is + // intentionally different from the project-cluster WD UID (projTestWDUID). + // The owner reference on the projection must use the project-cluster UID. + // This test fails if someone reintroduces UID-based matching against the + // edge/Karmada plane. + name: "WD name label present, edge UID differs from project UID — owner ref UID equals project WD UID", + karmadaInstance: projTestKarmadaInstance(nil), // carries projTestEdgeWDUID, not projTestWDUID + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), // UID is projTestWDUID + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // When the project WD does not yet exist (transient ordering race — + // Instance projected before WorkloadReconciler created the project WD) + // the projector must return an error and NOT create an ownerless + // projection: its only watch is the Instance, so nothing fires when + // the WD appears — error backoff is the retry mechanism. + name: "project WD not found — error, no ownerless projection created", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + // No WorkloadDeployment — simulates the transient ordering race. + }, + wantProjection: false, + wantErr: true, + }, + { + // A write-back copy that cannot identify its WorkloadDeployment + // violates the stamping invariant — the projector must return an + // error rather than silently drop the projection. 
+ name: "WD name label absent — error, no projection", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + computev1alpha.WorkloadDeploymentNameLabel: "", + }), + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantErr: true, + }, + { + // Federation-plane Instances are exclusively write-back copies and the + // write-back stamps both upstream-owner labels atomically, so a missing + // cluster label is a stamping-invariant violation, not a foreign object. + name: "missing upstream-cluster-name label — error", + karmadaInstance: &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + // Intentionally no UpstreamOwnerClusterNameLabel. + Labels: map[string]string{ + "some-other-label": "value", + }, + }, + }, + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantErr: true, + }, + { + // The write-back stamps both upstream-owner labels together, so a + // cluster label without a namespace label is an invariant violation + // that never self-heals — the projector must return an error rather + // than requeue at a flat rate. + name: "missing upstream-namespace label — error", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Override: remove the upstream namespace label. + downstreamclient.UpstreamOwnerNamespaceLabel: "", + }), + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantErr: true, + }, + { + name: "karmada instance not found — no-op", + karmadaInstance: nil, // causes Get to return NotFound + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + { + // Verify that all linking labels (WorkloadUID, WorkloadDeploymentUID, + // WorkloadDeploymentNameLabel, InstanceIndex) survive from the Karmada + // write-back object through to the projection. 
+ name: "all linking labels propagated from Karmada to projection", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + var karmadaObjs []client.Object + if tt.karmadaInstance != nil { + karmadaObjs = append(karmadaObjs, tt.karmadaInstance) + } + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(tt.projectObjs...). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newTestProjector(karmadaClient, projectClient) + + req := projectorRequest() + result, err := r.Reconcile(context.Background(), req) + + if tt.wantErr { + require.Error(t, err) + assert.Zero(t, result.RequeueAfter, + "errors rely on controller backoff, not a flat requeue") + // No error path may leave a projection behind — in particular, + // an ownerless projection must never be created. + var projection computev1alpha.Instance + getErr := projectClient.Get(context.Background(), types.NamespacedName{ + Name: req.Name, + Namespace: projTestProjNS, + }, &projection) + assert.True(t, isNotFound(getErr), + "expected no projection in project namespace on error, but found one (or unexpected error: %v)", getErr) + return + } + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + ctx := context.Background() + + // Check whether a projected Instance exists in the project namespace. 
+ var projection computev1alpha.Instance + err = projectClient.Get(ctx, types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestProjNS, + }, &projection) + + if !tt.wantProjection { + assert.True(t, isNotFound(err), + "expected no projection in project namespace, but found one (or unexpected error: %v)", err) + return + } + + require.NoError(t, err, "expected projection to exist in project namespace") + + // Labels should be copied from the Karmada instance. + if tt.karmadaInstance != nil { + for k, v := range tt.karmadaInstance.Labels { + assert.Equal(t, v, projection.Labels[k], + "projection label %q should match Karmada instance label", k) + } + } + + // Linking labels must survive from the Karmada instance to the projection + // so that the CLI can resolve Workload name, city, and instance ordinal. + if tt.wantProjection && tt.karmadaInstance != nil { + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadUIDLabel], + projection.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + projection.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel], + projection.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.InstanceIndexLabel], + projection.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel must be propagated to the projection") + } + + if tt.wantOwnerRef { + require.NotEmpty(t, projection.OwnerReferences, + "projected instance should have an owner reference to the WorkloadDeployment") + ownerRef := projection.OwnerReferences[0] + // Core invariant: owner ref UID must be the PROJECT-cluster WD 
UID. + assert.Equal(t, string(projTestWDUID), string(ownerRef.UID), + "owner reference UID must match the project-cluster WorkloadDeployment UID") + // Regression guard: the edge UID must NOT appear in the owner ref. + // If this assertion fails, someone reintroduced cross-plane UID matching. + assert.NotEqual(t, string(projTestEdgeWDUID), string(ownerRef.UID), + "owner reference UID must NOT be the edge/Karmada WD UID") + assert.Equal(t, projTestWDName, ownerRef.Name, + "owner reference name should match the WorkloadDeployment name") + } else { + assert.Empty(t, projection.OwnerReferences, + "projected instance should have no owner reference") + } + }) + } +} + +// TestInstanceProjector_SpecCopied verifies that the Instance spec is correctly +// propagated from the Karmada instance to the projection. +func TestInstanceProjector_SpecCopied(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + // Set a recognizable spec field we can assert against. + karmadaInst.Spec.Controller = &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{{Name: "test-gate"}}, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(projTestProjectNS(), projTestWorkloadDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + _, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) + + require.NotNil(t, projection.Spec.Controller) + require.Len(t, projection.Spec.Controller.SchedulingGates, 1) + assert.Equal(t, "test-gate", projection.Spec.Controller.SchedulingGates[0].Name) +} + +// TestInstanceProjector_NamespaceResolution verifies that the projector resolves +// the target project namespace directly from the UpstreamOwnerNamespaceLabel on +// the Karmada Instance, landing the projection in the correct namespace. +func TestInstanceProjector_NamespaceResolution(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects( + projTestProjectNS(), + projTestWorkloadDeployment(), + ). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + result, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // Projection must land in the namespace named by the label. + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) +} + +// isNotFound returns true only when err is a Kubernetes not-found error; a nil +// error means the object exists and returns false. +// Used to distinguish "no projection created" from "projection exists but Get failed". 
+func isNotFound(err error) bool { + if err == nil { + return false // object exists — not the "not found" case + } + return client.IgnoreNotFound(err) == nil +} diff --git a/internal/controller/instance_writeback_test.go b/internal/controller/instance_writeback_test.go new file mode 100644 index 00000000..5c5020cf --- /dev/null +++ b/internal/controller/instance_writeback_test.go @@ -0,0 +1,598 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── write-back test constants ──────────────────────────────────────────────── + +const ( + wbTestClusterName = "edge-cluster" + wbTestNamespace = "ns-proj-uid-1234" + wbTestInstanceName = "inst-0" + wbTestWorkloadUID = "wl-uid-aaaa-bbbb" + wbTestWDUID = "wd-uid-cccc-dddd" + wbTestInstanceIndex = "0" + wbTestUpstreamNS = "proj-namespace" + wbTestEncodedCluster = "cluster-" + wbTestClusterName + + // The four self-describing labels. + wbTestWDName = "my-workload-deployment" + wbTestCityCode = "DFW" + wbTestWorkloadName = "my-workload" + wbTestPlacement = "us-central" +) + +// wbTestCellInstance builds a cell-side Instance with all seven owned labels +// pre-populated, as addInstanceControllerLabels would produce. 
+func wbTestCellInstance() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + computev1alpha.WorkloadDeploymentNameLabel: wbTestWDName, + computev1alpha.CityCodeLabel: wbTestCityCode, + computev1alpha.WorkloadNameLabel: wbTestWorkloadName, + computev1alpha.PlacementNameLabel: wbTestPlacement, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceReady, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceReadyReasonAvailable, + Message: "Instance is ready", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } +} + +// wbTestDownstreamNS returns a Namespace object in the downstream (Karmada) +// control plane that carries the upstream routing labels, simulating the +// namespace stamped by NSO's MappedNamespaceResourceStrategy. +func wbTestDownstreamNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + }, + }, + } +} + +// newWriteBackReconciler wires an InstanceReconciler whose FederationClient is set +// to federationClient; every other field is left at its zero value.
+func newWriteBackReconciler(federationClient client.Client) *InstanceReconciler { + return &InstanceReconciler{ + FederationClient: federationClient, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestWriteBackToUpstream_CreatePath_AllLabels verifies that the first +// write-back to an empty Karmada control plane creates an Instance with all five +// expected labels (two routing + three linking) and also writes the cell-side +// status via Status().Update. +func TestWriteBackToUpstream_CreatePath_AllLabels(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.NoError(t, err) + + // Verify the created Karmada Instance carries all five expected labels. 
+ var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, wbTestEncodedCluster, created.Labels[downstreamclient.UpstreamOwnerClusterNameLabel], + "UpstreamOwnerClusterNameLabel must be set") + assert.Equal(t, wbTestUpstreamNS, created.Labels[downstreamclient.UpstreamOwnerNamespaceLabel], + "UpstreamOwnerNamespaceLabel must be set") + assert.Equal(t, wbTestWorkloadUID, created.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel must be propagated from cell instance") + assert.Equal(t, wbTestWDUID, created.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel must be propagated from cell instance") + assert.Equal(t, wbTestInstanceIndex, created.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel must be propagated from cell instance") + + // Status must have been written via Status().Update after Create. + require.Len(t, created.Status.Conditions, 1, + "Status().Update must be called after Create; condition should be present") + assert.Equal(t, computev1alpha.InstanceReady, created.Status.Conditions[0].Type) + assert.Equal(t, metav1.ConditionTrue, created.Status.Conditions[0].Status) +} + +// TestWriteBackToUpstream_UpdatePath_LabelMerge verifies that an +// existing Karmada Instance with a Karmada-managed label retains that label +// after the update path runs, while all five owned labels are written correctly. +func TestWriteBackToUpstream_UpdatePath_LabelMerge(t *testing.T) { + t.Parallel() + + karmadaManagedLabel := "karmada.io/managed" + + // Pre-populate the Karmada control plane with a pre-existing Instance + // carrying only the two upstream routing labels plus a simulated Karmada-managed label.
+ existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + karmadaManagedLabel: "true", + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + // All five owned labels must be present with correct values. + assert.Equal(t, wbTestEncodedCluster, updated.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]) + assert.Equal(t, wbTestUpstreamNS, updated.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]) + assert.Equal(t, wbTestWorkloadUID, updated.Labels[computev1alpha.WorkloadUIDLabel]) + assert.Equal(t, wbTestWDUID, updated.Labels[computev1alpha.WorkloadDeploymentUIDLabel]) + assert.Equal(t, wbTestInstanceIndex, updated.Labels[computev1alpha.InstanceIndexLabel]) + + // The Karmada-managed label must survive the merge (not be replaced/deleted). 
+ assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + "Karmada-managed label must be preserved after merge; should not be overwritten") +} + +// TestWriteBackToUpstream_LabelChangeTriggerUpdate verifies that +// a changed linking label on the cell instance causes the Karmada object to +// be updated with the new value. +func TestWriteBackToUpstream_LabelChangeTriggerUpdate(t *testing.T) { + t.Parallel() + + newWorkloadUID := "wl-uid-CHANGED" + + // Pre-populate with the five-label map from a previous write-back. + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Modify the WorkloadUIDLabel on the cell instance. 
+ cellInstance := wbTestCellInstance() + cellInstance.Labels[computev1alpha.WorkloadUIDLabel] = newWorkloadUID + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + assert.Equal(t, newWorkloadUID, updated.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel change on the cell instance must be reflected in the Karmada object") +} + +// TestWriteBackToUpstream_MissingLinkingLabels_Error verifies that +// writeBackToUpstream refuses to create an upstream copy when the cell-side +// Instance lacks the linking labels (e.g. before the stateful control +// strategy's backfill has converged it). The error must name every missing +// label so the wait is diagnosable, and no upstream object may be created — +// an Instance with empty identity labels could never be linked back to its +// owners. +func TestWriteBackToUpstream_MissingLinkingLabels_Error(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Instance with nil Labels — simulates an early reconcile before the + // linking labels are stamped. 
+ cellInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.Error(t, err) + for _, key := range []string{ + computev1alpha.WorkloadUIDLabel, + computev1alpha.WorkloadDeploymentUIDLabel, + computev1alpha.InstanceIndexLabel, + computev1alpha.WorkloadDeploymentNameLabel, + computev1alpha.CityCodeLabel, + computev1alpha.WorkloadNameLabel, + computev1alpha.PlacementNameLabel, + } { + assert.Contains(t, err.Error(), key, + "error must name missing label %q", key) + } + + // No upstream Instance may be created with empty identity labels. + var created computev1alpha.Instance + getErr := upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created) + assert.True(t, apierrors.IsNotFound(getErr), + "no upstream write-back copy may be created when linking labels are missing (got err: %v)", getErr) +} + +// TestWriteBackToUpstream_MissingLinkingLabels_NoUpdate verifies that an +// existing upstream copy is left untouched when the cell-side Instance has +// lost its linking labels: the write-back must error out before the update +// path can overwrite the previously written identity with empty values. 
+func TestWriteBackToUpstream_MissingLinkingLabels_NoUpdate(t *testing.T) { + t.Parallel() + + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Cell instance lost its two UID linking labels; the index and self-describing labels remain. + cellInstance := wbTestCellInstance() + delete(cellInstance.Labels, computev1alpha.WorkloadUIDLabel) + delete(cellInstance.Labels, computev1alpha.WorkloadDeploymentUIDLabel) + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.Error(t, err) + assert.Contains(t, err.Error(), computev1alpha.WorkloadUIDLabel) + assert.Contains(t, err.Error(), computev1alpha.WorkloadDeploymentUIDLabel) + assert.NotContains(t, err.Error(), computev1alpha.InstanceIndexLabel, + "a present label must not be reported missing") + + // The existing upstream copy must keep its previously written identity.
+ var existing computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &existing)) + assert.Equal(t, wbTestWorkloadUID, existing.Labels[computev1alpha.WorkloadUIDLabel], + "existing WorkloadUIDLabel must not be overwritten with an empty value") + assert.Equal(t, wbTestWDUID, existing.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "existing WorkloadDeploymentUIDLabel must not be overwritten with an empty value") +} + +// TestWriteBackToUpstream_MissingSelfDescribingLabel_Error verifies that the +// self-describing labels are required, not best-effort: a cell Instance +// missing only WorkloadDeploymentNameLabel must fail write-back with an error +// naming exactly that label, and no upstream copy may be created. +func TestWriteBackToUpstream_MissingSelfDescribingLabel_Error(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + r := newWriteBackReconciler(upstreamClient) + + cellInstance := wbTestCellInstance() + delete(cellInstance.Labels, computev1alpha.WorkloadDeploymentNameLabel) + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.Error(t, err) + assert.Contains(t, err.Error(), computev1alpha.WorkloadDeploymentNameLabel, + "error must name the missing label") + assert.NotContains(t, err.Error(), computev1alpha.CityCodeLabel, + "a present label must not be reported missing") + + var created computev1alpha.Instance + getErr := upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created) + assert.True(t, apierrors.IsNotFound(getErr), + "no upstream write-back copy may be created when a required label is missing (got err: %v)", getErr) +} + +// TestWriteBackToUpstream_NamespaceIdentity_Errors verifies that the +// federation-plane namespace is the strict source of upstream identity: +// a missing namespace or a namespace lacking either upstream-owner label must +// fail the write-back with an error naming the namespace (and label), and no +// upstream copy may be created — there are no fallback identity values. +func TestWriteBackToUpstream_NamespaceIdentity_Errors(t *testing.T) { + t.Parallel() + + nsWithoutLabel := func(missing string) *corev1.Namespace { + ns := wbTestDownstreamNS() + delete(ns.Labels, missing) + return ns + } + + tests := []struct { + name string + // ns is the federation-plane namespace; nil means it does not exist. + ns *corev1.Namespace + // wantInError must all appear in the returned error. 
+ wantInError []string + }{ + { + name: "namespace missing — error, no copy", + ns: nil, + wantInError: []string{wbTestNamespace}, + }, + { + name: "namespace lacks upstream-namespace label — error names namespace and label", + ns: nsWithoutLabel(downstreamclient.UpstreamOwnerNamespaceLabel), + wantInError: []string{wbTestNamespace, downstreamclient.UpstreamOwnerNamespaceLabel}, + }, + { + name: "namespace lacks upstream-cluster-name label — error names namespace and label", + ns: nsWithoutLabel(downstreamclient.UpstreamOwnerClusterNameLabel), + wantInError: []string{wbTestNamespace, downstreamclient.UpstreamOwnerClusterNameLabel}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + builder := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithStatusSubresource(&computev1alpha.Instance{}) + if tt.ns != nil { + builder = builder.WithObjects(tt.ns) + } + upstreamClient := builder.Build() + + r := newWriteBackReconciler(upstreamClient) + + err := r.writeBackToUpstream(context.Background(), wbTestCellInstance()) + require.Error(t, err) + for _, want := range tt.wantInError { + assert.Contains(t, err.Error(), want) + } + + var created computev1alpha.Instance + getErr := upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created) + assert.True(t, apierrors.IsNotFound(getErr), + "no upstream write-back copy may be created when upstream identity is unresolvable (got err: %v)", getErr) + }) + } +} + +// TestWriteBackToUpstream_NamespaceGetFailure_Error verifies that a transient +// failure reading the federation-plane namespace aborts the write-back instead +// of proceeding with derived identity values. +func TestWriteBackToUpstream_NamespaceGetFailure_Error(t *testing.T) { + t.Parallel() + + getFailure := errors.New("federation API unavailable") + upstreamClient := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). 
+ WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + WithInterceptorFuncs(interceptor.Funcs{ + Get: func(ctx context.Context, cl client.WithWatch, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { + if _, ok := obj.(*corev1.Namespace); ok { + return getFailure + } + return cl.Get(ctx, key, obj, opts...) + }, + }). + Build() + + r := newWriteBackReconciler(upstreamClient) + + err := r.writeBackToUpstream(context.Background(), wbTestCellInstance()) + require.ErrorIs(t, err, getFailure) + + var created computev1alpha.Instance + getErr := upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created) + assert.True(t, apierrors.IsNotFound(getErr), + "no upstream write-back copy may be created when the namespace read fails (got err: %v)", getErr) +} + +// TestWriteBackToUpstream_FourNewLabels_CreatePath verifies that the four +// self-describing labels (WorkloadDeploymentName, CityCode, WorkloadName, +// PlacementName) are written to the Karmada object on the create path. +func TestWriteBackToUpstream_FourNewLabels_CreatePath(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.NoError(t, err) + + var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, wbTestWDName, created.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must propagate to Karmada object") + assert.Equal(t, wbTestCityCode, created.Labels[computev1alpha.CityCodeLabel], + "CityCodeLabel must propagate to Karmada object") + assert.Equal(t, wbTestWorkloadName, created.Labels[computev1alpha.WorkloadNameLabel], + "WorkloadNameLabel must propagate to Karmada object") + assert.Equal(t, wbTestPlacement, created.Labels[computev1alpha.PlacementNameLabel], + "PlacementNameLabel must propagate to Karmada object") +} + +// TestWriteBackToUpstream_FourNewLabels_UpdatePath verifies that the four +// self-describing labels are written on the update path and existing Karmada- +// managed labels on the downstream object are preserved. +func TestWriteBackToUpstream_FourNewLabels_UpdatePath(t *testing.T) { + t.Parallel() + + karmadaManagedLabel := "karmada.io/managed" + + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + karmadaManagedLabel: "true", + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). 
+ WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + assert.Equal(t, wbTestWDName, updated.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be set on update path") + assert.Equal(t, wbTestCityCode, updated.Labels[computev1alpha.CityCodeLabel], + "CityCodeLabel must be set on update path") + assert.Equal(t, wbTestWorkloadName, updated.Labels[computev1alpha.WorkloadNameLabel], + "WorkloadNameLabel must be set on update path") + assert.Equal(t, wbTestPlacement, updated.Labels[computev1alpha.PlacementNameLabel], + "PlacementNameLabel must be set on update path") + + // Karmada-managed label must survive the merge. 
+ assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + "Karmada-managed label must be preserved after the update merge") +} diff --git a/internal/controller/instancecontrol/instancecontrol.go b/internal/controller/instancecontrol/instancecontrol.go index 6de9df99..d2c83692 100644 --- a/internal/controller/instancecontrol/instancecontrol.go +++ b/internal/controller/instancecontrol/instancecontrol.go @@ -26,10 +26,11 @@ type Strategy interface { type ActionType string const ( - ActionTypeCreate ActionType = "Create" - ActionTypeUpdate ActionType = "Update" - ActionTypeDelete ActionType = "Delete" - ActionTypeWait ActionType = "Wait" + ActionTypeCreate ActionType = "Create" + ActionTypeUpdate ActionType = "Update" + ActionTypeDelete ActionType = "Delete" + ActionTypeWait ActionType = "Wait" + ActionTypePatchLabels ActionType = "PatchLabels" ) type Action struct { @@ -104,3 +105,22 @@ func NewWaitAction(object client.Object) Action { fn: func(ctx context.Context, c client.Client) error { return nil }, } } + +// NewPatchLabelsAction returns an action intended to apply a metadata-only labels +// patch to the given object. It uses a MergeFrom patch, so only the fields that +// differ between base and updated are sent to the API server: with a labels-only +// change the spec, template, and template-hash are never touched. Kept separate +// from ActionTypeUpdate so label backfill never joins the ordered rolling-update flow.
+func NewPatchLabelsAction(updated client.Object, base client.Object) Action { + patch := client.MergeFrom(base) + return Action{ + Object: updated, + actionType: ActionTypePatchLabels, + fn: func(ctx context.Context, c client.Client) error { + if err := c.Patch(ctx, updated, patch); err != nil { + return fmt.Errorf("failed to patch labels on %T %s: %w", updated, updated.GetName(), err) + } + return nil + }, + } +} diff --git a/internal/controller/instancecontrol/stateful/stateful_control.go b/internal/controller/instancecontrol/stateful/stateful_control.go index 566a652c..34e5966e 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control.go +++ b/internal/controller/instancecontrol/stateful/stateful_control.go @@ -15,13 +15,30 @@ import ( "go.datum.net/compute/internal/controller/instancecontrol" ) +// Options controls optional behaviours of the stateful instance control strategy. +type Options struct { + // NetworkingEnabled controls whether the Network scheduling gate is added to + // newly created Instances. Set to false when the networking integration is + // disabled so that Instances are not blocked waiting for a NetworkBinding. + // New sets this to true; the zero value (false) leaves the Network gate off. + NetworkingEnabled bool +} + // Behavior inspired by https://github.com/kubernetes/kubernetes/tree/master/pkg/controller/statefulset // Does not currently implement exact behavior. type statefulControl struct { + opts Options } +// New returns a stateful instance control strategy with networking enabled. func New() instancecontrol.Strategy { - return &statefulControl{} + return NewWithOptions(Options{NetworkingEnabled: true}) +} + +// NewWithOptions returns a stateful instance control strategy with the given +// options.
+func NewWithOptions(opts Options) instancecontrol.Strategy { + return &statefulControl{opts: opts} } func (c *statefulControl) GetActions( @@ -36,8 +53,10 @@ func (c *statefulControl) GetActions( var createActions []instancecontrol.Action var waitActions []instancecontrol.Action - // highest -> lowest - var updateActions []instancecontrol.Action + // highest -> lowest. Instances whose template hash has drifted from the + // desired template are deleted and recreated (not updated in place) so the + // change actually rolls the backing pod — see the recreate branch below. + var recreateActions []instancecontrol.Action // highest -> lowest var deleteActions []instancecontrol.Action @@ -68,15 +87,25 @@ func (c *statefulControl) GetActions( }, Spec: deployment.Spec.Template.Spec, } + // Set Location best-effort: when Status.Location is nil (no matching + // Location object for the city code) Instance.Spec.Location stays nil and + // instance creation proceeds normally — this must not block scheduling. desiredInstances[i].Spec.Location = deployment.Status.Location // TODO(jreese) consider adding scheduling gates via mutating webhooks - desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ - TemplateHash: instanceTemplateHash, - SchedulingGates: []v1alpha.SchedulingGate{ + gates := []v1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + } + if c.opts.NetworkingEnabled { + // Prepend the Network gate so it is cleared first; quota is + // independent and evaluated in parallel by InstanceReconciler. + gates = append([]v1alpha.SchedulingGate{ {Name: instancecontrol.NetworkSchedulingGate.String()}, - {Name: instancecontrol.QuotaSchedulingGate.String()}, - }, + }, gates...) 
+ } + desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ + TemplateHash: instanceTemplateHash, + SchedulingGates: gates, } addInstanceControllerLabels(desiredInstances[i], getInstanceOrdinal(desiredInstances[i].Name), deployment) @@ -102,22 +131,56 @@ func (c *statefulControl) GetActions( if !apimeta.IsStatusConditionTrue(instance.Status.Conditions, v1alpha.InstanceReady) { waitActions = append(waitActions, instancecontrol.NewWaitAction(instance)) } else if needsUpdate(instance, instanceTemplateHash) { - updatedInstance := instance.DeepCopy() - updatedInstance.Annotations = deployment.Spec.Template.Annotations - updatedInstance.Labels = deployment.Spec.Template.Labels + // The instance's template hash no longer matches the desired + // template — e.g. an image change, or a restart requested via the + // RestartedAtAnnotation, which is part of the template hash. The + // unikraft provider bakes the pod's runtime, rootfs, and file + // mounts at pod-creation time and never reconciles an existing + // pod's spec, so an in-place Instance update would silently fail to + // roll the running workload. Delete the instance instead; the next + // reconcile recreates it from the current template via the create + // path above, and the provider tears down the old pod + // (finalizer-gated) and boots a fresh one. Ordered, one-at-a-time + // pacing is preserved by the descending-ordinal sort, the + // skip-all-but-first logic, and the DeletionTimestamp WaitAction. + recreateActions = append(recreateActions, instancecontrol.NewDeleteAction(instance)) + } + } + } - addInstanceControllerLabels(updatedInstance, getInstanceOrdinal(updatedInstance.Name), deployment) + // Converge controller-managed labels on every existing instance, regardless + // of Ready state or template hash. 
Labels are stamped only at instance + // creation and rollout is recreate-only, so when the label schema evolves — + // a label is added or its value derivation changes — this pass is the only + // mechanism that updates live instances; without it, any instance alive at + // the time of the change would never receive it. The patch is metadata-only + // and is emitted outside the ordered rollout decision so it never gates or + // reorders instance creation/updates. + var patchLabelActions []instancecontrol.Action + for _, instance := range desiredInstances { + if instance.CreationTimestamp.IsZero() || !instance.DeletionTimestamp.IsZero() { + // Skip instances that don't exist yet or are being deleted. + continue + } - updatedInstance.Spec = deployment.Spec.Template.Spec - updateActions = append(updateActions, instancecontrol.NewUpdateAction(updatedInstance)) + desiredLabels := desiredControllerLabels(getInstanceOrdinal(instance.Name), deployment) + if labelsNeedBackfill(instance.Labels, desiredLabels) { + base := instance.DeepCopy() + patched := instance.DeepCopy() + for k, v := range desiredLabels { + if patched.Labels == nil { + patched.Labels = make(map[string]string) + } + patched.Labels[k] = v } + patchLabelActions = append(patchLabelActions, instancecontrol.NewPatchLabelsAction(patched, base)) } } - slices.SortFunc(updateActions, descendingOrdinal) + slices.SortFunc(recreateActions, descendingOrdinal) slices.SortFunc(deleteActions, descendingOrdinal) - actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(updateActions)+len(deleteActions)) + actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(recreateActions)+len(deleteActions)+len(patchLabelActions)) switch deployment.Spec.ScaleSettings.InstanceManagementPolicy { case v1alpha.OrderedReadyInstanceManagementPolicyType: @@ -132,7 +195,7 @@ func (c *statefulControl) GetActions( slices.SortFunc(actions, ascendingOrdinal) - actions = append(actions, 
updateActions...) + actions = append(actions, recreateActions...) actions = append(actions, deleteActions...) // Skip all actions except the first one. @@ -144,6 +207,8 @@ func (c *statefulControl) GetActions( } + actions = append(actions, patchLabelActions...) + return actions, nil } @@ -152,7 +217,34 @@ func addInstanceControllerLabels(instance *v1alpha.Instance, index int, deployme instance.Labels = map[string]string{} } - instance.Labels[v1alpha.InstanceIndexLabel] = strconv.Itoa(index) - instance.Labels[v1alpha.WorkloadUIDLabel] = string(deployment.Spec.WorkloadRef.UID) - instance.Labels[v1alpha.WorkloadDeploymentUIDLabel] = string(deployment.GetUID()) + for k, v := range desiredControllerLabels(index, deployment) { + instance.Labels[k] = v + } +} + +// desiredControllerLabels returns the full set of controller-managed labels +// that every instance should carry. Used both when stamping a new instance +// and when checking whether an existing instance needs a backfill patch. +func desiredControllerLabels(index int, deployment *v1alpha.WorkloadDeployment) map[string]string { + return map[string]string{ + v1alpha.InstanceIndexLabel: strconv.Itoa(index), + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + // Self-describing labels for routing, filtering, and observability. + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } +} + +// labelsNeedBackfill reports whether any of the desired controller-managed +// label key/value pairs are absent or incorrect on the current instance labels. 
+func labelsNeedBackfill(current map[string]string, desired map[string]string) bool { + for k, v := range desired { + if current[k] != v { + return true + } + } + return false } diff --git a/internal/controller/instancecontrol/stateful/stateful_control_test.go b/internal/controller/instancecontrol/stateful/stateful_control_test.go index d45b24b3..d9133efa 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control_test.go +++ b/internal/controller/instancecontrol/stateful/stateful_control_test.go @@ -13,6 +13,8 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/utils/ptr" + networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" + "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/controller/instancecontrol" ) @@ -47,6 +49,11 @@ func TestFreshDeployment(t *testing.T) { assert.True(t, actions[1].IsSkipped()) } +// TestUpdateWithAllReadyInstances verifies that a template change on Ready +// instances rolls them by delete+recreate (not an in-place update), ordered +// highest-ordinal-first with only the first action active. An in-place update +// would never roll the backing pod, since the unikraft provider bakes the pod +// at creation time and ignores spec changes on an existing pod. 
func TestUpdateWithAllReadyInstances(t *testing.T) { ctx := context.Background() control := New() @@ -65,11 +72,11 @@ func TestUpdateWithAllReadyInstances(t *testing.T) { assert.Len(t, actions, 2) assert.Equal(t, "test-deploy-1", actions[0].Object.GetName()) - assert.Equal(t, instancecontrol.ActionTypeUpdate, actions[0].ActionType()) + assert.Equal(t, instancecontrol.ActionTypeDelete, actions[0].ActionType()) assert.False(t, actions[0].IsSkipped()) assert.Equal(t, "test-deploy-0", actions[1].Object.GetName()) - assert.Equal(t, instancecontrol.ActionTypeUpdate, actions[1].ActionType()) + assert.Equal(t, instancecontrol.ActionTypeDelete, actions[1].ActionType()) assert.True(t, actions[1].IsSkipped()) } @@ -150,16 +157,418 @@ func TestScaleDownWithAllReadyInstances(t *testing.T) { assert.False(t, actions[0].IsSkipped()) } +// TestNetworkingEnabledAddsNetworkGate verifies that when networking is enabled +// (the default), newly created Instances receive both the Network and Quota +// scheduling gates so that they are held until the network is provisioned. 
+func TestNetworkingEnabledAddsNetworkGate(t *testing.T) { + ctx := context.Background() + control := NewWithOptions(Options{NetworkingEnabled: true}) + + deployment := getWorkloadDeployment("test-deploy-net-on", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Controller) + + gateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates)) + for _, g := range instance.Spec.Controller.SchedulingGates { + gateNames = append(gateNames, g.Name) + } + assert.Contains(t, gateNames, instancecontrol.NetworkSchedulingGate.String(), + "Network gate must be present when networking is enabled") + assert.Contains(t, gateNames, instancecontrol.QuotaSchedulingGate.String(), + "Quota gate must be present") +} + +// TestNetworkingDisabledOmitsNetworkGate verifies that when networking is +// disabled, newly created Instances do NOT receive the Network scheduling gate, +// so they are not blocked on network provisioning. The Quota gate is still +// added so quota enforcement remains active. 
+func TestNetworkingDisabledOmitsNetworkGate(t *testing.T) { + ctx := context.Background() + control := NewWithOptions(Options{NetworkingEnabled: false}) + + deployment := getWorkloadDeployment("test-deploy-net-off", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Controller) + + gateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates)) + for _, g := range instance.Spec.Controller.SchedulingGates { + gateNames = append(gateNames, g.Name) + } + assert.NotContains(t, gateNames, instancecontrol.NetworkSchedulingGate.String(), + "Network gate must NOT be present when networking is disabled") + assert.Contains(t, gateNames, instancecontrol.QuotaSchedulingGate.String(), + "Quota gate must still be present when networking is disabled") +} + // Add more test functions below for different scenarios. +// TestInstanceLabels_FourNewLabelsStamped verifies that all four +// self-describing labels are stamped on newly created Instances, with values +// sourced from the WorkloadDeployment spec. 
+func TestInstanceLabels_FourNewLabelsStamped(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-labels-deploy", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + + assert.Equal(t, deployment.GetName(), instance.Labels[v1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must equal deployment name") + assert.Equal(t, deployment.Spec.CityCode, instance.Labels[v1alpha.CityCodeLabel], + "CityCodeLabel must equal deployment.Spec.CityCode") + assert.Equal(t, deployment.Spec.WorkloadRef.Name, instance.Labels[v1alpha.WorkloadNameLabel], + "WorkloadNameLabel must equal deployment.Spec.WorkloadRef.Name") + assert.Equal(t, deployment.Spec.PlacementName, instance.Labels[v1alpha.PlacementNameLabel], + "PlacementNameLabel must equal deployment.Spec.PlacementName") +} + +// TestInstanceLabels_RefreshedOnRecreate verifies that when a template change +// rolls an instance, the recreated instance carries the four self-describing +// labels sourced from the WorkloadDeployment. A template change deletes the +// drifted instance and recreates it via the create path on the following +// reconcile, which stamps the labels. +func TestInstanceLabels_RefreshedOnRecreate(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-labels-update", 1) + + // A ready existing instance on the old template hash. + currentInstances := []v1alpha.Instance{*getInstanceForDeployment(deployment, 0)} + + // Trigger a roll by changing the image. 
+ deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "updated-image" + + // First reconcile: the drifted instance is deleted (recreate), not updated. + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeDelete, actions[0].ActionType()) + assert.Equal(t, "test-labels-update-0", actions[0].Object.GetName()) + + // Next reconcile, after the old instance has been fully deleted and is gone: + // the empty slot is refilled by the create path, which stamps the labels. + actions, err = control.GetActions(ctx, scheme, deployment, nil) + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + + assert.Equal(t, deployment.GetName(), instance.Labels[v1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be set on the recreated instance") + assert.Equal(t, deployment.Spec.CityCode, instance.Labels[v1alpha.CityCodeLabel], + "CityCodeLabel must be set on the recreated instance") + assert.Equal(t, deployment.Spec.WorkloadRef.Name, instance.Labels[v1alpha.WorkloadNameLabel], + "WorkloadNameLabel must be set on the recreated instance") + assert.Equal(t, deployment.Spec.PlacementName, instance.Labels[v1alpha.PlacementNameLabel], + "PlacementNameLabel must be set on the recreated instance") +} + +// TestInstanceLocation_SetWhenDeploymentStatusLocationPresent verifies that when +// deployment.Status.Location is set, the new Instance receives it as Spec.Location. 
+func TestInstanceLocation_SetWhenDeploymentStatusLocationPresent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-location-set", 1) + deployment.Status.Location = &networkingv1alpha.LocationReference{ + Name: "loc-dfw-1", + Namespace: "networking-system", + } + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Location, + "Spec.Location must be set when deployment.Status.Location is non-nil") + assert.Equal(t, "loc-dfw-1", instance.Spec.Location.Name) + assert.Equal(t, "networking-system", instance.Spec.Location.Namespace) +} + +// TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent verifies that when +// deployment.Status.Location is nil (no Location object matches the city code), +// instance creation still succeeds and Spec.Location remains nil — no regression +// on the "create instances regardless of Location" contract. 
+func TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-location-nil", 1) + // deployment.Status.Location is intentionally not set (nil) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err, "instance creation must succeed even when Status.Location is nil") + assert.Len(t, actions, 1, "exactly one create action must be produced") + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Nil(t, instance.Spec.Location, + "Spec.Location must remain nil when deployment.Status.Location is not set") + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType(), + "action must be a Create, proving instance creation is not gated on Location") +} + +// TestLabelBackfill_NotReadyMatchingHash verifies that a not-Ready instance +// with an unchanged template hash receives a PatchLabels action when it is +// missing controller-managed labels. The action must not be a rollout recreate, +// must not alter spec/template, and must not block subsequent instances. +func TestLabelBackfill_NotReadyMatchingHash(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-notready", 2) + + // Instance 0: not-Ready, correct template hash, but missing city-code/workload-name labels. + instance0 := getInstanceForDeployment(deployment, 0) + apimeta.SetStatusCondition(&instance0.Status.Conditions, metav1.Condition{ + Type: v1alpha.InstanceReady, + Status: metav1.ConditionFalse, + Reason: "NotReady", + Message: "Instance is not ready", + LastTransitionTime: metav1.Now(), + }) + // Simulate pre-existing instance that only has the index label (missing the newer labels). 
+ instance0.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + } + + // Instance 1: needs to be created (nil in desiredInstances), so we only provide instance0. + currentInstances := []v1alpha.Instance{*instance0} + + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + // Collect actions by type. + var waitActions, createActions, recreateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeWait: + waitActions = append(waitActions, a) + case instancecontrol.ActionTypeCreate: + createActions = append(createActions, a) + case instancecontrol.ActionTypeDelete: + recreateActions = append(recreateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // The not-Ready instance must still produce a Wait (rollout is gated). + assert.Len(t, waitActions, 1, "not-Ready instance must still produce a Wait action") + assert.Equal(t, "test-backfill-notready-0", waitActions[0].Object.GetName()) + + // The missing instance-1 create is skipped (ordered policy, Wait is first). + assert.Len(t, createActions, 1, "instance-1 create action must be present") + assert.True(t, createActions[0].IsSkipped(), "create for instance-1 must be skipped while instance-0 is waiting") + + // No rollout recreate actions must be produced. + assert.Empty(t, recreateActions, "no rollout recreate must be produced for a matching-hash instance") + + // A PatchLabels action must be produced for instance-0. + assert.Len(t, patchActions, 1, "exactly one PatchLabels action for the label-drifted instance") + assert.Equal(t, "test-backfill-notready-0", patchActions[0].Object.GetName()) + assert.False(t, patchActions[0].IsSkipped(), "PatchLabels must not be skipped by the rollout skip-loop") + + // The patched object must carry all desired labels. 
+ patched, ok := patchActions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Equal(t, deployment.GetName(), patched.Labels[v1alpha.WorkloadDeploymentNameLabel]) + assert.Equal(t, deployment.Spec.CityCode, patched.Labels[v1alpha.CityCodeLabel]) + assert.Equal(t, deployment.Spec.WorkloadRef.Name, patched.Labels[v1alpha.WorkloadNameLabel]) + assert.Equal(t, deployment.Spec.PlacementName, patched.Labels[v1alpha.PlacementNameLabel]) + + // The patched object's spec and template-hash must be unchanged. + assert.Equal(t, instancecontrol.ComputeHash(deployment.Spec.Template), patched.Spec.Controller.TemplateHash, + "template hash must be unchanged by the label backfill") + assert.Equal(t, deployment.Spec.Template.Spec.Runtime, patched.Spec.Runtime, + "spec must be unchanged by the label backfill") +} + +// TestLabelBackfill_Idempotent verifies that an instance already carrying all +// correct controller-managed labels produces no PatchLabels action. +func TestLabelBackfill_Idempotent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-idempotent", 1) + + // Instance already has all controller-managed labels set correctly. 
+ instance := getInstanceForDeployment(deployment, 0) + instance.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + + currentInstances := []v1alpha.Instance{*instance} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + for _, a := range actions { + assert.NotEqual(t, instancecontrol.ActionTypePatchLabels, a.ActionType(), + "no PatchLabels action must be produced when all labels are already correct") + } +} + +// TestLabelBackfill_ReadyInstanceCorrected verifies that a Ready instance with +// correct template hash but drifted labels receives a PatchLabels action +// without triggering a rollout recreate. +func TestLabelBackfill_ReadyInstanceCorrected(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-ready", 1) + + // Ready instance with matching hash but missing city-code label. + instance := getInstanceForDeployment(deployment, 0) + // Remove the city-code label to simulate drift. + delete(instance.Labels, v1alpha.CityCodeLabel) + + currentInstances := []v1alpha.Instance{*instance} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + var recreateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeDelete: + recreateActions = append(recreateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // No rollout recreate must be produced — template hash matches. 
+ assert.Empty(t, recreateActions, "no rollout recreate must be produced for a matching-hash ready instance") + + // A PatchLabels action must be produced. + assert.Len(t, patchActions, 1, "PatchLabels action must be produced for the label-drifted ready instance") + patched, ok := patchActions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Equal(t, deployment.Spec.CityCode, patched.Labels[v1alpha.CityCodeLabel], + "city-code label must be corrected by the backfill") +} + +// TestLabelBackfill_DoesNotAffectRollingUpdate verifies that a genuine template +// change on a Ready instance still produces the normal ordered roll (a recreate +// Delete per instance) and that the PatchLabels path does not interfere with or +// duplicate it. +func TestLabelBackfill_DoesNotAffectRollingUpdate(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-rolling", 2) + + // Two ready instances with all correct labels and matching current hash. 
+ instance0 := getInstanceForDeployment(deployment, 0) + instance0.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + instance1 := getInstanceForDeployment(deployment, 1) + instance1.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "1", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + + // Trigger a template change. + deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "rolling-update-image" + + currentInstances := []v1alpha.Instance{*instance0, *instance1} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + var recreateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeDelete: + recreateActions = append(recreateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // Two recreate (Delete) actions expected (one per instance), ordered highest-to-lowest. 
+ assert.Len(t, recreateActions, 2, "both instances must produce recreate actions on template change") + assert.Equal(t, "test-backfill-rolling-1", recreateActions[0].Object.GetName(), + "recreate actions must be ordered highest ordinal first") + assert.Equal(t, "test-backfill-rolling-0", recreateActions[1].Object.GetName()) + assert.False(t, recreateActions[0].IsSkipped(), "first recreate must be active") + assert.True(t, recreateActions[1].IsSkipped(), "second recreate must be skipped (ordered rollout)") + + // No PatchLabels — all labels are already correct. + assert.Empty(t, patchActions, "no PatchLabels when all labels are already correct") +} + func getWorkloadDeployment(name string, minReplicas int32) *v1alpha.WorkloadDeployment { instance := getInstanceTemplate(name, 0) deployment := &v1alpha.WorkloadDeployment{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: "default", + UID: "test-wd-uid", }, Spec: v1alpha.WorkloadDeploymentSpec{ + WorkloadRef: v1alpha.WorkloadReference{ + Name: "test-workload", + UID: "test-workload-uid", + }, + PlacementName: "test-placement", + CityCode: "DFW", ScaleSettings: v1alpha.HorizontalScaleSettings{ MinReplicas: minReplicas, InstanceManagementPolicy: v1alpha.OrderedReadyInstanceManagementPolicyType, @@ -180,6 +589,20 @@ func getInstanceForDeployment(deployment *v1alpha.WorkloadDeployment, ordinal in TemplateHash: instancecontrol.ComputeHash(deployment.Spec.Template), } + // Stamp all controller-managed labels so that the label-backfill path is a + // no-op for instances built by this helper. Tests that specifically exercise + // label drift should manipulate the labels directly after calling this helper. 
+ if instance.Labels == nil { + instance.Labels = map[string]string{} + } + instance.Labels[v1alpha.InstanceIndexLabel] = strconv.Itoa(ordinal) + instance.Labels[v1alpha.WorkloadUIDLabel] = string(deployment.Spec.WorkloadRef.UID) + instance.Labels[v1alpha.WorkloadDeploymentUIDLabel] = string(deployment.GetUID()) + instance.Labels[v1alpha.WorkloadDeploymentNameLabel] = deployment.GetName() + instance.Labels[v1alpha.CityCodeLabel] = deployment.Spec.CityCode + instance.Labels[v1alpha.WorkloadNameLabel] = deployment.Spec.WorkloadRef.Name + instance.Labels[v1alpha.PlacementNameLabel] = deployment.Spec.PlacementName + return instance } diff --git a/internal/controller/singlemode.go b/internal/controller/singlemode.go new file mode 100644 index 00000000..46a2aa95 --- /dev/null +++ b/internal/controller/singlemode.go @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// NewSingleModeProjectID returns an InstanceProjectIDFunc for single-cell mode. +// It reads the upstream-cluster-name label on the edge namespace (e.g. +// "cluster-datum-cloud") and decodes it to the project ID ("datum-cloud"). +// This is the inverse of the "cluster-" encoding used by NSO's +// MappedNamespaceResourceStrategy when stamping cluster-scoped namespace labels. +// The label is stamped atomically at namespace creation, before any Instance +// can exist in the namespace, so an absent label is misconfiguration: the +// returned error wraps errProjectIdentityUnresolvable and names the namespace +// and the missing label. Transient API failures return ordinary errors +// (requeue with backoff). 
+func NewSingleModeProjectID(mgr mcmanager.Manager) InstanceProjectIDFunc { + return func(ctx context.Context, cn multicluster.ClusterName, inst *computev1alpha.Instance) (string, error) { + ns, err := readEdgeNamespace(ctx, mgr, cn, inst.Namespace) + if err != nil { + return "", err + } + encoded := ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if encoded == "" { + return "", fmt.Errorf("edge namespace %q is missing label %q: %w", + inst.Namespace, downstreamclient.UpstreamOwnerClusterNameLabel, errProjectIdentityUnresolvable) + } + return DecodeClusterName(encoded), nil + } +} + +// NewSingleModeProjectNamespace returns an InstanceProjectNamespaceFunc for +// single-cell mode. It reads the upstream-namespace label on the edge namespace +// (e.g. "ns-efdf8ca1-...") to find the in-project namespace ("default") where +// ResourceClaims must be created in the project control plane. +// The label is stamped atomically at namespace creation, before any Instance +// can exist in the namespace, so an absent label is misconfiguration: the +// returned error wraps errProjectIdentityUnresolvable and names the namespace +// and the missing label. Transient API failures return ordinary errors +// (requeue with backoff). +func NewSingleModeProjectNamespace(mgr mcmanager.Manager) InstanceProjectNamespaceFunc { + return func(ctx context.Context, cn multicluster.ClusterName, inst *computev1alpha.Instance) (string, error) { + ns, err := readEdgeNamespace(ctx, mgr, cn, inst.Namespace) + if err != nil { + return "", err + } + projectNS := ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if projectNS == "" { + return "", fmt.Errorf("edge namespace %q is missing label %q: %w", + inst.Namespace, downstreamclient.UpstreamOwnerNamespaceLabel, errProjectIdentityUnresolvable) + } + return projectNS, nil + } +} + +// readEdgeNamespace reads the edge namespace object via the uncached APIReader +// (no informer started, no cache sync required) with a short deadline. 
+// Returns a transient error on API failures so callers can requeue with backoff. +func readEdgeNamespace( + ctx context.Context, + mgr mcmanager.Manager, + clusterName multicluster.ClusterName, + namespace string, +) (corev1.Namespace, error) { + cl, err := mgr.GetCluster(ctx, clusterName) + if err != nil { + return corev1.Namespace{}, fmt.Errorf("readEdgeNamespace: getting cluster %q: %w", clusterName, err) + } + var ns corev1.Namespace + getCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + if err := cl.GetAPIReader().Get(getCtx, client.ObjectKey{Name: namespace}, &ns); err != nil { + return corev1.Namespace{}, fmt.Errorf("readEdgeNamespace: reading namespace %q: %w", namespace, err) + } + return ns, nil +} diff --git a/internal/controller/singlemode_test.go b/internal/controller/singlemode_test.go new file mode 100644 index 00000000..99b94f6b --- /dev/null +++ b/internal/controller/singlemode_test.go @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +const ( + // smTestEdgeNS mirrors the ns-{uid} edge namespaces NSO creates. + smTestEdgeNS = "ns-efdf8ca1-7b6e-4a30-9b1c-0d6f55555555" + // smTestEncodedCluster mirrors the "cluster-" encoding stamped by + // NSO's MappedNamespaceResourceStrategy. + smTestEncodedCluster = "cluster-datum-cloud" + smTestProjectID = "datum-cloud" + smTestProjectNS = "default" + smTestCluster = "single" +) + +// smEdgeNamespace builds an edge namespace shaped like production: both +// identity labels are stamped together at creation. Passing nil labels models +// convention drift where the stamping never happened. 
+func smEdgeNamespace(labels map[string]string) *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: smTestEdgeNS, + Labels: labels, + }, + } +} + +func smInstance() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-instance", + Namespace: smTestEdgeNS, + }, + } +} + +func TestNewSingleModeProjectID(t *testing.T) { + t.Run("label present: decodes cluster- to the project ID", func(t *testing.T) { + ns := smEdgeNamespace(map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: smTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: smTestProjectNS, + }) + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient(ns))) + + projectID, err := NewSingleModeProjectID(mgr)(context.Background(), smTestCluster, smInstance()) + require.NoError(t, err) + assert.Equal(t, smTestProjectID, projectID) + }) + + t.Run("label absent: returns errProjectIdentityUnresolvable naming the label", func(t *testing.T) { + // Only the namespace label is present — the cluster-name label was never + // stamped (convention drift, not a propagation race). 
+ ns := smEdgeNamespace(map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: smTestProjectNS, + }) + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient(ns))) + + _, err := NewSingleModeProjectID(mgr)(context.Background(), smTestCluster, smInstance()) + require.Error(t, err) + assert.ErrorIs(t, err, errProjectIdentityUnresolvable) + assert.Contains(t, err.Error(), smTestEdgeNS, + "error must name the edge namespace") + assert.Contains(t, err.Error(), downstreamclient.UpstreamOwnerClusterNameLabel, + "error must name the missing label") + }) + + t.Run("namespace read failure: transient error, not the sentinel", func(t *testing.T) { + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient())) + + _, err := NewSingleModeProjectID(mgr)(context.Background(), smTestCluster, smInstance()) + require.Error(t, err) + assert.False(t, errors.Is(err, errProjectIdentityUnresolvable), + "a failed namespace read is transient and must not be classified as unresolvable identity") + }) +} + +func TestNewSingleModeProjectNamespace(t *testing.T) { + t.Run("label present: returns the in-project namespace", func(t *testing.T) { + ns := smEdgeNamespace(map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: smTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: smTestProjectNS, + }) + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient(ns))) + + projectNS, err := NewSingleModeProjectNamespace(mgr)(context.Background(), smTestCluster, smInstance()) + require.NoError(t, err) + assert.Equal(t, smTestProjectNS, projectNS) + }) + + t.Run("label absent: returns errProjectIdentityUnresolvable naming the label", func(t *testing.T) { + ns := smEdgeNamespace(map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: smTestEncodedCluster, + }) + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient(ns))) + + _, err := 
NewSingleModeProjectNamespace(mgr)(context.Background(), smTestCluster, smInstance()) + require.Error(t, err) + assert.ErrorIs(t, err, errProjectIdentityUnresolvable) + assert.Contains(t, err.Error(), smTestEdgeNS, + "error must name the edge namespace") + assert.Contains(t, err.Error(), downstreamclient.UpstreamOwnerNamespaceLabel, + "error must name the missing label") + }) + + t.Run("namespace read failure: transient error, not the sentinel", func(t *testing.T) { + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient())) + + _, err := NewSingleModeProjectNamespace(mgr)(context.Background(), smTestCluster, smInstance()) + require.Error(t, err) + assert.False(t, errors.Is(err, errProjectIdentityUnresolvable), + "a failed namespace read is transient and must not be classified as unresolvable identity") + }) +} diff --git a/internal/controller/testing_helpers_test.go b/internal/controller/testing_helpers_test.go new file mode 100644 index 00000000..cc3d3d9f --- /dev/null +++ b/internal/controller/testing_helpers_test.go @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/cluster" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Scheme helpers ─────────────────────────────────────────────────────────── + +// newProjectScheme builds a runtime.Scheme with the types needed by the project +// cluster (corev1 + compute). 
+func newProjectScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + return s +} + +// newKarmadaScheme builds a runtime.Scheme with the types needed by the Karmada +// API server (corev1 + compute + karmada policy). +func newKarmadaScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + _ = karmadapolicyv1alpha1.Install(s) + return s +} + +// newProjectFakeClient returns a fake client pre-populated with the given +// objects and the project scheme. +func newProjectFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(objs...). + WithStatusSubresource(objs...). + Build() +} + +// newKarmadaFakeClient returns a fake client pre-populated with the given +// objects and the Karmada scheme. +func newKarmadaFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() +} + +// ─── Fake cluster.Cluster ───────────────────────────────────────────────────── + +// fakeCluster is a minimal cluster.Cluster implementation for tests. +// Embeds the interface so only the methods we need are implemented. +type fakeCluster struct { + cluster.Cluster // nil embed — panics if unimplemented methods are called + cl client.Client +} + +func (f *fakeCluster) GetClient() client.Client { return f.cl } +func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.cl.Scheme() } +func (f *fakeCluster) GetAPIReader() client.Reader { return f.cl } + +// newFakeCluster wraps a fake client in a fakeCluster. +func newFakeCluster(cl client.Client) *fakeCluster { + return &fakeCluster{cl: cl} +} + +// ─── Fake mcmanager.Manager ─────────────────────────────────────────────────── + +// fakeMCManager is a minimal mcmanager.Manager implementation that serves a +// fixed map of project clusters. 
Only GetCluster is implemented; all other +// Manager methods panic through the embedded nil interface. +type fakeMCManager struct { + mcmanager.Manager // nil embed — panics if unimplemented methods are called + clusters map[string]cluster.Cluster +} + +func (m *fakeMCManager) GetCluster(_ context.Context, name multicluster.ClusterName) (cluster.Cluster, error) { + if c, ok := m.clusters[string(name)]; ok { + return c, nil + } + return nil, fmt.Errorf("cluster %q not found in fake manager", name) +} + +// newFakeMCManager returns a fakeMCManager with a single named cluster. +func newFakeMCManager(clusterName string, cl cluster.Cluster) *fakeMCManager { + return &fakeMCManager{ + clusters: map[string]cluster.Cluster{clusterName: cl}, + } +} diff --git a/internal/controller/workload_controller.go b/internal/controller/workload_controller.go index 6e907b65..34f55def 100644 --- a/internal/controller/workload_controller.go +++ b/internal/controller/workload_controller.go @@ -26,13 +26,17 @@ import ( mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) -const workloadControllerFinalizer = "compute.datumapis.com/workload-controller" +const ( + workloadControllerFinalizer = "compute.datumapis.com/workload-controller" + workloadConditionTypeAvailable = "Available" +) // WorkloadReconciler reconciles a Workload object type WorkloadReconciler struct { @@ -118,7 +122,7 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ if len(notFoundNetworks) > 0 { missingNetworks := strings.Join(notFoundNetworks.UnsortedList(), ", ") changed := apimeta.SetStatusCondition(&workload.Status.Conditions, 
metav1.Condition{ - Type: "Available", + Type: workloadConditionTypeAvailable, Status: metav1.ConditionFalse, Reason: "NetworkNotFound", Message: fmt.Sprintf("Unable to find networks: %s", missingNetworks), @@ -216,6 +220,7 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( newWorkloadStatus := workload.Status.DeepCopy() totalReplicas := int32(0) totalCurrentReplicas := int32(0) + totalUpdatedReplicas := int32(0) totalDesiredReplicas := int32(0) totalReadyReplicas := int32(0) totalDeployments := int32(0) @@ -247,12 +252,14 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( foundAvailableDeployment := false replicas := int32(0) currentReplicas := int32(0) + updatedReplicas := int32(0) desiredReplicas := int32(0) readyReplicas := int32(0) totalDeployments += int32(len(placementDeployments)) for _, deployment := range placementDeployments { replicas += deployment.Status.Replicas currentReplicas += deployment.Status.CurrentReplicas + updatedReplicas += deployment.Status.UpdatedReplicas desiredReplicas += deployment.Status.DesiredReplicas readyReplicas += deployment.Status.ReadyReplicas @@ -262,11 +269,13 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( } totalReplicas += replicas totalCurrentReplicas += currentReplicas + totalUpdatedReplicas += updatedReplicas totalDesiredReplicas += desiredReplicas totalReadyReplicas += readyReplicas placementStatus.Replicas = replicas placementStatus.CurrentReplicas = currentReplicas + placementStatus.UpdatedReplicas = updatedReplicas placementStatus.DesiredReplicas = desiredReplicas placementStatus.ReadyReplicas = readyReplicas @@ -300,8 +309,10 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( newWorkloadStatus.Deployments = totalDeployments newWorkloadStatus.Replicas = totalReplicas newWorkloadStatus.CurrentReplicas = totalCurrentReplicas + newWorkloadStatus.UpdatedReplicas = totalUpdatedReplicas newWorkloadStatus.DesiredReplicas = totalDesiredReplicas newWorkloadStatus.ReadyReplicas = totalReadyReplicas 
+ newWorkloadStatus.ObservedGeneration = workload.Generation if equality.Semantic.DeepEqual(workload.Status, newWorkloadStatus) { return nil @@ -383,9 +394,9 @@ func (r *WorkloadReconciler) getDeploymentsForWorkload( existingDeployments.Insert(deployment.Name) } - var locations networkingv1alpha.LocationList + var locations networkingv1alpha.LocationBindingList if err := upstreamClient.List(ctx, &locations); err != nil { - return nil, nil, fmt.Errorf("failed to list locations: %w", err) + return nil, nil, fmt.Errorf("failed to list location bindings: %w", err) } if len(locations.Items) == 0 { @@ -463,7 +474,7 @@ func (r *WorkloadReconciler) SetupWithManager(mgr mcmanager.Manager) error { return mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.Workload{}, mcbuilder.WithEngageWithLocalCluster(false)). Owns(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - Watches(&networkingv1alpha.Network{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + Watches(&networkingv1alpha.Network{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, network client.Object) []mcreconcile.Request { logger := log.FromContext(ctx) diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 50e21ef0..f810e53f 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -24,6 +24,7 @@ import ( mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha 
"go.datum.net/compute/api/v1alpha" @@ -37,11 +38,23 @@ import ( type WorkloadDeploymentReconciler struct { mgr mcmanager.Manager finalizers finalizer.Finalizers + + // NetworkingEnabled controls whether the networking integration with + // network-services-operator is active. When false, NetworkBinding creation is + // skipped, the Network scheduling gate is never added to Instances (and is + // actively removed if present), and the networking step is treated as + // immediately ready. Defaults to true. + NetworkingEnabled bool } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=locations,verbs=get;list;watch +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=networkbindings,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=networkcontexts,verbs=get;list;watch +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=subnetclaims,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=subnets,verbs=get;list;watch func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) @@ -76,7 +89,10 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco if err = cl.GetClient().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) } - return ctrl.Result{}, nil + // The finalizer-add Update is metadata-only and may be filtered by event + // predicates or handlers, so requeue explicitly to guarantee the + 
// deployment is reconciled past this point. + return ctrl.Result{Requeue: true}, nil } if !deployment.DeletionTimestamp.IsZero() { @@ -86,10 +102,6 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco logger.Info("reconciling deployment") defer logger.Info("reconcile complete") - if deployment.Status.Location == nil { - return ctrl.Result{}, nil - } - // Collect all instances for this deployment listOpts := client.MatchingLabels{ computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), @@ -100,7 +112,9 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, fmt.Errorf("failed listing instances: %w", err) } - instanceControl := instancecontrolstateful.New() + instanceControl := instancecontrolstateful.NewWithOptions(instancecontrolstateful.Options{ + NetworkingEnabled: r.NetworkingEnabled, + }) actions, err := instanceControl.GetActions(ctx, cl.GetScheme(), &deployment, instances.Items) if err != nil { @@ -122,9 +136,28 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco } } - networkReady, err := r.reconcileNetworks(ctx, cl.GetClient(), &deployment) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling networks: %w", err) + // When networking is disabled, bypass the entire network provisioning path. + // The Network scheduling gate is treated as cleared and no NetworkBindings + // are created. This lets Instances reach the runtime on cells where + // network-services-operator (VPC) is not yet available. 
+ var networkReady bool + locationResolved := true + if !r.NetworkingEnabled { + networkReady = true + } else { + var resolvedLocation *networkingv1alpha.LocationReference + networkReady, resolvedLocation, err = r.reconcileNetworks(ctx, cl.GetClient(), &deployment) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed reconciling networks: %w", err) + } + // Persist the resolved Location to status so downstream components (e.g. + // the stateful instance control strategy) can propagate it to Instances. + // When no matching Location exists, resolvedLocation is nil and + // Status.Location remains nil — instance creation is not blocked. + locationResolved = resolvedLocation != nil + if resolvedLocation != nil { + deployment.Status.Location = resolvedLocation + } } // Networks are all ready with subnets ready to use, remove any scheduling @@ -138,64 +171,73 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco desiredReplicas = 0 } - currentReplicas, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates(ctx, cl.GetClient(), &deployment, instances.Items, networkReady) + currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates(ctx, cl.GetClient(), &deployment, instances.Items, networkReady) if err != nil { return ctrl.Result{}, err } - patchResult, err := controllerutil.CreateOrPatch(ctx, cl.GetClient(), &deployment, func() error { - deployment.Status.Replicas = int32(replicas) - deployment.Status.CurrentReplicas = int32(currentReplicas) - deployment.Status.DesiredReplicas = desiredReplicas - deployment.Status.ReadyReplicas = int32(readyReplicas) - - if quotaBlockedReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, - Message: fmt.Sprintf("%d of %d desired replicas are pending 
quota", quotaBlockedReplicas, desiredReplicas), - }) - } else { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionTrue, - Reason: "ReplicasAvailable", - Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), - }) - } - - if readyReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionTrue, - Reason: "StableInstanceFound", - Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), - }) - } else if !networkReady { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningNetwork", - Message: "Network is being provisioned", - }) - } else if replicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningInstances", - Message: "Instances are being provisioned", - }) - } + deployment.Status.Replicas = int32(replicas) + deployment.Status.CurrentReplicas = int32(currentReplicas) + deployment.Status.UpdatedReplicas = int32(updatedReplicas) + deployment.Status.DesiredReplicas = desiredReplicas + deployment.Status.ReadyReplicas = int32(readyReplicas) + deployment.Status.ObservedGeneration = deployment.Generation + + if quotaBlockedReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), + }) + } else { + 
apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionTrue, + Reason: "ReplicasAvailable", + Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), + }) + } - return nil - }) + if readyReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionTrue, + Reason: "StableInstanceFound", + Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), + }) + } else if !locationResolved { + // Network provisioning cannot even start without a Location, so surface + // the unresolved city rather than the generic provisioning reason — it is + // the only user-visible signal while the deployment waits for the city's + // Location to be created. + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "NoMatchingLocation", + Message: fmt.Sprintf("No Location matches city code %q", deployment.Spec.CityCode), + }) + } else if !networkReady { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningNetwork", + Message: "Network is being provisioned", + }) + } else if replicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningInstances", + Message: "Instances are being provisioned", + }) + } - if err != nil { + if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) } - logger.Info("deployment status processed", "operation_result", patchResult) + 
logger.Info("deployment status updated") return ctrl.Result{}, nil } @@ -206,14 +248,17 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( deployment *computev1alpha.WorkloadDeployment, instances []computev1alpha.Instance, networkReady bool, -) (currentReplicas, readyReplicas, quotaBlockedReplicas int, err error) { +) (currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas int, err error) { templateHash := instancecontrol.ComputeHash(deployment.Spec.Template) for _, instance := range instances { if apimeta.IsStatusConditionPresentAndEqual(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted, metav1.ConditionFalse) { quotaBlockedReplicas++ } - if networkReady && len(instance.Spec.Controller.SchedulingGates) > 0 { + // Spec.Controller is a nilable pointer; guard it before dereferencing the + // scheduling gates so an instance without controller state cannot panic + // the reconcile (mirrors the Status.Controller guard below). + if networkReady && instance.Spec.Controller != nil && len(instance.Spec.Controller.SchedulingGates) > 0 { newGates := slices.DeleteFunc(instance.Spec.Controller.SchedulingGates, func(gate computev1alpha.SchedulingGate) bool { return gate.Name == instancecontrol.NetworkSchedulingGate.String() }) @@ -222,31 +267,74 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( instance.Spec.Controller.SchedulingGates = newGates return nil }); patchErr != nil { - return 0, 0, 0, fmt.Errorf("failed updating instance: %w", patchErr) + return 0, 0, 0, 0, fmt.Errorf("failed updating instance: %w", patchErr) } } } - if apimeta.IsStatusConditionTrue(instance.Status.Conditions, computev1alpha.InstanceProgrammed) { - if instance.Status.Controller.ObservedTemplateHash == templateHash { - currentReplicas++ - } + // An instance is "updated" once it has observed the desired template + // revision, regardless of readiness. 
Counting these (even before they are + // Programmed) makes a rolling update / restart observable: UpdatedReplicas + // dips below Replicas while the recreated instance comes up, then recovers. + // Status.Controller is a pointer the infra provider may not have populated + // yet; guard the deref to avoid a panic that would abort the reconcile. + onLatestRevision := instance.Status.Controller != nil && + instance.Status.Controller.ObservedTemplateHash == templateHash + if onLatestRevision { + updatedReplicas++ + } + + // CurrentReplicas is the Programmed subset of UpdatedReplicas — updated + // instances that are ready to serve. + if onLatestRevision && apimeta.IsStatusConditionTrue(instance.Status.Conditions, computev1alpha.InstanceProgrammed) { + currentReplicas++ } if apimeta.IsStatusConditionTrue(instance.Status.Conditions, computev1alpha.InstanceReady) { readyReplicas++ } } - return currentReplicas, readyReplicas, quotaBlockedReplicas, nil + return currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, nil } +// reconcileNetworks ensures NetworkBindings and SubnetClaims exist for all +// network interfaces on the deployment. It returns (networkReady, resolvedLocation, err). +// resolvedLocation is non-nil when a Location matching the deployment's city code +// was found; nil otherwise. Instance creation is never gated on resolvedLocation +// being non-nil — callers must treat a nil location as best-effort only. func (r *WorkloadDeploymentReconciler) reconcileNetworks( ctx context.Context, c client.Client, deployment *computev1alpha.WorkloadDeployment, -) (bool, error) { +) (bool, *networkingv1alpha.LocationReference, error) { logger := log.FromContext(ctx) + // Resolve the Location for this deployment's city code. With Karmada + // propagation the WorkloadDeployment lands in the cluster that serves the + // requested city, so the Location object for that city must exist locally. 
+ var locationList networkingv1alpha.LocationList + if err := c.List(ctx, &locationList); err != nil { + return false, nil, fmt.Errorf("failed to list locations: %w", err) + } + + var locationRef *networkingv1alpha.LocationReference + for _, loc := range locationList.Items { + if cityCode, ok := loc.Spec.Topology["topology.datum.net/city-code"]; ok && cityCode == deployment.Spec.CityCode { + locationRef = &networkingv1alpha.LocationReference{ + Name: loc.Name, + Namespace: loc.Namespace, + } + break + } + } + + if locationRef == nil { + // Surfaced to users via the Available condition (NoMatchingLocation); the + // log is debug-level detail only. + logger.V(1).Info("no location found for city code, waiting", "cityCode", deployment.Spec.CityCode) + return false, nil, nil + } + // First, ensure we have a NetworkBinding for each interface, and that the // binding is ready before we move on to create SubnetClaims. @@ -260,7 +348,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.Get(ctx, networkBindingObjectKey, &networkBinding); client.IgnoreNotFound(err) != nil { - return false, fmt.Errorf("failed checking for existing network binding: %w", err) + return false, nil, fmt.Errorf("failed checking for existing network binding: %w", err) } if networkBinding.CreationTimestamp.IsZero() { @@ -271,16 +359,16 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( }, Spec: networkingv1alpha.NetworkBindingSpec{ Network: networkInterface.Network, - Location: *deployment.Status.Location, + Location: *locationRef, }, } if err := controllerutil.SetControllerReference(deployment, &networkBinding, c.Scheme()); err != nil { - return false, fmt.Errorf("failed to set controller on network binding: %w", err) + return false, nil, fmt.Errorf("failed to set controller on network binding: %w", err) } if err := c.Create(ctx, &networkBinding); err != nil { - return false, fmt.Errorf("failed creating network binding: %w", err) + return false, nil, 
fmt.Errorf("failed creating network binding: %w", err) } } @@ -293,7 +381,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( if !allNetworkBindingsReady { logger.Info("waiting for network bindings to be ready") - return false, nil + return false, locationRef, nil } // TODO(jreese): Currently this makes a SubnetClaim that will be used by @@ -312,12 +400,12 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.Get(ctx, networkContextObjectKey, &networkContext); client.IgnoreNotFound(err) != nil { - return false, fmt.Errorf("failed checking for existing network context: %w", err) + return false, nil, fmt.Errorf("failed checking for existing network context: %w", err) } if !apimeta.IsStatusConditionTrue(networkContext.Status.Conditions, networkingv1alpha.NetworkContextReady) { logger.Info("waiting for network context to be ready", "network_context", networkContext.Name) - return false, nil + return false, locationRef, nil } var subnetClaims networkingv1alpha.SubnetClaimList @@ -326,7 +414,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.List(ctx, &subnetClaims, listOpts...); err != nil { - return false, fmt.Errorf("failed listing subnet claims: %w", err) + return false, nil, fmt.Errorf("failed listing subnet claims: %w", err) } var subnetClaim networkingv1alpha.SubnetClaim @@ -347,8 +435,8 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } // If it's not the same location, don't consider the subnet claim. 
- if claim.Spec.Location.Namespace != deployment.Status.Location.Namespace || - claim.Spec.Location.Name != deployment.Status.Location.Name { + if claim.Spec.Location.Namespace != locationRef.Namespace || + claim.Spec.Location.Name != locationRef.Name { continue } @@ -371,28 +459,28 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( NetworkContext: networkingv1alpha.LocalNetworkContextRef{ Name: networkContext.Name, }, - Location: *deployment.Status.Location, + Location: *locationRef, }, } if err := controllerutil.SetOwnerReference(&networkContext, &subnetClaim, c.Scheme()); err != nil { - return false, fmt.Errorf("failed to set controller on subnet claim: %w", err) + return false, nil, fmt.Errorf("failed to set controller on subnet claim: %w", err) } if err := c.Create(ctx, &subnetClaim); err != nil { - return false, fmt.Errorf("failed creating subnet claim: %w", err) + return false, nil, fmt.Errorf("failed creating subnet claim: %w", err) } logger.Info("created subnet claim", "subnetClaim", subnetClaim.Name) - return false, nil + return false, locationRef, nil } logger.Info("found subnet claim", "subnetClaim", subnetClaim.Name) if !apimeta.IsStatusConditionTrue(subnetClaim.Status.Conditions, "Ready") { logger.Info("waiting for subnet claim to be ready", "subnetClaim", subnetClaim.Name) - return false, nil + return false, locationRef, nil } var subnet networkingv1alpha.Subnet @@ -401,19 +489,19 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( Name: subnetClaim.Status.SubnetRef.Name, } if err := c.Get(ctx, subnetObjectKey, &subnet); err != nil { - return false, fmt.Errorf("failed fetching subnet: %w", err) + return false, nil, fmt.Errorf("failed fetching subnet: %w", err) } if !apimeta.IsStatusConditionTrue(subnet.Status.Conditions, "Ready") { logger.Info("waiting for subnet to be ready", "subnet", subnet.Name) - return false, nil + return false, locationRef, nil } logger.Info("subnet is ready", "subnet", subnet.Name) } - return true, nil + 
return true, locationRef, nil } var errDeploymentHasInstances = errors.New("deployment has instances") @@ -468,47 +556,86 @@ func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager) e if err := r.finalizers.Register(workloadControllerFinalizer, r); err != nil { return fmt.Errorf("failed to register finalizer: %w", err) } - return mcbuilder.ControllerManagedBy(mgr). + + b := mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - Owns(&computev1alpha.Instance{}). - Owns(&networkingv1alpha.NetworkBinding{}). - Watches(&networkingv1alpha.SubnetClaim{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { - subnetClaim := o.(*networkingv1alpha.SubnetClaim) - return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnetClaim.Spec.Location) - }) - }). - Watches(&networkingv1alpha.Subnet{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { - subnet := o.(*networkingv1alpha.Subnet) - return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnet.Spec.Location) + Owns(&computev1alpha.Instance{}) + + // Only watch networking resources when the networking integration is enabled. + // On cells without network-services-operator these watches would log spurious + // errors for missing CRDs. + if r.NetworkingEnabled { + b = b. + Owns(&networkingv1alpha.NetworkBinding{}). + // A deployment whose city has no Location yet waits without any other + // wake-up event: NetworkBindings/SubnetClaims/Subnets only exist after + // a Location resolved, and the reconciler does not poll. 
Watching + // Locations re-reconciles the waiting deployments when their city's + // Location appears (or its topology changes). + Watches(&networkingv1alpha.Location{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + location := o.(*networkingv1alpha.Location) + return enqueueWorkloadDeploymentsForLocation(ctx, cl.GetClient(), clusterName, location) + }) + }). + Watches(&networkingv1alpha.SubnetClaim{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + subnetClaim := o.(*networkingv1alpha.SubnetClaim) + return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnetClaim.Spec.Location) + }) + }). + Watches(&networkingv1alpha.Subnet{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + subnet := o.(*networkingv1alpha.Subnet) + return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnet.Spec.Location) + }) }) - }). - Complete(r) + } + + return b.Complete(r) } -func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName string, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { +// enqueueWorkloadDeploymentByLocation maps an object that carries a +// LocationReference (SubnetClaim, Subnet) to the WorkloadDeployments targeting +// the referenced Location's city. The reference must be resolved to the Location +// object first because only its topology carries the city code. 
+func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName multicluster.ClusterName, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { logger := log.FromContext(ctx) - cluster, err := mgr.GetCluster(ctx, clusterName) + cl, err := mgr.GetCluster(ctx, clusterName) if err != nil { logger.Error(err, "failed to get cluster") return nil } - clusterClient := cluster.GetClient() + clusterClient := cl.GetClient() - locationName := (types.NamespacedName{ + var location networkingv1alpha.Location + if err := clusterClient.Get(ctx, types.NamespacedName{ Namespace: locationRef.Namespace, Name: locationRef.Name, - }).String() - listOpts := client.MatchingFields{ - deploymentLocationIndex: locationName, + }, &location); err != nil { + logger.Error(err, "failed to get location for enqueue", "location", locationRef) + return nil } - var workloadDeployments computev1alpha.WorkloadDeploymentList + return enqueueWorkloadDeploymentsForLocation(ctx, clusterClient, clusterName, &location) +} - if err := clusterClient.List(ctx, &workloadDeployments, listOpts); err != nil { - logger.Error(err, "failed to list workloads") +// enqueueWorkloadDeploymentsForLocation maps a Location to the +// WorkloadDeployments that target its city, via the deploymentCityCodeIndex. 
+func enqueueWorkloadDeploymentsForLocation(ctx context.Context, c client.Client, clusterName multicluster.ClusterName, location *networkingv1alpha.Location) []mcreconcile.Request { + logger := log.FromContext(ctx) + + cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] + if !ok { + return nil + } + + var workloadDeployments computev1alpha.WorkloadDeploymentList + if err := c.List(ctx, &workloadDeployments, client.MatchingFields{ + deploymentCityCodeIndex: cityCode, + }); err != nil { + logger.Error(err, "failed to list workload deployments") return nil } diff --git a/internal/controller/workloaddeployment_controller_test.go b/internal/controller/workloaddeployment_controller_test.go new file mode 100644 index 00000000..e343a17b --- /dev/null +++ b/internal/controller/workloaddeployment_controller_test.go @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/controller/instancecontrol" +) + +const ( + // wdControllerTestName / wdControllerTestNS / wdControllerTestUID are shared + // fixtures for the WorkloadDeployment controller unit tests. + wdControllerTestName = "test-wd" + wdControllerTestNS = "default" + wdControllerTestUID = "wd-uid-test" + + // wdControllerTestCityCode is the shared CityCode fixture for + // WorkloadDeployment controller tests. + wdControllerTestCityCode = "DFW" + + // wdControllerTestWorkload is the shared WorkloadRef fixture. 
+ wdControllerTestWorkload = "test-workload" + + // wdTestReasonProgrammed / wdTestReasonReady are condition Reason fixtures + // matching what the infra provider writes on Instances. + wdTestReasonProgrammed = "Programmed" + wdTestReasonReady = "Ready" +) + +// wdControllerTestDeployment builds a WorkloadDeployment fixture shaped like a +// cell-local deployment after Karmada propagation: city code, placement, and a +// minimal instance template so ComputeHash produces a stable hash. +func wdControllerTestDeployment(minReplicas int32) *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: wdControllerTestName, + Namespace: wdControllerTestNS, + UID: wdControllerTestUID, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: wdControllerTestCityCode, + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: wdControllerTestWorkload}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: minReplicas, + // Always present in production: the API server defaults the policy + // via kubebuilder, and the instance-control strategy emits no + // create/wait actions without it. + InstanceManagementPolicy: computev1alpha.OrderedReadyInstanceManagementPolicyType, + }, + Template: computev1alpha.InstanceTemplateSpec{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{}, + }, + }, + }, + }, + } +} + +// wdControllerTestInstance builds an Instance fixture labeled the way the +// instance control strategy creates them (workload-deployment-uid label set). 
+func wdControllerTestInstance(name string) computev1alpha.Instance { + return computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: wdControllerTestNS, + Labels: map[string]string{ + computev1alpha.WorkloadDeploymentUIDLabel: wdControllerTestUID, + }, + }, + } +} + +// TestReconcileInstanceGates_NilController_DoesNotPanic is a regression test for +// the case where an Instance has Programmed=True but Status.Controller is nil. +// +// Background: Status.Controller is a nilable pointer that the infra provider +// populates independently of setting the Programmed condition. Before the guard +// was added, reconcileInstanceGates would dereference Status.Controller while +// counting currentReplicas, causing a nil pointer panic that aborted the +// reconcile loop and froze WorkloadDeployment status. +// +// This test verifies that: +// 1. The reconcile does not panic when Status.Controller is nil. +// 2. Only instances with a non-nil Status.Controller whose ObservedTemplateHash +// matches the deployment's current template hash are counted as current. +func TestReconcileInstanceGates_NilController_DoesNotPanic(t *testing.T) { + t.Parallel() + + deployment := wdControllerTestDeployment(2) + templateHash := instancecontrol.ComputeHash(deployment.Spec.Template) + + // Instance A: Programmed=True but Status.Controller is nil (the panic case). + // This instance must NOT be counted as current and must NOT cause a panic. + instanceNilController := wdControllerTestInstance("instance-nil-controller") + instanceNilController.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionTrue, + Reason: wdTestReasonProgrammed, + LastTransitionTime: metav1.Now(), + }, + }, + // Status.Controller intentionally nil — this is the regression scenario. + Controller: nil, + } + + // Instance B: Programmed=True with Status.Controller populated and matching + // hash. 
This instance MUST be counted as current (currentReplicas == 1). + instanceWithController := wdControllerTestInstance("instance-with-controller") + instanceWithController.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionTrue, + Reason: wdTestReasonProgrammed, + LastTransitionTime: metav1.Now(), + }, + }, + Controller: &computev1alpha.InstanceControllerStatus{ + ObservedTemplateHash: templateHash, + }, + } + + // Instance C: Ready=True (contributes to readyReplicas). + instanceReady := wdControllerTestInstance("instance-ready") + instanceReady.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceReady, + Status: metav1.ConditionTrue, + Reason: wdTestReasonReady, + LastTransitionTime: metav1.Now(), + }, + }, + Controller: &computev1alpha.InstanceControllerStatus{ + ObservedTemplateHash: templateHash, + }, + } + + instances := []computev1alpha.Instance{ + instanceNilController, + instanceWithController, + instanceReady, + } + + // Use a fake client. networkReady=false avoids the gate-patch path that + // would call CreateOrPatch, so the client is not exercised here. + cl := newProjectFakeClient() + r := &WorkloadDeploymentReconciler{} + + // The call must not panic — that is the primary regression assertion. + currentReplicas, _, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates( + context.Background(), + cl, + deployment, + instances, + false, // networkReady=false: skip gate-patch path + ) + + require.NoError(t, err) + + // Only instanceWithController has Programmed=True AND a non-nil + // Status.Controller with a matching hash — the nil-Controller instance must + // not be counted. instanceReady also has a matching hash but no Programmed + // condition, so it also does not increment currentReplicas. 
+ assert.Equal(t, 1, currentReplicas, + "only the instance with a populated, matching Status.Controller counts as current; "+ + "the nil-Controller instance must not be counted (Status.Controller nil regression guard)") + + assert.Equal(t, 1, readyReplicas, "instanceReady must be counted as ready") + assert.Equal(t, 0, quotaBlockedReplicas) +} + +// TestReconcileInstanceGates_NilSpecController_DoesNotPanic is a regression test +// for a nil-deref in reconcileInstanceGates: Spec.Controller is a nilable +// pointer, and the network gate-clearing path dereferenced +// instance.Spec.Controller.SchedulingGates without a nil guard. When +// networkReady is true and an instance has no controller spec, the unguarded +// deref panicked the reconcile. This must not panic. +func TestReconcileInstanceGates_NilSpecController_DoesNotPanic(t *testing.T) { + t.Parallel() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: wdControllerTestName, Namespace: wdControllerTestNS, UID: wdControllerTestUID}, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: wdControllerTestCityCode, + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: wdControllerTestWorkload}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + Template: computev1alpha.InstanceTemplateSpec{}, + }, + } + + // Spec.Controller intentionally nil — the network gate-clearing path runs + // (networkReady=true) and must skip this instance instead of panicking. 
+ instanceNilSpecController := computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{Name: "instance-nil-spec-controller", Namespace: wdControllerTestNS}, + } + + cl := newProjectFakeClient() + r := &WorkloadDeploymentReconciler{} + + require.NotPanics(t, func() { + _, _, _, _, err := r.reconcileInstanceGates( + context.Background(), + cl, + deployment, + []computev1alpha.Instance{instanceNilSpecController}, + true, // networkReady=true exercises the Spec.Controller deref path + ) + require.NoError(t, err) + }) +} + +// TestReconcileInstanceGates_ReplicaCounting verifies how instances are bucketed +// into the replica counters: +// +// - updatedReplicas: ObservedTemplateHash matches the desired template hash, +// regardless of readiness — a stale hash must not count. +// - currentReplicas: the Programmed=True subset of updated instances. +// - readyReplicas: Ready=True regardless of revision. +// - quotaBlockedReplicas: QuotaGranted=False. +func TestReconcileInstanceGates_ReplicaCounting(t *testing.T) { + t.Parallel() + + deployment := wdControllerTestDeployment(4) + templateHash := instancecontrol.ComputeHash(deployment.Spec.Template) + + // Updated + Programmed + Ready: counts toward updated, current, and ready. + instanceUpdatedReady := wdControllerTestInstance("instance-updated-ready") + instanceUpdatedReady.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + {Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: wdTestReasonProgrammed, LastTransitionTime: metav1.Now()}, + {Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: wdTestReasonReady, LastTransitionTime: metav1.Now()}, + }, + Controller: &computev1alpha.InstanceControllerStatus{ObservedTemplateHash: templateHash}, + } + + // Stale revision but Programmed and Ready: counts toward ready only — a + // rolling update must surface UpdatedReplicas < Replicas. 
+ instanceStale := wdControllerTestInstance("instance-stale") + instanceStale.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + {Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: wdTestReasonProgrammed, LastTransitionTime: metav1.Now()}, + {Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: wdTestReasonReady, LastTransitionTime: metav1.Now()}, + }, + Controller: &computev1alpha.InstanceControllerStatus{ObservedTemplateHash: "stale-hash"}, + } + + // Updated but not yet Programmed: counts toward updated only. + instanceUpdatedPending := wdControllerTestInstance("instance-updated-pending") + instanceUpdatedPending.Status = computev1alpha.InstanceStatus{ + Controller: &computev1alpha.InstanceControllerStatus{ObservedTemplateHash: templateHash}, + } + + // Quota-blocked: QuotaGranted=False as the instance quota controller writes it. + instanceQuotaBlocked := wdControllerTestInstance("instance-quota-blocked") + instanceQuotaBlocked.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: "quota exceeded", + LastTransitionTime: metav1.Now(), + }, + }, + } + + cl := newProjectFakeClient() + r := &WorkloadDeploymentReconciler{} + + currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates( + context.Background(), + cl, + deployment, + []computev1alpha.Instance{instanceUpdatedReady, instanceStale, instanceUpdatedPending, instanceQuotaBlocked}, + false, + ) + require.NoError(t, err) + + assert.Equal(t, 2, updatedReplicas, "matching-hash instances count as updated; the stale-hash instance must not") + assert.Equal(t, 1, currentReplicas, "only updated AND Programmed instances count as current") + assert.Equal(t, 2, readyReplicas, "Ready=True counts regardless of revision") + 
assert.Equal(t, 1, quotaBlockedReplicas, "QuotaGranted=False counts as quota-blocked") +} + +// TestReconcileInstanceGates_ClearsNetworkSchedulingGate verifies the network +// gate-clearing path: once networking is ready, the Network scheduling gate is +// removed from gated instances while unrelated gates are preserved. When +// networking is not ready, gates are left untouched. +func TestReconcileInstanceGates_ClearsNetworkSchedulingGate(t *testing.T) { + t.Parallel() + + deployment := wdControllerTestDeployment(1) + + newGatedInstance := func() *computev1alpha.Instance { + instance := wdControllerTestInstance("instance-gated") + // Gate order matches the stateful instance control strategy: Network + // prepended ahead of Quota. + instance.Spec.Controller = &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.NetworkSchedulingGate.String()}, + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + } + return &instance + } + + t.Run("network ready removes only the Network gate", func(t *testing.T) { + t.Parallel() + + instance := newGatedInstance() + cl := newProjectFakeClient(instance) + r := &WorkloadDeploymentReconciler{} + + _, _, _, _, err := r.reconcileInstanceGates( + context.Background(), + cl, + deployment, + []computev1alpha.Instance{*instance}, + true, + ) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, cl.Get(context.Background(), client.ObjectKeyFromObject(instance), &updated)) + require.NotNil(t, updated.Spec.Controller) + require.Len(t, updated.Spec.Controller.SchedulingGates, 1, + "the Network gate must be removed and the Quota gate preserved") + assert.Equal(t, instancecontrol.QuotaSchedulingGate.String(), updated.Spec.Controller.SchedulingGates[0].Name) + }) + + t.Run("network not ready leaves gates untouched", func(t *testing.T) { + t.Parallel() + + instance := newGatedInstance() + cl := newProjectFakeClient(instance) + r := 
&WorkloadDeploymentReconciler{} + + _, _, _, _, err := r.reconcileInstanceGates( + context.Background(), + cl, + deployment, + []computev1alpha.Instance{*instance}, + false, + ) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, cl.Get(context.Background(), client.ObjectKeyFromObject(instance), &updated)) + require.NotNil(t, updated.Spec.Controller) + assert.Len(t, updated.Spec.Controller.SchedulingGates, 2, + "gates must not be cleared while networking is still provisioning") + }) +} + +// newTestWDReconciler builds a WorkloadDeploymentReconciler wired to a fake +// project cluster with the controller finalizer pre-registered, mirroring +// SetupWithManager. Networking is disabled so Reconcile treats the network as +// immediately ready without touching networking CRDs. +func newTestWDReconciler(projectClient client.Client) *WorkloadDeploymentReconciler { + r := &WorkloadDeploymentReconciler{ + mgr: newFakeMCManager(testCluster, newFakeCluster(projectClient)), + NetworkingEnabled: false, + } + feds := finalizer.NewFinalizers() + if err := feds.Register(workloadControllerFinalizer, r); err != nil { + panic("failed to register test finalizer: " + err.Error()) + } + r.finalizers = feds + return r +} + +// TestWorkloadDeploymentReconcile_FinalizerAddRequeues verifies the first +// reconcile of a brand-new WorkloadDeployment: the finalizer is added and the +// reconciler requeues explicitly, since the metadata-only finalizer Update may +// be filtered by event predicates or handlers and would otherwise strand the +// deployment unreconciled. 
+func TestWorkloadDeploymentReconcile_FinalizerAddRequeues(t *testing.T) { + t.Parallel() + + deployment := wdControllerTestDeployment(1) // no finalizer yet + cl := newProjectFakeClient(deployment) + r := newTestWDReconciler(cl) + + req := mcreconcile.Request{ + ClusterName: testCluster, + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{Name: wdControllerTestName, Namespace: wdControllerTestNS}, + }, + } + + result, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{Requeue: true}, result, + "finalizer-add reconcile must requeue explicitly; the metadata-only update may not re-enqueue via watches") + + var updated computev1alpha.WorkloadDeployment + require.NoError(t, cl.Get(context.Background(), req.NamespacedName, &updated)) + assert.Contains(t, updated.Finalizers, workloadControllerFinalizer) + + // Second reconcile (post-requeue) proceeds past the finalizer branch and + // publishes status: ObservedGeneration tracks the deployment generation and + // DesiredReplicas reflects scale settings. 
+ result, err = r.Reconcile(context.Background(), req) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + require.NoError(t, cl.Get(context.Background(), req.NamespacedName, &updated)) + assert.Equal(t, updated.Generation, updated.Status.ObservedGeneration) + assert.Equal(t, int32(1), updated.Status.DesiredReplicas) + assert.True(t, apimeta.IsStatusConditionTrue(updated.Status.Conditions, computev1alpha.WorkloadDeploymentReplicasReady), + "no instances are quota-blocked, so ReplicasReady must be true") +} diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go new file mode 100644 index 00000000..332978d7 --- /dev/null +++ b/internal/controller/workloaddeployment_federator.go @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mchandler "sigs.k8s.io/multicluster-runtime/pkg/handler" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" + milosource 
"go.miloapis.com/milo/pkg/multicluster-runtime/source" +) + +const ( + // federatorFinalizer is added to project-namespace WorkloadDeployments that + // have been federated to the downstream control plane. It ensures we clean up + // the downstream object and any orphaned PropagationPolicies before the project + // object is permanently deleted. + federatorFinalizer = "compute.datumapis.com/federator" + + // cityCodeLabel is applied to WorkloadDeployments in the downstream namespace + // and is used by PropagationPolicy selectors to route them to the correct + // POP-cell clusters. Downstream Cluster objects are expected to carry this + // label with their city-code value. + cityCodeLabel = "topology.datum.net/city-code" + + kindWorkloadDeployment = "WorkloadDeployment" +) + +// WorkloadDeploymentFederator replicates WorkloadDeployments from project +// namespaces into the downstream control plane so it can propagate them to the +// appropriate POP-cell clusters. +// +// For each WorkloadDeployment the controller: +// 1. Determines the downstream namespace via the ns-prefixed naming +// convention (matching the MappedNamespaceResourceStrategy used by +// go.datum.net/network-services-operator). +// 2. Upserts a corresponding WorkloadDeployment in that downstream namespace, +// stamped with label topology.datum.net/city-code set to its Spec.CityCode. +// 3. Lazily creates a PropagationPolicy per city code per downstream namespace +// that selects WorkloadDeployments by the city-code label and targets +// clusters carrying the same label. The PP is deleted once no deployments +// with that city code remain in the namespace. +// 4. Reads the aggregated status from the downstream control plane and writes +// it back to the project-namespace object. +// 5. On deletion: removes the downstream WorkloadDeployment and cleans up +// unused PropagationPolicies.
+type WorkloadDeploymentFederator struct { + mgr mcmanager.Manager + // FederationClient is a client pointed at the Karmada federation control + // plane (the federation hub that the management controllers read and write + // through). The caller (cmd/main.go) constructs it from --federation-kubeconfig. + FederationClient client.Client + // FederationCluster is a watchable cluster handle for the same Karmada + // federation control plane that FederationClient talks to. It is used to set + // up an informer-backed watch on the downstream WorkloadDeployment objects so + // that status aggregated by Karmada onto the downstream WD is mirrored back to + // the project-namespace WD immediately, rather than waiting for the next + // informer resync. When nil (e.g. in unit tests), the downstream watch is + // skipped and the controller falls back to watching only the VCP WD. + FederationCluster cluster.Cluster + finalizers finalizer.Finalizers +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;update;patch +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=core,resources=namespaces,verbs=get;list + +func (r *WorkloadDeploymentFederator) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { + if r.FederationClient == nil { + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx) + + // An empty cluster name resolves to the local host management cluster, which + // has no compute CRDs — any Get would fail with "no matches for kind" and + // requeue in a hot loop. The For watch (EngageWithLocalCluster=false) and the + // preservation-wrapped downstream watch both set a real project cluster name, + // so an empty name here is never legitimate. Drop it without erroring. 
+ if req.ClusterName == "" { + logger.V(1).Info("dropping reconcile with empty cluster name") + return ctrl.Result{}, nil + } + + cl, err := r.mgr.GetCluster(ctx, req.ClusterName) + if err != nil { + return ctrl.Result{}, err + } + ctx = mccontext.WithCluster(ctx, req.ClusterName) + + var deployment computev1alpha.WorkloadDeployment + if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + finalizationResult, err := r.finalizers.Finalize(ctx, &deployment) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &deployment); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + + if !deployment.DeletionTimestamp.IsZero() { + return ctrl.Result{}, nil + } + + logger.Info("federating deployment to downstream control plane") + + // Determine the downstream namespace for this project namespace using the + // ns-prefixed naming convention (MappedNamespaceResourceStrategy). + strategy := downstreamclient.NewMappedNamespaceResourceStrategy(string(req.ClusterName), cl.GetClient(), r.FederationClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to determine downstream namespace: %w", err) + } + + if err := r.ensureDownstreamNamespace(ctx, downstreamNS, deployment.Namespace, string(req.ClusterName)); err != nil { + return ctrl.Result{}, err + } + + // Upsert the WorkloadDeployment in the downstream control plane via the + // strategy client so any future Create calls also go through + // ensureDownstreamNamespace automatically.
+ if err := r.upsertDownstreamDeployment(ctx, strategy.GetClient(), &deployment, downstreamNS); err != nil { + return ctrl.Result{}, err + } + + if err := r.ensurePropagationPolicy(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return ctrl.Result{}, err + } + + if err := r.syncStatusFromDownstream(ctx, cl.GetClient(), &deployment, downstreamNS); err != nil { + return ctrl.Result{}, err + } + + logger.Info("federation complete") + return ctrl.Result{}, nil +} + +// Finalize removes the downstream WorkloadDeployment and, if no other +// deployments with the same city code remain in the downstream namespace, deletes +// the PropagationPolicy as well. +func (r *WorkloadDeploymentFederator) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.FederationClient == nil { + return finalizer.Result{}, nil + } + + deployment := obj.(*computev1alpha.WorkloadDeployment) + logger := log.FromContext(ctx).WithValues( + "deployment", deployment.Name, + "namespace", deployment.Namespace, + ) + + clusterName, ok := mccontext.ClusterFrom(ctx) + if !ok { + return finalizer.Result{}, fmt.Errorf("cluster name not found in context") + } + + cl, err := r.mgr.GetCluster(ctx, clusterName) + if err != nil { + return finalizer.Result{}, err + } + + strategy := downstreamclient.NewMappedNamespaceResourceStrategy(string(clusterName), cl.GetClient(), r.FederationClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace) + if err != nil { + return finalizer.Result{}, fmt.Errorf("failed to determine downstream namespace during finalization: %w", err) + } + + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + if err := r.FederationClient.Delete(ctx, kd); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed to delete downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) 
+ } + logger.Info("deleted downstream WorkloadDeployment", "downstreamNamespace", downstreamNS) + + if err := r.cleanupPropagationPolicyIfUnused(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return finalizer.Result{}, err + } + + return finalizer.Result{}, nil +} + +// ensureDownstreamNamespace creates or updates the downstream namespace, stamping +// it with the upstream tracking labels that MappedNamespaceResourceStrategy uses. +// This allows the InstanceProjector to resolve the project namespace name via a +// direct label lookup rather than scanning all namespaces by UID. +func (r *WorkloadDeploymentFederator) ensureDownstreamNamespace(ctx context.Context, name, upstreamNamespace, clusterName string) error { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}} + _, err := controllerutil.CreateOrUpdate(ctx, r.FederationClient, ns, func() error { + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] = EncodeClusterName(clusterName) + ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = upstreamNamespace + return nil + }) + if err != nil { + return fmt.Errorf("failed to ensure downstream namespace %q: %w", name, err) + } + return nil +} + +// upsertDownstreamDeployment creates or updates the WorkloadDeployment in the +// downstream namespace via the provided client (expected to be strategy.GetClient() +// so the downstream namespace is created with upstream tracking labels). 
+func (r *WorkloadDeploymentFederator) upsertDownstreamDeployment( + ctx context.Context, + downstreamClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, downstreamClient, kd, func() error { + if kd.Labels == nil { + kd.Labels = make(map[string]string) + } + kd.Labels[cityCodeLabel] = deployment.Spec.CityCode + kd.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = deployment.Namespace + kd.Spec = deployment.Spec + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) + } + + log.FromContext(ctx).Info("upserted downstream deployment", "result", result, "downstreamNamespace", downstreamNS) + return nil +} + +// ensurePropagationPolicy creates or updates a PropagationPolicy in the downstream +// namespace that selects all WorkloadDeployments with the given city-code label +// and targets clusters carrying the same label. +func (r *WorkloadDeploymentFederator) ensurePropagationPolicy( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, r.FederationClient, pp, func() error { + pp.Spec = karmadapolicyv1alpha1.PropagationSpec{ + // Select all WorkloadDeployments in this namespace that carry the + // city-code label. Using a label selector (rather than individual + // resource names) means that new deployments for this city are + // automatically picked up without updating the policy. 
+ ResourceSelectors: []karmadapolicyv1alpha1.ResourceSelector{ + { + APIVersion: computev1alpha.GroupVersion.String(), + Kind: kindWorkloadDeployment, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + Placement: karmadapolicyv1alpha1.Placement{ + // Route to clusters that carry the same city-code label. POP-cell + // clusters registered with the downstream control plane must be + // labeled accordingly. + ClusterAffinity: &karmadapolicyv1alpha1.ClusterAffinity{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + } + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("upserted PropagationPolicy", "result", result, "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// syncStatusFromDownstream reads the aggregated status of the WorkloadDeployment +// from the downstream namespace and writes it back to the project-namespace +// object. It is a no-op when the downstream object does not yet exist. 
+func (r *WorkloadDeploymentFederator) syncStatusFromDownstream( + ctx context.Context, + projectClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + var kd computev1alpha.WorkloadDeployment + if err := r.FederationClient.Get(ctx, types.NamespacedName{ + Name: deployment.Name, + Namespace: downstreamNS, + }, &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to get downstream deployment for status sync: %w", err) + } + + if equality.Semantic.DeepEqual(deployment.Status, kd.Status) { + return nil + } + + deployment.Status = kd.Status + if err := projectClient.Status().Update(ctx, deployment); err != nil { + return fmt.Errorf("failed to write downstream status back to project deployment: %w", err) + } + return nil +} + +// cleanupPropagationPolicyIfUnused deletes the PropagationPolicy for the given +// city code if no WorkloadDeployments with that city code remain in the +// downstream namespace. +func (r *WorkloadDeploymentFederator) cleanupPropagationPolicyIfUnused( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + // The webhook requires cityCode, so an empty value here is corruption. An + // empty-valued label selector would match the wrong deployment set and + // mis-decide whether the PropagationPolicy is still in use. + if cityCode == "" { + return fmt.Errorf("cannot evaluate PropagationPolicy usage in namespace %q: city code is empty", downstreamNS) + } + + var remaining computev1alpha.WorkloadDeploymentList + if err := r.FederationClient.List(ctx, &remaining, + client.InNamespace(downstreamNS), + client.MatchingLabels{cityCodeLabel: cityCode}, + ); err != nil { + return fmt.Errorf("failed to list remaining downstream deployments for city %q: %w", cityCode, err) + } + + if len(remaining.Items) > 0 { + // Other deployments still need this PropagationPolicy. 
+ return nil + } + + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + if err := r.FederationClient.Delete(ctx, pp); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed to delete PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("deleted PropagationPolicy (no more deployments for city)", "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// SetupWithManager registers the controller with the multicluster manager. +// It must only be called when FederationClient is non-nil. +// +// The controller watches two control planes: +// +// - The VCP/project WorkloadDeployment (via For), so spec changes in the +// project namespace trigger federation to the downstream control plane. +// - The downstream Karmada WorkloadDeployment (via WatchesRawSource against +// FederationCluster), so when Karmada aggregates new status onto the +// downstream WD the corresponding project WD is reconciled immediately and +// the status is mirrored back. Without this second watch the federator only +// caught up on the next informer resync (~10h), causing status lag. +func (r *WorkloadDeploymentFederator) SetupWithManager(mgr mcmanager.Manager) error { + r.mgr = mgr + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(federatorFinalizer, r); err != nil { + return fmt.Errorf("failed to register federator finalizer: %w", err) + } + + b := mcbuilder.ControllerManagedBy(mgr). + For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). + Named("workload-deployment-federator") + + // Watch the downstream Karmada WorkloadDeployment whose status we mirror. + // FederationCluster is a watchable handle for the federation control plane; + // it is nil in unit tests, where only the For watch is exercised. 
+ // + // The handler MUST preserve the ClusterName that mapDownstreamDeploymentToRequest + // sets. milosource binds the raw source to the empty cluster name, and the + // default TypedEnqueueRequestsFromMapFunc wraps the map in TypedInjectCluster, + // which overwrites each request's ClusterName with that bound empty name — so + // every request would resolve to the local host cluster (no compute CRDs) and + // fail with "no matches for kind WorkloadDeployment". The preservation variant + // skips that injection so our project-cluster ClusterName survives to Reconcile. + if r.FederationCluster != nil { + preserveClusterName := func(_ multicluster.ClusterName, _ cluster.Cluster) handler.TypedEventHandler[*computev1alpha.WorkloadDeployment, mcreconcile.Request] { + return mchandler.TypedEnqueueRequestsFromMapFuncWithClusterPreservation(r.mapDownstreamDeploymentToRequest) + } + b = b.WatchesRawSource(milosource.MustNewClusterSource( + r.FederationCluster, + &computev1alpha.WorkloadDeployment{}, + preserveClusterName, + )) + } + + return b.Complete(r) +} + +// mapDownstreamDeploymentToRequest maps an event on a downstream Karmada +// WorkloadDeployment to a reconcile request for the corresponding +// project-namespace WorkloadDeployment. +// +// Correlation mirrors the identity the federator establishes when it mirrors the +// object downstream (see upsertDownstreamDeployment / ensureDownstreamNamespace): +// +// - The WD name is stable across all planes, so the request name equals the +// downstream WD name. +// - upsertDownstreamDeployment stamps the downstream WD with +// UpstreamOwnerNamespaceLabel = the project namespace, which becomes the +// request namespace. +// - The project cluster name is not on the WD itself; ensureDownstreamNamespace +// stamps it as UpstreamOwnerClusterNameLabel on the downstream namespace +// (encoded "cluster-" with "/" -> "_"). We read the namespace from the +// federation plane to recover and decode it. 
+// +// Both correlation labels are stamped unconditionally by this controller +// (upsertDownstreamDeployment / ensureDownstreamNamespace), so a downstream WD +// or namespace lacking one is corruption, not a foreign object. Map functions +// cannot return errors and there is no polling backstop — a dropped event +// means permanently stale status on the project WD — so those drops are logged +// at error level to make the corruption visible. +func (r *WorkloadDeploymentFederator) mapDownstreamDeploymentToRequest( + ctx context.Context, + downstream *computev1alpha.WorkloadDeployment, +) []mcreconcile.Request { + logger := log.FromContext(ctx) + + projectNamespace := downstream.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if projectNamespace == "" { + logger.Error(nil, "downstream WorkloadDeployment is missing the upstream-namespace label; dropping status event", + "downstreamNamespace", downstream.Namespace, "name", downstream.Name, + "label", downstreamclient.UpstreamOwnerNamespaceLabel) + return nil + } + + var ns corev1.Namespace + if err := r.FederationCluster.GetClient().Get(ctx, types.NamespacedName{Name: downstream.Namespace}, &ns); err != nil { + logger.V(1).Info("unable to resolve downstream namespace for status mapping; dropping event", + "downstreamNamespace", downstream.Namespace, "error", err) + return nil + } + encodedClusterName := ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if encodedClusterName == "" { + logger.Error(nil, "downstream namespace is missing the upstream-cluster-name label; dropping status event", + "downstreamNamespace", downstream.Namespace, "name", downstream.Name, + "label", downstreamclient.UpstreamOwnerClusterNameLabel) + return nil + } + clusterName := projectClusterNameFromLabel(encodedClusterName) + if clusterName == "" { + logger.Error(nil, "undecodable upstream-cluster-name label on downstream namespace; dropping status event", + "downstreamNamespace", downstream.Namespace, "name", 
downstream.Name, + "label", downstreamclient.UpstreamOwnerClusterNameLabel, "encoded", encodedClusterName) + return nil + } + + // Verify the project cluster is engaged before enqueuing. The Milo + // multicluster provider keys clusters by bare project name, and GetCluster + // returns an error for an unknown name. Without this guard, an unresolvable + // name — or the empty string, which mcmanager routes to the local host + // cluster that has no compute CRDs — would make Reconcile fail with + // "no matches for kind WorkloadDeployment" in a hot loop. Dropping the event + // is safe: once the provider engages the project cluster, the For watch + // reconciles it and the next downstream status event maps cleanly. + if _, err := r.mgr.GetCluster(ctx, multicluster.ClusterName(clusterName)); err != nil { + logger.V(1).Info("project cluster not engaged for downstream status mapping; dropping event", + "clusterName", clusterName, "downstreamNamespace", downstream.Namespace, "error", err) + return nil + } + + return []mcreconcile.Request{ + { + ClusterName: multicluster.ClusterName(clusterName), + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{ + Namespace: projectNamespace, + Name: downstream.Name, + }, + }, + }, + } +} + +// projectClusterNameFromLabel extracts the project cluster name that the Milo +// multicluster provider uses as its cluster key from a downstream namespace's +// UpstreamOwnerClusterNameLabel value. +// +// MappedNamespaceResourceStrategy encodes the label as "cluster-_" +// (with "/" replaced by "_"), e.g. "cluster-datum-cloud" (no org) or +// "cluster-_test-project-abc" (empty org). The provider, however, keys clusters +// by bare project name only (multicluster provider: key = project.Name), so we +// strip the "cluster-" prefix, decode "_" back to "/", and return the final path +// segment — the project name. 
// propagationPolicyNameFor derives the PropagationPolicy name for a city code:
// the code is lower-cased, every space becomes a hyphen, and the result is
// prefixed with "city-". The mapping is deterministic, so reconciles of
// different deployments that share a city code all address the same policy.
func propagationPolicyNameFor(cityCode string) string {
	const nameFormat = "city-%s"

	lowered := strings.ToLower(cityCode)
	return fmt.Sprintf(nameFormat, strings.ReplaceAll(lowered, " ", "-"))
}
testKarmadaNSStr = "ns-aabbccdd-0000-1111-2222-333344445555" + testWDName = "my-workload-deployment" + testCityCodeLAX = "LAX" +) + +// ─── Test helpers ───────────────────────────────────────────────────────────── + +// testProjectNamespace returns a corev1.Namespace for the project cluster with a +// stable UID that matches testKarmadaNSStr. +func testProjectNamespace() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testProjNS, + UID: testProjNSUID, + }, + } +} + +// testWorkloadDeployment returns a WorkloadDeployment with the given options. +func testWorkloadDeployment(opts ...func(*computev1alpha.WorkloadDeployment)) *computev1alpha.WorkloadDeployment { + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testProjNS, + UID: "wd-uid-1111", + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + WorkloadRef: computev1alpha.WorkloadReference{ + Name: "test-workload", + }, + PlacementName: testDefaultPlacement, + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: 1, + }, + }, + } + for _, opt := range opts { + opt(wd) + } + return wd +} + +// withFinalizer adds the federator finalizer to the WorkloadDeployment. +func withFinalizer(wd *computev1alpha.WorkloadDeployment) { + wd.Finalizers = append(wd.Finalizers, federatorFinalizer) +} + +// withDeletionTimestamp sets a non-zero DeletionTimestamp on the WorkloadDeployment. +func withDeletionTimestamp(wd *computev1alpha.WorkloadDeployment) { + t := metav1.NewTime(time.Now().Add(-5 * time.Second)) + wd.DeletionTimestamp = &t +} + +// newTestFederator constructs a WorkloadDeploymentFederator wired to the given +// project client (via a fakeMCManager) and downstream client. The federator +// finalizer is pre-registered so reconcile can handle deletions. 
+func newTestFederator(projectClient client.Client, karmadaClient client.Client) *WorkloadDeploymentFederator { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(testCluster, projectCluster) + + r := &WorkloadDeploymentFederator{ + mgr: mgr, + FederationClient: karmadaClient, + } + + feds := finalizer.NewFinalizers() + if err := feds.Register(federatorFinalizer, r); err != nil { + panic("failed to register test finalizer: " + err.Error()) + } + r.finalizers = feds + return r +} + +// reconcileRequest builds an mcreconcile.Request for the test WorkloadDeployment. +func reconcileRequest() mcreconcile.Request { + return mcreconcile.Request{ + ClusterName: testCluster, + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: testWDName, + Namespace: testProjNS, + }, + }, + } +} + +// ─── Unit tests ─────────────────────────────────────────────────────────────── + +// TestMapDownstreamDeploymentToRequest verifies the downstream-WD → project-WD +// mapping used by the cross-plane status watch: the request name equals the +// downstream WD name, the namespace comes from the WD's upstream-namespace label, +// and the cluster name is decoded from the downstream namespace's +// upstream-cluster-name label. Events lacking correlation metadata are dropped. +func TestMapDownstreamDeploymentToRequest(t *testing.T) { + t.Parallel() + + // The encoded cluster name on the downstream namespace decodes to testCluster. + encodedCluster := EncodeClusterName(testCluster) + + downstreamNS := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testKarmadaNSStr, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster, + }, + }, + } + + // A downstream namespace whose cluster label decodes to a project cluster the + // manager has not engaged — used to verify the not-engaged drop path. 
+ unknownClusterNS := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testKarmadaNSStr, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: "cluster-unregistered-project", + }, + }, + } + + newDownstreamWD := func(labels map[string]string) *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testKarmadaNSStr, + Labels: labels, + }, + } + } + + tests := []struct { + name string + karmadaObjs []client.Object + downstreamWD *computev1alpha.WorkloadDeployment + want []mcreconcile.Request + }{ + { + name: "maps to project WD request", + karmadaObjs: []client.Object{downstreamNS}, + downstreamWD: newDownstreamWD(map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: testProjNS, + }), + want: []mcreconcile.Request{ + { + ClusterName: testCluster, + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{ + Namespace: testProjNS, + Name: testWDName, + }, + }, + }, + }, + }, + { + name: "missing upstream-namespace label is dropped", + karmadaObjs: []client.Object{downstreamNS}, + downstreamWD: newDownstreamWD(nil), + want: nil, + }, + { + name: "missing downstream namespace is dropped", + karmadaObjs: nil, // namespace not present in federation cluster + downstreamWD: newDownstreamWD(map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: testProjNS, + }), + want: nil, + }, + { + name: "namespace without cluster label is dropped", + karmadaObjs: []client.Object{&corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}, + }}, + downstreamWD: newDownstreamWD(map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: testProjNS, + }), + want: nil, + }, + { + name: "project cluster not engaged is dropped", + karmadaObjs: []client.Object{unknownClusterNS}, + downstreamWD: newDownstreamWD(map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: testProjNS, + }), + want: nil, + }, + } + 
+ for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + karmadaClient := newKarmadaFakeClient(tt.karmadaObjs...) + r := &WorkloadDeploymentFederator{ + // Only testCluster is engaged; the not-engaged case decodes to a + // different project name and must be dropped by the GetCluster guard. + mgr: newFakeMCManager(testCluster, newFakeCluster(karmadaClient)), + FederationClient: karmadaClient, + FederationCluster: newFakeCluster(karmadaClient), + } + + got := r.mapDownstreamDeploymentToRequest(context.Background(), tt.downstreamWD) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestProjectClusterNameFromLabel(t *testing.T) { + t.Parallel() + + tests := []struct { + encoded string + want string + }{ + {"cluster-datum-cloud", "datum-cloud"}, + // Org-scoped encodings decode to org/project; the provider keys on the + // bare project name, so only the final path segment is returned. + {"cluster-org_project", "project"}, + {"cluster-_test-project-abc", "test-project-abc"}, + {"cluster-test-project-cluster", "test-project-cluster"}, + } + for _, tt := range tests { + t.Run(tt.encoded, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, projectClusterNameFromLabel(tt.encoded)) + }) + } +} + +func TestPropagationPolicyNameFor(t *testing.T) { + t.Parallel() + + tests := []struct { + cityCode string + want string + }{ + {"LAX", "city-lax"}, + {"lax", "city-lax"}, + {"New York", "city-new-york"}, + {"LOS ANGELES", "city-los-angeles"}, + {"SEA", "city-sea"}, + } + + for _, tt := range tests { + t.Run(tt.cityCode, func(t *testing.T) { + t.Parallel() + got := propagationPolicyNameFor(tt.cityCode) + assert.Equal(t, tt.want, got) + }) + } +} + +// TestWorkloadDeploymentFederator_NoFederationClient verifies that the reconciler +// is a no-op when FederationClient is nil. 
+func TestWorkloadDeploymentFederator_NoFederationClient(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment()) + r := newTestFederator(projectClient, nil) + r.FederationClient = nil // explicitly nil + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_EmptyClusterNameDropped verifies that a +// reconcile request carrying an empty cluster name is dropped without error +// (and without touching GetCluster), so it can never fall back to the local +// host cluster and spin in a "no matches for kind" requeue loop. +func TestWorkloadDeploymentFederator_EmptyClusterNameDropped(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment()) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + req := mcreconcile.Request{ + ClusterName: "", + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{Name: testWDName, Namespace: testProjNS}, + }, + } + result, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen verifies that the +// first reconcile of a brand-new WorkloadDeployment adds the finalizer and +// returns without federating (the finalizer update triggers a re-queue). 
+func TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen(t *testing.T) { + t.Parallel() + + wd := testWorkloadDeployment() // no finalizer yet + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // The project WD should now have the finalizer persisted. + var updated computev1alpha.WorkloadDeployment + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updated)) + assert.Contains(t, updated.Finalizers, federatorFinalizer) + + // Karmada should be untouched – federation happens on the next reconcile. + var wdList computev1alpha.WorkloadDeploymentList + require.NoError(t, karmadaClient.List(context.Background(), &wdList)) + assert.Empty(t, wdList.Items, "no Karmada WD should be created on first-seen reconcile") +} + +// TestWorkloadDeploymentFederator_FederatesToKarmada verifies that a +// WorkloadDeployment with the finalizer already set is fully federated: +// the Karmada namespace, WorkloadDeployment (with city-code label), and +// PropagationPolicy are all created. +func TestWorkloadDeploymentFederator_FederatesToKarmada(t *testing.T) { + t.Parallel() + + wd := testWorkloadDeployment(withFinalizer) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + ctx := context.Background() + + // Karmada namespace must exist. 
+ var karmadaNS corev1.Namespace + err = karmadaClient.Get(ctx, types.NamespacedName{Name: testKarmadaNSStr}, &karmadaNS) + require.NoError(t, err, "Karmada namespace %q should exist", testKarmadaNSStr) + + // Karmada WorkloadDeployment must exist with the city-code label. + var karmadaWD computev1alpha.WorkloadDeployment + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, &karmadaWD) + require.NoError(t, err, "Karmada WorkloadDeployment should exist") + assert.Equal(t, testCityCodeLAX, karmadaWD.Labels[cityCodeLabel], + "city-code label should be set on Karmada WD") + assert.Equal(t, testCityCodeLAX, karmadaWD.Spec.CityCode, + "spec.cityCode should be copied from project WD") + + // PropagationPolicy for the city code must exist. + ppName := propagationPolicyNameFor(testCityCodeLAX) + var pp karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, &pp) + require.NoError(t, err, "PropagationPolicy %q should exist", ppName) + + // The PP must select WorkloadDeployments by the city-code label. + require.Len(t, pp.Spec.ResourceSelectors, 1) + sel := pp.Spec.ResourceSelectors[0] + assert.Equal(t, computev1alpha.GroupVersion.String(), sel.APIVersion) + assert.Equal(t, "WorkloadDeployment", sel.Kind) + require.NotNil(t, sel.LabelSelector) + assert.Equal(t, testCityCodeLAX, sel.LabelSelector.MatchLabels[cityCodeLabel]) + + // The PP cluster affinity must target clusters carrying the same city-code. + require.NotNil(t, pp.Spec.Placement.ClusterAffinity) + require.NotNil(t, pp.Spec.Placement.ClusterAffinity.LabelSelector) + assert.Equal(t, testCityCodeLAX, + pp.Spec.Placement.ClusterAffinity.LabelSelector.MatchLabels[cityCodeLabel]) +} + +// TestWorkloadDeploymentFederator_Finalization covers the deletion scenarios: +// cleanup of Karmada resources and conditional PropagationPolicy removal. 
+func TestWorkloadDeploymentFederator_Finalization(t *testing.T) { + t.Parallel() + + ppName := propagationPolicyNameFor(testCityCodeLAX) + + tests := []struct { + name string + // karmadaExtra holds additional Karmada objects beyond the "own" WD and PP. + karmadaExtra []client.Object + wantPPGone bool + }{ + { + name: "last WD for city — PropagationPolicy removed", + karmadaExtra: nil, + wantPPGone: true, + }, + { + name: "other WD for same city remains — PropagationPolicy kept", + karmadaExtra: []client.Object{ + // A sibling WD in the same Karmada namespace with the same city-code. + &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-deployment", + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: "other", + WorkloadRef: computev1alpha.WorkloadReference{Name: "other"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + }, + }, + wantPPGone: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // Project cluster: namespace + WD with finalizer and deletion timestamp. + wd := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + + // Karmada cluster: the mirrored WD + its PropagationPolicy + any extras. 
+ karmadaWD := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: "test-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } + karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, + } + karmadaObjs := []client.Object{ + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, + karmadaWD, + karmadaPP, + } + karmadaObjs = append(karmadaObjs, tt.karmadaExtra...) + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + ctx := context.Background() + + // The Karmada-side WD must be gone. + var remainingWD computev1alpha.WorkloadDeployment + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, &remainingWD) + assert.True(t, apierrors.IsNotFound(err), + "Karmada WD %q should be deleted after finalization", testWDName) + + // PropagationPolicy presence depends on whether siblings remain. 
+ var remainingPP karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, &remainingPP) + if tt.wantPPGone { + assert.True(t, apierrors.IsNotFound(err), + "PropagationPolicy should be deleted when no city siblings remain") + } else { + assert.NoError(t, err, + "PropagationPolicy should be kept when other city siblings remain") + } + + // The project WD should be gone: once the federator finalizer is removed + // from an object that already has a DeletionTimestamp, the API server + // (and the fake client) garbage-collects the object. + var updatedWD computev1alpha.WorkloadDeployment + err = projectClient.Get(ctx, + types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updatedWD) + assert.True(t, apierrors.IsNotFound(err), + "project WD should be gone after finalizer removal (DeletionTimestamp + empty Finalizers = GC)") + }) + } +} + +// TestCleanupPropagationPolicyIfUnused_EmptyCityCode verifies the guard +// against listing with an empty city-code label value, which would match the +// wrong deployment set and mis-decide PropagationPolicy cleanup. +func TestCleanupPropagationPolicyIfUnused_EmptyCityCode(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + err := r.cleanupPropagationPolicyIfUnused(context.Background(), testKarmadaNSStr, "") + require.Error(t, err) + assert.Contains(t, err.Error(), "city code is empty") +} + +// TestWorkloadDeploymentFederator_NotFound verifies that a missing +// WorkloadDeployment is handled gracefully (no error, no action). 
+func TestWorkloadDeploymentFederator_NotFound(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) // WD missing + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_Finalize_DirectCall exercises the Finalize +// method directly, ensuring the cluster name is required in context. +func TestWorkloadDeploymentFederator_Finalize_DirectCall(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + wd := testWorkloadDeployment(withFinalizer) + + // Without cluster in context → must return an error. + _, err := r.Finalize(context.Background(), wd) + require.Error(t, err, "Finalize without cluster context should fail") + assert.Contains(t, err.Error(), "cluster name not found") + + // With cluster in context → must succeed (karmada client returns not-found, which is OK). 
+ ctx := mccontext.WithCluster(context.Background(), testCluster) + result, err := r.Finalize(ctx, wd) + require.NoError(t, err) + assert.False(t, result.Updated) +} diff --git a/internal/controller/workloaddeployment_location_test.go b/internal/controller/workloaddeployment_location_test.go new file mode 100644 index 00000000..60aba65a --- /dev/null +++ b/internal/controller/workloaddeployment_location_test.go @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" + networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" + + "go.datum.net/compute/internal/controller/instancecontrol" +) + +const ( + // locTestCityCode / locTestOtherCityCode: deployments under test target + // locTestCityCode; locTestOtherCityCode identifies a decoy Location that + // must never match. + locTestCityCode = "DFW" + locTestOtherCityCode = "ORD" + + // locTestNamespace mirrors where Location objects live in real clusters. + locTestNamespace = "networking-system" + + // locTestWDNamespace is the namespace of the deployments under test. + locTestWDNamespace = "default" + + // locTestTopologyKey is the production topology key that carries a + // Location's city code. + locTestTopologyKey = "topology.datum.net/city-code" +) + +// newNetworkingScheme returns a scheme with compute + networkingv1alpha types. 
+func newNetworkingScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = computev1alpha.AddToScheme(s) + _ = networkingv1alpha.AddToScheme(s) + return s +} + +// newTestLocation builds a Location fixture shaped like production: the city +// code is carried in Spec.Topology under the topology.datum.net/city-code key. +func newTestLocation(name, cityCode string) *networkingv1alpha.Location { + return &networkingv1alpha.Location{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: locTestNamespace}, + Spec: networkingv1alpha.LocationSpec{ + Topology: map[string]string{locTestTopologyKey: cityCode}, + }, + } +} + +// TestReconcileNetworks_PersistsLocation_WhenLocationFound verifies that when a +// Location object matching the deployment's city code exists in the cluster, the +// resolved LocationReference is returned by reconcileNetworks and can be persisted +// to deployment.Status.Location. Instance creation must not be blocked — the +// function returns networkReady=false only because no NetworkInterfaces exist on +// the deployment in this scenario (short-circuit before bindings), not because +// Location was absent. +func TestReconcileNetworks_PersistsLocation_WhenLocationFound(t *testing.T) { + t.Parallel() + + const locationName = "loc-dfw-1" + + location := newTestLocation(locationName, locTestCityCode) + + s := newNetworkingScheme() + cl := fake.NewClientBuilder().WithScheme(s).WithObjects(location).Build() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "test-wd", Namespace: locTestWDNamespace}, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: locTestCityCode, + // No NetworkInterfaces — the function returns false,locationRef,nil + // after the location is found but before bindings are checked. 
+ }, + } + + r := &WorkloadDeploymentReconciler{} + _, resolvedLocation, err := r.reconcileNetworks(context.Background(), cl, deployment) + + require.NoError(t, err) + require.NotNil(t, resolvedLocation, + "resolved location must be non-nil when a matching Location object exists") + assert.Equal(t, locationName, resolvedLocation.Name) + assert.Equal(t, locTestNamespace, resolvedLocation.Namespace) + + // Simulate what the Reconcile loop does: persist resolvedLocation to Status. + deployment.Status.Location = resolvedLocation + assert.Equal(t, locationName, deployment.Status.Location.Name, + "Status.Location.Name must match the resolved Location object name") +} + +// TestReconcileNetworks_ReturnsNilLocation_WhenNoLocationFound verifies that +// when no Location object in the cluster matches the deployment's city code, +// reconcileNetworks returns (false, nil, nil) — no error and no resolved +// location. The caller must treat nil location as best-effort and must NOT block +// instance creation. +func TestReconcileNetworks_ReturnsNilLocation_WhenNoLocationFound(t *testing.T) { + t.Parallel() + + s := newNetworkingScheme() + // Cluster has a Location for a DIFFERENT city code. 
+ otherLocation := newTestLocation("loc-ord-1", locTestOtherCityCode) + cl := fake.NewClientBuilder().WithScheme(s).WithObjects(otherLocation).Build() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "test-wd", Namespace: locTestWDNamespace}, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: locTestCityCode, // no matching Location + }, + } + + r := &WorkloadDeploymentReconciler{} + networkReady, resolvedLocation, err := r.reconcileNetworks(context.Background(), cl, deployment) + + require.NoError(t, err, "missing location must not cause an error") + assert.False(t, networkReady, "network is not ready when no location is found") + assert.Nil(t, resolvedLocation, + "resolved location must be nil when no matching Location object exists") + + // Status.Location remains nil — callers must not update it in this case. + // Confirm the deployment's Status.Location is unaffected (nil → nil). + assert.Nil(t, deployment.Status.Location, + "Status.Location must remain nil when no Location matches the city code") +} + +// newLocationTestWDReconciler builds a WorkloadDeploymentReconciler with +// networking enabled, wired to a fake cluster, with the controller finalizer +// pre-registered the same way SetupWithManager does. Networking must be enabled +// so Reconcile exercises Location resolution. 
+func newLocationTestWDReconciler(cl client.Client) *WorkloadDeploymentReconciler { + r := &WorkloadDeploymentReconciler{ + mgr: newFakeMCManager(testCluster, newFakeCluster(cl)), + NetworkingEnabled: true, + } + feds := finalizer.NewFinalizers() + if err := feds.Register(workloadControllerFinalizer, r); err != nil { + panic("failed to register test finalizer: " + err.Error()) + } + r.finalizers = feds + return r +} + +// TestWorkloadDeploymentReconcile_NoMatchingLocation_SetsCondition verifies the +// user-visible surface while a deployment waits for its city's Location: the +// Available condition must name the unresolved city (reason NoMatchingLocation), +// and once a matching Location appears the next reconcile must replace that +// reason — the unresolved-city signal must not outlive its cause. +func TestWorkloadDeploymentReconcile_NoMatchingLocation_SetsCondition(t *testing.T) { + t.Parallel() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "location-test-wd", + Namespace: locTestWDNamespace, + UID: "location-test-wd-uid", + // Pre-set the finalizer so Reconcile proceeds past the finalizer-add + // branch. + Finalizers: []string{workloadControllerFinalizer}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: locTestCityCode, + WorkloadRef: computev1alpha.WorkloadReference{Name: "location-test-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: 1, + // Production deployments always carry the kubebuilder-defaulted + // policy; without it the instance-control strategy emits no actions. + InstanceManagementPolicy: computev1alpha.OrderedReadyInstanceManagementPolicyType, + }, + }, + } + + // An instance shaped the way the instance-control strategy creates it: + // ordinal name, controller labels, and the scheduling gates stamped at + // creation. 
Pre-seeding it (with a CreationTimestamp, which the fake client + // does not stamp on Create) keeps the strategy in its wait path so the test + // exercises only the condition transitions. + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name + "-0", + Namespace: deployment.Namespace, + CreationTimestamp: metav1.Now(), + Labels: map[string]string{ + computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.UID), + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.NetworkSchedulingGate.String()}, + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + }, + } + + // The only Location in the cluster serves a different city. + otherLocation := newTestLocation("loc-ord-1", locTestOtherCityCode) + + cl := fake.NewClientBuilder(). + WithScheme(newNetworkingScheme()). + WithObjects(deployment, instance, otherLocation). + WithStatusSubresource(deployment). 
+ Build() + r := newLocationTestWDReconciler(cl) + + req := mcreconcile.Request{ + ClusterName: testCluster, + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{Name: deployment.Name, Namespace: deployment.Namespace}, + }, + } + + _, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + + var updated computev1alpha.WorkloadDeployment + require.NoError(t, cl.Get(context.Background(), req.NamespacedName, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.WorkloadDeploymentAvailable) + require.NotNil(t, cond, "Available must be set while the city has no Location") + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, "NoMatchingLocation", cond.Reason) + assert.Contains(t, cond.Message, locTestCityCode, + "the condition message must name the unresolved city code") + assert.Nil(t, updated.Status.Location) + + // Provision the city's Location; the next reconcile resolves it and must + // replace the NoMatchingLocation reason. 
+ matchingLocation := newTestLocation("loc-dfw-2", locTestCityCode) + require.NoError(t, cl.Create(context.Background(), matchingLocation)) + + _, err = r.Reconcile(context.Background(), req) + require.NoError(t, err) + + require.NoError(t, cl.Get(context.Background(), req.NamespacedName, &updated)) + cond = apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.WorkloadDeploymentAvailable) + require.NotNil(t, cond) + assert.Equal(t, "ProvisioningInstances", cond.Reason, + "the unresolved-city reason must give way once the Location resolves") + require.NotNil(t, updated.Status.Location) + assert.Equal(t, matchingLocation.Name, updated.Status.Location.Name) +} + +// TestEnqueueWorkloadDeploymentsForLocation verifies the Location watch mapping: +// a Location event must enqueue exactly the WorkloadDeployments whose CityCode +// matches the Location's topology (via deploymentCityCodeIndex), and a Location +// without a city code in its topology must map to nothing. +func TestEnqueueWorkloadDeploymentsForLocation(t *testing.T) { + t.Parallel() + + wdDFW := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "wd-dfw", Namespace: locTestWDNamespace}, + Spec: computev1alpha.WorkloadDeploymentSpec{CityCode: locTestCityCode}, + } + wdORD := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "wd-ord", Namespace: locTestWDNamespace}, + Spec: computev1alpha.WorkloadDeploymentSpec{CityCode: locTestOtherCityCode}, + } + + cl := fake.NewClientBuilder(). + WithScheme(newNetworkingScheme()). + WithIndex(&computev1alpha.WorkloadDeployment{}, deploymentCityCodeIndex, deploymentCityCodeIndexFunc). + WithObjects(wdDFW, wdORD). 
+ Build() + + location := newTestLocation("loc-dfw-1", locTestCityCode) + + requests := enqueueWorkloadDeploymentsForLocation(context.Background(), cl, testCluster, location) + require.Len(t, requests, 1, "only deployments whose CityCode matches the Location must be enqueued") + assert.Equal(t, wdDFW.Name, requests[0].Name) + assert.Equal(t, locTestWDNamespace, requests[0].Namespace) + assert.Equal(t, multicluster.ClusterName(testCluster), requests[0].ClusterName) + + // A Location without a city code in its topology identifies no city, so no + // deployment can match it. + noCityLocation := &networkingv1alpha.Location{ + ObjectMeta: metav1.ObjectMeta{Name: "loc-no-city", Namespace: locTestNamespace}, + Spec: networkingv1alpha.LocationSpec{Topology: map[string]string{}}, + } + assert.Empty(t, enqueueWorkloadDeploymentsForLocation(context.Background(), cl, testCluster, noCityLocation)) +} diff --git a/internal/controller/workloaddeployment_scheduler.go b/internal/controller/workloaddeployment_scheduler.go deleted file mode 100644 index 041b0d64..00000000 --- a/internal/controller/workloaddeployment_scheduler.go +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package controller - -import ( - "context" - "fmt" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - apimeta "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" - mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" - mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" - mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" - mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" - - computev1alpha "go.datum.net/compute/api/v1alpha" - networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" -) - -// WorkloadDeploymentScheduler 
schedules a WorkloadDeployment -type WorkloadDeploymentScheduler struct { - mgr mcmanager.Manager -} - -func (r *WorkloadDeploymentScheduler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - - cl, err := r.mgr.GetCluster(ctx, req.ClusterName) - if err != nil { - return ctrl.Result{}, err - } - - ctx = mccontext.WithCluster(ctx, req.ClusterName) - var deployment computev1alpha.WorkloadDeployment - if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { - if apierrors.IsNotFound(err) { - return ctrl.Result{}, nil - } - return ctrl.Result{}, err - } - - if !deployment.DeletionTimestamp.IsZero() { - return ctrl.Result{}, nil - } - - logger.Info("scheduling deployment") - defer logger.Info("scheduling complete") - - // TODO(jreese) improve! - // The first iteration of this scheduler will be very simple and only look for - // the first available location that is viable for the deployment. In the - // future, we could see a more advanced system similar to the Kubernetes - // scheduler itself. - - // Step 1: Get Locations - var locations networkingv1alpha.LocationList - if err := cl.GetClient().List(ctx, &locations); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to list locations: %w", err) - } - - if len(locations.Items) == 0 { - // Should only be the case in new environments if workloads are created - // prior to location registration. - - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are registered with the system.", - }) - if changed { - // TODO(jreese) investigate kubevirt / other operators for better tracking - // of updates to the status. I seem to remember a "builder" of sorts that - // looked rather nice. 
- if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil - } - - // TODO(jreese) define standard Topology keys somewhere - - var selectedLocation *networkingv1alpha.Location - for _, location := range locations.Items { - cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] - if ok && cityCode == deployment.Spec.CityCode { - selectedLocation = &location - break - } - } - - if selectedLocation == nil { - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoCandidateLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are candidates for this deployment.", - }) - if changed { - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - } else { - deployment.Status.Location = &networkingv1alpha.LocationReference{ - Name: selectedLocation.Name, - Namespace: selectedLocation.Namespace, - } - - // TODO(jreese) make sure we don't run into update conflicts with the update - // of the spec then status here. Just can't remember if it's an issue. - - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "LocationAssigned", - ObservedGeneration: deployment.Generation, - Message: "Deployment has been assigned a location.", - }) - - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - - } - - return ctrl.Result{}, nil -} - -// SetupWithManager sets up the controller with the Manager. 
-func (r *WorkloadDeploymentScheduler) SetupWithManager(mgr mcmanager.Manager) error { - r.mgr = mgr - return mcbuilder.ControllerManagedBy(mgr). - For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithPredicates( - predicate.NewPredicateFuncs(func(object client.Object) bool { - // Don't process deployments that have been scheduled - o := object.(*computev1alpha.WorkloadDeployment) - return o.Status.Location == nil - }), - )). - Named("workload-deployment-scheduler"). - Complete(r) -} diff --git a/internal/features/features.go b/internal/features/features.go new file mode 100644 index 00000000..c44de349 --- /dev/null +++ b/internal/features/features.go @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +// Package features defines the feature gates for the compute operator. Feature +// gates follow the Kubernetes component-base convention: each feature is +// declared as a Feature constant, registered with a FeatureSpec that includes +// its default enablement state, and toggled at runtime via the --feature-gates +// flag exposed by the binary. +// +// cmd/main.go defines the --feature-gates string flag itself and applies its +// value with: +// +// features.MutableFeatureGate.Set(featureGatesFlag) +// +// Enablement is read through the read-only view: +// +// if features.FeatureGate.Enabled(features.NetworkingIntegration) { ... } +package features + +import ( + "k8s.io/component-base/featuregate" +) + +const ( + // NetworkingIntegration controls whether the compute operator integrates with + // the network-services-operator (VPC) for NetworkBinding provisioning and the + // Network scheduling gate on Instances. + // + // When disabled: + // - No NetworkBinding objects are created. + // - The Network scheduling gate is not added to newly created Instances. + // - Any existing Network scheduling gate is actively removed. + // - The networking step is treated as immediately ready so Instances + // proceed to the runtime without a NetworkBinding. 
+ // + // This flag exists so operators can run compute on edge/lab cells where + // VPC/NSO is not yet functional. The default is true (enabled) so that + // existing production deployments are unaffected. + // + // alpha: v0.1 + NetworkingIntegration featuregate.Feature = "NetworkingIntegration" +) + +// MutableFeatureGate is the mutable feature gate for the compute operator. +// cmd/main.go applies the --feature-gates flag value via MutableFeatureGate.Set +// at startup. Enablement should be read from FeatureGate (the read-only view) +// after startup. +var MutableFeatureGate featuregate.MutableFeatureGate = featuregate.NewFeatureGate() + +// FeatureGate is the read-only view of the compute operator feature gate. +// Use this for enablement checks rather than MutableFeatureGate to avoid +// accidental mutations after startup. +var FeatureGate featuregate.FeatureGate = MutableFeatureGate + +func init() { + if err := MutableFeatureGate.Add(map[featuregate.Feature]featuregate.FeatureSpec{ + NetworkingIntegration: {Default: true, PreRelease: featuregate.Alpha}, + }); err != nil { + panic(err) + } +} diff --git a/internal/features/features_test.go b/internal/features/features_test.go new file mode 100644 index 00000000..61687064 --- /dev/null +++ b/internal/features/features_test.go @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package features + +import ( + "testing" +) + +// TestNetworkingIntegration_DefaultEnabled verifies that the NetworkingIntegration +// feature gate defaults to enabled so that existing production deployments are +// unaffected when the flag is not set. +func TestNetworkingIntegration_DefaultEnabled(t *testing.T) { + // Read through a copy of the gate so this test never touches the shared global gate.
+ gate := MutableFeatureGate.DeepCopy() + if !gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration default = false, want true") + } +} + +// TestNetworkingIntegration_CanBeDisabled verifies that setting +// NetworkingIntegration=false via the feature gate string disables the +// integration, allowing operators to run compute without VPC/NSO. +func TestNetworkingIntegration_CanBeDisabled(t *testing.T) { + gate := MutableFeatureGate.DeepCopy() + if err := gate.Set("NetworkingIntegration=false"); err != nil { + t.Fatalf("Set(NetworkingIntegration=false): %v", err) + } + if gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration = true after Set=false, want false") + } +} + +// TestNetworkingIntegration_ExplicitlyEnabled verifies that the gate can be +// explicitly set to true (round-trip). +func TestNetworkingIntegration_ExplicitlyEnabled(t *testing.T) { + gate := MutableFeatureGate.DeepCopy() + if err := gate.Set("NetworkingIntegration=true"); err != nil { + t.Fatalf("Set(NetworkingIntegration=true): %v", err) + } + if !gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration = false after Set=true, want true") + } +} diff --git a/internal/validation/instance_validation.go b/internal/validation/instance_validation.go index 7f112822..59a57585 100644 --- a/internal/validation/instance_validation.go +++ b/internal/validation/instance_validation.go @@ -17,6 +17,17 @@ import ( networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) +const ( + // diskTypePDStandard is the only currently supported disk type. + diskTypePDStandard = "pd-standard" + + // defaultImageName is the only currently supported disk populator image. + defaultImageName = "datumcloud/ubuntu-2204-lts" + + // defaultInstanceType is the only currently supported instance type.
+ defaultInstanceType = "datumcloud/d1-standard-2" +) + func validateInstanceTemplate( template computev1alpha.InstanceTemplateSpec, fieldPath *field.Path, @@ -97,6 +108,11 @@ func validateInstanceNetworkInterfaces( allErrs = append(allErrs, field.Invalid(networkNameField, networkInterface.Network, msg)) } + extra := make(map[string]authorizationv1.ExtraValue, len(opts.AdmissionRequest.UserInfo.Extra)) + for k, v := range opts.AdmissionRequest.UserInfo.Extra { + extra[k] = authorizationv1.ExtraValue(v) + } + review := authorizationv1.SubjectAccessReview{ Spec: authorizationv1.SubjectAccessReviewSpec{ ResourceAttributes: &authorizationv1.ResourceAttributes{ @@ -110,6 +126,7 @@ func validateInstanceNetworkInterfaces( User: opts.AdmissionRequest.UserInfo.Username, Groups: opts.AdmissionRequest.UserInfo.Groups, UID: opts.AdmissionRequest.UserInfo.UID, + Extra: extra, }, } @@ -258,8 +275,8 @@ func validateDiskVolumeSource(diskSource *computev1alpha.DiskTemplateVolumeSourc diskTemplateSpecField := diskTemplateField.Child("spec") // TODO(jrese) look up valid disk types - if diskTemplate.Spec.Type != "pd-standard" { - allErrs = append(allErrs, field.NotSupported(diskTemplateSpecField.Child("type"), diskTemplate.Spec.Type, []string{"pd-standard"})) + if diskTemplate.Spec.Type != diskTypePDStandard { + allErrs = append(allErrs, field.NotSupported(diskTemplateSpecField.Child("type"), diskTemplate.Spec.Type, []string{diskTypePDStandard})) } populatorResourceRequests, errs := validateDiskPopulator(diskTemplate.Spec.Populator, diskTemplateField.Child("populator")) @@ -400,8 +417,8 @@ func validateDiskPopulator(populator *computev1alpha.DiskPopulator, fieldPath *f // TODO(jreese) look up image imagePopulator := populator.Image - if imagePopulator.Name != "datumcloud/ubuntu-2204-lts" { - allErrs = append(allErrs, field.NotSupported(imageField.Child("name"), imagePopulator.Name, []string{"datumcloud/ubuntu-2204-lts"})) + if imagePopulator.Name != defaultImageName { + allErrs = 
append(allErrs, field.NotSupported(imageField.Child("name"), imagePopulator.Name, []string{defaultImageName})) } } } @@ -657,8 +674,8 @@ func validateInstanceRuntimeResources(resources computev1alpha.InstanceRuntimeRe allErrs := field.ErrorList{} // TODO(jreese) look up available instance types - if resources.InstanceType != "datumcloud/d1-standard-2" { - allErrs = append(allErrs, field.NotSupported(fieldPath, resources.InstanceType, []string{"datumcloud/d1-standard-2"})) + if resources.InstanceType != defaultInstanceType { + allErrs = append(allErrs, field.NotSupported(fieldPath, resources.InstanceType, []string{defaultInstanceType})) } if resources.Requests != nil { diff --git a/internal/validation/workload_validation_test.go b/internal/validation/workload_validation_test.go index f73e4c9f..2a0324ee 100644 --- a/internal/validation/workload_validation_test.go +++ b/internal/validation/workload_validation_test.go @@ -23,6 +23,14 @@ import ( networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) +const ( + testCPUResource = "cpu" + testVolName = "vol" + testDuplicateMountPath = "duplicate-mount-path" + testDefaultNamespace = "default" + testCityCodeDFW = "DFW" +) + func TestValidateWorkloads(t *testing.T) { scenarios := map[string]struct { workload *computev1alpha.Workload @@ -157,7 +165,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ Value: resource.NewQuantity(50, resource.DecimalSI), AverageValue: resource.NewQuantity(50, resource.DecimalSI), @@ -181,7 +189,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ Value: resource.NewQuantity(-1, 
resource.DecimalSI), }, @@ -202,7 +210,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ AverageValue: resource.NewQuantity(-1, resource.DecimalSI), }, @@ -223,7 +231,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ AverageUtilization: proto.Int32(0), }, @@ -336,16 +344,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("1Gi"), @@ -369,16 +377,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ 
Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("1Pi"), @@ -402,16 +410,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10.5Gi"), @@ -436,7 +444,7 @@ func TestValidateWorkloads(t *testing.T) { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10Gi"), @@ -473,7 +481,7 @@ func TestValidateWorkloads(t *testing.T) { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10Gi"), @@ -490,11 +498,11 @@ func TestValidateWorkloads(t *testing.T) { } w.Spec.Template.Spec.Runtime.Sandbox.Containers[0].VolumeAttachments = []computev1alpha.VolumeAttachment{ { - Name: 
"duplicate-mount-path", + Name: testDuplicateMountPath, MountPath: proto.String("/mount1"), }, { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, MountPath: proto.String("/mount1"), }, { @@ -503,7 +511,7 @@ func TestValidateWorkloads(t *testing.T) { } w.Spec.Template.Spec.Volumes = []computev1alpha.InstanceVolume{ { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, VolumeSource: volumeSource, }, } @@ -540,7 +548,7 @@ func TestValidateWorkloads(t *testing.T) { interceptorFuncs: &interceptor.Funcs{ Create: func(ctx context.Context, client client.WithWatch, obj client.Object, opts ...client.CreateOption) error { if sar, ok := obj.(*authorizationv1.SubjectAccessReview); ok { - if sar.Spec.ResourceAttributes.Name == "default" && + if sar.Spec.ResourceAttributes.Name == testDefaultNamespace && sar.Spec.ResourceAttributes.Group == networkingv1alpha.GroupVersion.Group && sar.Spec.ResourceAttributes.Version == networkingv1alpha.GroupVersion.Version && sar.Spec.ResourceAttributes.Resource == "networks" { @@ -559,8 +567,8 @@ func TestValidateWorkloads(t *testing.T) { initObjs := []client.Object{ &networkingv1alpha.Network{ ObjectMeta: metav1.ObjectMeta{ - Namespace: "default", - Name: "default", + Namespace: testDefaultNamespace, + Name: testDefaultNamespace, }, }, } @@ -606,7 +614,7 @@ func TestValidateWorkloads(t *testing.T) { ) if len(scenario.opts.ValidCityCodes) == 0 { - scenario.opts.ValidCityCodes = []string{"DFW"} + scenario.opts.ValidCityCodes = []string{testCityCodeDFW} } t.Run(name, func(t *testing.T) { @@ -645,7 +653,7 @@ func MakeSandboxWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload }, Runtime: computev1alpha.InstanceRuntimeSpec{ Resources: computev1alpha.InstanceRuntimeResources{ - InstanceType: "datumcloud/d1-standard-2", + InstanceType: defaultInstanceType, }, Sandbox: &computev1alpha.SandboxRuntime{ Containers: []computev1alpha.SandboxContainer{ @@ -661,7 +669,7 @@ func MakeSandboxWorkload(name string, tweaks 
...Tweak) *computev1alpha.Workload Placements: []computev1alpha.WorkloadPlacement{ { Name: "placement1", - CityCodes: []string{"DFW"}, + CityCodes: []string{testCityCodeDFW}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ MinReplicas: 1, }, @@ -702,7 +710,7 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { }, Runtime: computev1alpha.InstanceRuntimeSpec{ Resources: computev1alpha.InstanceRuntimeResources{ - InstanceType: "datumcloud/d1-standard-2", + InstanceType: defaultInstanceType, }, VirtualMachine: &computev1alpha.VirtualMachineRuntime{ VolumeAttachments: []computev1alpha.VolumeAttachment{ @@ -719,10 +727,10 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Populator: &computev1alpha.DiskPopulator{ Image: &computev1alpha.ImageDiskPopulator{ - Name: "datumcloud/ubuntu-2204-lts", + Name: defaultImageName, }, }, }, @@ -736,7 +744,7 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { Placements: []computev1alpha.WorkloadPlacement{ { Name: "placement1", - CityCodes: []string{"DFW"}, + CityCodes: []string{testCityCodeDFW}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ MinReplicas: 1, }, diff --git a/internal/webhook/v1alpha/workload_webhook.go b/internal/webhook/v1alpha/workload_webhook.go index e3f3735c..a8b94b38 100644 --- a/internal/webhook/v1alpha/workload_webhook.go +++ b/internal/webhook/v1alpha/workload_webhook.go @@ -6,12 +6,12 @@ import ( "fmt" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + 
"sigs.k8s.io/multicluster-runtime/pkg/multicluster" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/validation" @@ -27,8 +27,7 @@ func SetupWorkloadWebhookWithManager(mgr mcmanager.Manager) error { mgr: mgr, } - return ctrl.NewWebhookManagedBy(mgr.GetLocalManager()). - For(&computev1alpha.Workload{}). + return ctrl.NewWebhookManagedBy(mgr.GetLocalManager(), &computev1alpha.Workload{}). WithDefaulter(webhook). WithValidator(webhook). Complete() @@ -40,17 +39,11 @@ type workloadWebhook struct { mgr mcmanager.Manager } -var _ admission.CustomDefaulter = &workloadWebhook{} -var _ admission.CustomValidator = &workloadWebhook{} - -// Default implements webhook.Defaulter so a webhook will be registered for the type -func (r *workloadWebhook) Default(ctx context.Context, obj runtime.Object) error { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return fmt.Errorf("unexpected type %T", obj) - } - _ = workload +var _ admission.Defaulter[*computev1alpha.Workload] = &workloadWebhook{} +var _ admission.Validator[*computev1alpha.Workload] = &workloadWebhook{} +// Default implements admission.Defaulter so a mutating webhook will be registered for the type. 
+func (r *workloadWebhook) Default(_ context.Context, _ *computev1alpha.Workload) error { // // TODO(jreese) review and test gateway defaulting / logic // if gw := workload.Spec.Gateway; gw != nil { // for i, tcpRoute := range gw.TCPRoutes { @@ -75,15 +68,10 @@ func (r *workloadWebhook) Default(ctx context.Context, obj runtime.Object) error // +kubebuilder:webhook:path=/validate-compute-datumapis-com-v1alpha-workload,mutating=false,failurePolicy=fail,sideEffects=None,groups=compute.datumapis.com,resources=workloads,verbs=create;update,versions=v1alpha,name=vworkload.kb.io,admissionReviewVersions=v1 -func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", obj) - } - +func (r *workloadWebhook) ValidateCreate(ctx context.Context, workload *computev1alpha.Workload) (admission.Warnings, error) { clusterName := computewebhook.ClusterNameFromContext(ctx) - cluster, err := r.mgr.GetCluster(ctx, clusterName) + cluster, err := r.mgr.GetCluster(ctx, multicluster.ClusterName(clusterName)) if err != nil { return nil, err } @@ -101,9 +89,9 @@ func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object // that means for the scheduling phase, since there would not currently be // sufficient context to know who created the workload and what locations // are valid candidates based on that. Maybe an annotation, or spec field? 
- var locations networkingv1alpha.LocationList + var locations networkingv1alpha.LocationBindingList if err := clusterClient.List(ctx, &locations); err != nil { - return nil, fmt.Errorf("failed to list locations: %w", err) + return nil, fmt.Errorf("failed to list location bindings: %w", err) } validCityCodes := sets.Set[string]{} @@ -123,38 +111,18 @@ func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object } if errs := validation.ValidateWorkloadCreate(workload, opts); len(errs) > 0 { - return nil, errors.NewInvalid(obj.GetObjectKind().GroupVersionKind().GroupKind(), workload.Name, errs) + return nil, errors.NewInvalid(workload.GroupVersionKind().GroupKind(), workload.Name, errs) } return nil, nil } -func (r *workloadWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { - oldworkload, ok := oldObj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", oldObj) - } - - _ = oldworkload - - newworkload, ok := newObj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", newObj) - } - - _ = newworkload - +func (r *workloadWebhook) ValidateUpdate(_ context.Context, _, _ *computev1alpha.Workload) (admission.Warnings, error) { // TODO(user): fill in your validation logic upon object update. return nil, nil } -func (r *workloadWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", obj) - } - _ = workload - +func (r *workloadWebhook) ValidateDelete(_ context.Context, _ *computev1alpha.Workload) (admission.Warnings, error) { // TODO(user): fill in your validation logic upon object deletion. return nil, nil }