diff --git a/api/v1alpha/annotations.go b/api/v1alpha/annotations.go index a945547e..833f1645 100644 --- a/api/v1alpha/annotations.go +++ b/api/v1alpha/annotations.go @@ -4,4 +4,46 @@ const ( AnnotationNamespace = "compute.datumapis.com" SSHKeysAnnotation = AnnotationNamespace + "/ssh-keys" + + // ExpectedReferencedDataAnnotation is set on a WorkloadDeployment by the + // ReferencedDataController. Its value is a JSON-encoded array of companion + // object names (sorted deterministically) that the cell should expect. + // The cell does a pure set-membership check against labeled companions + // without recomputing names. + // + // Example value: ["configmap.app-config","secret.db-creds"] + ExpectedReferencedDataAnnotation = AnnotationNamespace + "/expected-referenced-data" + + // RestartedAtAnnotation may be set on an InstanceTemplateSpec's annotations + // to trigger a rolling restart. The value is an RFC3339 timestamp. Because + // this annotation lives in the template metadata, it is included in the + // template hash and triggers the existing ordered in-place roll. + RestartedAtAnnotation = AnnotationNamespace + "/restartedAt" + + // ReferencedDataGateStartAnnotation is stamped on an Instance by the cell + // InstanceReconciler the first time it observes the ReferencedData scheduling + // gate. Its value is an RFC3339 timestamp. Used to compute gate-wait duration + // for the compute_referenced_data_gate_wait_seconds histogram. + ReferencedDataGateStartAnnotation = AnnotationNamespace + "/referenced-data-gate-start" + + // ReferencedDataErrorAnnotation is stamped on a WorkloadDeployment by the + // ReferencedDataController when a terminal source error occurs (SourceNotFound, + // SourceUnauthorized, or SourceTooLarge). Its value is a JSON object with + // "reason" and "message" fields carrying the authoritative resolver verdict. 
+ // + // Example value: + // {"reason":"SourceNotFound","message":"ConfigMap \"app-config\" not found in namespace \"default\""} + // + // This annotation bridges the federation boundary: Karmada propagates + // metadata.annotations hub→cell alongside WorkloadDeployment objects, but + // status.conditions do not propagate in that direction. The cell + // InstanceReconciler reads this annotation from the cell WD copy (returned by + // fetchOwnerWorkloadDeployment) and promotes it to the Instance's + // ReferencedDataReady condition so the terminal error is visible at the Instance + // level without requiring a cross-plane condition read. + // + // The annotation is removed when the error resolves (companion materialises / + // ReferencedDataReady flips True), so the absence of the annotation means + // either no error or the error has cleared. + ReferencedDataErrorAnnotation = AnnotationNamespace + "/referenced-data-error" ) diff --git a/api/v1alpha/instance_types.go b/api/v1alpha/instance_types.go index 457537a4..80005424 100644 --- a/api/v1alpha/instance_types.go +++ b/api/v1alpha/instance_types.go @@ -138,6 +138,16 @@ type SandboxContainer struct { // so replicate the structure here too. Env []corev1.EnvVar `json:"env,omitempty"` + // List of sources to populate environment variables in the container. + // The keys defined within a source must be a C_IDENTIFIER. All invalid + // keys will be reported as an event when the container is starting. When a + // key exists in multiple sources, the value associated with the last source + // will take precedence. Values defined by an Env with a duplicate key will + // take precedence. + // + // +kubebuilder:validation:Optional + EnvFrom []EnvFromSource `json:"envFrom,omitempty"` + // The resource requirements for the container, such as CPU, memory, and GPUs. 
// // +kubebuilder:validation:Optional @@ -156,6 +166,54 @@ type SandboxContainer struct { Ports []NamedPort `json:"ports,omitempty"` } +// EnvFromSource represents a source for a set of ConfigMaps or Secrets to be +// used as environment variables in a container. +type EnvFromSource struct { + // An optional identifier to prepend to each key in the referenced + // ConfigMap or Secret. Must be a valid C_IDENTIFIER. + // + // +kubebuilder:validation:Optional + Prefix string `json:"prefix,omitempty"` + + // The ConfigMap to select from. + // + // +kubebuilder:validation:Optional + ConfigMapRef *ConfigMapEnvSource `json:"configMapRef,omitempty"` + + // The Secret to select from. + // + // +kubebuilder:validation:Optional + SecretRef *SecretEnvSource `json:"secretRef,omitempty"` +} + +// ConfigMapEnvSource selects a ConfigMap to populate the environment variables +// of a container. +type ConfigMapEnvSource struct { + // Name of the ConfigMap in the same namespace as the Workload. + // + // +kubebuilder:validation:Required + Name string `json:"name"` + + // Specify whether the ConfigMap must be defined. + // + // +kubebuilder:validation:Optional + Optional *bool `json:"optional,omitempty"` +} + +// SecretEnvSource selects a Secret to populate the environment variables +// of a container. +type SecretEnvSource struct { + // Name of the Secret in the same namespace as the Workload. + // + // +kubebuilder:validation:Required + Name string `json:"name"` + + // Specify whether the Secret must be defined. + // + // +kubebuilder:validation:Optional + Optional *bool `json:"optional,omitempty"` +} + type ContainerResourceRequirements struct { // Limits describes the maximum amount of compute resources allowed. 
// @@ -414,6 +472,38 @@ const ( // InstanceQuotaGranted indicates whether quota has been allocated for the instance InstanceQuotaGranted = "QuotaGranted" + + // ReferencedDataReady indicates whether all ConfigMaps and Secrets referenced + // by the workload template have been resolved and delivered to the cell. + // This condition is set on both WorkloadDeployment (resolver view) and + // Instance (cell view). + ReferencedDataReady = "ReferencedDataReady" +) + +const ( + // ReferencedDataReasonResolving indicates the resolver is in the process of + // reading source ConfigMaps/Secrets from the project control plane. + ReferencedDataReasonResolving = "Resolving" + + // ReferencedDataReasonAwaitingPropagation indicates the expected companions + // have not yet all arrived on the cell. + ReferencedDataReasonAwaitingPropagation = "AwaitingPropagation" + + // ReferencedDataReasonSourceNotFound indicates one or more referenced + // ConfigMaps or Secrets could not be found in the project namespace. + ReferencedDataReasonSourceNotFound = "SourceNotFound" + + // ReferencedDataReasonSourceUnauthorized indicates the management identity + // does not have permission to read one or more referenced objects. + ReferencedDataReasonSourceUnauthorized = "SourceUnauthorized" + + // ReferencedDataReasonSourceTooLarge indicates one or more referenced objects + // exceed the allowed size limit. + ReferencedDataReasonSourceTooLarge = "SourceTooLarge" + + // ReferencedDataReasonReady indicates all referenced data has been resolved + // and is present on the cell. + ReferencedDataReasonReady = "Ready" ) const ( diff --git a/api/v1alpha/labels.go b/api/v1alpha/labels.go index c75a1f26..f894aa7e 100644 --- a/api/v1alpha/labels.go +++ b/api/v1alpha/labels.go @@ -23,4 +23,13 @@ const ( // PlacementNameLabel carries the placement name from the Workload that drove // this Instance's deployment, sourced from WorkloadDeploymentSpec.PlacementName. 
PlacementNameLabel = LabelNamespace + "/placement-name" + + // ReferencedDataLabel is stamped on companion ConfigMaps and Secrets + // materialized by the ReferencedDataController, and on WorkloadDeployments + // that reference external ConfigMaps or Secrets. Used as a label selector + // by the Karmada PropagationPolicy to propagate companions to cells. + ReferencedDataLabel = LabelNamespace + "/referenced-data" + + // ReferencedDataLabelValue is the value used for ReferencedDataLabel. + ReferencedDataLabelValue = "true" ) diff --git a/api/v1alpha/zz_generated.deepcopy.go b/api/v1alpha/zz_generated.deepcopy.go index 926e222c..ca5b2830 100644 --- a/api/v1alpha/zz_generated.deepcopy.go +++ b/api/v1alpha/zz_generated.deepcopy.go @@ -14,6 +14,26 @@ import ( "sigs.k8s.io/gateway-api/apis/v1alpha2" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ConfigMapEnvSource) DeepCopyInto(out *ConfigMapEnvSource) { + *out = *in + if in.Optional != nil { + in, out := &in.Optional, &out.Optional + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ConfigMapEnvSource. +func (in *ConfigMapEnvSource) DeepCopy() *ConfigMapEnvSource { + if in == nil { + return nil + } + out := new(ConfigMapEnvSource) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ContainerResourceRequirements) DeepCopyInto(out *ContainerResourceRequirements) { *out = *in @@ -157,6 +177,31 @@ func (in *DiskTemplateVolumeSourceTemplate) DeepCopy() *DiskTemplateVolumeSource return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *EnvFromSource) DeepCopyInto(out *EnvFromSource) { + *out = *in + if in.ConfigMapRef != nil { + in, out := &in.ConfigMapRef, &out.ConfigMapRef + *out = new(ConfigMapEnvSource) + (*in).DeepCopyInto(*out) + } + if in.SecretRef != nil { + in, out := &in.SecretRef, &out.SecretRef + *out = new(SecretEnvSource) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvFromSource. +func (in *EnvFromSource) DeepCopy() *EnvFromSource { + if in == nil { + return nil + } + out := new(EnvFromSource) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FilesystemDiskPopulator) DeepCopyInto(out *FilesystemDiskPopulator) { *out = *in @@ -668,6 +713,13 @@ func (in *SandboxContainer) DeepCopyInto(out *SandboxContainer) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.EnvFrom != nil { + in, out := &in.EnvFrom, &out.EnvFrom + *out = make([]EnvFromSource, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } if in.Resources != nil { in, out := &in.Resources, &out.Resources *out = new(ContainerResourceRequirements) @@ -741,6 +793,26 @@ func (in *SchedulingGate) DeepCopy() *SchedulingGate { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SecretEnvSource) DeepCopyInto(out *SecretEnvSource) { + *out = *in + if in.Optional != nil { + in, out := &in.Optional, &out.Optional + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SecretEnvSource. +func (in *SecretEnvSource) DeepCopy() *SecretEnvSource { + if in == nil { + return nil + } + out := new(SecretEnvSource) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. 
in must be non-nil. func (in *VirtualMachineRuntime) DeepCopyInto(out *VirtualMachineRuntime) { *out = *in diff --git a/cmd/main.go b/cmd/main.go index 4358a087..64240f01 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -273,9 +273,12 @@ func main() { } if enableCellControllers { + wdOpts := controller.WorkloadDeploymentReconcilerOptions{ + EnableReferencedDataGate: serverConfig.FeatureFlags.EnableReferencedDataGate, + } if err = (&controller.WorkloadDeploymentReconciler{ NetworkingEnabled: features.FeatureGate.Enabled(features.NetworkingIntegration), - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(mgr, wdOpts); err != nil { setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") os.Exit(1) } @@ -310,6 +313,29 @@ func main() { } runnables = append(runnables, extra...) } + // ReferencedDataController is a management-plane controller (it reconciles + // WorkloadDeployments on project clusters and materialises companions). Gate + // it to the management controller set so it does not collide with the cell's + // WorkloadDeploymentReconciler. + if enableManagementControllers { + if err = (&controller.ReferencedDataController{}).SetupWithManager(mgr, controller.ReferencedDataControllerOptions{ + // Reader is nil in single-cluster mode; the controller then falls back + // to a LocalReader. Set this to a *referenceddata.ProjectReader when the + // Milo multicluster mode is active and cross-project reads are required. + Reader: nil, + // FederationClient is set when the federation hub (Karmada) is configured. + // When non-nil, companions are materialised into the downstream + // ns-{project-uid} namespace on the hub so Karmada can propagate them + // to cells alongside the WorkloadDeployment. When nil, companions land + // in the project namespace (single-cluster / dev path).
+ FederationClient: federationClient, + PerObjectLimitBytes: serverConfig.ReferencedData.PerObjectLimitBytes, + AggregateLimitBytes: serverConfig.ReferencedData.AggregateLimitBytes, + }); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "ReferencedData") + os.Exit(1) + } + } if serverConfig.WebhookServer != nil { if err = computev1alphawebhooks.SetupWorkloadWebhookWithManager(mgr); err != nil { diff --git a/config/base/crd/bases/compute.datumapis.com_instances.yaml b/config/base/crd/bases/compute.datumapis.com_instances.yaml index a007c0d7..cfbc039c 100644 --- a/config/base/crd/bases/compute.datumapis.com_instances.yaml +++ b/config/base/crd/bases/compute.datumapis.com_instances.yaml @@ -455,6 +455,54 @@ spec: x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map + envFrom: + description: |- + List of sources to populate environment variables in the container. + The keys defined within a source must be a C_IDENTIFIER. All invalid + keys will be reported as an event when the container is starting. When a + key exists in multiple sources, the value associated with the last source + will take precedence. Values defined by an Env with a duplicate key will + take precedence. + items: + description: |- + EnvFromSource represents a source for a set of ConfigMaps or Secrets to be + used as environment variables in a container. + properties: + configMapRef: + description: The ConfigMap to select from. + properties: + name: + description: Name of the ConfigMap in the + same namespace as the Workload. + type: string + optional: + description: Specify whether the ConfigMap + must be defined. + type: boolean + required: + - name + type: object + prefix: + description: |- + An optional identifier to prepend to each key in the referenced + ConfigMap or Secret. Must be a valid C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from. 
+ properties: + name: + description: Name of the Secret in the same + namespace as the Workload. + type: string + optional: + description: Specify whether the Secret must + be defined. + type: boolean + required: + - name + type: object + type: object + type: array image: description: The fully qualified container image name. type: string diff --git a/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml b/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml index e584af9f..777d3324 100644 --- a/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml +++ b/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml @@ -569,6 +569,54 @@ spec: x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map + envFrom: + description: |- + List of sources to populate environment variables in the container. + The keys defined within a source must be a C_IDENTIFIER. All invalid + keys will be reported as an event when the container is starting. When a + key exists in multiple sources, the value associated with the last source + will take precedence. Values defined by an Env with a duplicate key will + take precedence. + items: + description: |- + EnvFromSource represents a source for a set of ConfigMaps or Secrets to be + used as environment variables in a container. + properties: + configMapRef: + description: The ConfigMap to select from. + properties: + name: + description: Name of the ConfigMap + in the same namespace as the Workload. + type: string + optional: + description: Specify whether the ConfigMap + must be defined. + type: boolean + required: + - name + type: object + prefix: + description: |- + An optional identifier to prepend to each key in the referenced + ConfigMap or Secret. Must be a valid C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from. + properties: + name: + description: Name of the Secret in + the same namespace as the Workload. 
+ type: string + optional: + description: Specify whether the Secret + must be defined. + type: boolean + required: + - name + type: object + type: object + type: array image: description: The fully qualified container image name. diff --git a/config/base/crd/bases/compute.datumapis.com_workloads.yaml b/config/base/crd/bases/compute.datumapis.com_workloads.yaml index c1c8efd9..f2af09dc 100644 --- a/config/base/crd/bases/compute.datumapis.com_workloads.yaml +++ b/config/base/crd/bases/compute.datumapis.com_workloads.yaml @@ -579,6 +579,54 @@ spec: x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map + envFrom: + description: |- + List of sources to populate environment variables in the container. + The keys defined within a source must be a C_IDENTIFIER. All invalid + keys will be reported as an event when the container is starting. When a + key exists in multiple sources, the value associated with the last source + will take precedence. Values defined by an Env with a duplicate key will + take precedence. + items: + description: |- + EnvFromSource represents a source for a set of ConfigMaps or Secrets to be + used as environment variables in a container. + properties: + configMapRef: + description: The ConfigMap to select from. + properties: + name: + description: Name of the ConfigMap + in the same namespace as the Workload. + type: string + optional: + description: Specify whether the ConfigMap + must be defined. + type: boolean + required: + - name + type: object + prefix: + description: |- + An optional identifier to prepend to each key in the referenced + ConfigMap or Secret. Must be a valid C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from. + properties: + name: + description: Name of the Secret in + the same namespace as the Workload. + type: string + optional: + description: Specify whether the Secret + must be defined. 
+ type: boolean + required: + - name + type: object + type: object + type: array image: description: The fully qualified container image name. diff --git a/config/base/downstream-rbac/rbac.yaml b/config/base/downstream-rbac/rbac.yaml index 1937ef02..f2a94a37 100644 --- a/config/base/downstream-rbac/rbac.yaml +++ b/config/base/downstream-rbac/rbac.yaml @@ -6,6 +6,13 @@ rules: - apiGroups: [""] resources: ["namespaces"] verbs: ["get", "list", "watch", "create", "update", "patch"] + # The referenced-data controller reads the source ConfigMaps/Secrets a Workload + # references and materializes companion copies in the same hub namespace, which + # Karmada then propagates to the cell. It owns the companions' full lifecycle, + # including ref-count deletion, so it needs create/update/patch/delete here. + - apiGroups: [""] + resources: ["configmaps", "secrets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] - apiGroups: ["compute.datumapis.com"] resources: ["workloaddeployments", "workloaddeployments/status", "instances", "instances/status"] verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] diff --git a/config/components/controller_rbac/role.yaml b/config/components/controller_rbac/role.yaml index a634f512..425eecf6 100644 --- a/config/components/controller_rbac/role.yaml +++ b/config/components/controller_rbac/role.yaml @@ -4,6 +4,19 @@ kind: ClusterRole metadata: name: compute rules: +- apiGroups: + - "" + resources: + - configmaps + - secrets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - "" resources: diff --git a/internal/config/config.go b/internal/config/config.go index df4419b6..1ec2512d 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -36,6 +36,49 @@ type WorkloadOperator struct { WebhookServer *WebhookServerConfig `json:"webhookServer,omitempty"` Discovery DiscoveryConfig `json:"discovery"` + + // FeatureFlags configures optional management-plane 
feature gates. + FeatureFlags FeatureFlagsConfig `json:"featureFlags,omitempty"` + + // ReferencedData configures the ReferencedDataController. + ReferencedData ReferencedDataConfig `json:"referencedData,omitempty"` +} + +// +k8s:deepcopy-gen=true + +// ReferencedDataConfig holds size-limit knobs for the ReferencedDataController. +// Both limits default to zero, which causes the controller to use its built-in +// defaults (256 KiB per object, 1 MiB aggregate per WorkloadDeployment). +type ReferencedDataConfig struct { + // PerObjectLimitBytes is the maximum allowed byte size for a single + // companion ConfigMap or Secret (sum of all Data + BinaryData values). + // A value of 0 uses the built-in default of 256 KiB. + PerObjectLimitBytes int64 `json:"perObjectLimitBytes,omitempty"` + + // AggregateLimitBytes is the maximum allowed aggregate byte size across + // all companion objects for a single WorkloadDeployment. + // A value of 0 uses the built-in default of 1 MiB. + AggregateLimitBytes int64 `json:"aggregateLimitBytes,omitempty"` +} + +// +k8s:deepcopy-gen=true + +// FeatureFlagsConfig holds the operator's feature gates. All flags default +// to false (off) unless explicitly enabled, so that new capabilities can be +// merged and deployed safely before the full feature rollout is complete. +type FeatureFlagsConfig struct { + // EnableReferencedDataGate controls whether new Instances receive the + // "ReferencedData" scheduling gate when the workload template references + // ConfigMaps or Secrets. + // + // This gate MUST NOT be enabled until both the cell gate-clearing reconciler + // (Phase 2) and the unikraft provider gate-honoring (Phase 3) are confirmed + // deployed everywhere. Enabling it prematurely will cause gated instances to + // either stall indefinitely (cell not yet clearing) or launch without the + // referenced data mounted (provider not yet honoring gates). + // + // Defaults to false.
+ EnableReferencedDataGate bool `json:"enableReferencedDataGate,omitempty"` } // +k8s:deepcopy-gen=true diff --git a/internal/config/zz_generated.deepcopy.go b/internal/config/zz_generated.deepcopy.go index 88f2508f..5851d470 100644 --- a/internal/config/zz_generated.deepcopy.go +++ b/internal/config/zz_generated.deepcopy.go @@ -26,6 +26,21 @@ func (in *DiscoveryConfig) DeepCopy() *DiscoveryConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FeatureFlagsConfig) DeepCopyInto(out *FeatureFlagsConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FeatureFlagsConfig. +func (in *FeatureFlagsConfig) DeepCopy() *FeatureFlagsConfig { + if in == nil { + return nil + } + out := new(FeatureFlagsConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *MetricsServerConfig) DeepCopyInto(out *MetricsServerConfig) { *out = *in @@ -47,6 +62,21 @@ func (in *MetricsServerConfig) DeepCopy() *MetricsServerConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ReferencedDataConfig) DeepCopyInto(out *ReferencedDataConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReferencedDataConfig. +func (in *ReferencedDataConfig) DeepCopy() *ReferencedDataConfig { + if in == nil { + return nil + } + out := new(ReferencedDataConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *TLSConfig) DeepCopyInto(out *TLSConfig) { *out = *in @@ -94,6 +124,8 @@ func (in *WorkloadOperator) DeepCopyInto(out *WorkloadOperator) { (*in).DeepCopyInto(*out) } out.Discovery = in.Discovery + out.FeatureFlags = in.FeatureFlags + out.ReferencedData = in.ReferencedData } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadOperator. diff --git a/internal/controller/indexers.go b/internal/controller/indexers.go index 311337e0..6d6e4a54 100644 --- a/internal/controller/indexers.go +++ b/internal/controller/indexers.go @@ -10,6 +10,7 @@ import ( mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/referenceddata" ) const ( @@ -19,15 +20,36 @@ const ( // so that SubnetClaim/Subnet watches can efficiently find the deployments // that target the same city as a changed networking resource. deploymentCityCodeIndex = "deploymentCityCodeIndex" + + deploymentLocationIndex = "deploymentLocationIndex" + + // wdRefersToConfigMapIndex indexes WorkloadDeployments by the namespace/name + // keys of the ConfigMaps they reference (via env.ValueFrom, envFrom, or + // volumes). Used by the ReferencedDataController's source-watch to re-queue + // WDs when a source ConfigMap changes (rotation). + wdRefersToConfigMapIndex = "wdRefersToConfigMapIndex" + + // wdRefersToSecretIndex indexes WorkloadDeployments by the namespace/name keys + // of the Secrets they reference. Used by the ReferencedDataController's source-watch. + wdRefersToSecretIndex = "wdRefersToSecretIndex" ) func AddIndexers(ctx context.Context, mgr mcmanager.Manager) error { return errors.Join( addWorkloadDeploymentIndexers(ctx, mgr), addWorkloadIndexers(ctx, mgr), + addInstanceIndexers(ctx, mgr), ) } +func addInstanceIndexers(_ context.Context, _ mcmanager.Manager) error { + // No instance-level indexes are currently registered.
The companion-watch + // enqueue handler (enqueueInstancesInNamespace) lists all Instances in the + // namespace directly, which is correct because companions are shared across + // all Instances in a namespace. + return nil +} + func addWorkloadDeploymentIndexers(ctx context.Context, mgr mcmanager.Manager) error { if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentWorkloadUIDIndex, deploymentWorkloadUIDIndexFunc); err != nil { return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentWorkloadUIDIndex, err) @@ -37,6 +59,20 @@ func addWorkloadDeploymentIndexers(ctx context.Context, mgr mcmanager.Manager) e return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentCityCodeIndex, err) } + // Index workload deployments by location + if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentLocationIndex, deploymentLocationIndexFunc); err != nil { + return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentLocationIndex, err) + } + + // Index WDs by the ConfigMap and Secret names they reference, so the + // ReferencedDataController can re-queue WDs when sources change (rotation). 
+ if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, wdRefersToConfigMapIndex, wdRefersToConfigMapIndexFunc); err != nil { + return fmt.Errorf("failed to add workload deployment indexer %q: %w", wdRefersToConfigMapIndex, err) + } + if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, wdRefersToSecretIndex, wdRefersToSecretIndexFunc); err != nil { + return fmt.Errorf("failed to add workload deployment indexer %q: %w", wdRefersToSecretIndex, err) + } + return nil } @@ -59,6 +95,20 @@ func deploymentCityCodeIndexFunc(o client.Object) []string { return []string{deployment.Spec.CityCode} } +func deploymentLocationIndexFunc(o client.Object) []string { + deployment := o.(*computev1alpha.WorkloadDeployment) + if deployment.Status.Location == nil { + return nil + } + + return []string{ + types.NamespacedName{ + Namespace: deployment.Status.Location.Namespace, + Name: deployment.Status.Location.Name, + }.String(), + } +} + func addWorkloadIndexers(ctx context.Context, mgr mcmanager.Manager) error { if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.Workload{}, workloadNetworksIndex, workloadNetworksIndexFunc); err != nil { return fmt.Errorf("failed to add workload indexer %q: %w", workloadNetworksIndex, err) @@ -67,6 +117,34 @@ func addWorkloadIndexers(ctx context.Context, mgr mcmanager.Manager) error { return nil } +// wdRefersToConfigMapIndexFunc returns the namespace/name keys of all +// ConfigMaps referenced by a WorkloadDeployment's template. 
+func wdRefersToConfigMapIndexFunc(o client.Object) []string { + wd := o.(*computev1alpha.WorkloadDeployment) + refs := referenceddata.CollectFromTemplate(wd.Namespace, wd.Spec.Template) + var keys []string + for _, ref := range refs { + if ref.Kind == "ConfigMap" { + keys = append(keys, types.NamespacedName{Namespace: ref.Namespace, Name: ref.Name}.String()) + } + } + return keys +} + +// wdRefersToSecretIndexFunc returns the namespace/name keys of all Secrets +// referenced by a WorkloadDeployment's template. +func wdRefersToSecretIndexFunc(o client.Object) []string { + wd := o.(*computev1alpha.WorkloadDeployment) + refs := referenceddata.CollectFromTemplate(wd.Namespace, wd.Spec.Template) + var keys []string + for _, ref := range refs { + if ref.Kind == kindSecret { + keys = append(keys, types.NamespacedName{Namespace: ref.Namespace, Name: ref.Name}.String()) + } + } + return keys +} + func workloadNetworksIndexFunc(o client.Object) []string { workload := o.(*computev1alpha.Workload) diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index 2ea23780..7d0eccde 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -4,6 +4,7 @@ package controller import ( "context" + "encoding/json" "errors" "fmt" "maps" @@ -42,6 +43,7 @@ import ( "go.datum.net/compute/internal/controller/instancecontrol" quotametrics "go.datum.net/compute/internal/quota" + "go.datum.net/compute/internal/referenceddata" ) const ( @@ -224,6 +226,7 @@ type InstanceReconciler struct { // +kubebuilder:rbac:groups="",resources=namespaces,verbs=get // +kubebuilder:rbac:groups="",resources=events,verbs=create;patch +//nolint:gocyclo // conditions are reconciled, persisted, then returned as errors; the ordered pipeline is inherently branchy func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (_ ctrl.Result, err error) { logger := log.FromContext(ctx) @@ -283,6 +286,15 @@ 
func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ "after", quotaReq.String(), "cluster", req.ClusterName.String(), "instance", instance.Name) } + // Reconcile the ReferencedData condition: diff expected companions against + // those present on the cell. The result tells us whether the gate may be + // cleared in the spec-patch pass below. + refDataResult, requeueAfter, err := r.reconcileReferencedDataCondition(ctx, cl.GetClient(), &instance) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed reconciling referenced data condition: %w", err) + } + statusChanged = refDataResult.conditionChanged || statusChanged + // Transient errors from the quota and Ready-condition reconciles are // returned only after any condition change has been persisted, so the // failure reason is visible on the Instance while controller-runtime @@ -303,8 +315,8 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ } // Return with the quota error (nil or transient) so controller-runtime // requeues with backoff on failures. On the success path (quotaErr==nil) - // we fall through to removeQuotaSchedulingGate below instead of returning - // early, so the gate is cleared in the same reconcile pass rather than + // we fall through to reconcileSchedulingGates below instead of returning + // early, so gates are cleared in the same reconcile pass rather than // waiting for a requeue that may never come (ResourceClaim is immutable // and local Instances are not watched). if quotaErr != nil { @@ -320,7 +332,10 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, quotaErr } - if err := r.removeQuotaSchedulingGate(ctx, cl.GetClient(), &instance); err != nil { + // Spec-patch pass: remove scheduling gates for conditions that are now + // persisted as True. Handles both the Quota gate and the ReferencedData gate + // in a single patch to avoid duplicate API calls. 
+ if err := r.reconcileSchedulingGates(ctx, cl.GetClient(), &instance); err != nil { return ctrl.Result{}, err } @@ -333,12 +348,471 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, err } - if quotaReq > 0 { - logger.Info("requeuing instance", "after", quotaReq.String(), + // Honor both the quota safety-net requeue and the referenced-data resolver + // requeue: pick the soonest non-zero deadline so neither pending wait is + // dropped. + effectiveRequeue := quotaReq + if requeueAfter > 0 && (effectiveRequeue == 0 || requeueAfter < effectiveRequeue) { + effectiveRequeue = requeueAfter + } + if effectiveRequeue > 0 { + logger.Info("requeuing instance", "after", effectiveRequeue.String(), "cluster", req.ClusterName.String(), "instance", instance.Name) } - return ctrl.Result{RequeueAfter: quotaReq}, nil + return ctrl.Result{RequeueAfter: effectiveRequeue}, nil +} + +// reconcileSchedulingGates removes scheduling gates whose corresponding +// conditions have been persisted as True. It handles both the Quota gate and +// the ReferencedData gate in a single patch to avoid duplicate API calls. +// +// Both gates are guarded by ObservedGeneration == instance.Generation to +// prevent a stale True condition from generation N unblocking a generation N+1 +// instance before quota/referenced-data for the new spec has been re-evaluated. 
+func (r *InstanceReconciler) reconcileSchedulingGates( + ctx context.Context, + cl client.Client, + instance *computev1alpha.Instance, +) error { + if instance.Spec.Controller == nil || len(instance.Spec.Controller.SchedulingGates) == 0 { + return nil + } + + quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) + refDataCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.ReferencedDataReady) + + newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) + gatesRemoved := false + for _, gate := range instance.Spec.Controller.SchedulingGates { + // Guard on ObservedGeneration to prevent a stale True condition from a + // prior generation clearing a freshly-stamped gate before the condition + // has been re-evaluated at the current generation. + removeQuota := gate.Name == instancecontrol.QuotaSchedulingGate.String() && + quotaGrantedCond != nil && quotaGrantedCond.Status == metav1.ConditionTrue && + quotaGrantedCond.ObservedGeneration == instance.Generation + removeRefData := gate.Name == instancecontrol.ReferencedDataSchedulingGate.String() && + refDataCond != nil && refDataCond.Status == metav1.ConditionTrue && + refDataCond.ObservedGeneration == instance.Generation + if removeQuota || removeRefData { + gatesRemoved = true + // Observe gate-wait duration and emit event when the ReferencedData + // gate clears. + if removeRefData { + r.observeGateWaitDuration(instance) + r.emitReferencedDataClearedEvent(instance) + } + continue + } + newGates = append(newGates, gate) + } + if gatesRemoved { + patch := client.MergeFrom(instance.DeepCopy()) + instance.Spec.Controller.SchedulingGates = newGates + if err := cl.Patch(ctx, instance, patch); err != nil { + return fmt.Errorf("failed patching scheduling gates: %w", err) + } + } + return nil +} + +// referencedDataResult carries outputs of reconcileReferencedDataCondition. 
type referencedDataResult struct {
	// conditionChanged is true when the ReferencedDataReady condition was updated.
	conditionChanged bool
}

// reconcileReferencedDataCondition checks whether all expected companion
// ConfigMaps/Secrets are present on the cell for the given instance and
// updates the in-memory ReferencedDataReady condition accordingly. It does not
// persist status and does not remove the scheduling gate itself.
//
// It reads the expected-set annotation from the owning WorkloadDeployment,
// lists labeled companions in the namespace, and computes the diff.
//
// While the instance carries the ReferencedData gate:
//
//   - Terminal error reported on the WD (annotation or status condition):
//     the terminal reason / False, no requeue.
//   - Annotation absent or empty (resolver not yet finished): Resolving /
//     Unknown, requeue after 5s.
//   - Annotation present but not valid JSON: Resolving / Unknown, requeue
//     after 5s.
//   - Expected set is empty: Ready / True.
//   - Expected set present but some companions missing: AwaitingPropagation /
//     False; a Warning event is emitted when the previous reason differed.
//   - All companions present: Ready / True (gate cleared by caller).
//
// When the gate is absent, the condition is left untouched unless it exists,
// is not True, and every expected companion can be confirmed present; only
// then is it set to Ready / True.
//
// The returned duration is non-zero only in the two Resolving cases, so the
// instance reconciler retries even without a watch event. A non-nil error is
// returned when fetching the owning WorkloadDeployment or listing companions
// fails.
func (r *InstanceReconciler) reconcileReferencedDataCondition(
	ctx context.Context,
	cl client.Client,
	instance *computev1alpha.Instance,
) (referencedDataResult, time.Duration, error) {
	// If the instance does not carry the ReferencedData gate there is nothing to
	// reconcile here — skip silently.
	hasGate := false
	if instance.Spec.Controller != nil {
		for _, g := range instance.Spec.Controller.SchedulingGates {
			if g.Name == instancecontrol.ReferencedDataSchedulingGate.String() {
				hasGate = true
				break
			}
		}
	}
	if !hasGate {
		// Gate already gone — ensure condition is Ready if the gate was just
		// cleared on a previous pass (idempotent).
		existing := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.ReferencedDataReady)
		if existing == nil {
			// Gate never present; nothing to do.
			return referencedDataResult{}, 0, nil
		}
		if existing.Status == metav1.ConditionTrue {
			// Already marked ready — nothing to do.
			return referencedDataResult{}, 0, nil
		}
		// Gate is absent but condition is not True. Only self-heal to True when we
		// can confirm that all companions are actually present — fetching the WD
		// annotation to validate. If the annotation is absent (resolver hasn't
		// finished yet) or companions are still missing, leave the condition alone
		// rather than falsely reporting Ready.
		wd, err := r.fetchOwnerWorkloadDeployment(ctx, cl, instance)
		if err != nil {
			return referencedDataResult{}, 0, err
		}
		annoRaw, hasAnno := wd.Annotations[computev1alpha.ExpectedReferencedDataAnnotation]
		if !hasAnno || annoRaw == "" {
			// Resolver hasn't stamped the annotation yet — cannot confirm readiness.
			return referencedDataResult{}, 0, nil
		}
		var expectedTokens []string
		if err := json.Unmarshal([]byte(annoRaw), &expectedTokens); err != nil {
			// Malformed annotation — cannot confirm readiness.
			return referencedDataResult{}, 0, nil
		}
		if len(expectedTokens) > 0 {
			presentByKindName, err := r.listPresentCompanionsByKindName(ctx, cl, instance.Namespace)
			if err != nil {
				return referencedDataResult{}, 0, fmt.Errorf("failed listing companion objects during self-heal check: %w", err)
			}
			for _, token := range expectedTokens {
				if _, ok := presentByKindName[token]; !ok {
					// At least one companion is missing — do not self-heal to True.
					return referencedDataResult{}, 0, nil
				}
			}
		}
		// All companions confirmed present (or none expected) — mark ready.
		changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
			Type:               computev1alpha.ReferencedDataReady,
			Status:             metav1.ConditionTrue,
			Reason:             computev1alpha.ReferencedDataReasonReady,
			Message:            "All referenced companions are available",
			ObservedGeneration: instance.Generation,
		})
		return referencedDataResult{conditionChanged: changed}, 0, nil
	}

	// Stamp the gate-start annotation once so we can measure duration later.
	r.stampGateStartAnnotation(ctx, cl, instance)

	// Fetch the owning WorkloadDeployment to read the expected-set annotation.
	wd, err := r.fetchOwnerWorkloadDeployment(ctx, cl, instance)
	if err != nil {
		return referencedDataResult{}, 0, err
	}

	// When the resolver has determined a terminal source error (the source object
	// is missing, unauthorized, or too large), the companion will never arrive.
	// Promote the terminal error to the Instance condition so it is visible without
	// a secondary fetch. Returns (changed, true) when a terminal error was found.
	if changed, terminal := r.applyTerminalErrorFromWD(ctx, wd, instance); terminal {
		return referencedDataResult{conditionChanged: changed}, 0, nil
	}

	// Annotation not yet present — the resolver hasn't finished. Signal Resolving
	// and requeue so we re-check even without a new watch event.
	annoRaw, hasAnno := wd.Annotations[computev1alpha.ExpectedReferencedDataAnnotation]
	if !hasAnno || annoRaw == "" {
		changed := r.setReferencedDataCondition(instance, metav1.ConditionUnknown,
			computev1alpha.ReferencedDataReasonResolving,
			"Waiting for the resolver to finish reading source objects")
		return referencedDataResult{conditionChanged: changed}, 5 * time.Second, nil
	}

	// Decode the expected companion tokens (kind-qualified "Kind/name" strings).
	var expectedTokens []string
	if err := json.Unmarshal([]byte(annoRaw), &expectedTokens); err != nil {
		// Malformed annotation — treat like absent; the resolver will fix it.
		changed := r.setReferencedDataCondition(instance, metav1.ConditionUnknown,
			computev1alpha.ReferencedDataReasonResolving,
			"expected-referenced-data annotation is malformed; waiting for resolver")
		return referencedDataResult{conditionChanged: changed}, 5 * time.Second, nil
	}

	// No companions expected (WD template has no references) — clear the gate.
	// This path returns before the companion gauges below are updated.
	if len(expectedTokens) == 0 {
		changed := r.setReferencedDataCondition(instance, metav1.ConditionTrue,
			computev1alpha.ReferencedDataReasonReady,
			"No referenced companions required")
		return referencedDataResult{conditionChanged: changed}, 0, nil
	}

	// List labeled companions in the instance's namespace, keyed by "Kind/name"
	// to match the kind-qualified tokens in the annotation.
	presentByKindName, err := r.listPresentCompanionsByKindName(ctx, cl, instance.Namespace)
	if err != nil {
		return referencedDataResult{}, 0, fmt.Errorf("failed listing companion objects: %w", err)
	}

	// Diff: find which expected companions are missing.
	// expectedTokens contains "Kind/name" tokens; presentByKindName is keyed the same way.
	var missing []string
	for _, token := range expectedTokens {
		if _, ok := presentByKindName[token]; !ok {
			missing = append(missing, token)
		}
	}

	// Update metrics (aggregated per namespace to avoid high-cardinality per-instance series).
	// NOTE(review): the gauges are Set from this instance's expected set only, so
	// the last instance reconciled in a namespace determines the value — confirm
	// that is the intended aggregation.
	present := len(expectedTokens) - len(missing)
	referenceddata.CompanionsExpected.WithLabelValues(instance.Namespace).Set(float64(len(expectedTokens)))
	referenceddata.CompanionsPresent.WithLabelValues(instance.Namespace).Set(float64(present))

	if len(missing) > 0 {
		msg := fmt.Sprintf("Waiting for %d companion(s) to arrive on cell: %s",
			len(missing), strings.Join(missing, ", "))
		// Capture the previous reason before the condition is updated, so we can
		// determine whether the reason has transitioned. We emit a Warning event
		// only on a reason transition (not on every reconcile where the missing-set
		// message changes) to avoid event floods.
		prevCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.ReferencedDataReady)
		prevReason := ""
		if prevCond != nil {
			prevReason = prevCond.Reason
		}
		changed := r.setReferencedDataConditionWithTransition(instance, metav1.ConditionFalse,
			computev1alpha.ReferencedDataReasonAwaitingPropagation, msg)
		if prevReason != computev1alpha.ReferencedDataReasonAwaitingPropagation {
			r.emitEvent(instance, corev1.EventTypeWarning,
				computev1alpha.ReferencedDataReasonAwaitingPropagation, msg)
		}
		return referencedDataResult{conditionChanged: changed}, 0, nil
	}

	// All companions present.
	changed := r.setReferencedDataConditionWithTransition(instance, metav1.ConditionTrue,
		computev1alpha.ReferencedDataReasonReady,
		fmt.Sprintf("All %d referenced companion(s) are present on cell", len(expectedTokens)))
	return referencedDataResult{conditionChanged: changed}, 0, nil
}

// setReferencedDataCondition sets the ReferencedDataReady condition on the
// in-memory instance, stamping ObservedGeneration with the instance's current
// generation, and returns whether the condition changed.
func (r *InstanceReconciler) setReferencedDataCondition(
	instance *computev1alpha.Instance,
	status metav1.ConditionStatus,
	reason, message string,
) bool {
	return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
		Type:               computev1alpha.ReferencedDataReady,
		Status:             status,
		Reason:             reason,
		Message:            message,
		ObservedGeneration: instance.Generation,
	})
}

// setReferencedDataConditionWithTransition sets the condition and, when it
// changed, increments the condition-transition metric.
+func (r *InstanceReconciler) setReferencedDataConditionWithTransition( + instance *computev1alpha.Instance, + status metav1.ConditionStatus, + reason, message string, +) bool { + prev := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.ReferencedDataReady) + fromReason := "none" + if prev != nil { + fromReason = prev.Reason + } + changed := r.setReferencedDataCondition(instance, status, reason, message) + if changed && fromReason != reason { + referenceddata.ConditionTransitions.WithLabelValues(instance.Namespace, fromReason, reason).Inc() + } + return changed +} + +// isTerminalReferencedDataReason reports whether the given ReferencedData reason +// is terminal — i.e., the companion will never arrive because the source object +// is permanently unavailable, not just slow to propagate. +func isTerminalReferencedDataReason(reason string) bool { + switch reason { + case computev1alpha.ReferencedDataReasonSourceNotFound, + computev1alpha.ReferencedDataReasonSourceUnauthorized, + computev1alpha.ReferencedDataReasonSourceTooLarge: + return true + } + return false +} + +// applyTerminalErrorFromWD reads the terminal-error signal from the owner WD and, +// when a valid terminal error is found, sets the Instance's ReferencedDataReady +// condition. Returns (changed, true) when a terminal error was applied, or +// (false, false) when there is no terminal error to apply. +// +// The signal is read from two places in priority order: +// 1. ReferencedDataErrorAnnotation on the WD metadata — the primary path in +// federation, because Karmada propagates annotations hub→cell but does not +// propagate status.conditions in that direction. +// 2. The WD's ReferencedDataReady status condition — the fallback for +// single-cluster deployments or during a controller version upgrade before +// the annotation was introduced. 
+func (r *InstanceReconciler) applyTerminalErrorFromWD( + ctx context.Context, + wd *computev1alpha.WorkloadDeployment, + instance *computev1alpha.Instance, +) (changed, terminal bool) { + // Path 1: annotation (federation-safe). + if termErrRaw := wd.Annotations[computev1alpha.ReferencedDataErrorAnnotation]; termErrRaw != "" { + termReason, termMessage, decodeErr := decodeTerminalError(termErrRaw) + if decodeErr != nil { + log.FromContext(ctx).V(1).Info("malformed referenced-data-error annotation on WD; ignoring", + "workloadDeployment", wd.Name, "error", decodeErr) + } else if isTerminalReferencedDataReason(termReason) { + ch := r.setReferencedDataConditionWithTransition(instance, metav1.ConditionFalse, + termReason, termMessage) + return ch, true + } + } + + // Path 2: status condition fallback (single-cluster). + wdCond := apimeta.FindStatusCondition(wd.Status.Conditions, computev1alpha.ReferencedDataReady) + if wdCond != nil && wdCond.Status == metav1.ConditionFalse && isTerminalReferencedDataReason(wdCond.Reason) { + ch := r.setReferencedDataConditionWithTransition(instance, metav1.ConditionFalse, + wdCond.Reason, wdCond.Message) + return ch, true + } + + return false, false +} + +// listPresentCompanionsByKindName returns a set keyed by kind-qualified tokens +// ("Kind/name", e.g. "ConfigMap/app-config") for every companion ConfigMap and +// Secret present in the given namespace (matched by ReferencedDataLabel). This +// allows the gate-clearing logic to match the kind-qualified tokens stored in +// the expected-referenced-data annotation without ambiguity. 
+func (r *InstanceReconciler) listPresentCompanionsByKindName( + ctx context.Context, + cl client.Client, + namespace string, +) (map[string]struct{}, error) { + labelSel := client.MatchingLabels{computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue} + inNs := client.InNamespace(namespace) + + present := make(map[string]struct{}) + + var cmList corev1.ConfigMapList + if err := cl.List(ctx, &cmList, inNs, labelSel); err != nil { + return nil, fmt.Errorf("list companion ConfigMaps: %w", err) + } + for _, cm := range cmList.Items { + present[referenceddata.CompanionToken(kindConfigMap, cm.Name)] = struct{}{} + } + + var secretList corev1.SecretList + if err := cl.List(ctx, &secretList, inNs, labelSel); err != nil { + return nil, fmt.Errorf("list companion Secrets: %w", err) + } + for _, s := range secretList.Items { + present[referenceddata.CompanionToken(kindSecret, s.Name)] = struct{}{} + } + + return present, nil +} + +// fetchOwnerWorkloadDeployment retrieves the WorkloadDeployment that owns the +// given instance. It is shared by the network-failure check and the +// referenced-data condition reconciler to avoid duplicate fetches; callers +// that already hold the WD should not call this a second time. 
+func (r *InstanceReconciler) fetchOwnerWorkloadDeployment( + ctx context.Context, + cl client.Client, + instance *computev1alpha.Instance, +) (*computev1alpha.WorkloadDeployment, error) { + ownerRef := metav1.GetControllerOf(instance) + if ownerRef == nil { + return nil, fmt.Errorf("instance %s/%s has no controller owner reference", instance.Namespace, instance.Name) + } + var wd computev1alpha.WorkloadDeployment + if err := cl.Get(ctx, client.ObjectKey{Namespace: instance.Namespace, Name: ownerRef.Name}, &wd); err != nil { + return nil, fmt.Errorf("failed fetching owning WorkloadDeployment %q: %w", ownerRef.Name, err) + } + return &wd, nil +} + +// stampGateStartAnnotation records the RFC3339 time at which the ReferencedData +// gate was first observed. It is a best-effort, no-op if the annotation is +// already present or the patch fails. +// +// The status conditions on instance are preserved across the patch: the fake +// client (and real API server) does not touch status in a metadata-only Patch, +// but the controller-runtime fake client zeroes and re-unmarshals instance from +// the server response, which would lose any in-memory status changes not yet +// persisted via Status().Update(). We save and restore them explicitly. +func (r *InstanceReconciler) stampGateStartAnnotation( + ctx context.Context, + cl client.Client, + instance *computev1alpha.Instance, +) { + if instance.Annotations != nil { + if _, ok := instance.Annotations[computev1alpha.ReferencedDataGateStartAnnotation]; ok { + return + } + } + // Preserve in-memory status so the Patch does not discard conditions that + // have been set by earlier reconcile steps but not yet persisted to the + // API server via Status().Update(). 
+ savedStatus := instance.Status.DeepCopy() + + patch := client.MergeFrom(instance.DeepCopy()) + if instance.Annotations == nil { + instance.Annotations = make(map[string]string) + } + instance.Annotations[computev1alpha.ReferencedDataGateStartAnnotation] = time.Now().UTC().Format(time.RFC3339) + if err := cl.Patch(ctx, instance, patch); err != nil { + log.FromContext(ctx).V(1).Info("could not stamp gate-start annotation (non-fatal)", "error", err) + } + + // Restore the in-memory status after the patch overwrites it with the + // server-side state. + instance.Status = *savedStatus +} + +// observeGateWaitDuration records the gate-wait histogram when the +// ReferencedData gate is being cleared. It reads the start annotation stamped +// when the gate was first observed. +func (r *InstanceReconciler) observeGateWaitDuration(instance *computev1alpha.Instance) { + if instance.Annotations == nil { + return + } + startStr, ok := instance.Annotations[computev1alpha.ReferencedDataGateStartAnnotation] + if !ok { + return + } + startTime, err := time.Parse(time.RFC3339, startStr) + if err != nil { + return + } + elapsed := time.Since(startTime).Seconds() + referenceddata.GateWaitDuration.WithLabelValues(instance.Namespace).Observe(elapsed) +} + +// emitReferencedDataClearedEvent records a Normal event on the instance when +// the ReferencedData gate is cleared. +func (r *InstanceReconciler) emitReferencedDataClearedEvent(instance *computev1alpha.Instance) { + r.emitEvent(instance, corev1.EventTypeNormal, + computev1alpha.ReferencedDataReasonReady, + "All referenced companion ConfigMaps/Secrets are present; ReferencedData gate cleared") +} + +// emitEvent emits a Kubernetes event if a recorder is available. Guard against +// a nil recorder so that unit tests that don't wire up a recorder don't panic. 
+func (r *InstanceReconciler) emitEvent(obj *computev1alpha.Instance, eventType, reason, message string) { + if r.recorder == nil { + return + } + r.recorder.Event(obj, eventType, reason, message) } // reconcileDeletion handles quota-claim cleanup when an Instance is being @@ -526,47 +1000,6 @@ func (r *InstanceReconciler) reconcileQuotaCondition(ctx context.Context, cluste } } -// removeQuotaSchedulingGate removes the quota scheduling gate from the -// Instance spec once QuotaGranted=True has been persisted to status. -// It guards on ObservedGeneration to prevent a stale True condition from -// generation N unblocking a generation N+1 instance before quota for the -// new spec has been evaluated. -func (r *InstanceReconciler) removeQuotaSchedulingGate(ctx context.Context, cl client.Client, instance *computev1alpha.Instance) error { - quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) - if quotaGrantedCond == nil || quotaGrantedCond.Status != metav1.ConditionTrue { - return nil - } - // Stale condition guard: only remove the gate if the condition reflects the - // current spec generation. A condition from an older generation means quota - // has not yet been evaluated for the current spec. 
- if quotaGrantedCond.ObservedGeneration != instance.Generation { - return nil - } - if instance.Spec.Controller == nil { - return nil - } - - newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) - gateRemoved := false - for _, gate := range instance.Spec.Controller.SchedulingGates { - if gate.Name == instancecontrol.QuotaSchedulingGate.String() { - gateRemoved = true - continue - } - newGates = append(newGates, gate) - } - if !gateRemoved { - return nil - } - - patch := client.MergeFrom(instance.DeepCopy()) - instance.Spec.Controller.SchedulingGates = newGates - if err := cl.Patch(ctx, instance, patch); err != nil { - return fmt.Errorf("failed patching quota scheduling gate: %w", err) - } - return nil -} - // Finalize removes the downstream write-back Instance when the local Instance is // deleted. It is a no-op when downstream federation is disabled. func (r *InstanceReconciler) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { @@ -1203,17 +1636,8 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( // Rough way to propagate creation errors up to the instance as soon as possible. // Lots of room for improvement here. 
func (r *InstanceReconciler) checkForNetworkCreationFailure(ctx context.Context, upstreamClient client.Client, instance *computev1alpha.Instance) (failed bool, message string, err error) { - workloadDeploymentRef := metav1.GetControllerOf(instance) - if workloadDeploymentRef == nil { - return false, "", fmt.Errorf("instance is not owned by a workload deployment") - } - - var workloadDeployment computev1alpha.WorkloadDeployment - workloadDeploymentObjectKey := client.ObjectKey{ - Namespace: instance.Namespace, - Name: workloadDeploymentRef.Name, - } - if err := upstreamClient.Get(ctx, workloadDeploymentObjectKey, &workloadDeployment); err != nil { + workloadDeployment, err := r.fetchOwnerWorkloadDeployment(ctx, upstreamClient, instance) + if err != nil { return false, "", fmt.Errorf("failed fetching workload deployment: %w", err) } @@ -1343,5 +1767,56 @@ func (r *InstanceReconciler) SetupWithManager( return obj.GetLabels()[instanceQuotaClaimSourceLabel] == edgeClusterNameVal })), ). + // Watch companion ConfigMaps: when one arrives (or is updated) re-queue all + // Instances in the namespace so they can attempt gate-clearing. + Watches(&corev1.ConfigMap{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []mcreconcile.Request { + if obj.GetLabels()[computev1alpha.ReferencedDataLabel] != computev1alpha.ReferencedDataLabelValue { + return nil + } + return enqueueInstancesInNamespace(ctx, cl.GetClient(), string(clusterName), obj.GetNamespace()) + }) + }). + // Watch companion Secrets for the same reason. 
+ Watches(&corev1.Secret{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []mcreconcile.Request { + if obj.GetLabels()[computev1alpha.ReferencedDataLabel] != computev1alpha.ReferencedDataLabelValue { + return nil + } + return enqueueInstancesInNamespace(ctx, cl.GetClient(), string(clusterName), obj.GetNamespace()) + }) + }). Complete(r) } + +// enqueueInstancesInNamespace returns reconcile requests for every Instance +// in the given namespace. Companions are shared across all Instances in a +// namespace, so any gated instance in the namespace should re-check when a +// companion ConfigMap or Secret arrives. +func enqueueInstancesInNamespace( + ctx context.Context, + cl client.Client, + clusterName, namespace string, +) []mcreconcile.Request { + logger := log.FromContext(ctx) + + var instanceList computev1alpha.InstanceList + if err := cl.List(ctx, &instanceList, client.InNamespace(namespace)); err != nil { + logger.Error(err, "failed listing instances for companion watch", "namespace", namespace) + return nil + } + + requests := make([]mcreconcile.Request, 0, len(instanceList.Items)) + for _, inst := range instanceList.Items { + requests = append(requests, mcreconcile.Request{ + Request: reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: inst.Namespace, + Name: inst.Name, + }, + }, + ClusterName: multicluster.ClusterName(clusterName), + }) + } + return requests +} diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index 1445ff96..8ff1d154 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -47,6 +47,9 @@ const ( testQuotaAPIGroup = "quota.miloapis.com" testQuotaResource = "resourceclaims" kindWorkloadDeploymentTest = "WorkloadDeployment" // 
mirrors kindWorkloadDeployment + + // testMsgQuotaExceeded is the quota-denied message used across quota tests. + testMsgQuotaExceeded = "Quota exceeded for project" ) // newTestScheme builds a runtime.Scheme with the types needed for instance reconcile tests. @@ -55,6 +58,7 @@ func newTestScheme(t *testing.T) *runtime.Scheme { s := runtime.NewScheme() require.NoError(t, computev1alpha.AddToScheme(s)) require.NoError(t, quotav1alpha1.AddToScheme(s)) + require.NoError(t, corev1.AddToScheme(s)) return s } @@ -345,7 +349,7 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, - Message: "Quota exceeded for project", + Message: testMsgQuotaExceeded, LastTransitionTime: metav1.Now(), }, { @@ -370,7 +374,7 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingQuota, - Message: "Quota exceeded for project", + Message: testMsgQuotaExceeded, }, }, { @@ -824,7 +828,7 @@ func TestReconcileQuota(t *testing.T) { // the Quota scheduling gate was never removed from an Instance after quota was // granted. The root cause was an early return in the Reconcile function: when // reconcileQuotaCondition set QuotaGranted=True (statusChanged=true), the code -// wrote the status update and returned before reaching removeQuotaSchedulingGate. +// wrote the status update and returned before reaching reconcileSchedulingGates. 
// Because ResourceClaims are immutable (no further transitions) and local // Instances are not watched (WithEngageWithLocalCluster(false)), no requeue ever // arrived — leaving the Quota gate stranded in spec.controller.schedulingGates @@ -1573,7 +1577,7 @@ func TestReconcileQuotaFailureModes(t *testing.T) { // Single reconcile: reconcileQuotaCondition writes QuotaGranted=True with // ObservedGeneration=2 into the in-memory instance, status is persisted, - // then removeQuotaSchedulingGate reads the in-memory condition (gen=2 == + // then reconcileSchedulingGates reads the in-memory condition (gen=2 == // instance.Generation=2) and removes the gate — all in one pass. _, err := r.Reconcile(context.Background(), reconcileReq()) require.NoError(t, err) diff --git a/internal/controller/instance_referenced_data_test.go b/internal/controller/instance_referenced_data_test.go new file mode 100644 index 00000000..55ada278 --- /dev/null +++ b/internal/controller/instance_referenced_data_test.go @@ -0,0 +1,793 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/controller/instancecontrol" +) + +const ( + refDataTestCluster = "test-project" + refDataTestNamespace = "ns-test-uid" + refDataTestDeployment = "my-deployment" + refDataTestInstance = 
"my-instance" + refDataTestWDUID = "wd-uid" + refDataTestDataKey = "key" + // Companion objects are now named by source name (option B fix). + refDataTestCMCompanionName = "app-config" + refDataTestSecretCompanionName = "db-creds" + + // Annotation tokens are kind-qualified "Kind/name" strings. + refDataTestCMToken = "ConfigMap/app-config" + refDataTestSecretToken = "Secret/db-creds" + refDataTestDataValue = "value" +) + +// makeWDForCell builds a WorkloadDeployment that can own test instances in the +// cell-side gate-clearing tests. Named distinctly to avoid collision with the +// makeWD helper in referenceddata_controller_test.go. +func makeWDForCell(annotationValue string) *computev1alpha.WorkloadDeployment { + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: refDataTestDeployment, + Namespace: refDataTestNamespace, + UID: refDataTestWDUID, + }, + } + if annotationValue != "" { + wd.Annotations = map[string]string{ + computev1alpha.ExpectedReferencedDataAnnotation: annotationValue, + } + } + return wd +} + +// makeInstanceWithRefDataGate builds an Instance with the ReferencedData gate +// and an owner reference to makeWDForCell. 
+func makeInstanceWithRefDataGate() *computev1alpha.Instance { + gates := []computev1alpha.SchedulingGate{ + {Name: instancecontrol.ReferencedDataSchedulingGate.String()}, + } + + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: refDataTestInstance, + Namespace: refDataTestNamespace, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + Labels: map[string]string{ + computev1alpha.WorkloadDeploymentUIDLabel: refDataTestWDUID, + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeployment, + Name: refDataTestDeployment, + UID: refDataTestWDUID, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: gates, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } +} + +// makeCompanionConfigMap creates a companion ConfigMap with the ReferencedDataLabel +// in the standard test namespace. +func makeCompanionConfigMap(name string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: refDataTestNamespace, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + }, + Data: map[string]string{refDataTestDataKey: refDataTestDataValue}, + } +} + +// makeCompanionSecret creates a companion Secret with the ReferencedDataLabel +// in the standard test namespace. 
+func makeCompanionSecret(name string) *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: refDataTestNamespace, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + }, + Data: map[string][]byte{refDataTestDataKey: []byte(refDataTestDataValue)}, + } +} + +// newRefDataReconciler constructs an InstanceReconciler backed by a fake client +// that has the given project-cluster objects. A fake event recorder is returned +// so tests can inspect events. +func newRefDataReconciler( + t *testing.T, + projectObjs []client.Object, +) (*InstanceReconciler, client.Client, *record.FakeRecorder) { + t.Helper() + s := newTestScheme(t) + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(projectObjs...). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + refDataTestCluster: &fakeCluster{cl: projectClient}, + }, + } + + fakeRec := record.NewFakeRecorder(32) + r := &InstanceReconciler{ + mgr: mgr, + recorder: fakeRec, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + return r, projectClient, fakeRec +} + +func reconcileRefData(t *testing.T, r *InstanceReconciler) { + t.Helper() + _, err := r.Reconcile(context.Background(), mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}}, + ClusterName: refDataTestCluster, + }) + require.NoError(t, err) +} + +// TestReferencedDataGateHeldWhenAnnotationAbsent verifies that when the WD +// has no expected-referenced-data annotation yet (resolver still running), the +// Instance gets ReferencedDataReady=Unknown/Resolving and the gate is kept. 
+func TestReferencedDataGateHeldWhenAnnotationAbsent(t *testing.T) { + wd := makeWDForCell("") // no annotation + inst := makeInstanceWithRefDataGate() + + r, projectClient, _ := newRefDataReconciler(t, + []client.Object{inst, wd}, + ) + + reconcileRefData(t, r) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond, "ReferencedDataReady condition should be set") + assert.Equal(t, metav1.ConditionUnknown, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonResolving, cond.Reason) + + // Gate must still be present. + hasGate := false + if updated.Spec.Controller != nil { + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.ReferencedDataSchedulingGate.String() { + hasGate = true + } + } + } + assert.True(t, hasGate, "ReferencedData gate should still be present when annotation is absent") +} + +// TestReferencedDataGateHeldWhenCompanionsMissing verifies that when only some +// companions are present the condition is AwaitingPropagation/False and the +// gate is not removed. The missing companion names should appear in the message. +func TestReferencedDataGateHeldWhenCompanionsMissing(t *testing.T) { + expected := []string{refDataTestCMToken, refDataTestSecretToken} + annoVal, _ := json.Marshal(expected) + wd := makeWDForCell(string(annoVal)) + inst := makeInstanceWithRefDataGate() + + // Only the ConfigMap companion is present; the Secret is missing. 
+ companionCM := makeCompanionConfigMap(refDataTestCMCompanionName) + + r, projectClient, fakeRec := newRefDataReconciler(t, + []client.Object{inst, wd, companionCM}, + ) + + reconcileRefData(t, r) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond, "ReferencedDataReady condition should be set") + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonAwaitingPropagation, cond.Reason) + assert.Contains(t, cond.Message, refDataTestSecretToken, "message should name the missing companion token") + + // Gate must still be present. + hasGate := false + if updated.Spec.Controller != nil { + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.ReferencedDataSchedulingGate.String() { + hasGate = true + } + } + } + assert.True(t, hasGate, "ReferencedData gate should be held while companions are missing") + + // A Warning event should have been emitted. + select { + case evt := <-fakeRec.Events: + assert.Contains(t, evt, computev1alpha.ReferencedDataReasonAwaitingPropagation) + default: + t.Error("expected a Warning event to be emitted when companions are missing") + } +} + +// TestReferencedDataGateClearedWhenAllPresent verifies the full happy path: +// all expected companions are present → gate is removed and condition is Ready. 
+func TestReferencedDataGateClearedWhenAllPresent(t *testing.T) { + expected := []string{refDataTestCMToken, refDataTestSecretToken} + annoVal, _ := json.Marshal(expected) + wd := makeWDForCell(string(annoVal)) + inst := makeInstanceWithRefDataGate() + + companionCM := makeCompanionConfigMap(refDataTestCMCompanionName) + companionSecret := makeCompanionSecret(refDataTestSecretCompanionName) + + r, projectClient, fakeRec := newRefDataReconciler(t, + []client.Object{inst, wd, companionCM, companionSecret}, + ) + + // First reconcile: sets ReferencedDataReady=True in status, returns early. + reconcileRefData(t, r) + + var afterStatus computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &afterStatus)) + + cond := apimeta.FindStatusCondition(afterStatus.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonReady, cond.Reason) + + // Second reconcile: status already True, gate is removed from spec. + reconcileRefData(t, r) + + var afterGatePatch computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &afterGatePatch)) + + for _, g := range afterGatePatch.Spec.Controller.SchedulingGates { + assert.NotEqual(t, instancecontrol.ReferencedDataSchedulingGate.String(), g.Name, + "ReferencedData gate should have been removed") + } + + // A Normal event should have been emitted when the gate cleared. 
+ var gotClearedEvent bool + for { + select { + case evt := <-fakeRec.Events: + if containsAll(evt, "Normal", computev1alpha.ReferencedDataReasonReady) { + gotClearedEvent = true + } + default: + goto done + } + } +done: + assert.True(t, gotClearedEvent, "expected a Normal event when the ReferencedData gate is cleared") +} + +// TestReferencedDataIdempotentWhenAlreadyReady verifies that a second reconcile +// when the gate is gone and condition is already True produces no changes. +func TestReferencedDataIdempotentWhenAlreadyReady(t *testing.T) { + expected := []string{refDataTestCMToken} + annoVal, _ := json.Marshal(expected) + wd := makeWDForCell(string(annoVal)) + + // Instance with no gate (already cleared) and Ready condition already set. + inst := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: refDataTestInstance, + Namespace: refDataTestNamespace, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + Labels: map[string]string{ + computev1alpha.WorkloadDeploymentUIDLabel: refDataTestWDUID, + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeployment, + Name: refDataTestDeployment, + UID: refDataTestWDUID, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{}, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionTrue, + Reason: computev1alpha.ReferencedDataReasonReady, + Message: "All 1 referenced companion(s) are present on cell", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + 
companionCM := makeCompanionConfigMap(refDataTestCMCompanionName) + + r, projectClient, fakeRec := newRefDataReconciler(t, + []client.Object{inst, wd, companionCM}, + ) + + // Fetch current resource version so we can check for updates. + var before computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &before)) + + reconcileRefData(t, r) + + var after computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &after)) + + // No gate should have been re-added. + assert.Empty(t, after.Spec.Controller.SchedulingGates) + + // Condition should still be True. + cond := apimeta.FindStatusCondition(after.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + + // No events should have been emitted (gate was already gone). + select { + case evt := <-fakeRec.Events: + t.Errorf("unexpected event emitted during idempotent reconcile: %s", evt) + default: + // expected: no event + } +} + +// TestReferencedDataPartialPresenceShowsDiff verifies the diff message names +// specifically which companions are missing when only some are present. +func TestReferencedDataPartialPresenceShowsDiff(t *testing.T) { + // Annotation uses kind-qualified tokens; companion objects use source names. + expected := []string{"ConfigMap/cfg-a", "ConfigMap/cfg-b", "Secret/sec-x"} + annoVal, _ := json.Marshal(expected) + wd := makeWDForCell(string(annoVal)) + inst := makeInstanceWithRefDataGate() + + // Only cfg-a companion is present; cfg-b and sec-x are missing. 
+ companionA := makeCompanionConfigMap("cfg-a") + + r, projectClient, _ := newRefDataReconciler(t, + []client.Object{inst, wd, companionA}, + ) + + reconcileRefData(t, r) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonAwaitingPropagation, cond.Reason) + + // Both missing companions should be mentioned in the message as kind-qualified tokens. + assert.Contains(t, cond.Message, "ConfigMap/cfg-b") + assert.Contains(t, cond.Message, "Secret/sec-x") + // The present one should NOT be mentioned as missing. + assert.NotContains(t, cond.Message, "ConfigMap/cfg-a") +} + +// TestReferencedDataEventEmittedOnClear verifies that a Normal event is emitted +// precisely once when the gate transitions from present to cleared. +func TestReferencedDataEventEmittedOnClear(t *testing.T) { + expected := []string{"ConfigMap/my-config"} + annoVal, _ := json.Marshal(expected) + wd := makeWDForCell(string(annoVal)) + inst := makeInstanceWithRefDataGate() + companion := makeCompanionConfigMap("my-config") + + r, projectClient, fakeRec := newRefDataReconciler(t, + []client.Object{inst, wd, companion}, + ) + + // Single pass: condition set to Ready, status updated, gate patched away, event emitted. + // The federation branch clears the gate in the same reconcile pass as the status update + // (rather than a separate pass) because gate removal is inlined after status.Update. 
+ reconcileRefData(t, r) + + var cleared computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &cleared)) + cond := apimeta.FindStatusCondition(cleared.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + + hasGate := false + for _, g := range cleared.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.ReferencedDataSchedulingGate.String() { + hasGate = true + } + } + assert.False(t, hasGate, "gate should be cleared in the same pass as status update") + + // Expect exactly one Normal/Ready event. + var normalEvents int +drainLoop: + for { + select { + case evt := <-fakeRec.Events: + if containsAll(evt, "Normal", computev1alpha.ReferencedDataReasonReady) { + normalEvents++ + } + default: + break drainLoop + } + } + assert.Equal(t, 1, normalEvents, "expected exactly one Normal/Ready event on gate-clear") +} + +// TestReferencedDataStaleConditionGuard verifies that a stale True condition +// from generation N does not cause the ReferencedData gate to be removed for +// an instance at generation N+1. The gate must only be cleared once the +// condition has been re-evaluated at the current generation. +func TestReferencedDataStaleConditionGuard(t *testing.T) { + expected := []string{refDataTestCMToken} + annoVal, _ := json.Marshal(expected) + wd := makeWDForCell(string(annoVal)) + + companion := makeCompanionConfigMap(refDataTestCMCompanionName) + + // Build an instance at generation 2 whose ReferencedDataReady condition is + // True but was observed at generation 1 (stale). The gate is present because + // the spec was updated (rolling update) after the condition was last written. 
+ inst := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: refDataTestInstance, + Namespace: refDataTestNamespace, + Generation: 2, // current generation after spec update + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + Labels: map[string]string{ + computev1alpha.WorkloadDeploymentUIDLabel: refDataTestWDUID, + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeployment, + Name: refDataTestDeployment, + UID: refDataTestWDUID, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.ReferencedDataSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionTrue, + Reason: computev1alpha.ReferencedDataReasonReady, + // ObservedGeneration=1: stale — condition was set before the rolling update + // bumped the spec to generation 2. + ObservedGeneration: 1, + Message: "All 1 referenced companion(s) are present on cell", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + r, projectClient, _ := newRefDataReconciler(t, + []client.Object{inst, wd, companion}, + ) + + // First reconcile: re-evaluates condition at generation 2 (updates ObservedGeneration). + // The gate must NOT be removed during this pass because reconcileSchedulingGates + // sees the stale condition (ObservedGeneration=1) that was loaded before the + // status update. After the status update the condition is at generation 2. 
+ reconcileRefData(t, r) + + var afterFirst computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &afterFirst)) + + // Condition should now reflect generation 2. + cond := apimeta.FindStatusCondition(afterFirst.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond, "ReferencedDataReady condition should be present") + assert.Equal(t, metav1.ConditionTrue, cond.Status) + assert.Equal(t, int64(2), cond.ObservedGeneration, "condition ObservedGeneration should be updated to current generation") + + // Second reconcile: condition is now at generation 2 — gate should be cleared. + reconcileRefData(t, r) + + var afterSecond computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &afterSecond)) + + for _, g := range afterSecond.Spec.Controller.SchedulingGates { + assert.NotEqual(t, instancecontrol.ReferencedDataSchedulingGate.String(), g.Name, + "ReferencedData gate should be removed once condition is at current generation") + } +} + +// ─── Federation annotation bridge: terminal error propagation hub→cell ──────── + +// makeWDWithTerminalError returns a WD carrying the ReferencedDataErrorAnnotation +// as a cell WD copy would after Karmada propagation from the hub. 
+func makeWDWithTerminalError(reason, message string) *computev1alpha.WorkloadDeployment { + raw, _ := encodeTerminalError(reason, message) + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: refDataTestDeployment, + Namespace: refDataTestNamespace, + UID: refDataTestWDUID, + Annotations: map[string]string{ + computev1alpha.ReferencedDataErrorAnnotation: raw, + }, + }, + } + return wd +} + +// TestFederated_TerminalAnnotation_SourceNotFound verifies that when the cell WD +// copy carries the ReferencedDataErrorAnnotation with SourceNotFound, the Instance +// gets Ready=False/SourceNotFound with the hub message rather than AwaitingPropagation. +func TestFederated_TerminalAnnotation_SourceNotFound(t *testing.T) { + msg := `ConfigMap "app-config" not found in namespace "default"` + wd := makeWDWithTerminalError(computev1alpha.ReferencedDataReasonSourceNotFound, msg) + inst := makeInstanceWithRefDataGate() + + r, projectClient, _ := newRefDataReconciler(t, []client.Object{inst, wd}) + reconcileRefData(t, r) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond, "ReferencedDataReady condition should be set") + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceNotFound, cond.Reason, + "should use the hub resolver reason, not generic AwaitingPropagation") + assert.Equal(t, msg, cond.Message, + "should carry the hub resolver message verbatim") + + // Gate must remain until the error is resolved (companion will never arrive). 
+ hasGate := false + if updated.Spec.Controller != nil { + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.ReferencedDataSchedulingGate.String() { + hasGate = true + } + } + } + assert.True(t, hasGate, "ReferencedData gate should remain when a terminal error is present") +} + +// TestFederated_TerminalAnnotation_SourceTooLarge verifies SourceTooLarge propagates +// via the annotation the same way as SourceNotFound. +func TestFederated_TerminalAnnotation_SourceTooLarge(t *testing.T) { + msg := `ConfigMap "fat-config" in namespace "default" exceeds per-object size limit` + wd := makeWDWithTerminalError(computev1alpha.ReferencedDataReasonSourceTooLarge, msg) + inst := makeInstanceWithRefDataGate() + + r, projectClient, _ := newRefDataReconciler(t, []client.Object{inst, wd}) + reconcileRefData(t, r) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceTooLarge, cond.Reason) + assert.Equal(t, msg, cond.Message) +} + +// TestFederated_TerminalAnnotation_SourceUnauthorized verifies SourceUnauthorized propagates +// via the annotation. 
+func TestFederated_TerminalAnnotation_SourceUnauthorized(t *testing.T) { + msg := `not authorized to read ConfigMap "secret-cfg" in namespace "default"` + wd := makeWDWithTerminalError(computev1alpha.ReferencedDataReasonSourceUnauthorized, msg) + inst := makeInstanceWithRefDataGate() + + r, projectClient, _ := newRefDataReconciler(t, []client.Object{inst, wd}) + reconcileRefData(t, r) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceUnauthorized, cond.Reason) + assert.Equal(t, msg, cond.Message) +} + +// TestFederated_NoAnnotation_AnnotationAbsent verifies that when the WD has no +// terminal-error annotation AND no expected-data annotation (resolver not done), +// the Instance gets Resolving/Unknown — NOT SourceNotFound. +func TestFederated_NoAnnotation_AnnotationAbsent(t *testing.T) { + // WD has neither annotation — Karmada propagated it before the resolver ran. 
+ wd := makeWDForCell("") // no annotations + inst := makeInstanceWithRefDataGate() + + r, projectClient, _ := newRefDataReconciler(t, []client.Object{inst, wd}) + reconcileRefData(t, r) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionUnknown, cond.Status, + "should be Unknown/Resolving when no annotation is present") + assert.Equal(t, computev1alpha.ReferencedDataReasonResolving, cond.Reason) +} + +// TestFederated_AnnotationPresent_CompanionsMissing verifies that when the WD has +// the expected-data annotation (resolver succeeded, no terminal error) but companions +// haven't arrived yet, the Instance gets AwaitingPropagation — not SourceNotFound. +func TestFederated_AnnotationPresent_CompanionsMissing(t *testing.T) { + expected := []string{refDataTestCMToken} + annoVal, _ := json.Marshal(expected) + // WD has the expected annotation but NOT the terminal-error annotation. + wd := makeWDForCell(string(annoVal)) + inst := makeInstanceWithRefDataGate() + + // No companion ConfigMap present yet. 
+ r, projectClient, _ := newRefDataReconciler(t, []client.Object{inst, wd}) + reconcileRefData(t, r) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonAwaitingPropagation, cond.Reason, + "should be AwaitingPropagation (not SourceNotFound) when resolver succeeded but companion is still in flight") + assert.Contains(t, cond.Message, refDataTestCMToken) +} + +// TestFederated_AnnotationPresent_CompanionsPresent verifies that when the WD +// has the expected-data annotation and all companions are present (healthy path), +// the Instance advances to Ready=True. +func TestFederated_AnnotationPresent_CompanionsPresent(t *testing.T) { + expected := []string{refDataTestCMToken} + annoVal, _ := json.Marshal(expected) + wd := makeWDForCell(string(annoVal)) + inst := makeInstanceWithRefDataGate() + companionCM := makeCompanionConfigMap(refDataTestCMCompanionName) + + r, projectClient, _ := newRefDataReconciler(t, []client.Object{inst, wd, companionCM}) + reconcileRefData(t, r) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status, + "should be Ready=True when all companions are present and no terminal error") +} + +// TestFederated_QuotaAndSourceNotFound_SourceNotFoundWins verifies that when +// both the Quota gate and ReferencedData gate are present, and the cell WD +// carries a 
terminal-error annotation, Instance.Ready picks SourceNotFound +// (priority 5) over PendingQuota (priority 3). +func TestFederated_QuotaAndSourceNotFound_SourceNotFoundWins(t *testing.T) { + msg := `ConfigMap "app-config" not found in namespace "default"` + wd := makeWDWithTerminalError(computev1alpha.ReferencedDataReasonSourceNotFound, msg) + inst := makeInstanceWithRefDataGate() + + // Add the Quota gate alongside ReferencedData. + inst.Spec.Controller.SchedulingGates = append(inst.Spec.Controller.SchedulingGates, + computev1alpha.SchedulingGate{Name: instancecontrol.QuotaSchedulingGate.String()}, + ) + + // Seed a QuotaGranted=False/QuotaExceeded condition on the instance. + inst.Status.Conditions = []metav1.Condition{ + { + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: testMsgQuotaExceeded, + ObservedGeneration: 1, + LastTransitionTime: metav1.Now(), + }, + } + + r, projectClient, _ := newRefDataReconciler(t, []client.Object{inst, wd}) + reconcileRefData(t, r) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: refDataTestNamespace, Name: refDataTestInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceNotFound, cond.Reason, + "SourceNotFound should be set on ReferencedDataReady so reconcileGatedReadyCondition picks priority 5") + + // Note: reconcileGatedReadyCondition (called by reconcileInstanceReadyCondition, + // not tested here) is where the priority competition between SourceNotFound (p5) + // and PendingQuota (p3) is resolved. This test verifies that reconcileReferencedDataCondition + // sets the right ReferencedDataReady sub-condition that feeds into that priority logic. 
+} + +// containsAll returns true when s contains all substrings. +func containsAll(s string, subs ...string) bool { + for _, sub := range subs { + found := false + for i := 0; i <= len(s)-len(sub); i++ { + if s[i:i+len(sub)] == sub { + found = true + break + } + } + if !found { + return false + } + } + return true +} diff --git a/internal/controller/instance_writeback_test.go b/internal/controller/instance_writeback_test.go index 5c5020cf..b31369af 100644 --- a/internal/controller/instance_writeback_test.go +++ b/internal/controller/instance_writeback_test.go @@ -24,14 +24,15 @@ import ( // ─── write-back test constants ──────────────────────────────────────────────── const ( - wbTestClusterName = "edge-cluster" - wbTestNamespace = "ns-proj-uid-1234" - wbTestInstanceName = "inst-0" - wbTestWorkloadUID = "wl-uid-aaaa-bbbb" - wbTestWDUID = "wd-uid-cccc-dddd" - wbTestInstanceIndex = "0" - wbTestUpstreamNS = "proj-namespace" - wbTestEncodedCluster = "cluster-" + wbTestClusterName + wbTestClusterName = "edge-cluster" + wbTestNamespace = "ns-proj-uid-1234" + wbTestInstanceName = "inst-0" + wbTestWorkloadUID = "wl-uid-aaaa-bbbb" + wbTestWDUID = "wd-uid-cccc-dddd" + wbTestInstanceIndex = "0" + wbTestUpstreamNS = "proj-namespace" + wbTestEncodedCluster = "cluster-" + wbTestClusterName + karmadaManagedLabelValue = "true" // The four self-describing labels. 
wbTestWDName = "my-workload-deployment" @@ -162,7 +163,7 @@ func TestWriteBackToUpstream_UpdatePath_LabelMerge(t *testing.T) { Labels: map[string]string{ downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, - karmadaManagedLabel: "true", + karmadaManagedLabel: karmadaManagedLabelValue, }, }, Spec: computev1alpha.InstanceSpec{ @@ -198,7 +199,7 @@ func TestWriteBackToUpstream_UpdatePath_LabelMerge(t *testing.T) { assert.Equal(t, wbTestInstanceIndex, updated.Labels[computev1alpha.InstanceIndexLabel]) // The Karmada-managed label must survive the merge (not be replaced/deleted). - assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + assert.Equal(t, karmadaManagedLabelValue, updated.Labels[karmadaManagedLabel], "Karmada-managed label must be preserved after merge; should not be overwritten") } @@ -555,7 +556,7 @@ func TestWriteBackToUpstream_FourNewLabels_UpdatePath(t *testing.T) { Labels: map[string]string{ downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, - karmadaManagedLabel: "true", + karmadaManagedLabel: karmadaManagedLabelValue, }, }, Spec: computev1alpha.InstanceSpec{ @@ -593,6 +594,6 @@ func TestWriteBackToUpstream_FourNewLabels_UpdatePath(t *testing.T) { "PlacementNameLabel must be set on update path") // Karmada-managed label must survive the merge. 
- assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + assert.Equal(t, karmadaManagedLabelValue, updated.Labels[karmadaManagedLabel], "Karmada-managed label must be preserved after the update merge") } diff --git a/internal/controller/instancecontrol/scheduling_gates.go b/internal/controller/instancecontrol/scheduling_gates.go index e02f3554..e3c88498 100644 --- a/internal/controller/instancecontrol/scheduling_gates.go +++ b/internal/controller/instancecontrol/scheduling_gates.go @@ -5,6 +5,12 @@ type SchedulingGate string const ( NetworkSchedulingGate SchedulingGate = "Network" QuotaSchedulingGate SchedulingGate = "Quota" + + // ReferencedDataSchedulingGate is stamped on new instances when the workload + // template references ConfigMaps or Secrets AND the management-plane feature + // flag EnableReferencedDataGate is enabled. It is cleared by the cell + // InstanceReconciler once all expected companion objects are present. + ReferencedDataSchedulingGate SchedulingGate = "ReferencedData" ) func (s SchedulingGate) String() string { diff --git a/internal/controller/instancecontrol/stateful/stateful_control.go b/internal/controller/instancecontrol/stateful/stateful_control.go index 34e5966e..96f153be 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control.go +++ b/internal/controller/instancecontrol/stateful/stateful_control.go @@ -13,6 +13,7 @@ import ( "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/controller/instancecontrol" + "go.datum.net/compute/internal/referenceddata" ) // Options controls optional behaviours of the stateful instance control strategy. @@ -22,6 +23,12 @@ type Options struct { // disabled so that Instances are not blocked waiting for a NetworkBinding. // Defaults to true. NetworkingEnabled bool + + // EnableReferencedDataGate controls whether new Instances receive the + // ReferencedData scheduling gate when the workload template references + // ConfigMaps or Secrets. Defaults to false. 
See FeatureFlagsConfig for the + // full safety rationale. + EnableReferencedDataGate bool } // Behavior inspired by https://github.com/kubernetes/kubernetes/tree/master/pkg/controller/statefulset @@ -103,6 +110,16 @@ func (c *statefulControl) GetActions( {Name: instancecontrol.NetworkSchedulingGate.String()}, }, gates...) } + + // Stamp the ReferencedData gate only when the management-plane feature + // flag is on AND the template actually references ConfigMaps or Secrets. + // The gate must not be inserted before the cell gate-clearing reconciler + // and provider gate-honoring are deployed everywhere — see + // FeatureFlagsConfig.EnableReferencedDataGate for the full rationale. + if c.opts.EnableReferencedDataGate && referenceddata.TemplateReferencesData(deployment.Spec.Template) { + gates = append(gates, v1alpha.SchedulingGate{Name: instancecontrol.ReferencedDataSchedulingGate.String()}) + } + desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ TemplateHash: instanceTemplateHash, SchedulingGates: gates, diff --git a/internal/controller/instancecontrol/stateful/stateful_control_test.go b/internal/controller/instancecontrol/stateful/stateful_control_test.go index d9133efa..93b26e0d 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control_test.go +++ b/internal/controller/instancecontrol/stateful/stateful_control_test.go @@ -29,7 +29,7 @@ func init() { func TestFreshDeployment(t *testing.T) { ctx := context.Background() - control := New() + control := NewWithOptions(Options{}) deployment := getWorkloadDeployment("test-fresh-deploy", 2) @@ -56,7 +56,7 @@ func TestFreshDeployment(t *testing.T) { // at creation time and ignores spec changes on an existing pod. 
func TestUpdateWithAllReadyInstances(t *testing.T) { ctx := context.Background() - control := New() + control := NewWithOptions(Options{}) deployment := getWorkloadDeployment("test-deploy", 2) @@ -82,7 +82,7 @@ func TestUpdateWithAllReadyInstances(t *testing.T) { func TestScaleUpWithNotReadyInstance(t *testing.T) { ctx := context.Background() - control := New() + control := NewWithOptions(Options{}) deployment := getWorkloadDeployment("test-deploy", 3) @@ -112,7 +112,7 @@ func TestScaleUpWithNotReadyInstance(t *testing.T) { func TestScaleUpWithDeletingReadyInstance(t *testing.T) { ctx := context.Background() - control := New() + control := NewWithOptions(Options{}) deployment := getWorkloadDeployment("test-deploy", 3) @@ -139,7 +139,7 @@ func TestScaleUpWithDeletingReadyInstance(t *testing.T) { func TestScaleDownWithAllReadyInstances(t *testing.T) { ctx := context.Background() - control := New() + control := NewWithOptions(Options{}) deployment := getWorkloadDeployment("test-deploy", 1) diff --git a/internal/controller/referenceddata_controller.go b/internal/controller/referenceddata_controller.go new file mode 100644 index 00000000..9593bc8a --- /dev/null +++ b/internal/controller/referenceddata_controller.go @@ -0,0 +1,1409 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "maps" + "slices" + "strings" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + mcbuilder 
"sigs.k8s.io/multicluster-runtime/pkg/builder" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/referenceddata" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +const ( + // referencedDataFinalizer is stamped on WorkloadDeployments that reference + // ConfigMaps or Secrets. The controller removes it after the ref-count cleanup + // of all companion objects owned by this WD is complete. + referencedDataFinalizer = "compute.datumapis.com/referenced-data-controller" + + // companionRefCountAnnotation is stamped on companion ConfigMaps/Secrets to + // track which WorkloadDeployments currently reference them. The value is a + // JSON array of "namespace/name" strings, sorted deterministically. + // + // Ref-counting allows a companion to be shared across multiple WDs that + // happen to reference the same source object, and deleted only when the last + // WD drops its reference. + companionRefCountAnnotation = "compute.datumapis.com/referenced-by" + + // defaultPerObjectLimitBytes is the default maximum byte size of a single + // companion object (ConfigMap or Secret Data + BinaryData). 256 KiB. + defaultPerObjectLimitBytes = 256 * 1024 + + // defaultAggregateLimitBytes is the default maximum aggregate byte size of + // all companion objects for a single WorkloadDeployment. 1 MiB. + defaultAggregateLimitBytes = 1024 * 1024 + + // kindConfigMap and kindSecret are the literal kind strings used in + // referenceddata.ObjectRef to avoid repeated string literals. + kindConfigMap = "ConfigMap" + kindSecret = "Secret" +) + +// companionWriter is the abstraction that the controller uses to materialise +// companion ConfigMaps and Secrets on the target namespace/cluster. 
+// +// localCompanionWriter: writes companions to the same cluster and namespace +// that the WorkloadDeployment lives in. This path is selected when +// FederationClient is nil. NOTE: on the federation branch the management +// controllers always set FederationClient, so localCompanionWriter is +// effectively unreachable in production. It is retained for single-cluster +// dev/test environments and is NOT used in any management-plane federation +// wiring on this branch. +// +// downstreamCompanionWriter: uses Milo's MappedNamespaceResourceStrategy to +// write companions into the `ns-{project-uid}` namespace on the Karmada hub +// so they are propagated to cells alongside the WorkloadDeployment. The +// federator's PropagationPolicy always includes ConfigMap/Secret selectors +// matching the referenced-data label. +type companionWriter interface { + // ApplyConfigMap creates or updates the companion ConfigMap. + // + // existing is the object previously returned by GetConfigMap for the same + // companion in the same RetryOnConflict iteration (nil when the companion + // does not yet exist). When existing is non-nil the implementation MUST + // write to that exact object — it must NOT perform an additional independent + // GET, which would introduce a second resourceVersion read and defeat the + // atomic read-modify-write guarantee. The ref-count annotation on desired + // was computed from existing's annotations; a second GET could advance to a + // newer resourceVersion that already has a concurrent WD's ref-count entry, + // and merging desired's annotations over it would silently drop that entry. + ApplyConfigMap(ctx context.Context, existing *corev1.ConfigMap, desired *corev1.ConfigMap) error + + // ApplySecret creates or updates the companion Secret. + // + // existing follows the same contract as ApplyConfigMap.existing. 
+ ApplySecret(ctx context.Context, existing *corev1.Secret, desired *corev1.Secret) error + + // DeleteConfigMap deletes the companion ConfigMap if it exists. + DeleteConfigMap(ctx context.Context, namespace, name string) error + + // DeleteSecret deletes the companion Secret if it exists. + DeleteSecret(ctx context.Context, namespace, name string) error + + // GetConfigMap returns the existing companion ConfigMap, or nil if absent. + GetConfigMap(ctx context.Context, namespace, name string) (*corev1.ConfigMap, error) + + // GetSecret returns the existing companion Secret, or nil if absent. + GetSecret(ctx context.Context, namespace, name string) (*corev1.Secret, error) +} + +// localCompanionWriter implements companionWriter using a single cluster-runtime +// client. Companions land in the same cluster and namespace as the WD. +type localCompanionWriter struct { + cl client.Client +} + +func (w *localCompanionWriter) ApplyConfigMap(ctx context.Context, existing *corev1.ConfigMap, desired *corev1.ConfigMap) error { + if existing == nil { + return w.cl.Create(ctx, desired) + } + // Write desired fields onto the already-fetched existing object so the + // Update carries the same resourceVersion we read. A concurrent change + // will raise a conflict, which bubbles up to RetryOnConflict. 
+ mergeLabels(existing, desired.Labels) + mergeAnnotations(existing, desired.Annotations) + existing.Data = desired.Data + existing.BinaryData = desired.BinaryData + return w.cl.Update(ctx, existing) +} + +func (w *localCompanionWriter) ApplySecret(ctx context.Context, existing *corev1.Secret, desired *corev1.Secret) error { + if existing == nil { + return w.cl.Create(ctx, desired) + } + mergeLabels(existing, desired.Labels) + mergeAnnotations(existing, desired.Annotations) + existing.Data = desired.Data + existing.Type = desired.Type + return w.cl.Update(ctx, existing) +} + +func (w *localCompanionWriter) DeleteConfigMap(ctx context.Context, namespace, name string) error { + cm := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Namespace: namespace, Name: name}} + return client.IgnoreNotFound(w.cl.Delete(ctx, cm)) +} + +func (w *localCompanionWriter) DeleteSecret(ctx context.Context, namespace, name string) error { + s := &corev1.Secret{ObjectMeta: metav1.ObjectMeta{Namespace: namespace, Name: name}} + return client.IgnoreNotFound(w.cl.Delete(ctx, s)) +} + +func (w *localCompanionWriter) GetConfigMap(ctx context.Context, namespace, name string) (*corev1.ConfigMap, error) { + var cm corev1.ConfigMap + err := w.cl.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &cm) + if apierrors.IsNotFound(err) { + return nil, nil + } + if err != nil { + return nil, err + } + return &cm, nil +} + +func (w *localCompanionWriter) GetSecret(ctx context.Context, namespace, name string) (*corev1.Secret, error) { + var s corev1.Secret + err := w.cl.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &s) + if apierrors.IsNotFound(err) { + return nil, nil + } + if err != nil { + return nil, err + } + return &s, nil +} + +// downstreamCompanionWriter implements companionWriter by materialising +// companions into the `ns-{project-uid}` namespace on the Karmada hub using +// MappedNamespaceResourceStrategy. 
Companions written here are propagated to +// cells via the always-on referenced-data ResourceSelectors in the city-code +// PropagationPolicy. +// +// The downstreamNamespace field is pre-computed by the controller from the +// strategy so that every CRUD call uses the same stable name without needing +// to resolve it repeatedly. +type downstreamCompanionWriter struct { + // hubClient is a client.Client pointed at the Karmada federation control + // plane (the same client used by WorkloadDeploymentFederator). + hubClient client.Client + // downstreamNamespace is the resolved ns-{project-uid} name on the hub. + downstreamNamespace string +} + +func (w *downstreamCompanionWriter) ApplyConfigMap(ctx context.Context, existing *corev1.ConfigMap, desired *corev1.ConfigMap) error { + // Redirect the desired object into the downstream namespace. The caller + // already holds the existing object (from GetConfigMap, which redirects + // the lookup to the same downstream namespace), so no second GET is needed. + desired = desired.DeepCopy() + desired.Namespace = w.downstreamNamespace + + if existing == nil { + return w.hubClient.Create(ctx, desired) + } + // Merge controller-owned labels/annotations into the already-fetched existing + // object rather than replacing them wholesale. This preserves Karmada + // bookkeeping annotations that the federation hub stamps on propagated objects. + // Writing to existing (same resourceVersion) ensures a concurrent change raises + // a conflict that RetryOnConflict will catch. 
+ mergeLabels(existing, desired.Labels) + mergeAnnotations(existing, desired.Annotations) + existing.Data = desired.Data + existing.BinaryData = desired.BinaryData + return w.hubClient.Update(ctx, existing) +} + +func (w *downstreamCompanionWriter) ApplySecret(ctx context.Context, existing *corev1.Secret, desired *corev1.Secret) error { + desired = desired.DeepCopy() + desired.Namespace = w.downstreamNamespace + + if existing == nil { + return w.hubClient.Create(ctx, desired) + } + // Same merge semantics as ApplyConfigMap — preserve Karmada bookkeeping. + mergeLabels(existing, desired.Labels) + mergeAnnotations(existing, desired.Annotations) + existing.Data = desired.Data + existing.Type = desired.Type + return w.hubClient.Update(ctx, existing) +} + +func (w *downstreamCompanionWriter) DeleteConfigMap(ctx context.Context, _, name string) error { + cm := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Namespace: w.downstreamNamespace, Name: name}} + return client.IgnoreNotFound(w.hubClient.Delete(ctx, cm)) +} + +func (w *downstreamCompanionWriter) DeleteSecret(ctx context.Context, _, name string) error { + s := &corev1.Secret{ObjectMeta: metav1.ObjectMeta{Namespace: w.downstreamNamespace, Name: name}} + return client.IgnoreNotFound(w.hubClient.Delete(ctx, s)) +} + +func (w *downstreamCompanionWriter) GetConfigMap(ctx context.Context, _, name string) (*corev1.ConfigMap, error) { + var cm corev1.ConfigMap + err := w.hubClient.Get(ctx, types.NamespacedName{Namespace: w.downstreamNamespace, Name: name}, &cm) + if apierrors.IsNotFound(err) { + return nil, nil + } + if err != nil { + return nil, err + } + return &cm, nil +} + +func (w *downstreamCompanionWriter) GetSecret(ctx context.Context, _, name string) (*corev1.Secret, error) { + var s corev1.Secret + err := w.hubClient.Get(ctx, types.NamespacedName{Namespace: w.downstreamNamespace, Name: name}, &s) + if apierrors.IsNotFound(err) { + return nil, nil + } + if err != nil { + return nil, err + } + return &s, nil +} + 
+// ReferencedDataControllerOptions configures the ReferencedDataController.
+type ReferencedDataControllerOptions struct {
+	// Reader is used to read source ConfigMaps and Secrets from the project
+	// control plane. When nil, a LocalReader backed by the cluster client is
+	// used, which is appropriate for single-cluster and dev environments.
+	Reader referenceddata.ProjectConfigSecretReader
+
+	// FederationClient is a client pointed at the Karmada federation control
+	// plane (the same client used by WorkloadDeploymentFederator). When
+	// non-nil, companions are materialised into the downstream
+	// ns-{project-uid} namespace on the hub so that Karmada can propagate
+	// them to cells alongside the WorkloadDeployment. When nil, the
+	// single-cluster path is used and companions land in the project namespace.
+	FederationClient client.Client
+
+	// PerObjectLimitBytes is the maximum allowed byte size for a single
+	// companion object (sum of all Data + BinaryData values). A value <= 0
+	// selects defaultPerObjectLimitBytes (256 KiB).
+	PerObjectLimitBytes int64
+
+	// AggregateLimitBytes is the maximum allowed aggregate byte size across all
+	// companion objects for a single WorkloadDeployment. A value <= 0 selects
+	// defaultAggregateLimitBytes (1 MiB).
+	AggregateLimitBytes int64
+}
+
+// ReferencedDataController watches WorkloadDeployments and materialises
+// companion ConfigMaps/Secrets in the same namespace so that the cell
+// InstanceReconciler can gate-clear once the companions arrive.
+//
+// Reconcile flow (single-cluster, Phase 1):
+//  1. Collect the deduplicated set of ConfigMap/Secret refs from the WD template.
+//  2. If empty → release companions, clear any finalizer, remove the
+//     expected-set and terminal-error annotations, done.
+//  3. Stamp the finalizer if it is missing and return; the steps below run
+//     on a reconcile that finds the finalizer already present.
+//  4. Read each source via the ProjectConfigSecretReader (falling back to a
+//     LocalReader when none is configured).
+//  5. Enforce per-object (256 KiB) and aggregate (1 MiB) size limits. On
+//     breach set ReferencedDataReady=False/SourceTooLarge and return.
+//  6. Materialise one shared companion per (kind, source-name) in the WD's
+//     namespace using a companionWriter. Track referencing WDs in a companion
+//     annotation (ref-count).
+//  7. Delete companions that are no longer referenced by this WD (and have no
+//     other referencing WDs).
+//  8. Stamp the expected-set annotation on the WD (a JSON array of
+//     kind-qualified "Kind/name" companion tokens).
+//  9. Set ReferencedDataReady=True/Ready on the WD status.
+// 10. Clear the terminal-error annotation left by an earlier failure, if any.
+//
+// Rotation: watches source ConfigMaps/Secrets and re-queues referencing WDs so
+// companions are refreshed when sources change.
+//
+// Deletion: the finalizer prevents WD deletion until companions this WD owns
+// have been released (ref-count decremented / companion deleted).
+
+// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;update;patch
+// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update
+// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;update;patch;delete
+
+// sourceResult pairs an ObjectRef with its resolved source object. Exactly one
+// of cm or secret is non-nil, depending on ref.Kind.
+type sourceResult struct {
+	ref    referenceddata.ObjectRef
+	cm     *corev1.ConfigMap
+	secret *corev1.Secret
+}
+
+// ReferencedDataController reconciles WorkloadDeployments whose template
+// references ConfigMaps or Secrets: it materialises companion objects through
+// a companionWriter, records the expected companion set on the WD, and
+// reports the outcome in the ReferencedDataReady condition.
+type ReferencedDataController struct {
+	mgr  clusterGetter
+	opts ReferencedDataControllerOptions
+}
+
+// Reconcile brings the companion ConfigMaps/Secrets of one WorkloadDeployment
+// in line with its template.
+//
+// A WD that no longer exists is ignored. A WD being deleted, or one whose
+// template has no references, is handed to reconcileDeleted / reconcileEmpty.
+// Otherwise the finalizer is ensured (returning right after adding it), the
+// sources are resolved and size-checked, companions are materialised and stale
+// ones released, and finally the expected-set annotation, the
+// ReferencedDataReady=True condition and the terminal-error annotation
+// clean-up are written back to the WD.
+//
+// A source problem reported by resolveAndValidateSources is recorded on the WD
+// through setConditionAndReturn, whose error (if any) is returned. A write to
+// the WD that still conflicts after retry.DefaultRetry is not reported as an
+// error; the request is requeued so the write-back is attempted again.
+func (r *ReferencedDataController) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	cl, err := r.mgr.GetCluster(ctx, req.ClusterName)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+
+	ctx = mccontext.WithCluster(ctx, req.ClusterName)
+
+	var wd computev1alpha.WorkloadDeployment
+	if err := cl.GetClient().Get(ctx, req.NamespacedName, &wd); err != nil {
+		if apierrors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("referenceddata: get WorkloadDeployment: %w", err)
+	}
+
+	logger.Info("reconciling referenced data", "workloaddeployment", req.NamespacedName)
+	defer logger.Info("reconcile complete")
+
+	writer, err := r.writerFor(ctx, string(req.ClusterName), cl.GetClient(), &wd)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("referenceddata: build companion writer: %w", err)
+	}
+	reader := r.readerFor(cl.GetClient())
+
+	// Handle deletion first: release companions this WD holds a reference to.
+	if !wd.DeletionTimestamp.IsZero() {
+		return ctrl.Result{}, r.reconcileDeleted(ctx, cl.GetClient(), writer, &wd)
+	}
+
+	refs := referenceddata.CollectFromTemplate(wd.Namespace, wd.Spec.Template)
+	if len(refs) == 0 {
+		return ctrl.Result{}, r.reconcileEmpty(ctx, cl.GetClient(), writer, &wd)
+	}
+
+	// Stamp finalizer so we can clean up companions on WD deletion.
+	// Use RetryOnConflict so that a concurrent federator finalizer update on the
+	// same object does not produce a noisy optimistic-lock error — we simply
+	// re-read and re-apply our own finalizer addition.
+	if !controllerutil.ContainsFinalizer(&wd, referencedDataFinalizer) {
+		if err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+			if err := cl.GetClient().Get(ctx, req.NamespacedName, &wd); err != nil {
+				return err
+			}
+			if controllerutil.ContainsFinalizer(&wd, referencedDataFinalizer) {
+				return nil // already present from a previous attempt
+			}
+			controllerutil.AddFinalizer(&wd, referencedDataFinalizer)
+			return cl.GetClient().Update(ctx, &wd)
+		}); err != nil {
+			return ctrl.Result{}, fmt.Errorf("referenceddata: add finalizer: %w", err)
+		}
+		return ctrl.Result{}, nil
+	}
+
+	// Read each source, enforcing size limits.
+	// Cluster name = project ID in Milo mode; ignored by LocalReader.
+	sources, condErr := r.resolveAndValidateSources(ctx, reader, string(req.ClusterName), refs, wd.Spec.Template)
+	if condErr != nil {
+		// A condition error signals a transient or permanent source problem —
+		// surface it on the WD and return without requeueing (source watch re-triggers).
+		return ctrl.Result{}, r.setConditionAndReturn(ctx, cl.GetClient(), &wd, condErr.reason, condErr.message)
+	}
+
+	// expectedTokens is a kind-qualified JSON array written to the
+	// expected-referenced-data annotation, e.g. ["ConfigMap/app-config","Secret/db-creds"].
+	// Using "Kind/name" tokens lets the cell disambiguate which companion is a
+	// ConfigMap and which is a Secret without probing both resource types.
+	//
+	// expectedNames is the flat list of companion object names (source names) used
+	// for ref-count bookkeeping and release. Both lists are built in one pass so
+	// they always stay index-aligned with sources.
+	expectedTokens := make([]string, 0, len(sources))
+	expectedNames := make([]string, 0, len(sources))
+	for _, src := range sources {
+		name := referenceddata.CompanionNameForRef(src.ref)
+		expectedTokens = append(expectedTokens, referenceddata.CompanionToken(src.ref.Kind, name))
+		expectedNames = append(expectedNames, name)
+	}
+
+	// wdKey is used both as the ref-count entry key (string form) and for GET/Patch retries.
+	wdNSN := types.NamespacedName{Namespace: wd.Namespace, Name: wd.Name}
+	wdKey := wdNSN.String()
+
+	if err := r.materialiseCompanions(ctx, writer, wd.Namespace, wdKey, sources); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	// Drop companions previously owned by this WD that are no longer in the desired set.
+	if err := r.releaseRemovedCompanions(ctx, cl.GetClient(), writer, &wd, expectedNames); err != nil {
+		return ctrl.Result{}, fmt.Errorf("referenceddata: release removed companions: %w", err)
+	}
+
+	// Stamp the expected-set annotation on the WD so the cell can gate-clear.
+	// The annotation is a kind-qualified JSON array, e.g.
+	// ["ConfigMap/app-config","Secret/db-creds"], so the cell can match
+	// companions by kind without probing both resource types.
+	// Wrap in RetryOnConflict: the federator may concurrently update the same WD,
+	// producing a conflict on the Patch. The annotation write is idempotent, so
+	// retrying with a fresh GET is safe.
+	annoVal, err := json.Marshal(expectedTokens)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("referenceddata: marshal expected-referenced-data annotation: %w", err)
+	}
+	if err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+		if err := cl.GetClient().Get(ctx, wdNSN, &wd); err != nil {
+			return err
+		}
+		patch := client.MergeFrom(wd.DeepCopy())
+		if wd.Annotations == nil {
+			wd.Annotations = make(map[string]string)
+		}
+		wd.Annotations[computev1alpha.ExpectedReferencedDataAnnotation] = string(annoVal)
+		return cl.GetClient().Patch(ctx, &wd, patch)
+	}); err != nil {
+		if apierrors.IsConflict(err) {
+			// Still conflicting after retries. An empty Result with a nil error
+			// would drop the request, so ask for an explicit requeue instead of
+			// surfacing a noisy error.
+			return ctrl.Result{Requeue: true}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("referenceddata: patch expected-referenced-data annotation: %w", err)
+	}
+
+	// Update ReferencedDataReady=True on the WD status.
+	// Tolerate conflict: the federator may have updated the WD status concurrently.
+	if err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+		if err := cl.GetClient().Get(ctx, wdNSN, &wd); err != nil {
+			return err
+		}
+		changed := apimeta.SetStatusCondition(&wd.Status.Conditions, metav1.Condition{
+			Type:               computev1alpha.ReferencedDataReady,
+			Status:             metav1.ConditionTrue,
+			Reason:             computev1alpha.ReferencedDataReasonReady,
+			Message:            fmt.Sprintf("All %d referenced companion(s) are materialised", len(expectedTokens)),
+			ObservedGeneration: wd.Generation,
+		})
+		if !changed {
+			return nil
+		}
+		return cl.GetClient().Status().Update(ctx, &wd)
+	}); err != nil {
+		if apierrors.IsConflict(err) {
+			// Same as above: requeue so the condition (and the annotation
+			// clean-up that follows it) is retried rather than silently skipped.
+			return ctrl.Result{Requeue: true}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("referenceddata: update WD status (ready): %w", err)
+	}
+
+	// Clear the terminal-error annotation now that all companions materialised.
+	// This is idempotent: if the annotation is absent no Patch is issued.
+	if err := r.clearTerminalErrorAnnotation(ctx, cl.GetClient(), &wd); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	return ctrl.Result{}, nil
+}
+
+// reconcileDeleted handles a WorkloadDeployment that is being deleted by
+// releasing its companion references and removing the finalizer.
+func (r *ReferencedDataController) reconcileDeleted(
+	ctx context.Context,
+	c client.Client,
+	writer companionWriter,
+	wd *computev1alpha.WorkloadDeployment,
+) error {
+	// Nothing to do unless our finalizer is still on the object.
+	if !controllerutil.ContainsFinalizer(wd, referencedDataFinalizer) {
+		return nil
+	}
+
+	if releaseErr := r.releaseCompanions(ctx, c, writer, wd); releaseErr != nil {
+		return fmt.Errorf("referenceddata: release companions on deletion: %w", releaseErr)
+	}
+
+	// Every attempt re-reads the WD, so a concurrent finalizer update (e.g. by
+	// the federator) costs a retry instead of an optimistic-lock error.
+	nsn := types.NamespacedName{Namespace: wd.Namespace, Name: wd.Name}
+	dropFinalizer := func() error {
+		if getErr := c.Get(ctx, nsn, wd); getErr != nil {
+			return getErr
+		}
+		if controllerutil.ContainsFinalizer(wd, referencedDataFinalizer) {
+			controllerutil.RemoveFinalizer(wd, referencedDataFinalizer)
+			return c.Update(ctx, wd)
+		}
+		return nil // finalizer already gone, e.g. removed by an earlier attempt
+	}
+	if retryErr := retry.RetryOnConflict(retry.DefaultRetry, dropFinalizer); retryErr != nil {
+		return fmt.Errorf("referenceddata: remove finalizer: %w", retryErr)
+	}
+	return nil
+}
+
+// reconcileEmpty cleans up after a WorkloadDeployment whose template has
+// stopped referencing ConfigMaps and Secrets: it releases the companions and
+// drops the finalizer (when the finalizer is present), removes the
+// expected-referenced-data annotation, and clears the terminal-error
+// annotation.
+func (r *ReferencedDataController) reconcileEmpty(
+	ctx context.Context,
+	c client.Client,
+	writer companionWriter,
+	wd *computev1alpha.WorkloadDeployment,
+) error {
+	if controllerutil.ContainsFinalizer(wd, referencedDataFinalizer) {
+		if releaseErr := r.releaseCompanions(ctx, c, writer, wd); releaseErr != nil {
+			return fmt.Errorf("referenceddata: release companions (empty refs): %w", releaseErr)
+		}
+
+		// Re-read on every attempt so a concurrent finalizer update by the
+		// federator only triggers a retry.
+		nsn := types.NamespacedName{Namespace: wd.Namespace, Name: wd.Name}
+		dropFinalizer := func() error {
+			if getErr := c.Get(ctx, nsn, wd); getErr != nil {
+				return getErr
+			}
+			if controllerutil.ContainsFinalizer(wd, referencedDataFinalizer) {
+				controllerutil.RemoveFinalizer(wd, referencedDataFinalizer)
+				return c.Update(ctx, wd)
+			}
+			return nil // finalizer already gone, e.g. removed by an earlier attempt
+		}
+		if retryErr := retry.RetryOnConflict(retry.DefaultRetry, dropFinalizer); retryErr != nil {
+			return fmt.Errorf("referenceddata: remove finalizer (empty refs): %w", retryErr)
+		}
+	}
+
+	// The expected-set annotation is only patched away when it is present.
+	if _, stamped := wd.Annotations[computev1alpha.ExpectedReferencedDataAnnotation]; stamped {
+		base := client.MergeFrom(wd.DeepCopy())
+		delete(wd.Annotations, computev1alpha.ExpectedReferencedDataAnnotation)
+		if patchErr := c.Patch(ctx, wd, base); patchErr != nil {
+			return fmt.Errorf("referenceddata: remove annotation: %w", patchErr)
+		}
+	}
+
+	// With no references left in the template there is nothing that can be in
+	// error, so a terminal-error annotation from an earlier cycle is cleared too.
+	return r.clearTerminalErrorAnnotation(ctx, c, wd)
+}
+
+// conditionError carries the reason and message for a False
+// ReferencedDataReady condition. Returning it as its own type (rather than an
+// error) lets the caller tell a condition to record apart from a reconcile
+// failure.
+type conditionError struct {
+	reason  string
+	message string
+}
+
+// resolveAndValidateSources reads each source ConfigMap/Secret via the reader,
+// enforces per-object and aggregate size limits, and returns the resolved set.
+// On the first validation failure it returns a conditionError; the caller
+// surfaces this as a False condition on the WD.
+//
+// Optional sources: when a source has optional=true in the WD template and is
+// missing (SourceNotFound), unauthorized (SourceUnauthorized), or larger than
+// the per-object limit, it is silently skipped rather than aborting the whole
+// WD, which then proceeds with the remaining companions. A breach of the
+// aggregate limit is never skipped, whichever source pushes the total over.
+func (r *ReferencedDataController) resolveAndValidateSources( + ctx context.Context, + reader referenceddata.ProjectConfigSecretReader, + projectID string, + refs referenceddata.ReferencedSet, + tmpl computev1alpha.InstanceTemplateSpec, +) ([]sourceResult, *conditionError) { + perObjLimit := r.opts.PerObjectLimitBytes + if perObjLimit <= 0 { + perObjLimit = defaultPerObjectLimitBytes + } + aggLimit := r.opts.AggregateLimitBytes + if aggLimit <= 0 { + aggLimit = defaultAggregateLimitBytes + } + + sources := make([]sourceResult, 0, len(refs)) + var aggregateBytes int64 + + for _, ref := range refs { + optional := isOptionalRef(ref, tmpl) + src, sz, cerr := r.resolveOneSource(ctx, reader, projectID, ref) + if cerr != nil { + // Skip optional sources that are missing or unauthorized. + if optional && (cerr.reason == computev1alpha.ReferencedDataReasonSourceNotFound || + cerr.reason == computev1alpha.ReferencedDataReasonSourceUnauthorized) { + continue + } + return nil, cerr + } + if sz > perObjLimit { + if optional { + continue // skip oversized optional sources + } + return nil, &conditionError{ + reason: computev1alpha.ReferencedDataReasonSourceTooLarge, + message: fmt.Sprintf("%s %q in namespace %q exceeds per-object size limit (%d bytes > %d bytes)", ref.Kind, ref.Name, ref.Namespace, sz, perObjLimit), + } + } + aggregateBytes += sz + if aggregateBytes > aggLimit { + return nil, &conditionError{ + reason: computev1alpha.ReferencedDataReasonSourceTooLarge, + message: fmt.Sprintf("aggregate referenced data for WorkloadDeployment exceeds limit (%d bytes > %d bytes)", aggregateBytes, aggLimit), + } + } + sources = append(sources, *src) + } + return sources, nil +} + +// isOptionalRef returns true when the given ObjectRef corresponds to a source +// that was marked optional=true anywhere in the instance template spec. +// It checks all volume mounts and env/envFrom sources that match (kind, name). 
+func isOptionalRef(ref referenceddata.ObjectRef, tmpl computev1alpha.InstanceTemplateSpec) bool { + if isOptionalInVolumes(ref, tmpl.Spec.Volumes) { + return true + } + if sb := tmpl.Spec.Runtime.Sandbox; sb != nil { + return isOptionalInContainers(ref, sb.Containers) + } + return false +} + +// isOptionalInVolumes returns true when ref is an optional volume source in volumes. +func isOptionalInVolumes(ref referenceddata.ObjectRef, volumes []computev1alpha.InstanceVolume) bool { + boolTrue := func(b *bool) bool { return b != nil && *b } + for _, v := range volumes { + switch ref.Kind { + case kindConfigMap: + if v.ConfigMap != nil && v.ConfigMap.Name == ref.Name && boolTrue(v.ConfigMap.Optional) { + return true + } + case kindSecret: + if v.Secret != nil && v.Secret.SecretName == ref.Name && boolTrue(v.Secret.Optional) { + return true + } + } + } + return false +} + +// isOptionalInContainers returns true when ref is optional in any container's +// env or envFrom sources. +func isOptionalInContainers(ref referenceddata.ObjectRef, containers []computev1alpha.SandboxContainer) bool { + boolTrue := func(b *bool) bool { return b != nil && *b } + for _, c := range containers { + for _, ef := range c.EnvFrom { + switch ref.Kind { + case kindConfigMap: + if ef.ConfigMapRef != nil && ef.ConfigMapRef.Name == ref.Name && boolTrue(ef.ConfigMapRef.Optional) { + return true + } + case kindSecret: + if ef.SecretRef != nil && ef.SecretRef.Name == ref.Name && boolTrue(ef.SecretRef.Optional) { + return true + } + } + } + for _, e := range c.Env { + if e.ValueFrom == nil { + continue + } + switch ref.Kind { + case kindConfigMap: + if e.ValueFrom.ConfigMapKeyRef != nil && e.ValueFrom.ConfigMapKeyRef.Name == ref.Name && boolTrue(e.ValueFrom.ConfigMapKeyRef.Optional) { + return true + } + case kindSecret: + if e.ValueFrom.SecretKeyRef != nil && e.ValueFrom.SecretKeyRef.Name == ref.Name && boolTrue(e.ValueFrom.SecretKeyRef.Optional) { + return true + } + } + } + } + return false +} + 
+// resolveOneSource reads a single ConfigMap or Secret from the project. It +// returns the sourceResult, its byte size, and any condition error. +// A (nil, nil) return from the reader is treated as SourceNotFound. +func (r *ReferencedDataController) resolveOneSource( + ctx context.Context, + reader referenceddata.ProjectConfigSecretReader, + projectID string, + ref referenceddata.ObjectRef, +) (*sourceResult, int64, *conditionError) { + switch ref.Kind { + case kindConfigMap: + cm, err := reader.GetConfigMap(ctx, projectID, ref.Namespace, ref.Name) + if err != nil { + reason, msg := classifyReaderError(err, ref) + return nil, 0, &conditionError{reason: reason, message: msg} + } + if cm == nil { + return nil, 0, &conditionError{ + reason: computev1alpha.ReferencedDataReasonSourceNotFound, + message: fmt.Sprintf("ConfigMap %q not found in namespace %q", ref.Name, ref.Namespace), + } + } + return &sourceResult{ref: ref, cm: cm}, configMapSize(cm), nil + + case kindSecret: + secret, err := reader.GetSecret(ctx, projectID, ref.Namespace, ref.Name) + if err != nil { + reason, msg := classifyReaderError(err, ref) + return nil, 0, &conditionError{reason: reason, message: msg} + } + if secret == nil { + return nil, 0, &conditionError{ + reason: computev1alpha.ReferencedDataReasonSourceNotFound, + message: fmt.Sprintf("Secret %q not found in namespace %q", ref.Name, ref.Namespace), + } + } + return &sourceResult{ref: ref, secret: secret}, secretSize(secret), nil + + default: + // Unreachable: CollectFromTemplate only emits ConfigMap/Secret. + return nil, 0, nil + } +} + +// materialiseCompanions creates or updates companion objects for all resolved +// sources, updating ref-count annotations as it goes. 
+func (r *ReferencedDataController) materialiseCompanions( + ctx context.Context, + writer companionWriter, + namespace, wdKey string, + sources []sourceResult, +) error { + for _, src := range sources { + companionName := referenceddata.CompanionNameForRef(src.ref) + if err := r.materialiseOne(ctx, writer, namespace, companionName, wdKey, src); err != nil { + return err + } + } + return nil +} + +// materialiseOne applies a single companion ConfigMap or Secret. +// +// The ref-count annotation is read-modify-written atomically inside a single +// RetryOnConflict loop. The same GET result that the ref-count is computed from +// is passed directly to ApplyConfigMap/ApplySecret, which must NOT issue a +// second independent GET. This ensures the Update carries the same +// resourceVersion that was used to compute the ref-count, so a concurrent WD +// that commits its own ref-count entry between the outer GET and the Update +// will cause a conflict error that RetryOnConflict re-reads and retries from. +func (r *ReferencedDataController) materialiseOne( + ctx context.Context, + writer companionWriter, + namespace, companionName, wdKey string, + src sourceResult, +) error { + switch src.ref.Kind { + case kindConfigMap: + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + // Single GET: the ref-count is computed from this exact snapshot and + // the same snapshot is passed to ApplyConfigMap so the Update targets + // the same resourceVersion. A concurrent ref-count write will conflict. 
+ existing, err := writer.GetConfigMap(ctx, namespace, companionName) + if err != nil { + return fmt.Errorf("referenceddata: get companion ConfigMap %q: %w", companionName, err) + } + var existingAnnots map[string]string + if existing != nil { + existingAnnots = existing.Annotations + } + refs, err := refCountAdd(existingAnnots, wdKey) + if err != nil { + return fmt.Errorf("referenceddata: ref-count add for ConfigMap %q: %w", companionName, err) + } + // Guard: src.cm may be nil if the reader returned (nil, nil). + if src.cm == nil { + return fmt.Errorf("referenceddata: source ConfigMap for companion %q is nil", companionName) + } + desired := buildCompanionConfigMap(namespace, companionName, src.cm, refs) + // Pass existing (the same GET snapshot) so ApplyConfigMap can Update + // it without a second GET, preserving the atomicity guarantee. + if err := writer.ApplyConfigMap(ctx, existing, desired); err != nil { + return fmt.Errorf("referenceddata: apply companion ConfigMap %q: %w", companionName, err) + } + return nil + }) + + case kindSecret: + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + existing, err := writer.GetSecret(ctx, namespace, companionName) + if err != nil { + return fmt.Errorf("referenceddata: get companion Secret %q: %w", companionName, err) + } + var existingAnnots map[string]string + if existing != nil { + existingAnnots = existing.Annotations + } + refs, err := refCountAdd(existingAnnots, wdKey) + if err != nil { + return fmt.Errorf("referenceddata: ref-count add for Secret %q: %w", companionName, err) + } + // Guard: src.secret may be nil if the reader returned (nil, nil). 
+ if src.secret == nil { + return fmt.Errorf("referenceddata: source Secret for companion %q is nil", companionName) + } + desired := buildCompanionSecret(namespace, companionName, src.secret, refs) + if err := writer.ApplySecret(ctx, existing, desired); err != nil { + return fmt.Errorf("referenceddata: apply companion Secret %q: %w", companionName, err) + } + return nil + }) + } + return nil +} + +// setConditionAndReturn updates the ReferencedDataReady condition on the WD +// with the given (False) reason and message and returns nil so the controller +// reconcile returns (no requeue error). The WD will be re-triggered by the +// source watch when the source changes. +// +// When reason is a terminal error (SourceNotFound, SourceUnauthorized, +// SourceTooLarge), this also stamps the ReferencedDataErrorAnnotation on the WD +// metadata. This annotation propagates hub→cell via Karmada so the cell +// InstanceReconciler can surface the terminal error without needing to read hub +// status conditions (which do not propagate in that direction). For non-terminal +// reasons (e.g. Resolving) the annotation is cleared to avoid stale state. +func (r *ReferencedDataController) setConditionAndReturn( + ctx context.Context, + c client.Client, + wd *computev1alpha.WorkloadDeployment, + reason, message string, +) error { + apimeta.SetStatusCondition(&wd.Status.Conditions, metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: reason, + Message: message, + ObservedGeneration: wd.Generation, + }) + if err := c.Status().Update(ctx, wd); err != nil { + return fmt.Errorf("referenceddata: update WD status (%s): %w", reason, err) + } + + if isTerminalReferencedDataReason(reason) { + if err := r.stampTerminalErrorAnnotation(ctx, c, wd, reason, message); err != nil { + return err + } + } else { + // Non-terminal errors (e.g. Resolving) should not leave a stale terminal + // annotation from a previous cycle. 
+ if err := r.clearTerminalErrorAnnotation(ctx, c, wd); err != nil { + return err + } + } + return nil +} + +// stampTerminalErrorAnnotation writes the ReferencedDataErrorAnnotation on the +// WD metadata with the given reason and message. It is idempotent: if the +// annotation already carries the same JSON it does not issue a Patch. +func (r *ReferencedDataController) stampTerminalErrorAnnotation( + ctx context.Context, + c client.Client, + wd *computev1alpha.WorkloadDeployment, + reason, message string, +) error { + desired, err := encodeTerminalError(reason, message) + if err != nil { + return fmt.Errorf("referenceddata: encode terminal error annotation: %w", err) + } + if wd.Annotations[computev1alpha.ReferencedDataErrorAnnotation] == desired { + return nil // already up to date; skip the Patch + } + patch := client.MergeFrom(wd.DeepCopy()) + if wd.Annotations == nil { + wd.Annotations = make(map[string]string) + } + wd.Annotations[computev1alpha.ReferencedDataErrorAnnotation] = desired + if err := c.Patch(ctx, wd, patch); err != nil { + return fmt.Errorf("referenceddata: patch terminal error annotation: %w", err) + } + return nil +} + +// clearTerminalErrorAnnotation removes the ReferencedDataErrorAnnotation from +// the WD metadata when the error has resolved. It is idempotent: if the +// annotation is absent it does not issue a Patch. 
+func (r *ReferencedDataController) clearTerminalErrorAnnotation( + ctx context.Context, + c client.Client, + wd *computev1alpha.WorkloadDeployment, +) error { + if _, ok := wd.Annotations[computev1alpha.ReferencedDataErrorAnnotation]; !ok { + return nil // not present; nothing to do + } + patch := client.MergeFrom(wd.DeepCopy()) + delete(wd.Annotations, computev1alpha.ReferencedDataErrorAnnotation) + if err := c.Patch(ctx, wd, patch); err != nil { + return fmt.Errorf("referenceddata: clear terminal error annotation: %w", err) + } + return nil +} + +// terminalErrorPayload is the JSON shape written to ReferencedDataErrorAnnotation. +type terminalErrorPayload struct { + Reason string `json:"reason"` + Message string `json:"message"` +} + +// encodeTerminalError marshals reason+message into the annotation value string. +func encodeTerminalError(reason, message string) (string, error) { + b, err := json.Marshal(terminalErrorPayload{Reason: reason, Message: message}) + if err != nil { + return "", err + } + return string(b), nil +} + +// decodeTerminalError parses a ReferencedDataErrorAnnotation value. Returns +// ("", "", nil) when the annotation value is empty or absent. +func decodeTerminalError(raw string) (reason, message string, err error) { + if raw == "" { + return "", "", nil + } + var p terminalErrorPayload + if err := json.Unmarshal([]byte(raw), &p); err != nil { + return "", "", fmt.Errorf("referenceddata: parse terminal error annotation %q: %w", raw, err) + } + return p.Reason, p.Message, nil +} + +// classifyReaderError maps a ProjectConfigSecretReader error to a +// (reason, message) pair suitable for the ReferencedDataReady condition. 
+func classifyReaderError(err error, ref referenceddata.ObjectRef) (reason, message string) { + switch { + case errors.Is(err, referenceddata.ErrSourceNotFound): + return computev1alpha.ReferencedDataReasonSourceNotFound, + fmt.Sprintf("%s %q not found in namespace %q", ref.Kind, ref.Name, ref.Namespace) + case errors.Is(err, referenceddata.ErrSourceUnauthorized): + return computev1alpha.ReferencedDataReasonSourceUnauthorized, + fmt.Sprintf("not authorized to read %s %q in namespace %q", ref.Kind, ref.Name, ref.Namespace) + default: + return computev1alpha.ReferencedDataReasonResolving, + fmt.Sprintf("failed to read %s %q: %v", ref.Kind, ref.Name, err) + } +} + +// parseAnnotationTokens parses the expected-referenced-data annotation value +// (a kind-qualified JSON array such as ["ConfigMap/app-config","Secret/db-creds"]) +// and returns a slice of (kind, companionName) pairs. +// +// For backwards-compatibility, tokens that do not contain a '/' are treated as +// plain companion names with an unknown kind (both ConfigMap and Secret will be +// probed during release). +func parseAnnotationTokens(raw string) ([]struct{ kind, name string }, error) { + var tokens []string + if err := json.Unmarshal([]byte(raw), &tokens); err != nil { + return nil, err + } + out := make([]struct{ kind, name string }, 0, len(tokens)) + for _, tok := range tokens { + if idx := strings.Index(tok, "/"); idx >= 0 { + out = append(out, struct{ kind, name string }{tok[:idx], tok[idx+1:]}) + } else { + // Legacy plain name — kind unknown, will probe both types. + out = append(out, struct{ kind, name string }{"", tok}) + } + } + return out, nil +} + +// releaseCompanions removes this WD's entry from all companion objects it owns. +// Companions with an empty ref-count after removal are deleted. 
+func (r *ReferencedDataController) releaseCompanions( + ctx context.Context, + c client.Client, + writer companionWriter, + wd *computev1alpha.WorkloadDeployment, +) error { + // Determine which companions this WD currently claims from the annotation. + anno, ok := wd.Annotations[computev1alpha.ExpectedReferencedDataAnnotation] + if !ok || anno == "" { + return nil + } + entries, err := parseAnnotationTokens(anno) + if err != nil || len(entries) == 0 { + // Annotation is malformed or empty; nothing to release. + return nil + } + + wdKey := types.NamespacedName{Namespace: wd.Namespace, Name: wd.Name}.String() + + for _, entry := range entries { + if err := r.releaseOneCompanion(ctx, c, writer, wd.Namespace, entry.kind, entry.name, wdKey); err != nil { + return err + } + } + return nil +} + +// releaseRemovedCompanions removes this WD's ref-count entry from companions +// that were previously expected but are no longer in the current desired set. +// currentNames is the flat list of current companion object names (source names). +func (r *ReferencedDataController) releaseRemovedCompanions( + ctx context.Context, + c client.Client, + writer companionWriter, + wd *computev1alpha.WorkloadDeployment, + currentNames []string, +) error { + anno, ok := wd.Annotations[computev1alpha.ExpectedReferencedDataAnnotation] + if !ok || anno == "" { + return nil + } + prevEntries, err := parseAnnotationTokens(anno) + if err != nil || len(prevEntries) == 0 { + return nil + } + + wdKey := types.NamespacedName{Namespace: wd.Namespace, Name: wd.Name}.String() + + for _, entry := range prevEntries { + if slices.Contains(currentNames, entry.name) { + continue + } + if err := r.releaseOneCompanion(ctx, c, writer, wd.Namespace, entry.kind, entry.name, wdKey); err != nil { + return err + } + } + return nil +} + +// releaseOneCompanion removes wdKey from the ref-count annotation of the named +// companion. 
When kind is known ("ConfigMap" or "Secret") only that resource type +// is probed; when kind is empty (legacy annotation without kind qualification) +// both ConfigMap and Secret are probed. +// +// The read-modify-write is wrapped in RetryOnConflict: two WDs concurrently +// releasing the same companion will each re-read and update safely. The GET +// and the subsequent Update target the same resourceVersion so a concurrent +// change causes a conflict that drives a re-read and retry. +// +// If the ref-count annotation is unparseable the whole call returns an error +// (transient). The companion is NOT deleted in that case — it may still be +// referenced by other WDs whose entries are recorded in the corrupt annotation. +func (r *ReferencedDataController) releaseOneCompanion( + ctx context.Context, + _ client.Client, + writer companionWriter, + namespace, kind, companionName, wdKey string, +) error { + releaseConfigMap := kind == kindConfigMap || kind == "" + releaseSecret := kind == kindSecret || kind == "" + + // Try ConfigMap. + var cmExists bool + if releaseConfigMap { + if err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + cm, err := writer.GetConfigMap(ctx, namespace, companionName) + if err != nil { + return fmt.Errorf("get companion ConfigMap %q: %w", companionName, err) + } + if cm == nil { + return nil // not a ConfigMap companion + } + cmExists = true + remaining, err := refCountRemove(cm.Annotations, wdKey) + if err != nil { + // Annotation is corrupt — treat as transient to avoid unsafe deletion. + return fmt.Errorf("referenceddata: corrupt ref-count on ConfigMap %q: %w", companionName, err) + } + if len(remaining) == 0 { + return writer.DeleteConfigMap(ctx, namespace, companionName) + } + // Build a desired object that carries the updated ref-count. Pass the + // already-fetched cm as existing so ApplyConfigMap updates it at the + // same resourceVersion — a concurrent write will conflict and retry. 
+ desired := cm.DeepCopy() + if desired.Annotations == nil { + desired.Annotations = make(map[string]string) + } + desired.Annotations[companionRefCountAnnotation] = encodeRefCount(remaining) + return writer.ApplyConfigMap(ctx, cm, desired) + }); err != nil { + return err + } + } + if cmExists { + return nil + } + + if !releaseSecret { + return nil + } + + // Try Secret. + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + s, err := writer.GetSecret(ctx, namespace, companionName) + if err != nil { + return fmt.Errorf("get companion Secret %q: %w", companionName, err) + } + if s == nil { + return nil + } + remaining, err := refCountRemove(s.Annotations, wdKey) + if err != nil { + return fmt.Errorf("referenceddata: corrupt ref-count on Secret %q: %w", companionName, err) + } + if len(remaining) == 0 { + return writer.DeleteSecret(ctx, namespace, companionName) + } + desired := s.DeepCopy() + if desired.Annotations == nil { + desired.Annotations = make(map[string]string) + } + desired.Annotations[companionRefCountAnnotation] = encodeRefCount(remaining) + return writer.ApplySecret(ctx, s, desired) + }) +} + +// writerFor returns the companionWriter appropriate for the current mode. +// +// When a FederationClient is configured (management-plane federation mode), +// it returns a downstreamCompanionWriter that materialises companions into the +// ns-{project-uid} namespace on the Karmada hub so they are propagated to +// cells alongside the WorkloadDeployment. +// +// When FederationClient is nil (single-cluster / dev mode), it falls back to a +// localCompanionWriter that writes companions into the same cluster and +// namespace as the WorkloadDeployment. 
+func (r *ReferencedDataController) writerFor( + ctx context.Context, + clusterName string, + projectClient client.Client, + wd *computev1alpha.WorkloadDeployment, +) (companionWriter, error) { + if r.opts.FederationClient == nil { + return &localCompanionWriter{cl: projectClient}, nil + } + + // Compute the downstream namespace using the same MappedNamespaceResourceStrategy + // the WorkloadDeploymentFederator uses, so companions land in the same + // ns-{project-uid} namespace as the federated WorkloadDeployment. + strategy := downstreamclient.NewMappedNamespaceResourceStrategy(clusterName, projectClient, r.opts.FederationClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, wd.Namespace) + if err != nil { + return nil, fmt.Errorf("resolve downstream namespace: %w", err) + } + + return &downstreamCompanionWriter{ + hubClient: r.opts.FederationClient, + downstreamNamespace: downstreamNS, + }, nil +} + +// readerFor returns the ProjectConfigSecretReader to use. When the controller +// was constructed without a reader (nil), it falls back to a LocalReader that +// reads from the same cluster client — appropriate for single-cluster / dev. +func (r *ReferencedDataController) readerFor(c client.Client) referenceddata.ProjectConfigSecretReader { + if r.opts.Reader != nil { + return r.opts.Reader + } + return referenceddata.NewLocalReader(c) +} + +// buildCompanionConfigMap constructs the companion ConfigMap object. It copies +// Data and BinaryData from the source, stamps the referenced-data label, and +// encodes the ref-count annotation. 
+func buildCompanionConfigMap(namespace, name string, src *corev1.ConfigMap, refs []string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + Annotations: map[string]string{ + companionRefCountAnnotation: encodeRefCount(refs), + }, + }, + Data: src.Data, + BinaryData: src.BinaryData, + } +} + +// buildCompanionSecret constructs the companion Secret object. It copies Data +// and Type from the source, stamps the referenced-data label, and encodes the +// ref-count annotation. +func buildCompanionSecret(namespace, name string, src *corev1.Secret, refs []string) *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + Labels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + Annotations: map[string]string{ + companionRefCountAnnotation: encodeRefCount(refs), + }, + }, + Data: src.Data, + Type: src.Type, + } +} + +// refCountAdd returns the sorted, deduplicated slice of WD keys after adding +// wdKey. annotations may be nil (companion does not yet exist). +// Returns an error when the existing ref-count annotation cannot be parsed — +// the caller must treat this as a transient error and NOT delete the companion. +func refCountAdd(annotations map[string]string, wdKey string) ([]string, error) { + current, err := decodeRefCount(annotations) + if err != nil { + return nil, err + } + if slices.Contains(current, wdKey) { + return current, nil + } + current = append(current, wdKey) + slices.Sort(current) + return current, nil +} + +// refCountRemove returns the remaining WD keys after removing wdKey. 
+// Returns an error when the existing ref-count annotation cannot be parsed — +// the caller must treat this as a transient error and NOT delete the companion, +// because other WDs may still hold references recorded in the corrupt entry. +func refCountRemove(annotations map[string]string, wdKey string) ([]string, error) { + current, err := decodeRefCount(annotations) + if err != nil { + return nil, err + } + return slices.DeleteFunc(current, func(k string) bool { return k == wdKey }), nil +} + +// decodeRefCount parses the ref-count annotation into a slice of WD keys. +// Returns (nil, nil) when the annotation is absent or empty. +// Returns an error when the annotation is present but cannot be parsed — the +// caller must treat this as a transient error rather than an empty ref-count, +// to avoid incorrectly deleting a companion that may still be referenced. +func decodeRefCount(annotations map[string]string) ([]string, error) { + raw, ok := annotations[companionRefCountAnnotation] + if !ok || raw == "" { + return nil, nil + } + var keys []string + if err := json.Unmarshal([]byte(raw), &keys); err != nil { + return nil, fmt.Errorf("referenceddata: corrupt ref-count annotation %q: %w", raw, err) + } + return keys, nil +} + +// encodeRefCount serialises WD keys as a JSON array. Returns "[]" on error. +func encodeRefCount(refs []string) string { + if len(refs) == 0 { + return "[]" + } + b, err := json.Marshal(refs) + if err != nil { + return "[]" + } + return string(b) +} + +// mergeLabels merges wanted labels into obj.Labels, preserving keys not present +// in wanted. This ensures third-party labels (e.g. from Karmada) are not +// discarded on update. 
+func mergeLabels(obj interface { + GetLabels() map[string]string + SetLabels(map[string]string) +}, wanted map[string]string) { + existing := obj.GetLabels() + if existing == nil { + existing = make(map[string]string) + } + maps.Copy(existing, wanted) + obj.SetLabels(existing) +} + +// mergeAnnotations merges wanted annotations into obj.Annotations, preserving +// keys not present in wanted. This ensures third-party annotations (e.g. from +// Karmada bookkeeping) are not discarded on update. +func mergeAnnotations(obj interface { + GetAnnotations() map[string]string + SetAnnotations(map[string]string) +}, wanted map[string]string) { + existing := obj.GetAnnotations() + if existing == nil { + existing = make(map[string]string) + } + maps.Copy(existing, wanted) + obj.SetAnnotations(existing) +} + +// configMapSize returns the total byte size of all Data and BinaryData values +// in a ConfigMap. +func configMapSize(cm *corev1.ConfigMap) int64 { + var n int64 + for _, v := range cm.Data { + n += int64(len(v)) + } + for _, v := range cm.BinaryData { + n += int64(len(v)) + } + return n +} + +// secretSize returns the total byte size of all Data values in a Secret. +func secretSize(s *corev1.Secret) int64 { + var n int64 + for _, v := range s.Data { + n += int64(len(v)) + } + return n +} + +// SetupWithManager registers the controller and its watches with the +// multicluster manager. It is called during management-plane setup. +func (r *ReferencedDataController) SetupWithManager(mgr mcmanager.Manager, opts ReferencedDataControllerOptions) error { + r.mgr = mgr // mcmanager.Manager satisfies clusterGetter + r.opts = opts + + return mcbuilder.ControllerManagedBy(mgr). + For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). + // Distinct name so it never collides with the cell's WorkloadDeploymentReconciler. + Named("referenced-data"). + // Watch source ConfigMaps; re-queue any WD that references them (rotation). 
+ Watches(&corev1.ConfigMap{}, func(clusterName multicluster.ClusterName, _ cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []mcreconcile.Request { + return r.enqueueWDsForSource(ctx, r.mgr, clusterName, "ConfigMap", obj) + }) + }). + // Watch source Secrets; re-queue any WD that references them (rotation). + Watches(&corev1.Secret{}, func(clusterName multicluster.ClusterName, _ cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []mcreconcile.Request { + return r.enqueueWDsForSource(ctx, r.mgr, clusterName, "Secret", obj) + }) + }). + Complete(r) +} + +// enqueueWDsForSource looks up all WorkloadDeployments in the cluster that +// reference the changed source ConfigMap or Secret, and returns reconcile +// requests for each. +func (r *ReferencedDataController) enqueueWDsForSource( + ctx context.Context, + getter clusterGetter, + clusterName multicluster.ClusterName, + kind string, + obj client.Object, +) []mcreconcile.Request { + logger := log.FromContext(ctx) + + cl, err := getter.GetCluster(ctx, clusterName) + if err != nil { + logger.Error(err, "referenceddata: failed to get cluster for source watch", "cluster", clusterName) + return nil + } + + indexKey := wdRefersToConfigMapIndex + if kind == kindSecret { + indexKey = wdRefersToSecretIndex + } + + sourceKey := types.NamespacedName{Namespace: obj.GetNamespace(), Name: obj.GetName()}.String() + var wdList computev1alpha.WorkloadDeploymentList + if err := cl.GetClient().List(ctx, &wdList, client.MatchingFields{indexKey: sourceKey}); err != nil { + logger.Error(err, "referenceddata: failed to list WorkloadDeployments for source", "kind", kind, "source", sourceKey) + return nil + } + + requests := make([]mcreconcile.Request, 0, len(wdList.Items)) + for _, wd := range 
wdList.Items { + requests = append(requests, mcreconcile.Request{ + Request: reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: wd.Namespace, + Name: wd.Name, + }, + }, + ClusterName: clusterName, + }) + } + return requests +} diff --git a/internal/controller/referenceddata_controller_test.go b/internal/controller/referenceddata_controller_test.go new file mode 100644 index 00000000..c24a46e9 --- /dev/null +++ b/internal/controller/referenceddata_controller_test.go @@ -0,0 +1,1567 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "encoding/json" + "fmt" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/referenceddata" +) + +const ( + // rdTestNamespace is the project namespace used across ReferencedData tests. + rdTestNamespace = "my-project" + + // rdTestAppConfig is a frequently reused source ConfigMap name in tests. 
+ rdTestAppConfig = "app-config" + + rdTestClusterName = "test-cluster" + rdTestDataKey = "key" + rdTestDataValue = "value" + rdTestWD1 = "wd-1" + rdTestWD2 = "wd-2" + rdTestBlobKey = "blob" + rdTestWDDelConflict = "wd-del-conflict" + rdTestWorkloadName = "test-workload" + + // rdTestFatConfig is the ConfigMap name used in SourceTooLarge tests. + rdTestFatConfig = "fat-config" +) + +// rdTestScheme builds a runtime.Scheme suitable for ReferencedDataController tests. +func rdTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(s)) + require.NoError(t, computev1alpha.AddToScheme(s)) + return s +} + +// newRDController creates a ReferencedDataController with the given reader wired +// into a fake multicluster manager that has one cluster identified by clusterName. +func newRDController(t *testing.T, cl client.Client, reader referenceddata.ProjectConfigSecretReader, opts ...func(*ReferencedDataControllerOptions)) (*ReferencedDataController, string) { + t.Helper() + clusterName := rdTestClusterName + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + clusterName: &fakeCluster{cl: cl}, + }, + } + + controllerOpts := ReferencedDataControllerOptions{ + Reader: reader, + } + for _, fn := range opts { + fn(&controllerOpts) + } + + c := &ReferencedDataController{ + mgr: mgr, + opts: controllerOpts, + } + return c, clusterName +} + +// reconcileWD is a convenience wrapper that runs one reconcile for the named WD +// in namespace ns on clusterName. 
+func reconcileWD(t *testing.T, c *ReferencedDataController, clusterName, ns, name string) { + t.Helper() + cn := multicluster.ClusterName(clusterName) + ctx := mccontext.WithCluster(context.Background(), cn) + _, err := c.Reconcile(ctx, mcreconcile.Request{ + Request: reconcile.Request{ + NamespacedName: types.NamespacedName{Namespace: ns, Name: name}, + }, + ClusterName: cn, + }) + require.NoError(t, err) +} + +// makeWD returns a minimal WorkloadDeployment with the given template. +func makeWD(ns, name string, template computev1alpha.InstanceTemplateSpec) *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: name, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + Template: template, + }, + } +} + +// templateWithConfigMap returns an InstanceTemplateSpec referencing the named ConfigMap +// as a volume. +func templateWithConfigMap(cmName string) computev1alpha.InstanceTemplateSpec { + return computev1alpha.InstanceTemplateSpec{ + Spec: computev1alpha.InstanceSpec{ + Volumes: []computev1alpha.InstanceVolume{ + { + Name: "cfg-vol", + VolumeSource: computev1alpha.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: cmName}, + }, + }, + }, + }, + }, + } +} + +// templateWithSecret returns an InstanceTemplateSpec referencing the named Secret +// as a volume. +func templateWithSecret(secretName string) computev1alpha.InstanceTemplateSpec { + return computev1alpha.InstanceTemplateSpec{ + Spec: computev1alpha.InstanceSpec{ + Volumes: []computev1alpha.InstanceVolume{ + { + Name: "sec-vol", + VolumeSource: computev1alpha.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: secretName}, + }, + }, + }, + }, + } +} + +// getWD fetches the latest WD from the fake client. 
func getWD(t *testing.T, cl client.Client, key types.NamespacedName) *computev1alpha.WorkloadDeployment {
	t.Helper()
	var wd computev1alpha.WorkloadDeployment
	// A failed Get aborts the test immediately, so callers never see a nil WD.
	require.NoError(t, cl.Get(context.Background(), key, &wd))
	return &wd
}

// getCompanionCM fetches the ConfigMap ns/name from cl and fails the test
// immediately if the Get returns any error (including not-found).
func getCompanionCM(t *testing.T, cl client.Client, ns, name string) *corev1.ConfigMap {
	t.Helper()
	var cm corev1.ConfigMap
	require.NoError(t, cl.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: name}, &cm))
	return &cm
}

// getCompanionSecret fetches the Secret ns/name from cl and fails the test
// immediately if the Get returns any error (including not-found).
func getCompanionSecret(t *testing.T, cl client.Client, ns, name string) *corev1.Secret {
	t.Helper()
	var s corev1.Secret
	require.NoError(t, cl.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: name}, &s))
	return &s
}

// decodeExpectedAnnotation reads the ExpectedReferencedDataAnnotation from wd
// and JSON-decodes it into a string slice. The test fails if the annotation is
// absent or is not a valid JSON array of strings.
func decodeExpectedAnnotation(t *testing.T, wd *computev1alpha.WorkloadDeployment) []string {
	t.Helper()
	raw, ok := wd.Annotations[computev1alpha.ExpectedReferencedDataAnnotation]
	require.True(t, ok, "expected-referenced-data annotation must be set")
	var names []string
	require.NoError(t, json.Unmarshal([]byte(raw), &names))
	return names
}

// ─── Happy path: companion + annotation + condition ───────────────────────────

// TestReferencedData_HappyPath_ConfigMap drives two reconciles for a WD that
// mounts an existing ConfigMap and checks, in order: the finalizer after the
// first pass, then the labelled companion, the expected-set annotation and the
// ReferencedDataReady=True condition after the second pass.
func TestReferencedData_HappyPath_ConfigMap(t *testing.T) {
	ns := rdTestNamespace
	cmName := rdTestAppConfig
	companionName := referenceddata.CompanionName("ConfigMap", cmName)

	srcCM := &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName},
		Data:       map[string]string{rdTestDataKey: rdTestDataValue},
	}
	wd := makeWD(ns, rdTestWD1, templateWithConfigMap(cmName))

	cl := fake.NewClientBuilder().
		WithScheme(rdTestScheme(t)).
		WithObjects(srcCM, wd).
		WithStatusSubresource(wd).
		Build()

	// nil reader: the controller is constructed without an explicit Reader.
	c, clusterName := newRDController(t, cl, nil)

	// First reconcile: stamps finalizer and returns.
	reconcileWD(t, c, clusterName, ns, rdTestWD1)
	// Fetch updated WD (finalizer stamped).
	wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1})
	require.Contains(t, wd.Finalizers, referencedDataFinalizer, "finalizer should be present after first reconcile")

	// Second reconcile: materialises companion + stamps annotation + sets condition.
	reconcileWD(t, c, clusterName, ns, rdTestWD1)

	// Companion ConfigMap should exist.
	companion := getCompanionCM(t, cl, ns, companionName)
	assert.Equal(t, computev1alpha.ReferencedDataLabelValue, companion.Labels[computev1alpha.ReferencedDataLabel], "companion must have referenced-data label")
	assert.Equal(t, rdTestDataValue, companion.Data[rdTestDataKey], "companion must copy source Data")

	// Expected annotation should list the kind-qualified token, not the plain name.
	wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1})
	expectedTokens := decodeExpectedAnnotation(t, wd)
	assert.Equal(t, []string{referenceddata.CompanionToken(kindConfigMap, companionName)}, expectedTokens)

	// Condition should be True/Ready.
	cond := apimeta.FindStatusCondition(wd.Status.Conditions, computev1alpha.ReferencedDataReady)
	require.NotNil(t, cond)
	assert.Equal(t, metav1.ConditionTrue, cond.Status)
	assert.Equal(t, computev1alpha.ReferencedDataReasonReady, cond.Reason)
}

// TestReferencedData_HappyPath_Secret is the Secret counterpart of the
// ConfigMap happy path: after two reconciles the companion Secret must carry
// the referenced-data label, the source's Data and Type, and the WD must carry
// the expected-set annotation and a True ReferencedDataReady condition.
func TestReferencedData_HappyPath_Secret(t *testing.T) {
	ns := rdTestNamespace
	secretName := "db-creds"
	companionName := referenceddata.CompanionName("Secret", secretName)

	srcSecret := &corev1.Secret{
		ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: secretName},
		Data:       map[string][]byte{"password": []byte("hunter2")},
		Type:       corev1.SecretTypeOpaque,
	}
	wd := makeWD(ns, rdTestWD1, templateWithSecret(secretName))

	cl := fake.NewClientBuilder().
		WithScheme(rdTestScheme(t)).
		WithObjects(srcSecret, wd).
		WithStatusSubresource(wd).
		Build()

	c, clusterName := newRDController(t, cl, nil)

	// Two passes: the sibling ConfigMap test shows the first pass only stamps
	// the finalizer, so the companion is checked after the second.
	reconcileWD(t, c, clusterName, ns, rdTestWD1)
	reconcileWD(t, c, clusterName, ns, rdTestWD1)

	companion := getCompanionSecret(t, cl, ns, companionName)
	assert.Equal(t, computev1alpha.ReferencedDataLabelValue, companion.Labels[computev1alpha.ReferencedDataLabel])
	assert.Equal(t, []byte("hunter2"), companion.Data["password"])
	assert.Equal(t, corev1.SecretTypeOpaque, companion.Type)

	wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1})
	expectedTokens := decodeExpectedAnnotation(t, wd)
	assert.Equal(t, []string{referenceddata.CompanionToken(kindSecret, companionName)}, expectedTokens)

	cond := apimeta.FindStatusCondition(wd.Status.Conditions, computev1alpha.ReferencedDataReady)
	require.NotNil(t, cond)
	assert.Equal(t, metav1.ConditionTrue, cond.Status)
}

// ─── Source-not-found sets SourceNotFound condition ──────────────────────────

// TestReferencedData_SourceNotFound reconciles a WD whose referenced ConfigMap
// does not exist and checks that the condition is False/SourceNotFound, that no
// expected-set annotation is written, and that the terminal-error annotation
// carries the SourceNotFound reason and mentions the missing ConfigMap's name.
func TestReferencedData_SourceNotFound(t *testing.T) {
	ns := rdTestNamespace
	cmName := "missing-config"

	// Source ConfigMap does NOT exist in the cluster.
	wd := makeWD(ns, rdTestWD1, templateWithConfigMap(cmName))

	cl := fake.NewClientBuilder().
		WithScheme(rdTestScheme(t)).
		WithObjects(wd).
		WithStatusSubresource(wd).
		Build()

	c, clusterName := newRDController(t, cl, nil)

	reconcileWD(t, c, clusterName, ns, rdTestWD1)
	wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1})
	require.Contains(t, wd.Finalizers, referencedDataFinalizer)

	reconcileWD(t, c, clusterName, ns, rdTestWD1)

	wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1})
	cond := apimeta.FindStatusCondition(wd.Status.Conditions, computev1alpha.ReferencedDataReady)
	require.NotNil(t, cond)
	assert.Equal(t, metav1.ConditionFalse, cond.Status)
	assert.Equal(t, computev1alpha.ReferencedDataReasonSourceNotFound, cond.Reason)

	// No expected-set annotation should be set (nothing was delivered).
	_, hasAnno := wd.Annotations[computev1alpha.ExpectedReferencedDataAnnotation]
	assert.False(t, hasAnno)

	// Terminal-error annotation MUST be stamped so it propagates hub→cell via Karmada.
	termErrRaw, hasTermAnno := wd.Annotations[computev1alpha.ReferencedDataErrorAnnotation]
	require.True(t, hasTermAnno, "terminal-error annotation should be stamped on SourceNotFound")
	termReason, termMsg, decErr := decodeTerminalError(termErrRaw)
	require.NoError(t, decErr)
	assert.Equal(t, computev1alpha.ReferencedDataReasonSourceNotFound, termReason)
	// The literal below is the same name as cmName above.
	assert.Contains(t, termMsg, "missing-config")
}

// ─── Terminal-error annotation is cleared when error resolves ─────────────────

// TestReferencedData_TerminalErrorAnnotationCleared verifies that after a
// SourceNotFound error the terminal-error annotation is stamped, and that once
// the source is created and the resolver succeeds the annotation is removed.
func TestReferencedData_TerminalErrorAnnotationCleared(t *testing.T) {
	ns := rdTestNamespace
	cmName := "initially-missing"

	// WD references a ConfigMap that does not yet exist.
	wd := makeWD(ns, rdTestWD1, templateWithConfigMap(cmName))

	cl := fake.NewClientBuilder().
		WithScheme(rdTestScheme(t)).
		WithObjects(wd).
		WithStatusSubresource(wd).
		Build()

	c, clusterName := newRDController(t, cl, nil)

	// First reconcile: stamps finalizer.
	reconcileWD(t, c, clusterName, ns, rdTestWD1)
	// Second reconcile: source missing → stamps SourceNotFound condition + annotation.
	reconcileWD(t, c, clusterName, ns, rdTestWD1)

	wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1})
	cond := apimeta.FindStatusCondition(wd.Status.Conditions, computev1alpha.ReferencedDataReady)
	require.NotNil(t, cond)
	assert.Equal(t, computev1alpha.ReferencedDataReasonSourceNotFound, cond.Reason)

	// Terminal-error annotation must be present after the error.
	_, hasTermAnno := wd.Annotations[computev1alpha.ReferencedDataErrorAnnotation]
	assert.True(t, hasTermAnno, "terminal-error annotation should be present after SourceNotFound")

	// Now create the source ConfigMap.
	srcCM := &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName},
		Data:       map[string]string{rdTestDataKey: rdTestDataValue},
	}
	require.NoError(t, cl.Create(context.Background(), srcCM))

	// Third reconcile: source now exists → companions materialised → condition True.
	reconcileWD(t, c, clusterName, ns, rdTestWD1)

	wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1})
	resolvedCond := apimeta.FindStatusCondition(wd.Status.Conditions, computev1alpha.ReferencedDataReady)
	require.NotNil(t, resolvedCond)
	assert.Equal(t, metav1.ConditionTrue, resolvedCond.Status)

	// Terminal-error annotation MUST be cleared after the error resolves.
	_, hasTermAnnoAfter := wd.Annotations[computev1alpha.ReferencedDataErrorAnnotation]
	assert.False(t, hasTermAnnoAfter, "terminal-error annotation should be cleared when error resolves")
}

// TestReferencedData_TerminalErrorAnnotationStampedForEachReason verifies that
// all three terminal reason codes (SourceNotFound, SourceUnauthorized,
// SourceTooLarge) each result in the terminal-error annotation being stamped.
+func TestReferencedData_TerminalErrorAnnotationStampedForEachReason(t *testing.T) { + bigData := make([]byte, 300*1024) // 300 KiB > 256 KiB default + + tests := []struct { + name string + reader referenceddata.ProjectConfigSecretReader + srcCM *corev1.ConfigMap + cmName string + expectedReason string + }{ + { + name: "SourceNotFound", + cmName: "missing-config", + // reader nil: localReader fallback returns (nil,nil) → SourceNotFound + expectedReason: computev1alpha.ReferencedDataReasonSourceNotFound, + }, + { + name: "SourceUnauthorized", + cmName: "auth-config", + reader: &stubReader{ + getCM: func(_ context.Context, _, _, name string) (*corev1.ConfigMap, error) { + return nil, fmt.Errorf("%w: %s", referenceddata.ErrSourceUnauthorized, name) + }, + }, + expectedReason: computev1alpha.ReferencedDataReasonSourceUnauthorized, + }, + { + name: "SourceTooLarge", + cmName: rdTestFatConfig, + srcCM: &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: rdTestNamespace, Name: rdTestFatConfig}, + BinaryData: map[string][]byte{rdTestBlobKey: bigData}, + }, + expectedReason: computev1alpha.ReferencedDataReasonSourceTooLarge, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + ns := rdTestNamespace + + wd := makeWD(ns, rdTestWD1, templateWithConfigMap(tc.cmName)) + + var objs []client.Object + objs = append(objs, wd) + if tc.srcCM != nil { + objs = append(objs, tc.srcCM) + } + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(objs...). + WithStatusSubresource(wd). + Build() + + // Use the interface directly to avoid the nil-concrete-in-interface panic. 
+ c, clusterName := newRDController(t, cl, tc.reader) + + reconcileWD(t, c, clusterName, ns, rdTestWD1) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1}) + + termErrRaw, hasTermAnno := wd.Annotations[computev1alpha.ReferencedDataErrorAnnotation] + require.True(t, hasTermAnno, "terminal-error annotation should be stamped for reason %s", tc.expectedReason) + + termReason, _, decErr := decodeTerminalError(termErrRaw) + require.NoError(t, decErr) + assert.Equal(t, tc.expectedReason, termReason) + }) + } +} + +// ─── Source-unauthorized sets SourceUnauthorized condition ─────────────────── + +func TestReferencedData_SourceUnauthorized(t *testing.T) { + ns := rdTestNamespace + cmName := "auth-config" + + wd := makeWD(ns, rdTestWD1, templateWithConfigMap(cmName)) + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(wd). + WithStatusSubresource(wd). + Build() + + // Use a reader that always returns ErrSourceUnauthorized. 
+ unauthorizedReader := &stubReader{ + getCM: func(_ context.Context, _, _, _ string) (*corev1.ConfigMap, error) { + return nil, fmt.Errorf("%w: ConfigMap %s", referenceddata.ErrSourceUnauthorized, cmName) + }, + } + + c, clusterName := newRDController(t, cl, unauthorizedReader) + + reconcileWD(t, c, clusterName, ns, rdTestWD1) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1}) + cond := apimeta.FindStatusCondition(wd.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceUnauthorized, cond.Reason) +} + +// ─── Oversized source sets SourceTooLarge ──────────────────────────────────── + +func TestReferencedData_SourceTooLarge_PerObject(t *testing.T) { + ns := rdTestNamespace + cmName := rdTestFatConfig + companionName := referenceddata.CompanionName("ConfigMap", cmName) + + bigData := make([]byte, 300*1024) // 300 KiB > 256 KiB default + srcCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName}, + BinaryData: map[string][]byte{rdTestBlobKey: bigData}, + } + wd := makeWD(ns, rdTestWD1, templateWithConfigMap(cmName)) + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(srcCM, wd). + WithStatusSubresource(wd). + Build() + + c, clusterName := newRDController(t, cl, nil) + + reconcileWD(t, c, clusterName, ns, rdTestWD1) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1}) + cond := apimeta.FindStatusCondition(wd.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceTooLarge, cond.Reason) + + // Companion must NOT have been materialised (no labeled companion object should exist). 
+ // With option B, companion and source share the same name, so we check for the + // referenced-data label which distinguishes companions from sources. + var phantom corev1.ConfigMap + err := cl.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: companionName}, &phantom) + if err == nil { + // Object exists — it must NOT carry the companion label. + assert.NotEqual(t, computev1alpha.ReferencedDataLabelValue, phantom.Labels[computev1alpha.ReferencedDataLabel], + "source ConfigMap must not be labelled as a companion when source is too large") + } +} + +func TestReferencedData_SourceTooLarge_Aggregate(t *testing.T) { + ns := rdTestNamespace + cmName1 := "config-a" + cmName2 := "config-b" + + // Each 600 KiB; aggregate 1.2 MiB > 1 MiB default. + halfBig := make([]byte, 600*1024) + src1 := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName1}, + BinaryData: map[string][]byte{rdTestBlobKey: halfBig}, + } + src2 := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName2}, + BinaryData: map[string][]byte{rdTestBlobKey: halfBig}, + } + + template := computev1alpha.InstanceTemplateSpec{ + Spec: computev1alpha.InstanceSpec{ + Volumes: []computev1alpha.InstanceVolume{ + {Name: "vol1", VolumeSource: computev1alpha.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{LocalObjectReference: corev1.LocalObjectReference{Name: cmName1}}, + }}, + {Name: "vol2", VolumeSource: computev1alpha.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{LocalObjectReference: corev1.LocalObjectReference{Name: cmName2}}, + }}, + }, + }, + } + wd := makeWD(ns, rdTestWD1, template) + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(src1, src2, wd). + WithStatusSubresource(wd). + Build() + + // Override per-object limit high enough so each passes alone, but aggregate fails. 
+ c, clusterName := newRDController(t, cl, nil, func(o *ReferencedDataControllerOptions) { + o.PerObjectLimitBytes = 700 * 1024 // 700 KiB — each obj passes + o.AggregateLimitBytes = 1000 * 1024 // 1000 KiB — aggregate fails at 1.2 MiB + }) + + reconcileWD(t, c, clusterName, ns, rdTestWD1) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1}) + cond := apimeta.FindStatusCondition(wd.Status.Conditions, computev1alpha.ReferencedDataReady) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceTooLarge, cond.Reason) +} + +// ─── Rotation: source change → companion refreshed ─────────────────────────── + +func TestReferencedData_Rotation(t *testing.T) { + ns := rdTestNamespace + cmName := rdTestAppConfig + companionName := referenceddata.CompanionName("ConfigMap", cmName) + + srcCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName}, + Data: map[string]string{"ver": "v1"}, + } + wd := makeWD(ns, rdTestWD1, templateWithConfigMap(cmName)) + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(srcCM, wd). + WithStatusSubresource(wd). + Build() + + c, clusterName := newRDController(t, cl, nil) + + // Two passes to materialise initially. + reconcileWD(t, c, clusterName, ns, rdTestWD1) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + companion := getCompanionCM(t, cl, ns, companionName) + assert.Equal(t, "v1", companion.Data["ver"]) + + // Simulate a source update (rotation). Re-fetch first: with option B the + // companion shares the same name/namespace as the source in local mode, so + // the controller has already written labels/annotations onto the object and + // advanced its resourceVersion. Updating the stale in-memory srcCM would + // produce a conflict. Re-fetching ensures we update at the current RV. 
+ require.NoError(t, cl.Get(context.Background(), + types.NamespacedName{Namespace: ns, Name: cmName}, srcCM)) + srcCM.Data["ver"] = "v2" + require.NoError(t, cl.Update(context.Background(), srcCM)) + + // Re-reconcile (as if triggered by the source watch). + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + companion = getCompanionCM(t, cl, ns, companionName) + assert.Equal(t, "v2", companion.Data["ver"], "companion must reflect rotated source") +} + +// ─── Ref-count: two WDs sharing a companion ────────────────────────────────── + +func TestReferencedData_RefCount_TwoWDs(t *testing.T) { + ns := rdTestNamespace + cmName := "shared-config" + companionName := referenceddata.CompanionName("ConfigMap", cmName) + + srcCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName}, + Data: map[string]string{"k": "v"}, + } + wd1 := makeWD(ns, rdTestWD1, templateWithConfigMap(cmName)) + wd2 := makeWD(ns, rdTestWD2, templateWithConfigMap(cmName)) + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(srcCM, wd1, wd2). + WithStatusSubresource(wd1, wd2). + Build() + + c, clusterName := newRDController(t, cl, nil) + + // Materialise for wd-1. + reconcileWD(t, c, clusterName, ns, rdTestWD1) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + companion := getCompanionCM(t, cl, ns, companionName) + refs1, err := decodeRefCount(companion.Annotations) + require.NoError(t, err) + assert.Contains(t, refs1, types.NamespacedName{Namespace: ns, Name: rdTestWD1}.String()) + + // Materialise for wd-2. 
+ reconcileWD(t, c, clusterName, ns, rdTestWD2) + reconcileWD(t, c, clusterName, ns, rdTestWD2) + + companion = getCompanionCM(t, cl, ns, companionName) + refs2, err := decodeRefCount(companion.Annotations) + require.NoError(t, err) + assert.Len(t, refs2, 2, "companion must list both WDs") + assert.Contains(t, refs2, types.NamespacedName{Namespace: ns, Name: rdTestWD1}.String()) + assert.Contains(t, refs2, types.NamespacedName{Namespace: ns, Name: rdTestWD2}.String()) + + // Delete wd-1 (simulate deletion + finalizer processing). + wd1 = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1}) + require.NoError(t, cl.Delete(context.Background(), wd1)) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + // Companion must still exist (wd-2 still holds it). + companion = getCompanionCM(t, cl, ns, companionName) + refs3, err := decodeRefCount(companion.Annotations) + require.NoError(t, err) + assert.Len(t, refs3, 1, "wd-1 should have been removed from ref-count") + assert.Contains(t, refs3, types.NamespacedName{Namespace: ns, Name: rdTestWD2}.String()) + + // Delete wd-2 too. + wd2 = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD2}) + require.NoError(t, cl.Delete(context.Background(), wd2)) + reconcileWD(t, c, clusterName, ns, rdTestWD2) + + // Companion must be gone. 
+ var gone corev1.ConfigMap + getErr := cl.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: companionName}, &gone) + assert.Error(t, getErr, "companion must be deleted when last WD is removed") +} + +// ─── WD deletion cleans up companion ───────────────────────────────────────── + +func TestReferencedData_WDDeletion_CleansUpCompanion(t *testing.T) { + ns := rdTestNamespace + cmName := "solo-config" + companionName := referenceddata.CompanionName("ConfigMap", cmName) + + srcCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName}, + Data: map[string]string{"a": "b"}, + } + wd := makeWD(ns, rdTestWD1, templateWithConfigMap(cmName)) + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(srcCM, wd). + WithStatusSubresource(wd). + Build() + + c, clusterName := newRDController(t, cl, nil) + + reconcileWD(t, c, clusterName, ns, rdTestWD1) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + // Companion must exist. + getCompanionCM(t, cl, ns, companionName) + + // Delete the WD. + wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1}) + require.NoError(t, cl.Delete(context.Background(), wd)) + + // Reconcile handles the deletion + finalizer removal. + // After the controller removes the finalizer, the fake client may GC the WD, + // so we capture the state before reconcile. + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + // Companion must be gone. + var gone corev1.ConfigMap + err := cl.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: companionName}, &gone) + assert.Error(t, err, "companion must be deleted when the only referencing WD is deleted") +} + +// ─── Empty ref set → no companion, no finalizer ────────────────────────────── + +func TestReferencedData_EmptyRefs_NoCompanionNoFinalizer(t *testing.T) { + ns := rdTestNamespace + + // WD template has no ConfigMap/Secret references. 
+ wd := makeWD(ns, rdTestWD1, computev1alpha.InstanceTemplateSpec{}) + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(wd). + WithStatusSubresource(wd). + Build() + + c, clusterName := newRDController(t, cl, nil) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: rdTestWD1}) + assert.NotContains(t, wd.Finalizers, referencedDataFinalizer, "no finalizer for empty refs") + _, hasAnno := wd.Annotations[computev1alpha.ExpectedReferencedDataAnnotation] + assert.False(t, hasAnno, "no annotation for empty refs") +} + +// ─── Companion namespace invariant ─────────────────────────────────────────── + +// TestReferencedData_CompanionNamespaceInvariant asserts that the companion +// lands in the same namespace as the WorkloadDeployment. This is the namespace +// invariant from the plan: "the WorkloadDeployment, its Instances, and the +// companions all live in the same ns-{project-uid} namespace." +func TestReferencedData_CompanionNamespaceInvariant(t *testing.T) { + ns := "ns-project-uid-123" + cmName := rdTestAppConfig + companionName := referenceddata.CompanionName("ConfigMap", cmName) + + srcCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName}, + Data: map[string]string{"x": "y"}, + } + wd := makeWD(ns, rdTestWD1, templateWithConfigMap(cmName)) + + cl := fake.NewClientBuilder(). + WithScheme(rdTestScheme(t)). + WithObjects(srcCM, wd). + WithStatusSubresource(wd). 
+ Build() + + c, clusterName := newRDController(t, cl, nil) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + reconcileWD(t, c, clusterName, ns, rdTestWD1) + + companion := getCompanionCM(t, cl, ns, companionName) + assert.Equal(t, ns, companion.Namespace, "companion must be in WD's namespace") +} + +// ─── Phase 1b: federated companion writer ──────────────────────────────────── + +// newRDControllerFederated creates a ReferencedDataController wired with a +// FederationClient (fake hub client). The project cluster holds the WDs and +// source ConfigMaps/Secrets; the hub client is the destination for companions. +func newRDControllerFederated( + t *testing.T, + projectCl client.Client, + hubCl client.Client, + reader referenceddata.ProjectConfigSecretReader, +) (*ReferencedDataController, string) { + t.Helper() + clusterName := rdTestClusterName + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + clusterName: &fakeCluster{cl: projectCl}, + }, + } + + c := &ReferencedDataController{ + mgr: mgr, + opts: ReferencedDataControllerOptions{ + Reader: reader, + FederationClient: hubCl, + }, + } + return c, clusterName +} + +// TestReferencedData_Federated_CompanionWrittenToHub asserts that, when a +// FederationClient is configured, companions are materialised into the +// downstream ns-{project-uid} namespace on the hub rather than the project +// namespace. The expected-set annotation must also be set on the project WD. +func TestReferencedData_Federated_CompanionWrittenToHub(t *testing.T) { + // Use the same UID as the shared test constants so the downstream namespace + // name is deterministic and matches testKarmadaNSStr. + projNS := testProjNS + projNSUID := testProjNSUID + cmName := rdTestAppConfig + companionName := referenceddata.CompanionName("ConfigMap", cmName) + + // Project cluster objects: namespace (with UID), source ConfigMap, WD. 
+ projNSObj := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: projNS, UID: projNSUID}, + } + srcCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Namespace: projNS, Name: cmName}, + Data: map[string]string{rdTestDataKey: "federated-value"}, + } + wd := makeWD(projNS, "wd-fed-1", templateWithConfigMap(cmName)) + + s := rdTestScheme(t) + require.NoError(t, corev1.AddToScheme(s)) // Namespace type + + projectCl := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(projNSObj, srcCM, wd). + WithStatusSubresource(wd). + Build() + + // Hub client: empty federation control plane. + hubScheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(hubScheme)) + require.NoError(t, computev1alpha.AddToScheme(hubScheme)) + hubCl := fake.NewClientBuilder().WithScheme(hubScheme).Build() + + c, clusterName := newRDControllerFederated(t, projectCl, hubCl, nil) + + // First reconcile: stamps finalizer. + reconcileWD(t, c, clusterName, projNS, "wd-fed-1") + // Fetch updated WD. + wd = getWD(t, projectCl, types.NamespacedName{Namespace: projNS, Name: "wd-fed-1"}) + require.Contains(t, wd.Finalizers, referencedDataFinalizer) + + // Second reconcile: materialises companion on the hub. + reconcileWD(t, c, clusterName, projNS, "wd-fed-1") + + // Companion must exist on the HUB in the downstream namespace, NOT in the project namespace. 
+ downstreamNS := testKarmadaNSStr // "ns-aabbccdd-0000-1111-2222-333344445555" + var hubCM corev1.ConfigMap + require.NoError(t, hubCl.Get(context.Background(), + types.NamespacedName{Namespace: downstreamNS, Name: companionName}, &hubCM), + "companion ConfigMap must exist on the hub in the downstream namespace") + assert.Equal(t, "federated-value", hubCM.Data[rdTestDataKey], "hub companion must copy source Data") + assert.Equal(t, computev1alpha.ReferencedDataLabelValue, hubCM.Labels[computev1alpha.ReferencedDataLabel], + "hub companion must carry referenced-data label") + + // Companion must NOT have been written to the project namespace as a companion + // (the source ConfigMap has the same name, but must NOT carry the referenced-data label). + // With option B the companion and source share the same name, so we check the label + // rather than expecting the object to be absent. + var projCM corev1.ConfigMap + getErr := projectCl.Get(context.Background(), + types.NamespacedName{Namespace: projNS, Name: companionName}, &projCM) + if getErr == nil { + // Object exists — it must be the source, not a companion (no referenced-data label). + assert.NotEqual(t, computev1alpha.ReferencedDataLabelValue, projCM.Labels[computev1alpha.ReferencedDataLabel], + "project-namespace object must NOT carry the companion label in federated mode") + } + + // Expected-set annotation must be set on the project WD with kind-qualified tokens. + wd = getWD(t, projectCl, types.NamespacedName{Namespace: projNS, Name: "wd-fed-1"}) + expectedTokens := decodeExpectedAnnotation(t, wd) + assert.Equal(t, []string{referenceddata.CompanionToken(kindConfigMap, companionName)}, expectedTokens) +} + +// TestReferencedData_WriterSelection asserts that writerFor returns a +// localCompanionWriter when FederationClient is nil, and a +// downstreamCompanionWriter when it is set. 
+func TestReferencedData_WriterSelection(t *testing.T) {
+	t.Parallel()
+
+	// Untyped constant, so it reaches writerFor exactly as the literal would.
+	const clusterName = "test-cluster"
+
+	// Project-side fixtures shared (read-only) by both subtests: the project
+	// namespace and a bare WorkloadDeployment living in it.
+	scheme := runtime.NewScheme()
+	require.NoError(t, corev1.AddToScheme(scheme))
+	require.NoError(t, computev1alpha.AddToScheme(scheme))
+
+	namespace := &corev1.Namespace{
+		ObjectMeta: metav1.ObjectMeta{Name: testProjNS, UID: testProjNSUID},
+	}
+	deployment := &computev1alpha.WorkloadDeployment{
+		ObjectMeta: metav1.ObjectMeta{Namespace: testProjNS, Name: "wd-sel"},
+	}
+	projectClient := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(namespace, deployment).
+		Build()
+
+	t.Run("no federation client returns localCompanionWriter", func(t *testing.T) {
+		t.Parallel()
+
+		rdc := &ReferencedDataController{opts: ReferencedDataControllerOptions{}}
+		writer, err := rdc.writerFor(context.Background(), clusterName, projectClient, deployment)
+		require.NoError(t, err)
+
+		_, isLocal := writer.(*localCompanionWriter)
+		assert.True(t, isLocal, "expected *localCompanionWriter when FederationClient is nil")
+	})
+
+	t.Run("with federation client returns downstreamCompanionWriter", func(t *testing.T) {
+		t.Parallel()
+
+		// The hub client starts empty; only core types are registered on it.
+		hubScheme := runtime.NewScheme()
+		require.NoError(t, corev1.AddToScheme(hubScheme))
+		hubClient := fake.NewClientBuilder().WithScheme(hubScheme).Build()
+
+		rdc := &ReferencedDataController{
+			opts: ReferencedDataControllerOptions{FederationClient: hubClient},
+		}
+		writer, err := rdc.writerFor(context.Background(), clusterName, projectClient, deployment)
+		require.NoError(t, err)
+
+		// require (not assert): the field access below needs a non-nil writer.
+		downstream, isDownstream := writer.(*downstreamCompanionWriter)
+		require.True(t, isDownstream, "expected *downstreamCompanionWriter when FederationClient is set")
+		assert.Equal(t, testKarmadaNSStr, downstream.downstreamNamespace,
+			"downstream namespace must be ns-{project-uid}")
+	})
+}
+
+// ─── Conflict-tolerant finalizer ─────────────────────────────────────────────
+
+// TestReferencedData_AddFinalizer_ConflictRetried asserts that the finalizer
+// add path survives a single optimistic-lock conflict (as would occur when the
+// WorkloadDeploymentFederator concurrently adds its own finalizer) and
still
+// stamps the finalizer on the object after retrying.
+func TestReferencedData_AddFinalizer_ConflictRetried(t *testing.T) {
+	ns := rdTestNamespace
+	cmName := rdTestAppConfig
+
+	srcCM := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName},
+		Data:       map[string]string{rdTestDataKey: rdTestDataValue},
+	}
+	wd := makeWD(ns, "wd-conflict", templateWithConfigMap(cmName))
+
+	s := rdTestScheme(t)
+	require.NoError(t, corev1.AddToScheme(s))
+
+	// realCl is the backing store: the interceptor below serves every Get and
+	// every non-conflicting WorkloadDeployment Update from it, and the final
+	// assertions read from it.
+	realCl := fake.NewClientBuilder().
+		WithScheme(s).
+		WithObjects(srcCM, wd).
+		WithStatusSubresource(wd).
+		Build()
+
+	// Intercept the first Update call and return a conflict; let subsequent
+	// calls pass through to the real fake client so the retry succeeds.
+	var updateCalls atomic.Int32
+	wdGR := schema.GroupResource{Group: "compute.datumapis.com", Resource: "workloaddeployments"}
+	cl := fake.NewClientBuilder().
+		WithScheme(s).
+		WithObjects(srcCM, wd).
+		WithStatusSubresource(wd).
+		WithInterceptorFuncs(interceptor.Funcs{
+			Update: func(ctx context.Context, c client.WithWatch, obj client.Object, opts ...client.UpdateOption) error {
+				// Only WorkloadDeployment updates are counted and redirected.
+				if _, ok := obj.(*computev1alpha.WorkloadDeployment); ok {
+					if updateCalls.Add(1) == 1 {
+						// Simulate the conflict the federator would cause.
+						return apierrors.NewConflict(wdGR, obj.GetName(),
+							fmt.Errorf("the object has been modified; please apply your changes to the latest version and try again"))
+					}
+					// Subsequent calls pass through to the real client.
+					return realCl.Update(ctx, obj, opts...)
+				}
+				// Updates of any other kind go to the client handed to the
+				// interceptor, not to realCl.
+				return c.Update(ctx, obj, opts...)
+			},
+			Get: func(ctx context.Context, c client.WithWatch, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
+				// Always read from the real client so retries see the latest state.
+				return realCl.Get(ctx, key, obj, opts...)
+			},
+		}).
+		Build()
+
+	// Wire the intercepting client into a controller under a single cluster name.
+	clusterName := rdTestClusterName
+	mgr := &fakeMCManager{
+		clusters: map[string]cluster.Cluster{
+			clusterName: &fakeCluster{cl: cl},
+		},
+	}
+	c := &ReferencedDataController{
+		mgr:  mgr,
+		opts: ReferencedDataControllerOptions{},
+	}
+
+	// First reconcile: should add the finalizer despite the initial conflict.
+	cn := multicluster.ClusterName(clusterName)
+	ctx := mccontext.WithCluster(context.Background(), cn)
+	_, err := c.Reconcile(ctx, mcreconcile.Request{
+		Request:     reconcile.Request{NamespacedName: types.NamespacedName{Namespace: ns, Name: "wd-conflict"}},
+		ClusterName: cn,
+	})
+	require.NoError(t, err, "reconcile must succeed even when the first Update conflicts")
+
+	// Verify the finalizer was added to the real object.
+	updated := getWD(t, realCl, types.NamespacedName{Namespace: ns, Name: "wd-conflict"})
+	assert.Contains(t, updated.Finalizers, referencedDataFinalizer,
+		"finalizer must be present after conflict-retried Update")
+
+	// Update was called at least twice (once for the conflict, once for the retry).
+	assert.GreaterOrEqual(t, int(updateCalls.Load()), 2,
+		"Update must have been called at least twice (conflict + retry)")
+}
+
+// TestReferencedData_RemoveFinalizer_ConflictRetried asserts that the finalizer
+// removal path (on WD deletion) survives an optimistic-lock conflict.
+//
+// The setup block drives two reconciles directly against realCl; the deletion
+// reconcile then runs through an intercepting client whose Get, Delete and
+// WorkloadDeployment Update calls are served by realCl, with the first
+// WorkloadDeployment Update answered by a synthetic conflict.
+func TestReferencedData_RemoveFinalizer_ConflictRetried(t *testing.T) {
+	ns := rdTestNamespace
+	cmName := rdTestAppConfig
+	companionName := referenceddata.CompanionName("ConfigMap", cmName)
+
+	srcCM := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName},
+		Data:       map[string]string{rdTestDataKey: rdTestDataValue},
+	}
+	wd := makeWD(ns, rdTestWDDelConflict, templateWithConfigMap(cmName))
+
+	s := rdTestScheme(t)
+	require.NoError(t, corev1.AddToScheme(s))
+
+	// Build the WD to a state where the finalizer and companion are already present.
+	realCl := fake.NewClientBuilder().
+		WithScheme(s).
+		WithObjects(srcCM, wd).
+		WithStatusSubresource(wd).
+		Build()
+
+	// Scoped block so the setup controller and its locals do not leak into the
+	// deletion phase below, which declares its own cn/ctx/req.
+	{
+		cn := multicluster.ClusterName("setup")
+		ctx := mccontext.WithCluster(context.Background(), cn)
+		setupMgr := &fakeMCManager{
+			clusters: map[string]cluster.Cluster{
+				"setup": &fakeCluster{cl: realCl},
+			},
+		}
+		setupC := &ReferencedDataController{mgr: setupMgr, opts: ReferencedDataControllerOptions{}}
+		req := mcreconcile.Request{
+			Request:     reconcile.Request{NamespacedName: types.NamespacedName{Namespace: ns, Name: rdTestWDDelConflict}},
+			ClusterName: cn,
+		}
+		// Reconcile twice: first stamps finalizer, second materialises companion.
+		_, err := setupC.Reconcile(ctx, req)
+		require.NoError(t, err)
+		_, err = setupC.Reconcile(ctx, req)
+		require.NoError(t, err)
+	}
+
+	// Verify setup: finalizer present, companion exists.
+	wdObj := getWD(t, realCl, types.NamespacedName{Namespace: ns, Name: rdTestWDDelConflict})
+	require.Contains(t, wdObj.Finalizers, referencedDataFinalizer)
+	getCompanionCM(t, realCl, ns, companionName)
+
+	// Now delete the WD.
+	require.NoError(t, realCl.Delete(context.Background(), wdObj))
+
+	// Intercept the first Update during deletion and return a conflict.
+	// This client is built without objects of its own: Get, Delete and
+	// WorkloadDeployment Update are all answered by realCl.
+	var updateCalls atomic.Int32
+	wdGR := schema.GroupResource{Group: "compute.datumapis.com", Resource: "workloaddeployments"}
+	cl := fake.NewClientBuilder().
+		WithScheme(s).
+		WithInterceptorFuncs(interceptor.Funcs{
+			Update: func(ctx context.Context, c client.WithWatch, obj client.Object, opts ...client.UpdateOption) error {
+				if _, ok := obj.(*computev1alpha.WorkloadDeployment); ok {
+					if updateCalls.Add(1) == 1 {
+						return apierrors.NewConflict(wdGR, obj.GetName(),
+							fmt.Errorf("the object has been modified; please apply your changes to the latest version and try again"))
+					}
+					return realCl.Update(ctx, obj, opts...)
+				}
+				// Updates of any other kind go to the client handed to the
+				// interceptor, not to realCl.
+				return c.Update(ctx, obj, opts...)
+			},
+			Get: func(ctx context.Context, _ client.WithWatch, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
+				return realCl.Get(ctx, key, obj, opts...)
+			},
+			Delete: func(ctx context.Context, _ client.WithWatch, obj client.Object, opts ...client.DeleteOption) error {
+				return realCl.Delete(ctx, obj, opts...)
+			},
+		}).
+		Build()
+
+	cn := multicluster.ClusterName("del-cluster")
+	mgr := &fakeMCManager{
+		clusters: map[string]cluster.Cluster{
+			"del-cluster": &fakeCluster{cl: cl},
+		},
+	}
+	c := &ReferencedDataController{mgr: mgr, opts: ReferencedDataControllerOptions{}}
+	ctx := mccontext.WithCluster(context.Background(), cn)
+	req := mcreconcile.Request{
+		Request:     reconcile.Request{NamespacedName: types.NamespacedName{Namespace: ns, Name: rdTestWDDelConflict}},
+		ClusterName: cn,
+	}
+
+	_, err := c.Reconcile(ctx, req)
+	require.NoError(t, err, "reconcile must succeed even when the first Update conflicts during deletion")
+
+	// When all finalizers are removed the fake client GCs the object immediately,
+	// so we expect either: (a) the object is gone, or (b) it exists without our
+	// finalizer. Both outcomes confirm the finalizer was removed correctly.
+	var finalObj computev1alpha.WorkloadDeployment
+	getErr := realCl.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: rdTestWDDelConflict}, &finalObj)
+	if getErr == nil {
+		assert.NotContains(t, finalObj.Finalizers, referencedDataFinalizer,
+			"finalizer must be removed after conflict-retried Update on deletion")
+	} else {
+		// Any error other than NotFound is a genuine failure of the lookup.
+		require.True(t, apierrors.IsNotFound(getErr),
+			"expected object to be gone or exist without finalizer, got: %v", getErr)
+	}
+
+	assert.GreaterOrEqual(t, int(updateCalls.Load()), 2,
+		"Update must have been called at least twice during deletion (conflict + retry)")
+}
+
+// ─── Regression: two WDs sharing a source, interleaved reconciles ─────────────
+
+// TestReferencedData_RefCount_ConcurrentInterleaved verifies that when two WDs
+// share the same source ConfigMap and their reconciles are interleaved, both
+// ref-count entries are preserved and the companion is not orphaned or deleted.
+//
+// The reconciles are issued one after another from the test goroutine:
+// "interleaved" describes the order of the calls (wd-c1, wd-c2, wd-c1, wd-c2),
+// not parallel execution.
+//
+// This is the regression test for the ref-count race (fix 2).
+func TestReferencedData_RefCount_ConcurrentInterleaved(t *testing.T) {
+	ns := rdTestNamespace
+	cmName := "shared-concurrent-config"
+	companionName := referenceddata.CompanionName("ConfigMap", cmName)
+
+	srcCM := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName},
+		Data:       map[string]string{"k": "v"},
+	}
+	// Both deployments reference the same source ConfigMap.
+	wd1 := makeWD(ns, "wd-c1", templateWithConfigMap(cmName))
+	wd2 := makeWD(ns, "wd-c2", templateWithConfigMap(cmName))
+
+	cl := fake.NewClientBuilder().
+		WithScheme(rdTestScheme(t)).
+		WithObjects(srcCM, wd1, wd2).
+		WithStatusSubresource(wd1, wd2).
+		Build()
+
+	c, clusterName := newRDController(t, cl, nil)
+
+	// Stamp finalizers for both WDs.
+	reconcileWD(t, c, clusterName, ns, "wd-c1")
+	reconcileWD(t, c, clusterName, ns, "wd-c2")
+
+	// Interleave: wd-c1 writes companion (with its ref), then wd-c2 should
+	// read the companion fresh and add its own ref — not start from scratch.
+	reconcileWD(t, c, clusterName, ns, "wd-c1")
+	reconcileWD(t, c, clusterName, ns, "wd-c2")
+
+	companion := getCompanionCM(t, cl, ns, companionName)
+	refs, err := decodeRefCount(companion.Annotations)
+	require.NoError(t, err)
+	assert.Len(t, refs, 2, "both WD ref-count entries must be present after interleaved reconciles")
+	assert.Contains(t, refs, types.NamespacedName{Namespace: ns, Name: "wd-c1"}.String())
+	assert.Contains(t, refs, types.NamespacedName{Namespace: ns, Name: "wd-c2"}.String())
+
+	// Delete wd-c1; companion must survive with only wd-c2's entry.
+	// The reconcile after the Delete is what releases wd-c1's reference.
+	wd1 = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: "wd-c1"})
+	require.NoError(t, cl.Delete(context.Background(), wd1))
+	reconcileWD(t, c, clusterName, ns, "wd-c1")
+
+	companion = getCompanionCM(t, cl, ns, companionName)
+	refs2, err := decodeRefCount(companion.Annotations)
+	require.NoError(t, err)
+	assert.Len(t, refs2, 1, "companion must survive with wd-c2 still referencing it")
+	assert.Contains(t, refs2, types.NamespacedName{Namespace: ns, Name: "wd-c2"}.String(),
+		"wd-c2 entry must remain in ref-count after wd-c1 deletion")
+}
+
+// ─── Regression: ref-count RMW must be atomic (silent lost-update without race)─
+
+// TestReferencedData_RefCount_ConflictForcesReread verifies that when a
+// concurrent WD adds its ref-count key between wd-race-1's GET of the companion
+// and wd-race-1's Update, the resulting conflict causes RetryOnConflict to
+// re-read the companion (now containing the concurrent key) so that the final
+// annotation carries ALL keys — including the one written concurrently.
+//
+// Race anatomy:
+//  1. wd-race-1's outer GET returns companion at R1 with [wd2Key].
+//  2. Concurrently, wd-race-3 adds wd3Key to the companion, advancing it to R2
+//     ([wd2Key, wd3Key]).
+//  3. wd-race-1 computes [wd1Key, wd2Key] from R1 and calls Update.
+//
+// Under the FIXED code the Update carries R1, which now conflicts with R2.
+// RetryOnConflict re-runs the loop: the fresh GET returns R2 [wd2Key, wd3Key],
+// refCountAdd produces [wd1Key, wd2Key, wd3Key], and the Update succeeds. All
+// three keys are present. ✓
+//
+// Under the OLD double-GET code ApplyConfigMap issued its own internal GET
+// (returning R2 [wd2Key, wd3Key]) and then called mergeAnnotations, which
+// overwrote the referenced-by key with the R1-derived [wd1Key, wd2Key], silently
+// dropping wd3Key. The Update at R2 succeeded with no conflict, so
+// RetryOnConflict never fired. Final: [wd1Key, wd2Key] — wd3Key lost. ✗
+//
+// The interceptor simulates the concurrent write by:
+//   - On the first GET of the companion ConfigMap, reading R1 into the caller's
+//     object and then immediately writing wd3Key into the companion on realCl
+//     (R2), before control returns to the reconciler.
+//   - Letting the Update pass through to realCl unmodified.
+//
+// With the fixed code the Update at R1 conflicts with R2, the retry re-reads,
+// and all three keys survive. With the old code the internal GET returns R2
+// with wd3Key but mergeAnnotations drops it, the Update at R2 silently
+// succeeds, wd3Key is lost, and this test fails.
+func TestReferencedData_RefCount_ConflictForcesReread(t *testing.T) {
+	ns := rdTestNamespace
+	cmName := "shared-race-config"
+	companionName := referenceddata.CompanionName("ConfigMap", cmName)
+
+	wd1Key := types.NamespacedName{Namespace: ns, Name: "wd-race-1"}.String()
+	wd2Key := types.NamespacedName{Namespace: ns, Name: "wd-race-2"}.String()
+	wd3Key := types.NamespacedName{Namespace: ns, Name: "wd-race-3"}.String()
+
+	srcCM := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmName},
+		Data:       map[string]string{"k": "v"},
+	}
+	wd1 := makeWD(ns, "wd-race-1", templateWithConfigMap(cmName))
+	wd2 := makeWD(ns, "wd-race-2", templateWithConfigMap(cmName))
+	wd3 := makeWD(ns, "wd-race-3", templateWithConfigMap(cmName))
+
+	s := rdTestScheme(t)
+
+	// realCl is the ground-truth store. All intercepted operations are forwarded
+	// here so state changes persist across the retry.
+	realCl := fake.NewClientBuilder().
+		WithScheme(s).
+		WithObjects(srcCM, wd1, wd2, wd3).
+		WithStatusSubresource(wd1, wd2, wd3).
+		Build()
+
+	// Phase 1: stamp finalizers on all three WDs.
+	{
+		setupC, cn := newRDController(t, realCl, nil)
+		reconcileWD(t, setupC, cn, ns, "wd-race-1")
+		reconcileWD(t, setupC, cn, ns, "wd-race-2")
+		reconcileWD(t, setupC, cn, ns, "wd-race-3")
+	}
+
+	// Phase 2: fully materialise wd-race-2 so the companion already exists
+	// with wd2Key in its ref-count before the race starts.
+	{
+		setupC, cn := newRDController(t, realCl, nil)
+		reconcileWD(t, setupC, cn, ns, "wd-race-2")
+	}
+
+	// Sanity: companion exists with wd2Key only.
+	companion := getCompanionCM(t, realCl, ns, companionName)
+	initialRefs, err := decodeRefCount(companion.Annotations)
+	require.NoError(t, err)
+	require.Contains(t, initialRefs, wd2Key, "setup: wd-race-2 key must be in companion before race test")
+	require.NotContains(t, initialRefs, wd1Key)
+	require.NotContains(t, initialRefs, wd3Key)
+
+	// concurrentWriteDone ensures the concurrent write injected by the interceptor
+	// is attempted exactly once (on the first GET of the companion). It is set
+	// before the write runs, so the write's own errors are asserted separately
+	// inside the interceptor.
+	var concurrentWriteDone atomic.Bool
+
+	// interceptedCl is used only for the wd-race-1 reconcile. It intercepts:
+	//   • GET of the companion: on the first call, once the caller's object has
+	//     been filled with R1, it immediately writes wd3Key into the companion
+	//     on realCl, advancing it to R2. This simulates wd-race-3 writing its
+	//     ref-count key after wd-race-1's GET but before wd-race-1's Update.
+	//   • Update, Create, Delete, Patch and SubResourceUpdate are forwarded to
+	//     realCl unchanged.
+	interceptedCl := fake.NewClientBuilder().
+		WithScheme(s).
+		WithInterceptorFuncs(interceptor.Funcs{
+			Get: func(ctx context.Context, _ client.WithWatch, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
+				err := realCl.Get(ctx, key, obj, opts...)
+				if err != nil {
+					return err
+				}
+				// obj now holds R1. Inject wd-race-3's key into the companion on
+				// realCl so the stored object advances to R2 before wd-race-1
+				// calls Update.
+				cm, isCM := obj.(*corev1.ConfigMap)
+				if isCM && cm.Name == companionName && !concurrentWriteDone.Swap(true) {
+					// Work on a separate copy so the caller's R1 view is untouched.
+					bump := &corev1.ConfigMap{}
+					gerr := realCl.Get(ctx, types.NamespacedName{Namespace: cm.Namespace, Name: cm.Name}, bump)
+					// Surface a failed injection instead of silently skipping it;
+					// otherwise the failure would only show up later as a
+					// misleading "wd3Key lost" assertion.
+					if assert.NoError(t, gerr, "injecting concurrent write: re-reading companion from realCl") {
+						// Add wd3Key to the ref-count to simulate the concurrent write.
+						// The second result of refCountAdd is deliberately ignored,
+						// as only the resulting key set is needed here.
+						newRefs, _ := refCountAdd(bump.Annotations, wd3Key)
+						if bump.Annotations == nil {
+							bump.Annotations = make(map[string]string)
+						}
+						bump.Annotations[companionRefCountAnnotation] = encodeRefCount(newRefs)
+						// Advances resourceVersion to R2.
+						assert.NoError(t, realCl.Update(ctx, bump), "injecting concurrent write: updating companion on realCl")
+					}
+				}
+				return nil
+			},
+			Update: func(ctx context.Context, _ client.WithWatch, obj client.Object, opts ...client.UpdateOption) error {
+				return realCl.Update(ctx, obj, opts...)
+			},
+			Create: func(ctx context.Context, _ client.WithWatch, obj client.Object, opts ...client.CreateOption) error {
+				return realCl.Create(ctx, obj, opts...)
+			},
+			Delete: func(ctx context.Context, _ client.WithWatch, obj client.Object, opts ...client.DeleteOption) error {
+				return realCl.Delete(ctx, obj, opts...)
+			},
+			Patch: func(ctx context.Context, _ client.WithWatch, obj client.Object, patch client.Patch, opts ...client.PatchOption) error {
+				return realCl.Patch(ctx, obj, patch, opts...)
+			},
+			SubResourceUpdate: func(ctx context.Context, _ client.Client, subResource string, obj client.Object, opts ...client.SubResourceUpdateOption) error {
+				return realCl.SubResource(subResource).Update(ctx, obj, opts...)
+			},
+		}).
+		Build()
+
+	c, clusterName := newRDController(t, interceptedCl, nil)
+
+	// Reconcile wd-race-1. The interceptor advances the companion's
+	// resourceVersion (adding wd3Key) after the outer GET returns.
+	//
+	// FIXED code: the Update carries R1 which conflicts with R2 → RetryOnConflict
+	// re-reads at R2 (with wd3Key present), computes [wd1Key, wd2Key, wd3Key],
+	// Update succeeds → all three keys present.
+	//
+	// OLD code: ApplyConfigMap's internal GET sees R2 (with wd3Key), but
+	// mergeAnnotations overwrites referenced-by with the R1-derived [wd1Key,
+	// wd2Key], dropping wd3Key. Update at R2 succeeds silently → wd3Key lost.
+	reconcileWD(t, c, clusterName, ns, "wd-race-1")
+
+	// Assert: the concurrent write was actually injected (otherwise the test is vacuous).
+	require.True(t, concurrentWriteDone.Load(), "interceptor must have injected the concurrent wd3Key write")
+
+	// Assert: all three keys must be in the companion ref-count after reconcile.
+	finalCompanion := getCompanionCM(t, realCl, ns, companionName)
+	finalRefs, err := decodeRefCount(finalCompanion.Annotations)
+	require.NoError(t, err)
+	assert.Contains(t, finalRefs, wd1Key,
+		"wd-race-1 key must be in companion ref-count after conflict-retry")
+	assert.Contains(t, finalRefs, wd2Key,
+		"wd-race-2 key must be preserved in companion ref-count")
+	assert.Contains(t, finalRefs, wd3Key,
+		"wd-race-3 key (concurrent write) must NOT be lost by wd-race-1's reconcile")
+	assert.Len(t, finalRefs, 3,
+		"companion ref-count must contain all three WD keys (no lost update)")
+}
+
+// ─── Regression: optional missing source → WD not failed ──────────────────────
+
+// TestReferencedData_OptionalMissingSource_Skipped verifies that a source
+// marked optional=true that does not exist is silently skipped: the WD is NOT
+// set to Failed/SourceNotFound, and the companion for the optional source is
+// not expected.
+//
+// This is the regression test for the optional source escape hatch (fix 3).
+func TestReferencedData_OptionalMissingSource_Skipped(t *testing.T) {
+	ns := rdTestNamespace
+
+	// A required ConfigMap that exists.
+	cmRequired := "required-config"
+	cmOptional := "optional-config"
+	companionRequired := referenceddata.CompanionName("ConfigMap", cmRequired)
+
+	srcRequired := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: cmRequired},
+		Data:       map[string]string{rdTestDataKey: rdTestDataValue},
+	}
+
+	// Template references both: required (no optional flag) and optional.
+	// optionalTrue exists only so its address can be used for the *bool field.
+	optionalTrue := true
+	template := computev1alpha.InstanceTemplateSpec{
+		Spec: computev1alpha.InstanceSpec{
+			Volumes: []computev1alpha.InstanceVolume{
+				{
+					Name: "req-vol",
+					VolumeSource: computev1alpha.VolumeSource{
+						ConfigMap: &corev1.ConfigMapVolumeSource{
+							LocalObjectReference: corev1.LocalObjectReference{Name: cmRequired},
+						},
+					},
+				},
+				{
+					Name: "opt-vol",
+					VolumeSource: computev1alpha.VolumeSource{
+						ConfigMap: &corev1.ConfigMapVolumeSource{
+							LocalObjectReference: corev1.LocalObjectReference{Name: cmOptional},
+							Optional:             &optionalTrue,
+						},
+					},
+				},
+			},
+		},
+	}
+	wd := makeWD(ns, "wd-opt", template)
+
+	// Only the required source is seeded; the optional one is deliberately absent.
+	cl := fake.NewClientBuilder().
+		WithScheme(rdTestScheme(t)).
+		WithObjects(srcRequired, wd).
+		WithStatusSubresource(wd).
+		Build()
+
+	c, clusterName := newRDController(t, cl, nil)
+
+	// Two passes: first stamps finalizer, second resolves.
+	reconcileWD(t, c, clusterName, ns, "wd-opt")
+	reconcileWD(t, c, clusterName, ns, "wd-opt")
+
+	wd = getWD(t, cl, types.NamespacedName{Namespace: ns, Name: "wd-opt"})
+
+	// The WD must NOT have a False/SourceNotFound condition.
+	cond := apimeta.FindStatusCondition(wd.Status.Conditions, computev1alpha.ReferencedDataReady)
+	require.NotNil(t, cond, "ReferencedDataReady condition must be set")
+	assert.Equal(t, metav1.ConditionTrue, cond.Status,
+		"WD must be Ready when only optional sources are missing")
+	assert.Equal(t, computev1alpha.ReferencedDataReasonReady, cond.Reason)
+
+	// The required companion must exist.
+	_ = getCompanionCM(t, cl, ns, companionRequired)
+
+	// The optional companion must NOT exist. Require NotFound specifically so an
+	// unrelated lookup failure cannot be mistaken for "companion absent".
+	var phantom corev1.ConfigMap
+	err := cl.Get(context.Background(),
+		types.NamespacedName{Namespace: ns, Name: referenceddata.CompanionName("ConfigMap", cmOptional)}, &phantom)
+	require.Error(t, err, "companion for optional missing source must not be created")
+	assert.True(t, apierrors.IsNotFound(err),
+		"lookup of the optional companion must fail with NotFound, got: %v", err)
+
+	// The expected-set annotation must only list the required companion as a kind-qualified token.
+	expectedTokens := decodeExpectedAnnotation(t, wd)
+	assert.Equal(t, []string{referenceddata.CompanionToken(kindConfigMap, companionRequired)}, expectedTokens)
+}
+
+// ─── Regression: unparseable ref-count annotation → companion NOT deleted ─────
+
+// TestReferencedData_CorruptRefCount_NotDeleted verifies that when the
+// ref-count annotation on a companion is unparseable, the release path returns
+// an error (transient) and does NOT delete the companion. This guards against
+// data loss when the annotation is corrupt but other WDs may still reference
+// the companion.
+//
+// This is the regression test for fix 4.
+func TestReferencedData_CorruptRefCount_NotDeleted(t *testing.T) {
+	ns := rdTestNamespace
+	cmName := "shared-config-corrupt"
+	companionName := referenceddata.CompanionName("ConfigMap", cmName)
+	wdKey := types.NamespacedName{Namespace: ns, Name: "wd-corrupt"}.String()
+
+	// Companion already exists with a corrupt ref-count annotation.
+	companion := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: ns,
+			Name:      companionName,
+			Labels:    map[string]string{computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue},
+			Annotations: map[string]string{
+				companionRefCountAnnotation: `{not-valid-json}`,
+			},
+		},
+		Data: map[string]string{"k": "v"},
+	}
+
+	cl := fake.NewClientBuilder().
+		WithScheme(rdTestScheme(t)).
+		WithObjects(companion).
+		Build()
+
+	// Use a localCompanionWriter backed by the fake client.
+	writer := &localCompanionWriter{cl: cl}
+
+	ctrl := &ReferencedDataController{}
+	err := ctrl.releaseOneCompanion(context.Background(), nil, writer, ns, kindConfigMap, companionName, wdKey)
+	// require (not assert): err.Error() below would panic on a nil error.
+	require.Error(t, err, "corrupt ref-count annotation must cause releaseOneCompanion to return an error")
+	assert.Contains(t, err.Error(), "corrupt ref-count",
+		"error message must mention corrupt ref-count")
+
+	// Companion must NOT have been deleted.
+	var still corev1.ConfigMap
+	getErr := cl.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: companionName}, &still)
+	assert.NoError(t, getErr, "companion must NOT be deleted when ref-count annotation is corrupt")
+}
+
+// ─── Regression: federator status sync preserves ReferencedDataReady ──────────
+
+// TestFederator_StatusSync_PreservesReferencedDataReadyCondition verifies that
+// syncStatusFromDownstream does NOT overwrite the resolver-owned
+// ReferencedDataReady condition on the project WD with the downstream WD's
+// (empty or stale) copy.
+//
+// This is the regression test for fix 1.
+func TestFederator_StatusSync_PreservesReferencedDataReadyCondition(t *testing.T) {
+	t.Parallel()
+
+	// Project WD has the resolver's ReferencedDataReady=True condition.
+	resolverCond := metav1.Condition{
+		Type:    computev1alpha.ReferencedDataReady,
+		Status:  metav1.ConditionTrue,
+		Reason:  computev1alpha.ReferencedDataReasonReady,
+		Message: "All 1 referenced companion(s) are materialised",
+	}
+
+	wd := testWorkloadDeployment(withFinalizer, func(w *computev1alpha.WorkloadDeployment) {
+		w.Status.Conditions = []metav1.Condition{resolverCond}
+	})
+	projectClient := newProjectFakeClient(testProjectNamespace(), wd)
+
+	// Downstream WD has NO ReferencedDataReady condition (as would be the case
+	// when the cell hasn't set it, or when it was never populated downstream).
+	karmadaWD := &computev1alpha.WorkloadDeployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      testWDName,
+			Namespace: testKarmadaNSStr,
+			Labels:    map[string]string{cityCodeLabel: testCityCodeLAX},
+		},
+		Spec: computev1alpha.WorkloadDeploymentSpec{
+			CityCode:      testCityCodeLAX,
+			PlacementName: testDefaultPlacement,
+			WorkloadRef:   computev1alpha.WorkloadReference{Name: rdTestWorkloadName},
+			ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1},
+		},
+		Status: computev1alpha.WorkloadDeploymentStatus{
+			// Downstream status is left empty; in particular it carries NO
+			// ReferencedDataReady condition.
+		},
+	}
+
+	karmadaClient := newKarmadaFakeClient(
+		&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}},
+		karmadaWD,
+	)
+
+	r := newTestFederator(projectClient, karmadaClient)
+
+	_, err := r.Reconcile(context.Background(), reconcileRequest())
+	require.NoError(t, err)
+
+	// After reconcile, the project WD must still have ReferencedDataReady=True.
+	var updatedWD computev1alpha.WorkloadDeployment
+	require.NoError(t, projectClient.Get(context.Background(),
+		types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updatedWD))
+
+	cond := apimeta.FindStatusCondition(updatedWD.Status.Conditions, computev1alpha.ReferencedDataReady)
+	require.NotNil(t, cond, "ReferencedDataReady condition must still be present after federator status sync")
+	assert.Equal(t, metav1.ConditionTrue, cond.Status,
+		"resolver's ReferencedDataReady=True must be preserved by federator status sync")
+	assert.Equal(t, computev1alpha.ReferencedDataReasonReady, cond.Reason,
+		"resolver's Ready reason must be preserved by federator status sync")
+}
+
+// ─── Helpers ─────────────────────────────────────────────────────────────────
+
+// stubReader allows individual test cases to inject custom reader behaviour.
+type stubReader struct {
+	// getCM, when set, answers GetConfigMap calls.
+	getCM func(ctx context.Context, projectID, namespace, name string) (*corev1.ConfigMap, error)
+	// getSecret, when set, answers GetSecret calls.
+	getSecret func(ctx context.Context, projectID, namespace, name string) (*corev1.Secret, error)
+}
+
+// GetConfigMap hands the call to the injected getCM hook. Without a hook it
+// reports the ConfigMap as missing via an error wrapping
+// referenceddata.ErrSourceNotFound.
+func (s *stubReader) GetConfigMap(ctx context.Context, projectID, namespace, name string) (*corev1.ConfigMap, error) {
+	if s.getCM == nil {
+		return nil, fmt.Errorf("%w: ConfigMap %s", referenceddata.ErrSourceNotFound, name)
+	}
+	return s.getCM(ctx, projectID, namespace, name)
+}
+
+// GetSecret hands the call to the injected getSecret hook. Without a hook it
+// reports the Secret as missing via an error wrapping
+// referenceddata.ErrSourceNotFound.
+func (s *stubReader) GetSecret(ctx context.Context, projectID, namespace, name string) (*corev1.Secret, error) {
+	if s.getSecret == nil {
+		return nil, fmt.Errorf("%w: Secret %s", referenceddata.ErrSourceNotFound, name)
+	}
+	return s.getSecret(ctx, projectID, namespace, name)
+}
diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index f810e53f..7752bbfa 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -8,6 +8,7 @@ import ( "fmt" "slices" + "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -17,9 +18,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" @@ -34,6 +37,13 @@ import ( instancecontrolstateful "go.datum.net/compute/internal/controller/instancecontrol/stateful" ) +const ( + //
reasonReplicasAvailable is used in the ReplicasReady condition when replicas + // are either available or pending; it has no equivalent in the API constants + // package because it is an internal detail of this controller. + reasonReplicasAvailable = "ReplicasAvailable" +) + // WorkloadDeploymentReconciler reconciles a WorkloadDeployment object type WorkloadDeploymentReconciler struct { mgr mcmanager.Manager @@ -45,6 +55,11 @@ type WorkloadDeploymentReconciler struct { // actively removed if present), and the networking step is treated as // immediately ready. Defaults to true. NetworkingEnabled bool + + // enableReferencedDataGate mirrors FeatureFlagsConfig.EnableReferencedDataGate. + // When true, new Instances whose template references ConfigMaps or Secrets + // receive the ReferencedData scheduling gate at creation time. + enableReferencedDataGate bool } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;create;update;patch;delete @@ -102,6 +117,10 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco logger.Info("reconciling deployment") defer logger.Info("reconcile complete") + // Snapshot the existing status before any modifications so we can skip the + // Status().Update call when nothing changed (see loop-prevention comment below). 
+ existingStatus := *deployment.Status.DeepCopy() + // Collect all instances for this deployment listOpts := client.MatchingLabels{ computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), @@ -113,7 +132,8 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco } instanceControl := instancecontrolstateful.NewWithOptions(instancecontrolstateful.Options{ - NetworkingEnabled: r.NetworkingEnabled, + NetworkingEnabled: r.NetworkingEnabled, + EnableReferencedDataGate: r.enableReferencedDataGate, }) actions, err := instanceControl.GetActions(ctx, cl.GetScheme(), &deployment, instances.Items) @@ -171,7 +191,7 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco desiredReplicas = 0 } - currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates(ctx, cl.GetClient(), &deployment, instances.Items, networkReady) + currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, referencedDataBlockedReplicas, err := r.reconcileInstanceGates(ctx, cl.GetClient(), &deployment, instances.Items, networkReady) if err != nil { return ctrl.Result{}, err } @@ -183,18 +203,26 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco deployment.Status.ReadyReplicas = int32(readyReplicas) deployment.Status.ObservedGeneration = deployment.Generation - if quotaBlockedReplicas > 0 { + switch { + case quotaBlockedReplicas > 0: apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ Type: computev1alpha.WorkloadDeploymentReplicasReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), }) - } else { + case referencedDataBlockedReplicas > 0: + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + 
Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonAwaitingPropagation, + Message: fmt.Sprintf("%d of %d desired replicas are waiting for referenced data companions", referencedDataBlockedReplicas, desiredReplicas), + }) + default: apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ Type: computev1alpha.WorkloadDeploymentReplicasReady, Status: metav1.ConditionTrue, - Reason: "ReplicasAvailable", + Reason: reasonReplicasAvailable, Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), }) } @@ -233,12 +261,19 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco }) } - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) + // Skip the write when the status is unchanged. Without this guard the + // reconciler's own Status().Update would always produce a new resourceVersion, + // firing another Update event on the WD and creating an infinite reconcile loop + // before the predicate on For() was added. The guard is a belt-and-suspenders + // complement to the predicate: the predicate prevents re-queuing on own writes, + // and this guard avoids the superfluous API call entirely. 
+ if !equality.Semantic.DeepEqual(existingStatus, deployment.Status) { + if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { + return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) + } + logger.Info("deployment status updated") } - logger.Info("deployment status updated") - return ctrl.Result{}, nil } @@ -248,13 +283,17 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( deployment *computev1alpha.WorkloadDeployment, instances []computev1alpha.Instance, networkReady bool, -) (currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas int, err error) { +) (currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, referencedDataBlockedReplicas int, err error) { templateHash := instancecontrol.ComputeHash(deployment.Spec.Template) for _, instance := range instances { if apimeta.IsStatusConditionPresentAndEqual(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted, metav1.ConditionFalse) { quotaBlockedReplicas++ } + if apimeta.IsStatusConditionPresentAndEqual(instance.Status.Conditions, computev1alpha.ReferencedDataReady, metav1.ConditionFalse) { + referencedDataBlockedReplicas++ + } + // Spec.Controller is a nilable pointer; guard it before dereferencing the // scheduling gates so an instance without controller state cannot panic // the reconcile (mirrors the Status.Controller guard below). 
@@ -267,7 +306,7 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( instance.Spec.Controller.SchedulingGates = newGates return nil }); patchErr != nil { - return 0, 0, 0, 0, fmt.Errorf("failed updating instance: %w", patchErr) + return 0, 0, 0, 0, 0, fmt.Errorf("failed updating instance: %w", patchErr) } } } @@ -294,7 +333,69 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( readyReplicas++ } } - return currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, nil + return currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, referencedDataBlockedReplicas, nil +} + +// wdReferencedDataChangedPredicate returns a predicate for the WorkloadDeployment +// For() watch that fires on: +// - Any Create, Delete, or Generic event (always enqueue). +// - An Update event where metadata.generation changed (spec updated), OR where +// the ReferencedDataReady condition's Status, Reason, or Message changed. +// +// The predicate intentionally does NOT fire when only the Available or +// ReplicasReady conditions change, because those are written by this reconciler +// itself. Without this guard the reconciler's own Status().Update would re-enqueue +// itself on every run, creating a tight reconcile loop. The equality check before +// Status().Update is a complementary guard, but the predicate is the primary +// protection: it prevents re-enqueuing entirely so the workqueue stays quiet between +// meaningful state transitions. +// +// Loop prevention: the ReferencedDataController (the only other writer of the +// ReferencedDataReady condition) is the intended trigger. When it sets +// ReferencedDataReady=False/SourceNotFound the predicate passes and this +// reconciler re-runs, sees the resolver verdict in deployment.Status.Conditions, and +// promotes Available to ReferencedDataNotReady. Subsequent runs by this reconciler +// (which write Available but not ReferencedDataReady) are filtered out. 
+func wdReferencedDataChangedPredicate() predicate.Predicate { + return predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldWD, ok1 := e.ObjectOld.(*computev1alpha.WorkloadDeployment) + newWD, ok2 := e.ObjectNew.(*computev1alpha.WorkloadDeployment) + if !ok1 || !ok2 { + return true // be conservative when type assertion fails + } + // Spec change: always reconcile. + if oldWD.Generation != newWD.Generation { + return true + } + // ReferencedDataReady condition changed: reconcile so Available is + // updated to reflect the resolver's verdict. + return wdRefDataCondChanged( + apimeta.FindStatusCondition(oldWD.Status.Conditions, computev1alpha.ReferencedDataReady), + apimeta.FindStatusCondition(newWD.Status.Conditions, computev1alpha.ReferencedDataReady), + ) + }, + CreateFunc: func(_ event.CreateEvent) bool { return true }, + DeleteFunc: func(_ event.DeleteEvent) bool { return true }, + GenericFunc: func(_ event.GenericEvent) bool { return true }, + } +} + +// wdRefDataCondChanged returns true when the ReferencedDataReady condition's +// observable fields (Status, Reason, Message) differ between old and new. Presence +// changes (nil → non-nil or vice versa) are also treated as a change. The +// LastTransitionTime field is excluded because it changes on every status flip and +// would defeat the loop-prevention intent of wdReferencedDataChangedPredicate. +func wdRefDataCondChanged(old, new *metav1.Condition) bool { + if (old == nil) != (new == nil) { + return true // condition was added or removed + } + if old == nil { + return false // both nil — no change + } + return old.Status != new.Status || + old.Reason != new.Reason || + old.Message != new.Message } // reconcileNetworks ensures NetworkBindings and SubnetClaims exist for all @@ -549,16 +650,34 @@ func (r *WorkloadDeploymentReconciler) Finalize(ctx context.Context, obj client. 
return finalizer.Result{}, errDeploymentHasInstances } +// WorkloadDeploymentReconcilerOptions configures the WorkloadDeploymentReconciler. +type WorkloadDeploymentReconcilerOptions struct { + // EnableReferencedDataGate mirrors FeatureFlagsConfig.EnableReferencedDataGate. + EnableReferencedDataGate bool +} + // SetupWithManager sets up the controller with the Manager. -func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager) error { +func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager, opts ...WorkloadDeploymentReconcilerOptions) error { r.mgr = mgr + for _, o := range opts { + r.enableReferencedDataGate = o.EnableReferencedDataGate + } r.finalizers = finalizer.NewFinalizers() if err := r.finalizers.Register(workloadControllerFinalizer, r); err != nil { return fmt.Errorf("failed to register finalizer: %w", err) } b := mcbuilder.ControllerManagedBy(mgr). - For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). + // The predicate gates re-enqueuing on meaningful WD changes: spec updates + // (generation bump) or a ReferencedDataReady condition change written by + // ReferencedDataController. Without it, each Status().Update by this + // reconciler (writing Available/ReplicasReady) would re-enqueue itself, + // creating a tight loop and delaying the ReferencedDataReady signal from + // the resolver. + For(&computev1alpha.WorkloadDeployment{}, + mcbuilder.WithEngageWithLocalCluster(false), + mcbuilder.WithPredicates(wdReferencedDataChangedPredicate()), + ). Owns(&computev1alpha.Instance{}) // Only watch networking resources when the networking integration is enabled. 
diff --git a/internal/controller/workloaddeployment_controller_test.go b/internal/controller/workloaddeployment_controller_test.go index e343a17b..63f47017 100644 --- a/internal/controller/workloaddeployment_controller_test.go +++ b/internal/controller/workloaddeployment_controller_test.go @@ -13,6 +13,7 @@ import ( "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/finalizer" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" @@ -38,6 +39,10 @@ const ( // matching what the infra provider writes on Instances. wdTestReasonProgrammed = "Programmed" wdTestReasonReady = "Ready" + + // testMsgConfigMapNotFound is a representative terminal referenced-data + // message used across the Available-rollup unit tests. + testMsgConfigMapNotFound = `ConfigMap "app-config" not found in namespace "default"` ) // wdControllerTestDeployment builds a WorkloadDeployment fixture shaped like a @@ -166,7 +171,7 @@ func TestReconcileInstanceGates_NilController_DoesNotPanic(t *testing.T) { r := &WorkloadDeploymentReconciler{} // The call must not panic — that is the primary regression assertion. 
- currentReplicas, _, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates( + currentReplicas, _, readyReplicas, quotaBlockedReplicas, referencedDataBlockedReplicas, err := r.reconcileInstanceGates( context.Background(), cl, deployment, @@ -186,6 +191,7 @@ func TestReconcileInstanceGates_NilController_DoesNotPanic(t *testing.T) { assert.Equal(t, 1, readyReplicas, "instanceReady must be counted as ready") assert.Equal(t, 0, quotaBlockedReplicas) + assert.Equal(t, 0, referencedDataBlockedReplicas) } // TestReconcileInstanceGates_NilSpecController_DoesNotPanic is a regression test @@ -218,7 +224,7 @@ func TestReconcileInstanceGates_NilSpecController_DoesNotPanic(t *testing.T) { r := &WorkloadDeploymentReconciler{} require.NotPanics(t, func() { - _, _, _, _, err := r.reconcileInstanceGates( + _, _, _, _, _, err := r.reconcileInstanceGates( context.Background(), cl, deployment, @@ -287,7 +293,7 @@ func TestReconcileInstanceGates_ReplicaCounting(t *testing.T) { cl := newProjectFakeClient() r := &WorkloadDeploymentReconciler{} - currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates( + currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, _, err := r.reconcileInstanceGates( context.Background(), cl, deployment, @@ -331,7 +337,7 @@ func TestReconcileInstanceGates_ClearsNetworkSchedulingGate(t *testing.T) { cl := newProjectFakeClient(instance) r := &WorkloadDeploymentReconciler{} - _, _, _, _, err := r.reconcileInstanceGates( + _, _, _, _, _, err := r.reconcileInstanceGates( context.Background(), cl, deployment, @@ -355,7 +361,7 @@ func TestReconcileInstanceGates_ClearsNetworkSchedulingGate(t *testing.T) { cl := newProjectFakeClient(instance) r := &WorkloadDeploymentReconciler{} - _, _, _, _, err := r.reconcileInstanceGates( + _, _, _, _, _, err := r.reconcileInstanceGates( context.Background(), cl, deployment, @@ -430,3 +436,294 @@ func TestWorkloadDeploymentReconcile_FinalizerAddRequeues(t 
*testing.T) { assert.True(t, apimeta.IsStatusConditionTrue(updated.Status.Conditions, computev1alpha.WorkloadDeploymentReplicasReady), "no instances are quota-blocked, so ReplicasReady must be true") } + +// NOTE (split): the Available-condition unit tests that exercised +// selectWDBlockingCondition and wdBlockingReasonPriority (TestWDAvailableCondition_*, +// TestWDBlockingReasonPriority_WD, and the makeWDForAvailTest helper) were moved to +// layer E (split/refdata-blocking-reason), where those functions are introduced. +// They were authored against the E-refactored WD controller and cannot compile in +// this layer, which still carries the inline blocking-reason switch. + +// ─── wdRefDataCondChanged tests ─────────────────────────────────────────────── + +// TestWdRefDataCondChanged_BothNil verifies that two nil conditions are treated +// as unchanged (no predicate trigger). +func TestWdRefDataCondChanged_BothNil(t *testing.T) { + assert.False(t, wdRefDataCondChanged(nil, nil), + "both nil conditions must not be treated as a change") +} + +// TestWdRefDataCondChanged_AddedCondition verifies that a nil→non-nil transition +// (condition first appears on the WD) is treated as a change. +func TestWdRefDataCondChanged_AddedCondition(t *testing.T) { + newCond := &metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + } + assert.True(t, wdRefDataCondChanged(nil, newCond), + "nil→non-nil must be treated as a change") +} + +// TestWdRefDataCondChanged_RemovedCondition verifies that a non-nil→nil transition +// (condition removed) is treated as a change. 
+func TestWdRefDataCondChanged_RemovedCondition(t *testing.T) { + oldCond := &metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionTrue, + Reason: computev1alpha.ReferencedDataReasonReady, + } + assert.True(t, wdRefDataCondChanged(oldCond, nil), + "non-nil→nil must be treated as a change") +} + +// TestWdRefDataCondChanged_StatusChange verifies that a change in the condition's +// Status field is detected. +func TestWdRefDataCondChanged_StatusChange(t *testing.T) { + old := &metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.ReferencedDataReasonResolving, + } + new := &metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + } + assert.True(t, wdRefDataCondChanged(old, new), + "status change must be detected") +} + +// TestWdRefDataCondChanged_MessageChange verifies that a change in Message is +// detected, e.g. when the resolver updates the missing-object name. +func TestWdRefDataCondChanged_MessageChange(t *testing.T) { + old := &metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + Message: `ConfigMap "a" not found in namespace "default"`, + } + new := &metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + Message: `ConfigMap "b" not found in namespace "default"`, + } + assert.True(t, wdRefDataCondChanged(old, new), + "message change must be detected") +} + +// TestWdRefDataCondChanged_Identical verifies that an identical condition (same +// Status/Reason/Message, differing only in LastTransitionTime) is NOT treated as +// a change. 
This is the key no-self-trigger property: the reconciler's own +// Status().Update does not re-enqueue via the For() predicate. +func TestWdRefDataCondChanged_Identical(t *testing.T) { + t1 := metav1.Now() + t2 := metav1.Now() + old := &metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + Message: testMsgConfigMapNotFound, + LastTransitionTime: t1, + } + new := &metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + Message: testMsgConfigMapNotFound, + LastTransitionTime: t2, // different timestamp, same content + } + assert.False(t, wdRefDataCondChanged(old, new), + "identical conditions with differing LastTransitionTime must not be treated as a change") +} + +// ─── wdReferencedDataChangedPredicate tests ─────────────────────────────────── + +// makeWDPair builds two empty WorkloadDeployment objects (no conditions) for +// predicate Update tests. Callers mutate the returned objects before constructing +// the event.UpdateEvent. +func makeWDPair(gen int64) (old, new *computev1alpha.WorkloadDeployment) { + old = &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: wdControllerTestName, + Namespace: wdControllerTestNS, + Generation: gen, + }, + } + new = &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: wdControllerTestName, + Namespace: wdControllerTestNS, + Generation: gen, + }, + } + return old, new +} + +// TestWDPredicate_ReferencedDataReadyAdded verifies that the predicate passes +// when the resolver writes ReferencedDataReady=False for the first time. 
This is +// the primary gap-closure scenario: the WD had no ReferencedDataReady condition +// (reconciler wrote Available=InstancesProvisioning), the resolver adds +// ReferencedDataReady=False/SourceNotFound, and the predicate must fire so the +// reconciler re-runs and promotes Available to ReferencedDataNotReady. +func TestWDPredicate_ReferencedDataReadyAdded(t *testing.T) { + pred := wdReferencedDataChangedPredicate() + + oldWD, newWD := makeWDPair(1) + apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + Message: testMsgConfigMapNotFound, + LastTransitionTime: metav1.Now(), + }) + + e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} + assert.True(t, pred.Update(e), + "predicate must pass when ReferencedDataReady is added (nil→False/SourceNotFound)") +} + +// TestWDPredicate_ReferencedDataReadyCleared verifies that the predicate passes +// when the resolver sets ReferencedDataReady=True (source ConfigMap was created). +// The WD reconciler must re-run so Available can be promoted from +// ReferencedDataNotReady to StableInstanceFound (or InstancesProvisioning). 
+func TestWDPredicate_ReferencedDataReadyCleared(t *testing.T) { + pred := wdReferencedDataChangedPredicate() + + oldWD, newWD := makeWDPair(1) + apimeta.SetStatusCondition(&oldWD.Status.Conditions, metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + Message: testMsgConfigMapNotFound, + LastTransitionTime: metav1.Now(), + }) + apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionTrue, + Reason: computev1alpha.ReferencedDataReasonReady, + Message: "All companions are ready", + LastTransitionTime: metav1.Now(), + }) + + e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} + assert.True(t, pred.Update(e), + "predicate must pass when ReferencedDataReady flips from False to True") +} + +// TestWDPredicate_GenerationChanged verifies that the predicate passes when the +// WD's generation changes (spec update), even if the ReferencedDataReady condition +// did not change. +func TestWDPredicate_GenerationChanged(t *testing.T) { + pred := wdReferencedDataChangedPredicate() + + oldWD, newWD := makeWDPair(1) + newWD.Generation = 2 + + e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} + assert.True(t, pred.Update(e), + "predicate must pass when metadata.generation increases") +} + +// TestWDPredicate_AvailableOnlyChange verifies that the predicate DROPS updates +// where only the Available condition changed. This is the self-trigger prevention: +// after the WD reconciler writes Available=ReferencedDataNotReady, the predicate +// must not re-enqueue the same reconciler via the For() watch. +func TestWDPredicate_AvailableOnlyChange(t *testing.T) { + pred := wdReferencedDataChangedPredicate() + + // Both old and new have the SAME ReferencedDataReady condition. 
+ refDataCond := metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + Message: testMsgConfigMapNotFound, + LastTransitionTime: metav1.Now(), + } + oldWD, newWD := makeWDPair(1) + apimeta.SetStatusCondition(&oldWD.Status.Conditions, refDataCond) + apimeta.SetStatusCondition(&newWD.Status.Conditions, refDataCond) + + // The WD reconciler wrote Available=InstancesProvisioning (old) and then + // Available=ReferencedDataNotReady (new). ReferencedDataReady is unchanged. + // NOTE (split): the named reason constants land in layer E; literals are used + // here because the predicate only cares that the Available reason changed. + apimeta.SetStatusCondition(&oldWD.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "InstancesProvisioning", + Message: "Instances are being provisioned", + }) + apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ReferencedDataNotReady", + Message: testMsgConfigMapNotFound, + }) + + e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} + assert.False(t, pred.Update(e), + "predicate must drop update when only Available changed; "+ + "ReferencedDataReady is unchanged so the reconciler's own write must not re-enqueue itself") +} + +// TestWDPredicate_ReplicasReadyOnlyChange verifies that the predicate DROPS updates +// where only the ReplicasReady condition changed (also written by this reconciler), +// for the same self-trigger prevention reason as the Available-only case. 
+func TestWDPredicate_ReplicasReadyOnlyChange(t *testing.T) { + pred := wdReferencedDataChangedPredicate() + + refDataCond := metav1.Condition{ + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionTrue, + Reason: computev1alpha.ReferencedDataReasonReady, + Message: "all ready", + LastTransitionTime: metav1.Now(), + } + oldWD, newWD := makeWDPair(2) + apimeta.SetStatusCondition(&oldWD.Status.Conditions, refDataCond) + apimeta.SetStatusCondition(&newWD.Status.Conditions, refDataCond) + + // ReplicasReady changed (more instances became ready) but ReferencedDataReady is identical. + apimeta.SetStatusCondition(&oldWD.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionFalse, + Reason: reasonReplicasAvailable, + Message: "0/2 replicas available", + }) + apimeta.SetStatusCondition(&newWD.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionTrue, + Reason: reasonReplicasAvailable, + Message: "2/2 replicas available", + }) + + e := event.UpdateEvent{ObjectOld: oldWD, ObjectNew: newWD} + assert.False(t, pred.Update(e), + "predicate must drop update when only ReplicasReady changed") +} + +// TestWDPredicate_CreateAlwaysPasses verifies that Create events always trigger +// the reconciler regardless of conditions. +func TestWDPredicate_CreateAlwaysPasses(t *testing.T) { + pred := wdReferencedDataChangedPredicate() + e := event.CreateEvent{Object: &computev1alpha.WorkloadDeployment{}} + assert.True(t, pred.Create(e)) +} + +// TestWDPredicate_DeleteAlwaysPasses verifies that Delete events always trigger +// the reconciler. +func TestWDPredicate_DeleteAlwaysPasses(t *testing.T) { + pred := wdReferencedDataChangedPredicate() + e := event.DeleteEvent{Object: &computev1alpha.WorkloadDeployment{}} + assert.True(t, pred.Delete(e)) +} + +// TestWDPredicate_GenericAlwaysPasses verifies that Generic events (e.g. 
from +// external sources) always trigger the reconciler. +func TestWDPredicate_GenericAlwaysPasses(t *testing.T) { + pred := wdReferencedDataChangedPredicate() + e := event.GenericEvent{Object: &computev1alpha.WorkloadDeployment{}} + assert.True(t, pred.Generic(e)) +} diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go index 332978d7..9d0d44ba 100644 --- a/internal/controller/workloaddeployment_federator.go +++ b/internal/controller/workloaddeployment_federator.go @@ -10,8 +10,10 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" @@ -258,6 +260,23 @@ func (r *WorkloadDeploymentFederator) upsertDownstreamDeployment( kd.Labels[cityCodeLabel] = deployment.Spec.CityCode kd.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = deployment.Namespace kd.Spec = deployment.Spec + // Propagate controller-managed annotations from the project WD to the + // downstream WD. The cell reads the expected-referenced-data annotation + // to gate-clear instances; without this copy it would never arrive. + // Absence must mirror downstream too: the cell gate treats an absent + // annotation as "resolver hasn't run" (wait), distinct from an empty + // list ("nothing needed"). The resolver deletes the annotation — along + // with the companions — when the template drops all references, so a + // stale downstream copy would gate new instances forever on companions + // that no longer exist. 
+ if anno, ok := deployment.Annotations[computev1alpha.ExpectedReferencedDataAnnotation]; ok { + if kd.Annotations == nil { + kd.Annotations = make(map[string]string) + } + kd.Annotations[computev1alpha.ExpectedReferencedDataAnnotation] = anno + } else { + delete(kd.Annotations, computev1alpha.ExpectedReferencedDataAnnotation) + } return nil }) if err != nil { @@ -285,10 +304,17 @@ func (r *WorkloadDeploymentFederator) ensurePropagationPolicy( result, err := controllerutil.CreateOrPatch(ctx, r.FederationClient, pp, func() error { pp.Spec = karmadapolicyv1alpha1.PropagationSpec{ - // Select all WorkloadDeployments in this namespace that carry the - // city-code label. Using a label selector (rather than individual - // resource names) means that new deployments for this city are - // automatically picked up without updating the policy. + // Select WorkloadDeployments by city-code label, plus ALL + // companion ConfigMaps and Secrets in this namespace that carry the + // referenced-data label. The label selector on ConfigMap/Secret is + // city-code-agnostic — companions are shared across city codes when + // multiple WDs reference the same source. Karmada propagates the + // entire set to matching clusters in one policy, so companions + // co-arrive with their WorkloadDeployment. + // + // Using separate ResourceSelectors for each kind (WorkloadDeployment, + // ConfigMap, Secret) is the idiomatic Karmada pattern for + // multi-kind propagation within a single policy. ResourceSelectors: []karmadapolicyv1alpha1.ResourceSelector{ { APIVersion: computev1alpha.GroupVersion.String(), @@ -299,6 +325,28 @@ func (r *WorkloadDeploymentFederator) ensurePropagationPolicy( }, }, }, + { + // Propagate companion ConfigMaps alongside WorkloadDeployments. + // The referenced-data label is the only selector needed; there + // is no per-city partitioning of companions. 
+ APIVersion: corev1.SchemeGroupVersion.String(), + Kind: kindConfigMap, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + }, + }, + { + // Propagate companion Secrets alongside WorkloadDeployments. + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: kindSecret, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + computev1alpha.ReferencedDataLabel: computev1alpha.ReferencedDataLabelValue, + }, + }, + }, }, Placement: karmadapolicyv1alpha1.Placement{ // Route to clusters that carry the same city-code label. POP-cell @@ -324,8 +372,16 @@ func (r *WorkloadDeploymentFederator) ensurePropagationPolicy( } // syncStatusFromDownstream reads the aggregated status of the WorkloadDeployment -// from the downstream namespace and writes it back to the project-namespace +// from the downstream namespace and merges it back into the project-namespace // object. It is a no-op when the downstream object does not yet exist. +// +// Merge semantics: the resolver (ReferencedDataController) owns the +// ReferencedDataReady condition and any conditions with SourceNotFound, +// SourceUnauthorized, or SourceTooLarge reasons. This method preserves those +// conditions, overwriting only the downstream-owned portion of the status +// (replica counts, Programmed, Ready, etc.). Without this merge a concurrent +// federator status sync would overwrite the resolver's condition with whatever +// (empty or stale) value the downstream WD carries. 
func (r *WorkloadDeploymentFederator) syncStatusFromDownstream( ctx context.Context, projectClient client.Client, @@ -343,15 +399,40 @@ func (r *WorkloadDeploymentFederator) syncStatusFromDownstream( return fmt.Errorf("failed to get downstream deployment for status sync: %w", err) } - if equality.Semantic.DeepEqual(deployment.Status, kd.Status) { - return nil + // Build the merged status: start from downstream, then re-apply the + // resolver-owned ReferencedDataReady condition from the project WD so we + // never overwrite it with the downstream's copy. + merged := kd.Status.DeepCopy() + if resolverCond := apimeta.FindStatusCondition(deployment.Status.Conditions, computev1alpha.ReferencedDataReady); resolverCond != nil { + apimeta.SetStatusCondition(&merged.Conditions, *resolverCond) } - deployment.Status = kd.Status - if err := projectClient.Status().Update(ctx, deployment); err != nil { - return fmt.Errorf("failed to write downstream status back to project deployment: %w", err) + if equality.Semantic.DeepEqual(deployment.Status, *merged) { + return nil } - return nil + + // Wrap in RetryOnConflict so a concurrent annotation Patch by the resolver + // does not cause a hard error. The status write is idempotent from the + // perspective of the downstream fields it carries. + key := types.NamespacedName{Namespace: deployment.Namespace, Name: deployment.Name} + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := projectClient.Get(ctx, key, deployment); err != nil { + return err + } + // Re-compute merged on each attempt in case the resolver condition changed. 
+ merged = kd.Status.DeepCopy() + if resolverCond := apimeta.FindStatusCondition(deployment.Status.Conditions, computev1alpha.ReferencedDataReady); resolverCond != nil { + apimeta.SetStatusCondition(&merged.Conditions, *resolverCond) + } + if equality.Semantic.DeepEqual(deployment.Status, *merged) { + return nil + } + deployment.Status = *merged + if err := projectClient.Status().Update(ctx, deployment); err != nil { + return err + } + return nil + }) } // cleanupPropagationPolicyIfUnused deletes the PropagationPolicy for the given diff --git a/internal/controller/workloaddeployment_federator_test.go b/internal/controller/workloaddeployment_federator_test.go index 0b71f0a0..1b9f59be 100644 --- a/internal/controller/workloaddeployment_federator_test.go +++ b/internal/controller/workloaddeployment_federator_test.go @@ -59,7 +59,7 @@ func testWorkloadDeployment(opts ...func(*computev1alpha.WorkloadDeployment)) *c Spec: computev1alpha.WorkloadDeploymentSpec{ CityCode: testCityCodeLAX, WorkloadRef: computev1alpha.WorkloadReference{ - Name: "test-workload", + Name: rdTestWorkloadName, }, PlacementName: testDefaultPlacement, ScaleSettings: computev1alpha.HorizontalScaleSettings{ @@ -389,13 +389,27 @@ func TestWorkloadDeploymentFederator_FederatesToKarmada(t *testing.T) { }, &pp) require.NoError(t, err, "PropagationPolicy %q should exist", ppName) - // The PP must select WorkloadDeployments by the city-code label. - require.Len(t, pp.Spec.ResourceSelectors, 1) - sel := pp.Spec.ResourceSelectors[0] - assert.Equal(t, computev1alpha.GroupVersion.String(), sel.APIVersion) - assert.Equal(t, "WorkloadDeployment", sel.Kind) - require.NotNil(t, sel.LabelSelector) - assert.Equal(t, testCityCodeLAX, sel.LabelSelector.MatchLabels[cityCodeLabel]) + // The PP must have three selectors: WorkloadDeployment (city-code), ConfigMap + // (referenced-data), and Secret (referenced-data). 
+ require.Len(t, pp.Spec.ResourceSelectors, 3) + + wdSel := pp.Spec.ResourceSelectors[0] + assert.Equal(t, computev1alpha.GroupVersion.String(), wdSel.APIVersion) + assert.Equal(t, kindWorkloadDeployment, wdSel.Kind) + require.NotNil(t, wdSel.LabelSelector) + assert.Equal(t, testCityCodeLAX, wdSel.LabelSelector.MatchLabels[cityCodeLabel]) + + cmSel := pp.Spec.ResourceSelectors[1] + assert.Equal(t, "v1", cmSel.APIVersion) + assert.Equal(t, kindConfigMap, cmSel.Kind) + require.NotNil(t, cmSel.LabelSelector) + assert.Equal(t, computev1alpha.ReferencedDataLabelValue, cmSel.LabelSelector.MatchLabels[computev1alpha.ReferencedDataLabel]) + + secretSel := pp.Spec.ResourceSelectors[2] + assert.Equal(t, "v1", secretSel.APIVersion) + assert.Equal(t, kindSecret, secretSel.Kind) + require.NotNil(t, secretSel.LabelSelector) + assert.Equal(t, computev1alpha.ReferencedDataLabelValue, secretSel.LabelSelector.MatchLabels[computev1alpha.ReferencedDataLabel]) // The PP cluster affinity must target clusters carrying the same city-code. require.NotNil(t, pp.Spec.Placement.ClusterAffinity) @@ -538,6 +552,103 @@ func TestCleanupPropagationPolicyIfUnused_EmptyCityCode(t *testing.T) { assert.Contains(t, err.Error(), "city code is empty") } +// TestWorkloadDeploymentFederator_PropagationPolicyHasReferencedDataSelectors +// verifies that the PropagationPolicy always includes ConfigMap and Secret +// selectors for the referenced-data label in addition to the WorkloadDeployment +// city-code selector. This is the always-on companion co-propagation. 
+func TestWorkloadDeploymentFederator_PropagationPolicyHasReferencedDataSelectors(t *testing.T) { + t.Parallel() + + wd := testWorkloadDeployment(withFinalizer) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + _, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + + ppName := propagationPolicyNameFor(testCityCodeLAX) + var pp karmadapolicyv1alpha1.PropagationPolicy + require.NoError(t, karmadaClient.Get(context.Background(), types.NamespacedName{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, &pp)) + + require.Len(t, pp.Spec.ResourceSelectors, 3, "PP must have WD + ConfigMap + Secret selectors") + + kinds := make(map[string]bool) + for _, sel := range pp.Spec.ResourceSelectors { + kinds[sel.Kind] = true + } + assert.True(t, kinds[kindWorkloadDeployment], "PP must select WorkloadDeployments") + assert.True(t, kinds[kindConfigMap], "PP must select ConfigMaps with referenced-data label") + assert.True(t, kinds[kindSecret], "PP must select Secrets with referenced-data label") + + // Verify the ConfigMap and Secret selectors match on the referenced-data label. 
+ for _, sel := range pp.Spec.ResourceSelectors { + if sel.Kind == kindConfigMap || sel.Kind == kindSecret { + require.NotNil(t, sel.LabelSelector) + assert.Equal(t, computev1alpha.ReferencedDataLabelValue, sel.LabelSelector.MatchLabels[computev1alpha.ReferencedDataLabel], + "%s selector must match referenced-data=true label", sel.Kind) + } + } +} + +// TestWorkloadDeploymentFederator_AnnotationPropagation verifies that the +// federator mirrors the expected-referenced-data annotation from the project WD +// to the downstream (Karmada hub) WD in both directions: copied while present +// so the cell can gate-clear, and deleted once the resolver removes it (the +// cell gate reads absence as "resolver hasn't run", so a stale downstream copy +// would gate new instances forever on companions that no longer exist). +func TestWorkloadDeploymentFederator_AnnotationPropagation(t *testing.T) { + t.Parallel() + + const expectedAnno = `["ConfigMap/app-config","Secret/db-creds"]` + + wd := testWorkloadDeployment(withFinalizer, func(w *computev1alpha.WorkloadDeployment) { + w.Annotations = map[string]string{ + computev1alpha.ExpectedReferencedDataAnnotation: expectedAnno, + } + }) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + ctx := context.Background() + _, err := r.Reconcile(ctx, reconcileRequest()) + require.NoError(t, err) + + karmadaWDKey := types.NamespacedName{ + Name: testWDName, + Namespace: testKarmadaNSStr, + } + var karmadaWD computev1alpha.WorkloadDeployment + require.NoError(t, karmadaClient.Get(ctx, karmadaWDKey, &karmadaWD)) + + got := karmadaWD.Annotations[computev1alpha.ExpectedReferencedDataAnnotation] + assert.Equal(t, expectedAnno, got, + "federator must propagate expected-referenced-data annotation to the downstream WD") + + // The resolver deletes the annotation from the project WD when the template + // drops all references. 
The next upsert must delete the downstream copy too. + var projectWD computev1alpha.WorkloadDeployment + require.NoError(t, projectClient.Get(ctx, types.NamespacedName{ + Name: testWDName, + Namespace: testProjNS, + }, &projectWD)) + delete(projectWD.Annotations, computev1alpha.ExpectedReferencedDataAnnotation) + require.NoError(t, projectClient.Update(ctx, &projectWD)) + + _, err = r.Reconcile(ctx, reconcileRequest()) + require.NoError(t, err) + + karmadaWD = computev1alpha.WorkloadDeployment{} + require.NoError(t, karmadaClient.Get(ctx, karmadaWDKey, &karmadaWD)) + _, stale := karmadaWD.Annotations[computev1alpha.ExpectedReferencedDataAnnotation] + assert.False(t, stale, + "federator must delete the downstream annotation once the project WD no longer carries it") +} + // TestWorkloadDeploymentFederator_NotFound verifies that a missing // WorkloadDeployment is handled gracefully (no error, no action). func TestWorkloadDeploymentFederator_NotFound(t *testing.T) { diff --git a/internal/referenceddata/collector.go b/internal/referenceddata/collector.go new file mode 100644 index 00000000..2975c82b --- /dev/null +++ b/internal/referenceddata/collector.go @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package referenceddata + +import ( + "slices" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// CollectFromTemplate walks an InstanceTemplateSpec and returns a deduplicated, +// deterministically-sorted ReferencedSet of all ConfigMaps and Secrets +// referenced by: +// - container env.ValueFrom.ConfigMapKeyRef / SecretKeyRef +// - container envFrom[].configMapRef / secretRef +// - spec.volumes[].configMap and spec.volumes[].secret +// +// The namespace field on every returned ObjectRef is set to the provided +// namespace (the Workload's namespace). References are always same-namespace. 
+func CollectFromTemplate(namespace string, template computev1alpha.InstanceTemplateSpec) ReferencedSet { + seen := make(map[string]struct{}) + var refs []ObjectRef + + add := func(kind, name string) { + if name == "" { + return + } + key := kind + "/" + name + if _, ok := seen[key]; ok { + return + } + seen[key] = struct{}{} + refs = append(refs, ObjectRef{ + Kind: kind, + Name: name, + Namespace: namespace, + }) + } + + // Collect from sandbox containers. + if sb := template.Spec.Runtime.Sandbox; sb != nil { + for _, c := range sb.Containers { + // env[].valueFrom + for _, e := range c.Env { + if e.ValueFrom == nil { + continue + } + if e.ValueFrom.ConfigMapKeyRef != nil { + add("ConfigMap", e.ValueFrom.ConfigMapKeyRef.Name) + } + if e.ValueFrom.SecretKeyRef != nil { + add("Secret", e.ValueFrom.SecretKeyRef.Name) + } + } + + // envFrom[] + // Skip entries that have both refs set — validateEnvFrom rejects + // them, so there is no valid single source to collect from. + for _, ef := range c.EnvFrom { + if ef.ConfigMapRef != nil && ef.SecretRef != nil { + continue + } + if ef.ConfigMapRef != nil { + add("ConfigMap", ef.ConfigMapRef.Name) + } + if ef.SecretRef != nil { + add("Secret", ef.SecretRef.Name) + } + } + } + } + + // Collect from volumes. + for _, v := range template.Spec.Volumes { + if v.ConfigMap != nil { + add("ConfigMap", v.ConfigMap.Name) + } + if v.Secret != nil { + add("Secret", v.Secret.SecretName) + } + } + + // Sort deterministically: kind descending (Secret > ConfigMap) then name + // ascending within kind. This ordering is stable and matches the companion + // name sort used by the expected-set annotation. + slices.SortFunc(refs, func(a, b ObjectRef) int { + if a.Kind != b.Kind { + // "Secret" > "ConfigMap" lexicographically — sort ascending so + // ConfigMap comes first, then Secret. 
+ if a.Kind < b.Kind { + return -1 + } + return 1 + } + if a.Name < b.Name { + return -1 + } + if a.Name > b.Name { + return 1 + } + return 0 + }) + + return ReferencedSet(refs) +} + +// CollectFromSpec is a convenience wrapper around CollectFromTemplate for +// callers (e.g. the admission webhook validator) that already hold a bare +// InstanceSpec rather than the full InstanceTemplateSpec. +func CollectFromSpec(namespace string, spec computev1alpha.InstanceSpec) ReferencedSet { + return CollectFromTemplate(namespace, computev1alpha.InstanceTemplateSpec{Spec: spec}) +} + +// TemplateReferencesData returns true if the template references at least one +// ConfigMap or Secret. Used by the stateful instance controller to decide +// whether to stamp the ReferencedData scheduling gate. +func TemplateReferencesData(template computev1alpha.InstanceTemplateSpec) bool { + return len(CollectFromTemplate("", template)) > 0 +} diff --git a/internal/referenceddata/collector_test.go b/internal/referenceddata/collector_test.go new file mode 100644 index 00000000..9a2fb1a3 --- /dev/null +++ b/internal/referenceddata/collector_test.go @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package referenceddata + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + "k8s.io/utils/ptr" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +const ( + testContainerImage = "img" + testEnvConfigMap = "app-config" + testSharedCfg = "shared-cfg" + testCfgRef = "cfg" +) + +func TestCollectFromTemplate(t *testing.T) { + ns := "my-project" + + cases := map[string]struct { + template computev1alpha.InstanceTemplateSpec + want ReferencedSet + }{ + "empty template": { + template: computev1alpha.InstanceTemplateSpec{}, + want: nil, + }, + "sandbox with no references": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Runtime.Sandbox = &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + {Name: "c1", Image: 
testContainerImage, Env: []corev1.EnvVar{{Name: "FOO", Value: "bar"}}}, + }, + } + }), + want: nil, + }, + "env.valueFrom.configMapKeyRef": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Runtime.Sandbox = &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: "c1", + Image: testContainerImage, + Env: []corev1.EnvVar{ + { + Name: "KEY", + ValueFrom: &corev1.EnvVarSource{ + ConfigMapKeyRef: &corev1.ConfigMapKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: testEnvConfigMap}, + Key: "key", + }, + }, + }, + }, + }, + }, + } + }), + want: ReferencedSet{ + {Kind: testKindConfigMap, Name: testEnvConfigMap, Namespace: ns}, + }, + }, + "env.valueFrom.secretKeyRef": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Runtime.Sandbox = &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: "c1", + Image: testContainerImage, + Env: []corev1.EnvVar{ + { + Name: "PASS", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: testNameDBCreds}, + Key: "password", + }, + }, + }, + }, + }, + }, + } + }), + want: ReferencedSet{ + {Kind: testKindSecret, Name: testNameDBCreds, Namespace: ns}, + }, + }, + "envFrom.configMapRef": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Runtime.Sandbox = &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: "c1", + Image: testContainerImage, + EnvFrom: []computev1alpha.EnvFromSource{{ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: "env-config"}}}, + }, + }, + } + }), + want: ReferencedSet{ + {Kind: testKindConfigMap, Name: "env-config", Namespace: ns}, + }, + }, + "envFrom.secretRef": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Runtime.Sandbox = &computev1alpha.SandboxRuntime{ + Containers: 
[]computev1alpha.SandboxContainer{ + { + Name: "c1", + Image: testContainerImage, + EnvFrom: []computev1alpha.EnvFromSource{{SecretRef: &computev1alpha.SecretEnvSource{Name: "env-secret"}}}, + }, + }, + } + }), + want: ReferencedSet{ + {Kind: testKindSecret, Name: "env-secret", Namespace: ns}, + }, + }, + "volume configMap": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Volumes = []computev1alpha.InstanceVolume{ + { + Name: "cfg-vol", + VolumeSource: computev1alpha.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: "vol-config"}, + }, + }, + }, + } + }), + want: ReferencedSet{ + {Kind: testKindConfigMap, Name: "vol-config", Namespace: ns}, + }, + }, + "volume secret": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Volumes = []computev1alpha.InstanceVolume{ + { + Name: "sec-vol", + VolumeSource: computev1alpha.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: "vol-secret"}, + }, + }, + } + }), + want: ReferencedSet{ + {Kind: testKindSecret, Name: "vol-secret", Namespace: ns}, + }, + }, + "deduplication across containers": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Runtime.Sandbox = &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: "c1", + Image: testContainerImage, + EnvFrom: []computev1alpha.EnvFromSource{{ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: testSharedCfg}}}, + }, + { + Name: "c2", + Image: testContainerImage, + EnvFrom: []computev1alpha.EnvFromSource{{ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: testSharedCfg}}}, + }, + }, + } + }), + want: ReferencedSet{ + {Kind: testKindConfigMap, Name: testSharedCfg, Namespace: ns}, + }, + }, + // When both configMapRef and secretRef are set on the same envFrom + // entry, validateEnvFrom rejects it, so the collector must skip it + // rather than collecting (and later 
SAR-ing) both refs. + "both refs set on envFrom entry — skipped": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Runtime.Sandbox = &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: "c1", + Image: testContainerImage, + EnvFrom: []computev1alpha.EnvFromSource{ + { + ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: testCfgRef}, + SecretRef: &computev1alpha.SecretEnvSource{Name: "sec"}, + }, + }, + }, + }, + } + }), + // No refs collected — the invalid entry is skipped entirely. + want: nil, + }, + "mixed sources sorted configmap-first then secret": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Runtime.Sandbox = &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: "c1", + Image: testContainerImage, + EnvFrom: []computev1alpha.EnvFromSource{ + {SecretRef: &computev1alpha.SecretEnvSource{Name: "z-secret"}}, + {ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: "a-config"}}, + }, + }, + }, + } + }), + // Sorted: ConfigMap < Secret lexicographically, then name ascending + want: ReferencedSet{ + {Kind: testKindConfigMap, Name: "a-config", Namespace: ns}, + {Kind: testKindSecret, Name: "z-secret", Namespace: ns}, + }, + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + got := CollectFromTemplate(ns, tc.template) + if len(got) != len(tc.want) { + t.Fatalf("CollectFromTemplate: len=%d want=%d; got=%v want=%v", len(got), len(tc.want), got, tc.want) + } + for i := range got { + if got[i] != tc.want[i] { + t.Errorf("index %d: got %+v, want %+v", i, got[i], tc.want[i]) + } + } + }) + } +} + +func TestTemplateReferencesData(t *testing.T) { + cases := map[string]struct { + template computev1alpha.InstanceTemplateSpec + want bool + }{ + "empty": { + template: computev1alpha.InstanceTemplateSpec{}, + want: false, + }, + "plain env only": { + template: makeTemplate(func(t 
*computev1alpha.InstanceTemplateSpec) { + t.Spec.Runtime.Sandbox = &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + {Name: "c", Image: testContainerImage, Env: []corev1.EnvVar{{Name: "X", Value: "y"}}}, + }, + } + }), + want: false, + }, + "has configmap ref": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Volumes = []computev1alpha.InstanceVolume{ + {Name: "v", VolumeSource: computev1alpha.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: testCfgRef}, + }, + }}, + } + }), + want: true, + }, + "has secret ref": { + template: makeTemplate(func(t *computev1alpha.InstanceTemplateSpec) { + t.Spec.Runtime.Sandbox = &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: "c", + Image: testContainerImage, + EnvFrom: []computev1alpha.EnvFromSource{ + {SecretRef: &computev1alpha.SecretEnvSource{Name: "s", Optional: ptr.To(true)}}, + }, + }, + }, + } + }), + want: true, + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + got := TemplateReferencesData(tc.template) + if got != tc.want { + t.Errorf("TemplateReferencesData = %v, want %v", got, tc.want) + } + }) + } +} + +// makeTemplate is a helper that creates a minimal InstanceTemplateSpec and +// applies the given mutations. 
+func makeTemplate(fn func(*computev1alpha.InstanceTemplateSpec)) computev1alpha.InstanceTemplateSpec { + t := computev1alpha.InstanceTemplateSpec{} + fn(&t) + return t +} diff --git a/internal/referenceddata/metrics.go b/internal/referenceddata/metrics.go new file mode 100644 index 00000000..aec2db66 --- /dev/null +++ b/internal/referenceddata/metrics.go @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package referenceddata + +import ( + "github.com/prometheus/client_golang/prometheus" + ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +// Metrics for the cell-side referenced-data gate clearing path. +// +// All metrics use the prefix "compute_referenced_data_" to group them. +const ( + metricsNamespace = "compute" + metricsSubsystem = "referenced_data" + metricsNsLabelName = "namespace" +) + +var ( + // CompanionsPresent tracks the number of companions present on the cell, + // aggregated per namespace. Aggregating per namespace (rather than per + // instance) avoids an unbounded cardinality growth as instances are created + // and deleted. + CompanionsPresent = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "companions_present", + Help: "Number of expected companion ConfigMaps/Secrets that are present " + + "on the cell, aggregated per namespace. Set at each reconcile while " + + "instances are waiting for companions.", + }, + []string{metricsNsLabelName}, + ) + + // CompanionsExpected tracks how many companions the cell expects, aggregated + // per namespace (from the expected-set annotation). Useful as the denominator + // when evaluating CompanionsPresent. 
+ CompanionsExpected = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "companions_expected", + Help: "Total number of companion ConfigMaps/Secrets expected on the cell, " + + "aggregated per namespace, as recorded in the expected-referenced-data annotation.", + }, + []string{metricsNsLabelName}, + ) + + // GateWaitDuration observes how long (in seconds) an Instance spent blocked + // by the ReferencedData scheduling gate. Observed when the gate is removed. + GateWaitDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "gate_wait_seconds", + Help: "Duration in seconds that an Instance waited with the ReferencedData " + + "scheduling gate before all companions became available.", + Buckets: prometheus.DefBuckets, + }, + []string{metricsNsLabelName}, + ) + + // ConditionTransitions counts transitions between ReferencedDataReady reason + // values on Instances. Labels carry the from/to reason so callers can build + // state-machine dashboards. 
+ ConditionTransitions = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "condition_transitions_total", + Help: "Total number of ReferencedDataReady condition reason transitions " + + "observed on Instances by the cell gate-clearing reconciler.", + }, + []string{metricsNsLabelName, "from_reason", "to_reason"}, + ) +) + +func init() { + ctrlmetrics.Registry.MustRegister( + CompanionsPresent, + CompanionsExpected, + GateWaitDuration, + ConditionTransitions, + ) +} diff --git a/internal/referenceddata/names.go b/internal/referenceddata/names.go new file mode 100644 index 00000000..404bb762 --- /dev/null +++ b/internal/referenceddata/names.go @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package referenceddata + +import ( + "fmt" + "hash/fnv" + "strings" + + "k8s.io/apimachinery/pkg/util/validation" +) + +const ( + // maxNameLength is the maximum length of a Kubernetes object name. + maxNameLength = 253 + + // hashSuffixLength is the number of hex characters appended when a name + // would otherwise exceed maxNameLength. + hashSuffixLength = 8 +) + +// CompanionName returns the deterministic companion object name for a given +// (kind, sourceName) pair. The companion is named after the SOURCE name only — +// no kind prefix — so that consumer references (volumes, env, envFrom) resolve +// naturally without any translation layer. +// +// Cross-kind collisions are safe: a ConfigMap companion and a Secret companion +// may both be named "app-config" because they are distinct Kubernetes objects of +// different resource types in the same namespace. +// +// If the source name exceeds maxNameLength (253 chars), the name is truncated +// and a deterministic 8-character FNV-1a hex suffix is appended to avoid +// collisions. The returned name always satisfies DNS subdomain constraints +// required by Kubernetes. 
+func CompanionName(_, sourceName string) string { + if len(sourceName) <= maxNameLength && isValidDNSSubdomain(sourceName) { + return sourceName + } + + // Truncate the source name so that truncated + "-" + hash fits within + // maxNameLength. Format: "-<8-char-hash>" + hashStr := shortHash(sourceName) + suffix := "-" + hashStr + maxSourceLen := maxNameLength - len(suffix) + if maxSourceLen < 1 { + maxSourceLen = 1 + } + + truncated := sourceName + if len(truncated) > maxSourceLen { + truncated = truncated[:maxSourceLen] + } + // Strip any trailing non-alphanumeric characters to keep the name clean. + truncated = strings.TrimRight(truncated, "-.") + + // If stripping trailing separators emptied the truncated segment (e.g. a + // source name composed entirely of '-' or '.'), fall back to just the hash. + if truncated == "" { + return hashStr + } + + return fmt.Sprintf("%s%s", truncated, suffix) +} + +// CompanionNameForRef is a convenience wrapper around CompanionName that +// accepts an ObjectRef. +func CompanionNameForRef(ref ObjectRef) string { + return CompanionName(ref.Kind, ref.Name) +} + +// CompanionToken returns the kind-qualified token "Kind/name" used in the +// expected-referenced-data annotation so that the cell can disambiguate +// companions by kind without probing both resource types. +func CompanionToken(kind, name string) string { + return kind + "/" + name +} + +// isValidDNSSubdomain returns true if s satisfies Kubernetes DNS subdomain +// naming rules. +func isValidDNSSubdomain(s string) bool { + return len(validation.IsDNS1123Subdomain(s)) == 0 +} + +// shortHash returns an 8-character hex string derived from FNV-1a of the input. +// Used as a collision-avoidance suffix when names are truncated. 
+func shortHash(s string) string { + h := fnv.New32a() + _, _ = h.Write([]byte(s)) + return fmt.Sprintf("%08x", h.Sum32()) +} diff --git a/internal/referenceddata/names_test.go b/internal/referenceddata/names_test.go new file mode 100644 index 00000000..4ecb10d8 --- /dev/null +++ b/internal/referenceddata/names_test.go @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package referenceddata + +import ( + "strings" + "testing" +) + +const ( + testKindConfigMap = "ConfigMap" + testKindSecret = "Secret" + testNameAppConfig = "app-config" + testNameDBCreds = "db-creds" + testNameCfg = "cfg" + testNameMySecret = "my-secret" +) + +func TestCompanionName(t *testing.T) { + cases := map[string]struct { + kind string + sourceName string + want string + }{ + "configmap simple": { + kind: testKindConfigMap, + sourceName: testNameAppConfig, + want: testNameAppConfig, + }, + "secret simple": { + kind: testKindSecret, + sourceName: testNameDBCreds, + want: testNameDBCreds, + }, + "kind already lower": { + kind: "configmap", + sourceName: testNameCfg, + want: testNameCfg, + }, + "secret upper": { + kind: "SECRET", + sourceName: testNameMySecret, + want: testNameMySecret, + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + got := CompanionName(tc.kind, tc.sourceName) + if got != tc.want { + t.Errorf("CompanionName(%q, %q) = %q, want %q", tc.kind, tc.sourceName, got, tc.want) + } + }) + } +} + +// TestCompanionName_SourceNameContract is the seam-crossing contract test. +// It asserts that CompanionName always returns the source name unchanged for +// names that fit in 253 chars and satisfy DNS subdomain rules — regardless of +// the kind argument. Old prefixed code would fail this contract. 
+func TestCompanionName_SourceNameContract(t *testing.T) { + cases := []struct { + kind string + name string + }{ + {testKindConfigMap, testNameAppConfig}, + {testKindSecret, testNameAppConfig}, // same source name, different kind + {testKindConfigMap, testNameDBCreds}, + {"configmap", "my-cm"}, + {"SECRET", testNameMySecret}, + } + + for _, tc := range cases { + got := CompanionName(tc.kind, tc.name) + if got != tc.name { + t.Errorf("CompanionName(%q, %q) = %q, want source name %q (contract violation)", + tc.kind, tc.name, got, tc.name) + } + } +} + +// TestCompanionName_SameSourceDifferentKind asserts that a ConfigMap and a +// Secret with the same source name produce the same companion name. Cross-kind +// collision is safe because ConfigMap and Secret are distinct object types in +// Kubernetes — they cannot conflict in the same namespace. +func TestCompanionName_SameSourceDifferentKind(t *testing.T) { + name := testNameAppConfig + cmCompanion := CompanionName(testKindConfigMap, name) + secretCompanion := CompanionName(testKindSecret, name) + + if cmCompanion != name { + t.Errorf("ConfigMap companion = %q, want %q", cmCompanion, name) + } + if secretCompanion != name { + t.Errorf("Secret companion = %q, want %q", secretCompanion, name) + } + if cmCompanion != secretCompanion { + t.Errorf("same source name must produce the same companion name regardless of kind: %q != %q", + cmCompanion, secretCompanion) + } +} + +func TestCompanionName_LongName(t *testing.T) { + // Build a name that would exceed 253 chars. + longName := strings.Repeat("a", 250) + result := CompanionName(testKindConfigMap, longName) + + if len(result) > maxNameLength { + t.Errorf("CompanionName with long source: len=%d exceeds maxNameLength=%d", len(result), maxNameLength) + } + + if !isValidDNSSubdomain(result) { + t.Errorf("CompanionName with long source produced invalid DNS subdomain: %q", result) + } + + // The result must be deterministic. 
+ result2 := CompanionName(testKindConfigMap, longName) + if result != result2 { + t.Errorf("CompanionName is not deterministic: %q != %q", result, result2) + } +} + +func TestCompanionName_AllDashesSource(t *testing.T) { + // A source name composed entirely of '-' characters exceeds maxNameLength + // when long. After TrimRight, truncated becomes "". The function must + // produce a valid DNS subdomain (just the hash). + longDashes := strings.Repeat("-", 250) + result := CompanionName(testKindConfigMap, longDashes) + + if len(result) > maxNameLength { + t.Errorf("len=%d exceeds maxNameLength=%d", len(result), maxNameLength) + } + if !isValidDNSSubdomain(result) { + t.Errorf("produced invalid DNS subdomain: %q", result) + } + // Must not contain a segment starting with '-'. + for _, seg := range strings.Split(result, ".") { + if strings.HasPrefix(seg, "-") { + t.Errorf("segment starts with '-': %q in %q", seg, result) + } + } +} + +func TestCompanionName_AllDotsSource(t *testing.T) { + // A source name composed entirely of '.' characters has the same edge: + // TrimRight wipes it out, producing just the hash. + longDots := strings.Repeat(".", 250) + result := CompanionName(testKindConfigMap, longDots) + + if len(result) > maxNameLength { + t.Errorf("len=%d exceeds maxNameLength=%d", len(result), maxNameLength) + } + if !isValidDNSSubdomain(result) { + t.Errorf("produced invalid DNS subdomain: %q", result) + } +} + +func TestCompanionName_NameEndingOnDot(t *testing.T) { + // A source name whose truncation point lands exactly on a '.'. The + // trailing '.' is stripped and the result must still be a valid subdomain. + // + // maxNameLength=253; suffix="-HHHHHHHH" (9). + // maxSourceLen = 253 - 9 (suffix) = 244. + // Build a name that is exactly 244 chars and ends with '.'. + base := strings.Repeat("a", 243) + "." 
+ result := CompanionName("configmap", base) + + if len(result) > maxNameLength { + t.Errorf("len=%d exceeds maxNameLength=%d", len(result), maxNameLength) + } + if !isValidDNSSubdomain(result) { + t.Errorf("produced invalid DNS subdomain: %q", result) + } +} + +func TestCompanionName_ValidShortName(t *testing.T) { + // Positive case: a simple name that fits within maxNameLength without + // truncation should be returned unchanged. + result := CompanionName(testKindSecret, testNameMySecret) + want := testNameMySecret + if result != want { + t.Errorf("CompanionName = %q, want %q", result, want) + } + if !isValidDNSSubdomain(result) { + t.Errorf("produced invalid DNS subdomain: %q", result) + } +} + +func TestCompanionName_Deterministic(t *testing.T) { + // Same inputs always produce the same output. + for i := 0; i < 100; i++ { + a := CompanionName(testKindSecret, testNameMySecret) + b := CompanionName(testKindSecret, testNameMySecret) + if a != b { + t.Fatalf("non-deterministic: %q != %q", a, b) + } + } +} + +func TestCompanionNameForRef(t *testing.T) { + ref := ObjectRef{Kind: testKindConfigMap, Name: testNameAppConfig, Namespace: "default"} + got := CompanionNameForRef(ref) + want := testNameAppConfig + if got != want { + t.Errorf("CompanionNameForRef = %q, want %q", got, want) + } +} + +func TestShortHash_Deterministic(t *testing.T) { + h1 := shortHash("test-value") + h2 := shortHash("test-value") + if h1 != h2 { + t.Errorf("shortHash is non-deterministic: %q != %q", h1, h2) + } + if len(h1) != hashSuffixLength { + t.Errorf("shortHash len=%d, want %d", len(h1), hashSuffixLength) + } +} + +func TestCompanionToken(t *testing.T) { + cases := []struct { + kind string + name string + want string + }{ + {testKindConfigMap, testNameAppConfig, testKindConfigMap + "/" + testNameAppConfig}, + {testKindSecret, testNameDBCreds, testKindSecret + "/" + testNameDBCreds}, + {testKindSecret, testNameAppConfig, testKindSecret + "/" + testNameAppConfig}, + } + for _, tc := range 
cases { + got := CompanionToken(tc.kind, tc.name) + if got != tc.want { + t.Errorf("CompanionToken(%q, %q) = %q, want %q", tc.kind, tc.name, got, tc.want) + } + } +} diff --git a/internal/referenceddata/project_reader.go b/internal/referenceddata/project_reader.go new file mode 100644 index 00000000..c3f7b8c1 --- /dev/null +++ b/internal/referenceddata/project_reader.go @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package referenceddata + +import ( + "context" + "fmt" + "net/url" + "sync" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// projectControlPlanePath returns the API path for a project's control plane. +// This mirrors the rewrite performed by the Milo multicluster provider: +// https://github.com/datum-cloud/milo/blob/main/pkg/multicluster-runtime/milo/provider.go +func projectControlPlanePath(projectID string) string { + return fmt.Sprintf("/apis/resourcemanager.miloapis.com/v1alpha1/projects/%s/control-plane", projectID) +} + +// ProjectReader implements ProjectConfigSecretReader by rewriting a base +// management-plane *rest.Config to the per-project control-plane path and +// caching one controller-runtime client per project. +// +// This uses the same host-rewriting technique as the Milo multicluster +// provider, but WITHOUT the quota sub-path — the quota client cannot read core +// Kubernetes objects (ConfigMaps/Secrets). +type ProjectReader struct { + // baseConfig is the management identity REST config. It is never mutated; + // per-project copies are created via rest.CopyConfig. + baseConfig *rest.Config + + mu sync.Mutex + clients map[string]client.Client +} + +// NewProjectReader creates a ProjectReader that will rewrite baseConfig to each +// project's control plane. 
+func NewProjectReader(baseConfig *rest.Config) *ProjectReader { + return &ProjectReader{ + baseConfig: baseConfig, + clients: make(map[string]client.Client), + } +} + +// clientFor returns (creating if necessary) a cached controller-runtime client +// pointed at the given project's control plane. +func (r *ProjectReader) clientFor(projectID string) (client.Client, error) { + r.mu.Lock() + defer r.mu.Unlock() + + if cl, ok := r.clients[projectID]; ok { + return cl, nil + } + + cfg := rest.CopyConfig(r.baseConfig) + apiHost, err := url.Parse(cfg.Host) + if err != nil { + return nil, fmt.Errorf("referenceddata: failed to parse base config host: %w", err) + } + apiHost.Path = projectControlPlanePath(projectID) + cfg.Host = apiHost.String() + + cl, err := client.New(cfg, client.Options{}) + if err != nil { + return nil, fmt.Errorf("referenceddata: failed to create project client for %q: %w", projectID, err) + } + + r.clients[projectID] = cl + return cl, nil +} + +// GetConfigMap implements ProjectConfigSecretReader. +func (r *ProjectReader) GetConfigMap(ctx context.Context, projectID, namespace, name string) (*corev1.ConfigMap, error) { + cl, err := r.clientFor(projectID) + if err != nil { + return nil, err + } + + var cm corev1.ConfigMap + if err := cl.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &cm); err != nil { + return nil, classifyError(err, "ConfigMap", namespace, name) + } + return &cm, nil +} + +// GetSecret implements ProjectConfigSecretReader. 
+func (r *ProjectReader) GetSecret(ctx context.Context, projectID, namespace, name string) (*corev1.Secret, error) { + cl, err := r.clientFor(projectID) + if err != nil { + return nil, err + } + + var secret corev1.Secret + if err := cl.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &secret); err != nil { + return nil, classifyError(err, "Secret", namespace, name) + } + return &secret, nil +} + +// classifyError maps Kubernetes API errors to the sentinel errors defined in +// this package. All other errors are returned unchanged. +func classifyError(err error, kind, namespace, name string) error { + if apierrors.IsNotFound(err) { + return fmt.Errorf("%w: %s %s/%s", ErrSourceNotFound, kind, namespace, name) + } + if apierrors.IsForbidden(err) || apierrors.IsUnauthorized(err) { + return fmt.Errorf("%w: %s %s/%s", ErrSourceUnauthorized, kind, namespace, name) + } + return err +} + +// LocalReader implements ProjectConfigSecretReader backed by a single +// controller-runtime client. Intended for single-cluster / dev environments +// where the management and project planes are the same cluster, or for +// testing without a Milo control plane. +type LocalReader struct { + client client.Client +} + +// NewLocalReader creates a LocalReader that reads from the given client. +func NewLocalReader(cl client.Client) *LocalReader { + return &LocalReader{client: cl} +} + +// GetConfigMap implements ProjectConfigSecretReader. +// The projectID parameter is ignored; objects are read from the local client. +func (r *LocalReader) GetConfigMap(ctx context.Context, _ string, namespace, name string) (*corev1.ConfigMap, error) { + var cm corev1.ConfigMap + if err := r.client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &cm); err != nil { + return nil, classifyError(err, "ConfigMap", namespace, name) + } + return &cm, nil +} + +// GetSecret implements ProjectConfigSecretReader. 
+// The projectID parameter is ignored; objects are read from the local client. +func (r *LocalReader) GetSecret(ctx context.Context, _ string, namespace, name string) (*corev1.Secret, error) { + var secret corev1.Secret + if err := r.client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &secret); err != nil { + return nil, classifyError(err, "Secret", namespace, name) + } + return &secret, nil +} diff --git a/internal/referenceddata/project_reader_test.go b/internal/referenceddata/project_reader_test.go new file mode 100644 index 00000000..cfc6e79a --- /dev/null +++ b/internal/referenceddata/project_reader_test.go @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package referenceddata + +import ( + "context" + "errors" + "testing" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" +) + +// fakeScheme is a minimal scheme with corev1 for LocalReader tests. 
+var fakeScheme = func() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + return s +}() + +func TestLocalReader_GetConfigMap_Found(t *testing.T) { + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNameAppConfig, + Namespace: "default", + }, + Data: map[string]string{"key": "value"}, + } + cl := fake.NewClientBuilder().WithScheme(fakeScheme).WithObjects(cm).Build() + r := NewLocalReader(cl) + + got, err := r.GetConfigMap(context.Background(), "ignored-project", "default", testNameAppConfig) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got.Name != testNameAppConfig { + t.Errorf("got name %q, want %q", got.Name, testNameAppConfig) + } +} + +func TestLocalReader_GetConfigMap_NotFound(t *testing.T) { + cl := fake.NewClientBuilder().WithScheme(fakeScheme).Build() + r := NewLocalReader(cl) + + _, err := r.GetConfigMap(context.Background(), "ignored-project", "default", "missing") + if err == nil { + t.Fatal("expected error, got nil") + } + if !errors.Is(err, ErrSourceNotFound) { + t.Errorf("expected ErrSourceNotFound, got: %v", err) + } +} + +func TestLocalReader_GetSecret_Found(t *testing.T) { + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNameDBCreds, + Namespace: "prod", + }, + Data: map[string][]byte{"password": []byte("secret!")}, + } + cl := fake.NewClientBuilder().WithScheme(fakeScheme).WithObjects(secret).Build() + r := NewLocalReader(cl) + + got, err := r.GetSecret(context.Background(), "ignored", "prod", testNameDBCreds) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got.Name != testNameDBCreds { + t.Errorf("got name %q, want %q", got.Name, testNameDBCreds) + } +} + +func TestLocalReader_GetSecret_NotFound(t *testing.T) { + cl := fake.NewClientBuilder().WithScheme(fakeScheme).Build() + r := NewLocalReader(cl) + + _, err := r.GetSecret(context.Background(), "ignored", "default", "missing-secret") + if err == nil { + t.Fatal("expected error, got 
nil") + } + if !errors.Is(err, ErrSourceNotFound) { + t.Errorf("expected ErrSourceNotFound, got: %v", err) + } +} + +func TestClassifyError_NotFound(t *testing.T) { + notFound := apierrors.NewNotFound(schema.GroupResource{Resource: "configmaps"}, "foo") + err := classifyError(notFound, "ConfigMap", "ns", "foo") + if !errors.Is(err, ErrSourceNotFound) { + t.Errorf("expected ErrSourceNotFound, got: %v", err) + } +} + +func TestClassifyError_Forbidden(t *testing.T) { + forbidden := apierrors.NewForbidden(schema.GroupResource{Resource: "secrets"}, "bar", errors.New("denied")) + err := classifyError(forbidden, "Secret", "ns", "bar") + if !errors.Is(err, ErrSourceUnauthorized) { + t.Errorf("expected ErrSourceUnauthorized, got: %v", err) + } +} + +func TestClassifyError_Unauthorized(t *testing.T) { + unauthorized := apierrors.NewUnauthorized("no token") + err := classifyError(unauthorized, "Secret", "ns", "bar") + if !errors.Is(err, ErrSourceUnauthorized) { + t.Errorf("expected ErrSourceUnauthorized, got: %v", err) + } +} + +func TestClassifyError_Other(t *testing.T) { + other := errors.New("something else broke") + err := classifyError(other, "ConfigMap", "ns", "foo") + if errors.Is(err, ErrSourceNotFound) || errors.Is(err, ErrSourceUnauthorized) { + t.Errorf("unexpected sentinel error classification for %v", err) + } + if !errors.Is(err, other) { + t.Errorf("expected original error to be preserved, got %v", err) + } +} + +func TestLocalReader_GetConfigMap_Forbidden(t *testing.T) { + gr := schema.GroupResource{Resource: "configmaps"} + cl := fake.NewClientBuilder(). + WithScheme(fakeScheme). + WithInterceptorFuncs(interceptor.Funcs{ + Get: func(ctx context.Context, c client.WithWatch, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { + return apierrors.NewForbidden(gr, key.Name, errors.New("access denied")) + }, + }). 
+ Build() + r := NewLocalReader(cl) + + _, err := r.GetConfigMap(context.Background(), "ignored", "default", "secret-cfg") + if err == nil { + t.Fatal("expected error, got nil") + } + if !errors.Is(err, ErrSourceUnauthorized) { + t.Errorf("expected ErrSourceUnauthorized, got: %v", err) + } +} diff --git a/internal/referenceddata/types.go b/internal/referenceddata/types.go new file mode 100644 index 00000000..d64d5626 --- /dev/null +++ b/internal/referenceddata/types.go @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +// Package referenceddata is the capability seam for resolving and delivering +// ConfigMaps and Secrets referenced by workload templates. It is designed to +// be promotable into a shared platform library. +// +// Phase 0 provides: types, the reference collector, companion naming, and the +// ProjectConfigSecretReader interface + implementation. The resolver controller +// (Phase 1) and cell gate-clearing (Phase 2) build on top of this foundation. +package referenceddata + +import ( + "context" + "errors" + + corev1 "k8s.io/api/core/v1" +) + +// ErrSourceNotFound indicates that a referenced ConfigMap or Secret could not +// be found in the project namespace (HTTP 404 from the project control plane). +var ErrSourceNotFound = errors.New("referenced source not found") + +// ErrSourceUnauthorized indicates that the management identity does not have +// permission to read the referenced object (HTTP 401 or 403 from the project +// control plane). +var ErrSourceUnauthorized = errors.New("not authorized to read referenced source") + +// ObjectRef identifies a ConfigMap or Secret by kind and name within a +// namespace. References are always same-namespace (the Workload namespace). +type ObjectRef struct { + // Kind is either "ConfigMap" or "Secret". + Kind string + + // Name is the name of the ConfigMap or Secret. + Name string + + // Namespace is the namespace containing the object. 
+ Namespace string +} + +// ReferencedSet is a deduplicated, deterministically-ordered set of ObjectRefs +// collected from a workload template. +type ReferencedSet []ObjectRef + +// ProjectConfigSecretReader reads ConfigMaps and Secrets from a project's +// control plane. Implementations must classify errors as ErrSourceNotFound or +// ErrSourceUnauthorized where appropriate. +// +// The interface is intentionally narrow — it does not expose list, update, or +// delete — so that the implementation can be tightly scoped to the read-only +// path the management identity requires. +type ProjectConfigSecretReader interface { + // GetConfigMap returns the named ConfigMap from the given project namespace. + // Returns ErrSourceNotFound if the object does not exist, or + // ErrSourceUnauthorized if the caller lacks permission. + GetConfigMap(ctx context.Context, projectID, namespace, name string) (*corev1.ConfigMap, error) + + // GetSecret returns the named Secret from the given project namespace. + // Returns ErrSourceNotFound if the object does not exist, or + // ErrSourceUnauthorized if the caller lacks permission. + GetSecret(ctx context.Context, projectID, namespace, name string) (*corev1.Secret, error) +} diff --git a/internal/validation/instance_validation.go b/internal/validation/instance_validation.go index 59a57585..54cf98db 100644 --- a/internal/validation/instance_validation.go +++ b/internal/validation/instance_validation.go @@ -2,6 +2,7 @@ package validation import ( "fmt" + "path" "strings" "golang.org/x/crypto/ssh" @@ -14,6 +15,7 @@ import ( "k8s.io/apimachinery/pkg/util/validation/field" computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/referenceddata" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) @@ -84,6 +86,75 @@ func validateInstanceSpec( allErrs = append(allErrs, validateInstanceRuntimeSpec(spec.Runtime, volumes, fieldPath.Child("runtime"))...) 
allErrs = append(allErrs, validateInstanceNetworkInterfaces(spec.NetworkInterfaces, fieldPath.Child("networkInterfaces"), opts)...) + allErrs = append(allErrs, validateReferencedDataAccess(spec, fieldPath, opts)...) + + return allErrs +} + +// validateReferencedDataAccess issues a SubjectAccessReview for each unique +// ConfigMap and Secret referenced by the spec (volumes, env valueFrom, envFrom). +// The check mirrors the existing Network interface SAR pattern exactly — +// build from AdmissionRequest.UserInfo, deny if not allowed. +// +// References are collected and deduplicated by referenceddata.CollectFromSpec, +// which is the single source of truth for "what counts as a reference". Entries +// with both configMapRef and secretRef set are skipped (validateEnvFrom rejects +// them separately, so no SAR is needed for invalid entries). +func validateReferencedDataAccess( + spec computev1alpha.InstanceSpec, + fieldPath *field.Path, + opts WorkloadValidationOptions, +) field.ErrorList { + allErrs := field.ErrorList{} + + refs := referenceddata.CollectFromSpec(opts.Workload.Namespace, spec) + if len(refs) == 0 { + return nil + } + + extra := make(map[string]authorizationv1.ExtraValue, len(opts.AdmissionRequest.UserInfo.Extra)) + for k, v := range opts.AdmissionRequest.UserInfo.Extra { + extra[k] = authorizationv1.ExtraValue(v) + } + + for _, ref := range refs { + var resourceName string + switch ref.Kind { + case "ConfigMap": + resourceName = "configmaps" + case "Secret": + resourceName = "secrets" + default: + continue + } + + review := authorizationv1.SubjectAccessReview{ + Spec: authorizationv1.SubjectAccessReviewSpec{ + ResourceAttributes: &authorizationv1.ResourceAttributes{ + Verb: "get", + Group: "", + Version: "v1", + Resource: resourceName, + Name: ref.Name, + Namespace: opts.Workload.Namespace, + }, + User: opts.AdmissionRequest.UserInfo.Username, + Groups: opts.AdmissionRequest.UserInfo.Groups, + UID: opts.AdmissionRequest.UserInfo.UID, + Extra: extra, + 
}, + } + + refPath := fieldPath.Child(resourceName).Key(ref.Name) + if err := opts.Client.Create(opts.Context, &review); err != nil { + allErrs = append(allErrs, field.InternalError(refPath, + fmt.Errorf("failed creating SubjectAccessReview for %s %s/%s access: %w", + ref.Kind, opts.Workload.Namespace, ref.Name, err))) + } else if !review.Status.Allowed { + allErrs = append(allErrs, field.Forbidden(refPath, + fmt.Sprintf("permission to get %s %q was denied", ref.Kind, ref.Name))) + } + } return allErrs } @@ -246,7 +317,7 @@ func validateVolumeSource(source computev1alpha.VolumeSource, fieldPath *field.P allErrs = append(allErrs, field.Forbidden(secretField, "may not specify more than 1 volume source")) } else { numSources++ - // TODO(jreese) validate secret volume source + allErrs = append(allErrs, validateSecretVolumeSource(source.Secret, secretField)...) } } @@ -335,15 +406,69 @@ func validateConfigMapVolumeSource(configMapSource *corev1.ConfigMapVolumeSource allErrs = append(allErrs, field.Invalid(fldPath.Child("defaultMode"), *configMapMode, fileModeErrorMsg)) } - itemsPath := fldPath.Child("items") - if len(configMapSource.Items) > 0 { - allErrs = append(allErrs, field.Forbidden(itemsPath, "not implemented")) + for i, kp := range configMapSource.Items { + itemPath := fldPath.Child("items").Index(i) + allErrs = append(allErrs, validateKeyToPath(&kp, itemPath)...) } - // TODO(jreese) implement validation here - // for i, kp := range configMapSource.Items { - // itemPath := itemsPath.Index(i) - // allErrs = append(allErrs, validateKeyToPath(&kp, itemPath)...) 
- // } + return allErrs +} + +func validateSecretVolumeSource(secretSource *corev1.SecretVolumeSource, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + if len(secretSource.SecretName) == 0 { + allErrs = append(allErrs, field.Required(fldPath.Child("secretName"), "")) + } + + secretMode := secretSource.DefaultMode + if secretMode != nil && (*secretMode > 0777 || *secretMode < 0) { + allErrs = append(allErrs, field.Invalid(fldPath.Child("defaultMode"), *secretMode, fileModeErrorMsg)) + } + + for i, kp := range secretSource.Items { + itemPath := fldPath.Child("items").Index(i) + allErrs = append(allErrs, validateKeyToPath(&kp, itemPath)...) + } + return allErrs +} + +// validateKeyToPath validates a corev1.KeyToPath projection entry. +func validateKeyToPath(kp *corev1.KeyToPath, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + + if len(kp.Key) == 0 { + allErrs = append(allErrs, field.Required(fldPath.Child("key"), "")) + } + + if len(kp.Path) == 0 { + allErrs = append(allErrs, field.Required(fldPath.Child("path"), "")) + } else { + allErrs = append(allErrs, validateVolumeProjectionPath(kp.Path, fldPath.Child("path"))...) + } + + if kp.Mode != nil && (*kp.Mode > 0777 || *kp.Mode < 0) { + allErrs = append(allErrs, field.Invalid(fldPath.Child("mode"), *kp.Mode, fileModeErrorMsg)) + } + + return allErrs +} + +// validateVolumeProjectionPath validates that a volume key→path target path is +// safe: it must be relative, must not be absolute, and must not contain ".." +// path elements that would escape the volume mount directory. +func validateVolumeProjectionPath(p string, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + + if path.IsAbs(p) { + allErrs = append(allErrs, field.Invalid(fldPath, p, "must be a relative path")) + return allErrs + } + + // Clean the path and check for ".." escape. 
+ cleaned := path.Clean(p) + if strings.HasPrefix(cleaned, "..") { + allErrs = append(allErrs, field.Invalid(fldPath, p, "must not contain '..' path elements")) + } + return allErrs } @@ -550,12 +675,69 @@ func validateContainerCommon( allErrs = append(allErrs, validateVolumeAttachments(container.VolumeAttachments, volumes, fieldPath.Child("volumeAttachments"))...) + allErrs = append(allErrs, validateEnvFrom(container.EnvFrom, fieldPath.Child("envFrom"))...) + // TODO(jreese) validate named ports are unique across all containers? allErrs = append(allErrs, validateNamedPorts(container.Ports, fieldPath.Child("ports"))...) return allErrs } +// validateEnvFrom validates the envFrom field on a SandboxContainer. +// Each entry must reference exactly one of configMapRef or secretRef. The +// optional prefix must be a valid C_IDENTIFIER. Source names must satisfy +// DNS label constraints. +func validateEnvFrom(envFrom []computev1alpha.EnvFromSource, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + + for i, ef := range envFrom { + indexPath := fldPath.Index(i) + + // Validate prefix — must be empty or a valid C_IDENTIFIER. 
+ if ef.Prefix != "" { + if errs := apimachineryutilvalidation.IsCIdentifier(ef.Prefix); len(errs) > 0 { + allErrs = append(allErrs, field.Invalid(indexPath.Child("prefix"), ef.Prefix, strings.Join(errs, "; "))) + } + } + + numSources := 0 + + if ef.ConfigMapRef != nil { + numSources++ + refPath := indexPath.Child("configMapRef") + if len(ef.ConfigMapRef.Name) == 0 { + allErrs = append(allErrs, field.Required(refPath.Child("name"), "")) + } else { + for _, msg := range apimachineryvalidation.NameIsDNSLabel(ef.ConfigMapRef.Name, false) { + allErrs = append(allErrs, field.Invalid(refPath.Child("name"), ef.ConfigMapRef.Name, msg)) + } + } + } + + if ef.SecretRef != nil { + numSources++ + refPath := indexPath.Child("secretRef") + if numSources > 1 { + allErrs = append(allErrs, field.Forbidden(refPath, "may not specify more than 1 source per envFrom entry")) + } else { + if len(ef.SecretRef.Name) == 0 { + allErrs = append(allErrs, field.Required(refPath.Child("name"), "")) + } else { + for _, msg := range apimachineryvalidation.NameIsDNSLabel(ef.SecretRef.Name, false) { + allErrs = append(allErrs, field.Invalid(refPath.Child("name"), ef.SecretRef.Name, msg)) + } + } + } + } + + if numSources == 0 { + allErrs = append(allErrs, field.Required(indexPath, "must specify exactly one of configMapRef or secretRef")) + } + } + + return allErrs +} + func validateVirtualMachineRuntime(vm *computev1alpha.VirtualMachineRuntime, volumes map[string]computev1alpha.VolumeSource, fieldPath *field.Path) field.ErrorList { allErrs := field.ErrorList{} diff --git a/internal/validation/instance_validation_test.go b/internal/validation/instance_validation_test.go new file mode 100644 index 00000000..cf56044b --- /dev/null +++ b/internal/validation/instance_validation_test.go @@ -0,0 +1,703 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package validation + +import ( + "context" + "fmt" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + authorizationv1 
"k8s.io/api/authorization/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sruntime "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/validation/field" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + + computev1alpha "go.datum.net/compute/api/v1alpha" + networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" +) + +// cmpErrs compares two field.ErrorLists, ignoring BadValue and Detail fields, +// and treating nil and empty slices as equal. +func cmpErrs(t *testing.T, want, got field.ErrorList) { + t.Helper() + delta := cmp.Diff( + want, got, + cmpopts.IgnoreFields(field.Error{}, "BadValue", "Detail"), + cmpopts.EquateEmpty(), + ) + if delta != "" { + t.Errorf("errors mismatch (-want +got):\n%s", delta) + } +} + +// TestValidateSecretVolumeSource tests the new secret volume validation. +func TestValidateSecretVolumeSource(t *testing.T) { + // Use a named root path so child field names are predictable. 
+ root := field.NewPath("secret") + + cases := map[string]struct { + source *corev1.SecretVolumeSource + expectedErrors field.ErrorList + }{ + "valid minimal": { + source: &corev1.SecretVolumeSource{SecretName: "my-secret"}, + }, + "missing secretName": { + source: &corev1.SecretVolumeSource{}, + expectedErrors: field.ErrorList{ + field.Required(root.Child("secretName"), ""), + }, + }, + "valid defaultMode 0644": { + source: &corev1.SecretVolumeSource{SecretName: "s", DefaultMode: ptr.To(int32(0644))}, + }, + "defaultMode too large": { + // 512 decimal == 0o1000 > 0o777, so invalid + source: &corev1.SecretVolumeSource{SecretName: "s", DefaultMode: ptr.To(int32(512))}, + expectedErrors: field.ErrorList{ + field.Invalid(root.Child("defaultMode"), int32(512), fileModeErrorMsg), + }, + }, + "defaultMode negative": { + source: &corev1.SecretVolumeSource{SecretName: "s", DefaultMode: ptr.To(int32(-1))}, + expectedErrors: field.ErrorList{ + field.Invalid(root.Child("defaultMode"), int32(-1), fileModeErrorMsg), + }, + }, + "valid items": { + source: &corev1.SecretVolumeSource{ + SecretName: "s", + Items: []corev1.KeyToPath{ + {Key: "password", Path: "config/pass"}, + }, + }, + }, + "items missing key": { + source: &corev1.SecretVolumeSource{ + SecretName: "s", + Items: []corev1.KeyToPath{ + {Path: "config/pass"}, + }, + }, + expectedErrors: field.ErrorList{ + field.Required(root.Child("items").Index(0).Child("key"), ""), + }, + }, + "items missing path": { + source: &corev1.SecretVolumeSource{ + SecretName: "s", + Items: []corev1.KeyToPath{ + {Key: "password"}, + }, + }, + expectedErrors: field.ErrorList{ + field.Required(root.Child("items").Index(0).Child("path"), ""), + }, + }, + "items invalid item mode": { + source: &corev1.SecretVolumeSource{ + SecretName: "s", + Items: []corev1.KeyToPath{ + // 513 > 511 (0777), invalid + {Key: "k", Path: "p", Mode: ptr.To(int32(513))}, + }, + }, + expectedErrors: field.ErrorList{ + 
field.Invalid(root.Child("items").Index(0).Child("mode"), int32(513), fileModeErrorMsg), + }, + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + errs := validateSecretVolumeSource(tc.source, root) + cmpErrs(t, tc.expectedErrors, errs) + }) + } +} + +// TestValidateConfigMapItems tests that ConfigMap volume items are now validated +// (previously they were forbidden). +func TestValidateConfigMapItems(t *testing.T) { + root := field.NewPath("configMap") + + cases := map[string]struct { + source *corev1.ConfigMapVolumeSource + expectedErrors field.ErrorList + }{ + "valid with items": { + source: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: testCfgName}, + Items: []corev1.KeyToPath{ + {Key: "app.conf", Path: "etc/app.conf"}, + }, + }, + }, + "items absolute path": { + source: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: testCfgName}, + Items: []corev1.KeyToPath{ + {Key: "k", Path: "/absolute/path"}, + }, + }, + expectedErrors: field.ErrorList{ + field.Invalid(root.Child("items").Index(0).Child("path"), "/absolute/path", "must be a relative path"), + }, + }, + "items dotdot path escape": { + source: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: testCfgName}, + Items: []corev1.KeyToPath{ + {Key: "k", Path: "../escape"}, + }, + }, + expectedErrors: field.ErrorList{ + field.Invalid(root.Child("items").Index(0).Child("path"), "../escape", "must not contain '..' path elements"), + }, + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + errs := validateConfigMapVolumeSource(tc.source, root) + cmpErrs(t, tc.expectedErrors, errs) + }) + } +} + +// TestValidateKeyToPath tests the key→path projection validator directly. 
+func TestValidateKeyToPath(t *testing.T) { + root := field.NewPath("kp") + + cases := map[string]struct { + kp corev1.KeyToPath + expectedErrors field.ErrorList + }{ + "valid": { + kp: corev1.KeyToPath{Key: "app.conf", Path: "config/app.conf"}, + }, + "valid with mode": { + kp: corev1.KeyToPath{Key: "k", Path: "p", Mode: ptr.To(int32(0400))}, + }, + "missing key": { + kp: corev1.KeyToPath{Path: "p"}, + expectedErrors: field.ErrorList{ + field.Required(root.Child("key"), ""), + }, + }, + "missing path": { + kp: corev1.KeyToPath{Key: "k"}, + expectedErrors: field.ErrorList{ + field.Required(root.Child("path"), ""), + }, + }, + "absolute path": { + kp: corev1.KeyToPath{Key: "k", Path: "/etc/hosts"}, + expectedErrors: field.ErrorList{ + field.Invalid(root.Child("path"), "/etc/hosts", "must be a relative path"), + }, + }, + "dotdot escape": { + kp: corev1.KeyToPath{Key: "k", Path: "../../etc/passwd"}, + expectedErrors: field.ErrorList{ + field.Invalid(root.Child("path"), "../../etc/passwd", "must not contain '..' path elements"), + }, + }, + "invalid mode": { + // 512 > 511 (0777) + kp: corev1.KeyToPath{Key: "k", Path: "p", Mode: ptr.To(int32(512))}, + expectedErrors: field.ErrorList{ + field.Invalid(root.Child("mode"), int32(512), fileModeErrorMsg), + }, + }, + "mode zero ok": { + kp: corev1.KeyToPath{Key: "k", Path: "p", Mode: ptr.To(int32(0))}, + }, + "mode 0777 ok": { + kp: corev1.KeyToPath{Key: "k", Path: "p", Mode: ptr.To(int32(0777))}, + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + errs := validateKeyToPath(&tc.kp, root) + cmpErrs(t, tc.expectedErrors, errs) + }) + } +} + +// TestValidateEnvFrom tests the EnvFrom field validation. 
func TestValidateEnvFrom(t *testing.T) {
	// Every expected error path below is rooted at "envFrom" and indexed by
	// the position of the offending entry.
	root := field.NewPath("envFrom")

	// Each case pairs an envFrom list with the field errors handed to cmpErrs
	// as the expectation; cases that omit expectedErrors leave it nil.
	// NOTE(review): the expected errors here carry an empty detail string —
	// presumably cmpErrs does not compare details; confirm against cmpErrs.
	cases := map[string]struct {
		envFrom        []computev1alpha.EnvFromSource
		expectedErrors field.ErrorList
	}{
		"empty list ok": {
			envFrom: nil,
		},
		"valid configMapRef": {
			envFrom: []computev1alpha.EnvFromSource{
				{ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: "my-cfg"}},
			},
		},
		"valid secretRef": {
			envFrom: []computev1alpha.EnvFromSource{
				{SecretRef: &computev1alpha.SecretEnvSource{Name: "my-secret"}},
			},
		},
		"valid with prefix": {
			envFrom: []computev1alpha.EnvFromSource{
				{Prefix: "APP_", ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: testCfgName}},
			},
		},
		"invalid prefix not C_IDENTIFIER": {
			envFrom: []computev1alpha.EnvFromSource{
				{Prefix: "123BAD", ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: testCfgName}},
			},
			expectedErrors: field.ErrorList{
				field.Invalid(root.Index(0).Child("prefix"), "123BAD", ""),
			},
		},
		"no source specified": {
			envFrom: []computev1alpha.EnvFromSource{
				{Prefix: "OK_"},
			},
			// The error is expected on the entry itself, not on a child field.
			expectedErrors: field.ErrorList{
				field.Required(root.Index(0), ""),
			},
		},
		"both sources specified": {
			envFrom: []computev1alpha.EnvFromSource{
				{
					ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: testCfgName},
					SecretRef:    &computev1alpha.SecretEnvSource{Name: "sec"},
				},
			},
			// The expectation places the Forbidden error on secretRef.
			expectedErrors: field.ErrorList{
				field.Forbidden(root.Index(0).Child("secretRef"), ""),
			},
		},
		"configMapRef missing name": {
			envFrom: []computev1alpha.EnvFromSource{
				{ConfigMapRef: &computev1alpha.ConfigMapEnvSource{}},
			},
			expectedErrors: field.ErrorList{
				field.Required(root.Index(0).Child("configMapRef").Child("name"), ""),
			},
		},
		"secretRef missing name": {
			envFrom: []computev1alpha.EnvFromSource{
				{SecretRef: &computev1alpha.SecretEnvSource{}},
			},
			expectedErrors: field.ErrorList{
				field.Required(root.Index(0).Child("secretRef").Child("name"), ""),
			},
		},
		"invalid dns label in configMapRef name": {
			envFrom: []computev1alpha.EnvFromSource{
				{ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: "INVALID_NAME"}},
			},
			expectedErrors: field.ErrorList{
				field.Invalid(root.Index(0).Child("configMapRef").Child("name"), "INVALID_NAME", ""),
			},
		},
		"optional secret": {
			envFrom: []computev1alpha.EnvFromSource{
				{SecretRef: &computev1alpha.SecretEnvSource{Name: "opt-secret", Optional: ptr.To(true)}},
			},
		},
	}

	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			errs := validateEnvFrom(tc.envFrom, root)
			cmpErrs(t, tc.expectedErrors, errs)
		})
	}
}

// sarGenerateName is used as a GenerateName prefix on synthetic SAR objects so
// the fake client accepts them. Extracted as a constant to satisfy goconst.
const (
	sarGenerateName = "sar-"
	// testCfgName is the ConfigMap name used by the envFrom fixtures.
	testCfgName = "cfg"
	// testCfgVolName names the ConfigMap-backed volume and its attachment.
	testCfgVolName = "cfg-vol"
	// testAppConfigName is the ConfigMap referenced by the volume fixtures.
	testAppConfigName = "app-config"
)

// TestReferencedDataSAR tests that the admission SAR check fires for referenced
// ConfigMaps and Secrets, and produces the expected errors on deny.
func TestReferencedDataSAR(t *testing.T) {
	scheme := k8sruntime.NewScheme()
	utilruntime.Must(computev1alpha.AddToScheme(scheme))
	utilruntime.Must(networkingv1alpha.AddToScheme(scheme))
	utilruntime.Must(clientgoscheme.AddToScheme(scheme))

	// allowAll is an interceptor that marks every SAR as allowed.
	allowAll := interceptor.Funcs{
		Create: func(ctx context.Context, c client.WithWatch, obj client.Object, opts ...client.CreateOption) error {
			if sar, ok := obj.(*authorizationv1.SubjectAccessReview); ok {
				sar.GenerateName = sarGenerateName
				sar.Status.Allowed = true
			}
			return c.Create(ctx, obj, opts...)
		},
	}

	// denyAll is an interceptor that marks every SAR as denied.
	denyAll := interceptor.Funcs{
		Create: func(ctx context.Context, c client.WithWatch, obj client.Object, opts ...client.CreateOption) error {
			if sar, ok := obj.(*authorizationv1.SubjectAccessReview); ok {
				sar.GenerateName = sarGenerateName
				sar.Status.Allowed = false
			}
			return c.Create(ctx, obj, opts...)
		},
	}

	// One fake client is shared by all subtests; each subtest wraps it with
	// its own interceptor.
	baseClient := fake.NewClientBuilder().WithScheme(scheme).Build()
	specPath := field.NewPath("spec").Child("template").Child("spec")

	// Fixture: a sandbox workload with one ConfigMap-backed volume.
	workloadWithConfigMap := MakeSandboxWorkload("test", func(w *computev1alpha.Workload) {
		w.Spec.Template.Spec.Volumes = []computev1alpha.InstanceVolume{
			{
				Name: testCfgVolName,
				VolumeSource: computev1alpha.VolumeSource{
					ConfigMap: &corev1.ConfigMapVolumeSource{
						LocalObjectReference: corev1.LocalObjectReference{Name: testAppConfigName},
					},
				},
			},
		}
	})

	// Fixture: a sandbox workload whose first container pulls env from a Secret.
	workloadWithSecret := MakeSandboxWorkload("test", func(w *computev1alpha.Workload) {
		w.Spec.Template.Spec.Runtime.Sandbox.Containers[0].EnvFrom = []computev1alpha.EnvFromSource{
			{SecretRef: &computev1alpha.SecretEnvSource{Name: "db-creds"}},
		}
	})

	cases := map[string]struct {
		workload       *computev1alpha.Workload
		interceptor    interceptor.Funcs
		expectedErrors field.ErrorList
	}{
		"configmap allowed": {
			workload:    workloadWithConfigMap,
			interceptor: allowAll,
		},
		"configmap denied": {
			workload:    workloadWithConfigMap,
			interceptor: denyAll,
			expectedErrors: field.ErrorList{
				field.Forbidden(specPath.Child("configmaps").Key(testAppConfigName), ""),
			},
		},
		"secret denied": {
			workload:    workloadWithSecret,
			interceptor: denyAll,
			expectedErrors: field.ErrorList{
				field.Forbidden(specPath.Child("secrets").Key("db-creds"), ""),
			},
		},
	}

	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			cl := interceptor.NewClient(baseClient, tc.interceptor)
			opts := WorkloadValidationOptions{
				Client:           cl,
				Context:          context.Background(),
				Workload:         tc.workload,
				AdmissionRequest: admission.Request{},
				// NOTE(review): sibling tests in this file pass testCityCodeDFW
				// here; this literal presumably duplicates that constant —
				// confirm and reuse it.
				ValidCityCodes: []string{"DFW"},
			}

			spec := tc.workload.Spec.Template.Spec
			errs := validateReferencedDataAccess(spec, specPath, opts)
			cmpErrs(t, tc.expectedErrors, errs)
		})
	}
}

// TestBothRefsSetEnvFrom verifies that an envFrom entry with both configMapRef
// and secretRef set is rejected by validateEnvFrom AND that the collector
// produces no refs for the invalid entry (so no SAR is issued for it).
func TestBothRefsSetEnvFrom(t *testing.T) {
	scheme := k8sruntime.NewScheme()
	utilruntime.Must(computev1alpha.AddToScheme(scheme))
	utilruntime.Must(networkingv1alpha.AddToScheme(scheme))
	utilruntime.Must(clientgoscheme.AddToScheme(scheme))

	// sarCount tracks how many SubjectAccessReviews are created.
	sarCount := 0
	cl := interceptor.NewClient(
		fake.NewClientBuilder().WithScheme(scheme).Build(),
		interceptor.Funcs{
			Create: func(ctx context.Context, c client.WithWatch, obj client.Object, opts ...client.CreateOption) error {
				if sar, ok := obj.(*authorizationv1.SubjectAccessReview); ok {
					sarCount++
					sar.GenerateName = sarGenerateName
					sar.Status.Allowed = true
				}
				return c.Create(ctx, obj, opts...)
			},
		},
	)

	envFrom := []computev1alpha.EnvFromSource{
		{
			ConfigMapRef: &computev1alpha.ConfigMapEnvSource{Name: testCfgName},
			SecretRef:    &computev1alpha.SecretEnvSource{Name: "sec"},
		},
	}
	root := field.NewPath("envFrom")
	errs := validateEnvFrom(envFrom, root)

	// Structural validation must reject it.
	wantErrs := field.ErrorList{
		field.Forbidden(root.Index(0).Child("secretRef"), ""),
	}
	cmpErrs(t, wantErrs, errs)

	// No SAR should have been issued for the invalid entry.
	workload := MakeSandboxWorkload("test", func(w *computev1alpha.Workload) {
		w.Spec.Template.Spec.Runtime.Sandbox.Containers[0].EnvFrom = envFrom
	})
	// Snapshot the counter so the check below only measures SARs created by
	// the validateReferencedDataAccess call. validateEnvFrom above is not
	// given the client, so this is expected to still be 0 here.
	sarCountBefore := sarCount
	specPath := field.NewPath("spec").Child("template").Child("spec")
	opts := WorkloadValidationOptions{
		Client:           cl,
		Context:          context.Background(),
		Workload:         workload,
		AdmissionRequest: admission.Request{},
		ValidCityCodes:   []string{testCityCodeDFW},
	}
	// The returned errors are deliberately discarded: only the number of SARs
	// created is asserted.
	_ = validateReferencedDataAccess(workload.Spec.Template.Spec, specPath, opts)
	if sarCount != sarCountBefore {
		t.Errorf("SAR was issued for invalid both-refs-set envFrom entry: got %d SAR(s), want 0 additional", sarCount-sarCountBefore)
	}
}

// TestReferencedDataSARInternalError verifies that when Client.Create returns
// an error the check is fail-closed (InternalError, not silent allow).
func TestReferencedDataSARInternalError(t *testing.T) {
	scheme := k8sruntime.NewScheme()
	utilruntime.Must(computev1alpha.AddToScheme(scheme))
	utilruntime.Must(networkingv1alpha.AddToScheme(scheme))
	utilruntime.Must(clientgoscheme.AddToScheme(scheme))

	// errored fails every SubjectAccessReview create with an injected error.
	// Creates of any other type return nil without reaching the wrapped client.
	errored := interceptor.NewClient(
		fake.NewClientBuilder().WithScheme(scheme).Build(),
		interceptor.Funcs{
			Create: func(_ context.Context, _ client.WithWatch, obj client.Object, _ ...client.CreateOption) error {
				if _, ok := obj.(*authorizationv1.SubjectAccessReview); ok {
					return fmt.Errorf("injected SAR error")
				}
				return nil
			},
		},
	)

	workload := MakeSandboxWorkload("test", func(w *computev1alpha.Workload) {
		w.Spec.Template.Spec.Volumes = []computev1alpha.InstanceVolume{
			{
				Name: testCfgVolName,
				VolumeSource: computev1alpha.VolumeSource{
					ConfigMap: &corev1.ConfigMapVolumeSource{
						LocalObjectReference: corev1.LocalObjectReference{Name: testAppConfigName},
					},
				},
			},
		}
	})

	specPath := field.NewPath("spec").Child("template").Child("spec")
	opts := WorkloadValidationOptions{
		Client:           errored,
		Context:          context.Background(),
		Workload:         workload,
		AdmissionRequest: admission.Request{},
		ValidCityCodes:   []string{testCityCodeDFW},
	}

	errs := validateReferencedDataAccess(workload.Spec.Template.Spec, specPath, opts)
	// At least one error is required, and every returned error must be of the
	// internal type.
	if len(errs) == 0 {
		t.Fatal("expected InternalError when SAR Client.Create fails, got no errors")
	}
	for _, e := range errs {
		if e.Type != field.ErrorTypeInternal {
			t.Errorf("expected InternalError type, got %v", e.Type)
		}
	}
}

// TestValidateUpdateSARPath verifies that ValidateUpdate exercises the same
// referenced-data SAR path as ValidateCreate. We test the validation function
// directly (not the webhook handler) because wiring a full mcmanager in a unit
// test is out of scope; the webhook's ValidateUpdate delegates to
// ValidateWorkloadCreate, which is the function under test here.
func TestValidateUpdateSARPath(t *testing.T) {
	scheme := k8sruntime.NewScheme()
	utilruntime.Must(computev1alpha.AddToScheme(scheme))
	utilruntime.Must(networkingv1alpha.AddToScheme(scheme))
	utilruntime.Must(clientgoscheme.AddToScheme(scheme))

	// denyConfigMaps denies SARs for configmaps, allows everything else.
	// This isolates the referenced-data check from the network check.
	denyConfigMaps := interceptor.Funcs{
		Create: func(ctx context.Context, c client.WithWatch, obj client.Object, opts ...client.CreateOption) error {
			if sar, ok := obj.(*authorizationv1.SubjectAccessReview); ok {
				sar.GenerateName = sarGenerateName
				if sar.Spec.ResourceAttributes != nil && sar.Spec.ResourceAttributes.Resource == "configmaps" {
					sar.Status.Allowed = false
				} else {
					sar.Status.Allowed = true
				}
			}
			return c.Create(ctx, obj, opts...)
		},
	}

	// The fake client is seeded with a Network whose namespace and name are
	// both testDefaultNamespace.
	baseClient := fake.NewClientBuilder().
		WithScheme(scheme).
		WithObjects(&networkingv1alpha.Network{
			ObjectMeta: metav1.ObjectMeta{Namespace: testDefaultNamespace, Name: testDefaultNamespace},
		}).
		Build()

	// Workload now references a ConfigMap — the SAR check must fire on update.
	newWorkload := MakeSandboxWorkload("test", func(w *computev1alpha.Workload) {
		w.Spec.Template.Spec.Volumes = []computev1alpha.InstanceVolume{
			{
				Name: testCfgVolName,
				VolumeSource: computev1alpha.VolumeSource{
					ConfigMap: &corev1.ConfigMapVolumeSource{
						LocalObjectReference: corev1.LocalObjectReference{Name: testAppConfigName},
					},
				},
			},
		}
		w.Spec.Template.Spec.Runtime.Sandbox.Containers[0].VolumeAttachments = []computev1alpha.VolumeAttachment{
			{Name: testCfgVolName},
		}
	})

	cl := interceptor.NewClient(baseClient, denyConfigMaps)
	opts := WorkloadValidationOptions{
		Client:           cl,
		Context:          context.Background(),
		Workload:         newWorkload,
		AdmissionRequest: admission.Request{},
		ValidCityCodes:   []string{testCityCodeDFW},
	}

	// ValidateWorkloadCreate is called by ValidateUpdate in the webhook; here
	// we call it directly to verify the SAR path fires for the new template.
	errs := ValidateWorkloadCreate(newWorkload, opts)
	specPath := field.NewPath("spec").Child("template").Child("spec")
	// The denied ConfigMap is the only error expected from the full validation.
	wantErrs := field.ErrorList{
		field.Forbidden(specPath.Child("configmaps").Key(testAppConfigName), ""),
	}
	cmpErrs(t, wantErrs, errs)
}

// TestValidateVolumeProjectionPath tests the path safety validator.
func TestValidateVolumeProjectionPath(t *testing.T) {
	root := field.NewPath("path")

	// p is the projection path under test; every rejection is expected as an
	// Invalid error on root carrying the offending path as its value.
	cases := map[string]struct {
		p              string
		expectedErrors field.ErrorList
	}{
		"relative path ok":    {p: "subdir/file.conf"},
		"single component ok": {p: "file.conf"},
		"absolute path rejected": {
			p:              "/etc/hosts",
			expectedErrors: field.ErrorList{field.Invalid(root, "/etc/hosts", "")},
		},
		"dotdot at root rejected": {
			p:              "../escape",
			expectedErrors: field.ErrorList{field.Invalid(root, "../escape", "")},
		},
		"dotdot in middle rejected": {
			p:              "a/../../../etc/passwd",
			expectedErrors: field.ErrorList{field.Invalid(root, "a/../../../etc/passwd", "")},
		},
		// "a/b/../c" cleans to "a/c" which does not start with ".."
		"dotdot in subdir safe": {p: "a/b/../c"},
	}

	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			errs := validateVolumeProjectionPath(tc.p, root)
			cmpErrs(t, tc.expectedErrors, errs)
		})
	}
}

// TestWorkloadWithReferencedDataE2E exercises full workload validation including
// the new SAR check, to confirm no regression in happy-path behaviour.
func TestWorkloadWithReferencedDataE2E(t *testing.T) {
	scheme := k8sruntime.NewScheme()
	utilruntime.Must(computev1alpha.AddToScheme(scheme))
	utilruntime.Must(networkingv1alpha.AddToScheme(scheme))
	utilruntime.Must(clientgoscheme.AddToScheme(scheme))

	// The fake client marks every SAR as allowed and is seeded with a Network
	// whose namespace and name are both testDefaultNamespace.
	fakeClient := fake.NewClientBuilder().
		WithScheme(scheme).
		WithInterceptorFuncs(interceptor.Funcs{
			Create: func(ctx context.Context, c client.WithWatch, obj client.Object, opts ...client.CreateOption) error {
				if sar, ok := obj.(*authorizationv1.SubjectAccessReview); ok {
					sar.GenerateName = sarGenerateName
					sar.Status.Allowed = true
				}
				return c.Create(ctx, obj, opts...)
			},
		}).
		WithObjects(&networkingv1alpha.Network{
			ObjectMeta: metav1.ObjectMeta{Namespace: testDefaultNamespace, Name: testDefaultNamespace},
		}).
		Build()

	workload := MakeSandboxWorkload("test", func(w *computev1alpha.Workload) {
		w.Spec.Template.Spec.Volumes = []computev1alpha.InstanceVolume{
			{
				Name: testCfgVolName,
				VolumeSource: computev1alpha.VolumeSource{
					ConfigMap: &corev1.ConfigMapVolumeSource{
						LocalObjectReference: corev1.LocalObjectReference{Name: testAppConfigName},
					},
				},
			},
		}
		// Wire volume attachment to satisfy validation.
		w.Spec.Template.Spec.Runtime.Sandbox.Containers[0].VolumeAttachments = []computev1alpha.VolumeAttachment{
			{Name: testCfgVolName},
		}
	})

	// AdmissionRequest is left at its zero value here.
	opts := WorkloadValidationOptions{
		Client:   fakeClient,
		Context:  context.Background(),
		Workload: workload,
		// NOTE(review): sibling tests in this file pass testCityCodeDFW here;
		// this literal presumably duplicates that constant — confirm and
		// reuse it.
		ValidCityCodes: []string{"DFW"},
	}

	errs := ValidateWorkloadCreate(workload, opts)
	if len(errs) != 0 {
		t.Errorf("expected no errors, got: %v", errs)
	}
}
diff --git a/internal/validation/workload_validation_test.go b/internal/validation/workload_validation_test.go
index 2a0324ee..f235cd39 100644
--- a/internal/validation/workload_validation_test.go
+++ b/internal/validation/workload_validation_test.go
@@ -647,7 +647,7 @@ func MakeSandboxWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload
 NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{
 {
 Network: networkingv1alpha.NetworkRef{
- Name: "default",
+ Name: testDefaultNamespace,
 },
 },
 },
@@ -704,7 +704,7 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload {
 NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{
 {
 Network: networkingv1alpha.NetworkRef{
- Name: "default",
+ Name: testDefaultNamespace,
 },
 },
 },
diff --git a/internal/webhook/v1alpha/workload_webhook.go b/internal/webhook/v1alpha/workload_webhook.go
index a8b94b38..b8cffecb 100644
--- a/internal/webhook/v1alpha/workload_webhook.go
+++ b/internal/webhook/v1alpha/workload_webhook.go
@@ -117,8 +117,48 @@ func (r *workloadWebhook) ValidateCreate(ctx context.Context, workload *computev
 return nil, nil
 }
-func (r *workloadWebhook) ValidateUpdate(_ context.Context, _, _ *computev1alpha.Workload) (admission.Warnings, error) {
- // TODO(user): fill in your validation logic upon object update.
+func (r *workloadWebhook) ValidateUpdate(ctx context.Context, _ *computev1alpha.Workload, newWorkload *computev1alpha.Workload) (admission.Warnings, error) { + clusterName := computewebhook.ClusterNameFromContext(ctx) + + cluster, err := r.mgr.GetCluster(ctx, multicluster.ClusterName(clusterName)) + if err != nil { + return nil, err + } + clusterClient := cluster.GetClient() + + logger := logf.FromContext(ctx).WithValues("cluster", clusterName) + logger.Info("Validating Workload Update", "name", newWorkload.GetName(), "cluster", clusterName) + + req, err := admission.RequestFromContext(ctx) + if err != nil { + return nil, err + } + + var locations networkingv1alpha.LocationBindingList + if err := clusterClient.List(ctx, &locations); err != nil { + return nil, fmt.Errorf("failed to list location bindings: %w", err) + } + + validCityCodes := sets.Set[string]{} + for _, location := range locations.Items { + cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] + if ok { + validCityCodes.Insert(cityCode) + } + } + + opts := validation.WorkloadValidationOptions{ + Context: ctx, + Client: clusterClient, + AdmissionRequest: req, + Workload: newWorkload, + ValidCityCodes: sets.List(validCityCodes), + } + + if errs := validation.ValidateWorkloadCreate(newWorkload, opts); len(errs) > 0 { + return nil, errors.NewInvalid(newWorkload.GroupVersionKind().GroupKind(), newWorkload.Name, errs) + } + return nil, nil }