From a644024ce61be3510534b2179b3a5e0027429f73 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 21:24:33 +0000 Subject: [PATCH 01/12] enable HA hub agent Signed-off-by: Wei Weng --- .github/workflows/ci.yml | 2 + charts/hub-agent/README.md | 63 ++++++++++++++++++++- charts/hub-agent/templates/certificate.yaml | 62 ++++++++++++++++++++ charts/hub-agent/templates/deployment.yaml | 15 +++++ charts/hub-agent/values.yaml | 8 ++- cmd/hubagent/main.go | 6 +- cmd/hubagent/options/options.go | 4 ++ pkg/webhook/webhook.go | 63 +++++++++++++++++++-- pkg/webhook/webhook_test.go | 60 +++++++++++++++++++- test/e2e/setup.sh | 20 ++++++- 10 files changed, 288 insertions(+), 15 deletions(-) create mode 100644 charts/hub-agent/templates/certificate.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cb908b3c6..0268797c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,6 +14,7 @@ on: env: GO_VERSION: '1.24.9' + CERT_MANAGER_VERSION: 'v1.16.2' jobs: detect-noop: @@ -125,6 +126,7 @@ jobs: PROPERTY_PROVIDER: 'azure' RESOURCE_SNAPSHOT_CREATION_MINIMUM_INTERVAL: ${{ matrix.resource-snapshot-creation-minimum-interval }} RESOURCE_CHANGES_COLLECTION_DURATION: ${{ matrix.resource-changes-collection-duration }} + CERT_MANAGER_VERSION: ${{ env.CERT_MANAGER_VERSION }} - name: Collect logs if: always() diff --git a/charts/hub-agent/README.md b/charts/hub-agent/README.md index 3f44d29a4..5b1710d5f 100644 --- a/charts/hub-agent/README.md +++ b/charts/hub-agent/README.md @@ -2,11 +2,33 @@ ## Install Chart +### Default Installation (Self-Signed Certificates) + ```console # Helm install with fleet-system namespace already created helm install hub-agent ./charts/hub-agent/ ``` +### Installation with cert-manager + +When using cert-manager for certificate management, install cert-manager as a prerequisite first: + +```console +# Install cert-manager +helm repo add jetstack https://charts.jetstack.io +helm repo update +helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --version v1.16.2 \ + --set crds.enabled=true + +# Then install hub-agent with cert-manager enabled +helm install hub-agent ./charts/hub-agent --set useCertManager=true +``` + +This configures cert-manager to manage webhook certificates. + ## Upgrade Chart ```console @@ -32,6 +54,11 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen | `affinity` | Node affinity for hub-agent pods | `{}` | | `tolerations` | Tolerations for hub-agent pods | `[]` | | `logVerbosity` | Log level (klog V logs) | `5` | +| `enableWebhook` | Enable webhook server | `true` | +| `webhookServiceName` | Webhook service name | `fleetwebhook` | +| `enableGuardRail` | Enable guard rail webhook configurations | `true` | +| `webhookClientConnectionType` | Connection type for webhook client (service or url) | `service` | +| `useCertManager` | Use cert-manager for webhook certificate management | `false` | | `enableV1Beta1APIs` | Watch for v1beta1 APIs | `true` | | `hubAPIQPS` | QPS for fleet-apiserver (not including events/node heartbeat) | `250` | | `hubAPIBurst` | Burst for fleet-apiserver (not including events/node heartbeat) | `1000` | @@ -41,4 +68,38 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen | `MaxFleetSizeSupported` | Max number of member clusters supported | `100` | | `resourceSnapshotCreationMinimumInterval` | The minimum interval at which resource snapshots could be created. | `30s` | | `resourceChangesCollectionDuration` | The duration for collecting resource changes into one snapshot. | `15s` | -| `enableWorkload` | Enable kubernetes builtin workload to run in hub cluster. | `false` | \ No newline at end of file +| `enableWorkload` | Enable kubernetes builtin workload to run in hub cluster. | `false` | + +## Certificate Management + +The hub-agent supports two modes for webhook certificate management: + +### Automatic Certificate Generation (Default) + +By default, the hub-agent generates certificates automatically at startup. This mode: +- Requires no external dependencies +- Works out of the box +- Certificates are valid for 10 years + +### cert-manager (Optional) + +When `useCertManager=true`, certificates are managed by cert-manager. This mode: +- Requires cert-manager to be installed as a prerequisite +- Handles certificate rotation automatically (90-day certificates) +- Follows industry-standard certificate management practices +- Suitable for production environments + +To switch to cert-manager mode: +```console +# Install cert-manager first +helm repo add jetstack https://charts.jetstack.io +helm repo update +helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --version v1.16.2 \ + --set crds.enabled=true + +# Then install hub-agent with cert-manager enabled +helm install hub-agent ./charts/hub-agent --set useCertManager=true +``` \ No newline at end of file diff --git a/charts/hub-agent/templates/certificate.yaml b/charts/hub-agent/templates/certificate.yaml new file mode 100644 index 000000000..ff5b3135e --- /dev/null +++ b/charts/hub-agent/templates/certificate.yaml @@ -0,0 +1,62 @@ +{{- if and .Values.enableWebhook .Values.useCertManager }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: fleet-webhook-server-cert + namespace: {{ .Values.namespace }} + labels: + {{- include "hub-agent.labels" . | nindent 4 }} +spec: + # Secret name where cert-manager will store the certificate + secretName: fleet-webhook-server-cert + + # Certificate duration (90 days is cert-manager's default and recommended) + duration: 2160h # 90 days + + # Renew certificate 30 days before expiry + renewBefore: 720h # 30 days + + # Subject configuration + subject: + organizations: + - KubeFleet + + # Common name + commonName: fleet-webhook.{{ .Values.namespace }}.svc + + # DNS names for the certificate + dnsNames: + - {{ .Values.webhookServiceName }} + - {{ .Values.webhookServiceName }}.{{ .Values.namespace }} + - {{ .Values.webhookServiceName }}.{{ .Values.namespace }}.svc + - {{ .Values.webhookServiceName }}.{{ .Values.namespace }}.svc.cluster.local + + # Issuer reference - using self-signed issuer + issuerRef: + name: fleet-selfsigned-issuer + kind: Issuer + group: cert-manager.io + + # Private key configuration + privateKey: + algorithm: ECDSA + size: 256 + + # Key usages + usages: + - digital signature + - key encipherment + - server auth +--- +# Self-signed issuer for generating the certificate +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: fleet-selfsigned-issuer + namespace: {{ .Values.namespace }} + labels: + {{- include "hub-agent.labels" . | nindent 4 }} +spec: + selfSigned: {} +{{- end }} diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index 7ed6ab7ba..b960857f2 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -6,6 +6,7 @@ metadata: labels: {{- include "hub-agent.labels" . | nindent 4 }} spec: + replicas: {{ .Values.replicaCount }} selector: matchLabels: {{- include "hub-agent.selectorLabels" . | nindent 6 }} @@ -25,6 +26,7 @@ spec: - --webhook-service-name={{ .Values.webhookServiceName }} - --enable-guard-rail={{ .Values.enableGuardRail }} - --enable-workload={{ .Values.enableWorkload }} + - --use-cert-manager={{ .Values.useCertManager }} - --whitelisted-users=system:serviceaccount:fleet-system:hub-agent-sa - --webhook-client-connection-type={{.Values.webhookClientConnectionType}} - --v={{ .Values.logVerbosity }} @@ -73,6 +75,19 @@ spec: fieldPath: metadata.namespace resources: {{- toYaml .Values.resources | nindent 12 }} + {{- if .Values.useCertManager }} + volumeMounts: + - name: webhook-cert + mountPath: /tmp/k8s-webhook-server/serving-certs + readOnly: true + {{- end }} + {{- if .Values.useCertManager }} + volumes: + - name: webhook-cert + secret: + secretName: fleet-webhook-server-cert + defaultMode: 0644 + {{- end }} {{- with .Values.affinity }} affinity: {{- toYaml . | nindent 8 }} diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml index a423c6270..1c427df1f 100644 --- a/charts/hub-agent/values.yaml +++ b/charts/hub-agent/values.yaml @@ -17,13 +17,17 @@ webhookServiceName: fleetwebhook enableGuardRail: true webhookClientConnectionType: service enableWorkload: false +# useCertManager enables cert-manager for webhook certificate management +# When enabled, cert-manager will be installed as a dependency +# and a Certificate resource will be created +useCertManager: false + forceDeleteWaitTime: 15m0s clusterUnhealthyThreshold: 3m0s resourceSnapshotCreationMinimumInterval: 30s resourceChangesCollectionDuration: 15s -namespace: - fleet-system +namespace: fleet-system resources: limits: diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go index d69d1d867..444d70147 100644 --- a/cmd/hubagent/main.go +++ b/cmd/hubagent/main.go @@ -159,7 +159,7 @@ func main() { if opts.EnableWebhook { whiteListedUsers := strings.Split(opts.WhiteListedUsers, ",") if err := SetupWebhook(mgr, options.WebhookClientConnectionType(opts.WebhookClientConnectionType), opts.WebhookServiceName, whiteListedUsers, - opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload, opts.NetworkingAgentsEnabled); err != nil { + opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload, opts.NetworkingAgentsEnabled, opts.UseCertManager); err != nil { klog.ErrorS(err, "unable to set up webhook") exitWithErrorFunc() } @@ -202,9 +202,9 @@ func main() { // SetupWebhook generates the webhook cert and then set up the webhook configurator. func SetupWebhook(mgr manager.Manager, webhookClientConnectionType options.WebhookClientConnectionType, webhookServiceName string, - whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool, networkingAgentsEnabled bool) error { + whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool, networkingAgentsEnabled bool, useCertManager bool) error { // Generate self-signed key and crt files in FleetWebhookCertDir for the webhook server to start. - w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, FleetWebhookCertDir, enableGuardRail, denyModifyMemberClusterLabels, enableWorkload) + w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, FleetWebhookCertDir, enableGuardRail, denyModifyMemberClusterLabels, enableWorkload, useCertManager) if err != nil { klog.ErrorS(err, "fail to generate WebhookConfig") return err diff --git a/cmd/hubagent/options/options.go b/cmd/hubagent/options/options.go index 6573789c1..7568ae7cc 100644 --- a/cmd/hubagent/options/options.go +++ b/cmd/hubagent/options/options.go @@ -110,6 +110,9 @@ type Options struct { // EnableWorkload enables workload resources (pods and replicasets) to be created in the hub cluster. // When set to true, the pod and replicaset validating webhooks are disabled. EnableWorkload bool + // UseCertManager indicates whether to use cert-manager for webhook certificate management. + // When enabled, webhook certificates are managed by cert-manager instead of self-signed generation. + UseCertManager bool // ResourceSnapshotCreationMinimumInterval is the minimum interval at which resource snapshots could be created. // Whether the resource snapshot is created or not depends on the both ResourceSnapshotCreationMinimumInterval and ResourceChangesCollectionDuration. ResourceSnapshotCreationMinimumInterval time.Duration @@ -185,6 +188,7 @@ func (o *Options) AddFlags(flags *flag.FlagSet) { flags.IntVar(&o.PprofPort, "pprof-port", 6065, "The port for pprof profiling.") flags.BoolVar(&o.DenyModifyMemberClusterLabels, "deny-modify-member-cluster-labels", false, "If set, users not in the system:masters cannot modify member cluster labels.") flags.BoolVar(&o.EnableWorkload, "enable-workload", false, "If set, workloads (pods and replicasets) can be created in the hub cluster. This disables the pod and replicaset validating webhooks.") + flags.BoolVar(&o.UseCertManager, "use-cert-manager", false, "If set, cert-manager will be used for webhook certificate management instead of self-signed certificates.") flags.DurationVar(&o.ResourceSnapshotCreationMinimumInterval, "resource-snapshot-creation-minimum-interval", 30*time.Second, "The minimum interval at which resource snapshots could be created.") flags.DurationVar(&o.ResourceChangesCollectionDuration, "resource-changes-collection-duration", 15*time.Second, "The duration for collecting resource changes into one snapshot. The default is 15 seconds, which means that the controller will collect resource changes for 15 seconds before creating a resource snapshot.") diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 4f4c89b56..4312b5008 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -70,6 +70,7 @@ import ( const ( fleetWebhookCertFileName = "tls.crt" fleetWebhookKeyFileName = "tls.key" + fleetWebhookCertSecretName = "fleet-webhook-server-cert" //nolint:gosec // This is a Secret name, not a credential fleetValidatingWebhookCfgName = "fleet-validating-webhook-configuration" fleetGuardRailWebhookCfgName = "fleet-guard-rail-webhook-configuration" fleetMutatingWebhookCfgName = "fleet-mutating-webhook-configuration" @@ -162,9 +163,11 @@ type Config struct { denyModifyMemberClusterLabels bool enableWorkload bool + // useCertManager indicates whether cert-manager is used for certificate management + useCertManager bool } -func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32, clientConnectionType *options.WebhookClientConnectionType, certDir string, enableGuardRail bool, denyModifyMemberClusterLabels bool, enableWorkload bool) (*Config, error) { +func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32, clientConnectionType *options.WebhookClientConnectionType, certDir string, enableGuardRail bool, denyModifyMemberClusterLabels bool, enableWorkload bool, useCertManager bool) (*Config, error) { // We assume the Pod namespace should be passed to env through downward API in the Pod spec. namespace := os.Getenv("POD_NAMESPACE") if namespace == "" { @@ -180,13 +183,29 @@ func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32 enableGuardRail: enableGuardRail, denyModifyMemberClusterLabels: denyModifyMemberClusterLabels, enableWorkload: enableWorkload, + useCertManager: useCertManager, } - caPEM, err := w.genCertificate(certDir) - if err != nil { - return nil, err + + var caPEM []byte + var err error + + if useCertManager { + // When using cert-manager, certificates are mounted as files by Kubernetes + // cert-manager creates tls.crt and tls.key, but we need ca.crt for the webhook config + caPEM, err = w.loadCertManagerCA(certDir) + if err != nil { + return nil, fmt.Errorf("failed to load cert-manager CA certificate: %w", err) + } + } else { + // Use self-signed certificate generation (original flow) + caPEM, err = w.genCertificate(certDir) + if err != nil { + return nil, err + } } + w.caPEM = caPEM - return &w, err + return &w, nil } func (w *Config) Start(ctx context.Context) error { @@ -216,12 +235,19 @@ func (w *Config) createFleetWebhookConfiguration(ctx context.Context) error { // createMutatingWebhookConfiguration creates the MutatingWebhookConfiguration object for the webhook. func (w *Config) createMutatingWebhookConfiguration(ctx context.Context, webhooks []admv1.MutatingWebhook, configName string) error { + annotations := map[string]string{} + if w.useCertManager { + // Tell cert-manager's CA injector to automatically inject the CA bundle + annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, fleetWebhookCertSecretName) + } + mutatingWebhookConfig := admv1.MutatingWebhookConfiguration{ ObjectMeta: metav1.ObjectMeta{ Name: configName, Labels: map[string]string{ "admissions.enforcer/disabled": "true", }, + Annotations: annotations, }, Webhooks: webhooks, } @@ -269,12 +295,19 @@ func (w *Config) buildFleetMutatingWebhooks() []admv1.MutatingWebhook { } func (w *Config) createValidatingWebhookConfiguration(ctx context.Context, webhooks []admv1.ValidatingWebhook, configName string) error { + annotations := map[string]string{} + if w.useCertManager { + // Tell cert-manager's CA injector to automatically inject the CA bundle + annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, fleetWebhookCertSecretName) + } + validatingWebhookConfig := admv1.ValidatingWebhookConfiguration{ ObjectMeta: metav1.ObjectMeta{ Name: configName, Labels: map[string]string{ "admissions.enforcer/disabled": "true", }, + Annotations: annotations, }, Webhooks: webhooks, } @@ -659,6 +692,26 @@ func (w *Config) genCertificate(certDir string) ([]byte, error) { return caPEM, nil } +// loadCertManagerCA loads the CA certificate from the mounted cert-manager Secret. +// When using cert-manager, Kubernetes mounts the Secret as files in the certDir. +// cert-manager creates: ca.crt, tls.crt, and tls.key +// The tls.crt and tls.key are automatically used by the webhook server. +// We only need to read ca.crt for the webhook configuration's CABundle. +func (w *Config) loadCertManagerCA(certDir string) ([]byte, error) { + caPath := filepath.Join(certDir, "ca.crt") + caCert, err := os.ReadFile(caPath) + if err != nil { + return nil, fmt.Errorf("failed to read ca.crt from %s: %w", caPath, err) + } + + if len(caCert) == 0 { + return nil, fmt.Errorf("ca.crt is empty at %s", caPath) + } + + klog.V(2).InfoS("Successfully loaded CA certificate from cert-manager mounted Secret", "path", caPath) + return caCert, nil +} + // genSelfSignedCert generates the self signed Certificate/Key pair func (w *Config) genSelfSignedCert() (caPEMByte, certPEMByte, keyPEMByte []byte, err error) { // CA config diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index f1975b30b..1a5619068 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -1,6 +1,8 @@ package webhook import ( + "os" + "path/filepath" "testing" "github.com/google/go-cmp/cmp" @@ -12,6 +14,27 @@ import ( "github.com/kubefleet-dev/kubefleet/pkg/utils" ) +const mockCA = `-----BEGIN CERTIFICATE----- +MIIBkTCB+wIJAKHHCgVZU7c4MA0GCSqGSIb3DQEBCwUAMBExDzANBgNVBAMMBnRl +c3RDQTAeFw0yNDAxMDEwMDAwMDBaFw0yNTAxMDEwMDAwMDBaMBExDzANBgNVBAMM +BnRlc3RDQTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAwjBCKwYJKoZIhvcN +AQkBFhZmb3JtYXRAZXhhbXBsZS5jb20wHhcNMjQwMTAxMDAwMDAwWhcNMjUwMTAx +MDAwMDAwWjARMQ8wDQYDVQQDDAZ0ZXN0Q0EwXDANBgkqhkiG9w0BAQEFAANLADBI +AkEAwjBCKwYJKoZIhvcNAQEBBQADQQAwPgJBAMIwQisGCSqGSIb3DQEBAQUAA0EA +MD4CQQDCMEIrBgkqhkiG9w0BAQEFAANBADANBgkqhkiG9w0BAQUFAANBAMA= +-----END CERTIFICATE-----` + +// setupMockCertManagerFiles creates mock certificate files for testing cert-manager mode. +func setupMockCertManagerFiles(t *testing.T, certDir string) { + t.Helper() + if err := os.WriteFile(filepath.Join(certDir, "ca.crt"), []byte(mockCA), 0600); err != nil { + t.Fatalf("failed to create mock ca.crt: %v", err) + } + t.Cleanup(func() { + os.RemoveAll(certDir) + }) +} + func TestBuildFleetMutatingWebhooks(t *testing.T) { url := options.WebhookClientConnectionType("url") testCases := map[string]struct { @@ -110,6 +133,7 @@ func TestNewWebhookConfig(t *testing.T) { enableGuardRail bool denyModifyMemberClusterLabels bool enableWorkload bool + useCertManager bool want *Config wantErr bool }{ @@ -119,10 +143,34 @@ func TestNewWebhookConfig(t *testing.T) { webhookServiceName: "test-webhook", port: 8080, clientConnectionType: nil, - certDir: "/tmp/cert", + certDir: t.TempDir(), + enableGuardRail: true, + denyModifyMemberClusterLabels: true, + enableWorkload: false, + useCertManager: false, + want: &Config{ + serviceNamespace: "test-namespace", + serviceName: "test-webhook", + servicePort: 8080, + clientConnectionType: nil, + enableGuardRail: true, + denyModifyMemberClusterLabels: true, + enableWorkload: false, + useCertManager: false, + }, + wantErr: false, + }, + { + name: "valid input with cert-manager", + mgr: nil, + webhookServiceName: "test-webhook", + port: 8080, + clientConnectionType: nil, + certDir: t.TempDir(), enableGuardRail: true, denyModifyMemberClusterLabels: true, enableWorkload: false, + useCertManager: true, want: &Config{ serviceNamespace: "test-namespace", serviceName: "test-webhook", @@ -131,6 +179,7 @@ func TestNewWebhookConfig(t *testing.T) { enableGuardRail: true, denyModifyMemberClusterLabels: true, enableWorkload: false, + useCertManager: true, }, wantErr: false, }, @@ -138,8 +187,13 @@ func TestNewWebhookConfig(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { t.Setenv("POD_NAMESPACE", "test-namespace") - defer t.Setenv("POD_NAMESPACE", "") - got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels, tt.enableWorkload) + + // Create mock certificate files for cert-manager mode + if tt.useCertManager { + setupMockCertManagerFiles(t, tt.certDir) + } + + got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels, tt.enableWorkload, tt.useCertManager) if (err != nil) != tt.wantErr { t.Errorf("NewWebhookConfig() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh index d4b6b8a2a..5edc75480 100755 --- a/test/e2e/setup.sh +++ b/test/e2e/setup.sh @@ -28,6 +28,7 @@ export PROPERTY_PROVIDER="${PROPERTY_PROVIDER:-azure}" export USE_PREDEFINED_REGIONS="${USE_PREDEFINED_REGIONS:-false}" export RESOURCE_SNAPSHOT_CREATION_MINIMUM_INTERVAL="${RESOURCE_SNAPSHOT_CREATION_MINIMUM_INTERVAL:-0m}" export RESOURCE_CHANGES_COLLECTION_DURATION="${RESOURCE_CHANGES_COLLECTION_DURATION:-0m}" +export CERT_MANAGER_VERSION="${CERT_MANAGER_VERSION:-v1.16.2}" # The pre-defined regions; if the AKS property provider is used. # @@ -114,14 +115,31 @@ done # Install the helm charts -# Install the hub agent to the hub cluster kind export kubeconfig --name $HUB_CLUSTER + +# Install cert-manager first (required for webhook certificates) +echo "Installing cert-manager..." + +# Install cert-manager using Helm to avoid ownership conflicts with hub-agent chart +helm repo add jetstack https://charts.jetstack.io --force-update +helm repo update +helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --version $CERT_MANAGER_VERSION \ + --set crds.enabled=true \ + --wait \ + --timeout=300s + +# Install the hub agent to the hub cluster helm install hub-agent ../../charts/hub-agent/ \ --set image.pullPolicy=Never \ --set image.repository=$REGISTRY/$HUB_AGENT_IMAGE \ --set image.tag=$TAG \ --set namespace=fleet-system \ --set logVerbosity=5 \ + --set replicaCount=3 \ + --set useCertManager=true \ --set enableWebhook=true \ --set enableWorkload=true \ --set webhookClientConnectionType=service \ From 6571b12dfa915145ad6d16d1d32e30ee83c6f5e8 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 21:24:33 +0000 Subject: [PATCH 02/12] add chart validation and webhook tests Signed-off-by: Wei Weng --- charts/hub-agent/templates/deployment.yaml | 5 +- pkg/webhook/webhook.go | 4 +- pkg/webhook/webhook_test.go | 91 ++++++++++++++++++++++ 3 files changed, 97 insertions(+), 3 deletions(-) diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index b960857f2..ce51cc337 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -1,3 +1,6 @@ +{{- if and (not .Values.useCertManager) (gt (.Values.replicaCount | int) 1) }} +{{- fail "ERROR: replicaCount > 1 requires useCertManager=true (self-signed certificates cannot be shared across replicas)" }} +{{- end }} apiVersion: apps/v1 kind: Deployment metadata: @@ -86,7 +89,7 @@ spec: - name: webhook-cert secret: secretName: fleet-webhook-server-cert - defaultMode: 0644 + defaultMode: 0400 {{- end }} {{- with .Values.affinity }} affinity: diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 4312b5008..b89935ec1 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -200,7 +200,7 @@ func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32 // Use self-signed certificate generation (original flow) caPEM, err = w.genCertificate(certDir) if err != nil { - return nil, err + return nil, fmt.Errorf("failed to generate self-signed certificate: %w", err) } } @@ -694,7 +694,7 @@ func (w *Config) genCertificate(certDir string) ([]byte, error) { // loadCertManagerCA loads the CA certificate from the mounted cert-manager Secret. // When using cert-manager, Kubernetes mounts the Secret as files in the certDir. -// cert-manager creates: ca.crt, tls.crt, and tls.key +// cert-manager creates: ca.crt, tls.crt, and tls.key. // The tls.crt and tls.key are automatically used by the webhook server. // We only need to read ca.crt for the webhook configuration's CABundle. func (w *Config) loadCertManagerCA(certDir string) ([]byte, error) { diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 1a5619068..f8a46f4a4 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -3,6 +3,7 @@ package webhook import ( "os" "path/filepath" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -215,3 +216,93 @@ func TestNewWebhookConfig(t *testing.T) { }) } } +func TestLoadCertManagerCA_NotFound(t *testing.T) { + config := &Config{} + _, err := config.loadCertManagerCA("/nonexistent/path") + if err == nil { + t.Error("Expected error when certificate files don't exist") + } +} + +func TestLoadCertManagerCA_EmptyFile(t *testing.T) { + dir := t.TempDir() + // Create empty files + if err := os.WriteFile(filepath.Join(dir, "tls.crt"), []byte{}, 0600); err != nil { + t.Fatalf("failed to create empty tls.crt: %v", err) + } + if err := os.WriteFile(filepath.Join(dir, "ca.crt"), []byte{}, 0600); err != nil { + t.Fatalf("failed to create empty ca.crt: %v", err) + } + + config := &Config{} + _, err := config.loadCertManagerCA(dir) + if err == nil { + t.Error("Expected error for empty certificate files") + } + if !strings.Contains(err.Error(), "empty") { + t.Errorf("Expected error message to contain 'empty', got: %v", err) + } +} + +func TestLoadCertManagerCA_Success(t *testing.T) { + t.Run("loads ca.crt successfully", func(t *testing.T) { + dir := t.TempDir() + caContent := []byte("test-ca-content") + if err := os.WriteFile(filepath.Join(dir, "ca.crt"), caContent, 0600); err != nil { + t.Fatalf("failed to create ca.crt: %v", err) + } + + config := &Config{} + result, err := config.loadCertManagerCA(dir) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if string(result) != string(caContent) { + t.Errorf("Expected %s, got %s", caContent, result) + } + }) +} + +func TestNewWebhookConfig_CertManagerNotMounted(t *testing.T) { + t.Setenv("POD_NAMESPACE", "test-namespace") + + dir := t.TempDir() + // Don't create any certificate files to simulate cert-manager not ready + + _, err := NewWebhookConfig(nil, "test-webhook", 8080, nil, dir, true, true, false, true) + if err == nil { + t.Error("Expected error when cert-manager certificates not mounted") + } + if !strings.Contains(err.Error(), "failed to load cert-manager CA certificate") { + t.Errorf("Expected error about loading cert-manager CA, got: %v", err) + } +} + +func TestNewWebhookConfig_SelfSignedCertError(t *testing.T) { + t.Setenv("POD_NAMESPACE", "test-namespace") + + // Use an invalid certDir (read-only location) to force genCertificate to fail + invalidCertDir := "/proc/invalid-cert-dir" + + clientConnectionType := options.Service + _, err := NewWebhookConfig( + nil, + "test-service", + 443, + &clientConnectionType, + invalidCertDir, + false, // enableGuardRail + false, // denyModifyMemberClusterLabels + false, // enableWorkload + false, // useCertManager = false to trigger self-signed path + ) + + if err == nil { + t.Fatal("Expected error when genCertificate fails, got nil") + } + + expectedErrMsg := "failed to generate self-signed certificate" + if !strings.Contains(err.Error(), expectedErrMsg) { + t.Errorf("Expected error to contain '%s', got: %v", expectedErrMsg, err) + } +} From 496993c8b5cf359ef0fcb7f7017c81c74ea82756 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 21:24:33 +0000 Subject: [PATCH 03/12] make some configurable and add validation and document Signed-off-by: Wei Weng --- charts/hub-agent/README.md | 31 ++++++++++++++-- charts/hub-agent/templates/certificate.yaml | 4 +-- charts/hub-agent/templates/deployment.yaml | 8 +++-- charts/hub-agent/values.yaml | 6 ++++ cmd/hubagent/main.go | 13 ++++--- cmd/hubagent/options/options.go | 8 +++++ cmd/hubagent/options/validation.go | 8 +++++ cmd/hubagent/options/validation_test.go | 40 +++++++++++++++++++++ pkg/webhook/webhook.go | 9 +++-- pkg/webhook/webhook_test.go | 13 +++---- 10 files changed, 117 insertions(+), 23 deletions(-) diff --git a/charts/hub-agent/README.md b/charts/hub-agent/README.md index 5b1710d5f..dcf9d2efa 100644 --- a/charts/hub-agent/README.md +++ b/charts/hub-agent/README.md @@ -24,7 +24,7 @@ helm install cert-manager jetstack/cert-manager \ --set crds.enabled=true # Then install hub-agent with cert-manager enabled -helm install hub-agent ./charts/hub-agent --set useCertManager=true +helm install hub-agent ./charts/hub-agent --set useCertManager=true --set enableWorkload=true --set enableWebhook=true ``` This configures cert-manager to manage webhook certificates. @@ -58,7 +58,9 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen | `webhookServiceName` | Webhook service name | `fleetwebhook` | | `enableGuardRail` | Enable guard rail webhook configurations | `true` | | `webhookClientConnectionType` | Connection type for webhook client (service or url) | `service` | -| `useCertManager` | Use cert-manager for webhook certificate management | `false` | +| `useCertManager` | Use cert-manager for webhook certificate management (requires `enableWebhook=true` and `enableWorkload=true`) | `false` | +| `webhookCertDir` | Directory where webhook certificates are stored/mounted | `/tmp/k8s-webhook-server/serving-certs` | +| `webhookCertSecretName` | Name of the Secret containing webhook certificates | `fleet-webhook-server-cert` | | `enableV1Beta1APIs` | Watch for v1beta1 APIs | `true` | | `hubAPIQPS` | QPS for fleet-apiserver (not including events/node heartbeat) | `250` | | `hubAPIBurst` | Burst for fleet-apiserver (not including events/node heartbeat) | `1000` | @@ -85,6 +87,8 @@ By default, the hub-agent generates certificates automatically at startup. This When `useCertManager=true`, certificates are managed by cert-manager. This mode: - Requires cert-manager to be installed as a prerequisite +- Requires `enableWorkload=true` to allow cert-manager pods to run in the hub cluster (without this, pod creation would be blocked by the webhook) +- Requires `enableWebhook=true` because cert-manager is only used for webhook certificate management - Handles certificate rotation automatically (90-day certificates) - Follows industry-standard certificate management practices - Suitable for production environments @@ -101,5 +105,26 @@ helm install cert-manager jetstack/cert-manager \ --set crds.enabled=true # Then install hub-agent with cert-manager enabled -helm install hub-agent ./charts/hub-agent --set useCertManager=true +helm install hub-agent ./charts/hub-agent --set useCertManager=true --set enableWorkload=true --set enableWebhook=true +``` + +### Certificate Directory Configuration + +The `webhookCertDir` parameter allows you to customize where webhook certificates are stored: +- Default: `/tmp/k8s-webhook-server/serving-certs` +- Must match the volumeMount path when using cert-manager +- Configurable via both Helm values and `--webhook-cert-dir` flag + +The `webhookCertSecretName` parameter allows you to customize the Secret name for webhook certificates: +- Default: `fleet-webhook-server-cert` +- When using cert-manager, this must match the Certificate resource's secretName +- Configurable via both Helm values and `--webhook-cert-secret-name` flag + +Example with custom certificate directory and secret name: +```console +helm install hub-agent ./charts/hub-agent \ + --set useCertManager=true \ + --set enableWorkload=true \ + --set webhookCertDir=/custom/cert/path \ + --set webhookCertSecretName=my-webhook-cert ``` \ No newline at end of file diff --git a/charts/hub-agent/templates/certificate.yaml b/charts/hub-agent/templates/certificate.yaml index ff5b3135e..0643f7f88 100644 --- a/charts/hub-agent/templates/certificate.yaml +++ b/charts/hub-agent/templates/certificate.yaml @@ -3,13 +3,13 @@ apiVersion: cert-manager.io/v1 kind: Certificate metadata: - name: fleet-webhook-server-cert + name: {{ .Values.webhookCertSecretName }} namespace: {{ .Values.namespace }} labels: {{- include "hub-agent.labels" . | nindent 4 }} spec: # Secret name where cert-manager will store the certificate - secretName: fleet-webhook-server-cert + secretName: {{ .Values.webhookCertSecretName }} # Certificate duration (90 days is cert-manager's default and recommended) duration: 2160h # 90 days diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index ce51cc337..12934c849 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -30,6 +30,8 @@ spec: - --enable-guard-rail={{ .Values.enableGuardRail }} - --enable-workload={{ .Values.enableWorkload }} - --use-cert-manager={{ .Values.useCertManager }} + - --webhook-cert-dir={{ .Values.webhookCertDir }} + - --webhook-cert-secret-name={{ .Values.webhookCertSecretName }} - --whitelisted-users=system:serviceaccount:fleet-system:hub-agent-sa - --webhook-client-connection-type={{.Values.webhookClientConnectionType}} - --v={{ .Values.logVerbosity }} @@ -81,14 +83,16 @@ spec: {{- if .Values.useCertManager }} volumeMounts: - name: webhook-cert - mountPath: /tmp/k8s-webhook-server/serving-certs + mountPath: {{ .Values.webhookCertDir }} readOnly: true {{- end }} {{- if .Values.useCertManager }} volumes: - name: webhook-cert secret: - secretName: fleet-webhook-server-cert + secretName: {{ .Values.webhookCertSecretName }} + # defaultMode 0400 (read-only for owner) prevents unauthorized access to certificate files + # and reduces attack surface by ensuring only the container process can read the certs defaultMode: 0400 {{- end }} {{- with .Values.affinity }} diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml index 1c427df1f..7808b8035 100644 --- a/charts/hub-agent/values.yaml +++ b/charts/hub-agent/values.yaml @@ -21,6 +21,12 @@ enableWorkload: false # When enabled, cert-manager will be installed as a dependency # and a Certificate resource will be created useCertManager: false +# webhookCertDir is the directory where webhook certificates are mounted +# This is configurable via --webhook-cert-dir flag and must match the volumeMount path in deployment +webhookCertDir: /tmp/k8s-webhook-server/serving-certs +# webhookCertSecretName is the name of the Secret containing webhook certificates +# When using cert-manager, this must match the Certificate resource's secretName +webhookCertSecretName: fleet-webhook-server-cert forceDeleteWaitTime: 15m0s clusterUnhealthyThreshold: 3m0s diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go index 444d70147..794cd368b 100644 --- a/cmd/hubagent/main.go +++ b/cmd/hubagent/main.go @@ -65,8 +65,7 @@ var ( ) const ( - FleetWebhookCertDir = "/tmp/k8s-webhook-server/serving-certs" - FleetWebhookPort = 9443 + FleetWebhookPort = 9443 ) func init() { @@ -121,7 +120,7 @@ func main() { }, WebhookServer: ctrlwebhook.NewServer(ctrlwebhook.Options{ Port: FleetWebhookPort, - CertDir: FleetWebhookCertDir, + CertDir: opts.WebhookCertDir, }), } if opts.EnablePprof { @@ -159,7 +158,7 @@ func main() { if opts.EnableWebhook { whiteListedUsers := strings.Split(opts.WhiteListedUsers, ",") if err := SetupWebhook(mgr, options.WebhookClientConnectionType(opts.WebhookClientConnectionType), opts.WebhookServiceName, whiteListedUsers, - opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload, opts.NetworkingAgentsEnabled, opts.UseCertManager); err != nil { + opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload, opts.NetworkingAgentsEnabled, opts.UseCertManager, opts.WebhookCertDir, opts.WebhookCertSecretName); err != nil { klog.ErrorS(err, "unable to set up webhook") exitWithErrorFunc() } @@ -202,9 +201,9 @@ func main() { // SetupWebhook generates the webhook cert and then set up the webhook configurator. func SetupWebhook(mgr manager.Manager, webhookClientConnectionType options.WebhookClientConnectionType, webhookServiceName string, - whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool, networkingAgentsEnabled bool, useCertManager bool) error { - // Generate self-signed key and crt files in FleetWebhookCertDir for the webhook server to start. - w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, FleetWebhookCertDir, enableGuardRail, denyModifyMemberClusterLabels, enableWorkload, useCertManager) + whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool, networkingAgentsEnabled bool, useCertManager bool, webhookCertDir string, webhookCertSecretName string) error { + // Generate self-signed key and crt files in webhookCertDir for the webhook server to start. + w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, webhookCertDir, enableGuardRail, denyModifyMemberClusterLabels, enableWorkload, useCertManager, webhookCertSecretName) if err != nil { klog.ErrorS(err, "fail to generate WebhookConfig") return err diff --git a/cmd/hubagent/options/options.go b/cmd/hubagent/options/options.go index 7568ae7cc..ea30c79c4 100644 --- a/cmd/hubagent/options/options.go +++ b/cmd/hubagent/options/options.go @@ -113,6 +113,12 @@ type Options struct { // UseCertManager indicates whether to use cert-manager for webhook certificate management. // When enabled, webhook certificates are managed by cert-manager instead of self-signed generation. UseCertManager bool + // WebhookCertDir is the directory where webhook certificates are stored/mounted. + // This must match the mountPath in the Helm chart deployment when using cert-manager. + WebhookCertDir string + // WebhookCertSecretName is the name of the Secret containing webhook certificates. + // When using cert-manager, this is the Secret name created by the Certificate resource. + WebhookCertSecretName string // ResourceSnapshotCreationMinimumInterval is the minimum interval at which resource snapshots could be created. // Whether the resource snapshot is created or not depends on the both ResourceSnapshotCreationMinimumInterval and ResourceChangesCollectionDuration. ResourceSnapshotCreationMinimumInterval time.Duration @@ -189,6 +195,8 @@ func (o *Options) AddFlags(flags *flag.FlagSet) { flags.BoolVar(&o.DenyModifyMemberClusterLabels, "deny-modify-member-cluster-labels", false, "If set, users not in the system:masters cannot modify member cluster labels.") flags.BoolVar(&o.EnableWorkload, "enable-workload", false, "If set, workloads (pods and replicasets) can be created in the hub cluster. This disables the pod and replicaset validating webhooks.") flags.BoolVar(&o.UseCertManager, "use-cert-manager", false, "If set, cert-manager will be used for webhook certificate management instead of self-signed certificates.") + flags.StringVar(&o.WebhookCertDir, "webhook-cert-dir", "/tmp/k8s-webhook-server/serving-certs", "The directory where webhook certificates are stored. Must match the volumeMount path in deployment when using cert-manager.") + flags.StringVar(&o.WebhookCertSecretName, "webhook-cert-secret-name", "fleet-webhook-server-cert", "The name of the Secret containing webhook certificates. Must match the secretName in deployment and Certificate resource when using cert-manager.") flags.DurationVar(&o.ResourceSnapshotCreationMinimumInterval, "resource-snapshot-creation-minimum-interval", 30*time.Second, "The minimum interval at which resource snapshots could be created.") flags.DurationVar(&o.ResourceChangesCollectionDuration, "resource-changes-collection-duration", 15*time.Second, "The duration for collecting resource changes into one snapshot. The default is 15 seconds, which means that the controller will collect resource changes for 15 seconds before creating a resource snapshot.") diff --git a/cmd/hubagent/options/validation.go b/cmd/hubagent/options/validation.go index df5af530f..647ddc166 100644 --- a/cmd/hubagent/options/validation.go +++ b/cmd/hubagent/options/validation.go @@ -52,6 +52,14 @@ func (o *Options) Validate() field.ErrorList { errs = append(errs, field.Invalid(newPath.Child("WebhookServiceName"), o.WebhookServiceName, "Webhook service name is required when webhook is enabled")) } + if o.UseCertManager && !o.EnableWebhook { + errs = append(errs, field.Invalid(newPath.Child("UseCertManager"), o.UseCertManager, "UseCertManager requires EnableWebhook to be true (cert-manager is only used for webhook certificate management)")) + } + + if o.UseCertManager && !o.EnableWorkload { + errs = append(errs, field.Invalid(newPath.Child("UseCertManager"), o.UseCertManager, "UseCertManager requires EnableWorkload to be true (cert-manager pods need to run in the hub cluster)")) + } + connectionType := o.WebhookClientConnectionType if _, err := parseWebhookClientConnectionString(connectionType); err != nil { errs = append(errs, field.Invalid(newPath.Child("WebhookClientConnectionType"), o.WebhookClientConnectionType, err.Error())) diff --git a/cmd/hubagent/options/validation_test.go b/cmd/hubagent/options/validation_test.go index b4cf80b9c..988f61a05 100644 --- a/cmd/hubagent/options/validation_test.go +++ b/cmd/hubagent/options/validation_test.go @@ -27,6 +27,10 @@ import ( "k8s.io/apimachinery/pkg/util/validation/field" ) +const ( + testWebhookServiceName = "test-webhook" +) + // a callback function to modify options type ModifyOptions func(option *Options) @@ -93,6 +97,42 @@ func TestValidateControllerManagerConfiguration(t *testing.T) { }), want: field.ErrorList{field.Invalid(newPath.Child("WebhookServiceName"), "", "Webhook service name is required when webhook is enabled")}, }, + "UseCertManager without EnableWebhook": { + opt: newTestOptions(func(option *Options) { + option.EnableWebhook = false + option.UseCertManager = true + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWebhook to be true (cert-manager is only used for webhook certificate management)"), + field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (cert-manager pods need to run in the hub cluster)"), + }, + }, + "UseCertManager with EnableWebhook": { + opt: newTestOptions(func(option *Options) { + option.EnableWebhook = true + option.WebhookServiceName = testWebhookServiceName + option.UseCertManager = true + }), + want: field.ErrorList{field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (cert-manager pods need to run in the hub cluster)")}, + }, + "UseCertManager without EnableWorkload": { + opt: newTestOptions(func(option *Options) { + option.EnableWebhook = true + option.WebhookServiceName = testWebhookServiceName + option.UseCertManager = true + option.EnableWorkload = false + }), + want: field.ErrorList{field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (cert-manager pods need to run in the hub cluster)")}, + }, + "UseCertManager with EnableWebhook and EnableWorkload": { + opt: newTestOptions(func(option *Options) { + option.EnableWebhook = true + option.WebhookServiceName = testWebhookServiceName + option.UseCertManager = true + option.EnableWorkload = true + }), + want: field.ErrorList{}, + }, } for name, tc := range testCases { diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index b89935ec1..097a69234 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -165,9 +165,11 @@ type Config struct { enableWorkload bool // useCertManager indicates whether cert-manager is used for certificate management useCertManager bool + // webhookCertSecretName is the name of the Secret containing webhook certificates + webhookCertSecretName string } -func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32, clientConnectionType *options.WebhookClientConnectionType, certDir string, enableGuardRail bool, denyModifyMemberClusterLabels bool, enableWorkload bool, useCertManager bool) (*Config, error) { +func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32, clientConnectionType *options.WebhookClientConnectionType, certDir string, enableGuardRail bool, denyModifyMemberClusterLabels bool, enableWorkload bool, useCertManager bool, webhookCertSecretName string) (*Config, error) { // We assume the Pod namespace should be passed to env through downward API in the Pod spec. namespace := os.Getenv("POD_NAMESPACE") if namespace == "" { @@ -184,6 +186,7 @@ func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32 denyModifyMemberClusterLabels: denyModifyMemberClusterLabels, enableWorkload: enableWorkload, useCertManager: useCertManager, + webhookCertSecretName: webhookCertSecretName, } var caPEM []byte @@ -238,7 +241,7 @@ func (w *Config) createMutatingWebhookConfiguration(ctx context.Context, webhook annotations := map[string]string{} if w.useCertManager { // Tell cert-manager's CA injector to automatically inject the CA bundle - annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, fleetWebhookCertSecretName) + annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, w.webhookCertSecretName) } mutatingWebhookConfig := admv1.MutatingWebhookConfiguration{ @@ -298,7 +301,7 @@ func (w *Config) createValidatingWebhookConfiguration(ctx context.Context, webho annotations := map[string]string{} if w.useCertManager { // Tell cert-manager's CA injector to automatically inject the CA bundle - annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, fleetWebhookCertSecretName) + annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, w.webhookCertSecretName) } validatingWebhookConfig := admv1.ValidatingWebhookConfiguration{ diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index f8a46f4a4..7621935b2 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -194,7 +194,7 @@ func TestNewWebhookConfig(t *testing.T) { setupMockCertManagerFiles(t, tt.certDir) } - got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels, tt.enableWorkload, tt.useCertManager) + got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels, tt.enableWorkload, tt.useCertManager, "fleet-webhook-server-cert") if (err != nil) != tt.wantErr { t.Errorf("NewWebhookConfig() error = %v, wantErr %v", err, tt.wantErr) return @@ -269,7 +269,7 @@ func TestNewWebhookConfig_CertManagerNotMounted(t *testing.T) { dir := t.TempDir() // Don't create any certificate files to simulate cert-manager not ready - _, err := NewWebhookConfig(nil, "test-webhook", 8080, nil, dir, true, true, false, true) + _, err := NewWebhookConfig(nil, "test-webhook", 8080, nil, dir, true, true, false, true, "fleet-webhook-server-cert") if err == nil { t.Error("Expected error when cert-manager certificates not mounted") } @@ -291,10 +291,11 @@ func TestNewWebhookConfig_SelfSignedCertError(t *testing.T) { 443, &clientConnectionType, invalidCertDir, - false, // enableGuardRail - false, // denyModifyMemberClusterLabels - false, // enableWorkload - false, // useCertManager = false to trigger self-signed path + false, // enableGuardRail + false, // denyModifyMemberClusterLabels + false, // enableWorkload + false, // useCertManager = false to trigger self-signed path + "fleet-webhook-server-cert", // webhookCertSecretName ) if err == nil { From 9b5025772d42d11371927c24e4071dfde4801ef5 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 21:24:33 +0000 Subject: [PATCH 04/12] commit some copilot suggestions Signed-off-by: Wei Weng --- charts/hub-agent/values.yaml | 2 +- cmd/hubagent/options/validation.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml index 7808b8035..87c96ee26 100644 --- a/charts/hub-agent/values.yaml +++ b/charts/hub-agent/values.yaml @@ -18,7 +18,7 @@ enableGuardRail: true webhookClientConnectionType: service enableWorkload: false # useCertManager enables cert-manager for webhook certificate management -# When enabled, cert-manager will be installed as a dependency +# When enabled, cert-manager must be installed as a prerequisite (it is not installed automatically by this chart) # and a Certificate resource will be created useCertManager: false # webhookCertDir is the directory where webhook certificates are mounted diff --git a/cmd/hubagent/options/validation.go b/cmd/hubagent/options/validation.go index 647ddc166..4258f24e2 100644 --- a/cmd/hubagent/options/validation.go +++ b/cmd/hubagent/options/validation.go @@ -57,7 +57,7 @@ func (o *Options) Validate() field.ErrorList { } if o.UseCertManager && !o.EnableWorkload { - errs = append(errs, field.Invalid(newPath.Child("UseCertManager"), o.UseCertManager, "UseCertManager requires EnableWorkload to be true (cert-manager pods need to run in the hub cluster)")) + errs = append(errs, field.Invalid(newPath.Child("UseCertManager"), o.UseCertManager, "UseCertManager requires EnableWorkload to be true (when EnableWorkload is false, a validating webhook blocks pod creation except for certain system pods; cert-manager controller pods must be allowed to run in the hub cluster)")) } connectionType := o.WebhookClientConnectionType From f1c413a0d713eca11fbf911f5f8dc941f5f79350 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 21:24:33 +0000 Subject: [PATCH 05/12] test the annotation behavior Signed-off-by: Wei Weng --- pkg/webhook/webhook.go | 20 ++++++++++------- pkg/webhook/webhook_test.go | 45 +++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 097a69234..38f464061 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -236,13 +236,21 @@ func (w *Config) createFleetWebhookConfiguration(ctx context.Context) error { return nil } -// createMutatingWebhookConfiguration creates the MutatingWebhookConfiguration object for the webhook. -func (w *Config) createMutatingWebhookConfiguration(ctx context.Context, webhooks []admv1.MutatingWebhook, configName string) error { +// buildWebhookAnnotations creates annotations for webhook configurations. +// When using cert-manager, adds the inject-ca-from annotation to automatically inject the CA bundle. +func (w *Config) buildWebhookAnnotations() map[string]string { annotations := map[string]string{} if w.useCertManager { - // Tell cert-manager's CA injector to automatically inject the CA bundle + // Tell cert-manager's CA injector to automatically inject the CA bundle from the Certificate resource. + // Format: "namespace/certificate-name" - references the Certificate resource, not the Secret. annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, w.webhookCertSecretName) } + return annotations +} + +// createMutatingWebhookConfiguration creates the MutatingWebhookConfiguration object for the webhook. +func (w *Config) createMutatingWebhookConfiguration(ctx context.Context, webhooks []admv1.MutatingWebhook, configName string) error { + annotations := w.buildWebhookAnnotations() mutatingWebhookConfig := admv1.MutatingWebhookConfiguration{ ObjectMeta: metav1.ObjectMeta{ @@ -298,11 +306,7 @@ func (w *Config) buildFleetMutatingWebhooks() []admv1.MutatingWebhook { } func (w *Config) createValidatingWebhookConfiguration(ctx context.Context, webhooks []admv1.ValidatingWebhook, configName string) error { - annotations := map[string]string{} - if w.useCertManager { - // Tell cert-manager's CA injector to automatically inject the CA bundle - annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, w.webhookCertSecretName) - } + annotations := w.buildWebhookAnnotations() validatingWebhookConfig := admv1.ValidatingWebhookConfiguration{ ObjectMeta: metav1.ObjectMeta{ diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 7621935b2..3d47e3419 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -307,3 +307,48 @@ func TestNewWebhookConfig_SelfSignedCertError(t *testing.T) { t.Errorf("Expected error to contain '%s', got: %v", expectedErrMsg, err) } } + +func TestBuildWebhookAnnotations(t *testing.T) { + testCases := map[string]struct { + config Config + expectedAnnotions map[string]string + }{ + "useCertManager is false - returns empty map": { + config: Config{ + useCertManager: false, + serviceNamespace: "fleet-system", + webhookCertSecretName: "fleet-webhook-server-cert", + }, + expectedAnnotions: map[string]string{}, + }, + "useCertManager is true - returns annotation with correct format": { + config: Config{ + useCertManager: true, + serviceNamespace: "fleet-system", + webhookCertSecretName: "fleet-webhook-server-cert", + }, + expectedAnnotions: map[string]string{ + "cert-manager.io/inject-ca-from": "fleet-system/fleet-webhook-server-cert", + }, + }, + "useCertManager is true with custom namespace and secret name": { + config: Config{ + useCertManager: true, + serviceNamespace: "custom-namespace", + webhookCertSecretName: "custom-webhook-cert", + }, + expectedAnnotions: map[string]string{ + "cert-manager.io/inject-ca-from": "custom-namespace/custom-webhook-cert", + }, + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + got := tc.config.buildWebhookAnnotations() + if diff := cmp.Diff(tc.expectedAnnotions, got); diff != "" { + t.Errorf("buildWebhookAnnotations() mismatch (-want +got):\n%s", diff) + } + }) + } +} From a8e0e617631a7201e0c16bbef2201d951ed3772b Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 21:24:33 +0000 Subject: [PATCH 06/12] add cert name parameter Signed-off-by: Wei Weng --- charts/hub-agent/README.md | 17 +++++++++++++---- charts/hub-agent/templates/certificate.yaml | 2 +- charts/hub-agent/templates/deployment.yaml | 1 + charts/hub-agent/values.yaml | 5 ++++- cmd/hubagent/main.go | 6 +++--- cmd/hubagent/options/options.go | 4 ++++ pkg/webhook/webhook.go | 8 ++++++-- pkg/webhook/webhook_test.go | 12 ++++++++---- 8 files changed, 40 insertions(+), 15 deletions(-) diff --git a/charts/hub-agent/README.md b/charts/hub-agent/README.md index dcf9d2efa..5d0e395cd 100644 --- a/charts/hub-agent/README.md +++ b/charts/hub-agent/README.md @@ -60,6 +60,7 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen | `webhookClientConnectionType` | Connection type for webhook client (service or url) | `service` | | `useCertManager` | Use cert-manager for webhook certificate management (requires `enableWebhook=true` and `enableWorkload=true`) | `false` | | `webhookCertDir` | Directory where webhook certificates are stored/mounted | `/tmp/k8s-webhook-server/serving-certs` | +| `webhookCertName` | Name of the Certificate resource created by cert-manager | `fleet-webhook-server-cert` | | `webhookCertSecretName` | Name of the Secret containing webhook certificates | `fleet-webhook-server-cert` | | `enableV1Beta1APIs` | Watch for v1beta1 APIs | `true` | | `hubAPIQPS` | QPS for fleet-apiserver (not including events/node heartbeat) | `250` | @@ -115,16 +116,24 @@ The `webhookCertDir` parameter allows you to customize where webhook certificate - Must match the volumeMount path when using cert-manager - Configurable via both Helm values and `--webhook-cert-dir` flag -The `webhookCertSecretName` parameter allows you to customize the Secret name for webhook certificates: +The `webhookCertName` parameter specifies the Certificate resource name: - Default: `fleet-webhook-server-cert` -- When using cert-manager, this must match the Certificate resource's secretName +- When using cert-manager, this is the name of the Certificate resource +- Referenced in the `cert-manager.io/inject-ca-from` annotation +- Configurable via both Helm values and `--webhook-cert-name` flag + +The `webhookCertSecretName` parameter specifies the Secret name for webhook certificates: +- Default: `fleet-webhook-server-cert` +- When using cert-manager, this is the Secret name created by the Certificate resource +- Must match the secretName in the Certificate spec - Configurable via both Helm values and `--webhook-cert-secret-name` flag -Example with custom certificate directory and secret name: +Example with custom certificate directory and names: ```console helm install hub-agent ./charts/hub-agent \ --set useCertManager=true \ --set enableWorkload=true \ --set webhookCertDir=/custom/cert/path \ - --set webhookCertSecretName=my-webhook-cert + --set webhookCertName=my-webhook-certificate \ + --set webhookCertSecretName=my-webhook-secret ``` \ No newline at end of file diff --git a/charts/hub-agent/templates/certificate.yaml b/charts/hub-agent/templates/certificate.yaml index 0643f7f88..75a6921cf 100644 --- a/charts/hub-agent/templates/certificate.yaml +++ b/charts/hub-agent/templates/certificate.yaml @@ -3,7 +3,7 @@ apiVersion: cert-manager.io/v1 kind: Certificate metadata: - name: {{ .Values.webhookCertSecretName }} + name: {{ .Values.webhookCertName }} namespace: {{ .Values.namespace }} labels: {{- include "hub-agent.labels" . | nindent 4 }} diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index 12934c849..b527d8317 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -31,6 +31,7 @@ spec: - --enable-workload={{ .Values.enableWorkload }} - --use-cert-manager={{ .Values.useCertManager }} - --webhook-cert-dir={{ .Values.webhookCertDir }} + - --webhook-cert-name={{ .Values.webhookCertName }} - --webhook-cert-secret-name={{ .Values.webhookCertSecretName }} - --whitelisted-users=system:serviceaccount:fleet-system:hub-agent-sa - --webhook-client-connection-type={{.Values.webhookClientConnectionType}} diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml index 87c96ee26..bd2562353 100644 --- a/charts/hub-agent/values.yaml +++ b/charts/hub-agent/values.yaml @@ -24,8 +24,11 @@ useCertManager: false # webhookCertDir is the directory where webhook certificates are mounted # This is configurable via --webhook-cert-dir flag and must match the volumeMount path in deployment webhookCertDir: /tmp/k8s-webhook-server/serving-certs +# webhookCertName is the name of the Certificate resource created by cert-manager +# This is referenced in the cert-manager.io/inject-ca-from annotation +webhookCertName: fleet-webhook-server-cert # webhookCertSecretName is the name of the Secret containing webhook certificates -# When using cert-manager, this must match the Certificate resource's secretName +# When using cert-manager, this is the Secret name created by the Certificate resource webhookCertSecretName: fleet-webhook-server-cert forceDeleteWaitTime: 15m0s diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go index 794cd368b..75e88cb58 100644 --- a/cmd/hubagent/main.go +++ b/cmd/hubagent/main.go @@ -158,7 +158,7 @@ func main() { if opts.EnableWebhook { whiteListedUsers := strings.Split(opts.WhiteListedUsers, ",") if err := SetupWebhook(mgr, options.WebhookClientConnectionType(opts.WebhookClientConnectionType), opts.WebhookServiceName, whiteListedUsers, - opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload, opts.NetworkingAgentsEnabled, opts.UseCertManager, opts.WebhookCertDir, opts.WebhookCertSecretName); err != nil { + opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload, opts.NetworkingAgentsEnabled, opts.UseCertManager, opts.WebhookCertDir, opts.WebhookCertName, opts.WebhookCertSecretName); err != nil { klog.ErrorS(err, "unable to set up webhook") exitWithErrorFunc() } @@ -201,9 +201,9 @@ func main() { // SetupWebhook generates the webhook cert and then set up the webhook configurator. func SetupWebhook(mgr manager.Manager, webhookClientConnectionType options.WebhookClientConnectionType, webhookServiceName string, - whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool, networkingAgentsEnabled bool, useCertManager bool, webhookCertDir string, webhookCertSecretName string) error { + whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool, networkingAgentsEnabled bool, useCertManager bool, webhookCertDir string, webhookCertName string, webhookCertSecretName string) error { // Generate self-signed key and crt files in webhookCertDir for the webhook server to start. - w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, webhookCertDir, enableGuardRail, denyModifyMemberClusterLabels, enableWorkload, useCertManager, webhookCertSecretName) + w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, webhookCertDir, enableGuardRail, denyModifyMemberClusterLabels, enableWorkload, useCertManager, webhookCertName, webhookCertSecretName) if err != nil { klog.ErrorS(err, "fail to generate WebhookConfig") return err diff --git a/cmd/hubagent/options/options.go b/cmd/hubagent/options/options.go index ea30c79c4..593b85220 100644 --- a/cmd/hubagent/options/options.go +++ b/cmd/hubagent/options/options.go @@ -116,6 +116,9 @@ type Options struct { // WebhookCertDir is the directory where webhook certificates are stored/mounted. // This must match the mountPath in the Helm chart deployment when using cert-manager. WebhookCertDir string + // WebhookCertName is the name of the Certificate resource created by cert-manager. + // This is referenced in the cert-manager.io/inject-ca-from annotation. + WebhookCertName string // WebhookCertSecretName is the name of the Secret containing webhook certificates. // When using cert-manager, this is the Secret name created by the Certificate resource. WebhookCertSecretName string @@ -196,6 +199,7 @@ func (o *Options) AddFlags(flags *flag.FlagSet) { flags.BoolVar(&o.EnableWorkload, "enable-workload", false, "If set, workloads (pods and replicasets) can be created in the hub cluster. This disables the pod and replicaset validating webhooks.") flags.BoolVar(&o.UseCertManager, "use-cert-manager", false, "If set, cert-manager will be used for webhook certificate management instead of self-signed certificates.") flags.StringVar(&o.WebhookCertDir, "webhook-cert-dir", "/tmp/k8s-webhook-server/serving-certs", "The directory where webhook certificates are stored. Must match the volumeMount path in deployment when using cert-manager.") + flags.StringVar(&o.WebhookCertName, "webhook-cert-name", "fleet-webhook-server-cert", "The name of the Certificate resource created by cert-manager. Referenced in cert-manager.io/inject-ca-from annotation.") flags.StringVar(&o.WebhookCertSecretName, "webhook-cert-secret-name", "fleet-webhook-server-cert", "The name of the Secret containing webhook certificates. Must match the secretName in deployment and Certificate resource when using cert-manager.") flags.DurationVar(&o.ResourceSnapshotCreationMinimumInterval, "resource-snapshot-creation-minimum-interval", 30*time.Second, "The minimum interval at which resource snapshots could be created.") flags.DurationVar(&o.ResourceChangesCollectionDuration, "resource-changes-collection-duration", 15*time.Second, diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 38f464061..eb033095b 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -165,11 +165,14 @@ type Config struct { enableWorkload bool // useCertManager indicates whether cert-manager is used for certificate management useCertManager bool + // webhookCertName is the name of the Certificate resource created by cert-manager. + // This is referenced in the cert-manager.io/inject-ca-from annotation. + webhookCertName string // webhookCertSecretName is the name of the Secret containing webhook certificates webhookCertSecretName string } -func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32, clientConnectionType *options.WebhookClientConnectionType, certDir string, enableGuardRail bool, denyModifyMemberClusterLabels bool, enableWorkload bool, useCertManager bool, webhookCertSecretName string) (*Config, error) { +func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32, clientConnectionType *options.WebhookClientConnectionType, certDir string, enableGuardRail bool, denyModifyMemberClusterLabels bool, enableWorkload bool, useCertManager bool, webhookCertName string, webhookCertSecretName string) (*Config, error) { // We assume the Pod namespace should be passed to env through downward API in the Pod spec. namespace := os.Getenv("POD_NAMESPACE") if namespace == "" { @@ -186,6 +189,7 @@ func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32 denyModifyMemberClusterLabels: denyModifyMemberClusterLabels, enableWorkload: enableWorkload, useCertManager: useCertManager, + webhookCertName: webhookCertName, webhookCertSecretName: webhookCertSecretName, } @@ -243,7 +247,7 @@ func (w *Config) buildWebhookAnnotations() map[string]string { if w.useCertManager { // Tell cert-manager's CA injector to automatically inject the CA bundle from the Certificate resource. // Format: "namespace/certificate-name" - references the Certificate resource, not the Secret. - annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, w.webhookCertSecretName) + annotations["cert-manager.io/inject-ca-from"] = fmt.Sprintf("%s/%s", w.serviceNamespace, w.webhookCertName) } return annotations } diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 3d47e3419..f66a1a3c9 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -194,7 +194,7 @@ func TestNewWebhookConfig(t *testing.T) { setupMockCertManagerFiles(t, tt.certDir) } - got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels, tt.enableWorkload, tt.useCertManager, "fleet-webhook-server-cert") + got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels, tt.enableWorkload, tt.useCertManager, "fleet-webhook-server-cert", "fleet-webhook-server-cert") if (err != nil) != tt.wantErr { t.Errorf("NewWebhookConfig() error = %v, wantErr %v", err, tt.wantErr) return @@ -269,7 +269,7 @@ func TestNewWebhookConfig_CertManagerNotMounted(t *testing.T) { dir := t.TempDir() // Don't create any certificate files to simulate cert-manager not ready - _, err := NewWebhookConfig(nil, "test-webhook", 8080, nil, dir, true, true, false, true, "fleet-webhook-server-cert") + _, err := NewWebhookConfig(nil, "test-webhook", 8080, nil, dir, true, true, false, true, "fleet-webhook-server-cert", "fleet-webhook-server-cert") if err == nil { t.Error("Expected error when cert-manager certificates not mounted") } @@ -295,6 +295,7 @@ func TestNewWebhookConfig_SelfSignedCertError(t *testing.T) { false, // denyModifyMemberClusterLabels false, // enableWorkload false, // useCertManager = false to trigger self-signed path + "fleet-webhook-server-cert", // webhookCertName "fleet-webhook-server-cert", // webhookCertSecretName ) @@ -317,6 +318,7 @@ func TestBuildWebhookAnnotations(t *testing.T) { config: Config{ useCertManager: false, serviceNamespace: "fleet-system", + webhookCertName: "fleet-webhook-server-cert", webhookCertSecretName: "fleet-webhook-server-cert", }, expectedAnnotions: map[string]string{}, @@ -325,17 +327,19 @@ func TestBuildWebhookAnnotations(t *testing.T) { config: Config{ useCertManager: true, serviceNamespace: "fleet-system", + webhookCertName: "fleet-webhook-server-cert", webhookCertSecretName: "fleet-webhook-server-cert", }, expectedAnnotions: map[string]string{ "cert-manager.io/inject-ca-from": "fleet-system/fleet-webhook-server-cert", }, }, - "useCertManager is true with custom namespace and secret name": { + "useCertManager is true with custom namespace and cert name": { config: Config{ useCertManager: true, serviceNamespace: "custom-namespace", - webhookCertSecretName: "custom-webhook-cert", + webhookCertName: "custom-webhook-cert", + webhookCertSecretName: "custom-webhook-secret", }, expectedAnnotions: map[string]string{ "cert-manager.io/inject-ca-from": "custom-namespace/custom-webhook-cert", From 5011109186787b039b9d483729fa6412e5c4b882 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 21:24:33 +0000 Subject: [PATCH 07/12] do not populate CA Signed-off-by: Wei Weng --- pkg/webhook/webhook.go | 46 +++-------- pkg/webhook/webhook_test.go | 160 ++++++++++++++++-------------------- 2 files changed, 85 insertions(+), 121 deletions(-) diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index eb033095b..ee52b016a 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -193,25 +193,20 @@ func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32 webhookCertSecretName: webhookCertSecretName, } - var caPEM []byte - var err error - if useCertManager { - // When using cert-manager, certificates are mounted as files by Kubernetes - // cert-manager creates tls.crt and tls.key, but we need ca.crt for the webhook config - caPEM, err = w.loadCertManagerCA(certDir) - if err != nil { - return nil, fmt.Errorf("failed to load cert-manager CA certificate: %w", err) - } + // When using cert-manager, the CA bundle is automatically injected by cert-manager's CA injector + // based on the cert-manager.io/inject-ca-from annotation. We don't need to load or set the CA here. + // The certificates (tls.crt and tls.key) are mounted by Kubernetes and used automatically by the webhook server. + klog.V(2).InfoS("Using cert-manager for certificate management", "certDir", certDir) } else { // Use self-signed certificate generation (original flow) - caPEM, err = w.genCertificate(certDir) + caPEM, err := w.genCertificate(certDir) if err != nil { return nil, fmt.Errorf("failed to generate self-signed certificate: %w", err) } + w.caPEM = caPEM } - w.caPEM = caPEM return &w, nil } @@ -675,9 +670,14 @@ func (w *Config) createClientConfig(validationPath string) admv1.WebhookClientCo } serviceEndpoint := w.serviceURL + validationPath serviceRef.Path = ptr.To(validationPath) - config := admv1.WebhookClientConfig{ - CABundle: w.caPEM, + config := admv1.WebhookClientConfig{} + + // When using cert-manager, leave CABundle empty so cert-manager's CA injector can populate it. + // The cert-manager.io/inject-ca-from annotation triggers automatic CA injection. + if !w.useCertManager { + config.CABundle = w.caPEM } + switch *w.clientConnectionType { case options.Service: config.Service = &serviceRef @@ -703,26 +703,6 @@ func (w *Config) genCertificate(certDir string) ([]byte, error) { return caPEM, nil } -// loadCertManagerCA loads the CA certificate from the mounted cert-manager Secret. -// When using cert-manager, Kubernetes mounts the Secret as files in the certDir. -// cert-manager creates: ca.crt, tls.crt, and tls.key. -// The tls.crt and tls.key are automatically used by the webhook server. -// We only need to read ca.crt for the webhook configuration's CABundle. -func (w *Config) loadCertManagerCA(certDir string) ([]byte, error) { - caPath := filepath.Join(certDir, "ca.crt") - caCert, err := os.ReadFile(caPath) - if err != nil { - return nil, fmt.Errorf("failed to read ca.crt from %s: %w", caPath, err) - } - - if len(caCert) == 0 { - return nil, fmt.Errorf("ca.crt is empty at %s", caPath) - } - - klog.V(2).InfoS("Successfully loaded CA certificate from cert-manager mounted Secret", "path", caPath) - return caCert, nil -} - // genSelfSignedCert generates the self signed Certificate/Key pair func (w *Config) genSelfSignedCert() (caPEMByte, certPEMByte, keyPEMByte []byte, err error) { // CA config diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index f66a1a3c9..905540f12 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -1,8 +1,6 @@ package webhook import ( - "os" - "path/filepath" "strings" "testing" @@ -15,26 +13,7 @@ import ( "github.com/kubefleet-dev/kubefleet/pkg/utils" ) -const mockCA = `-----BEGIN CERTIFICATE----- -MIIBkTCB+wIJAKHHCgVZU7c4MA0GCSqGSIb3DQEBCwUAMBExDzANBgNVBAMMBnRl -c3RDQTAeFw0yNDAxMDEwMDAwMDBaFw0yNTAxMDEwMDAwMDBaMBExDzANBgNVBAMM -BnRlc3RDQTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAwjBCKwYJKoZIhvcN -AQkBFhZmb3JtYXRAZXhhbXBsZS5jb20wHhcNMjQwMTAxMDAwMDAwWhcNMjUwMTAx -MDAwMDAwWjARMQ8wDQYDVQQDDAZ0ZXN0Q0EwXDANBgkqhkiG9w0BAQEFAANLADBI -AkEAwjBCKwYJKoZIhvcNAQEBBQADQQAwPgJBAMIwQisGCSqGSIb3DQEBAQUAA0EA -MD4CQQDCMEIrBgkqhkiG9w0BAQEFAANBADANBgkqhkiG9w0BAQUFAANBAMA= ------END CERTIFICATE-----` - -// setupMockCertManagerFiles creates mock certificate files for testing cert-manager mode. -func setupMockCertManagerFiles(t *testing.T, certDir string) { - t.Helper() - if err := os.WriteFile(filepath.Join(certDir, "ca.crt"), []byte(mockCA), 0600); err != nil { - t.Fatalf("failed to create mock ca.crt: %v", err) - } - t.Cleanup(func() { - os.RemoveAll(certDir) - }) -} +const testWebhookServiceName = "test-webhook" func TestBuildFleetMutatingWebhooks(t *testing.T) { url := options.WebhookClientConnectionType("url") @@ -189,11 +168,6 @@ func TestNewWebhookConfig(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Setenv("POD_NAMESPACE", "test-namespace") - // Create mock certificate files for cert-manager mode - if tt.useCertManager { - setupMockCertManagerFiles(t, tt.certDir) - } - got, err := NewWebhookConfig(tt.mgr, tt.webhookServiceName, tt.port, tt.clientConnectionType, tt.certDir, tt.enableGuardRail, tt.denyModifyMemberClusterLabels, tt.enableWorkload, tt.useCertManager, "fleet-webhook-server-cert", "fleet-webhook-server-cert") if (err != nil) != tt.wantErr { t.Errorf("NewWebhookConfig() error = %v, wantErr %v", err, tt.wantErr) @@ -216,67 +190,6 @@ func TestNewWebhookConfig(t *testing.T) { }) } } -func TestLoadCertManagerCA_NotFound(t *testing.T) { - config := &Config{} - _, err := config.loadCertManagerCA("/nonexistent/path") - if err == nil { - t.Error("Expected error when certificate files don't exist") - } -} - -func TestLoadCertManagerCA_EmptyFile(t *testing.T) { - dir := t.TempDir() - // Create empty files - if err := os.WriteFile(filepath.Join(dir, "tls.crt"), []byte{}, 0600); err != nil { - t.Fatalf("failed to create empty tls.crt: %v", err) - } - if err := os.WriteFile(filepath.Join(dir, "ca.crt"), []byte{}, 0600); err != nil { - t.Fatalf("failed to create empty ca.crt: %v", err) - } - - config := &Config{} - _, err := config.loadCertManagerCA(dir) - if err == nil { - t.Error("Expected error for empty certificate files") - } - if !strings.Contains(err.Error(), "empty") { - t.Errorf("Expected error message to contain 'empty', got: %v", err) - } -} - -func TestLoadCertManagerCA_Success(t *testing.T) { - t.Run("loads ca.crt successfully", func(t *testing.T) { - dir := t.TempDir() - caContent := []byte("test-ca-content") - if err := os.WriteFile(filepath.Join(dir, "ca.crt"), caContent, 0600); err != nil { - t.Fatalf("failed to create ca.crt: %v", err) - } - - config := &Config{} - result, err := config.loadCertManagerCA(dir) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if string(result) != string(caContent) { - t.Errorf("Expected %s, got %s", caContent, result) - } - }) -} - -func TestNewWebhookConfig_CertManagerNotMounted(t *testing.T) { - t.Setenv("POD_NAMESPACE", "test-namespace") - - dir := t.TempDir() - // Don't create any certificate files to simulate cert-manager not ready - - _, err := NewWebhookConfig(nil, "test-webhook", 8080, nil, dir, true, true, false, true, "fleet-webhook-server-cert", "fleet-webhook-server-cert") - if err == nil { - t.Error("Expected error when cert-manager certificates not mounted") - } - if !strings.Contains(err.Error(), "failed to load cert-manager CA certificate") { - t.Errorf("Expected error about loading cert-manager CA, got: %v", err) - } -} func TestNewWebhookConfig_SelfSignedCertError(t *testing.T) { t.Setenv("POD_NAMESPACE", "test-namespace") @@ -356,3 +269,74 @@ func TestBuildWebhookAnnotations(t *testing.T) { }) } } + +func TestCreateClientConfig(t *testing.T) { + serviceConnectionType := options.Service + testCases := map[string]struct { + config Config + validationPath string + expectCABundle bool + }{ + "useCertManager is false - CABundle should be set": { + config: Config{ + useCertManager: false, + caPEM: []byte("test-ca-bundle"), + serviceNamespace: "fleet-system", + serviceName: "fleet-webhook-service", + servicePort: 443, + clientConnectionType: &serviceConnectionType, + }, + validationPath: "/validate", + expectCABundle: true, + }, + "useCertManager is true - CABundle should be empty for cert-manager injection": { + config: Config{ + useCertManager: true, + caPEM: []byte("test-ca-bundle"), + serviceNamespace: "fleet-system", + serviceName: "fleet-webhook-service", + servicePort: 443, + clientConnectionType: &serviceConnectionType, + }, + validationPath: "/validate", + expectCABundle: false, + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + clientConfig := tc.config.createClientConfig(tc.validationPath) + + if tc.expectCABundle { + if len(clientConfig.CABundle) == 0 { + t.Errorf("Expected CABundle to be set, but it was empty") + } + if diff := cmp.Diff(tc.config.caPEM, clientConfig.CABundle); diff != "" { + t.Errorf("CABundle mismatch (-want +got):\n%s", diff) + } + } else { + if len(clientConfig.CABundle) != 0 { + t.Errorf("Expected CABundle to be empty for cert-manager injection, but got: %v", clientConfig.CABundle) + } + } + + // Verify service reference is set correctly + if clientConfig.Service == nil { + t.Errorf("Expected Service to be set") + } else { + if clientConfig.Service.Namespace != tc.config.serviceNamespace { + t.Errorf("Expected Service.Namespace=%s, got %s", tc.config.serviceNamespace, clientConfig.Service.Namespace) + } + if clientConfig.Service.Name != tc.config.serviceName { + t.Errorf("Expected Service.Name=%s, got %s", tc.config.serviceName, clientConfig.Service.Name) + } + if *clientConfig.Service.Port != tc.config.servicePort { + t.Errorf("Expected Service.Port=%d, got %d", tc.config.servicePort, *clientConfig.Service.Port) + } + if *clientConfig.Service.Path != tc.validationPath { + t.Errorf("Expected Service.Path=%s, got %s", tc.validationPath, *clientConfig.Service.Path) + } + } + }) + } +} From 37416f0df8fc178192b776d23c8fec4cfe8715a0 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 21:24:33 +0000 Subject: [PATCH 08/12] check CA bundle injection for readiness Signed-off-by: Wei Weng --- cmd/hubagent/main.go | 30 +++- pkg/webhook/webhook.go | 50 +++++++ pkg/webhook/webhook_test.go | 256 +++++++++++++++++++++++++++++++++- test/utils/manager/manager.go | 35 +++++ 4 files changed, 362 insertions(+), 9 deletions(-) create mode 100644 test/utils/manager/manager.go diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go index 75e88cb58..c64685f47 100644 --- a/cmd/hubagent/main.go +++ b/cmd/hubagent/main.go @@ -20,6 +20,7 @@ import ( "flag" "fmt" "math" + "net/http" "os" "strings" "sync" @@ -157,11 +158,25 @@ func main() { if opts.EnableWebhook { whiteListedUsers := strings.Split(opts.WhiteListedUsers, ",") - if err := SetupWebhook(mgr, options.WebhookClientConnectionType(opts.WebhookClientConnectionType), opts.WebhookServiceName, whiteListedUsers, - opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload, opts.NetworkingAgentsEnabled, opts.UseCertManager, opts.WebhookCertDir, opts.WebhookCertName, opts.WebhookCertSecretName); err != nil { + webhookConfig, err := SetupWebhook(mgr, options.WebhookClientConnectionType(opts.WebhookClientConnectionType), opts.WebhookServiceName, whiteListedUsers, + opts.EnableGuardRail, opts.EnableV1Beta1APIs, opts.DenyModifyMemberClusterLabels, opts.EnableWorkload, opts.NetworkingAgentsEnabled, opts.UseCertManager, opts.WebhookCertDir, opts.WebhookCertName, opts.WebhookCertSecretName) + if err != nil { klog.ErrorS(err, "unable to set up webhook") exitWithErrorFunc() } + + // When using cert-manager, add a readiness check to ensure CA bundles are injected before marking ready. + // This prevents the pod from accepting traffic before cert-manager has populated the webhook CA bundles, + // which would cause webhook calls to fail. + if opts.UseCertManager { + if err := mgr.AddReadyzCheck("cert-manager-ca-injection", func(req *http.Request) error { + return webhookConfig.CheckCAInjection(req.Context()) + }); err != nil { + klog.ErrorS(err, "unable to set up cert-manager CA injection readiness check") + exitWithErrorFunc() + } + klog.V(2).InfoS("Added cert-manager CA injection readiness check") + } } ctx := ctrl.SetupSignalHandler() @@ -200,21 +215,22 @@ func main() { } // SetupWebhook generates the webhook cert and then set up the webhook configurator. +// Returns the webhook Config so it can be used for readiness checks. func SetupWebhook(mgr manager.Manager, webhookClientConnectionType options.WebhookClientConnectionType, webhookServiceName string, - whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool, networkingAgentsEnabled bool, useCertManager bool, webhookCertDir string, webhookCertName string, webhookCertSecretName string) error { + whiteListedUsers []string, enableGuardRail, isFleetV1Beta1API bool, denyModifyMemberClusterLabels bool, enableWorkload bool, networkingAgentsEnabled bool, useCertManager bool, webhookCertDir string, webhookCertName string, webhookCertSecretName string) (*webhook.Config, error) { // Generate self-signed key and crt files in webhookCertDir for the webhook server to start. w, err := webhook.NewWebhookConfig(mgr, webhookServiceName, FleetWebhookPort, &webhookClientConnectionType, webhookCertDir, enableGuardRail, denyModifyMemberClusterLabels, enableWorkload, useCertManager, webhookCertName, webhookCertSecretName) if err != nil { klog.ErrorS(err, "fail to generate WebhookConfig") - return err + return nil, err } if err = mgr.Add(w); err != nil { klog.ErrorS(err, "unable to add WebhookConfig") - return err + return nil, err } if err = webhook.AddToManager(mgr, whiteListedUsers, denyModifyMemberClusterLabels, networkingAgentsEnabled); err != nil { klog.ErrorS(err, "unable to register webhooks to the manager") - return err + return nil, err } - return nil + return w, nil } diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index ee52b016a..7dd698c89 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -219,6 +219,56 @@ func (w *Config) Start(ctx context.Context) error { return nil } +// CheckCAInjection verifies that cert-manager has injected the CA bundle into all webhook configurations. +// This is used as a readiness check when useCertManager is enabled. +// Returns nil when CA bundles are injected, or an error if they are missing. +func (w *Config) CheckCAInjection(ctx context.Context) error { + if !w.useCertManager { + // Not using cert-manager, no need to check + return nil + } + + cl := w.mgr.GetClient() + + // Check mutating webhook configuration + var mutatingCfg admv1.MutatingWebhookConfiguration + if err := cl.Get(ctx, client.ObjectKey{Name: fleetMutatingWebhookCfgName}, &mutatingCfg); err != nil { + return fmt.Errorf("failed to get MutatingWebhookConfiguration %s: %w", fleetMutatingWebhookCfgName, err) + } + for _, webhook := range mutatingCfg.Webhooks { + if len(webhook.ClientConfig.CABundle) == 0 { + return fmt.Errorf("MutatingWebhookConfiguration %s webhook %s is missing CA bundle (cert-manager injection pending)", fleetMutatingWebhookCfgName, webhook.Name) + } + } + + // Check validating webhook configuration + var validatingCfg admv1.ValidatingWebhookConfiguration + if err := cl.Get(ctx, client.ObjectKey{Name: fleetValidatingWebhookCfgName}, &validatingCfg); err != nil { + return fmt.Errorf("failed to get ValidatingWebhookConfiguration %s: %w", fleetValidatingWebhookCfgName, err) + } + for _, webhook := range validatingCfg.Webhooks { + if len(webhook.ClientConfig.CABundle) == 0 { + return fmt.Errorf("ValidatingWebhookConfiguration %s webhook %s is missing CA bundle (cert-manager injection pending)", fleetValidatingWebhookCfgName, webhook.Name) + } + } + + // Check guard rail webhook configuration if enabled + if w.enableGuardRail { + var guardRailCfg admv1.ValidatingWebhookConfiguration + if err := cl.Get(ctx, client.ObjectKey{Name: fleetGuardRailWebhookCfgName}, &guardRailCfg); err != nil { + return fmt.Errorf("failed to get ValidatingWebhookConfiguration %s: %w", fleetGuardRailWebhookCfgName, err) + } + for _, webhook := range guardRailCfg.Webhooks { + if len(webhook.ClientConfig.CABundle) == 0 { + return fmt.Errorf("ValidatingWebhookConfiguration %s webhook %s is missing CA bundle (cert-manager injection pending)", fleetGuardRailWebhookCfgName, webhook.Name) + } + } + } + + klog.V(2).InfoS("All webhook configurations have CA bundles injected by cert-manager") + return nil +} + // createFleetWebhookConfiguration creates the ValidatingWebhookConfiguration object for the webhook. func (w *Config) createFleetWebhookConfiguration(ctx context.Context) error { if err := w.createMutatingWebhookConfiguration(ctx, w.buildFleetMutatingWebhooks(), fleetMutatingWebhookCfgName); err != nil { diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 905540f12..d42a69790 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -1,20 +1,25 @@ package webhook import ( + "context" "strings" "testing" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/stretchr/testify/assert" + admv1 "k8s.io/api/admissionregistration/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/manager" "github.com/kubefleet-dev/kubefleet/cmd/hubagent/options" "github.com/kubefleet-dev/kubefleet/pkg/utils" + testmanager "github.com/kubefleet-dev/kubefleet/test/utils/manager" ) -const testWebhookServiceName = "test-webhook" - func TestBuildFleetMutatingWebhooks(t *testing.T) { url := options.WebhookClientConnectionType("url") testCases := map[string]struct { @@ -340,3 +345,250 @@ func TestCreateClientConfig(t *testing.T) { }) } } + +func TestCheckCAInjection(t *testing.T) { + testCases := map[string]struct { + config Config + existingObjects []client.Object + expectError bool + expectedErrorMsg string + }{ + "useCertManager is false - returns nil without checking": { + config: Config{ + useCertManager: false, + enableGuardRail: false, + }, + expectError: false, + }, + "useCertManager is true, all CA bundles present": { + config: Config{ + useCertManager: true, + enableGuardRail: false, + }, + existingObjects: []client.Object{ + &admv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetMutatingWebhookCfgName}, + Webhooks: []admv1.MutatingWebhook{ + { + Name: "test-mutating-webhook", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + }, + }, + &admv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetValidatingWebhookCfgName}, + Webhooks: []admv1.ValidatingWebhook{ + { + Name: "test-validating-webhook-1", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + { + Name: "test-validating-webhook-2", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + }, + }, + }, + expectError: false, + }, + "useCertManager is true, mutating webhook missing CA bundle": { + config: Config{ + useCertManager: true, + enableGuardRail: false, + }, + existingObjects: []client.Object{ + &admv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetMutatingWebhookCfgName}, + Webhooks: []admv1.MutatingWebhook{ + { + Name: "test-mutating-webhook", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: nil, // Missing CA bundle + }, + }, + }, + }, + &admv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetValidatingWebhookCfgName}, + Webhooks: []admv1.ValidatingWebhook{ + { + Name: "test-validating-webhook", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + }, + }, + }, + expectError: true, + expectedErrorMsg: "test-mutating-webhook is missing CA bundle", + }, + "useCertManager is true, validating webhook missing CA bundle": { + config: Config{ + useCertManager: true, + enableGuardRail: false, + }, + existingObjects: []client.Object{ + &admv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetMutatingWebhookCfgName}, + Webhooks: []admv1.MutatingWebhook{ + { + Name: "test-mutating-webhook", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + }, + }, + &admv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetValidatingWebhookCfgName}, + Webhooks: []admv1.ValidatingWebhook{ + { + Name: "test-validating-webhook-1", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + { + Name: "test-validating-webhook-2", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte{}, // Empty CA bundle + }, + }, + }, + }, + }, + expectError: true, + expectedErrorMsg: "test-validating-webhook-2 is missing CA bundle", + }, + "useCertManager is true with guard rail, all CA bundles present": { + config: Config{ + useCertManager: true, + enableGuardRail: true, + }, + existingObjects: []client.Object{ + &admv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetMutatingWebhookCfgName}, + Webhooks: []admv1.MutatingWebhook{ + { + Name: "test-mutating-webhook", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + }, + }, + &admv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetValidatingWebhookCfgName}, + Webhooks: []admv1.ValidatingWebhook{ + { + Name: "test-validating-webhook", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + }, + }, + &admv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetGuardRailWebhookCfgName}, + Webhooks: []admv1.ValidatingWebhook{ + { + Name: "test-guard-rail-webhook", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + }, + }, + }, + expectError: false, + }, + "useCertManager is true with guard rail, guard rail webhook missing CA bundle": { + config: Config{ + useCertManager: true, + enableGuardRail: true, + }, + existingObjects: []client.Object{ + &admv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetMutatingWebhookCfgName}, + Webhooks: []admv1.MutatingWebhook{ + { + Name: "test-mutating-webhook", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + }, + }, + &admv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetValidatingWebhookCfgName}, + Webhooks: []admv1.ValidatingWebhook{ + { + Name: "test-validating-webhook", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: []byte("fake-ca-bundle"), + }, + }, + }, + }, + &admv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: fleetGuardRailWebhookCfgName}, + Webhooks: []admv1.ValidatingWebhook{ + { + Name: "test-guard-rail-webhook", + ClientConfig: admv1.WebhookClientConfig{ + CABundle: nil, // Missing CA bundle + }, + }, + }, + }, + }, + expectError: true, + expectedErrorMsg: "test-guard-rail-webhook is missing CA bundle", + }, + "useCertManager is true, webhook configuration not found": { + config: Config{ + useCertManager: true, + enableGuardRail: false, + }, + existingObjects: []client.Object{}, + expectError: true, + expectedErrorMsg: "failed to get MutatingWebhookConfiguration", + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + // Create a fake client with a proper scheme + scheme := runtime.NewScheme() + _ = admv1.AddToScheme(scheme) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(tc.existingObjects...). + Build() + + // Create a fake manager that returns our fake client + tc.config.mgr = &testmanager.FakeManager{Client: fakeClient} + + err := tc.config.CheckCAInjection(context.Background()) + + if tc.expectError { + if err == nil { + t.Errorf("Expected error but got nil") + } else if !strings.Contains(err.Error(), tc.expectedErrorMsg) { + t.Errorf("Expected error message to contain %q, but got: %v", tc.expectedErrorMsg, err) + } + } else { + if err != nil { + t.Errorf("Expected no error but got: %v", err) + } + } + }) + } +} diff --git a/test/utils/manager/manager.go b/test/utils/manager/manager.go new file mode 100644 index 000000000..adceb6f37 --- /dev/null +++ b/test/utils/manager/manager.go @@ -0,0 +1,35 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package manager provides a fake controller-runtime manager for testing. +package manager + +import ( + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +// FakeManager is a fake controller-runtime manager for testing. +type FakeManager struct { + manager.Manager + // Client is the Kubernetes client used by this manager. + Client client.Client +} + +// GetClient returns the configured client. +func (f *FakeManager) GetClient() client.Client { + return f.Client +} From ca6b83ada32e3c700d9a11e4e051c1a003fe8994 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 21:24:33 +0000 Subject: [PATCH 09/12] fix secret permission Signed-off-by: Wei Weng --- charts/hub-agent/templates/deployment.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index b527d8317..48d96aa11 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -92,9 +92,9 @@ spec: - name: webhook-cert secret: secretName: {{ .Values.webhookCertSecretName }} - # defaultMode 0400 (read-only for owner) prevents unauthorized access to certificate files - # and reduces attack surface by ensuring only the container process can read the certs - defaultMode: 0400 + # defaultMode 0444 (read for all) allows the container process to read the certs + # regardless of the user/group it runs as + defaultMode: 0444 {{- end }} {{- with .Values.affinity }} affinity: From 87ee659ee15d1315589b0c31bdf972537d2feaf2 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 21:24:33 +0000 Subject: [PATCH 10/12] decouple informer cache population and event handling Signed-off-by: Wei Weng --- cmd/hubagent/workload/setup.go | 16 + pkg/resourcewatcher/change_dector.go | 34 +- pkg/resourcewatcher/informer_populator.go | 114 +++++ .../informer_populator_test.go | 437 ++++++++++++++++++ pkg/resourcewatcher/resource_collector.go | 29 +- .../resource_collector_test.go | 221 +++++++++ pkg/utils/informer/informermanager.go | 110 ++--- pkg/utils/informer/informermanager_test.go | 240 ++++++++++ pkg/utils/informer/readiness/readiness.go | 2 +- test/utils/informer/manager.go | 7 + 10 files changed, 1122 insertions(+), 88 deletions(-) create mode 100644 pkg/resourcewatcher/informer_populator.go create mode 100644 pkg/resourcewatcher/informer_populator_test.go create mode 100644 pkg/resourcewatcher/resource_collector_test.go diff --git a/cmd/hubagent/workload/setup.go b/cmd/hubagent/workload/setup.go index dbb47de4f..1397a2f1a 100644 --- a/cmd/hubagent/workload/setup.go +++ b/cmd/hubagent/workload/setup.go @@ -496,7 +496,23 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, } resourceChangeController := controller.NewController(resourceChangeControllerName, controller.ClusterWideKeyFunc, rcr.Reconcile, rateLimiter) + // Set up the InformerPopulator that runs on ALL pods (leader and followers) + // This ensures all pods have synced informer caches for webhook validation + klog.Info("Setting up informer populator") + informerPopulator := &resourcewatcher.InformerPopulator{ + DiscoveryClient: discoverClient, + RESTMapper: mgr.GetRESTMapper(), + InformerManager: dynamicInformerManager, + ResourceConfig: resourceConfig, + } + + if err := mgr.Add(informerPopulator); err != nil { + klog.ErrorS(err, "Failed to setup informer populator") + return err + } + // Set up a runner that starts all the custom controllers we created above + // This runs ONLY on the leader and adds event handlers to the informers created by InformerPopulator resourceChangeDetector := &resourcewatcher.ChangeDetector{ DiscoveryClient: discoverClient, RESTMapper: mgr.GetRESTMapper(), diff --git a/pkg/resourcewatcher/change_dector.go b/pkg/resourcewatcher/change_dector.go index 9a4011628..0ea4699cc 100644 --- a/pkg/resourcewatcher/change_dector.go +++ b/pkg/resourcewatcher/change_dector.go @@ -23,7 +23,6 @@ import ( "golang.org/x/sync/errgroup" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/discovery" "k8s.io/client-go/tools/cache" @@ -137,43 +136,28 @@ func (d *ChangeDetector) discoverAPIResourcesLoop(ctx context.Context, period ti }, period) } -// discoverResources goes through all the api resources in the cluster and create informers on selected types +// discoverResources goes through all the api resources in the cluster and adds event handlers to informers func (d *ChangeDetector) discoverResources(dynamicResourceEventHandler cache.ResourceEventHandler) { - newResources, err := d.getWatchableResources() + newResources, err := getWatchableResources(d.DiscoveryClient) var dynamicResources []informer.APIResourceMeta if err != nil { klog.ErrorS(err, "Failed to get all the api resources from the cluster") } for _, res := range newResources { // all the static resources are disabled by default - if d.shouldWatchResource(res.GroupVersionResource) { + if shouldWatchResource(res.GroupVersionResource, d.RESTMapper, d.ResourceConfig) { dynamicResources = append(dynamicResources, res) } } - d.InformerManager.AddDynamicResources(dynamicResources, dynamicResourceEventHandler, err == nil) - // this will start the newly added informers if there is any - d.InformerManager.Start() -} -// gvrDisabled returns whether GroupVersionResource is disabled. -func (d *ChangeDetector) shouldWatchResource(gvr schema.GroupVersionResource) bool { - // By default, all of the APIs are allowed. - if d.ResourceConfig == nil { - return true + // On the leader, add event handlers to informers that were already created by InformerPopulator + // The informers exist on all pods, but only the leader adds handlers and processes events + for _, res := range dynamicResources { + d.InformerManager.AddEventHandlerToInformer(res.GroupVersionResource, dynamicResourceEventHandler) } - gvks, err := d.RESTMapper.KindsFor(gvr) - if err != nil { - klog.ErrorS(err, "gvr transform failed", "gvr", gvr.String()) - return false - } - for _, gvk := range gvks { - if d.ResourceConfig.IsResourceDisabled(gvk) { - klog.V(4).InfoS("Skip watch resource", "group version kind", gvk.String()) - return false - } - } - return true + // this will start the newly added informers if there is any + d.InformerManager.Start() } // dynamicResourceFilter filters out resources that we don't want to watch diff --git a/pkg/resourcewatcher/informer_populator.go b/pkg/resourcewatcher/informer_populator.go new file mode 100644 index 000000000..82e0ae4bb --- /dev/null +++ b/pkg/resourcewatcher/informer_populator.go @@ -0,0 +1,114 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resourcewatcher + +import ( + "context" + "time" + + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/discovery" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/manager" + + "github.com/kubefleet-dev/kubefleet/pkg/utils" + "github.com/kubefleet-dev/kubefleet/pkg/utils/informer" +) + +const ( + // informerPopulatorDiscoveryPeriod is how often the InformerPopulator rediscovers API resources + informerPopulatorDiscoveryPeriod = 30 * time.Second +) + +// make sure that our InformerPopulator implements controller runtime interfaces +var ( + _ manager.Runnable = &InformerPopulator{} + _ manager.LeaderElectionRunnable = &InformerPopulator{} +) + +// InformerPopulator discovers API resources and creates informers for them WITHOUT adding event handlers. +// This allows follower pods to have synced informer caches for webhook validation while the leader's +// ChangeDetector adds event handlers and runs controllers. +type InformerPopulator struct { + // DiscoveryClient is used to do resource discovery. + DiscoveryClient discovery.DiscoveryInterface + + // RESTMapper is used to convert between GVK and GVR + RESTMapper meta.RESTMapper + + // InformerManager manages all the dynamic informers created by the discovery client + InformerManager informer.Manager + + // ResourceConfig contains all the API resources that we won't select based on the allowed or skipped propagating APIs option. + ResourceConfig *utils.ResourceConfig +} + +// Start runs the informer populator, discovering resources and creating informers. +// This runs on ALL pods (leader and followers) to ensure all have synced caches. +func (p *InformerPopulator) Start(ctx context.Context) error { + klog.InfoS("Starting the informer populator") + defer klog.InfoS("The informer populator is stopped") + + // Run initial discovery to create informers + p.discoverAndCreateInformers() + + // Wait for initial cache sync + p.InformerManager.WaitForCacheSync() + klog.InfoS("Informer populator: initial cache sync complete") + + // Continue discovering resources periodically to handle CRD installations + wait.UntilWithContext(ctx, func(ctx context.Context) { + p.discoverAndCreateInformers() + }, informerPopulatorDiscoveryPeriod) + + return nil +} + +// discoverAndCreateInformers discovers API resources and creates informers WITHOUT adding event handlers +func (p *InformerPopulator) discoverAndCreateInformers() { + newResources, err := getWatchableResources(p.DiscoveryClient) + if err != nil { + klog.ErrorS(err, "Failed to get all the api resources from the cluster") + } + + var resourcesToPopulate []informer.APIResourceMeta + for _, res := range newResources { + if shouldWatchResource(res.GroupVersionResource, p.RESTMapper, p.ResourceConfig) { + resourcesToPopulate = append(resourcesToPopulate, res) + } + } + + // Create informers directly without adding event handlers. + // This avoids adding any event handlers on follower pods + for _, res := range resourcesToPopulate { + p.InformerManager.CreateInformerForResource(res) + } + + // Start any newly created informers + p.InformerManager.Start() + + if err == nil { + klog.V(2).InfoS("Informer populator: discovered resources", "count", len(resourcesToPopulate)) + } +} + +// NeedLeaderElection implements LeaderElectionRunnable interface. +// Returns false so this runs on ALL pods (leader and followers). +func (p *InformerPopulator) NeedLeaderElection() bool { + return false +} diff --git a/pkg/resourcewatcher/informer_populator_test.go b/pkg/resourcewatcher/informer_populator_test.go new file mode 100644 index 000000000..592be39b2 --- /dev/null +++ b/pkg/resourcewatcher/informer_populator_test.go @@ -0,0 +1,437 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resourcewatcher + +import ( + "context" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + fakediscovery "k8s.io/client-go/discovery/fake" + "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/restmapper" + + "github.com/kubefleet-dev/kubefleet/pkg/utils" + testinformer "github.com/kubefleet-dev/kubefleet/test/utils/informer" +) + +const ( + // testTimeout is the timeout for test operations + testTimeout = 200 * time.Millisecond + // testSleep is how long to sleep to allow periodic operations + testSleep = 150 * time.Millisecond +) + +func TestInformerPopulator_NeedLeaderElection(t *testing.T) { + populator := &InformerPopulator{} + + // InformerPopulator should NOT need leader election so it runs on all pods + if populator.NeedLeaderElection() { + t.Error("InformerPopulator should not need leader election") + } +} + +func TestInformerPopulator_discoverAndCreateInformers(t *testing.T) { + tests := []struct { + name string + discoveryResources []*metav1.APIResourceList + resourceConfig *utils.ResourceConfig + expectedInformerCreated bool + expectedResourceCount int + }{ + { + name: "creates informers for watchable resources", + discoveryResources: []*metav1.APIResourceList{ + { + GroupVersion: "v1", + APIResources: []metav1.APIResource{ + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + }, + resourceConfig: nil, // Allow all resources + expectedInformerCreated: true, + expectedResourceCount: 1, + }, + { + name: "skips resources without list/watch verbs", + discoveryResources: []*metav1.APIResourceList{ + { + GroupVersion: "v1", + APIResources: []metav1.APIResource{ + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"get", "delete"}, // Missing list/watch + }, + }, + }, + }, + resourceConfig: nil, + expectedInformerCreated: false, + expectedResourceCount: 0, + }, + { + name: "respects resource config filtering", + discoveryResources: []*metav1.APIResourceList{ + { + GroupVersion: "v1", + APIResources: []metav1.APIResource{ + { + Name: "secrets", + Kind: "Secret", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + }, + resourceConfig: func() *utils.ResourceConfig { + rc := utils.NewResourceConfig(false) // Skip mode + _ = rc.Parse("v1/Secret") // Skip secrets + return rc + }(), + expectedInformerCreated: false, + expectedResourceCount: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create fake discovery client + fakeClient := fake.NewSimpleClientset() + fakeDiscovery, ok := fakeClient.Discovery().(*fakediscovery.FakeDiscovery) + if !ok { + t.Fatal("Failed to cast to FakeDiscovery") + } + fakeDiscovery.Resources = tt.discoveryResources + + // Create REST mapper + groupResources := []*restmapper.APIGroupResources{} + for _, resourceList := range tt.discoveryResources { + gv, err := schema.ParseGroupVersion(resourceList.GroupVersion) + if err != nil { + t.Fatalf("Failed to parse group version: %v", err) + } + + apiResources := []metav1.APIResource{} + apiResources = append(apiResources, resourceList.APIResources...) + + groupResources = append(groupResources, &restmapper.APIGroupResources{ + Group: metav1.APIGroup{ + Name: gv.Group, + Versions: []metav1.GroupVersionForDiscovery{ + {GroupVersion: resourceList.GroupVersion, Version: gv.Version}, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: resourceList.GroupVersion, + Version: gv.Version, + }, + }, + VersionedResources: map[string][]metav1.APIResource{ + gv.Version: apiResources, + }, + }) + } + restMapper := restmapper.NewDiscoveryRESTMapper(groupResources) + + // Create fake informer manager + fakeInformerManager := &testinformer.FakeManager{ + APIResources: make(map[schema.GroupVersionKind]bool), + } + + // Track calls to CreateInformerForResource + populator := &InformerPopulator{ + DiscoveryClient: fakeDiscovery, + RESTMapper: restMapper, + InformerManager: fakeInformerManager, + ResourceConfig: tt.resourceConfig, + } + + // Run discovery + populator.discoverAndCreateInformers() + + // Note: FakeManager doesn't track calls, so we verify no panics occurred + }) + } +} + +func TestInformerPopulator_Start(t *testing.T) { + // Create fake discovery client with some resources + fakeClient := fake.NewSimpleClientset() + fakeDiscovery, ok := fakeClient.Discovery().(*fakediscovery.FakeDiscovery) + if !ok { + t.Fatal("Failed to cast to FakeDiscovery") + } + + fakeDiscovery.Resources = []*metav1.APIResourceList{ + { + GroupVersion: "v1", + APIResources: []metav1.APIResource{ + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + } + + // Create REST mapper + gv := schema.GroupVersion{Group: "", Version: "v1"} + groupResources := []*restmapper.APIGroupResources{ + { + Group: metav1.APIGroup{ + Name: "", + Versions: []metav1.GroupVersionForDiscovery{ + {GroupVersion: "v1", Version: "v1"}, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "v1", + Version: "v1", + }, + }, + VersionedResources: map[string][]metav1.APIResource{ + "v1": { + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + }, + } + restMapper := restmapper.NewDiscoveryRESTMapper(groupResources) + + // Create fake informer manager + fakeInformerManager := &testinformer.FakeManager{ + APIResources: map[schema.GroupVersionKind]bool{ + gv.WithKind("ConfigMap"): true, + }, + IsClusterScopedResource: false, + } + + populator := &InformerPopulator{ + DiscoveryClient: fakeDiscovery, + RESTMapper: restMapper, + InformerManager: fakeInformerManager, + ResourceConfig: nil, + } + + // Create a context that will cancel after a short time + // Use half of testTimeout to ensure we have time to verify after cancellation + ctx, cancel := context.WithTimeout(context.Background(), testTimeout/2) + defer cancel() + + // Start the populator in a goroutine + done := make(chan error, 1) + go func() { + done <- populator.Start(ctx) + }() + + // Wait for context to cancel or error + select { + case err := <-done: + // Should return nil when context is canceled + if err != nil { + t.Errorf("Start should not return error on context cancellation: %v", err) + } + case <-time.After(testTimeout): + t.Fatal("Start did not exit after context cancellation") + } +} + +func TestInformerPopulator_Integration(t *testing.T) { + // This test verifies the integration between InformerPopulator and the informer manager + + // Create fake discovery with multiple resource types + fakeClient := fake.NewSimpleClientset() + fakeDiscovery, ok := fakeClient.Discovery().(*fakediscovery.FakeDiscovery) + if !ok { + t.Fatal("Failed to cast to FakeDiscovery") + } + + fakeDiscovery.Resources = []*metav1.APIResourceList{ + { + GroupVersion: "v1", + APIResources: []metav1.APIResource{ + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + { + Name: "secrets", + Kind: "Secret", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + { + GroupVersion: "apps/v1", + APIResources: []metav1.APIResource{ + { + Name: "deployments", + Kind: "Deployment", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + } + + // Create REST mapper + groupResources := []*restmapper.APIGroupResources{ + { + Group: metav1.APIGroup{ + Name: "", + Versions: []metav1.GroupVersionForDiscovery{ + {GroupVersion: "v1", Version: "v1"}, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "v1", + Version: "v1", + }, + }, + VersionedResources: map[string][]metav1.APIResource{ + "v1": fakeDiscovery.Resources[0].APIResources, + }, + }, + { + Group: metav1.APIGroup{ + Name: "apps", + Versions: []metav1.GroupVersionForDiscovery{ + {GroupVersion: "apps/v1", Version: "v1"}, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "apps/v1", + Version: "v1", + }, + }, + VersionedResources: map[string][]metav1.APIResource{ + "v1": fakeDiscovery.Resources[1].APIResources, + }, + }, + } + restMapper := restmapper.NewDiscoveryRESTMapper(groupResources) + + // Create resource config that skips secrets + resourceConfig := utils.NewResourceConfig(false) + err := resourceConfig.Parse("v1/Secret") + if err != nil { + t.Fatalf("Failed to parse resource config: %v", err) + } + + fakeInformerManager := &testinformer.FakeManager{ + APIResources: make(map[schema.GroupVersionKind]bool), + IsClusterScopedResource: false, + } + + populator := &InformerPopulator{ + DiscoveryClient: fakeDiscovery, + RESTMapper: restMapper, + InformerManager: fakeInformerManager, + ResourceConfig: resourceConfig, + } + + // Run discovery + populator.discoverAndCreateInformers() + + // Note: FakeManager doesn't track calls, so we just verify no panics +} + +func TestInformerPopulator_PeriodicDiscovery(t *testing.T) { + // This test verifies that the populator continues to discover resources periodically + + fakeClient := fake.NewSimpleClientset() + fakeDiscovery, ok := fakeClient.Discovery().(*fakediscovery.FakeDiscovery) + if !ok { + t.Fatal("Failed to cast to FakeDiscovery") + } + + fakeDiscovery.Resources = []*metav1.APIResourceList{ + { + GroupVersion: "v1", + APIResources: []metav1.APIResource{ + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + } + + groupResources := []*restmapper.APIGroupResources{ + { + Group: metav1.APIGroup{ + Name: "", + Versions: []metav1.GroupVersionForDiscovery{ + {GroupVersion: "v1", Version: "v1"}, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "v1", + Version: "v1", + }, + }, + VersionedResources: map[string][]metav1.APIResource{ + "v1": fakeDiscovery.Resources[0].APIResources, + }, + }, + } + restMapper := restmapper.NewDiscoveryRESTMapper(groupResources) + + fakeInformerManager := &testinformer.FakeManager{ + APIResources: make(map[schema.GroupVersionKind]bool), + IsClusterScopedResource: false, + } + + populator := &InformerPopulator{ + DiscoveryClient: fakeDiscovery, + RESTMapper: restMapper, + InformerManager: fakeInformerManager, + ResourceConfig: nil, + } + + // Override the discovery period for testing + ctx, cancel := context.WithTimeout(context.Background(), testTimeout) + defer cancel() + + // Start the populator + go func() { + _ = populator.Start(ctx) + }() + + // Wait a bit to allow multiple discovery cycles + time.Sleep(testSleep) + + // Note: FakeManager doesn't track calls, so we just verify successful execution +} diff --git a/pkg/resourcewatcher/resource_collector.go b/pkg/resourcewatcher/resource_collector.go index 86e2cf235..8852d190e 100644 --- a/pkg/resourcewatcher/resource_collector.go +++ b/pkg/resourcewatcher/resource_collector.go @@ -17,12 +17,14 @@ limitations under the License. package resourcewatcher import ( + "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/discovery" "k8s.io/klog/v2" metricsV1beta1 "k8s.io/metrics/pkg/apis/metrics/v1beta1" + "github.com/kubefleet-dev/kubefleet/pkg/utils" "github.com/kubefleet-dev/kubefleet/pkg/utils/informer" ) @@ -30,10 +32,11 @@ import ( // More specifically, all api resources which support the 'list', and 'watch' verbs. // All discovery errors are considered temporary. Upon encountering any error, // getWatchableResources will log and return any discovered resources it was able to process (which may be none). -func (d *ChangeDetector) getWatchableResources() ([]informer.APIResourceMeta, error) { +// This is a standalone function that can be used by both ChangeDetector and InformerPopulator. +func getWatchableResources(discoveryClient discovery.ServerResourcesInterface) ([]informer.APIResourceMeta, error) { // Get all the resources this cluster has. We only need to care about the preferred version as the informers watch // the preferred version will get watch event for resources on the other versions since there is only one version in etcd. - allResources, discoverError := d.DiscoveryClient.ServerPreferredResources() + allResources, discoverError := discoveryClient.ServerPreferredResources() allErr := make([]error, 0) if discoverError != nil { if discovery.IsGroupDiscoveryFailedError(discoverError) { @@ -82,3 +85,25 @@ func (d *ChangeDetector) getWatchableResources() ([]informer.APIResourceMeta, er return watchableGroupVersionResources, errors.NewAggregate(allErr) } + +// shouldWatchResource returns whether a GroupVersionResource should be watched. +// This is a standalone function that can be used by both ChangeDetector and InformerPopulator. +func shouldWatchResource(gvr schema.GroupVersionResource, restMapper meta.RESTMapper, resourceConfig *utils.ResourceConfig) bool { + // By default, all of the APIs are allowed. + if resourceConfig == nil { + return true + } + + gvks, err := restMapper.KindsFor(gvr) + if err != nil { + klog.ErrorS(err, "gvr transform failed", "gvr", gvr.String()) + return false + } + for _, gvk := range gvks { + if resourceConfig.IsResourceDisabled(gvk) { + klog.V(4).InfoS("Skip watch resource", "group version kind", gvk.String()) + return false + } + } + return true +} diff --git a/pkg/resourcewatcher/resource_collector_test.go b/pkg/resourcewatcher/resource_collector_test.go new file mode 100644 index 000000000..33a5d1d1f --- /dev/null +++ b/pkg/resourcewatcher/resource_collector_test.go @@ -0,0 +1,221 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resourcewatcher + +import ( + "testing" + + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/restmapper" + + "github.com/kubefleet-dev/kubefleet/pkg/utils" +) + +func TestShouldWatchResource(t *testing.T) { + tests := []struct { + name string + gvr schema.GroupVersionResource + resourceConfig *utils.ResourceConfig + setupMapper func() meta.RESTMapper + expected bool + }{ + { + name: "returns true when resourceConfig is nil", + gvr: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "configmaps", + }, + resourceConfig: nil, + setupMapper: func() meta.RESTMapper { + groupResources := []*restmapper.APIGroupResources{ + { + Group: metav1.APIGroup{ + Name: "", + Versions: []metav1.GroupVersionForDiscovery{ + {GroupVersion: "v1", Version: "v1"}, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "v1", + Version: "v1", + }, + }, + VersionedResources: map[string][]metav1.APIResource{ + "v1": { + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + }, + } + return restmapper.NewDiscoveryRESTMapper(groupResources) + }, + expected: true, + }, + { + name: "returns true when resource is not disabled", + gvr: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "configmaps", + }, + resourceConfig: func() *utils.ResourceConfig { + rc := utils.NewResourceConfig(false) + // Disable secrets, but not configmaps + _ = rc.Parse("v1/Secret") + return rc + }(), + setupMapper: func() meta.RESTMapper { + groupResources := []*restmapper.APIGroupResources{ + { + Group: metav1.APIGroup{ + Name: "", + Versions: []metav1.GroupVersionForDiscovery{ + {GroupVersion: "v1", Version: "v1"}, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "v1", + Version: "v1", + }, + }, + VersionedResources: map[string][]metav1.APIResource{ + "v1": { + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + }, + } + return restmapper.NewDiscoveryRESTMapper(groupResources) + }, + expected: true, + }, + { + name: "returns false when resource is disabled", + gvr: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "secrets", + }, + resourceConfig: func() *utils.ResourceConfig { + rc := utils.NewResourceConfig(false) + _ = rc.Parse("v1/Secret") + return rc + }(), + setupMapper: func() meta.RESTMapper { + groupResources := []*restmapper.APIGroupResources{ + { + Group: metav1.APIGroup{ + Name: "", + Versions: []metav1.GroupVersionForDiscovery{ + {GroupVersion: "v1", Version: "v1"}, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "v1", + Version: "v1", + }, + }, + VersionedResources: map[string][]metav1.APIResource{ + "v1": { + { + Name: "secrets", + Kind: "Secret", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + }, + } + return restmapper.NewDiscoveryRESTMapper(groupResources) + }, + expected: false, + }, + { + name: "returns false when GVR mapping fails", + gvr: schema.GroupVersionResource{ + Group: "invalid.group", + Version: "v1", + Resource: "nonexistent", + }, + resourceConfig: utils.NewResourceConfig(false), + setupMapper: func() meta.RESTMapper { + // Empty mapper - will fail to map the GVR + groupResources := []*restmapper.APIGroupResources{} + return restmapper.NewDiscoveryRESTMapper(groupResources) + }, + expected: false, + }, + { + name: "handles apps group resources correctly", + gvr: schema.GroupVersionResource{ + Group: "apps", + Version: "v1", + Resource: "deployments", + }, + resourceConfig: nil, + setupMapper: func() meta.RESTMapper { + groupResources := []*restmapper.APIGroupResources{ + { + Group: metav1.APIGroup{ + Name: "apps", + Versions: []metav1.GroupVersionForDiscovery{ + {GroupVersion: "apps/v1", Version: "v1"}, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "apps/v1", + Version: "v1", + }, + }, + VersionedResources: map[string][]metav1.APIResource{ + "v1": { + { + Name: "deployments", + Kind: "Deployment", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + }, + } + return restmapper.NewDiscoveryRESTMapper(groupResources) + }, + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + restMapper := tt.setupMapper() + result := shouldWatchResource(tt.gvr, restMapper, tt.resourceConfig) + if result != tt.expected { + t.Errorf("shouldWatchResource() = %v, want %v", result, tt.expected) + } + }) + } +} diff --git a/pkg/utils/informer/informermanager.go b/pkg/utils/informer/informermanager.go index 53da3343a..984a6d392 100644 --- a/pkg/utils/informer/informermanager.go +++ b/pkg/utils/informer/informermanager.go @@ -33,11 +33,6 @@ import ( // InformerManager manages dynamic shared informer for all resources, include Kubernetes resource and // custom resources defined by CustomResourceDefinition. type Manager interface { - // AddDynamicResources builds a dynamicInformer for each resource in the resources list with the event handler. - // A resource is dynamic if its definition can be created/deleted/updated during runtime. - // Normally, it is a custom resource that is installed by users. The handler should not be nil. - AddDynamicResources(resources []APIResourceMeta, handler cache.ResourceEventHandler, listComplete bool) - // AddStaticResource creates a dynamicInformer for the static 'resource' and set its event handler. // A resource is static if its definition is pre-determined and immutable during runtime. // Normally, it is a resource that is pre-installed by the system. @@ -72,6 +67,16 @@ type Manager interface { // GetClient returns the dynamic dynamicClient. GetClient() dynamic.Interface + + // AddEventHandlerToInformer adds an event handler to an existing informer for the given resource. + // If the informer doesn't exist, it will be created. This is used by the leader's ChangeDetector + // to add event handlers to informers that were created by the InformerPopulator. + AddEventHandlerToInformer(resource schema.GroupVersionResource, handler cache.ResourceEventHandler) + + // CreateInformerForResource creates an informer for the given resource without adding any event handlers. + // This is used by InformerPopulator to create informers on all pods (leader and followers) so they have + // synced caches for webhook validation. The leader's ChangeDetector will add event handlers later. + CreateInformerForResource(resource APIResourceMeta) } // NewInformerManager constructs a new instance of informerManagerImpl. @@ -124,61 +129,6 @@ type informerManagerImpl struct { resourcesLock sync.RWMutex } -func (s *informerManagerImpl) AddDynamicResources(dynResources []APIResourceMeta, handler cache.ResourceEventHandler, listComplete bool) { - newGVKs := make(map[schema.GroupVersionKind]bool, len(dynResources)) - - addInformerFunc := func(newRes APIResourceMeta) { - dynRes, exist := s.apiResources[newRes.GroupVersionKind] - if !exist { - newRes.isPresent = true - s.apiResources[newRes.GroupVersionKind] = &newRes - // TODO (rzhang): remember the ResourceEventHandlerRegistration and remove it when the resource is deleted - // TODO: handle error which only happens if the informer is stopped - informer := s.informerFactory.ForResource(newRes.GroupVersionResource).Informer() - // Strip away the ManagedFields info from objects to save memory. - // - // TO-DO (chenyu1): evaluate if there are other fields, e.g., owner refs, status, that can also be stripped - // away to save memory. - if err := informer.SetTransform(ctrlcache.TransformStripManagedFields()); err != nil { - // The SetTransform func would only fail if the informer has already started. In this case, - // no further action is needed. - klog.ErrorS(err, "Failed to set transform func for informer", "gvr", newRes.GroupVersionResource) - } - _, _ = informer.AddEventHandler(handler) - klog.InfoS("Added an informer for a new resource", "res", newRes) - } else if !dynRes.isPresent { - // we just mark it as enabled as we should not add another eventhandler to the informer as it's still - // in the informerFactory - // TODO: add the Event handler back - dynRes.isPresent = true - klog.InfoS("Reactivated an informer for a reappeared resource", "res", dynRes) - } - } - - s.resourcesLock.Lock() - defer s.resourcesLock.Unlock() - - // Add the new dynResources that do not exist yet while build a map to speed up lookup - for _, newRes := range dynResources { - newGVKs[newRes.GroupVersionKind] = true - addInformerFunc(newRes) - } - - if !listComplete { - // do not disable any informer if we know the resource list is not complete - return - } - - // mark the disappeared dynResources from the handler map - for gvk, dynRes := range s.apiResources { - if !newGVKs[gvk] && !dynRes.isStaticResource && dynRes.isPresent { - // TODO: Remove the Event handler from the informer using the resourceEventHandlerRegistration during creat - dynRes.isPresent = false - klog.InfoS("Disabled an informer for a disappeared resource", "res", dynRes) - } - } -} - func (s *informerManagerImpl) AddStaticResource(resource APIResourceMeta, handler cache.ResourceEventHandler) { s.resourcesLock.Lock() defer s.resourcesLock.Unlock() @@ -255,6 +205,46 @@ func (s *informerManagerImpl) Stop() { s.cancel() } +func (s *informerManagerImpl) AddEventHandlerToInformer(resource schema.GroupVersionResource, handler cache.ResourceEventHandler) { + // Get or create the informer (this is idempotent - if it exists, we get the same instance) + // The idempotent behavior is important because it is called by both the change detector and informer populator, + // which run concurrently on the leader hub agent. + informer := s.informerFactory.ForResource(resource).Informer() + + // Add the event handler to the informer + _, _ = informer.AddEventHandler(handler) + + klog.V(2).InfoS("Added event handler to informer", "gvr", resource) +} + +func (s *informerManagerImpl) CreateInformerForResource(resource APIResourceMeta) { + s.resourcesLock.Lock() + defer s.resourcesLock.Unlock() + + dynRes, exist := s.apiResources[resource.GroupVersionKind] + if !exist { + // Register this resource in our tracking map + resource.isPresent = true + s.apiResources[resource.GroupVersionKind] = &resource + + // Create the informer without adding any event handler + informer := s.informerFactory.ForResource(resource.GroupVersionResource).Informer() + + // Strip away the ManagedFields info from objects to save memory + if err := informer.SetTransform(ctrlcache.TransformStripManagedFields()); err != nil { + // The SetTransform func would only fail if the informer has already started. + // In this case, no further action is needed. + klog.ErrorS(err, "Failed to set transform func for informer", "gvr", resource.GroupVersionResource) + } + + klog.V(3).InfoS("Created informer without handler", "res", resource) + } else if !dynRes.isPresent { + // Mark it as present again (resource reappeared) + dynRes.isPresent = true + klog.V(3).InfoS("Reactivated informer for reappeared resource", "res", dynRes) + } +} + // ContextForChannel derives a child context from a parent channel. // // The derived context's Done channel is closed when the returned cancel function diff --git a/pkg/utils/informer/informermanager_test.go b/pkg/utils/informer/informermanager_test.go index 53f2ce74a..0cfc75ab2 100644 --- a/pkg/utils/informer/informermanager_test.go +++ b/pkg/utils/informer/informermanager_test.go @@ -287,3 +287,243 @@ func TestGetAllResources_NotPresent(t *testing.T) { t.Errorf("GetAllResources()[0] = %v, want %v", got, presentRes.GroupVersionResource) } } + +func TestAddEventHandlerToInformer(t *testing.T) { + tests := []struct { + name string + gvr schema.GroupVersionResource + addTwice bool // Test adding handler twice to same informer + }{ + { + name: "add handler to new informer", + gvr: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "configmaps", + }, + addTwice: false, + }, + { + name: "add multiple handlers to same informer", + gvr: schema.GroupVersionResource{ + Group: "apps", + Version: "v1", + Resource: "deployments", + }, + addTwice: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fakeClient := fake.NewSimpleDynamicClient(scheme.Scheme) + stopCh := make(chan struct{}) + defer close(stopCh) + + mgr := NewInformerManager(fakeClient, 0, stopCh) + + // Track handler calls + callCount := 0 + handler := &testHandler{ + onAdd: func() { callCount++ }, + } + + // Add the handler + mgr.AddEventHandlerToInformer(tt.gvr, handler) + + // Verify informer was created + implMgr := mgr.(*informerManagerImpl) + informer := implMgr.informerFactory.ForResource(tt.gvr).Informer() + if informer == nil { + t.Fatal("Expected informer to be created") + } + + if tt.addTwice { + // Add another handler to the same informer + handler2 := &testHandler{ + onAdd: func() { callCount++ }, + } + mgr.AddEventHandlerToInformer(tt.gvr, handler2) + } + + // Test is successful if no panic occurred and informer exists + }) + } +} + +func TestCreateInformerForResource(t *testing.T) { + tests := []struct { + name string + resource APIResourceMeta + createTwice bool + markNotPresent bool // Mark resource as not present before second create + }{ + { + name: "create new informer", + resource: APIResourceMeta{ + GroupVersionKind: schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "ConfigMap", + }, + GroupVersionResource: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "configmaps", + }, + IsClusterScoped: false, + }, + createTwice: false, + }, + { + name: "create informer twice (idempotent)", + resource: APIResourceMeta{ + GroupVersionKind: schema.GroupVersionKind{ + Group: "apps", + Version: "v1", + Kind: "Deployment", + }, + GroupVersionResource: schema.GroupVersionResource{ + Group: "apps", + Version: "v1", + Resource: "deployments", + }, + IsClusterScoped: false, + }, + createTwice: true, + }, + { + name: "recreate informer for reappeared resource", + resource: APIResourceMeta{ + GroupVersionKind: schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "Secret", + }, + GroupVersionResource: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "secrets", + }, + IsClusterScoped: false, + }, + createTwice: true, + markNotPresent: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fakeClient := fake.NewSimpleDynamicClient(scheme.Scheme) + stopCh := make(chan struct{}) + defer close(stopCh) + + mgr := NewInformerManager(fakeClient, 0, stopCh) + implMgr := mgr.(*informerManagerImpl) + + // Create the informer + mgr.CreateInformerForResource(tt.resource) + + // Verify resource is tracked + resMeta, exists := implMgr.apiResources[tt.resource.GroupVersionKind] + if !exists { + t.Fatal("Expected resource to be tracked in apiResources map") + } + if !resMeta.isPresent { + t.Error("Expected resource to be marked as present") + } + if resMeta.IsClusterScoped != tt.resource.IsClusterScoped { + t.Errorf("IsClusterScoped = %v, want %v", resMeta.IsClusterScoped, tt.resource.IsClusterScoped) + } + + // Verify informer was created + informer := implMgr.informerFactory.ForResource(tt.resource.GroupVersionResource).Informer() + if informer == nil { + t.Fatal("Expected informer to be created") + } + + if tt.createTwice { + if tt.markNotPresent { + // Mark as not present (simulating resource deletion) + resMeta.isPresent = false + } + + // Create again + mgr.CreateInformerForResource(tt.resource) + + // Verify it's marked as present again + if !resMeta.isPresent { + t.Error("Expected resource to be marked as present after recreation") + } + } + }) + } +} + +func TestCreateInformerForResource_IsIdempotent(t *testing.T) { + // Test that creating the same informer multiple times doesn't cause issues + fakeClient := fake.NewSimpleDynamicClient(scheme.Scheme) + stopCh := make(chan struct{}) + defer close(stopCh) + + mgr := NewInformerManager(fakeClient, 0, stopCh) + implMgr := mgr.(*informerManagerImpl) + + resource := APIResourceMeta{ + GroupVersionKind: schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "Pod", + }, + GroupVersionResource: schema.GroupVersionResource{ + Group: "", + Version: "v1", + Resource: "pods", + }, + IsClusterScoped: false, + } + + // Create multiple times + for i := 0; i < 3; i++ { + mgr.CreateInformerForResource(resource) + } + + // Should only have one entry in apiResources + if len(implMgr.apiResources) != 1 { + t.Errorf("Expected 1 resource in apiResources, got %d", len(implMgr.apiResources)) + } + + // Verify resource is still tracked correctly + resMeta, exists := implMgr.apiResources[resource.GroupVersionKind] + if !exists { + t.Fatal("Expected resource to be tracked") + } + if !resMeta.isPresent { + t.Error("Expected resource to be marked as present") + } +} + +// testHandler is a simple implementation of cache.ResourceEventHandler for testing +type testHandler struct { + onAdd func() + onUpdate func() + onDelete func() +} + +func (h *testHandler) OnAdd(obj interface{}, isInInitialList bool) { + if h.onAdd != nil { + h.onAdd() + } +} + +func (h *testHandler) OnUpdate(oldObj, newObj interface{}) { + if h.onUpdate != nil { + h.onUpdate() + } +} + +func (h *testHandler) OnDelete(obj interface{}) { + if h.onDelete != nil { + h.onDelete() + } +} diff --git a/pkg/utils/informer/readiness/readiness.go b/pkg/utils/informer/readiness/readiness.go index 23e12cf86..749be5592 100644 --- a/pkg/utils/informer/readiness/readiness.go +++ b/pkg/utils/informer/readiness/readiness.go @@ -37,7 +37,7 @@ func InformerReadinessChecker(resourceInformer informer.Manager) func(*http.Requ // Require ALL informer caches to be synced before marking ready allResources := resourceInformer.GetAllResources() if len(allResources) == 0 { - // This can happen during startup when the ResourceInformer is created but the ChangeDetector + // This can happen during startup when the ResourceInformer is created but the InformerPopulator // hasn't discovered and registered any resources yet via AddDynamicResources(). return fmt.Errorf("resource informer not ready: no resources registered") } diff --git a/test/utils/informer/manager.go b/test/utils/informer/manager.go index fde757292..004ef9364 100644 --- a/test/utils/informer/manager.go +++ b/test/utils/informer/manager.go @@ -185,3 +185,10 @@ func (m *FakeManager) WaitForCacheSync() { func (m *FakeManager) GetClient() dynamic.Interface { return nil } +func (m *FakeManager) AddEventHandlerToInformer(_ schema.GroupVersionResource, _ cache.ResourceEventHandler) { + // No-op for testing +} + +func (m *FakeManager) CreateInformerForResource(_ informer.APIResourceMeta) { + // No-op for testing +} From 4521ed46190810c7881b9f722aeebe4055ab5949 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 22:27:28 +0000 Subject: [PATCH 11/12] refactor and test improvement Signed-off-by: Wei Weng --- pkg/resourcewatcher/change_dector.go | 18 +- pkg/resourcewatcher/change_detector_test.go | 200 ++++++++++++++++++++ pkg/resourcewatcher/informer_populator.go | 18 +- pkg/resourcewatcher/resource_collector.go | 19 ++ pkg/utils/informer/informermanager_test.go | 35 +--- test/utils/handler/handler.go | 47 +++++ 6 files changed, 280 insertions(+), 57 deletions(-) create mode 100644 pkg/resourcewatcher/change_detector_test.go create mode 100644 test/utils/handler/handler.go diff --git a/pkg/resourcewatcher/change_dector.go b/pkg/resourcewatcher/change_dector.go index 0ea4699cc..0dcfc41e6 100644 --- a/pkg/resourcewatcher/change_dector.go +++ b/pkg/resourcewatcher/change_dector.go @@ -44,7 +44,7 @@ var ( // ChangeDetector is a resource watcher which watches all types of resources in the cluster and reconcile the events. type ChangeDetector struct { // DiscoveryClient is used to do resource discovery. - DiscoveryClient *discovery.DiscoveryClient + DiscoveryClient discovery.DiscoveryInterface // RESTMapper is used to convert between GVK and GVR RESTMapper meta.RESTMapper @@ -138,26 +138,18 @@ func (d *ChangeDetector) discoverAPIResourcesLoop(ctx context.Context, period ti // discoverResources goes through all the api resources in the cluster and adds event handlers to informers func (d *ChangeDetector) discoverResources(dynamicResourceEventHandler cache.ResourceEventHandler) { - newResources, err := getWatchableResources(d.DiscoveryClient) - var dynamicResources []informer.APIResourceMeta - if err != nil { - klog.ErrorS(err, "Failed to get all the api resources from the cluster") - } - for _, res := range newResources { - // all the static resources are disabled by default - if shouldWatchResource(res.GroupVersionResource, d.RESTMapper, d.ResourceConfig) { - dynamicResources = append(dynamicResources, res) - } - } + resourcesToWatch := discoverWatchableResources(d.DiscoveryClient, d.RESTMapper, d.ResourceConfig) // On the leader, add event handlers to informers that were already created by InformerPopulator // The informers exist on all pods, but only the leader adds handlers and processes events - for _, res := range dynamicResources { + for _, res := range resourcesToWatch { d.InformerManager.AddEventHandlerToInformer(res.GroupVersionResource, dynamicResourceEventHandler) } // this will start the newly added informers if there is any d.InformerManager.Start() + + klog.V(2).InfoS("Change detector: discovered resources", "count", len(resourcesToWatch)) } // dynamicResourceFilter filters out resources that we don't want to watch diff --git a/pkg/resourcewatcher/change_detector_test.go b/pkg/resourcewatcher/change_detector_test.go new file mode 100644 index 000000000..e0f83e3bf --- /dev/null +++ b/pkg/resourcewatcher/change_detector_test.go @@ -0,0 +1,200 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resourcewatcher + +import ( + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + fakediscovery "k8s.io/client-go/discovery/fake" + "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/restmapper" + "k8s.io/client-go/tools/cache" + + "github.com/kubefleet-dev/kubefleet/pkg/utils" + testinformer "github.com/kubefleet-dev/kubefleet/test/utils/informer" +) + +func TestChangeDetector_discoverResources(t *testing.T) { + tests := []struct { + name string + discoveryResources []*metav1.APIResourceList + resourceConfig *utils.ResourceConfig + }{ + { + name: "discovers and adds handlers for watchable resources", + discoveryResources: []*metav1.APIResourceList{ + { + GroupVersion: "v1", + APIResources: []metav1.APIResource{ + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + { + Name: "secrets", + Kind: "Secret", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + }, + resourceConfig: nil, // Allow all resources + }, + { + name: "skips resources without list/watch verbs", + discoveryResources: []*metav1.APIResourceList{ + { + GroupVersion: "v1", + APIResources: []metav1.APIResource{ + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"get", "delete"}, // Missing list/watch + }, + }, + }, + }, + resourceConfig: nil, + }, + { + name: "respects resource config filtering", + discoveryResources: []*metav1.APIResourceList{ + { + GroupVersion: "v1", + APIResources: []metav1.APIResource{ + { + Name: "configmaps", + Kind: "ConfigMap", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + { + Name: "secrets", + Kind: "Secret", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + }, + resourceConfig: func() *utils.ResourceConfig { + rc := utils.NewResourceConfig(false) // Skip mode + _ = rc.Parse("v1/Secret") // Skip secrets + return rc + }(), + }, + { + name: "discovers apps group resources", + discoveryResources: []*metav1.APIResourceList{ + { + GroupVersion: "apps/v1", + APIResources: []metav1.APIResource{ + { + Name: "deployments", + Kind: "Deployment", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + { + Name: "statefulsets", + Kind: "StatefulSet", + Namespaced: true, + Verbs: []string{"list", "watch", "get"}, + }, + }, + }, + }, + resourceConfig: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create fake discovery client + fakeClient := fake.NewSimpleClientset() + fakeDiscovery, ok := fakeClient.Discovery().(*fakediscovery.FakeDiscovery) + if !ok { + t.Fatal("Failed to cast to FakeDiscovery") + } + fakeDiscovery.Resources = tt.discoveryResources + + // Create REST mapper + groupResources := []*restmapper.APIGroupResources{} + for _, resourceList := range tt.discoveryResources { + gv, err := schema.ParseGroupVersion(resourceList.GroupVersion) + if err != nil { + t.Fatalf("Failed to parse group version: %v", err) + } + + groupResources = append(groupResources, &restmapper.APIGroupResources{ + Group: metav1.APIGroup{ + Name: gv.Group, + Versions: []metav1.GroupVersionForDiscovery{ + {GroupVersion: resourceList.GroupVersion, Version: gv.Version}, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: resourceList.GroupVersion, + Version: gv.Version, + }, + }, + VersionedResources: map[string][]metav1.APIResource{ + gv.Version: resourceList.APIResources, + }, + }) + } + restMapper := restmapper.NewDiscoveryRESTMapper(groupResources) + + // Create fake informer manager + fakeInformerManager := &testinformer.FakeManager{ + APIResources: make(map[schema.GroupVersionKind]bool), + } + + // Track handler additions + testHandler := cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) {}, + } + + // Create ChangeDetector with the interface type + detector := &ChangeDetector{ + DiscoveryClient: fakeDiscovery, + RESTMapper: restMapper, + InformerManager: fakeInformerManager, + ResourceConfig: tt.resourceConfig, + } + + // Test discoverResources which discovers resources and adds handlers + detector.discoverResources(testHandler) + + // The main goal is to verify no panics occur during discovery and handler addition + }) + } +} + +func TestChangeDetector_NeedLeaderElection(t *testing.T) { + detector := &ChangeDetector{} + + // ChangeDetector SHOULD need leader election so only the leader processes events + if !detector.NeedLeaderElection() { + t.Error("ChangeDetector should need leader election") + } +} diff --git a/pkg/resourcewatcher/informer_populator.go b/pkg/resourcewatcher/informer_populator.go index 82e0ae4bb..8162f606a 100644 --- a/pkg/resourcewatcher/informer_populator.go +++ b/pkg/resourcewatcher/informer_populator.go @@ -81,30 +81,18 @@ func (p *InformerPopulator) Start(ctx context.Context) error { // discoverAndCreateInformers discovers API resources and creates informers WITHOUT adding event handlers func (p *InformerPopulator) discoverAndCreateInformers() { - newResources, err := getWatchableResources(p.DiscoveryClient) - if err != nil { - klog.ErrorS(err, "Failed to get all the api resources from the cluster") - } - - var resourcesToPopulate []informer.APIResourceMeta - for _, res := range newResources { - if shouldWatchResource(res.GroupVersionResource, p.RESTMapper, p.ResourceConfig) { - resourcesToPopulate = append(resourcesToPopulate, res) - } - } + resourcesToWatch := discoverWatchableResources(p.DiscoveryClient, p.RESTMapper, p.ResourceConfig) // Create informers directly without adding event handlers. // This avoids adding any event handlers on follower pods - for _, res := range resourcesToPopulate { + for _, res := range resourcesToWatch { p.InformerManager.CreateInformerForResource(res) } // Start any newly created informers p.InformerManager.Start() - if err == nil { - klog.V(2).InfoS("Informer populator: discovered resources", "count", len(resourcesToPopulate)) - } + klog.V(2).InfoS("Informer populator: discovered resources", "count", len(resourcesToWatch)) } // NeedLeaderElection implements LeaderElectionRunnable interface. diff --git a/pkg/resourcewatcher/resource_collector.go b/pkg/resourcewatcher/resource_collector.go index 8852d190e..b740bfc8c 100644 --- a/pkg/resourcewatcher/resource_collector.go +++ b/pkg/resourcewatcher/resource_collector.go @@ -107,3 +107,22 @@ func shouldWatchResource(gvr schema.GroupVersionResource, restMapper meta.RESTMa } return true } + +// discoverWatchableResources discovers all API resources in the cluster and filters them +// based on the resource configuration. This is a shared helper used by both InformerPopulator +// and ChangeDetector to ensure consistent resource discovery logic. +func discoverWatchableResources(discoveryClient discovery.DiscoveryInterface, restMapper meta.RESTMapper, resourceConfig *utils.ResourceConfig) []informer.APIResourceMeta { + newResources, err := getWatchableResources(discoveryClient) + if err != nil { + klog.ErrorS(err, "Failed to get all the api resources from the cluster") + } + + var resourcesToWatch []informer.APIResourceMeta + for _, res := range newResources { + if shouldWatchResource(res.GroupVersionResource, restMapper, resourceConfig) { + resourcesToWatch = append(resourcesToWatch, res) + } + } + + return resourcesToWatch +} diff --git a/pkg/utils/informer/informermanager_test.go b/pkg/utils/informer/informermanager_test.go index 0cfc75ab2..5c8b1c764 100644 --- a/pkg/utils/informer/informermanager_test.go +++ b/pkg/utils/informer/informermanager_test.go @@ -22,6 +22,8 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/client-go/dynamic/fake" "k8s.io/client-go/kubernetes/scheme" + + testhandler "github.com/kubefleet-dev/kubefleet/test/utils/handler" ) func TestGetAllResources(t *testing.T) { @@ -324,8 +326,8 @@ func TestAddEventHandlerToInformer(t *testing.T) { // Track handler calls callCount := 0 - handler := &testHandler{ - onAdd: func() { callCount++ }, + handler := &testhandler.TestHandler{ + OnAddFunc: func() { callCount++ }, } // Add the handler @@ -340,8 +342,8 @@ func TestAddEventHandlerToInformer(t *testing.T) { if tt.addTwice { // Add another handler to the same informer - handler2 := &testHandler{ - onAdd: func() { callCount++ }, + handler2 := &testhandler.TestHandler{ + OnAddFunc: func() { callCount++ }, } mgr.AddEventHandlerToInformer(tt.gvr, handler2) } @@ -502,28 +504,3 @@ func TestCreateInformerForResource_IsIdempotent(t *testing.T) { t.Error("Expected resource to be marked as present") } } - -// testHandler is a simple implementation of cache.ResourceEventHandler for testing -type testHandler struct { - onAdd func() - onUpdate func() - onDelete func() -} - -func (h *testHandler) OnAdd(obj interface{}, isInInitialList bool) { - if h.onAdd != nil { - h.onAdd() - } -} - -func (h *testHandler) OnUpdate(oldObj, newObj interface{}) { - if h.onUpdate != nil { - h.onUpdate() - } -} - -func (h *testHandler) OnDelete(obj interface{}) { - if h.onDelete != nil { - h.onDelete() - } -} diff --git a/test/utils/handler/handler.go b/test/utils/handler/handler.go new file mode 100644 index 000000000..67d382f48 --- /dev/null +++ b/test/utils/handler/handler.go @@ -0,0 +1,47 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package handler provides test utilities for Kubernetes event handlers. +package handler + +// TestHandler is a simple implementation of cache.ResourceEventHandler for testing. +// It allows tests to track when specific event handler methods are called. +type TestHandler struct { + OnAddFunc func() + OnUpdateFunc func() + OnDeleteFunc func() +} + +// OnAdd is called when an object is added. +func (h *TestHandler) OnAdd(obj interface{}, isInInitialList bool) { + if h.OnAddFunc != nil { + h.OnAddFunc() + } +} + +// OnUpdate is called when an object is updated. +func (h *TestHandler) OnUpdate(oldObj, newObj interface{}) { + if h.OnUpdateFunc != nil { + h.OnUpdateFunc() + } +} + +// OnDelete is called when an object is deleted. +func (h *TestHandler) OnDelete(obj interface{}) { + if h.OnDeleteFunc != nil { + h.OnDeleteFunc() + } +} From f05903c406a50579c1054451b2739fec7ac17998 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Tue, 16 Dec 2025 22:33:45 +0000 Subject: [PATCH 12/12] fix error message Signed-off-by: Wei Weng --- cmd/hubagent/options/validation_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/hubagent/options/validation_test.go b/cmd/hubagent/options/validation_test.go index 988f61a05..342f6e2a2 100644 --- a/cmd/hubagent/options/validation_test.go +++ b/cmd/hubagent/options/validation_test.go @@ -104,7 +104,7 @@ func TestValidateControllerManagerConfiguration(t *testing.T) { }), want: field.ErrorList{ field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWebhook to be true (cert-manager is only used for webhook certificate management)"), - field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (cert-manager pods need to run in the hub cluster)"), + field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (when EnableWorkload is false, a validating webhook blocks pod creation except for certain system pods; cert-manager controller pods must be allowed to run in the hub cluster)"), }, }, "UseCertManager with EnableWebhook": { @@ -113,7 +113,7 @@ func TestValidateControllerManagerConfiguration(t *testing.T) { option.WebhookServiceName = testWebhookServiceName option.UseCertManager = true }), - want: field.ErrorList{field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (cert-manager pods need to run in the hub cluster)")}, + want: field.ErrorList{field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (when EnableWorkload is false, a validating webhook blocks pod creation except for certain system pods; cert-manager controller pods must be allowed to run in the hub cluster)")}, }, "UseCertManager without EnableWorkload": { opt: newTestOptions(func(option *Options) { @@ -122,7 +122,7 @@ func TestValidateControllerManagerConfiguration(t *testing.T) { option.UseCertManager = true option.EnableWorkload = false }), - want: field.ErrorList{field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (cert-manager pods need to run in the hub cluster)")}, + want: field.ErrorList{field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (when EnableWorkload is false, a validating webhook blocks pod creation except for certain system pods; cert-manager controller pods must be allowed to run in the hub cluster)")}, }, "UseCertManager with EnableWebhook and EnableWorkload": { opt: newTestOptions(func(option *Options) {